/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup...but for root homed threads,
 * is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
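 *
 * As an illustrative example (not a statement about any particular
 * platform), a single socket might yield a lineage such as:
 *
 *	chip PG (all CPUs on the socket)
 *	    |
 *	shared cache PG (CPUs sharing a last level cache)
 *	    |
 *	pipeline PG (hardware threads sharing an execution pipeline)
 *
 * where each level is a subset of the level above it, and the leaf
 * (smallest) PG is the one referenced by a CPU's cmt_lineage.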
 */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
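 *
 * The thread switch event reported below charges the PG hierarchy with the
 * thread already running on the new CPU (as though it had just switched on
 * from the idle thread), so the PG utilization counts start out consistent.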
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_PROCNODE:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	/*
	 * Stick with the default callbacks if there isn't going to be
	 * any CMT thread placement optimizations implemented.
	 */
	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
		return;

	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
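 *
 * Schematically, promoting PG swaps it with its parent in the hierarchy:
 *
 *	before:	grandparent -> parent -> PG -> children of PG
 *	after:	grandparent -> PG -> parent -> children of PG
 *
 * PG inherits the parent's place among its siblings and the parent becomes
 * PG's only child, while PG's former children become the parent's children.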
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;
	int		nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		int		sz;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * The CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure while this routine runs.
 * pg_cmt_cpu_init() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		((pghw_t *)pg)->pghw_generation++;

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
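	 *
	 * Note that entries are inserted into cmt_pgs at (levels - level - 1),
	 * so that group ends up ordered top-down (largest PG first), and
	 * cmt_lineage is left referencing the smallest (leaf) PG in the
	 * lineage.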
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PG's to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc, sz;
	int		start, end;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (start = 0; start < size; start++) {

		/*
		 * Find various contiguous sets of elements,
		 * in the array, with the same number of cpus
		 */
		end = start;
		sz = PG_NUM_CPUS(h[start]);
		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
			end++;
		/*
		 * Sort each such set of the array by rank
		 */
		for (i = start + 1; i < end; i++) {
			j = i - 1;
			tmp = h[i];
			while (j >= start &&
			    pg_cmt_hier_rank(hier[j],
			    (pg_cmt_t *)tmp) == hier[j]) {
				h[j + 1] = h[j];
				j--;
			}
			h[j + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction.
 * In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * The CPU whose lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being promoted, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant.
 * If a violation is found, and the PG is of a grouping type whose definition
 * is known to originate from suspect sources (BIOS), then pg_cmt_prune() will
 * be invoked to prune the PG (and all other instances of the PG's sharing
 * relationship type) from the CMT hierarchy. Further, future instances of
 * that sharing relationship type won't be added. If the grouping definition
 * doesn't originate from suspect sources, then pg_cmt_disable() will be
 * invoked to log an error, and disable CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not
 * cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPU's PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs. Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}