/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a single top level group of PGs to balance across. On NUMA systems,
 * multiple top level groups are instantiated, and the top level balancing
 * begins by balancing across the CMT PGs within their respective (per lgroup)
 * top level groups.
 */
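/*
 * As a purely illustrative (hypothetical) example: a CPU that shares an
 * instruction pipeline with one other CPU and a physical chip with seven
 * other CPUs would, with the default policies below, end up with a CMT
 * load balancing lineage of
 *
 *	PGHW_IPIPE PG (2 CPUs)  ->  PGHW_CHIP PG (8 CPUs)
 *
 * sorted ascendingly by CPU count. Each PG is a subset of the one after it
 * (the "subset invariant"), the chip PG sits in its lgroup's group of top
 * level PGs, and the pipeline PG is a child of the chip PG. A PGHW_CACHE PG
 * would also be created for a shared cache, but with the default
 * CMT_AFFINITY policy it informs affinity decisions (e.g. cpu_cacheid /
 * pg_cmt_can_migrate()) rather than joining the load balancing lineage.
 */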
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);


/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}
/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * The CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
		ASSERT(idx > 0);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * still references a "bootstrap" structure at this point. pg_cmt_cpu_init()
 * and the routines it calls must be careful to operate only on the "pgdata"
 * argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Continue if the hardware sharing relationship has been
		 * blacklisted.
		 */
		if (cmt_hw_blacklisted[hw]) {
			continue;
		}

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (i = 0; i < size - 1; i++) {
		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
			tmp = h[i];
			h[i] = h[i + 1];
			h[i + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * a policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction.
 * In this case the argument "pgdata", if not NULL, is a reference to the
 * CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so that future groups won't be created.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}
		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * The CPU whose lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 */
			(void) group_remove(&cpd->pgs, pg, GRP_NORESIZE);
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent, and
		 * this PG should be promoted to be above its existing parent
		 * now. We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if pg has less
					 * CPUs than cmt_parent...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * The handling for this falls through to the next case.
		 */
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}