/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems multiple
 * top level groups are instantiated, where the top level balancing begins by
 * balancing across the CMT PGs within their respective (per lgroup) top level
 * groups.
 */

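/*
 * Illustrative example (hypothetical topology): a single chip with two
 * cores, each core having two hardware threads that share an instruction
 * pipeline, and all four threads sharing a cache, would produce a
 * hierarchy such as:
 *
 *	PGHW_CHIP   (CPUs 0-3)
 *	     |
 *	PGHW_CACHE  (CPUs 0-3)
 *	   /		\
 *	PGHW_IPIPE     PGHW_IPIPE
 *	(CPUs 0-1)     (CPUs 2-3)
 *
 * Each PG's CPUs are a subset of its parent's, per the invariant above.
 */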

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling.
 * This must be done via kmdb -d, as /etc/system will be too late.
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);


/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
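/*
 * Sketch of the resulting topology change: promoting PG exchanges its
 * position with that of its parent, so
 *
 *	grandparent			grandparent
 *	     |				     |
 *	  parent	  becomes	    PG
 *	     |				     |
 *	    PG				  parent
 *
 * The parent also inherits PG's former children, and the two swap their
 * sibling linkage accordingly.
 */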
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * A CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data, but
 * across this function's invocation it still references a "bootstrap"
 * structure. pg_cmt_cpu_init() and the routines it calls must therefore be
 * careful to operate only on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Continue if the hardware sharing relationship has been
		 * blacklisted.
		 */
		if (cmt_hw_blacklisted[hw]) {
			continue;
		}

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Sort the PGs in the lineage by number of CPUs, in ascending order
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data, and it
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs (a shell sort, using a diminishing
	 * increment sequence that ends with 1).
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (i = 0; i < size - 1; i++) {
		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
			tmp = h[i];
			h[i] = h[i + 1];
			h[i + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
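/*
 * In outline, pruning a PG reparents each of its children to the PG's own
 * parent (the PG's sibling group and its parent's children group are the
 * same group_t), removes the PG from its sibling group and from each CPU's
 * cached lineage, and blacklists the hardware sharing relationship so that
 * no further PGs of that type are created.
 */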
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so that future groups won't be created.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}
		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * A CPU whose lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 */
			(void) group_remove(&cpd->pgs, pg, GRP_NORESIZE);
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of the PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID	- There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED	- Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent, and
		 * this PG should be promoted to be above its existing parent
		 * now. We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if pg has less
					 * CPUs than cmt_parent...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * The handling for this falls through to the next case.
		 */
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}