/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ----------------------------  <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a single top level group of PGs to balance across. On NUMA
 * systems, multiple top level groups are instantiated, and the top level
 * balancing begins by balancing across the CMT PGs within their
 * respective (per lgroup) top level groups.
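 *
 * For example, on a hypothetical system where two hardware threads share
 * an execution pipeline and eight such threads share a chip level cache,
 * a single CPU's lineage (ordered from smallest PG to largest) might be:
 *
 *	PGHW_IPIPE (2 CPUs) -> PGHW_CACHE (8 CPUs) -> PGHW_CHIP (8 CPUs)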
 */

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *);


/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
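 *
 * The event is reported as a switch away from the idle thread, so the
 * PG utilization counters account for the thread already running on the
 * newly started CPU.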
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs
 * than its parent.
 */
static void
cmt_hier_promote(pg_cmt_t *pg)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
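	 * (One entry is removed and one added, so the set's size is
	 * unchanged and GRP_NORESIZE can be used for both operations.)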
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		group_t		*pgs;
		pg_cmt_t	*cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		pgs = &cpu->cpu_pg->pgs;
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		pgs = &cpu->cpu_pg->cmt_pgs;
		if ((idx = group_find(pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(GROUP_SIZE(pgs) == 0);
			continue;
		}

		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(pgs, idx);
		group_remove_at(pgs, idx - 1);
		err = group_add_at(pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(pgs, pg, idx - 1);
		ASSERT(err == 0);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Continue if the hardware sharing relationship has been
		 * blacklisted.
		 */
		if (cmt_hw_blacklisted[hw]) {
			continue;
		}

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED))
		return;

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
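	 *
	 * If any PG needs to be promoted, restart the traversal from the
	 * top, since the promotion may have reordered PGs higher up in
	 * the lineage.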
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup.
		 * This is done by deconfiguring it from both lgroups and
		 * processor groups, and then later reconfiguring it back
		 * in. This call to pg_cmt_cpu_fini() is part of that
		 * deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
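			 * The CMT class specific state (the active CPU
			 * group/bitset and the pghw portion) is torn down
			 * before the generic PG itself is destroyed.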
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * and bitset
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs (using a simple shell sort).
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
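	 * Swapping moves the PG that the platform ranks as more important
	 * toward the top of the lineage (the larger index), placing it
	 * higher in the hierarchy than its equal-sized neighbor.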
	 */
	for (i = 0; i < size - 1; i++) {
		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
			tmp = h[i];
			h[i] = h[i + 1];
			h[i + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs, to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg);
		}
	}

	return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * a policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the PG hierarchy.
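 * Returns 0 if the pruning completed.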
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so that future groups won't be created.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}
		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			group_t		*pgs;
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being promoted, since they have a new
			 * parent and siblings set.
			 */
			pgs = &cpu->cpu_pg->pgs;
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 */
			pgs = &cpu->cpu_pg->cmt_pgs;
			(void) group_remove(pgs, pg, GRP_NORESIZE);
			pgs = &cpu->cpu_pg->pgs;
			(void) group_remove(pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate
 * from suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune
 * the PG (and all other instances of PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count.
 Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent, and
		 * this PG should be promoted to be above its existing parent
		 * now. We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if pg has fewer
					 * CPUs than cmt_parent...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * The handling for this falls through to the next case.
		 */
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}