/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems
 * multiple top level groups are instantiated, where the top level
 * balancing begins by balancing across the CMT PGs within their
 * respective (per lgroup) top level groups.
 */
static cmt_lgrp_t *cmt_lgrps = NULL;    /* cmt_lgrps list head */
static cmt_lgrp_t *cpu0_lgrp = NULL;    /* boot CPU's initial lgrp */
                                        /* used for null_proc_lpa */
cmt_lgrp_t *cmt_root = NULL;            /* Reference to root cmt pg */

static int is_cpu0 = 1;                 /* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
 */
static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int cmt_sched_disabled = 0;

static pg_cid_t pg_cmt_class_id;        /* PG class id */

static pg_t *pg_cmt_alloc();
static void pg_cmt_free(pg_t *);
static void pg_cmt_cpu_init(cpu_t *);
static void pg_cmt_cpu_fini(cpu_t *);
static void pg_cmt_cpu_active(cpu_t *);
static void pg_cmt_cpu_inactive(cpu_t *);
static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char *pg_cmt_policy_name(pg_t *);
static void pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
static int pg_cmt_lineage_validate(pg_cmt_t **, int *);
static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
    kthread_t *, kthread_t *);
static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
    kthread_t *, kthread_t *);
static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define IS_CMT_PG(pg)   (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
    CMT_LINEAGE_VALID,
    CMT_LINEAGE_NON_CONCENTRIC,
    CMT_LINEAGE_REPAIRED,
    CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
static cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define PG_CMT_HW_SUSPECT(hw)   PGHW_IS_PM_DOMAIN(hw)

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
    pg_cmt_alloc,
    pg_cmt_free,
    pg_cmt_cpu_init,
    pg_cmt_cpu_fini,
    pg_cmt_cpu_active,
    pg_cmt_cpu_inactive,
    pg_cmt_cpupart_in,
    NULL,                   /* cpupart_out */
    pg_cmt_cpupart_move,
    pg_cmt_cpu_belongs,
    pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
    if (cmt_sched_disabled)
        return;

    pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
    pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
        cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
    if (from->cpu_physid->cpu_cacheid ==
        to->cpu_physid->cpu_cacheid)
        return (1);
    return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
    return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
    ASSERT(pg != NULL);
    ASSERT(IS_CMT_PG(pg));

    kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
    pg_cmt_policy_t p;

    /*
     * Give the platform a chance to override the default
     */
    if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
        return (p);

    switch (hw) {
    case PGHW_IPIPE:
    case PGHW_FPU:
    case PGHW_CHIP:
        return (CMT_BALANCE);
    case PGHW_CACHE:
        return (CMT_AFFINITY);
    case PGHW_POW_ACTIVE:
    case PGHW_POW_IDLE:
        return (CMT_BALANCE);
    default:
        return (CMT_NO_POLICY);
    }
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
    pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
    pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

    /*
     * A power domain is only important if CPUPM is enabled.
     */
    if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
        if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
            return (pg2);
        if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
            return (pg1);
    }

    /*
     * Otherwise, ask the platform
     */
    if (pg_plat_hw_rank(hw1, hw2) == hw1)
        return (pg1);
    else
        return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
    switch (((pghw_t *)pg)->pghw_hw) {
    case PGHW_POW_ACTIVE:
        pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
        pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
        break;
    default:
        pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
        break;
    }
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs
 * than its parent.
 */
static void
cmt_hier_promote(pg_cmt_t *pg)
{
    pg_cmt_t *parent;
    group_t *children;
    cpu_t *cpu;
    group_iter_t iter;
    pg_cpu_itr_t cpu_iter;
    int r;
    int err;

    ASSERT(MUTEX_HELD(&cpu_lock));

    parent = pg->cmt_parent;
    if (parent == NULL) {
        /*
         * Nothing to do
         */
        return;
    }

    ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

    /*
     * We're changing around the hierarchy, which is actively traversed
     * by the dispatcher. Pause CPUs to ensure exclusivity.
     */
    pause_cpus(NULL);

    /*
     * If necessary, update the parent's sibling set, replacing parent
     * with PG.
     */
    if (parent->cmt_siblings) {
        if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
            != -1) {
            r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
            ASSERT(r != -1);
        }
    }

    /*
     * If the parent is at the top of the hierarchy, replace its entry
     * in the root lgroup's group of top level PGs.
     */
    if (parent->cmt_parent == NULL &&
        parent->cmt_siblings != &cmt_root->cl_pgs) {
        if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
            != -1) {
            r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
            ASSERT(r != -1);
        }
    }

    /*
     * We assume (and therefore assert) that the PG being promoted is an
     * only child of its parent. Update the parent's children set,
     * replacing PG's entry with the parent (since the parent is becoming
     * the child). Then have PG and the parent swap children sets.
     */
    ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
    if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
        r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
        ASSERT(r != -1);
    }

    children = pg->cmt_children;
    pg->cmt_children = parent->cmt_children;
    parent->cmt_children = children;

    /*
     * Update the sibling references for PG and its parent
     */
    pg->cmt_siblings = parent->cmt_siblings;
    parent->cmt_siblings = pg->cmt_children;

    /*
     * Update any cached lineages in the per CPU pg data.
     */
    PG_CPU_ITR_INIT(pg, cpu_iter);
    while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
        int idx;
        group_t *pgs;
        pg_cmt_t *cpu_pg;

        /*
         * Iterate over the CPU's PGs updating the children
         * of the PG being promoted, since they have a new parent.
         */
        pgs = &cpu->cpu_pg->pgs;
        group_iter_init(&iter);
        while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
            if (cpu_pg->cmt_parent == pg) {
                cpu_pg->cmt_parent = parent;
            }
        }

        /*
         * Update the CMT load balancing lineage
         */
        pgs = &cpu->cpu_pg->cmt_pgs;
        if ((idx = group_find(pgs, (void *)pg)) == -1) {
            /*
             * Unless this is the CPU whose lineage is being
             * constructed, the PG being promoted should be
             * in the lineage.
             */
            ASSERT(GROUP_SIZE(pgs) == 0);
            continue;
        }

        ASSERT(idx > 0);
        ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);

        /*
         * Have the child and the parent swap places in the CPU's
         * lineage
         */
        group_remove_at(pgs, idx);
        group_remove_at(pgs, idx - 1);
        err = group_add_at(pgs, parent, idx);
        ASSERT(err == 0);
        err = group_add_at(pgs, pg, idx - 1);
        ASSERT(err == 0);
    }

    /*
     * Update the parent references for PG and its parent
     */
    pg->cmt_parent = parent->cmt_parent;
    parent->cmt_parent = pg;

    start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
    pg_cmt_t *pg;
    group_t *cmt_pgs;
    int levels, level;
    pghw_type_t hw;
    pg_t *pg_cache = NULL;
    pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
    lgrp_handle_t lgrp_handle;
    cmt_lgrp_t *lgrp;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    /*
     * A new CPU is coming into the system.
     * Interrogate the platform to see if the CPU
     * has any performance or efficiency relevant
     * sharing relationships
     */
    cmt_pgs = &cp->cpu_pg->cmt_pgs;
    cp->cpu_pg->cmt_lineage = NULL;

    bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
    levels = 0;
    for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

        pg_cmt_policy_t policy;

        /*
         * We're only interested in the hw sharing relationships
         * for which we know how to optimize.
         */
        policy = pg_cmt_policy(hw);
        if (policy == CMT_NO_POLICY ||
            pg_plat_hw_shared(cp, hw) == 0)
            continue;

        /*
         * Continue if the hardware sharing relationship has been
         * blacklisted.
         */
        if (cmt_hw_blacklisted[hw]) {
            continue;
        }

        /*
         * Find (or create) the PG associated with
         * the hw sharing relationship in which cp
         * belongs.
         *
         * Determine if a suitable PG already
         * exists, or if one needs to be created.
         */
        pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
        if (pg == NULL) {
            /*
             * Create a new one.
             * Initialize the common...
             */
            pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

            /* ... physical ... */
            pghw_init((pghw_t *)pg, cp, hw);

            /*
             * ... and CMT specific portions of the
             * structure.
             */
            pg->cmt_policy = policy;

            /* CMT event callbacks */
            cmt_callback_init((pg_t *)pg);

            bitset_init(&pg->cmt_cpus_actv_set);
            group_create(&pg->cmt_cpus_actv);
        } else {
            ASSERT(IS_CMT_PG(pg));
        }

        /* Add the CPU to the PG */
        pg_cpu_add((pg_t *)pg, cp);

        /*
         * Ensure capacity of the active CPU group/bitset
         */
        group_expand(&pg->cmt_cpus_actv,
            GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

        if (cp->cpu_seqid >=
            bitset_capacity(&pg->cmt_cpus_actv_set)) {
            bitset_resize(&pg->cmt_cpus_actv_set,
                cp->cpu_seqid + 1);
        }

        /*
         * Build a lineage of CMT PGs for load balancing / coalescence
         */
        if (policy & (CMT_BALANCE | CMT_COALESCE)) {
            cpu_cmt_hier[levels++] = pg;
        }

        /* Cache this for later */
        if (hw == PGHW_CACHE)
            pg_cache = (pg_t *)pg;
    }

    group_expand(cmt_pgs, levels);

    if (cmt_root == NULL)
        cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

    /*
     * Find the lgrp that encapsulates this CPU's CMT hierarchy
     */
    lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
    if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
        lgrp = pg_cmt_lgrp_create(lgrp_handle);

    /*
     * Sort the PGs in the lineage by number of CPUs, in ascending order
     */
    pg_cmt_hier_sort(cpu_cmt_hier, levels);

    /*
     * Examine the lineage and validate it.
     * This routine will also try to fix the lineage along with the
     * rest of the PG hierarchy should it detect an issue.
     *
     * If it returns -1, an unrecoverable error has happened and we
     * need to return.
     */
    if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0)
        return;

    /*
     * For existing PGs in the lineage, verify that the parent is
     * correct, as the generation in the lineage may have changed
     * as a result of the sorting. Start the traversal at the top
     * of the lineage, moving down.
     */
    for (level = levels - 1; level >= 0; ) {
        int reorg;

        reorg = 0;
        pg = cpu_cmt_hier[level];

        /*
         * Promote PGs at an incorrect generation into place.
         */
        while (pg->cmt_parent &&
            pg->cmt_parent != cpu_cmt_hier[level + 1]) {
            cmt_hier_promote(pg);
            reorg++;
        }
        if (reorg > 0)
            level = levels - 1;
        else
            level--;
    }

    /*
     * For each of the PGs in the CPU's lineage:
     *  - Add an entry in the CPU sorted CMT PG group
     *    which is used for top down CMT load balancing
     *  - Tie the PG into the CMT hierarchy by connecting
     *    it to its parent and siblings.
     */
    for (level = 0; level < levels; level++) {
        uint_t children;
        int err;

        pg = cpu_cmt_hier[level];
        err = group_add_at(cmt_pgs, pg, levels - level - 1);
        ASSERT(err == 0);

        if (level == 0)
            cp->cpu_pg->cmt_lineage = (pg_t *)pg;

        if (pg->cmt_siblings != NULL) {
            /* Already initialized */
            ASSERT(pg->cmt_parent == NULL ||
                pg->cmt_parent == cpu_cmt_hier[level + 1]);
            ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
                ((pg->cmt_parent != NULL) &&
                pg->cmt_siblings == pg->cmt_parent->cmt_children));
            continue;
        }

        if ((level + 1) == levels) {
            pg->cmt_parent = NULL;

            pg->cmt_siblings = &lgrp->cl_pgs;
            children = ++lgrp->cl_npgs;
            if (cmt_root != lgrp)
                cmt_root->cl_npgs++;
        } else {
            pg->cmt_parent = cpu_cmt_hier[level + 1];

            /*
             * A good parent keeps track of their children.
             * The parent's children group is also the PG's
             * siblings.
             */
            if (pg->cmt_parent->cmt_children == NULL) {
                pg->cmt_parent->cmt_children =
                    kmem_zalloc(sizeof (group_t), KM_SLEEP);
                group_create(pg->cmt_parent->cmt_children);
            }
            pg->cmt_siblings = pg->cmt_parent->cmt_children;
            children = ++pg->cmt_parent->cmt_nchildren;
        }

        group_expand(pg->cmt_siblings, children);
        group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
    }

    /*
     * Cache the chip and core IDs in the cpu_t->cpu_physid structure
     * for fast lookups later.
     */
    if (cp->cpu_physid) {
        cp->cpu_physid->cpu_chipid =
            pg_plat_hw_instance_id(cp, PGHW_CHIP);
        cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

        /*
         * If this cpu has a PG representing shared cache, then set
         * cpu_cacheid to that PG's logical id
         */
        if (pg_cache)
            cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
    }

    /* CPU0 only initialization */
    if (is_cpu0) {
        pg_cmt_cpu_startup(cp);
        is_cpu0 = 0;
        cpu0_lgrp = lgrp;
    }
}

/*
 * Class callback when a CPU is leaving the system (deletion)
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
    group_iter_t i;
    pg_cmt_t *pg;
    group_t *pgs, *cmt_pgs;
    lgrp_handle_t lgrp_handle;
    cmt_lgrp_t *lgrp;

    if (cmt_sched_disabled)
        return;

    pgs = &cp->cpu_pg->pgs;
    cmt_pgs = &cp->cpu_pg->cmt_pgs;

    /*
     * Find the lgroup that encapsulates this CPU's CMT hierarchy
     */
    lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

    lgrp = pg_cmt_find_lgrp(lgrp_handle);
    if (ncpus == 1 && lgrp != cpu0_lgrp) {
        /*
         * One might wonder how we could be deconfiguring the
         * only CPU in the system.
         *
         * On Starcat systems when null_proc_lpa is detected,
         * the boot CPU (which is already configured into a leaf
         * lgroup) is moved into the root lgroup. This is done by
         * deconfiguring it from both lgroups and processor
         * groups, and then later reconfiguring it back in. This
         * call to pg_cmt_cpu_fini() is part of that deconfiguration.
         *
         * This special case is detected by noting that the platform
         * has changed the CPU's lgrp affiliation (since it now
         * belongs in the root). In this case, use the cmt_lgrp_t
         * cached for the boot CPU, since this is what needs to be
         * torn down.
         */
        lgrp = cpu0_lgrp;
    }

    ASSERT(lgrp != NULL);

    /*
     * First, clean up anything load balancing specific for each of
     * the CPU's PGs that participated in CMT load balancing
     */
    pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
    while (pg != NULL) {

        /*
         * Remove the PG from the CPU's load balancing lineage
         */
        (void) group_remove(cmt_pgs, pg, GRP_RESIZE);

        /*
         * If it's about to become empty, destroy its children
         * group, and remove its reference from its siblings.
         * This is done here (rather than below) to avoid removing
         * our reference from a PG that we just eliminated.
         */
        if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
            if (pg->cmt_children != NULL)
                group_destroy(pg->cmt_children);
            if (pg->cmt_siblings != NULL) {
                if (pg->cmt_siblings == &lgrp->cl_pgs)
                    lgrp->cl_npgs--;
                else
                    pg->cmt_parent->cmt_nchildren--;
            }
        }
        pg = pg->cmt_parent;
    }
    ASSERT(GROUP_SIZE(cmt_pgs) == 0);

    /*
     * Now that the load balancing lineage updates have happened,
     * remove the CPU from all its PGs (destroying any that become
     * empty).
     */
    group_iter_init(&i);
    while ((pg = group_iterate(pgs, &i)) != NULL) {
        if (IS_CMT_PG(pg) == 0)
            continue;

        pg_cpu_delete((pg_t *)pg, cp);
        /*
         * Deleting the CPU from the PG changes the CPU's
         * PG group over which we are actively iterating.
         * Re-initialize the iteration.
         */
        group_iter_init(&i);

        if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

            /*
             * The PG has become zero sized, so destroy it.
             */
            group_destroy(&pg->cmt_cpus_actv);
            bitset_fini(&pg->cmt_cpus_actv_set);
            pghw_fini((pghw_t *)pg);

            pg_destroy((pg_t *)pg);
        }
    }
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
    group_t *pgs;
    pg_t *pg;
    group_iter_t i;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    pgs = &cp->cpu_pg->pgs;

    /*
     * Ensure that the new partition's PG bitset
     * is large enough for all CMT PGs to which cp
     * belongs
     */
    group_iter_init(&i);
    while ((pg = group_iterate(pgs, &i)) != NULL) {
        if (IS_CMT_PG(pg) == 0)
            continue;

        if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
            bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
    }
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
    cpu_t *cpp;
    group_t *pgs;
    pg_t *pg;
    group_iter_t pg_iter;
    pg_cpu_itr_t cpu_iter;
    boolean_t found;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    pgs = &cp->cpu_pg->pgs;
    group_iter_init(&pg_iter);

    /*
     * Iterate over the CPU's CMT PGs
     */
    while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

        if (IS_CMT_PG(pg) == 0)
            continue;

        /*
         * Add the PG to the bitset in the new partition.
         */
        bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

        /*
         * Remove the PG from the bitset in the old partition
         * if the last of the PG's CPUs have left.
         */
        found = B_FALSE;
        PG_CPU_ITR_INIT(pg, cpu_iter);
        while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
            if (cpp == cp)
                continue;
            if (CPU_ACTIVE(cpp) &&
                cpp->cpu_part->cp_id == oldpp->cp_id) {
                found = B_TRUE;
                break;
            }
        }
        if (!found)
            bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
    }
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
    int err;
    group_iter_t i;
    pg_cmt_t *pg;
    group_t *pgs;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    pgs = &cp->cpu_pg->pgs;
    group_iter_init(&i);

    /*
     * Iterate over the CPU's PGs
     */
    while ((pg = group_iterate(pgs, &i)) != NULL) {

        if (IS_CMT_PG(pg) == 0)
            continue;

        err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
        ASSERT(err == 0);

        /*
         * If this is the first active CPU in the PG, and it
         * represents a hardware sharing relationship over which
         * CMT load balancing is performed, add it as a candidate
         * for balancing with its siblings.
         */
        if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
            (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
            err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
            ASSERT(err == 0);

            /*
             * If this is a top level PG, add it as a balancing
             * candidate when balancing within the root lgroup.
             */
            if (pg->cmt_parent == NULL &&
                pg->cmt_siblings != &cmt_root->cl_pgs) {
                err = group_add(&cmt_root->cl_pgs, pg,
                    GRP_NORESIZE);
                ASSERT(err == 0);
            }
        }

        /*
         * Notate the CPU in the PG's active CPU bitset.
         * Also notate the PG as being active in its associated
         * partition.
         */
        bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
        bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
    }
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
    int err;
    group_t *pgs;
    pg_cmt_t *pg;
    cpu_t *cpp;
    group_iter_t i;
    pg_cpu_itr_t cpu_itr;
    boolean_t found;

    ASSERT(MUTEX_HELD(&cpu_lock));

    if (cmt_sched_disabled)
        return;

    pgs = &cp->cpu_pg->pgs;
    group_iter_init(&i);

    while ((pg = group_iterate(pgs, &i)) != NULL) {

        if (IS_CMT_PG(pg) == 0)
            continue;

        /*
         * Remove the CPU from the CMT PG's active CPU group
         * and bitset.
         */
        err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
        ASSERT(err == 0);

        bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

        /*
         * If there are no more active CPUs in this PG over which
         * load was balanced, remove it as a balancing candidate.
         */
        if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
            (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
            err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
            ASSERT(err == 0);

            if (pg->cmt_parent == NULL &&
                pg->cmt_siblings != &cmt_root->cl_pgs) {
                err = group_remove(&cmt_root->cl_pgs, pg,
                    GRP_NORESIZE);
                ASSERT(err == 0);
            }
        }

        /*
         * Assert the number of active CPUs does not exceed
         * the total number of CPUs in the PG
         */
        ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
            GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

        /*
         * Update the PG bitset in the CPU's old partition
         */
        found = B_FALSE;
        PG_CPU_ITR_INIT(pg, cpu_itr);
        while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
            if (cpp == cp)
                continue;
            if (CPU_ACTIVE(cpp) &&
                cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
                found = B_TRUE;
                break;
            }
        }
        if (!found) {
            bitset_del(&cp->cpu_part->cp_cmt_pgs,
                ((pg_t *)pg)->pg_id);
        }
    }
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
    cpu_t *pg_cpu;

    pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

    ASSERT(pg_cpu != NULL);

    /*
     * The CPU belongs if, given the nature of the hardware sharing
     * relationship represented by the PG, the CPU has that
     * relationship with some other CPU already in the PG
     */
    if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
        return (1);

    return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
    int i, j, inc;
    pg_t *tmp;
    pg_t **h = (pg_t **)hier;

    /*
     * First sort by number of CPUs (an ascending shell sort).
     */
    inc = size / 2;
    while (inc > 0) {
        for (i = inc; i < size; i++) {
            j = i;
            tmp = h[i];
            while ((j >= inc) &&
                (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
                h[j] = h[j - inc];
                j = j - inc;
            }
            h[j] = tmp;
        }
        if (inc == 2)
            inc = 1;
        else
            inc = (inc * 5) / 11;
    }

    /*
     * Break ties by asking the platform.
     * Determine if h[i] outranks h[i + 1] and if so, swap them.
     */
    for (i = 0; i < size - 1; i++) {
        if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
            pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
            tmp = h[i];
            h[i] = h[i + 1];
            h[i + 1] = tmp;
        }
    }
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
    cmt_lgrp_t *lgrp;

    ASSERT(MUTEX_HELD(&cpu_lock));

    lgrp = cmt_lgrps;
    while (lgrp != NULL) {
        if (lgrp->cl_hand == hand)
            break;
        lgrp = lgrp->cl_next;
    }
    return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
    cmt_lgrp_t *lgrp;

    ASSERT(MUTEX_HELD(&cpu_lock));

    lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

    lgrp->cl_hand = hand;
    lgrp->cl_npgs = 0;
    lgrp->cl_next = cmt_lgrps;
    cmt_lgrps = lgrp;
    group_create(&lgrp->cl_pgs);

    return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
    group_t *hwset;
    group_iter_t iter;
    pg_cmt_t *pg;

    ASSERT(PGHW_IS_PM_DOMAIN(type));
    ASSERT(MUTEX_HELD(&cpu_lock));

    if ((hwset = pghw_set_lookup(type)) == NULL ||
        cmt_hw_blacklisted[type]) {
        /*
         * Unable to find any instances of the specified type
         * of power domain, or the power domains have been blacklisted.
         */
        return (-1);
    }

    /*
     * Iterate over the power domains, setting the default dispatcher
     * policy for power/performance optimization.
     *
     * Simply setting the policy isn't enough in the case where the power
     * domain is an only child of another PG. Because the dispatcher walks
     * the PG hierarchy in a top down fashion, the higher up PG's policy
     * will dominate. So promote the power domain above its parent if both
     * PG and its parent have the same CPUs, to ensure its policy
     * dominates.
     */
    group_iter_init(&iter);
    while ((pg = group_iterate(hwset, &iter)) != NULL) {
        /*
         * If the power domain is an only child to a parent
         * not implementing the same policy, promote the child
         * above the parent to activate the policy.
         */
        pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
        while ((pg->cmt_parent != NULL) &&
            (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
            (PG_NUM_CPUS((pg_t *)pg) ==
            PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
            cmt_hier_promote(pg);
        }
    }

    return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
    group_t *hwset;
    group_iter_t iter;
    pg_cmt_t *pg;
    pg_cmt_t *child;

    ASSERT(PGHW_IS_PM_DOMAIN(type));
    ASSERT(MUTEX_HELD(&cpu_lock));

    if ((hwset = pghw_set_lookup(type)) == NULL) {
        /*
         * Unable to find any instances of the specified type of
         * power domain.
         */
        return (-1);
    }
    /*
     * Iterate over the power domains, setting the default dispatcher
     * policy for performance optimization (load balancing).
     */
    group_iter_init(&iter);
    while ((pg = group_iterate(hwset, &iter)) != NULL) {

        /*
         * If the power domain has an only child that implements
         * policy other than load balancing, promote the child
         * above the power domain to ensure its policy dominates.
         */
        if (pg->cmt_children != NULL &&
            GROUP_SIZE(pg->cmt_children) == 1) {
            child = GROUP_ACCESS(pg->cmt_children, 0);
            if ((child->cmt_policy & CMT_BALANCE) == 0) {
                cmt_hier_promote(child);
            }
        }
        pg->cmt_policy = CMT_BALANCE;
    }
    return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
    pg_cmt_t *cmt_pg = (pg_cmt_t *)pg;

    if (old == cp->cpu_idle_thread) {
        atomic_add_32(&cmt_pg->cmt_utilization, 1);
    } else if (new == cp->cpu_idle_thread) {
        atomic_add_32(&cmt_pg->cmt_utilization, -1);
    }
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define THREAD_RUNNABLE_IN_PG(t, pg)                        \
    ((t)->t_state == TS_RUN &&                              \
    (t)->t_disp_queue->disp_cpu &&                          \
    bitset_in_set(&(pg)->cmt_cpus_actv_set,                 \
        (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
    pg_cmt_t *cmt = (pg_cmt_t *)pg;
    cpupm_domain_t *dom;
    uint32_t u;

    if (old == cp->cpu_idle_thread) {
        ASSERT(new != cp->cpu_idle_thread);
        u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
        if (u == 1) {
            /*
             * Notify the CPU power manager that the domain
             * is non-idle.
             */
            dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
            cpupm_utilization_event(cp, now, dom,
                CPUPM_DOM_BUSY_FROM_IDLE);
        }
    } else if (new == cp->cpu_idle_thread) {
        ASSERT(old != cp->cpu_idle_thread);
        u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
        if (u == 0) {
            /*
             * The domain is idle, notify the CPU power
             * manager.
             *
             * Avoid notifying if the thread is simply migrating
             * between CPUs in the domain.
             */
            if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
                dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
                cpupm_utilization_event(cp, now, dom,
                    CPUPM_DOM_IDLE_FROM_BUSY);
            }
        }
    }
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
    pg_cmt_t *cmt = (pg_cmt_t *)pg;
    cpupm_domain_t *dom;

    dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
    cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
    pg_cmt_policy_t policy;

    policy = ((pg_cmt_t *)pg)->cmt_policy;

    if (policy & CMT_AFFINITY) {
        if (policy & CMT_BALANCE)
            return ("Load Balancing & Affinity");
        else if (policy & CMT_COALESCE)
            return ("Load Coalescence & Affinity");
        else
            return ("Affinity");
    } else {
        if (policy & CMT_BALANCE)
            return ("Load Balancing");
        else if (policy & CMT_COALESCE)
            return ("Load Coalescence");
        else
            return ("None");
    }
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the PG hierarchy.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
{
    group_t *hwset, *children;
    int i, j, r, size = *sz;
    group_iter_t hw_iter, child_iter;
    pg_cpu_itr_t cpu_iter;
    pg_cmt_t *pg, *child;
    cpu_t *cpu;
    int cap_needed;
    pghw_type_t hw;

    ASSERT(MUTEX_HELD(&cpu_lock));

    hw = ((pghw_t *)pg_bad)->pghw_hw;

    if (hw == PGHW_POW_ACTIVE) {
        cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
            "Event Based CPUPM Unavailable");
    } else if (hw == PGHW_POW_IDLE) {
        cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
            "Dispatcher assisted CPUPM disabled.");
    }

    /*
     * Find and eliminate the PG from the lineage.
     */
    for (i = 0; i < size; i++) {
        if (lineage[i] == pg_bad) {
            for (j = i; j < size - 1; j++)
                lineage[j] = lineage[j + 1];
            *sz = size - 1;
            break;
        }
    }

    /*
     * We'll prune all instances of the hardware sharing relationship
     * represented by pg. But before we do that (and pause CPUs) we need
     * to ensure the hierarchy's groups are properly sized.
     */
    hwset = pghw_set_lookup(hw);

    /*
     * Blacklist the hardware so that future groups won't be created.
     */
    cmt_hw_blacklisted[hw] = 1;

    /*
     * For each of the PGs being pruned, ensure sufficient capacity in
     * the siblings set for the PG's children
     */
    group_iter_init(&hw_iter);
    while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
        /*
         * PG is being pruned, but if it is bringing up more than
         * one child, ask for more capacity in the siblings group.
         */
        cap_needed = 0;
        if (pg->cmt_children &&
            GROUP_SIZE(pg->cmt_children) > 1) {
            cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

            group_expand(pg->cmt_siblings,
                GROUP_SIZE(pg->cmt_siblings) + cap_needed);

            /*
             * If this is a top level group, also ensure the
             * capacity in the root lgrp level CMT grouping.
             */
            if (pg->cmt_parent == NULL &&
                pg->cmt_siblings != &cmt_root->cl_pgs) {
                group_expand(&cmt_root->cl_pgs,
                    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
            }
        }
    }

    /*
     * We're operating on the PG hierarchy. Pause CPUs to ensure
     * exclusivity with respect to the dispatcher.
     */
    pause_cpus(NULL);

    /*
     * Prune all PG instances of the hardware sharing relationship
     * represented by pg.
     */
    group_iter_init(&hw_iter);
    while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

        /*
         * Remove PG from its group of siblings, if it's there.
         */
        if (pg->cmt_siblings) {
            (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
        }
        if (pg->cmt_parent == NULL &&
            pg->cmt_siblings != &cmt_root->cl_pgs) {
            (void) group_remove(&cmt_root->cl_pgs, pg,
                GRP_NORESIZE);
        }
        /*
         * Add PG's children to its group of siblings.
         */
        if (pg->cmt_children != NULL) {
            children = pg->cmt_children;

            group_iter_init(&child_iter);
            while ((child = group_iterate(children, &child_iter))
                != NULL) {
                /*
                 * Transplant child from its siblings set to
                 * PG's.
                 */
                if (pg->cmt_siblings != NULL &&
                    child->cmt_siblings != NULL &&
                    group_remove(child->cmt_siblings, child,
                    GRP_NORESIZE) != -1) {
                    r = group_add(pg->cmt_siblings, child,
                        GRP_NORESIZE);
                    ASSERT(r == 0);
                }
            }
        }

        /*
         * Reset the callbacks to the defaults
         */
        pg_callback_set_defaults((pg_t *)pg);

        /*
         * Update all the CPU lineages in each of PG's CPUs
         */
        PG_CPU_ITR_INIT(pg, cpu_iter);
        while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
            group_t *pgs;
            pg_cmt_t *cpu_pg;
            group_iter_t liter;     /* Iterator for the lineage */

            /*
             * Iterate over the CPU's PGs updating the children
             * of the PG being promoted, since they have a new
             * parent and siblings set.
             */
            pgs = &cpu->cpu_pg->pgs;
            group_iter_init(&liter);
            while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
                if (cpu_pg->cmt_parent == pg) {
                    cpu_pg->cmt_parent = pg->cmt_parent;
                    cpu_pg->cmt_siblings = pg->cmt_siblings;
                }
            }

            /*
             * Update the CPU's lineages
             */
            pgs = &cpu->cpu_pg->cmt_pgs;
            (void) group_remove(pgs, pg, GRP_NORESIZE);
            pgs = &cpu->cpu_pg->pgs;
            (void) group_remove(pgs, pg, GRP_NORESIZE);
        }
    }
    start_cpus();
    return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
    cpu_t *cpu;

    pause_cpus(NULL);
    cpu = cpu_list;

    do {
        if (cpu->cpu_pg)
            group_empty(&cpu->cpu_pg->cmt_pgs);
    } while ((cpu = cpu->cpu_next) != cpu_list);

    cmt_sched_disabled = 1;
    start_cpus();
    cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

static int
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
{
    int i, size;
    pg_cmt_t *pg, *parent, *pg_bad;
    cpu_t *cp;
    pg_cpu_itr_t cpu_iter;

    ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
    size = *sz;
    pg_bad = NULL;
    for (i = 0; i < size - 1; i++) {

        pg = lineage[i];
        parent = lineage[i + 1];

        /*
         * We assume that the lineage has already been sorted
         * by the number of CPUs. In fact, we depend on it.
         */
        ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent));

        /*
         * Walk each of the CPUs in the PG's group, and verify that
         * the next larger PG contains at least the CPUs in this one.
         */
        PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
        while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
            if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
                cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
                goto handle_error;
            }
        }
    }

handle_error:
    switch (cmt_lineage_status) {
    case CMT_LINEAGE_VALID:
    case CMT_LINEAGE_REPAIRED:
        break;
    case CMT_LINEAGE_NON_CONCENTRIC:
        /*
         * We've detected a non-concentric PG lineage.
         *
         * This can happen when some of the CPU grouping information
         * is derived from buggy sources (for example, incorrect ACPI
         * tables on x86 systems).
         *
         * We attempt to recover from this by pruning out the
         * illegal groupings from the PG hierarchy, which means that
         * we won't optimize for those levels, but we will for the
         * remaining ones.
         *
         * If a given level has CPUs not found in its parent, then
         * we examine the PG and its parent to see if either grouping
         * is enumerated from potentially buggy sources.
         *
         * If one has fewer CPUs than the other, and contains CPUs
         * not found in the parent, and it is an untrusted enumeration,
         * then prune it. If both have the same number of CPUs, then
         * prune the one that is untrusted.
         *
         * This process repeats until we have a concentric lineage,
         * or we would have to prune out a level derived from what we
         * thought was a reliable source, in which case CMT scheduling
         * is disabled altogether.
         */
        if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
            (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
            pg_bad = pg;
        } else if (PG_NUM_CPUS((pg_t *)pg) ==
            PG_NUM_CPUS((pg_t *)parent)) {
            if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
                pg_bad = parent;
            } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
                pg_bad = pg;
            }
        }
        if (pg_bad) {
            if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
                cmt_lineage_status = CMT_LINEAGE_REPAIRED;
                goto revalidate;
            }
        }
        /*FALLTHROUGH*/
    default:
        /*
         * If we're here, something has gone wrong in trying to
         * recover from an illegal PG hierarchy, or we've encountered
         * a validation error for which we don't know how to recover.
         * In this case, disable CMT scheduling altogether.
         */
        pg_cmt_disable();
        cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
        return (-1);
    }
    return (0);
}