1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/systm.h> 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/thread.h> 30 #include <sys/cpuvar.h> 31 #include <sys/cpupart.h> 32 #include <sys/kmem.h> 33 #include <sys/cmn_err.h> 34 #include <sys/kstat.h> 35 #include <sys/processor.h> 36 #include <sys/disp.h> 37 #include <sys/group.h> 38 #include <sys/pghw.h> 39 #include <sys/bitset.h> 40 #include <sys/lgrp.h> 41 #include <sys/cmt.h> 42 #include <sys/cpu_pm.h> 43 44 /* 45 * CMT scheduler / dispatcher support 46 * 47 * This file implements CMT scheduler support using Processor Groups. 48 * The CMT processor group class creates and maintains the CMT class 49 * specific processor group pg_cmt_t. 
 *
 *	----------------------------  <-- pg_cmt_t *
 *	| pghw_t                   |
 *	----------------------------
 *	| CMT class specific data  |
 *	| - hierarchy linkage      |
 *	| - CMT load balancing data|
 *	| - active CPU group/bitset|
 *	----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between cpus to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup. For root homed threads,
 * however, it is willing to balance against all the top level PGs in the
 * system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
86 */ 87 static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */ 88 static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */ 89 /* used for null_proc_lpa */ 90 cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */ 91 92 static int is_cpu0 = 1; /* true if this is boot CPU context */ 93 94 /* 95 * Array of hardware sharing relationships that are blacklisted. 96 * CMT scheduling optimizations won't be performed for blacklisted sharing 97 * relationships. 98 */ 99 static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS]; 100 101 /* 102 * Set this to non-zero to disable CMT scheduling 103 * This must be done via kmdb -d, as /etc/system will be too late 104 */ 105 int cmt_sched_disabled = 0; 106 107 /* 108 * Status codes for CMT lineage validation 109 * See pg_cmt_lineage_validate() below 110 */ 111 typedef enum cmt_lineage_validation { 112 CMT_LINEAGE_VALID, 113 CMT_LINEAGE_NON_CONCENTRIC, 114 CMT_LINEAGE_PG_SPANS_LGRPS, 115 CMT_LINEAGE_NON_PROMOTABLE, 116 CMT_LINEAGE_REPAIRED, 117 CMT_LINEAGE_UNRECOVERABLE 118 } cmt_lineage_validation_t; 119 120 /* 121 * Status of the current lineage under construction. 122 * One must be holding cpu_lock to change this. 123 */ 124 cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID; 125 126 /* 127 * Power domain definitions (on x86) are defined by ACPI, and 128 * therefore may be subject to BIOS bugs. 
129 */ 130 #define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw) 131 132 /* 133 * Macro to test if PG is managed by the CMT PG class 134 */ 135 #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id) 136 137 static pg_cid_t pg_cmt_class_id; /* PG class id */ 138 139 static pg_t *pg_cmt_alloc(); 140 static void pg_cmt_free(pg_t *); 141 static void pg_cmt_cpu_init(cpu_t *, cpu_pg_t *); 142 static void pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *); 143 static void pg_cmt_cpu_active(cpu_t *); 144 static void pg_cmt_cpu_inactive(cpu_t *); 145 static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *); 146 static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *); 147 static char *pg_cmt_policy_name(pg_t *); 148 static void pg_cmt_hier_sort(pg_cmt_t **, int); 149 static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *); 150 static int pg_cmt_cpu_belongs(pg_t *, cpu_t *); 151 static int pg_cmt_hw(pghw_type_t); 152 static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t); 153 static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t); 154 static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t, 155 kthread_t *, kthread_t *); 156 static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t, 157 kthread_t *, kthread_t *); 158 static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *); 159 static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *, 160 cpu_pg_t *); 161 162 163 /* 164 * CMT PG ops 165 */ 166 struct pg_ops pg_ops_cmt = { 167 pg_cmt_alloc, 168 pg_cmt_free, 169 pg_cmt_cpu_init, 170 pg_cmt_cpu_fini, 171 pg_cmt_cpu_active, 172 pg_cmt_cpu_inactive, 173 pg_cmt_cpupart_in, 174 NULL, /* cpupart_out */ 175 pg_cmt_cpupart_move, 176 pg_cmt_cpu_belongs, 177 pg_cmt_policy_name, 178 }; 179 180 /* 181 * Initialize the CMT PG class 182 */ 183 void 184 pg_cmt_class_init(void) 185 { 186 if (cmt_sched_disabled) 187 return; 188 189 pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL); 190 } 191 192 /* 193 * Called to indicate a new CPU 
has started up so 194 * that either t0 or the slave startup thread can 195 * be accounted for. 196 */ 197 void 198 pg_cmt_cpu_startup(cpu_t *cp) 199 { 200 pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread, 201 cp->cpu_thread); 202 } 203 204 /* 205 * Return non-zero if thread can migrate between "from" and "to" 206 * without a performance penalty 207 */ 208 int 209 pg_cmt_can_migrate(cpu_t *from, cpu_t *to) 210 { 211 if (from->cpu_physid->cpu_cacheid == 212 to->cpu_physid->cpu_cacheid) 213 return (1); 214 return (0); 215 } 216 217 /* 218 * CMT class specific PG allocation 219 */ 220 static pg_t * 221 pg_cmt_alloc(void) 222 { 223 return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); 224 } 225 226 /* 227 * Class specific PG de-allocation 228 */ 229 static void 230 pg_cmt_free(pg_t *pg) 231 { 232 ASSERT(pg != NULL); 233 ASSERT(IS_CMT_PG(pg)); 234 235 kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); 236 } 237 238 /* 239 * Given a hardware sharing relationship, return which dispatcher 240 * policies should be implemented to optimize performance and efficiency 241 */ 242 static pg_cmt_policy_t 243 pg_cmt_policy(pghw_type_t hw) 244 { 245 pg_cmt_policy_t p; 246 247 /* 248 * Give the platform a chance to override the default 249 */ 250 if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY) 251 return (p); 252 253 switch (hw) { 254 case PGHW_IPIPE: 255 case PGHW_FPU: 256 case PGHW_CHIP: 257 return (CMT_BALANCE); 258 case PGHW_CACHE: 259 return (CMT_AFFINITY); 260 case PGHW_POW_ACTIVE: 261 case PGHW_POW_IDLE: 262 return (CMT_BALANCE); 263 default: 264 return (CMT_NO_POLICY); 265 } 266 } 267 268 /* 269 * Rank the importance of optimizing for the pg1 relationship vs. 270 * the pg2 relationship. 271 */ 272 static pg_cmt_t * 273 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2) 274 { 275 pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw; 276 pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw; 277 278 /* 279 * A power domain is only important if CPUPM is enabled. 
280 */ 281 if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) { 282 if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2)) 283 return (pg2); 284 if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1)) 285 return (pg1); 286 } 287 288 /* 289 * Otherwise, ask the platform 290 */ 291 if (pg_plat_hw_rank(hw1, hw2) == hw1) 292 return (pg1); 293 else 294 return (pg2); 295 } 296 297 /* 298 * Initialize CMT callbacks for the given PG 299 */ 300 static void 301 cmt_callback_init(pg_t *pg) 302 { 303 /* 304 * Stick with the default callbacks if there isn't going to be 305 * any CMT thread placement optimizations implemented. 306 */ 307 if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY) 308 return; 309 310 switch (((pghw_t *)pg)->pghw_hw) { 311 case PGHW_POW_ACTIVE: 312 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr; 313 pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr; 314 break; 315 default: 316 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch; 317 318 } 319 } 320 321 /* 322 * Promote PG above it's current parent. 323 * This is only legal if PG has an equal or greater number of CPUs than its 324 * parent. 325 * 326 * This routine operates on the CPU specific processor group data (for the CPUs 327 * in the PG being promoted), and may be invoked from a context where one CPU's 328 * PG data is under construction. In this case the argument "pgdata", if not 329 * NULL, is a reference to the CPU's under-construction PG data. 330 */ 331 static void 332 cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata) 333 { 334 pg_cmt_t *parent; 335 group_t *children; 336 cpu_t *cpu; 337 group_iter_t iter; 338 pg_cpu_itr_t cpu_iter; 339 int r; 340 int err; 341 342 ASSERT(MUTEX_HELD(&cpu_lock)); 343 344 parent = pg->cmt_parent; 345 if (parent == NULL) { 346 /* 347 * Nothing to do 348 */ 349 return; 350 } 351 352 ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent)); 353 354 /* 355 * We're changing around the hierarchy, which is actively traversed 356 * by the dispatcher. 
Pause CPUS to ensure exclusivity. 357 */ 358 pause_cpus(NULL); 359 360 /* 361 * If necessary, update the parent's sibling set, replacing parent 362 * with PG. 363 */ 364 if (parent->cmt_siblings) { 365 if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE) 366 != -1) { 367 r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE); 368 ASSERT(r != -1); 369 } 370 } 371 372 /* 373 * If the parent is at the top of the hierarchy, replace it's entry 374 * in the root lgroup's group of top level PGs. 375 */ 376 if (parent->cmt_parent == NULL && 377 parent->cmt_siblings != &cmt_root->cl_pgs) { 378 if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE) 379 != -1) { 380 r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE); 381 ASSERT(r != -1); 382 } 383 } 384 385 /* 386 * We assume (and therefore assert) that the PG being promoted is an 387 * only child of it's parent. Update the parent's children set 388 * replacing PG's entry with the parent (since the parent is becoming 389 * the child). Then have PG and the parent swap children sets. 390 */ 391 ASSERT(GROUP_SIZE(parent->cmt_children) <= 1); 392 if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) { 393 r = group_add(parent->cmt_children, parent, GRP_NORESIZE); 394 ASSERT(r != -1); 395 } 396 397 children = pg->cmt_children; 398 pg->cmt_children = parent->cmt_children; 399 parent->cmt_children = children; 400 401 /* 402 * Update the sibling references for PG and it's parent 403 */ 404 pg->cmt_siblings = parent->cmt_siblings; 405 parent->cmt_siblings = pg->cmt_children; 406 407 /* 408 * Update any cached lineages in the per CPU pg data. 409 */ 410 PG_CPU_ITR_INIT(pg, cpu_iter); 411 while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { 412 int idx; 413 pg_cmt_t *cpu_pg; 414 cpu_pg_t *pgd; /* CPU's PG data */ 415 416 /* 417 * The CPU's whose lineage is under construction still 418 * references the bootstrap CPU PG data structure. 
419 */ 420 if (pg_cpu_is_bootstrapped(cpu)) 421 pgd = pgdata; 422 else 423 pgd = cpu->cpu_pg; 424 425 /* 426 * Iterate over the CPU's PGs updating the children 427 * of the PG being promoted, since they have a new parent. 428 */ 429 group_iter_init(&iter); 430 while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) { 431 if (cpu_pg->cmt_parent == pg) { 432 cpu_pg->cmt_parent = parent; 433 } 434 } 435 436 /* 437 * Update the CMT load balancing lineage 438 */ 439 if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) { 440 /* 441 * Unless this is the CPU who's lineage is being 442 * constructed, the PG being promoted should be 443 * in the lineage. 444 */ 445 ASSERT(pg_cpu_is_bootstrapped(cpu)); 446 continue; 447 } 448 449 ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent); 450 ASSERT(idx > 0); 451 452 /* 453 * Have the child and the parent swap places in the CPU's 454 * lineage 455 */ 456 group_remove_at(&pgd->cmt_pgs, idx); 457 group_remove_at(&pgd->cmt_pgs, idx - 1); 458 err = group_add_at(&pgd->cmt_pgs, parent, idx); 459 ASSERT(err == 0); 460 err = group_add_at(&pgd->cmt_pgs, pg, idx - 1); 461 ASSERT(err == 0); 462 } 463 464 /* 465 * Update the parent references for PG and it's parent 466 */ 467 pg->cmt_parent = parent->cmt_parent; 468 parent->cmt_parent = pg; 469 470 start_cpus(); 471 } 472 473 /* 474 * CMT class callback for a new CPU entering the system 475 * 476 * This routine operates on the CPU specific processor group data (for the CPU 477 * being initialized). The argument "pgdata" is a reference to the CPU's PG 478 * data to be constructed. 479 * 480 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data 481 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it 482 * calls must be careful to operate only on the "pgdata" argument, and not 483 * cp->cpu_pg. 
484 */ 485 static void 486 pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) 487 { 488 pg_cmt_t *pg; 489 group_t *cmt_pgs; 490 int levels, level; 491 pghw_type_t hw; 492 pg_t *pg_cache = NULL; 493 pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS]; 494 lgrp_handle_t lgrp_handle; 495 cmt_lgrp_t *lgrp; 496 cmt_lineage_validation_t lineage_status; 497 498 ASSERT(MUTEX_HELD(&cpu_lock)); 499 ASSERT(pg_cpu_is_bootstrapped(cp)); 500 501 if (cmt_sched_disabled) 502 return; 503 504 /* 505 * A new CPU is coming into the system. 506 * Interrogate the platform to see if the CPU 507 * has any performance or efficiency relevant 508 * sharing relationships 509 */ 510 cmt_pgs = &pgdata->cmt_pgs; 511 pgdata->cmt_lineage = NULL; 512 513 bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier)); 514 levels = 0; 515 for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) { 516 517 pg_cmt_policy_t policy; 518 519 /* 520 * We're only interested in the hw sharing relationships 521 * for which we know how to optimize. 522 */ 523 policy = pg_cmt_policy(hw); 524 if (policy == CMT_NO_POLICY || 525 pg_plat_hw_shared(cp, hw) == 0) 526 continue; 527 528 /* 529 * We will still create the PGs for hardware sharing 530 * relationships that have been blacklisted, but won't 531 * implement CMT thread placement optimizations against them. 532 */ 533 if (cmt_hw_blacklisted[hw] == 1) 534 policy = CMT_NO_POLICY; 535 536 /* 537 * Find (or create) the PG associated with 538 * the hw sharing relationship in which cp 539 * belongs. 540 * 541 * Determine if a suitable PG already 542 * exists, or if one needs to be created. 543 */ 544 pg = (pg_cmt_t *)pghw_place_cpu(cp, hw); 545 if (pg == NULL) { 546 /* 547 * Create a new one. 548 * Initialize the common... 549 */ 550 pg = (pg_cmt_t *)pg_create(pg_cmt_class_id); 551 552 /* ... physical ... */ 553 pghw_init((pghw_t *)pg, cp, hw); 554 555 /* 556 * ... and CMT specific portions of the 557 * structure. 
558 */ 559 pg->cmt_policy = policy; 560 561 /* CMT event callbacks */ 562 cmt_callback_init((pg_t *)pg); 563 564 bitset_init(&pg->cmt_cpus_actv_set); 565 group_create(&pg->cmt_cpus_actv); 566 } else { 567 ASSERT(IS_CMT_PG(pg)); 568 } 569 570 /* Add the CPU to the PG */ 571 pg_cpu_add((pg_t *)pg, cp, pgdata); 572 573 /* 574 * Ensure capacity of the active CPU group/bitset 575 */ 576 group_expand(&pg->cmt_cpus_actv, 577 GROUP_SIZE(&((pg_t *)pg)->pg_cpus)); 578 579 if (cp->cpu_seqid >= 580 bitset_capacity(&pg->cmt_cpus_actv_set)) { 581 bitset_resize(&pg->cmt_cpus_actv_set, 582 cp->cpu_seqid + 1); 583 } 584 585 /* 586 * Build a lineage of CMT PGs for load balancing / coalescence 587 */ 588 if (policy & (CMT_BALANCE | CMT_COALESCE)) { 589 cpu_cmt_hier[levels++] = pg; 590 } 591 592 /* Cache this for later */ 593 if (hw == PGHW_CACHE) 594 pg_cache = (pg_t *)pg; 595 } 596 597 group_expand(cmt_pgs, levels); 598 599 if (cmt_root == NULL) 600 cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand()); 601 602 /* 603 * Find the lgrp that encapsulates this CPU's CMT hierarchy 604 */ 605 lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id); 606 if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL) 607 lgrp = pg_cmt_lgrp_create(lgrp_handle); 608 609 /* 610 * Ascendingly sort the PGs in the lineage by number of CPUs 611 */ 612 pg_cmt_hier_sort(cpu_cmt_hier, levels); 613 614 /* 615 * Examine the lineage and validate it. 616 * This routine will also try to fix the lineage along with the 617 * rest of the PG hierarchy should it detect an issue. 618 * 619 * If it returns anything other than VALID or REPAIRED, an 620 * unrecoverable error has occurred, and we cannot proceed. 
621 */ 622 lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata); 623 if ((lineage_status != CMT_LINEAGE_VALID) && 624 (lineage_status != CMT_LINEAGE_REPAIRED)) { 625 /* 626 * In the case of an unrecoverable error where CMT scheduling 627 * has been disabled, assert that the under construction CPU's 628 * PG data has an empty CMT load balancing lineage. 629 */ 630 ASSERT((cmt_sched_disabled == 0) || 631 (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0)); 632 return; 633 } 634 635 /* 636 * For existing PGs in the lineage, verify that the parent is 637 * correct, as the generation in the lineage may have changed 638 * as a result of the sorting. Start the traversal at the top 639 * of the lineage, moving down. 640 */ 641 for (level = levels - 1; level >= 0; ) { 642 int reorg; 643 644 reorg = 0; 645 pg = cpu_cmt_hier[level]; 646 647 /* 648 * Promote PGs at an incorrect generation into place. 649 */ 650 while (pg->cmt_parent && 651 pg->cmt_parent != cpu_cmt_hier[level + 1]) { 652 cmt_hier_promote(pg, pgdata); 653 reorg++; 654 } 655 if (reorg > 0) 656 level = levels - 1; 657 else 658 level--; 659 } 660 661 /* 662 * For each of the PGs in the CPU's lineage: 663 * - Add an entry in the CPU sorted CMT PG group 664 * which is used for top down CMT load balancing 665 * - Tie the PG into the CMT hierarchy by connecting 666 * it to it's parent and siblings. 
667 */ 668 for (level = 0; level < levels; level++) { 669 uint_t children; 670 int err; 671 672 pg = cpu_cmt_hier[level]; 673 err = group_add_at(cmt_pgs, pg, levels - level - 1); 674 ASSERT(err == 0); 675 676 if (level == 0) 677 pgdata->cmt_lineage = (pg_t *)pg; 678 679 if (pg->cmt_siblings != NULL) { 680 /* Already initialized */ 681 ASSERT(pg->cmt_parent == NULL || 682 pg->cmt_parent == cpu_cmt_hier[level + 1]); 683 ASSERT(pg->cmt_siblings == &lgrp->cl_pgs || 684 ((pg->cmt_parent != NULL) && 685 pg->cmt_siblings == pg->cmt_parent->cmt_children)); 686 continue; 687 } 688 689 if ((level + 1) == levels) { 690 pg->cmt_parent = NULL; 691 692 pg->cmt_siblings = &lgrp->cl_pgs; 693 children = ++lgrp->cl_npgs; 694 if (cmt_root != lgrp) 695 cmt_root->cl_npgs++; 696 } else { 697 pg->cmt_parent = cpu_cmt_hier[level + 1]; 698 699 /* 700 * A good parent keeps track of their children. 701 * The parent's children group is also the PG's 702 * siblings. 703 */ 704 if (pg->cmt_parent->cmt_children == NULL) { 705 pg->cmt_parent->cmt_children = 706 kmem_zalloc(sizeof (group_t), KM_SLEEP); 707 group_create(pg->cmt_parent->cmt_children); 708 } 709 pg->cmt_siblings = pg->cmt_parent->cmt_children; 710 children = ++pg->cmt_parent->cmt_nchildren; 711 } 712 713 group_expand(pg->cmt_siblings, children); 714 group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs); 715 } 716 717 /* 718 * Cache the chip and core IDs in the cpu_t->cpu_physid structure 719 * for fast lookups later. 
720 */ 721 if (cp->cpu_physid) { 722 cp->cpu_physid->cpu_chipid = 723 pg_plat_hw_instance_id(cp, PGHW_CHIP); 724 cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp); 725 726 /* 727 * If this cpu has a PG representing shared cache, then set 728 * cpu_cacheid to that PG's logical id 729 */ 730 if (pg_cache) 731 cp->cpu_physid->cpu_cacheid = pg_cache->pg_id; 732 } 733 734 /* CPU0 only initialization */ 735 if (is_cpu0) { 736 is_cpu0 = 0; 737 cpu0_lgrp = lgrp; 738 } 739 740 } 741 742 /* 743 * Class callback when a CPU is leaving the system (deletion) 744 * 745 * "pgdata" is a reference to the CPU's PG data to be deconstructed. 746 * 747 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data 748 * references a "bootstrap" structure across this function's invocation. 749 * pg_cmt_cpu_init() and the routines it calls must be careful to operate only 750 * on the "pgdata" argument, and not cp->cpu_pg. 751 */ 752 static void 753 pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata) 754 { 755 group_iter_t i; 756 pg_cmt_t *pg; 757 group_t *pgs, *cmt_pgs; 758 lgrp_handle_t lgrp_handle; 759 cmt_lgrp_t *lgrp; 760 761 if (cmt_sched_disabled) 762 return; 763 764 ASSERT(pg_cpu_is_bootstrapped(cp)); 765 766 pgs = &pgdata->pgs; 767 cmt_pgs = &pgdata->cmt_pgs; 768 769 /* 770 * Find the lgroup that encapsulates this CPU's CMT hierarchy 771 */ 772 lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id); 773 774 lgrp = pg_cmt_find_lgrp(lgrp_handle); 775 if (ncpus == 1 && lgrp != cpu0_lgrp) { 776 /* 777 * One might wonder how we could be deconfiguring the 778 * only CPU in the system. 779 * 780 * On Starcat systems when null_proc_lpa is detected, 781 * the boot CPU (which is already configured into a leaf 782 * lgroup), is moved into the root lgroup. This is done by 783 * deconfiguring it from both lgroups and processor 784 * groups), and then later reconfiguring it back in. This 785 * call to pg_cmt_cpu_fini() is part of that deconfiguration. 
786 * 787 * This special case is detected by noting that the platform 788 * has changed the CPU's lgrp affiliation (since it now 789 * belongs in the root). In this case, use the cmt_lgrp_t 790 * cached for the boot CPU, since this is what needs to be 791 * torn down. 792 */ 793 lgrp = cpu0_lgrp; 794 } 795 796 ASSERT(lgrp != NULL); 797 798 /* 799 * First, clean up anything load balancing specific for each of 800 * the CPU's PGs that participated in CMT load balancing 801 */ 802 pg = (pg_cmt_t *)pgdata->cmt_lineage; 803 while (pg != NULL) { 804 805 /* 806 * Remove the PG from the CPU's load balancing lineage 807 */ 808 (void) group_remove(cmt_pgs, pg, GRP_RESIZE); 809 810 /* 811 * If it's about to become empty, destroy it's children 812 * group, and remove it's reference from it's siblings. 813 * This is done here (rather than below) to avoid removing 814 * our reference from a PG that we just eliminated. 815 */ 816 if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) { 817 if (pg->cmt_children != NULL) 818 group_destroy(pg->cmt_children); 819 if (pg->cmt_siblings != NULL) { 820 if (pg->cmt_siblings == &lgrp->cl_pgs) 821 lgrp->cl_npgs--; 822 else 823 pg->cmt_parent->cmt_nchildren--; 824 } 825 } 826 pg = pg->cmt_parent; 827 } 828 ASSERT(GROUP_SIZE(cmt_pgs) == 0); 829 830 /* 831 * Now that the load balancing lineage updates have happened, 832 * remove the CPU from all it's PGs (destroying any that become 833 * empty). 834 */ 835 group_iter_init(&i); 836 while ((pg = group_iterate(pgs, &i)) != NULL) { 837 if (IS_CMT_PG(pg) == 0) 838 continue; 839 840 pg_cpu_delete((pg_t *)pg, cp, pgdata); 841 /* 842 * Deleting the CPU from the PG changes the CPU's 843 * PG group over which we are actively iterating 844 * Re-initialize the iteration 845 */ 846 group_iter_init(&i); 847 848 if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) { 849 850 /* 851 * The PG has become zero sized, so destroy it. 
852 */ 853 group_destroy(&pg->cmt_cpus_actv); 854 bitset_fini(&pg->cmt_cpus_actv_set); 855 pghw_fini((pghw_t *)pg); 856 857 pg_destroy((pg_t *)pg); 858 } 859 } 860 } 861 862 /* 863 * Class callback when a CPU is entering a cpu partition 864 */ 865 static void 866 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp) 867 { 868 group_t *pgs; 869 pg_t *pg; 870 group_iter_t i; 871 872 ASSERT(MUTEX_HELD(&cpu_lock)); 873 874 if (cmt_sched_disabled) 875 return; 876 877 pgs = &cp->cpu_pg->pgs; 878 879 /* 880 * Ensure that the new partition's PG bitset 881 * is large enough for all CMT PG's to which cp 882 * belongs 883 */ 884 group_iter_init(&i); 885 while ((pg = group_iterate(pgs, &i)) != NULL) { 886 if (IS_CMT_PG(pg) == 0) 887 continue; 888 889 if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id) 890 bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1); 891 } 892 } 893 894 /* 895 * Class callback when a CPU is actually moving partitions 896 */ 897 static void 898 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp) 899 { 900 cpu_t *cpp; 901 group_t *pgs; 902 pg_t *pg; 903 group_iter_t pg_iter; 904 pg_cpu_itr_t cpu_iter; 905 boolean_t found; 906 907 ASSERT(MUTEX_HELD(&cpu_lock)); 908 909 if (cmt_sched_disabled) 910 return; 911 912 pgs = &cp->cpu_pg->pgs; 913 group_iter_init(&pg_iter); 914 915 /* 916 * Iterate over the CPUs CMT PGs 917 */ 918 while ((pg = group_iterate(pgs, &pg_iter)) != NULL) { 919 920 if (IS_CMT_PG(pg) == 0) 921 continue; 922 923 /* 924 * Add the PG to the bitset in the new partition. 925 */ 926 bitset_add(&newpp->cp_cmt_pgs, pg->pg_id); 927 928 /* 929 * Remove the PG from the bitset in the old partition 930 * if the last of the PG's CPUs have left. 
931 */ 932 found = B_FALSE; 933 PG_CPU_ITR_INIT(pg, cpu_iter); 934 while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) { 935 if (cpp == cp) 936 continue; 937 if (CPU_ACTIVE(cpp) && 938 cpp->cpu_part->cp_id == oldpp->cp_id) { 939 found = B_TRUE; 940 break; 941 } 942 } 943 if (!found) 944 bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id); 945 } 946 } 947 948 /* 949 * Class callback when a CPU becomes active (online) 950 * 951 * This is called in a context where CPUs are paused 952 */ 953 static void 954 pg_cmt_cpu_active(cpu_t *cp) 955 { 956 int err; 957 group_iter_t i; 958 pg_cmt_t *pg; 959 group_t *pgs; 960 961 ASSERT(MUTEX_HELD(&cpu_lock)); 962 963 if (cmt_sched_disabled) 964 return; 965 966 pgs = &cp->cpu_pg->pgs; 967 group_iter_init(&i); 968 969 /* 970 * Iterate over the CPU's PGs 971 */ 972 while ((pg = group_iterate(pgs, &i)) != NULL) { 973 974 if (IS_CMT_PG(pg) == 0) 975 continue; 976 977 err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); 978 ASSERT(err == 0); 979 980 /* 981 * If this is the first active CPU in the PG, and it 982 * represents a hardware sharing relationship over which 983 * CMT load balancing is performed, add it as a candidate 984 * for balancing with it's siblings. 985 */ 986 if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 && 987 (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { 988 err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE); 989 ASSERT(err == 0); 990 991 /* 992 * If this is a top level PG, add it as a balancing 993 * candidate when balancing within the root lgroup. 994 */ 995 if (pg->cmt_parent == NULL && 996 pg->cmt_siblings != &cmt_root->cl_pgs) { 997 err = group_add(&cmt_root->cl_pgs, pg, 998 GRP_NORESIZE); 999 ASSERT(err == 0); 1000 } 1001 } 1002 1003 /* 1004 * Notate the CPU in the PGs active CPU bitset. 
1005 * Also notate the PG as being active in it's associated 1006 * partition 1007 */ 1008 bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid); 1009 bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id); 1010 } 1011 } 1012 1013 /* 1014 * Class callback when a CPU goes inactive (offline) 1015 * 1016 * This is called in a context where CPUs are paused 1017 */ 1018 static void 1019 pg_cmt_cpu_inactive(cpu_t *cp) 1020 { 1021 int err; 1022 group_t *pgs; 1023 pg_cmt_t *pg; 1024 cpu_t *cpp; 1025 group_iter_t i; 1026 pg_cpu_itr_t cpu_itr; 1027 boolean_t found; 1028 1029 ASSERT(MUTEX_HELD(&cpu_lock)); 1030 1031 if (cmt_sched_disabled) 1032 return; 1033 1034 pgs = &cp->cpu_pg->pgs; 1035 group_iter_init(&i); 1036 1037 while ((pg = group_iterate(pgs, &i)) != NULL) { 1038 1039 if (IS_CMT_PG(pg) == 0) 1040 continue; 1041 1042 /* 1043 * Remove the CPU from the CMT PGs active CPU group 1044 * bitmap 1045 */ 1046 err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); 1047 ASSERT(err == 0); 1048 1049 bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid); 1050 1051 /* 1052 * If there are no more active CPUs in this PG over which 1053 * load was balanced, remove it as a balancing candidate. 
1054 */ 1055 if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 && 1056 (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { 1057 err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); 1058 ASSERT(err == 0); 1059 1060 if (pg->cmt_parent == NULL && 1061 pg->cmt_siblings != &cmt_root->cl_pgs) { 1062 err = group_remove(&cmt_root->cl_pgs, pg, 1063 GRP_NORESIZE); 1064 ASSERT(err == 0); 1065 } 1066 } 1067 1068 /* 1069 * Assert the number of active CPUs does not exceed 1070 * the total number of CPUs in the PG 1071 */ 1072 ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <= 1073 GROUP_SIZE(&((pg_t *)pg)->pg_cpus)); 1074 1075 /* 1076 * Update the PG bitset in the CPU's old partition 1077 */ 1078 found = B_FALSE; 1079 PG_CPU_ITR_INIT(pg, cpu_itr); 1080 while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) { 1081 if (cpp == cp) 1082 continue; 1083 if (CPU_ACTIVE(cpp) && 1084 cpp->cpu_part->cp_id == cp->cpu_part->cp_id) { 1085 found = B_TRUE; 1086 break; 1087 } 1088 } 1089 if (!found) { 1090 bitset_del(&cp->cpu_part->cp_cmt_pgs, 1091 ((pg_t *)pg)->pg_id); 1092 } 1093 } 1094 } 1095 1096 /* 1097 * Return non-zero if the CPU belongs in the given PG 1098 */ 1099 static int 1100 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp) 1101 { 1102 cpu_t *pg_cpu; 1103 1104 pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0); 1105 1106 ASSERT(pg_cpu != NULL); 1107 1108 /* 1109 * The CPU belongs if, given the nature of the hardware sharing 1110 * relationship represented by the PG, the CPU has that 1111 * relationship with some other CPU already in the PG 1112 */ 1113 if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw)) 1114 return (1); 1115 1116 return (0); 1117 } 1118 1119 /* 1120 * Sort the CPUs CMT hierarchy, where "size" is the number of levels. 
1121 */ 1122 static void 1123 pg_cmt_hier_sort(pg_cmt_t **hier, int size) 1124 { 1125 int i, j, inc; 1126 pg_t *tmp; 1127 pg_t **h = (pg_t **)hier; 1128 1129 /* 1130 * First sort by number of CPUs 1131 */ 1132 inc = size / 2; 1133 while (inc > 0) { 1134 for (i = inc; i < size; i++) { 1135 j = i; 1136 tmp = h[i]; 1137 while ((j >= inc) && 1138 (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) { 1139 h[j] = h[j - inc]; 1140 j = j - inc; 1141 } 1142 h[j] = tmp; 1143 } 1144 if (inc == 2) 1145 inc = 1; 1146 else 1147 inc = (inc * 5) / 11; 1148 } 1149 1150 /* 1151 * Break ties by asking the platform. 1152 * Determine if h[i] outranks h[i + 1] and if so, swap them. 1153 */ 1154 for (i = 0; i < size - 1; i++) { 1155 if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) && 1156 pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) { 1157 tmp = h[i]; 1158 h[i] = h[i + 1]; 1159 h[i + 1] = tmp; 1160 } 1161 } 1162 } 1163 1164 /* 1165 * Return a cmt_lgrp_t * given an lgroup handle. 1166 */ 1167 static cmt_lgrp_t * 1168 pg_cmt_find_lgrp(lgrp_handle_t hand) 1169 { 1170 cmt_lgrp_t *lgrp; 1171 1172 ASSERT(MUTEX_HELD(&cpu_lock)); 1173 1174 lgrp = cmt_lgrps; 1175 while (lgrp != NULL) { 1176 if (lgrp->cl_hand == hand) 1177 break; 1178 lgrp = lgrp->cl_next; 1179 } 1180 return (lgrp); 1181 } 1182 1183 /* 1184 * Create a cmt_lgrp_t with the specified handle. 1185 */ 1186 static cmt_lgrp_t * 1187 pg_cmt_lgrp_create(lgrp_handle_t hand) 1188 { 1189 cmt_lgrp_t *lgrp; 1190 1191 ASSERT(MUTEX_HELD(&cpu_lock)); 1192 1193 lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); 1194 1195 lgrp->cl_hand = hand; 1196 lgrp->cl_npgs = 0; 1197 lgrp->cl_next = cmt_lgrps; 1198 cmt_lgrps = lgrp; 1199 group_create(&lgrp->cl_pgs); 1200 1201 return (lgrp); 1202 } 1203 1204 /* 1205 * Interfaces to enable and disable power aware dispatching 1206 * The caller must be holding cpu_lock. 1207 * 1208 * Return 0 on success and -1 on failure. 
1209 */ 1210 int 1211 cmt_pad_enable(pghw_type_t type) 1212 { 1213 group_t *hwset; 1214 group_iter_t iter; 1215 pg_cmt_t *pg; 1216 1217 ASSERT(PGHW_IS_PM_DOMAIN(type)); 1218 ASSERT(MUTEX_HELD(&cpu_lock)); 1219 1220 if ((hwset = pghw_set_lookup(type)) == NULL || 1221 cmt_hw_blacklisted[type]) { 1222 /* 1223 * Unable to find any instances of the specified type 1224 * of power domain, or the power domains have been blacklisted. 1225 */ 1226 return (-1); 1227 } 1228 1229 /* 1230 * Iterate over the power domains, setting the default dispatcher 1231 * policy for power/performance optimization. 1232 * 1233 * Simply setting the policy isn't enough in the case where the power 1234 * domain is an only child of another PG. Because the dispatcher walks 1235 * the PG hierarchy in a top down fashion, the higher up PG's policy 1236 * will dominate. So promote the power domain above it's parent if both 1237 * PG and it's parent have the same CPUs to ensure it's policy 1238 * dominates. 1239 */ 1240 group_iter_init(&iter); 1241 while ((pg = group_iterate(hwset, &iter)) != NULL) { 1242 /* 1243 * If the power domain is an only child to a parent 1244 * not implementing the same policy, promote the child 1245 * above the parent to activate the policy. 1246 */ 1247 pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw); 1248 while ((pg->cmt_parent != NULL) && 1249 (pg->cmt_parent->cmt_policy != pg->cmt_policy) && 1250 (PG_NUM_CPUS((pg_t *)pg) == 1251 PG_NUM_CPUS((pg_t *)pg->cmt_parent))) { 1252 cmt_hier_promote(pg, NULL); 1253 } 1254 } 1255 1256 return (0); 1257 } 1258 1259 int 1260 cmt_pad_disable(pghw_type_t type) 1261 { 1262 group_t *hwset; 1263 group_iter_t iter; 1264 pg_cmt_t *pg; 1265 pg_cmt_t *child; 1266 1267 ASSERT(PGHW_IS_PM_DOMAIN(type)); 1268 ASSERT(MUTEX_HELD(&cpu_lock)); 1269 1270 if ((hwset = pghw_set_lookup(type)) == NULL) { 1271 /* 1272 * Unable to find any instances of the specified type of 1273 * power domain. 
1274 */ 1275 return (-1); 1276 } 1277 /* 1278 * Iterate over the power domains, setting the default dispatcher 1279 * policy for performance optimization (load balancing). 1280 */ 1281 group_iter_init(&iter); 1282 while ((pg = group_iterate(hwset, &iter)) != NULL) { 1283 1284 /* 1285 * If the power domain has an only child that implements 1286 * policy other than load balancing, promote the child 1287 * above the power domain to ensure it's policy dominates. 1288 */ 1289 if (pg->cmt_children != NULL && 1290 GROUP_SIZE(pg->cmt_children) == 1) { 1291 child = GROUP_ACCESS(pg->cmt_children, 0); 1292 if ((child->cmt_policy & CMT_BALANCE) == 0) { 1293 cmt_hier_promote(child, NULL); 1294 } 1295 } 1296 pg->cmt_policy = CMT_BALANCE; 1297 } 1298 return (0); 1299 } 1300 1301 /* ARGSUSED */ 1302 static void 1303 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, 1304 kthread_t *new) 1305 { 1306 pg_cmt_t *cmt_pg = (pg_cmt_t *)pg; 1307 1308 if (old == cp->cpu_idle_thread) { 1309 atomic_add_32(&cmt_pg->cmt_utilization, 1); 1310 } else if (new == cp->cpu_idle_thread) { 1311 atomic_add_32(&cmt_pg->cmt_utilization, -1); 1312 } 1313 } 1314 1315 /* 1316 * Macro to test whether a thread is currently runnable on a CPU in a PG. 1317 */ 1318 #define THREAD_RUNNABLE_IN_PG(t, pg) \ 1319 ((t)->t_state == TS_RUN && \ 1320 (t)->t_disp_queue->disp_cpu && \ 1321 bitset_in_set(&(pg)->cmt_cpus_actv_set, \ 1322 (t)->t_disp_queue->disp_cpu->cpu_seqid)) 1323 1324 static void 1325 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, 1326 kthread_t *new) 1327 { 1328 pg_cmt_t *cmt = (pg_cmt_t *)pg; 1329 cpupm_domain_t *dom; 1330 uint32_t u; 1331 1332 if (old == cp->cpu_idle_thread) { 1333 ASSERT(new != cp->cpu_idle_thread); 1334 u = atomic_add_32_nv(&cmt->cmt_utilization, 1); 1335 if (u == 1) { 1336 /* 1337 * Notify the CPU power manager that the domain 1338 * is non-idle. 
1339 */ 1340 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 1341 cpupm_utilization_event(cp, now, dom, 1342 CPUPM_DOM_BUSY_FROM_IDLE); 1343 } 1344 } else if (new == cp->cpu_idle_thread) { 1345 ASSERT(old != cp->cpu_idle_thread); 1346 u = atomic_add_32_nv(&cmt->cmt_utilization, -1); 1347 if (u == 0) { 1348 /* 1349 * The domain is idle, notify the CPU power 1350 * manager. 1351 * 1352 * Avoid notifying if the thread is simply migrating 1353 * between CPUs in the domain. 1354 */ 1355 if (!THREAD_RUNNABLE_IN_PG(old, cmt)) { 1356 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 1357 cpupm_utilization_event(cp, now, dom, 1358 CPUPM_DOM_IDLE_FROM_BUSY); 1359 } 1360 } 1361 } 1362 } 1363 1364 /* ARGSUSED */ 1365 static void 1366 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t) 1367 { 1368 pg_cmt_t *cmt = (pg_cmt_t *)pg; 1369 cpupm_domain_t *dom; 1370 1371 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 1372 cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY); 1373 } 1374 1375 /* 1376 * Return the name of the CMT scheduling policy 1377 * being implemented across this PG 1378 */ 1379 static char * 1380 pg_cmt_policy_name(pg_t *pg) 1381 { 1382 pg_cmt_policy_t policy; 1383 1384 policy = ((pg_cmt_t *)pg)->cmt_policy; 1385 1386 if (policy & CMT_AFFINITY) { 1387 if (policy & CMT_BALANCE) 1388 return ("Load Balancing & Affinity"); 1389 else if (policy & CMT_COALESCE) 1390 return ("Load Coalescence & Affinity"); 1391 else 1392 return ("Affinity"); 1393 } else { 1394 if (policy & CMT_BALANCE) 1395 return ("Load Balancing"); 1396 else if (policy & CMT_COALESCE) 1397 return ("Load Coalescence"); 1398 else 1399 return ("None"); 1400 } 1401 } 1402 1403 /* 1404 * Prune PG, and all other instances of PG's hardware sharing relationship 1405 * from the CMT PG hierarchy. 
1406 * 1407 * This routine operates on the CPU specific processor group data (for the CPUs 1408 * in the PG being pruned), and may be invoked from a context where one CPU's 1409 * PG data is under construction. In this case the argument "pgdata", if not 1410 * NULL, is a reference to the CPU's under-construction PG data. 1411 */ 1412 static int 1413 pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata) 1414 { 1415 group_t *hwset, *children; 1416 int i, j, r, size = *sz; 1417 group_iter_t hw_iter, child_iter; 1418 pg_cpu_itr_t cpu_iter; 1419 pg_cmt_t *pg, *child; 1420 cpu_t *cpu; 1421 int cap_needed; 1422 pghw_type_t hw; 1423 1424 ASSERT(MUTEX_HELD(&cpu_lock)); 1425 1426 hw = ((pghw_t *)pg_bad)->pghw_hw; 1427 1428 if (hw == PGHW_POW_ACTIVE) { 1429 cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. " 1430 "Event Based CPUPM Unavailable"); 1431 } else if (hw == PGHW_POW_IDLE) { 1432 cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. " 1433 "Dispatcher assisted CPUPM disabled."); 1434 } 1435 1436 /* 1437 * Find and eliminate the PG from the lineage. 1438 */ 1439 for (i = 0; i < size; i++) { 1440 if (lineage[i] == pg_bad) { 1441 for (j = i; j < size - 1; j++) 1442 lineage[j] = lineage[j + 1]; 1443 *sz = size - 1; 1444 break; 1445 } 1446 } 1447 1448 /* 1449 * We'll prune all instances of the hardware sharing relationship 1450 * represented by pg. But before we do that (and pause CPUs) we need 1451 * to ensure the hierarchy's groups are properly sized. 1452 */ 1453 hwset = pghw_set_lookup(hw); 1454 1455 /* 1456 * Blacklist the hardware so future processor groups of this type won't 1457 * participate in CMT thread placement. 1458 * 1459 * XXX 1460 * For heterogeneous system configurations, this might be overkill. 1461 * We may only need to blacklist the illegal PGs, and other instances 1462 * of this hardware sharing relationship may be ok. 
1463 */ 1464 cmt_hw_blacklisted[hw] = 1; 1465 1466 /* 1467 * For each of the PGs being pruned, ensure sufficient capacity in 1468 * the siblings set for the PG's children 1469 */ 1470 group_iter_init(&hw_iter); 1471 while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { 1472 /* 1473 * PG is being pruned, but if it is bringing up more than 1474 * one child, ask for more capacity in the siblings group. 1475 */ 1476 cap_needed = 0; 1477 if (pg->cmt_children && 1478 GROUP_SIZE(pg->cmt_children) > 1) { 1479 cap_needed = GROUP_SIZE(pg->cmt_children) - 1; 1480 1481 group_expand(pg->cmt_siblings, 1482 GROUP_SIZE(pg->cmt_siblings) + cap_needed); 1483 1484 /* 1485 * If this is a top level group, also ensure the 1486 * capacity in the root lgrp level CMT grouping. 1487 */ 1488 if (pg->cmt_parent == NULL && 1489 pg->cmt_siblings != &cmt_root->cl_pgs) { 1490 group_expand(&cmt_root->cl_pgs, 1491 GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed); 1492 cmt_root->cl_npgs += cap_needed; 1493 } 1494 } 1495 } 1496 1497 /* 1498 * We're operating on the PG hierarchy. Pause CPUs to ensure 1499 * exclusivity with respect to the dispatcher. 1500 */ 1501 pause_cpus(NULL); 1502 1503 /* 1504 * Prune all PG instances of the hardware sharing relationship 1505 * represented by pg. 1506 */ 1507 group_iter_init(&hw_iter); 1508 while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { 1509 1510 /* 1511 * Remove PG from it's group of siblings, if it's there. 1512 */ 1513 if (pg->cmt_siblings) { 1514 (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); 1515 } 1516 if (pg->cmt_parent == NULL && 1517 pg->cmt_siblings != &cmt_root->cl_pgs) { 1518 (void) group_remove(&cmt_root->cl_pgs, pg, 1519 GRP_NORESIZE); 1520 } 1521 1522 /* 1523 * Indicate that no CMT policy will be implemented across 1524 * this PG. 1525 */ 1526 pg->cmt_policy = CMT_NO_POLICY; 1527 1528 /* 1529 * Move PG's children from it's children set to it's parent's 1530 * children set. 
Note that the parent's children set, and PG's 1531 * siblings set are the same thing. 1532 * 1533 * Because we are iterating over the same group that we are 1534 * operating on (removing the children), first add all of PG's 1535 * children to the parent's children set, and once we are done 1536 * iterating, empty PG's children set. 1537 */ 1538 if (pg->cmt_children != NULL) { 1539 children = pg->cmt_children; 1540 1541 group_iter_init(&child_iter); 1542 while ((child = group_iterate(children, &child_iter)) 1543 != NULL) { 1544 if (pg->cmt_siblings != NULL) { 1545 r = group_add(pg->cmt_siblings, child, 1546 GRP_NORESIZE); 1547 ASSERT(r == 0); 1548 1549 if (pg->cmt_parent == NULL && 1550 pg->cmt_siblings != 1551 &cmt_root->cl_pgs) { 1552 r = group_add(&cmt_root->cl_pgs, 1553 child, GRP_NORESIZE); 1554 ASSERT(r == 0); 1555 } 1556 } 1557 } 1558 group_empty(pg->cmt_children); 1559 } 1560 1561 /* 1562 * Reset the callbacks to the defaults 1563 */ 1564 pg_callback_set_defaults((pg_t *)pg); 1565 1566 /* 1567 * Update all the CPU lineages in each of PG's CPUs 1568 */ 1569 PG_CPU_ITR_INIT(pg, cpu_iter); 1570 while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { 1571 pg_cmt_t *cpu_pg; 1572 group_iter_t liter; /* Iterator for the lineage */ 1573 cpu_pg_t *cpd; /* CPU's PG data */ 1574 1575 /* 1576 * The CPU's lineage is under construction still 1577 * references the bootstrap CPU PG data structure. 1578 */ 1579 if (pg_cpu_is_bootstrapped(cpu)) 1580 cpd = pgdata; 1581 else 1582 cpd = cpu->cpu_pg; 1583 1584 /* 1585 * Iterate over the CPU's PGs updating the children 1586 * of the PG being promoted, since they have a new 1587 * parent and siblings set. 
1588 */ 1589 group_iter_init(&liter); 1590 while ((cpu_pg = group_iterate(&cpd->pgs, 1591 &liter)) != NULL) { 1592 if (cpu_pg->cmt_parent == pg) { 1593 cpu_pg->cmt_parent = pg->cmt_parent; 1594 cpu_pg->cmt_siblings = pg->cmt_siblings; 1595 } 1596 } 1597 1598 /* 1599 * Update the CPU's lineages 1600 * 1601 * Remove the PG from the CPU's group used for CMT 1602 * scheduling. 1603 */ 1604 (void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE); 1605 } 1606 } 1607 start_cpus(); 1608 return (0); 1609 } 1610 1611 /* 1612 * Disable CMT scheduling 1613 */ 1614 static void 1615 pg_cmt_disable(void) 1616 { 1617 cpu_t *cpu; 1618 1619 ASSERT(MUTEX_HELD(&cpu_lock)); 1620 1621 pause_cpus(NULL); 1622 cpu = cpu_list; 1623 1624 do { 1625 if (cpu->cpu_pg) 1626 group_empty(&cpu->cpu_pg->cmt_pgs); 1627 } while ((cpu = cpu->cpu_next) != cpu_list); 1628 1629 cmt_sched_disabled = 1; 1630 start_cpus(); 1631 cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable"); 1632 } 1633 1634 /* 1635 * CMT lineage validation 1636 * 1637 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity 1638 * of the PGs in a CPU's lineage. This is necessary because it's possible that 1639 * some groupings (power domain groupings in particular) may be defined by 1640 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be 1641 * possible to integrate those groupings into the CMT PG hierarchy, if doing 1642 * so would violate the subset invariant of the hierarchy, which says that 1643 * a PG must be subset of its parent (if it has one). 1644 * 1645 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that 1646 * would result in a violation of this invariant. If a violation is found, 1647 * and the PG is of a grouping type who's definition is known to originate from 1648 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the 1649 * PG (and all other instances PG's sharing relationship type) from the 1650 * hierarchy. 
Further, future instances of that sharing relationship type won't 1651 * be instantiated. If the grouping definition doesn't originate from suspect 1652 * sources, then pg_cmt_disable() will be invoked to log an error, and disable 1653 * CMT scheduling altogether. 1654 * 1655 * This routine is invoked after the CPU has been added to the PGs in which 1656 * it belongs, but before those PGs have been added to (or had their place 1657 * adjusted in) the CMT PG hierarchy. 1658 * 1659 * The first argument is the CPUs PG lineage (essentially an array of PGs in 1660 * which the CPU belongs) that has already been sorted in ascending order 1661 * by CPU count. Some of the PGs in the CPUs lineage may already have other 1662 * CPUs in them, and have already been integrated into the CMT hierarchy. 1663 * 1664 * The addition of this new CPU to these pre-existing PGs means that those 1665 * PGs may need to be promoted up in the hierarchy to satisfy the subset 1666 * invariant. In additon to testing the subset invariant for the lineage, 1667 * this routine also verifies that the addition of the new CPU to the 1668 * existing PGs wouldn't cause the subset invariant to be violated in 1669 * the exiting lineages. 1670 * 1671 * This routine will normally return one of the following: 1672 * CMT_LINEAGE_VALID - There were no problems detected with the lineage. 1673 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning. 1674 * 1675 * Otherwise, this routine will return a value indicating which error it 1676 * was unable to recover from (and set cmt_lineage_status along the way). 1677 * 1678 * 1679 * This routine operates on the CPU specific processor group data (for the CPU 1680 * whose lineage is being validated), which is under-construction. 1681 * "pgdata" is a reference to the CPU's under-construction PG data. 1682 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg. 
1683 */ 1684 static cmt_lineage_validation_t 1685 pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata) 1686 { 1687 int i, j, size; 1688 pg_cmt_t *pg, *pg_next, *pg_bad, *pg_tmp; 1689 cpu_t *cp; 1690 pg_cpu_itr_t cpu_iter; 1691 lgrp_handle_t lgrp; 1692 1693 ASSERT(MUTEX_HELD(&cpu_lock)); 1694 1695 revalidate: 1696 size = *sz; 1697 pg_bad = NULL; 1698 lgrp = LGRP_NULL_HANDLE; 1699 for (i = 0; i < size; i++) { 1700 1701 pg = lineage[i]; 1702 if (i < size - 1) 1703 pg_next = lineage[i + 1]; 1704 else 1705 pg_next = NULL; 1706 1707 /* 1708 * We assume that the lineage has already been sorted 1709 * by the number of CPUs. In fact, we depend on it. 1710 */ 1711 ASSERT(pg_next == NULL || 1712 (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next))); 1713 1714 /* 1715 * Check to make sure that the existing parent of PG (if any) 1716 * is either in the PG's lineage, or the PG has more CPUs than 1717 * its existing parent and can and should be promoted above its 1718 * parent. 1719 * 1720 * Since the PG topology is in the middle of being changed, we 1721 * need to check whether the PG's existing parent (if any) is 1722 * part of its lineage (and therefore should contain the new 1723 * CPU). If not, it means that the addition of the new CPU 1724 * should have made this PG have more CPUs than its parent, and 1725 * this PG should be promoted to be above its existing parent 1726 * now. We need to verify all of this to defend against a buggy 1727 * BIOS giving bad power domain CPU groupings. Sigh. 1728 */ 1729 if (pg->cmt_parent) { 1730 /* 1731 * Determine if cmt_parent is in this lineage 1732 */ 1733 for (j = 0; j < size; j++) { 1734 pg_tmp = lineage[j]; 1735 if (pg_tmp == pg->cmt_parent) 1736 break; 1737 } 1738 if (pg_tmp != pg->cmt_parent) { 1739 /* 1740 * cmt_parent is not in the lineage, verify 1741 * it is a proper subset of PG. 
1742 */ 1743 if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >= 1744 PG_NUM_CPUS((pg_t *)pg)) { 1745 /* 1746 * Not a proper subset if pg has less 1747 * CPUs than cmt_parent... 1748 */ 1749 cmt_lineage_status = 1750 CMT_LINEAGE_NON_PROMOTABLE; 1751 goto handle_error; 1752 } 1753 } 1754 } 1755 1756 /* 1757 * Walk each of the CPUs in the PGs group and perform 1758 * consistency checks along the way. 1759 */ 1760 PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter); 1761 while ((cp = pg_cpu_next(&cpu_iter)) != NULL) { 1762 /* 1763 * Verify that there aren't any CPUs contained in PG 1764 * that the next PG in the lineage (which is larger 1765 * or same size) doesn't also contain. 1766 */ 1767 if (pg_next != NULL && 1768 pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) { 1769 cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC; 1770 goto handle_error; 1771 } 1772 1773 /* 1774 * Verify that all the CPUs in the PG are in the same 1775 * lgroup. 1776 */ 1777 if (lgrp == LGRP_NULL_HANDLE) { 1778 lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id); 1779 } else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) { 1780 cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS; 1781 goto handle_error; 1782 } 1783 } 1784 } 1785 1786 handle_error: 1787 /* 1788 * Some of these validation errors can result when the CPU grouping 1789 * information is derived from buggy sources (for example, incorrect 1790 * ACPI tables on x86 systems). 1791 * 1792 * We'll try to recover in such cases by pruning out the illegal 1793 * groupings from the PG hierarchy, which means that we won't optimize 1794 * for those levels, but we will for the remaining ones. 1795 */ 1796 switch (cmt_lineage_status) { 1797 case CMT_LINEAGE_VALID: 1798 case CMT_LINEAGE_REPAIRED: 1799 break; 1800 case CMT_LINEAGE_PG_SPANS_LGRPS: 1801 /* 1802 * We've detected a PG whose CPUs span lgroups. 
1803 * 1804 * This isn't supported, as the dispatcher isn't allowed to 1805 * to do CMT thread placement across lgroups, as this would 1806 * conflict with policies implementing MPO thread affinity. 1807 * 1808 * If the PG is of a sharing relationship type known to 1809 * legitimately span lgroups, specify that no CMT thread 1810 * placement policy should be implemented, and prune the PG 1811 * from the existing CMT PG hierarchy. 1812 * 1813 * Otherwise, fall though to the case below for handling. 1814 */ 1815 if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) { 1816 if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) { 1817 cmt_lineage_status = CMT_LINEAGE_REPAIRED; 1818 goto revalidate; 1819 } 1820 } 1821 /*LINTED*/ 1822 case CMT_LINEAGE_NON_PROMOTABLE: 1823 /* 1824 * We've detected a PG that already exists in another CPU's 1825 * lineage that cannot cannot legally be promoted into place 1826 * without breaking the invariants of the hierarchy. 1827 */ 1828 if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) { 1829 if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) { 1830 cmt_lineage_status = CMT_LINEAGE_REPAIRED; 1831 goto revalidate; 1832 } 1833 } 1834 /* 1835 * Something went wrong trying to prune out the bad level. 1836 * Disable CMT scheduling altogether. 1837 */ 1838 pg_cmt_disable(); 1839 break; 1840 case CMT_LINEAGE_NON_CONCENTRIC: 1841 /* 1842 * We've detected a non-concentric PG lineage, which means that 1843 * there's a PG in the lineage that has CPUs that the next PG 1844 * over in the lineage (which is the same size or larger) 1845 * doesn't have. 1846 * 1847 * In this case, we examine the two PGs to see if either 1848 * grouping is defined by potentially buggy sources. 1849 * 1850 * If one has less CPUs than the other, and contains CPUs 1851 * not found in the parent, and it is an untrusted enumeration, 1852 * then prune it. If both have the same number of CPUs, then 1853 * prune the one that is untrusted. 
1854 * 1855 * This process repeats until we have a concentric lineage, 1856 * or we would have to prune out level derived from what we 1857 * thought was a reliable source, in which case CMT scheduling 1858 * is disabled altogether. 1859 */ 1860 if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) && 1861 (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) { 1862 pg_bad = pg; 1863 } else if (PG_NUM_CPUS((pg_t *)pg) == 1864 PG_NUM_CPUS((pg_t *)pg_next)) { 1865 if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) { 1866 pg_bad = pg_next; 1867 } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) { 1868 pg_bad = pg; 1869 } 1870 } 1871 if (pg_bad) { 1872 if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) { 1873 cmt_lineage_status = CMT_LINEAGE_REPAIRED; 1874 goto revalidate; 1875 } 1876 } 1877 /* 1878 * Something went wrong trying to identify and/or prune out 1879 * the bad level. Disable CMT scheduling altogether. 1880 */ 1881 pg_cmt_disable(); 1882 break; 1883 default: 1884 /* 1885 * If we're here, we've encountered a validation error for 1886 * which we don't know how to recover. In this case, disable 1887 * CMT scheduling altogether. 1888 */ 1889 cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE; 1890 pg_cmt_disable(); 1891 } 1892 return (cmt_lineage_status); 1893 } 1894