/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ----------------------------  <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup... but for root homed threads,
 * it is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
 */
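/*
 * Illustration only (hypothetical topology, not a description of any
 * particular machine): on a two-socket system where each socket (PGHW_CHIP)
 * contains two cores (PGHW_IPIPE) of two hardware threads each, the portion
 * of the hierarchy for one leaf lgroup might look like:
 *
 *              chip PG (4 CPUs)          <-- top level PG for the lgroup
 *               /            \
 *      ipipe PG (2 CPUs)   ipipe PG (2 CPUs)
 *
 * The actual shape depends entirely on which sharing relationships the
 * platform reports (pg_plat_hw_shared()) and on how they are ranked.
 */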
static cmt_lgrp_t *cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t *cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
					/* used for null_proc_lpa */
cmt_lgrp_t *cmt_root = NULL;		/* Reference to root cmt pg */

static int is_cpu0 = 1;			/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 */
static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t pg_cmt_class_id;	/* PG class id */

static pg_t *pg_cmt_alloc();
static void pg_cmt_free(pg_t *);
static void pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void pg_cmt_cpu_active(cpu_t *);
static void pg_cmt_cpu_inactive(cpu_t *);
static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char *pg_cmt_policy_name(pg_t *);
static void pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
    kthread_t *, kthread_t *);
static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
    kthread_t *, kthread_t *);
static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *,
    cpu_pg_t *);


/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}
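/*
 * Illustration only: since /etc/system is processed too late, disabling CMT
 * scheduling via cmt_sched_disabled (see above) has to happen from the
 * kernel debugger early in boot. Assuming the system was booted with kmdb
 * loaded (for example, via the -kd boot flag), that might look like:
 *
 *	cmt_sched_disabled/W 1
 *	:c
 *
 * The exact boot procedure is platform dependent; this is a sketch, not a
 * statement of the supported procedure.
 */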
/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_PROCNODE:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	/*
	 * Stick with the default callbacks if there aren't going to be
	 * any CMT thread placement optimizations implemented.
	 */
	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
		return;

	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t *parent;
	group_t *children;
	cpu_t *cpu;
	group_iter_t iter;
	pg_cpu_itr_t cpu_iter;
	int r;
	int err;
	int nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int idx;
		int sz;
		pg_cmt_t *cpu_pg;
		cpu_pg_t *pgd;	/* CPU's PG data */

		/*
		 * A CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references the CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}
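/*
 * Illustration only (hypothetical CPU counts): if a suspect power domain PG
 * with 2 CPUs ends up as the parent of a cache PG with 4 CPUs, the child is
 * no longer a subset of its parent. Promoting the cache PG repairs this:
 *
 *	before:	pwr domain (2 CPUs) is parent of cache (4 CPUs)
 *	after:	cache (4 CPUs) is parent of pwr domain (2 CPUs)
 *
 * cmt_hier_promote() above produces the "after" arrangement by swapping the
 * two PGs' parent/child/sibling linkage and patching each affected CPU's
 * cached lineage.
 */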
/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data, and
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_init() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t *pg;
	group_t *cmt_pgs;
	int levels, level;
	pghw_type_t hw;
	pg_t *pg_cache = NULL;
	pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t lgrp_handle;
	cmt_lgrp_t *lgrp;
	cmt_lineage_validation_t lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}
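	/*
	 * Note (worked example): cpu_cmt_hier[] is sorted smallest-first
	 * (leaf at index 0, top level PG at index levels - 1), while cmt_pgs
	 * is kept in top-down order for the dispatcher. Hence the
	 * "levels - level - 1" index below: with levels == 3, hier[2] lands
	 * at cmt_pgs[0], hier[1] at cmt_pgs[1], and hier[0] (the leaf) at
	 * cmt_pgs[2].
	 */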
	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t children;
		int err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data, and
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t i;
	pg_cmt_t *pg;
	group_t *pgs, *cmt_pgs;
	lgrp_handle_t lgrp_handle;
	cmt_lgrp_t *lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it (from both lgroups and processor
		 * groups), and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t *pgs;
	pg_t *pg;
	group_iter_t i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t *cpp;
	group_t *pgs;
	pg_t *pg;
	group_iter_t pg_iter;
	pg_cpu_itr_t cpu_iter;
	boolean_t found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int err;
	group_iter_t i;
	pg_cmt_t *pg;
	group_t *pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition.
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int err;
	group_t *pgs;
	pg_cmt_t *pg;
	cpu_t *cpp;
	group_iter_t i;
	pg_cpu_itr_t cpu_itr;
	boolean_t found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * bitmap.
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t *pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int i, j, inc, sz;
	int start, end;
	pg_t *tmp;
	pg_t **h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (start = 0; start < size; start++) {

		/*
		 * Find various contiguous sets of elements,
		 * in the array, with the same number of CPUs
		 */
		end = start;
		sz = PG_NUM_CPUS(h[start]);
		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
			end++;
		/*
		 * Sort each such set of the array by rank
		 */
		for (i = start + 1; i < end; i++) {
			j = i - 1;
			tmp = h[i];
			while (j >= start &&
			    pg_cmt_hier_rank(hier[j],
			    (pg_cmt_t *)tmp) == hier[j]) {
				h[j + 1] = h[j];
				j--;
			}
			h[j + 1] = tmp;
		}
	}
}
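/*
 * Illustration only (hypothetical lineage): if a CPU's lineage is discovered
 * in PGHW enumeration order as { chip (8 CPUs), cache (4 CPUs),
 * ipipe (2 CPUs) }, pg_cmt_hier_sort() above reorders it to
 * { ipipe (2), cache (4), chip (8) }, i.e. leaf first and top level PG last,
 * which is the ordering pg_cmt_cpu_init() expects. PGs with the same number
 * of CPUs are then ordered using pg_cmt_hier_rank(), which defers to the
 * platform.
 */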
/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t *lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t *lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t *hwset;
	group_iter_t iter;
	pg_cmt_t *pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}
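/*
 * Illustration only (hypothetical caller): power aware dispatching is
 * enabled per power domain type with cpu_lock held, e.g.:
 *
 *	mutex_enter(&cpu_lock);
 *	if (cmt_pad_enable(PGHW_POW_ACTIVE) != 0)
 *		cmn_err(CE_NOTE, "!no active power domains to manage");
 *	mutex_exit(&cpu_lock);
 *
 * This sketch is not a description of any actual caller.
 */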
int
cmt_pad_disable(pghw_type_t type)
{
	group_t *hwset;
	group_iter_t iter;
	pg_cmt_t *pg;
	pg_cmt_t *child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t *cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t *cmt = (pg_cmt_t *)pg;
	cpupm_domain_t *dom;
	uint32_t u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t *cmt = (pg_cmt_t *)pg;
	cpupm_domain_t *dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t *hwset, *children;
	int i, j, r, size = *sz;
	group_iter_t hw_iter, child_iter;
	pg_cpu_itr_t cpu_iter;
	pg_cmt_t *pg, *child;
	cpu_t *cpu;
	int cap_needed;
	pghw_type_t hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t *cpu_pg;
			group_iter_t liter;	/* Iterator for the lineage */
			cpu_pg_t *cpd;		/* CPU's PG data */

			/*
			 * A CPU whose lineage is still under construction
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t *cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}
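/*
 * Illustration only (hypothetical CPU numbers): suppose a buggy BIOS
 * describes an active power domain (PGHW_POW_ACTIVE) spanning CPUs {0, 1, 2}
 * while the chip (PGHW_CHIP) containing CPU 0 spans CPUs {0, 1, 4, 5}.
 * CPU 0's sorted lineage would then be { pwr (3 CPUs), chip (4 CPUs) }, but
 * the power domain holds CPU 2, which the larger chip PG does not. That is a
 * non-concentric lineage; since power domains are PG_CMT_HW_SUSPECT(), the
 * validation below would recover by pruning the power domain PGs via
 * pg_cmt_prune().
 */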
/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of the PG's sharing relationship type) from the
 * CMT hierarchy. Further, future instances of that sharing relationship type
 * won't be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int i, j, size;
	pg_cmt_t *pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t *cp;
	pg_cpu_itr_t cpu_iter;
	lgrp_handle_t lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPU's PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs. Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that it will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}