1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/systm.h> 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/thread.h> 29 #include <sys/cpuvar.h> 30 #include <sys/cpupart.h> 31 #include <sys/kmem.h> 32 #include <sys/cmn_err.h> 33 #include <sys/kstat.h> 34 #include <sys/processor.h> 35 #include <sys/disp.h> 36 #include <sys/group.h> 37 #include <sys/pghw.h> 38 #include <sys/bitset.h> 39 #include <sys/lgrp.h> 40 #include <sys/cmt.h> 41 #include <sys/cpu_pm.h> 42 43 /* 44 * CMT scheduler / dispatcher support 45 * 46 * This file implements CMT scheduler support using Processor Groups. 47 * The CMT processor group class creates and maintains the CMT class 48 * specific processor group pg_cmt_t. 
49 * 50 * ---------------------------- <-- pg_cmt_t * 51 * | pghw_t | 52 * ---------------------------- 53 * | CMT class specific data | 54 * | - hierarchy linkage | 55 * | - CMT load balancing data| 56 * | - active CPU group/bitset| 57 * ---------------------------- 58 * 59 * The scheduler/dispatcher leverages knowledge of the performance 60 * relevant CMT sharing relationships existing between cpus to implement 61 * optimized affinity, load balancing, and coalescence policies. 62 * 63 * Load balancing policy seeks to improve performance by minimizing 64 * contention over shared processor resources / facilities, Affinity 65 * policies seek to improve cache and TLB utilization. Coalescence 66 * policies improve resource utilization and ultimately power efficiency. 67 * 68 * The CMT PGs created by this class are already arranged into a 69 * hierarchy (which is done in the pghw layer). To implement the top-down 70 * CMT load balancing algorithm, the CMT PGs additionally maintain 71 * parent, child and sibling hierarchy relationships. 72 * Parent PGs always contain a superset of their children(s) resources, 73 * each PG can have at most one parent, and siblings are the group of PGs 74 * sharing the same parent. 75 * 76 * On UMA based systems, the CMT load balancing algorithm begins by balancing 77 * load across the group of top level PGs in the system hierarchy. 78 * On NUMA systems, the CMT load balancing algorithm balances load across the 79 * group of top level PGs in each leaf lgroup...but for root homed threads, 80 * is willing to balance against all the top level PGs in the system. 81 * 82 * Groups of top level PGs are maintained to implement the above, one for each 83 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the 84 * root lgroup) that contains all the top level PGs in the system. 
 */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 * Indexed by pghw_type_t; a value of 1 marks that relationship blacklisted.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,		/* lineage is well formed */
	CMT_LINEAGE_NON_CONCENTRIC,	/* PGs in the lineage don't nest */
	CMT_LINEAGE_PG_SPANS_LGRPS,	/* a PG crosses lgroup boundaries */
	CMT_LINEAGE_NON_PROMOTABLE,	/* can't repair via PG promotion */
	CMT_LINEAGE_REPAIRED,		/* was broken, but has been fixed */
	CMT_LINEAGE_UNRECOVERABLE	/* broken beyond repair */
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;		/* PG class id */

/*
 * Forward declarations for the CMT class ops and internal helpers.
 */
static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);

/*
 * CMT PG ops
 * Entry order follows the pg_ops vector expected by the PG framework.
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,		/* alloc */
	pg_cmt_free,		/* free */
	pg_cmt_cpu_init,	/* cpu_init */
	pg_cmt_cpu_fini,	/* cpu_fini */
	pg_cmt_cpu_active,	/* cpu_active */
	pg_cmt_cpu_inactive,	/* cpu_inactive */
	pg_cmt_cpupart_in,	/* cpupart_in */
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,	/* cpupart_move */
	pg_cmt_cpu_belongs,	/* cpu_belongs */
	pg_cmt_policy_name,	/* policy_name */
};

/*
 * Initialize the CMT PG class
 * No-op when CMT scheduling has been disabled via cmt_sched_disabled.
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has
started up so 192 * that either t0 or the slave startup thread can 193 * be accounted for. 194 */ 195 void 196 pg_cmt_cpu_startup(cpu_t *cp) 197 { 198 pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread, 199 cp->cpu_thread); 200 } 201 202 /* 203 * Return non-zero if thread can migrate between "from" and "to" 204 * without a performance penalty 205 */ 206 int 207 pg_cmt_can_migrate(cpu_t *from, cpu_t *to) 208 { 209 if (from->cpu_physid->cpu_cacheid == 210 to->cpu_physid->cpu_cacheid) 211 return (1); 212 return (0); 213 } 214 215 /* 216 * CMT class specific PG allocation 217 */ 218 static pg_t * 219 pg_cmt_alloc(void) 220 { 221 return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); 222 } 223 224 /* 225 * Class specific PG de-allocation 226 */ 227 static void 228 pg_cmt_free(pg_t *pg) 229 { 230 ASSERT(pg != NULL); 231 ASSERT(IS_CMT_PG(pg)); 232 233 kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); 234 } 235 236 /* 237 * Given a hardware sharing relationship, return which dispatcher 238 * policies should be implemented to optimize performance and efficiency 239 */ 240 static pg_cmt_policy_t 241 pg_cmt_policy(pghw_type_t hw) 242 { 243 pg_cmt_policy_t p; 244 245 /* 246 * Give the platform a chance to override the default 247 */ 248 if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY) 249 return (p); 250 251 switch (hw) { 252 case PGHW_IPIPE: 253 case PGHW_FPU: 254 case PGHW_PROCNODE: 255 case PGHW_CHIP: 256 return (CMT_BALANCE); 257 case PGHW_CACHE: 258 return (CMT_AFFINITY | CMT_BALANCE); 259 case PGHW_POW_ACTIVE: 260 case PGHW_POW_IDLE: 261 return (CMT_BALANCE); 262 default: 263 return (CMT_NO_POLICY); 264 } 265 } 266 267 /* 268 * Rank the importance of optimizing for the pg1 relationship vs. 269 * the pg2 relationship. 
270 */ 271 static pg_cmt_t * 272 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2) 273 { 274 pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw; 275 pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw; 276 277 /* 278 * A power domain is only important if CPUPM is enabled. 279 */ 280 if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) { 281 if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2)) 282 return (pg2); 283 if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1)) 284 return (pg1); 285 } 286 287 /* 288 * Otherwise, ask the platform 289 */ 290 if (pg_plat_hw_rank(hw1, hw2) == hw1) 291 return (pg1); 292 else 293 return (pg2); 294 } 295 296 /* 297 * Initialize CMT callbacks for the given PG 298 */ 299 static void 300 cmt_callback_init(pg_t *pg) 301 { 302 /* 303 * Stick with the default callbacks if there isn't going to be 304 * any CMT thread placement optimizations implemented. 305 */ 306 if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY) 307 return; 308 309 switch (((pghw_t *)pg)->pghw_hw) { 310 case PGHW_POW_ACTIVE: 311 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr; 312 pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr; 313 break; 314 default: 315 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch; 316 317 } 318 } 319 320 /* 321 * Promote PG above it's current parent. 322 * This is only legal if PG has an equal or greater number of CPUs than its 323 * parent. 324 * 325 * This routine operates on the CPU specific processor group data (for the CPUs 326 * in the PG being promoted), and may be invoked from a context where one CPU's 327 * PG data is under construction. In this case the argument "pgdata", if not 328 * NULL, is a reference to the CPU's under-construction PG data. 
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;
	int		nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUS to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		int		sz;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * The CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure, so use
		 * the caller-supplied "pgdata" for it instead.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		/* PG must not be at the top, and parent must be just above */
		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data, but
 * across this routine's invocation it references a "bootstrap" structure.
 * pg_cmt_cpu_init() and the routines it calls must therefore be careful to
 * operate only on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Topology is changing, bump the PG's generation */
		((pghw_t *)pg)->pghw_generation++;

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 *
	 * NOTE(review): at the topmost level this compares against
	 * cpu_cmt_hier[level + 1], one past the last populated slot; the
	 * array is bzero'd above, so that slot reads NULL when
	 * levels < PGHW_NUM_COMPONENTS — verify levels can never equal
	 * PGHW_NUM_COMPONENTS here.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		/* After any promotion, restart the scan from the top */
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		/* The bottom (leaf) of the lineage is at level 0 */
		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/* Top level PG: homed in the leaf lgroup's group */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data, and
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/* Topology is changing, bump the PG's generation */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PG's to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPUs CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * NOTE(review): this deletes from cp->cpu_part's bitset,
		 * which is assumed to still reference the old partition at
		 * this point — confirm against the caller's ordering.
		 */
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	/* Compare against the first CPU already in the PG */
	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc, sz;
	int		start, end;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs (shell sort, ascending).
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		/*
		 * Shrink the gap; forcing 2 -> 1 guarantees the final
		 * insertion-sort pass happens.
		 */
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (start = 0; start < size; start++) {

		/*
		 * Find various contiguous sets of elements,
		 * in the array, with the same number of cpus
		 */
		end = start;
		sz = PG_NUM_CPUS(h[start]);
		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
			end++;
		/*
		 * Sort each such set of the array by rank
		 * (insertion sort; the equal-size runs are short)
		 */
		for (i = start + 1; i < end; i++) {
			j = i - 1;
			tmp = h[i];
			while (j >= start &&
			    pg_cmt_hier_rank(hier[j],
			    (pg_cmt_t *)tmp) == hier[j]) {
				h[j + 1] = h[j];
				j--;
			}
			h[j + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Walk the singly linked list of lgroups looking for a handle
	 * match; returns NULL if no match is found.
	 */
	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
1231 */ 1232 static cmt_lgrp_t * 1233 pg_cmt_lgrp_create(lgrp_handle_t hand) 1234 { 1235 cmt_lgrp_t *lgrp; 1236 1237 ASSERT(MUTEX_HELD(&cpu_lock)); 1238 1239 lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); 1240 1241 lgrp->cl_hand = hand; 1242 lgrp->cl_npgs = 0; 1243 lgrp->cl_next = cmt_lgrps; 1244 cmt_lgrps = lgrp; 1245 group_create(&lgrp->cl_pgs); 1246 1247 return (lgrp); 1248 } 1249 1250 /* 1251 * Interfaces to enable and disable power aware dispatching 1252 * The caller must be holding cpu_lock. 1253 * 1254 * Return 0 on success and -1 on failure. 1255 */ 1256 int 1257 cmt_pad_enable(pghw_type_t type) 1258 { 1259 group_t *hwset; 1260 group_iter_t iter; 1261 pg_cmt_t *pg; 1262 1263 ASSERT(PGHW_IS_PM_DOMAIN(type)); 1264 ASSERT(MUTEX_HELD(&cpu_lock)); 1265 1266 if ((hwset = pghw_set_lookup(type)) == NULL || 1267 cmt_hw_blacklisted[type]) { 1268 /* 1269 * Unable to find any instances of the specified type 1270 * of power domain, or the power domains have been blacklisted. 1271 */ 1272 return (-1); 1273 } 1274 1275 /* 1276 * Iterate over the power domains, setting the default dispatcher 1277 * policy for power/performance optimization. 1278 * 1279 * Simply setting the policy isn't enough in the case where the power 1280 * domain is an only child of another PG. Because the dispatcher walks 1281 * the PG hierarchy in a top down fashion, the higher up PG's policy 1282 * will dominate. So promote the power domain above it's parent if both 1283 * PG and it's parent have the same CPUs to ensure it's policy 1284 * dominates. 1285 */ 1286 group_iter_init(&iter); 1287 while ((pg = group_iterate(hwset, &iter)) != NULL) { 1288 /* 1289 * If the power domain is an only child to a parent 1290 * not implementing the same policy, promote the child 1291 * above the parent to activate the policy. 
1292 */ 1293 pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw); 1294 while ((pg->cmt_parent != NULL) && 1295 (pg->cmt_parent->cmt_policy != pg->cmt_policy) && 1296 (PG_NUM_CPUS((pg_t *)pg) == 1297 PG_NUM_CPUS((pg_t *)pg->cmt_parent))) { 1298 cmt_hier_promote(pg, NULL); 1299 } 1300 } 1301 1302 return (0); 1303 } 1304 1305 int 1306 cmt_pad_disable(pghw_type_t type) 1307 { 1308 group_t *hwset; 1309 group_iter_t iter; 1310 pg_cmt_t *pg; 1311 pg_cmt_t *child; 1312 1313 ASSERT(PGHW_IS_PM_DOMAIN(type)); 1314 ASSERT(MUTEX_HELD(&cpu_lock)); 1315 1316 if ((hwset = pghw_set_lookup(type)) == NULL) { 1317 /* 1318 * Unable to find any instances of the specified type of 1319 * power domain. 1320 */ 1321 return (-1); 1322 } 1323 /* 1324 * Iterate over the power domains, setting the default dispatcher 1325 * policy for performance optimization (load balancing). 1326 */ 1327 group_iter_init(&iter); 1328 while ((pg = group_iterate(hwset, &iter)) != NULL) { 1329 1330 /* 1331 * If the power domain has an only child that implements 1332 * policy other than load balancing, promote the child 1333 * above the power domain to ensure it's policy dominates. 1334 */ 1335 if (pg->cmt_children != NULL && 1336 GROUP_SIZE(pg->cmt_children) == 1) { 1337 child = GROUP_ACCESS(pg->cmt_children, 0); 1338 if ((child->cmt_policy & CMT_BALANCE) == 0) { 1339 cmt_hier_promote(child, NULL); 1340 } 1341 } 1342 pg->cmt_policy = CMT_BALANCE; 1343 } 1344 return (0); 1345 } 1346 1347 /* ARGSUSED */ 1348 static void 1349 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, 1350 kthread_t *new) 1351 { 1352 pg_cmt_t *cmt_pg = (pg_cmt_t *)pg; 1353 1354 if (old == cp->cpu_idle_thread) { 1355 atomic_add_32(&cmt_pg->cmt_utilization, 1); 1356 } else if (new == cp->cpu_idle_thread) { 1357 atomic_add_32(&cmt_pg->cmt_utilization, -1); 1358 } 1359 } 1360 1361 /* 1362 * Macro to test whether a thread is currently runnable on a CPU in a PG. 
1363 */ 1364 #define THREAD_RUNNABLE_IN_PG(t, pg) \ 1365 ((t)->t_state == TS_RUN && \ 1366 (t)->t_disp_queue->disp_cpu && \ 1367 bitset_in_set(&(pg)->cmt_cpus_actv_set, \ 1368 (t)->t_disp_queue->disp_cpu->cpu_seqid)) 1369 1370 static void 1371 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, 1372 kthread_t *new) 1373 { 1374 pg_cmt_t *cmt = (pg_cmt_t *)pg; 1375 cpupm_domain_t *dom; 1376 uint32_t u; 1377 1378 if (old == cp->cpu_idle_thread) { 1379 ASSERT(new != cp->cpu_idle_thread); 1380 u = atomic_add_32_nv(&cmt->cmt_utilization, 1); 1381 if (u == 1) { 1382 /* 1383 * Notify the CPU power manager that the domain 1384 * is non-idle. 1385 */ 1386 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 1387 cpupm_utilization_event(cp, now, dom, 1388 CPUPM_DOM_BUSY_FROM_IDLE); 1389 } 1390 } else if (new == cp->cpu_idle_thread) { 1391 ASSERT(old != cp->cpu_idle_thread); 1392 u = atomic_add_32_nv(&cmt->cmt_utilization, -1); 1393 if (u == 0) { 1394 /* 1395 * The domain is idle, notify the CPU power 1396 * manager. 1397 * 1398 * Avoid notifying if the thread is simply migrating 1399 * between CPUs in the domain. 
1400 */ 1401 if (!THREAD_RUNNABLE_IN_PG(old, cmt)) { 1402 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 1403 cpupm_utilization_event(cp, now, dom, 1404 CPUPM_DOM_IDLE_FROM_BUSY); 1405 } 1406 } 1407 } 1408 } 1409 1410 /* ARGSUSED */ 1411 static void 1412 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t) 1413 { 1414 pg_cmt_t *cmt = (pg_cmt_t *)pg; 1415 cpupm_domain_t *dom; 1416 1417 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 1418 cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY); 1419 } 1420 1421 /* 1422 * Return the name of the CMT scheduling policy 1423 * being implemented across this PG 1424 */ 1425 static char * 1426 pg_cmt_policy_name(pg_t *pg) 1427 { 1428 pg_cmt_policy_t policy; 1429 1430 policy = ((pg_cmt_t *)pg)->cmt_policy; 1431 1432 if (policy & CMT_AFFINITY) { 1433 if (policy & CMT_BALANCE) 1434 return ("Load Balancing & Affinity"); 1435 else if (policy & CMT_COALESCE) 1436 return ("Load Coalescence & Affinity"); 1437 else 1438 return ("Affinity"); 1439 } else { 1440 if (policy & CMT_BALANCE) 1441 return ("Load Balancing"); 1442 else if (policy & CMT_COALESCE) 1443 return ("Load Coalescence"); 1444 else 1445 return ("None"); 1446 } 1447 } 1448 1449 /* 1450 * Prune PG, and all other instances of PG's hardware sharing relationship 1451 * from the CMT PG hierarchy. 1452 * 1453 * This routine operates on the CPU specific processor group data (for the CPUs 1454 * in the PG being pruned), and may be invoked from a context where one CPU's 1455 * PG data is under construction. In this case the argument "pgdata", if not 1456 * NULL, is a reference to the CPU's under-construction PG data. 
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Inform pghw layer that this PG is pruned.
	 */
	pghw_cmt_fini((pghw_t *)pg_bad);

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage (shift the
	 * remaining entries down and shrink the caller's count).
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * If the CPU's lineage is under construction, it
			 * still references the bootstrap CPU PG data
			 * structure; use the caller-supplied "pgdata"
			 * instead in that case.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being promoted, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Pause CPUs for exclusivity with respect to the dispatcher
	 * while the CMT PG groups are emptied.
	 */
	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of PG's sharing relationship type) from the CMT
 * hierarchy.
 * Further, future instances of that sharing relationship type won't
 * be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPUs PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPUs lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPUs PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs. Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PGs group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 *
	 * Note: "pg" and "pg_next" retain the values they held when the
	 * error was detected (or NULL if the loop ran to completion).
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed to
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}