/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup...but for root homed threads,
 * is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
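 *
 * As an illustrative sketch (assuming a hypothetical two-socket NUMA system
 * with one leaf lgroup per socket and one top level CMT PG per socket), the
 * groups described above would contain:
 *
 *	cmt_root->cl_pgs:	{ socket 0 PG, socket 1 PG }
 *	leaf lgroup 0 cl_pgs:	{ socket 0 PG }
 *	leaf lgroup 1 cl_pgs:	{ socket 1 PG }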
 */

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);


/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_PROCNODE:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	/*
	 * Stick with the default callbacks if there aren't going to be
	 * any CMT thread placement optimizations implemented.
	 */
	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
		return;

	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;

	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * A CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
		ASSERT(idx > 0);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
 * calls must be careful to operate only on the "pgdata" argument, and not
 * cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition.
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * bitmap.
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc, sz;
	int		start, end;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (start = 0; start < size; start++) {

		/*
		 * Find the various contiguous sets of elements in the
		 * array with the same number of CPUs
		 */
		end = start;
		sz = PG_NUM_CPUS(h[start]);
		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
			end++;
		/*
		 * Sort each such set of the array by rank
		 */
		for (i = start + 1; i < end; i++) {
			j = i - 1;
			tmp = h[i];
			while (j >= start &&
			    pg_cmt_hier_rank(hier[j],
			    (pg_cmt_t *)tmp) == hier[j]) {
				h[j + 1] = h[j];
				j--;
			}
			h[j + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * a policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
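		 * (Because the dispatcher walks the PG hierarchy top down,
		 * a policy left on the larger parent PG would otherwise take
		 * precedence over the child's.)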
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * A CPU whose lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being promoted, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of that PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent,
		 * and this PG should be promoted to be above its existing
		 * parent now. We need to verify all of this to defend against
		 * a buggy BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if pg has fewer
					 * CPUs than cmt_parent...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}