/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between cpus to implement
 * optimized affinity and load balancing policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities, while the
 * affinity policies seek to improve cache and TLB utilization.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems,
 * multiple top level groups are instantiated, where the top level
 * balancing begins by balancing across the CMT PGs within their
 * respective (per lgroup) top level groups.
 */
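/*
 * For illustration (hypothetical topology): on a system where each chip
 * shares a cache among its cores, a given CPU's CMT hierarchy might be
 *
 *	chip PG		<-- top level, balanced within the CPU's lgroup
 *	  |
 *	cache PG
 *	  |
 *	core PG		<-- cpu_pg->cmt_lineage (leaf)
 *
 * with cmt_parent links leading from the leaf toward the top level PG.
 */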
typedef struct cmt_lgrp {
	group_t		cl_pgs;		/* Top level group of active CMT PGs */
	int		cl_npgs;	/* # of top level PGs in the lgroup */
	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
	struct cmt_lgrp	*cl_next;	/* next cmt_lgrp */
} cmt_lgrp_t;

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
static cmt_lgrp_t	*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
static int		cmt_sched_disabled = 0;

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static void		pg_cmt_hier_pack(void **, int);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	PG_NRUN_UPDATE(cp, 1);
}
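/*
 * Note (illustrative): cmt_nrunning in each PG counts the running threads
 * across that PG's CPUs. Because a parent PG spans a superset of its
 * children's CPUs, a load update is applied at every level of a CPU's
 * lineage; e.g., a thread starting on a CPU whose lineage is
 * core -> cache -> chip bumps the count in all three PGs.
 */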
/*
 * Adjust the CMT load in the CMT PGs in which the CPU belongs
 * Note that "n" can be positive in the case of increasing
 * load, or negative in the case of decreasing load.
 */
void
pg_cmt_load(cpu_t *cp, int n)
{
	pg_cmt_t	*pg;

	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {
		ASSERT(IS_CMT_PG(pg));
		atomic_add_32(&pg->cmt_nrunning, n);
		pg = pg->cmt_parent;
	}
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Return 1 if CMT scheduling policies should be implemented
 * for the specified hardware sharing relationship.
 */
static int
pg_cmt_hw(pghw_type_t hw)
{
	return (pg_plat_cmt_load_bal_hw(hw) ||
	    pg_plat_cmt_affinity_hw(hw));
}

/*
 * CMT class callback for a new CPU entering the system
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		level, max_level, nlevels;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance relevant CMT sharing
	 * relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	max_level = nlevels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		/*
		 * We're only interested in CMT hw sharing relationships
		 */
		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
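			/*
			 * cmt_cpus_actv and cmt_cpus_actv_set track the PG's
			 * active (online) CPUs; they are maintained by the
			 * CPU active/inactive callbacks and consulted by
			 * cmt_balance().
			 */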
			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing
		 */
		if (pg_plat_cmt_load_bal_hw(hw)) {
			level = pghw_level(hw);
			cpu_cmt_hier[level] = pg;
			if (level > max_level)
				max_level = level;
			nlevels++;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/*
	 * Pack out any gaps in the constructed lineage,
	 * then size it out.
	 *
	 * Gaps may exist where the architecture knows
	 * about a hardware sharing relationship, but such a
	 * relationship either isn't relevant for load
	 * balancing or doesn't exist between CPUs on the system.
	 */
	pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
	group_expand(cmt_pgs, nlevels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy,
	 * and locate/create a suitable cmt_lgrp_t.
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU's CMT PG group
	 *	  which is used by the dispatcher to implement load balancing
	 *	  policy.
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < nlevels; level++) {
		uint_t	children;
		int	err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == nlevels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}
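	/*
	 * At this point cmt_pgs holds the CPU's load balancing lineage
	 * ordered from the top level PG (index 0) down to the leaf, which
	 * is the order in which cmt_balance() walks it, and
	 * cp->cpu_pg->cmt_lineage references the leaf PG.
	 */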
	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}
}

/*
 * Class callback when a CPU is leaving the system (deletion)
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (lgrp == NULL) {
		/*
		 * This is a bit of a special case.
		 * The only way this can happen is if the CPU's lgrp
		 * handle changed out from underneath us, which is what
		 * happens with null_proc_lpa on starcat systems.
		 *
		 * Use the initial boot CPU lgrp, since this is what
		 * we need to tear down.
		 */
		lgrp = cpu0_lgrp;
	}

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
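			/*
			 * Tear down the CMT class specific state (the
			 * active CPU group and bitset) and the pghw
			 * portion before destroying the PG itself.
			 */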
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup
			 */
			if (pg->cmt_parent == NULL) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * and bitset
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Hierarchy packing utility routine. The hierarchy order is preserved.
 */
static void
pg_cmt_hier_pack(void *hier[], int sz)
{
	int	i, j;

	for (i = 0; i < sz; i++) {
		if (hier[i] != NULL)
			continue;

		for (j = i; j < sz; j++) {
			if (hier[j] != NULL) {
				hier[i] = hier[j];
				hier[j] = NULL;
				break;
			}
		}
		if (j == sz)
			break;
	}
}
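/*
 * For illustration: with sz == 4, an input lineage of {A, NULL, B, NULL}
 * is packed by the routine above into {A, B, NULL, NULL}; the relative
 * order of the remaining PGs is preserved while gaps left by levels that
 * aren't load balanced (or don't exist on the system) are closed.
 */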
/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Perform multi-level CMT load balancing of running threads.
 *
 * tp is the thread being enqueued.
 * cp is a hint CPU, against which CMT load balancing will be performed.
 *
 * Returns cp, or a CPU better than cp with respect to balancing
 * running thread load.
 */
cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int		hint, i, cpu, nsiblings;
	int		self = 0;
	group_t		*cmt_pgs, *siblings;
	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
	int		pg_nrun, tpg_nrun;
	int		level = 0;
	cpu_t		*newcp;

	ASSERT(THREAD_LOCK_HELD(tp));

	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPU's CMT lineage
	 * If the thread is homed to the root lgroup, perform
	 * top level balancing against other top level PGs
	 * in the system. Otherwise, start with the default
	 * top level siblings group, which is within the leaf lgroup
	 */
	pg = GROUP_ACCESS(cmt_pgs, level);
	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
		siblings = &cmt_root->cl_pgs;
	else
		siblings = pg->cmt_siblings;

	/*
	 * Traverse down the lineage until we find a level that needs
	 * balancing, or we get to the end.
	 */
	for (;;) {
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
		if (nsiblings == 1)
			goto next_level;

		pg_nrun = pg->cmt_nrunning;
		if (self &&
		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
			pg_nrun--;	/* Ignore curthread's effect */

		hint = CPU_PSEUDO_RANDOM() % nsiblings;

		/*
		 * Find a balancing candidate from among our siblings
		 * "hint" is a hint for where to start looking
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			if (++i >= nsiblings)
				i = 0;
		} while (i != hint);

		if (!tpg)
			goto next_level;	/* no candidates at this level */

		/*
		 * Check if the balancing target is underloaded
		 * Decide to balance if the target is running fewer
		 * threads, or if it's running the same number of threads
		 * with more online CPUs
		 */
		tpg_nrun = tpg->cmt_nrunning;
		if (pg_nrun > tpg_nrun ||
		    (pg_nrun == tpg_nrun &&
		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
			break;
		}
		tpg = NULL;

next_level:
		if (++level == GROUP_SIZE(cmt_pgs))
			break;

		pg = GROUP_ACCESS(cmt_pgs, level);
		siblings = pg->cmt_siblings;
	}
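	/*
	 * If tpg is non-NULL at this point, it is a sibling PG at some
	 * level of the lineage that is running fewer threads than the
	 * CPU's own PG at that level (or the same number of threads with
	 * more active CPUs), so try to hand back an idle CPU from it.
	 */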
	if (tpg) {
		uint_t	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);

		/*
		 * Select an idle CPU from the target
		 */
		hint = CPU_PSEUDO_RANDOM() % tgt_size;
		cpu = hint;
		do {
			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
			if (newcp->cpu_part == tp->t_cpupart &&
			    newcp->cpu_dispatch_pri == -1) {
				cp = newcp;
				break;
			}
			if (++cpu == tgt_size)
				cpu = 0;
		} while (cpu != hint);
	}

	return (cp);
}