/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ----------------------------  <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity and load balancing policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities, while the
 * affinity policies seek to improve cache and TLB utilization.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a single top level group of PGs to balance across. On NUMA
 * systems multiple top level groups are instantiated (one per lgroup),
 * and top level balancing begins by balancing across the CMT PGs within
 * their respective (per lgroup) top level groups.
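 *
 * For illustration only (a hypothetical topology, not a description of
 * any particular platform): a chip containing two cores that share a
 * cache, with two hardware strands per core, might yield a lineage of
 * three CMT PGs for each CPU: a core PG (2 CPUs) whose parent is the
 * shared cache PG (4 CPUs), whose parent in turn is the top level chip
 * PG (4 CPUs). cmt_balance() below walks such a lineage top-down,
 * looking for a less loaded sibling PG at each level.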
 */
typedef struct cmt_lgrp {
	group_t		cl_pgs;		/* Top level group of active CMT PGs */
	int		cl_npgs;	/* # of top level PGs in the lgroup */
	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
	struct cmt_lgrp	*cl_next;	/* next cmt_lgrp */
} cmt_lgrp_t;

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
static cmt_lgrp_t	*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Set this to non-zero to disable CMT scheduling.
 * This must be done via kmdb -d, as /etc/system will be too late.
 */
static int		cmt_sched_disabled = 0;

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static void		pg_cmt_hier_pack(void **, int);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	PG_NRUN_UPDATE(cp, 1);
}

/*
 * Adjust the CMT load in the CMT PGs in which the CPU belongs.
 * Note that "n" can be positive in the case of increasing
 * load, or negative in the case of decreasing load.
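 *
 * For example, when the load is updated via PG_NRUN_UPDATE(cp, 1) (as in
 * pg_cmt_cpu_startup() above), cmt_nrunning is incremented in every PG
 * along cp's lineage, from the leaf PG up through its top level ancestor;
 * a later update of -1 reverses that accounting.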
 */
void
pg_cmt_load(cpu_t *cp, int n)
{
	pg_cmt_t	*pg;

	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {
		ASSERT(IS_CMT_PG(pg));
		atomic_add_32(&pg->cmt_nrunning, n);
		pg = pg->cmt_parent;
	}
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Return 1 if CMT scheduling policies should be implemented
 * for the specified hardware sharing relationship.
 */
static int
pg_cmt_hw(pghw_type_t hw)
{
	return (pg_plat_cmt_load_bal_hw(hw) ||
	    pg_plat_cmt_affinity_hw(hw));
}

/*
 * CMT class callback for a new CPU entering the system
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		level, max_level, nlevels;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance relevant CMT sharing
	 * relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	max_level = nlevels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		/*
		 * We're only interested in CMT hw sharing relationships
		 */
		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing
		 */
		if (pg_plat_cmt_load_bal_hw(hw)) {
			level = pghw_level(hw);
			cpu_cmt_hier[level] = pg;
			if (level > max_level)
				max_level = level;
			nlevels++;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/*
	 * Pack out any gaps in the constructed lineage,
	 * then size it out.
	 *
	 * Gaps may exist where the architecture knows
	 * about a hardware sharing relationship, but such a
	 * relationship either isn't relevant for load
	 * balancing or doesn't exist between CPUs on the system.
	 */
	pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
	group_expand(cmt_pgs, nlevels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy,
	 * and locate/create a suitable cmt_lgrp_t.
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU's CMT PG group
	 *	  which is used by the dispatcher to implement load
	 *	  balancing policy.
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < nlevels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == nlevels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of its children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this CPU has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}
}

/*
 * Class callback when a CPU is leaving the system (deletion)
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup
			 */
			if (pg->cmt_parent == NULL) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * and bitset
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Hierarchy packing utility routine. The hierarchy order is preserved.
 */
static void
pg_cmt_hier_pack(void *hier[], int sz)
{
	int	i, j;

	for (i = 0; i < sz; i++) {
		if (hier[i] != NULL)
			continue;

		/*
		 * Find the next non-NULL entry and shift it into the
		 * gap at index i, preserving relative order.
		 */
		for (j = i; j < sz; j++) {
			if (hier[j] != NULL) {
				hier[i] = hier[j];
				hier[j] = NULL;
				break;
			}
		}
		if (j == sz)
			break;
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
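 *
 * The new entry is linked onto the head of the global cmt_lgrps list
 * searched by pg_cmt_find_lgrp(); no teardown path exists in this file,
 * so entries persist once created.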
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Perform multi-level CMT load balancing of running threads.
 *
 * tp is the thread being enqueued.
 * cp is a hint CPU, against which CMT load balancing will be performed.
 *
 * Returns cp, or a CPU better than cp with respect to balancing
 * running thread load.
 */
cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int		hint, i, cpu, nsiblings;
	int		self = 0;
	group_t		*cmt_pgs, *siblings;
	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
	int		pg_nrun, tpg_nrun;
	int		level = 0;
	cpu_t		*newcp;

	ASSERT(THREAD_LOCK_HELD(tp));

	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPU's CMT lineage.
	 * If the thread is homed to the root lgroup, perform
	 * top level balancing against other top level PGs
	 * in the system. Otherwise, start with the default
	 * top level siblings group, which is within the leaf lgroup.
	 */
	pg = GROUP_ACCESS(cmt_pgs, level);
	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
		siblings = &cmt_root->cl_pgs;
	else
		siblings = pg->cmt_siblings;

	/*
	 * Traverse down the lineage until we find a level that needs
	 * balancing, or we get to the end.
	 */
	for (;;) {
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
		if (nsiblings == 1)
			goto next_level;

		pg_nrun = pg->cmt_nrunning;
		if (self &&
		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
			pg_nrun--;	/* Ignore curthread's effect */

		hint = CPU_PSEUDO_RANDOM() % nsiblings;

		/*
		 * Find a balancing candidate from among our siblings.
		 * "hint" is a hint for where to start looking.
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			if (++i >= nsiblings)
				i = 0;
		} while (i != hint);

		if (!tpg)
			goto next_level;  /* no candidates at this level */

		/*
		 * Check if the balancing target is underloaded.
		 * Decide to balance if the target is running fewer
		 * threads, or if it's running the same number of threads
		 * with more online CPUs.
		 */
		tpg_nrun = tpg->cmt_nrunning;
		if (pg_nrun > tpg_nrun ||
		    (pg_nrun == tpg_nrun &&
		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
			break;
		}
		tpg = NULL;

next_level:
		if (++level == GROUP_SIZE(cmt_pgs))
			break;

		pg = GROUP_ACCESS(cmt_pgs, level);
		siblings = pg->cmt_siblings;
	}

	if (tpg) {
		uint_t	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);

		/*
		 * Select an idle CPU from the target PG
		 */
		hint = CPU_PSEUDO_RANDOM() % tgt_size;
		cpu = hint;
		do {
			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
			if (newcp->cpu_part == tp->t_cpupart &&
			    newcp->cpu_dispatch_pri == -1) {
				cp = newcp;
				break;
			}
			if (++cpu == tgt_size)
				cpu = 0;
		} while (cpu != hint);
	}

	return (cp);
}