/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock --> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */

/*
 * The cp_default partition is allocated statically, but its lgroup load average
 * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
 * saves some memory since the space allocated reflects the actual number of
 * lgroups supported by the platform. The lgrp facility provides a temporary
 * space to hold lpl information during system bootstrap.
 */

cpupart_t		*cp_list_head;
cpupart_t		cp_default;
struct mach_cpupart	cp_default_mach;
static cpupartid_t	cp_id_next;
uint_t			cp_numparts;
uint_t			cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t		cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
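
/*
 * Illustrative examples of the current 1-1 mapping (informational only,
 * not used by the code):
 *
 *	PSTOCP(PS_NONE) == CP_DEFAULT		CPTOPS(CP_DEFAULT) == PS_NONE
 *	PSTOCP(1) == (cpupartid_t)1		CPTOPS(1) == (psetid_t)1
 */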

static int cpupart_unbind_threads(cpupart_t *, boolean_t);

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}
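
/*
 * Note on the avenrun conversion above: cp_hp_avenrun[] carries extra
 * fractional precision (16 bits of fraction) while the exported kstat
 * values are expected to carry FSHIFT fractional bits, hence the shift by
 * (16 - FSHIFT).  For example, a steady load of 1.0 is kept internally as
 * (1 << 16) and exported as (1 << FSHIFT).
 */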

static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
	    cp_default.cp_nlgrploads, KM_SLEEP);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);

	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
	}
	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
}
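
/*
 * Rough outline of cpupart_move_cpu() below (informational only):
 *
 *	1. Try to unbind threads bound to the CPU (and to the partition, if
 *	   this is its last CPU); fail with EBUSY if bindings remain.
 *	2. Set cpu_inmotion to stop new weak bindings, and notify the PG and
 *	   cyclic subsystems before any CPUs are paused.
 *	3. pause_cpus(), splice the CPU out of the old partition's circular
 *	   cpu_next_part/cpu_prev_part list and into the new one.
 *	4. Rehome threads whose lgroup no longer has CPUs in their partition,
 *	   then start_cpus() and let cyclics move back in.
 */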

static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it.  Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the processor
		 * set.  Note that no threads should be bound to this CPU since
		 * cpupart_move_thread() will refuse to move bound threads to
		 * other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If forced flag is set unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strongly or weakly bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}
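
	/*
	 * From here until start_cpus(), all other CPUs are paused, so the
	 * partition CPU lists and the threads' home lgroup (lpl) pointers
	 * below can be updated without those threads running underneath us.
	 */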

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}
	return (0);
}
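
/*
 * Typical (illustrative) use of cpupart_movable_thread() above: the caller
 * already holds cpu_lock, the process's p_lock and the thread lock, and
 * treats a non-zero return (EBUSY) as "thread is bound to a CPU outside the
 * target partition and bindings are not being ignored".
 */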

/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp, &tp->t_cpupart->
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}


/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t	*newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}
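
/*
 * Note: cpupart_bind_thread() above is the common entry point for moving a
 * thread between partitions; cpupart_unbind_threads() below, for example,
 * calls it with PS_NONE to send threads back to cp_default.
 */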

/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t	*pp;
	lgrp_id_t	i;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
	pp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
	    KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
		pp->cp_lgrploads = NULL;
		kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	CPUSET_ZERO(pp->cp_mach->mc_haltset);
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	for (i = 0; i < pp->cp_nlgrploads; i++) {
		pp->cp_lgrploads[i].lpl_lgrpid = i;
	}
	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}
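
/*
 * A partition created above starts out empty (cp_ncpus == 0, cp_cpulist ==
 * NULL); CPUs are added to it later through cpupart_attach_cpu(), typically
 * in response to a processor set assignment request.
 */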

/*
 * Move threads from specified partition to cp_default.  If `unbind_all' is
 * set, move all threads, otherwise move only soft-bound threads.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void	*projbuf, *zonebuf;
	kthread_t *t;
	proc_t	*p;
	int	err = 0;
	psetid_t psid = pp->cp_id;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * Can only unbind threads which have revocable binding
			 * unless force unbinding requested.
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}

/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t	*cp, *first_cp;
	cpupart_t *pp, *newpp;
	int	err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if (err = cpupart_move_cpu(cp, newpp, 0)) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs now that
	 * all of the CPUs have left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);
	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
	pp->cp_lgrploads = NULL;
	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t	*pp;
	int		err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}
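
/*
 * Illustrative two-step use of cpupart_get_cpus() below (caller-side sketch,
 * not code from this file): query the count first, then fetch the list.
 * Note that the count may change between the two calls, since cpu_lock is
 * dropped in between.
 *
 *	uint_t ncpus = 0;
 *	(void) cpupart_get_cpus(&psid, NULL, &ncpus);
 *	cpulist = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *	(void) cpupart_get_cpus(&psid, cpulist, &ncpus);
 */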

/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t	*pp;
	uint_t		ncpus;
	cpu_t		*c;
	int		i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}
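
/*
 * Note: cpupart_list() above fills in at most nelem IDs but always returns
 * the total number of matching partitions, so a caller can detect a
 * truncated listing and retry with a larger buffer.
 */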

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * PSET_NOESCAPE attribute for default cpu partition is always set
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}