/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock --> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */

/*
 * The cp_default partition is allocated statically, but its lgroup load average
 * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
 * saves some memory since the space allocated reflects the actual number of
 * lgroups supported by the platform. The lgrp facility provides a temporary
 * space to hold lpl information during system bootstrap.
 */

cpupart_t		*cp_list_head;
cpupart_t		cp_default;
static cpupartid_t	cp_id_next;
uint_t			cp_numparts;
uint_t			cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t		cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
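
/*
 * For example, under the current 1-1 mapping these macros reduce to:
 *
 *	PSTOCP(PS_NONE)		== CP_DEFAULT
 *	CPTOPS(CP_DEFAULT)	== PS_NONE
 *	PSTOCP(psid)		== (cpupartid_t)psid	(any other set ID)
 */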

static int cpupart_unbind_threads(cpupart_t *, boolean_t);

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}
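
/*
 * kstat update routine for a partition's "pset" kstat.  cp_hp_avenrun[]
 * holds the partition's load averages at a higher precision (16 fractional
 * bits, hence the shift by 16 - FSHIFT below) than the traditional avenrun
 * format, so the values are scaled down to FSHIFT fractional bits before
 * being exported.  The kstat is read-only.
 */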
static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}

static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}

/*
 * Initialize the cpupart's lgrp partitions (lpls)
 */
static void
cpupart_lpl_initialize(cpupart_t *cp)
{
	int i, sz;

	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);

	for (i = 0; i < sz; i++) {
		/*
		 * The last entry of the lpl's resource set is always NULL
		 * by design (to facilitate iteration)...hence the "oversizing"
		 * by 1.
		 */
		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
		cp->cp_lgrploads[i].lpl_rset =
		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_id2rset =
		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_lgrpid = i;
	}
}

/*
 * Tear down the cpupart's lgrp partitions
 */
static void
cpupart_lpl_teardown(cpupart_t *cp)
{
	int i, sz;
	lpl_t *lpl;

	for (i = 0; i < cp->cp_nlgrploads; i++) {
		lpl = &cp->cp_lgrploads[i];

		sz = lpl->lpl_rset_sz;
		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
		lpl->lpl_rset = NULL;
		lpl->lpl_id2rset = NULL;
	}
	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
	cp->cp_lgrploads = NULL;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cpupart_lpl_initialize(&cp_default);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);


	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
	bitset_init(&cp_default.cp_haltset);
	bitset_resize(&cp_default.cp_haltset, max_ncpus);
}

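/*
 * Move CPU cp from its current partition to newpp.  The PG and cyclic
 * subsystems are notified, all CPUs are paused while the partition CPU
 * lists are relinked, and any threads whose home lgroup or last-run CPU
 * is affected are rehomed before the CPUs are restarted.  Returns EBUSY
 * if bound threads (or a bound cyclic on the last CPU of the set) prevent
 * the move; the forced flag additionally breaks hard CPU bindings.
 */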
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it.  Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the
		 * processor set.  Note that no threads should be bound to
		 * this CPU since cpupart_move_threads will refuse to move
		 * bound threads to other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If the forced flag is set, unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads from weak-binding to this CPU.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving between CPU partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strongly or weakly bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk through the active process list to look for
		 * threads that need to have a new home lgroup, or
		 * whose last CPU is the one being moved out of the
		 * partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk the thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}

	if (tp->t_cid == sysdccid) {
		return (EINVAL);	/* For now, sysdc threads can't move */
	}

	return (0);
}

/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp,
			    &tp->t_cpupart->cp_lgrploads[tp->t_lpl->lpl_lgrpid],
			    1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}


/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t *newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}

/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t *pp;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
	    KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
		pp->cp_lgrploads = NULL;
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	cpupart_lpl_initialize(pp);

	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Initialize and size the partition's bitset of halted CPUs
	 */
	bitset_init(&pp->cp_haltset);
	bitset_resize(&pp->cp_haltset, max_ncpus);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}

/*
 * Move threads from the specified partition to cp_default.  If unbind_all
 * is set, move all threads; otherwise move only soft-bound threads.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void *projbuf, *zonebuf;
	kthread_t *t;
	proc_t *p;
	int err = 0;
	psetid_t psid = pp->cp_id;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * Only threads with a revocable (soft) processor set
			 * binding can be unbound, unless a forced unbind was
			 * requested.
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}

/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t *cp, *first_cp;
	cpupart_t *pp, *newpp;
	int err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if (err = cpupart_move_cpu(cp, newpp, 0)) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(bitset_is_null(&pp->cp_haltset));

	/*
	 * Tear down the partition's group of active CMT PGs and halted
	 * CPUs now that they have all left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);
	bitset_fini(&pp->cp_haltset);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);

	cpupart_lpl_teardown(pp);

	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t *pp;
	int err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}

/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t *pp;
	uint_t ncpus;
	cpu_t *c;
	int i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}
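
/*
 * A sketch of the two-call sizing pattern the interface above supports
 * (illustrative only; cpu_lock must not be held by the caller):
 *
 *	psetid_t psid = PS_MYID;
 *	uint_t ncpus = 0;
 *	processorid_t *ids;
 *
 *	if (cpupart_get_cpus(&psid, NULL, &ncpus) == 0 && ncpus != 0) {
 *		ids = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *		(void) cpupart_get_cpus(&psid, ids, &ncpus);
 *	}
 */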

/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * PSET_NOESCAPE attribute for default cpu partition is always set
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}