1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2017 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/systm.h> 28 #include <sys/cmn_err.h> 29 #include <sys/cpuvar.h> 30 #include <sys/thread.h> 31 #include <sys/disp.h> 32 #include <sys/kmem.h> 33 #include <sys/debug.h> 34 #include <sys/cpupart.h> 35 #include <sys/pset.h> 36 #include <sys/var.h> 37 #include <sys/cyclic.h> 38 #include <sys/lgrp.h> 39 #include <sys/pghw.h> 40 #include <sys/loadavg.h> 41 #include <sys/class.h> 42 #include <sys/fss.h> 43 #include <sys/pool.h> 44 #include <sys/pool_pset.h> 45 #include <sys/policy.h> 46 47 /* 48 * Calling pool_lock() protects the pools configuration, which includes 49 * CPU partitions. cpu_lock protects the CPU partition list, and prevents 50 * partitions from being created or destroyed while the lock is held. 51 * The lock ordering with respect to related locks is: 52 * 53 * pool_lock() ---> cpu_lock ---> pidlock --> p_lock 54 * 55 * Blocking memory allocations may be made while holding "pool_lock" 56 * or cpu_lock. 57 */ 58 59 /* 60 * The cp_default partition is allocated statically, but its lgroup load average 61 * (lpl) list is allocated dynamically after kmem subsystem is initialized. This 62 * saves some memory since the space allocated reflects the actual number of 63 * lgroups supported by the platform. The lgrp facility provides a temporary 64 * space to hold lpl information during system bootstrap. 65 */ 66 67 cpupart_t *cp_list_head; 68 cpupart_t cp_default; 69 static cpupartid_t cp_id_next; 70 uint_t cp_numparts; 71 uint_t cp_numparts_nonempty; 72 73 /* 74 * Need to limit total number of partitions to avoid slowing down the 75 * clock code too much. The clock code traverses the list of 76 * partitions and needs to be able to execute in a reasonable amount 77 * of time (less than 1/hz seconds). The maximum is sized based on 78 * max_ncpus so it shouldn't be a problem unless there are large 79 * numbers of empty partitions. 80 */ 81 static uint_t cp_max_numparts; 82 83 /* 84 * Processor sets and CPU partitions are different but related concepts. 85 * A processor set is a user-level abstraction allowing users to create 86 * sets of CPUs and bind threads exclusively to those sets. A CPU 87 * partition is a kernel dispatcher object consisting of a set of CPUs 88 * and a global dispatch queue. The processor set abstraction is 89 * implemented via a CPU partition, and currently there is a 1-1 90 * mapping between processor sets and partitions (excluding the default 91 * partition, which is not visible as a processor set). Hence, the 92 * numbering for processor sets and CPU partitions is identical. This 93 * may not always be true in the future, and these macros could become 94 * less trivial if we support e.g. a processor set containing multiple 95 * CPU partitions. 96 */ 97 #define PSTOCP(psid) ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid))) 98 #define CPTOPS(cpid) ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid))) 99 100 static int cpupart_unbind_threads(cpupart_t *, boolean_t); 101 102 /* 103 * Find a CPU partition given a processor set ID. 104 */ 105 static cpupart_t * 106 cpupart_find_all(psetid_t psid) 107 { 108 cpupart_t *cp; 109 cpupartid_t cpid = PSTOCP(psid); 110 111 ASSERT(MUTEX_HELD(&cpu_lock)); 112 113 /* default partition not visible as a processor set */ 114 if (psid == CP_DEFAULT) 115 return (NULL); 116 117 if (psid == PS_MYID) 118 return (curthread->t_cpupart); 119 120 cp = cp_list_head; 121 do { 122 if (cp->cp_id == cpid) 123 return (cp); 124 cp = cp->cp_next; 125 } while (cp != cp_list_head); 126 return (NULL); 127 } 128 129 /* 130 * Find a CPU partition given a processor set ID if the processor set 131 * should be visible from the calling zone. 132 */ 133 cpupart_t * 134 cpupart_find(psetid_t psid) 135 { 136 cpupart_t *cp; 137 138 ASSERT(MUTEX_HELD(&cpu_lock)); 139 cp = cpupart_find_all(psid); 140 if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() && 141 zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id)) 142 return (NULL); 143 return (cp); 144 } 145 146 static int 147 cpupart_kstat_update(kstat_t *ksp, int rw) 148 { 149 cpupart_t *cp = (cpupart_t *)ksp->ks_private; 150 cpupart_kstat_t *cpksp = ksp->ks_data; 151 152 if (rw == KSTAT_WRITE) 153 return (EACCES); 154 155 cpksp->cpk_updates.value.ui64 = cp->cp_updates; 156 cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum; 157 cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum; 158 cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus; 159 cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >> 160 (16 - FSHIFT); 161 cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >> 162 (16 - FSHIFT); 163 cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >> 164 (16 - FSHIFT); 165 return (0); 166 } 167 168 static void 169 cpupart_kstat_create(cpupart_t *cp) 170 { 171 kstat_t *ksp; 172 zoneid_t zoneid; 173 174 ASSERT(MUTEX_HELD(&cpu_lock)); 175 176 /* 177 * We have a bit of a chicken-egg problem since this code will 178 * get called to create the kstats for CP_DEFAULT before the 179 * pools framework gets initialized. We circumvent the problem 180 * by special-casing cp_default. 181 */ 182 if (cp != &cp_default && pool_pset_enabled()) 183 zoneid = GLOBAL_ZONEID; 184 else 185 zoneid = ALL_ZONES; 186 ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc", 187 KSTAT_TYPE_NAMED, 188 sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid); 189 if (ksp != NULL) { 190 cpupart_kstat_t *cpksp = ksp->ks_data; 191 192 kstat_named_init(&cpksp->cpk_updates, "updates", 193 KSTAT_DATA_UINT64); 194 kstat_named_init(&cpksp->cpk_runnable, "runnable", 195 KSTAT_DATA_UINT64); 196 kstat_named_init(&cpksp->cpk_waiting, "waiting", 197 KSTAT_DATA_UINT64); 198 kstat_named_init(&cpksp->cpk_ncpus, "ncpus", 199 KSTAT_DATA_UINT32); 200 kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min", 201 KSTAT_DATA_UINT32); 202 kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min", 203 KSTAT_DATA_UINT32); 204 kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min", 205 KSTAT_DATA_UINT32); 206 207 ksp->ks_update = cpupart_kstat_update; 208 ksp->ks_private = cp; 209 210 kstat_install(ksp); 211 } 212 cp->cp_kstat = ksp; 213 } 214 215 /* 216 * Initialize the cpupart's lgrp partions (lpls) 217 */ 218 static void 219 cpupart_lpl_initialize(cpupart_t *cp) 220 { 221 int i, sz; 222 223 sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps(); 224 cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP); 225 226 for (i = 0; i < sz; i++) { 227 /* 228 * The last entry of the lpl's resource set is always NULL 229 * by design (to facilitate iteration)...hence the "oversizing" 230 * by 1. 231 */ 232 cp->cp_lgrploads[i].lpl_rset_sz = sz + 1; 233 cp->cp_lgrploads[i].lpl_rset = 234 kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP); 235 cp->cp_lgrploads[i].lpl_id2rset = 236 kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP); 237 cp->cp_lgrploads[i].lpl_lgrpid = i; 238 } 239 } 240 241 /* 242 * Teardown the cpupart's lgrp partitions 243 */ 244 static void 245 cpupart_lpl_teardown(cpupart_t *cp) 246 { 247 int i, sz; 248 lpl_t *lpl; 249 250 for (i = 0; i < cp->cp_nlgrploads; i++) { 251 lpl = &cp->cp_lgrploads[i]; 252 253 sz = lpl->lpl_rset_sz; 254 kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz); 255 kmem_free(lpl->lpl_id2rset, sizeof (int) * sz); 256 lpl->lpl_rset = NULL; 257 lpl->lpl_id2rset = NULL; 258 } 259 kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads); 260 cp->cp_lgrploads = NULL; 261 } 262 263 /* 264 * Initialize the default partition and kpreempt disp queue. 265 */ 266 void 267 cpupart_initialize_default(void) 268 { 269 lgrp_id_t i; 270 271 cp_list_head = &cp_default; 272 cp_default.cp_next = &cp_default; 273 cp_default.cp_prev = &cp_default; 274 cp_default.cp_id = CP_DEFAULT; 275 cp_default.cp_kp_queue.disp_maxrunpri = -1; 276 cp_default.cp_kp_queue.disp_max_unbound_pri = -1; 277 cp_default.cp_kp_queue.disp_cpu = NULL; 278 cp_default.cp_gen = 0; 279 cp_default.cp_loadavg.lg_cur = 0; 280 cp_default.cp_loadavg.lg_len = 0; 281 cp_default.cp_loadavg.lg_total = 0; 282 for (i = 0; i < S_LOADAVG_SZ; i++) { 283 cp_default.cp_loadavg.lg_loads[i] = 0; 284 } 285 DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock); 286 cp_id_next = CP_DEFAULT + 1; 287 cpupart_kstat_create(&cp_default); 288 cp_numparts = 1; 289 if (cp_max_numparts == 0) /* allow for /etc/system tuning */ 290 cp_max_numparts = max_ncpus * 2 + 1; 291 /* 292 * Allocate space for cp_default list of lgrploads 293 */ 294 cpupart_lpl_initialize(&cp_default); 295 296 /* 297 * The initial lpl topology is created in a special lpl list 298 * lpl_bootstrap. It should be copied to cp_default. 299 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point 300 * to the correct lpl in the cp_default.cp_lgrploads list. 301 */ 302 lpl_topo_bootstrap(cp_default.cp_lgrploads, 303 cp_default.cp_nlgrploads); 304 305 306 cp_default.cp_attr = PSET_NOESCAPE; 307 cp_numparts_nonempty = 1; 308 /* 309 * Set t0's home 310 */ 311 t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID]; 312 313 bitset_init(&cp_default.cp_cmt_pgs); 314 bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout); 315 316 bitset_resize(&cp_default.cp_haltset, max_ncpus); 317 } 318 319 320 static int 321 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced) 322 { 323 cpupart_t *oldpp; 324 cpu_t *ncp, *newlist; 325 kthread_t *t; 326 int move_threads = 1; 327 lgrp_id_t lgrpid; 328 proc_t *p; 329 int lgrp_diff_lpl; 330 lpl_t *cpu_lpl; 331 int ret; 332 boolean_t unbind_all_threads = (forced != 0); 333 334 ASSERT(MUTEX_HELD(&cpu_lock)); 335 ASSERT(newpp != NULL); 336 337 oldpp = cp->cpu_part; 338 ASSERT(oldpp != NULL); 339 ASSERT(oldpp->cp_ncpus > 0); 340 341 if (newpp == oldpp) { 342 /* 343 * Don't need to do anything. 344 */ 345 return (0); 346 } 347 348 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT); 349 350 if (!disp_bound_partition(cp, 0)) { 351 /* 352 * Don't need to move threads if there are no threads in 353 * the partition. Note that threads can't enter the 354 * partition while we're holding cpu_lock. 355 */ 356 move_threads = 0; 357 } else if (oldpp->cp_ncpus == 1) { 358 /* 359 * The last CPU is removed from a partition which has threads 360 * running in it. Some of these threads may be bound to this 361 * CPU. 362 * 363 * Attempt to unbind threads from the CPU and from the processor 364 * set. Note that no threads should be bound to this CPU since 365 * cpupart_move_threads will refuse to move bound threads to 366 * other CPUs. 367 */ 368 (void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE); 369 (void) cpupart_unbind_threads(oldpp, B_FALSE); 370 371 if (!disp_bound_partition(cp, 0)) { 372 /* 373 * No bound threads in this partition any more 374 */ 375 move_threads = 0; 376 } else { 377 /* 378 * There are still threads bound to the partition 379 */ 380 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); 381 return (EBUSY); 382 } 383 } 384 385 /* 386 * If forced flag is set unbind any threads from this CPU. 387 * Otherwise unbind soft-bound threads only. 388 */ 389 if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) { 390 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); 391 return (ret); 392 } 393 394 /* 395 * Stop further threads weak binding to this cpu. 396 */ 397 cpu_inmotion = cp; 398 membar_enter(); 399 400 /* 401 * Notify the Processor Groups subsystem that the CPU 402 * will be moving cpu partitions. This is done before 403 * CPUs are paused to provide an opportunity for any 404 * needed memory allocations. 405 */ 406 pg_cpupart_out(cp, oldpp); 407 pg_cpupart_in(cp, newpp); 408 409 again: 410 if (move_threads) { 411 int loop_count; 412 /* 413 * Check for threads strong or weak bound to this CPU. 414 */ 415 for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) { 416 if (loop_count >= 5) { 417 cpu_state_change_notify(cp->cpu_id, 418 CPU_CPUPART_IN); 419 pg_cpupart_out(cp, newpp); 420 pg_cpupart_in(cp, oldpp); 421 cpu_inmotion = NULL; 422 return (EBUSY); /* some threads still bound */ 423 } 424 delay(1); 425 } 426 } 427 428 /* 429 * Before we actually start changing data structures, notify 430 * the cyclic subsystem that we want to move this CPU out of its 431 * partition. 432 */ 433 if (!cyclic_move_out(cp)) { 434 /* 435 * This CPU must be the last CPU in a processor set with 436 * a bound cyclic. 437 */ 438 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); 439 pg_cpupart_out(cp, newpp); 440 pg_cpupart_in(cp, oldpp); 441 cpu_inmotion = NULL; 442 return (EBUSY); 443 } 444 445 pause_cpus(cp, NULL); 446 447 if (move_threads) { 448 /* 449 * The thread on cpu before the pause thread may have read 450 * cpu_inmotion before we raised the barrier above. Check 451 * again. 452 */ 453 if (disp_bound_threads(cp, 1)) { 454 start_cpus(); 455 goto again; 456 } 457 458 } 459 460 /* 461 * Now that CPUs are paused, let the PG subsystem perform 462 * any necessary data structure updates. 463 */ 464 pg_cpupart_move(cp, oldpp, newpp); 465 466 /* save this cpu's lgroup -- it'll be the same in the new partition */ 467 lgrpid = cp->cpu_lpl->lpl_lgrpid; 468 469 cpu_lpl = cp->cpu_lpl; 470 /* 471 * let the lgroup framework know cp has left the partition 472 */ 473 lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid); 474 475 /* move out of old partition */ 476 oldpp->cp_ncpus--; 477 if (oldpp->cp_ncpus > 0) { 478 479 ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part; 480 cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; 481 if (oldpp->cp_cpulist == cp) { 482 oldpp->cp_cpulist = ncp; 483 } 484 } else { 485 ncp = oldpp->cp_cpulist = NULL; 486 cp_numparts_nonempty--; 487 ASSERT(cp_numparts_nonempty != 0); 488 } 489 oldpp->cp_gen++; 490 491 /* move into new partition */ 492 newlist = newpp->cp_cpulist; 493 if (newlist == NULL) { 494 newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp; 495 cp_numparts_nonempty++; 496 ASSERT(cp_numparts_nonempty != 0); 497 } else { 498 cp->cpu_next_part = newlist; 499 cp->cpu_prev_part = newlist->cpu_prev_part; 500 newlist->cpu_prev_part->cpu_next_part = cp; 501 newlist->cpu_prev_part = cp; 502 } 503 cp->cpu_part = newpp; 504 newpp->cp_ncpus++; 505 newpp->cp_gen++; 506 507 ASSERT(bitset_is_null(&newpp->cp_haltset)); 508 ASSERT(bitset_is_null(&oldpp->cp_haltset)); 509 510 /* 511 * let the lgroup framework know cp has entered the partition 512 */ 513 lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid); 514 515 /* 516 * If necessary, move threads off processor. 517 */ 518 if (move_threads) { 519 ASSERT(ncp != NULL); 520 521 /* 522 * Walk thru the active process list to look for 523 * threads that need to have a new home lgroup, 524 * or the last CPU they run on is the same CPU 525 * being moved out of the partition. 526 */ 527 528 for (p = practive; p != NULL; p = p->p_next) { 529 530 t = p->p_tlist; 531 532 if (t == NULL) 533 continue; 534 535 lgrp_diff_lpl = 0; 536 537 do { 538 539 ASSERT(t->t_lpl != NULL); 540 541 /* 542 * Update the count of how many threads are 543 * in this CPU's lgroup but have a different lpl 544 */ 545 546 if (t->t_lpl != cpu_lpl && 547 t->t_lpl->lpl_lgrpid == lgrpid) 548 lgrp_diff_lpl++; 549 /* 550 * If the lgroup that t is assigned to no 551 * longer has any CPUs in t's partition, 552 * we'll have to choose a new lgroup for t. 553 */ 554 555 if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid, 556 t->t_cpupart)) { 557 lgrp_move_thread(t, 558 lgrp_choose(t, t->t_cpupart), 0); 559 } 560 561 /* 562 * make sure lpl points to our own partition 563 */ 564 ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads && 565 (t->t_lpl < t->t_cpupart->cp_lgrploads + 566 t->t_cpupart->cp_nlgrploads)); 567 568 ASSERT(t->t_lpl->lpl_ncpu > 0); 569 570 /* Update CPU last ran on if it was this CPU */ 571 if (t->t_cpu == cp && t->t_cpupart == oldpp && 572 t->t_bound_cpu != cp) { 573 t->t_cpu = disp_lowpri_cpu(ncp, 574 t->t_lpl, t->t_pri, NULL); 575 } 576 t = t->t_forw; 577 } while (t != p->p_tlist); 578 579 /* 580 * Didn't find any threads in the same lgroup as this 581 * CPU with a different lpl, so remove the lgroup from 582 * the process lgroup bitmask. 583 */ 584 585 if (lgrp_diff_lpl) 586 klgrpset_del(p->p_lgrpset, lgrpid); 587 } 588 589 /* 590 * Walk thread list looking for threads that need to be 591 * rehomed, since there are some threads that are not in 592 * their process's p_tlist. 593 */ 594 595 t = curthread; 596 597 do { 598 ASSERT(t != NULL && t->t_lpl != NULL); 599 600 /* 601 * If the lgroup that t is assigned to no 602 * longer has any CPUs in t's partition, 603 * we'll have to choose a new lgroup for t. 604 * Also, choose best lgroup for home when 605 * thread has specified lgroup affinities, 606 * since there may be an lgroup with more 607 * affinity available after moving CPUs 608 * around. 609 */ 610 if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid, 611 t->t_cpupart) || t->t_lgrp_affinity) { 612 lgrp_move_thread(t, 613 lgrp_choose(t, t->t_cpupart), 1); 614 } 615 616 /* make sure lpl points to our own partition */ 617 ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) && 618 (t->t_lpl < t->t_cpupart->cp_lgrploads + 619 t->t_cpupart->cp_nlgrploads)); 620 621 ASSERT(t->t_lpl->lpl_ncpu > 0); 622 623 /* Update CPU last ran on if it was this CPU */ 624 if (t->t_cpu == cp && t->t_cpupart == oldpp && 625 t->t_bound_cpu != cp) { 626 t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl, 627 t->t_pri, NULL); 628 } 629 630 t = t->t_next; 631 } while (t != curthread); 632 633 /* 634 * Clear off the CPU's run queue, and the kp queue if the 635 * partition is now empty. 636 */ 637 disp_cpu_inactive(cp); 638 639 /* 640 * Make cp switch to a thread from the new partition. 641 */ 642 cp->cpu_runrun = 1; 643 cp->cpu_kprunrun = 1; 644 } 645 646 cpu_inmotion = NULL; 647 start_cpus(); 648 649 /* 650 * Let anyone interested know that cpu has been added to the set. 651 */ 652 cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); 653 654 /* 655 * Now let the cyclic subsystem know that it can reshuffle cyclics 656 * bound to the new processor set. 657 */ 658 cyclic_move_in(cp); 659 660 return (0); 661 } 662 663 /* 664 * Check if thread can be moved to a new cpu partition. Called by 665 * cpupart_move_thread() and pset_bind_start(). 666 */ 667 int 668 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore) 669 { 670 ASSERT(MUTEX_HELD(&cpu_lock)); 671 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); 672 ASSERT(cp != NULL); 673 ASSERT(THREAD_LOCK_HELD(tp)); 674 675 /* 676 * CPU-bound threads can't be moved. 677 */ 678 if (!ignore) { 679 cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu : 680 tp->t_weakbound_cpu; 681 if (boundcpu != NULL && boundcpu->cpu_part != cp) 682 return (EBUSY); 683 } 684 685 if (tp->t_cid == sysdccid) { 686 return (EINVAL); /* For now, sysdc threads can't move */ 687 } 688 689 return (0); 690 } 691 692 /* 693 * Move thread to new partition. If ignore is non-zero, then CPU 694 * bindings should be ignored (this is used when destroying a 695 * partition). 696 */ 697 static int 698 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore, 699 void *projbuf, void *zonebuf) 700 { 701 cpupart_t *oldpp = tp->t_cpupart; 702 int ret; 703 704 ASSERT(MUTEX_HELD(&cpu_lock)); 705 ASSERT(MUTEX_HELD(&pidlock)); 706 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); 707 ASSERT(newpp != NULL); 708 709 if (newpp->cp_cpulist == NULL) 710 return (EINVAL); 711 712 /* 713 * Check for errors first. 714 */ 715 thread_lock(tp); 716 if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) { 717 thread_unlock(tp); 718 return (ret); 719 } 720 721 /* move the thread */ 722 if (oldpp != newpp) { 723 /* 724 * Make the thread switch to the new partition. 725 */ 726 tp->t_cpupart = newpp; 727 ASSERT(tp->t_lpl != NULL); 728 /* 729 * Leave the thread on the same lgroup if possible; otherwise 730 * choose a new lgroup for it. In either case, update its 731 * t_lpl. 732 */ 733 if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) && 734 tp->t_lgrp_affinity == NULL) { 735 /* 736 * The thread's lgroup has CPUs in the thread's new 737 * partition, so the thread can stay assigned to the 738 * same lgroup. Update its t_lpl to point to the 739 * lpl_t for its lgroup in its new partition. 740 */ 741 lgrp_move_thread(tp, &tp->t_cpupart->\ 742 cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1); 743 } else { 744 /* 745 * The thread's lgroup has no cpus in its new 746 * partition or it has specified lgroup affinities, 747 * so choose the best lgroup for the thread and 748 * assign it to that lgroup. 749 */ 750 lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart), 751 1); 752 } 753 /* 754 * make sure lpl points to our own partition 755 */ 756 ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) && 757 (tp->t_lpl < tp->t_cpupart->cp_lgrploads + 758 tp->t_cpupart->cp_nlgrploads)); 759 760 ASSERT(tp->t_lpl->lpl_ncpu > 0); 761 762 if (tp->t_state == TS_ONPROC) { 763 cpu_surrender(tp); 764 } else if (tp->t_state == TS_RUN) { 765 (void) dispdeq(tp); 766 setbackdq(tp); 767 } 768 } 769 770 /* 771 * Our binding has changed; set TP_CHANGEBIND. 772 */ 773 tp->t_proc_flag |= TP_CHANGEBIND; 774 aston(tp); 775 776 thread_unlock(tp); 777 fss_changepset(tp, newpp, projbuf, zonebuf); 778 779 return (0); /* success */ 780 } 781 782 783 /* 784 * This function binds a thread to a partition. Must be called with the 785 * p_lock of the containing process held (to keep the thread from going 786 * away), and thus also with cpu_lock held (since cpu_lock must be 787 * acquired before p_lock). If ignore is non-zero, then CPU bindings 788 * should be ignored (this is used when destroying a partition). 789 */ 790 int 791 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf, 792 void *zonebuf) 793 { 794 cpupart_t *newpp; 795 796 ASSERT(pool_lock_held()); 797 ASSERT(MUTEX_HELD(&cpu_lock)); 798 ASSERT(MUTEX_HELD(&pidlock)); 799 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); 800 801 if (psid == PS_NONE) 802 newpp = &cp_default; 803 else { 804 newpp = cpupart_find(psid); 805 if (newpp == NULL) { 806 return (EINVAL); 807 } 808 } 809 return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf)); 810 } 811 812 813 /* 814 * Create a new partition. On MP systems, this also allocates a 815 * kpreempt disp queue for that partition. 816 */ 817 int 818 cpupart_create(psetid_t *psid) 819 { 820 cpupart_t *pp; 821 822 ASSERT(pool_lock_held()); 823 824 pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP); 825 826 mutex_enter(&cpu_lock); 827 if (cp_numparts == cp_max_numparts) { 828 mutex_exit(&cpu_lock); 829 kmem_free(pp, sizeof (cpupart_t)); 830 return (ENOMEM); 831 } 832 cp_numparts++; 833 /* find the next free partition ID */ 834 while (cpupart_find(CPTOPS(cp_id_next)) != NULL) 835 cp_id_next++; 836 pp->cp_id = cp_id_next++; 837 pp->cp_ncpus = 0; 838 pp->cp_cpulist = NULL; 839 pp->cp_attr = 0; 840 klgrpset_clear(pp->cp_lgrpset); 841 pp->cp_kp_queue.disp_maxrunpri = -1; 842 pp->cp_kp_queue.disp_max_unbound_pri = -1; 843 pp->cp_kp_queue.disp_cpu = NULL; 844 pp->cp_gen = 0; 845 DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock); 846 *psid = CPTOPS(pp->cp_id); 847 disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris); 848 cpupart_kstat_create(pp); 849 cpupart_lpl_initialize(pp); 850 851 bitset_init(&pp->cp_cmt_pgs); 852 853 /* 854 * Initialize and size the partition's bitset of halted CPUs. 855 */ 856 bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout); 857 bitset_resize(&pp->cp_haltset, max_ncpus); 858 859 /* 860 * Pause all CPUs while changing the partition list, to make sure 861 * the clock thread (which traverses the list without holding 862 * cpu_lock) isn't running. 863 */ 864 pause_cpus(NULL, NULL); 865 pp->cp_next = cp_list_head; 866 pp->cp_prev = cp_list_head->cp_prev; 867 cp_list_head->cp_prev->cp_next = pp; 868 cp_list_head->cp_prev = pp; 869 start_cpus(); 870 mutex_exit(&cpu_lock); 871 872 return (0); 873 } 874 875 /* 876 * Move threads from specified partition to cp_default. If `force' is specified, 877 * move all threads, otherwise move only soft-bound threads. 878 */ 879 static int 880 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all) 881 { 882 void *projbuf, *zonebuf; 883 kthread_t *t; 884 proc_t *p; 885 int err = 0; 886 psetid_t psid = pp->cp_id; 887 888 ASSERT(pool_lock_held()); 889 ASSERT(MUTEX_HELD(&cpu_lock)); 890 891 if (pp == NULL || pp == &cp_default) { 892 return (EINVAL); 893 } 894 895 /* 896 * Pre-allocate enough buffers for FSS for all active projects and 897 * for all active zones on the system. Unused buffers will be 898 * freed later by fss_freebuf(). 899 */ 900 projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ); 901 zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE); 902 903 mutex_enter(&pidlock); 904 t = curthread; 905 do { 906 if (t->t_bind_pset == psid) { 907 again: p = ttoproc(t); 908 mutex_enter(&p->p_lock); 909 if (ttoproc(t) != p) { 910 /* 911 * lwp_exit has changed this thread's process 912 * pointer before we grabbed its p_lock. 913 */ 914 mutex_exit(&p->p_lock); 915 goto again; 916 } 917 918 /* 919 * Can only unbind threads which have revocable binding 920 * unless force unbinding requested. 921 */ 922 if (unbind_all || TB_PSET_IS_SOFT(t)) { 923 err = cpupart_bind_thread(t, PS_NONE, 1, 924 projbuf, zonebuf); 925 if (err) { 926 mutex_exit(&p->p_lock); 927 mutex_exit(&pidlock); 928 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 929 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 930 return (err); 931 } 932 t->t_bind_pset = PS_NONE; 933 } 934 mutex_exit(&p->p_lock); 935 } 936 t = t->t_next; 937 } while (t != curthread); 938 939 mutex_exit(&pidlock); 940 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 941 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 942 return (err); 943 } 944 945 /* 946 * Destroy a partition. 947 */ 948 int 949 cpupart_destroy(psetid_t psid) 950 { 951 cpu_t *cp, *first_cp; 952 cpupart_t *pp, *newpp; 953 int err = 0; 954 955 ASSERT(pool_lock_held()); 956 mutex_enter(&cpu_lock); 957 958 pp = cpupart_find(psid); 959 if (pp == NULL || pp == &cp_default) { 960 mutex_exit(&cpu_lock); 961 return (EINVAL); 962 } 963 964 /* 965 * Unbind all the threads currently bound to the partition. 966 */ 967 err = cpupart_unbind_threads(pp, B_TRUE); 968 if (err) { 969 mutex_exit(&cpu_lock); 970 return (err); 971 } 972 973 newpp = &cp_default; 974 while ((cp = pp->cp_cpulist) != NULL) { 975 if (err = cpupart_move_cpu(cp, newpp, 0)) { 976 mutex_exit(&cpu_lock); 977 return (err); 978 } 979 } 980 981 ASSERT(bitset_is_null(&pp->cp_cmt_pgs)); 982 ASSERT(bitset_is_null(&pp->cp_haltset)); 983 984 /* 985 * Teardown the partition's group of active CMT PGs and halted 986 * CPUs now that they have all left. 987 */ 988 bitset_fini(&pp->cp_cmt_pgs); 989 bitset_fini(&pp->cp_haltset); 990 991 /* 992 * Reset the pointers in any offline processors so they won't 993 * try to rejoin the destroyed partition when they're turned 994 * online. 995 */ 996 first_cp = cp = CPU; 997 do { 998 if (cp->cpu_part == pp) { 999 ASSERT(cp->cpu_flags & CPU_OFFLINE); 1000 cp->cpu_part = newpp; 1001 } 1002 cp = cp->cpu_next; 1003 } while (cp != first_cp); 1004 1005 /* 1006 * Pause all CPUs while changing the partition list, to make sure 1007 * the clock thread (which traverses the list without holding 1008 * cpu_lock) isn't running. 1009 */ 1010 pause_cpus(NULL, NULL); 1011 pp->cp_prev->cp_next = pp->cp_next; 1012 pp->cp_next->cp_prev = pp->cp_prev; 1013 if (cp_list_head == pp) 1014 cp_list_head = pp->cp_next; 1015 start_cpus(); 1016 1017 if (cp_id_next > pp->cp_id) 1018 cp_id_next = pp->cp_id; 1019 1020 if (pp->cp_kstat) 1021 kstat_delete(pp->cp_kstat); 1022 1023 cp_numparts--; 1024 1025 disp_kp_free(&pp->cp_kp_queue); 1026 1027 cpupart_lpl_teardown(pp); 1028 1029 kmem_free(pp, sizeof (cpupart_t)); 1030 mutex_exit(&cpu_lock); 1031 1032 return (err); 1033 } 1034 1035 1036 /* 1037 * Return the ID of the partition to which the specified processor belongs. 1038 */ 1039 psetid_t 1040 cpupart_query_cpu(cpu_t *cp) 1041 { 1042 ASSERT(MUTEX_HELD(&cpu_lock)); 1043 1044 return (CPTOPS(cp->cpu_part->cp_id)); 1045 } 1046 1047 1048 /* 1049 * Attach a processor to an existing partition. 1050 */ 1051 int 1052 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced) 1053 { 1054 cpupart_t *pp; 1055 int err; 1056 1057 ASSERT(pool_lock_held()); 1058 ASSERT(MUTEX_HELD(&cpu_lock)); 1059 1060 pp = cpupart_find(psid); 1061 if (pp == NULL) 1062 return (EINVAL); 1063 if (cp->cpu_flags & CPU_OFFLINE) 1064 return (EINVAL); 1065 1066 err = cpupart_move_cpu(cp, pp, forced); 1067 return (err); 1068 } 1069 1070 /* 1071 * Get a list of cpus belonging to the partition. If numcpus is NULL, 1072 * this just checks for a valid partition. If numcpus is non-NULL but 1073 * cpulist is NULL, the current number of cpus is stored in *numcpus. 1074 * If both are non-NULL, the current number of cpus is stored in *numcpus, 1075 * and a list of those cpus up to the size originally in *numcpus is 1076 * stored in cpulist[]. Also, store the processor set id in *psid. 1077 * This is useful in case the processor set id passed in was PS_MYID. 1078 */ 1079 int 1080 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus) 1081 { 1082 cpupart_t *pp; 1083 uint_t ncpus; 1084 cpu_t *c; 1085 int i; 1086 1087 mutex_enter(&cpu_lock); 1088 pp = cpupart_find(*psid); 1089 if (pp == NULL) { 1090 mutex_exit(&cpu_lock); 1091 return (EINVAL); 1092 } 1093 *psid = CPTOPS(pp->cp_id); 1094 ncpus = pp->cp_ncpus; 1095 if (numcpus) { 1096 if (ncpus > *numcpus) { 1097 /* 1098 * Only copy as many cpus as were passed in, but 1099 * pass back the real number. 1100 */ 1101 uint_t t = ncpus; 1102 ncpus = *numcpus; 1103 *numcpus = t; 1104 } else 1105 *numcpus = ncpus; 1106 1107 if (cpulist) { 1108 c = pp->cp_cpulist; 1109 for (i = 0; i < ncpus; i++) { 1110 ASSERT(c != NULL); 1111 cpulist[i] = c->cpu_id; 1112 c = c->cpu_next_part; 1113 } 1114 } 1115 } 1116 mutex_exit(&cpu_lock); 1117 return (0); 1118 } 1119 1120 /* 1121 * Reallocate kpreempt queues for each CPU partition. Called from 1122 * disp_setup when a new scheduling class is loaded that increases the 1123 * number of priorities in the system. 1124 */ 1125 void 1126 cpupart_kpqalloc(pri_t npri) 1127 { 1128 cpupart_t *cpp; 1129 1130 ASSERT(MUTEX_HELD(&cpu_lock)); 1131 cpp = cp_list_head; 1132 do { 1133 disp_kp_alloc(&cpp->cp_kp_queue, npri); 1134 cpp = cpp->cp_next; 1135 } while (cpp != cp_list_head); 1136 } 1137 1138 int 1139 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem) 1140 { 1141 cpupart_t *cp; 1142 int i; 1143 1144 ASSERT(nelem >= 0); 1145 ASSERT(nelem <= LOADAVG_NSTATS); 1146 ASSERT(MUTEX_HELD(&cpu_lock)); 1147 1148 cp = cpupart_find(psid); 1149 if (cp == NULL) 1150 return (EINVAL); 1151 for (i = 0; i < nelem; i++) 1152 buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT); 1153 1154 return (0); 1155 } 1156 1157 1158 uint_t 1159 cpupart_list(psetid_t *list, uint_t nelem, int flag) 1160 { 1161 uint_t numpart = 0; 1162 cpupart_t *cp; 1163 1164 ASSERT(MUTEX_HELD(&cpu_lock)); 1165 ASSERT(flag == CP_ALL || flag == CP_NONEMPTY); 1166 1167 if (list != NULL) { 1168 cp = cp_list_head; 1169 do { 1170 if (((flag == CP_ALL) && (cp != &cp_default)) || 1171 ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) { 1172 if (numpart == nelem) 1173 break; 1174 list[numpart++] = CPTOPS(cp->cp_id); 1175 } 1176 cp = cp->cp_next; 1177 } while (cp != cp_list_head); 1178 } 1179 1180 ASSERT(numpart < cp_numparts); 1181 1182 if (flag == CP_ALL) 1183 numpart = cp_numparts - 1; /* leave out default partition */ 1184 else if (flag == CP_NONEMPTY) 1185 numpart = cp_numparts_nonempty; 1186 1187 return (numpart); 1188 } 1189 1190 int 1191 cpupart_setattr(psetid_t psid, uint_t attr) 1192 { 1193 cpupart_t *cp; 1194 1195 ASSERT(pool_lock_held()); 1196 1197 mutex_enter(&cpu_lock); 1198 if ((cp = cpupart_find(psid)) == NULL) { 1199 mutex_exit(&cpu_lock); 1200 return (EINVAL); 1201 } 1202 /* 1203 * PSET_NOESCAPE attribute for default cpu partition is always set 1204 */ 1205 if (cp == &cp_default && !(attr & PSET_NOESCAPE)) { 1206 mutex_exit(&cpu_lock); 1207 return (EINVAL); 1208 } 1209 cp->cp_attr = attr; 1210 mutex_exit(&cpu_lock); 1211 return (0); 1212 } 1213 1214 int 1215 cpupart_getattr(psetid_t psid, uint_t *attrp) 1216 { 1217 cpupart_t *cp; 1218 1219 mutex_enter(&cpu_lock); 1220 if ((cp = cpupart_find(psid)) == NULL) { 1221 mutex_exit(&cpu_lock); 1222 return (EINVAL); 1223 } 1224 *attrp = cp->cp_attr; 1225 mutex_exit(&cpu_lock); 1226 return (0); 1227 } 1228