/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock --> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */

/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after the kmem subsystem is
 * initialized.  This saves some memory since the space allocated reflects
 * the actual number of lgroups supported by the platform.  The lgrp facility
 * provides a temporary space to hold lpl information during system bootstrap.
 */

cpupart_t *cp_list_head;
cpupart_t cp_default;
static cpupartid_t cp_id_next;
uint_t cp_numparts;
uint_t cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
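/*
 * Illustrative mapping (this follows directly from the macro definitions
 * above; the concrete IDs are only examples): PSTOCP(PS_NONE) == CP_DEFAULT
 * and CPTOPS(CP_DEFAULT) == PS_NONE, while any other ID passes through
 * unchanged, e.g. PSTOCP(1) == 1 and CPTOPS(1) == 1.
 */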
static int cpupart_unbind_threads(cpupart_t *, boolean_t);

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}

static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}
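/*
 * Informational note: the kstat created above is named unix:<cp_id>:pset,
 * so the per-pset statistics can typically be read from userland with
 * something like (the command usage is an illustration, not taken from
 * this file):
 *
 *	kstat -m unix -i <psetid> -n pset
 *
 * The avenrun_* values are exported in the same fixed-point format as the
 * system-wide load averages, hence the >> (16 - FSHIFT) scaling done in
 * cpupart_kstat_update() above.
 */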
/*
 * Initialize the cpupart's lgrp partitions (lpls)
 */
static void
cpupart_lpl_initialize(cpupart_t *cp)
{
	int i, sz;

	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);

	for (i = 0; i < sz; i++) {
		/*
		 * The last entry of the lpl's resource set is always NULL
		 * by design (to facilitate iteration)...hence the "oversizing"
		 * by 1.
		 */
		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
		cp->cp_lgrploads[i].lpl_rset =
		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_id2rset =
		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
		cp->cp_lgrploads[i].lpl_lgrpid = i;
	}
}
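/*
 * Illustrative sizing sketch (the numbers are an example, not a
 * requirement): on a platform where lgrp_plat_max_lgrps() returns 4, the
 * loop above creates 4 lpls, each with a 5-entry lpl_rset and lpl_id2rset
 * array.  The extra slot stays zeroed and serves as the NULL terminator
 * described in the comment above, so rset walkers can stop at the first
 * NULL entry instead of carrying a separate count.
 */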
/*
 * Teardown the cpupart's lgrp partitions
 */
static void
cpupart_lpl_teardown(cpupart_t *cp)
{
	int i, sz;
	lpl_t *lpl;

	for (i = 0; i < cp->cp_nlgrploads; i++) {
		lpl = &cp->cp_lgrploads[i];

		sz = lpl->lpl_rset_sz;
		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
		lpl->lpl_rset = NULL;
		lpl->lpl_id2rset = NULL;
	}
	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
	cp->cp_lgrploads = NULL;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cpupart_lpl_initialize(&cp_default);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to
	 *	 point to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);


	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
	bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);

	bitset_resize(&cp_default.cp_haltset, max_ncpus);
}

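/*
 * High-level sketch of cpupart_move_cpu() below (an informational summary
 * of the code, not an additional contract): the CPU is announced as leaving
 * (CPU_CPUPART_OUT), bound threads are unbound where possible, cpu_inmotion
 * is set to stop new weak bindings, and the PG and cyclic subsystems are
 * given a chance to prepare.  CPUs are then paused while this CPU is
 * unlinked from the old partition's list and linked into the new one, after
 * which threads are rehomed to valid lgroups, the run queue is drained via
 * disp_cpu_inactive(), and the CPUs are restarted.  Any failure before the
 * pause backs out, announces CPU_CPUPART_IN, and returns EBUSY or the
 * unbind error.
 */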
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it.  Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the
		 * processor set.  Note that no threads should be bound to
		 * this CPU since cpupart_move_threads will refuse to move
		 * bound threads to other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If the forced flag is set, unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads from weak-binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or whose last CPU is the one being moved out
		 * of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}

	if (tp->t_cid == sysdccid) {
		return (EINVAL);	/* For now, sysdc threads can't move */
	}

	return (0);
}

/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp,
			    &tp->t_cpupart->cp_lgrploads[tp->t_lpl->lpl_lgrpid],
			    1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}


/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t *newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}


/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t *pp;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
	    KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
		pp->cp_lgrploads = NULL;
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	cpupart_lpl_initialize(pp);

	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Initialize and size the partition's bitset of halted CPUs.
	 */
	bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
	bitset_resize(&pp->cp_haltset, max_ncpus);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}

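/*
 * Informational sketch of assumed usage (the caller code is hypothetical,
 * not taken from this file): a caller that already holds pool_lock() would
 * pair the two interfaces roughly as
 *
 *	psetid_t psid;
 *
 *	if (cpupart_create(&psid) == 0) {
 *		... attach CPUs with cpupart_attach_cpu(), bind threads ...
 *		(void) cpupart_destroy(psid);
 *	}
 *
 * cpupart_destroy() below first pushes threads and CPUs back to cp_default
 * and then frees everything cpupart_create() set up (kstat, kpreempt queue,
 * lpls).
 */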
/*
 * Move threads from specified partition to cp_default.  If `unbind_all' is
 * B_TRUE, move all threads; otherwise move only soft-bound threads.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void *projbuf, *zonebuf;
	kthread_t *t;
	proc_t *p;
	int err = 0;
	psetid_t psid = pp->cp_id;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * Can only unbind threads which have revocable binding
			 * unless force unbinding requested.
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}

/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t *cp, *first_cp;
	cpupart_t *pp, *newpp;
	int err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(bitset_is_null(&pp->cp_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs and halted
	 * CPUs now that they have all left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);
	bitset_fini(&pp->cp_haltset);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're brought
	 * back online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);

	cpupart_lpl_teardown(pp);

	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t *pp;
	int err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}

/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t *pp;
	uint_t ncpus;
	cpu_t *c;
	int i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

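/*
 * Informational sketch of the intended calling pattern (the caller code is
 * hypothetical, not taken from this file): query the size first, then fetch
 * the list with a buffer of that size.
 *
 *	uint_t ncpus = 0;
 *	processorid_t *cpulist;
 *
 *	(void) cpupart_get_cpus(&psid, NULL, &ncpus);
 *	cpulist = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *	(void) cpupart_get_cpus(&psid, cpulist, &ncpus);
 *
 * If CPUs joined the partition between the two calls, the second call
 * copies only as many IDs as fit but passes back the new total in ncpus,
 * as described in the block comment above.
 */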
/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * The PSET_NOESCAPE attribute is always set for the default
	 * cpu partition.
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}