/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/pghw.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock --> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */

/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after the kmem subsystem is
 * initialized.  This saves some memory since the space allocated reflects
 * the actual number of lgroups supported by the platform.  The lgrp facility
 * provides a temporary space to hold lpl information during system bootstrap.
 */

cpupart_t *cp_list_head;
cpupart_t cp_default;
struct mach_cpupart cp_default_mach;
static cpupartid_t cp_id_next;
uint_t cp_numparts;
uint_t cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}

static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
	    cp_default.cp_nlgrploads, KM_SLEEP);

	/*
	 * The initial lpl topology is created in a special lpl list,
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0's cpu_lpl pointer to
	 * point to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);

	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
	}
	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
}


static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (EBUSY);
	}

	if (forced && (ret = cpu_unbind(cp->cpu_id)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads from weak-binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions.  This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strongly or weakly bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk through the active process list to look for
		 * threads that need to have a new home lgroup, or
		 * whose last CPU is the one being moved out of the
		 * partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}
	return (0);
}

/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp, &tp->t_cpupart->
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}


/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t *newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}


/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t *pp;
	lgrp_id_t i;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
	pp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
	    KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
		pp->cp_lgrploads = NULL;
		kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	CPUSET_ZERO(pp->cp_mach->mc_haltset);
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	for (i = 0; i < pp->cp_nlgrploads; i++) {
		pp->cp_lgrploads[i].lpl_lgrpid = i;
	}
	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}


/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t *cp, *first_cp;
	cpupart_t *pp, *newpp;
	int err = 0;
	void *projbuf, *zonebuf;
	kthread_t *t;
	proc_t *p;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	/*
	 * First need to unbind all the threads currently bound to the
	 * partition.  Then do the actual destroy (which moves the CPUs).
	 */
	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}
			err = cpupart_bind_thread(t, PS_NONE, 1,
			    projbuf, zonebuf);
			if (err) {
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				mutex_exit(&cpu_lock);
				fss_freebuf(projbuf, FSS_ALLOC_PROJ);
				fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
				return (err);
			}
			t->t_bind_pset = PS_NONE;
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if (err = cpupart_move_cpu(cp, newpp, 0)) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs now that
	 * all of the CPUs have left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);
	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
	pp->cp_lgrploads = NULL;
	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t *pp;
	int err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}

/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t *pp;
	uint_t ncpus;
	cpu_t *c;
	int i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * The PSET_NOESCAPE attribute for the default cpu partition is
	 * always set.
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}
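
/*
 * Illustrative sketch, not part of the original file: a minimal example of
 * how a kernel caller might use the interfaces above to create a new
 * processor set and attach a CPU to it, following the documented
 * pool_lock() ---> cpu_lock ordering.  The function name, the
 * CPUPART_EXAMPLE guard, and the cpu id argument "cpun" are hypothetical;
 * real callers (the pset system call and the pools commit path) perform
 * considerably more validation than this sketch.
 */
#ifdef CPUPART_EXAMPLE
static int
cpupart_example_create_and_attach(processorid_t cpun)
{
	psetid_t psid;
	cpu_t *cp;
	int err;

	/* pool_lock() is taken first, per the lock ordering comment above */
	pool_lock();
	if ((err = cpupart_create(&psid)) != 0) {
		pool_unlock();
		return (err);
	}

	/* cpupart_attach_cpu() requires cpu_lock in addition to pool_lock */
	mutex_enter(&cpu_lock);
	cp = cpu_get(cpun);		/* assumed lookup; needs cpu_lock */
	if (cp == NULL)
		err = EINVAL;
	else
		err = cpupart_attach_cpu(psid, cp, 0);
	mutex_exit(&cpu_lock);

	/* on failure, destroy the (still empty) partition we just created */
	if (err != 0)
		(void) cpupart_destroy(psid);
	pool_unlock();
	return (err);
}
#endif	/* CPUPART_EXAMPLE */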