/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/chip.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */
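/*
 * Illustrative sketch only (not part of the implementation): a caller that
 * needed all four locks would acquire and release them in the order above,
 * where "p" is a hypothetical proc_t pointer:
 *
 *	pool_lock();
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&pidlock);
 *	mutex_enter(&p->p_lock);
 *	...
 *	mutex_exit(&p->p_lock);
 *	mutex_exit(&pidlock);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 */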
/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after kmem subsystem is
 * initialized.  This saves some memory since the space allocated reflects
 * the actual number of lgroups supported by the platform.  The lgrp
 * facility provides a temporary space to hold lpl information during
 * system bootstrap.
 */

cpupart_t		*cp_list_head;
cpupart_t		cp_default;
struct mach_cpupart	cp_default_mach;
static cpupartid_t	cp_id_next;
uint_t			cp_numparts;
uint_t			cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t	cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
	cpupart_t *cp;
	cpupartid_t cpid = PSTOCP(psid);

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* default partition not visible as a processor set */
	if (psid == CP_DEFAULT)
		return (NULL);

	if (psid == PS_MYID)
		return (curthread->t_cpupart);

	cp = cp_list_head;
	do {
		if (cp->cp_id == cpid)
			return (cp);
		cp = cp->cp_next;
	} while (cp != cp_list_head);
	return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cp = cpupart_find_all(psid);
	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
		return (NULL);
	return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
	cpupart_kstat_t *cpksp = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
	    (16 - FSHIFT);
	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
	    (16 - FSHIFT);
	return (0);
}
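/*
 * Illustrative note (not used by the code): the "pset" kstats created by the
 * routine below can be inspected from userland with kstat(1M), e.g.
 *
 *	kstat -m unix -n pset
 *
 * The avenrun_* statistics are exported with FSHIFT binary digits of
 * fraction, matching the system-wide load averages; the update routine
 * above shifts cp_hp_avenrun (which carries 16 fractional bits) down
 * accordingly.
 */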
static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	cp->cp_kstat = ksp;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	/*
	 * Hook up the statically allocated machine-dependent state before
	 * it is dereferenced below; cp_default lives in .bss, so cp_mach
	 * would otherwise still be NULL here.
	 */
	cp_default.cp_mach = &cp_default_mach;
	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
	    cp_default.cp_nlgrploads, KM_SLEEP);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap.  It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to
	 *	 point to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);

	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
	}
	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
}
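/*
 * Illustrative example only (a typical-usage assumption, not required by the
 * code): since cp_max_numparts is derived from max_ncpus above only when it
 * is still zero, it may be preset from /etc/system, e.g.
 *
 *	set cp_max_numparts = 0x40
 */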
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (EBUSY);
	}

	if (forced && (ret = cpu_unbind(cp->cpu_id)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Update the set of chips being spanned
	 */
	chip_cpu_move_part(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or whose last CPU is the one being moved out of
		 * the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			if (lgrp_diff_lpl == 0)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}

/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(cp != NULL);
	ASSERT(THREAD_LOCK_HELD(tp));

	/*
	 * CPU-bound threads can't be moved.
	 */
	if (!ignore) {
		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
		    tp->t_weakbound_cpu;
		if (boundcpu != NULL && boundcpu->cpu_part != cp)
			return (EBUSY);
	}
	return (0);
}

/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp, &tp->t_cpupart->
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}
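/*
 * Illustrative sketch only (mirrors what cpupart_destroy() below does, with
 * hypothetical locals): a caller rebinding thread "t" of process "p" to
 * processor set "psid" would typically do
 *
 *	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
 *	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
 *	mutex_enter(&pidlock);
 *	mutex_enter(&p->p_lock);
 *	error = cpupart_bind_thread(t, psid, 0, projbuf, zonebuf);
 *	mutex_exit(&p->p_lock);
 *	mutex_exit(&pidlock);
 *	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 *	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 *
 * with pool_lock() and cpu_lock already held, as asserted below.
 */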
/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
	cpupart_t	*newpp;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

	if (psid == PS_NONE)
		newpp = &cp_default;
	else {
		newpp = cpupart_find(psid);
		if (newpp == NULL) {
			return (EINVAL);
		}
	}
	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}


/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t	*pp;
	lgrp_id_t	i;

	ASSERT(pool_lock_held());

	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
	pp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
	    KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
		pp->cp_lgrploads = NULL;
		kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	CPUSET_ZERO(pp->cp_mach->mc_haltset);
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	for (i = 0; i < pp->cp_nlgrploads; i++) {
		pp->cp_lgrploads[i].lpl_lgrpid = i;
	}
	CHIP_SET_ZERO(pp->cp_mach->mc_chipset);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}
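/*
 * Illustrative sketch only (hypothetical caller, e.g. the pset_create(2)
 * path): creating a set and moving a CPU "cp" into it might look like
 *
 *	pool_lock();
 *	error = cpupart_create(&psid);
 *	...
 *	mutex_enter(&cpu_lock);
 *	error = cpupart_attach_cpu(psid, cp, 0);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 *
 * cpupart_create() acquires cpu_lock itself, so only pool_lock() is held
 * across it; cpupart_attach_cpu() expects both locks to be held.
 */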
/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t	*cp, *first_cp;
	cpupart_t *pp, *newpp;
	int	err = 0;
	void	*projbuf, *zonebuf;
	kthread_t *t;
	proc_t	*p;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	/*
	 * First need to unbind all the threads currently bound to the
	 * partition.  Then do the actual destroy (which moves the CPUs).
	 */
	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}
			err = cpupart_bind_thread(t, PS_NONE, 1,
			    projbuf, zonebuf);
			if (err) {
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				mutex_exit(&cpu_lock);
				fss_freebuf(projbuf, FSS_ALLOC_PROJ);
				fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
				return (err);
			}
			t->t_bind_pset = PS_NONE;
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);

	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if (err = cpupart_move_cpu(cp, newpp, 0)) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(CHIP_SET_ISNULL(pp->cp_mach->mc_chipset));
	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);
	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
	pp->cp_lgrploads = NULL;
	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));

	return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
	cpupart_t	*pp;
	int		err;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	pp = cpupart_find(psid);
	if (pp == NULL)
		return (EINVAL);
	if (cp->cpu_flags & CPU_OFFLINE)
		return (EINVAL);

	err = cpupart_move_cpu(cp, pp, forced);
	return (err);
}
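/*
 * Illustrative sketch only (hypothetical in-kernel caller): the enumeration
 * interface below is typically used in two steps, first sizing and then
 * filling the list:
 *
 *	uint_t ncpus = 0;
 *	processorid_t *ids;
 *
 *	(void) cpupart_get_cpus(&psid, NULL, &ncpus);
 *	ids = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *	(void) cpupart_get_cpus(&psid, ids, &ncpus);
 *
 * Note that the set can change between the two calls, so the second call
 * may report a different count than was allocated for.
 */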
/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
	cpupart_t	*pp;
	uint_t		ncpus;
	cpu_t		*c;
	int		i;

	mutex_enter(&cpu_lock);
	pp = cpupart_find(*psid);
	if (pp == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*psid = CPTOPS(pp->cp_id);
	ncpus = pp->cp_ncpus;
	if (numcpus) {
		if (ncpus > *numcpus) {
			/*
			 * Only copy as many cpus as were passed in, but
			 * pass back the real number.
			 */
			uint_t t = ncpus;
			ncpus = *numcpus;
			*numcpus = t;
		} else
			*numcpus = ncpus;

		if (cpulist) {
			c = pp->cp_cpulist;
			for (i = 0; i < ncpus; i++) {
				ASSERT(c != NULL);
				cpulist[i] = c->cpu_id;
				c = c->cpu_next_part;
			}
		}
	}
	mutex_exit(&cpu_lock);
	return (0);
}

/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
	cpupart_t *cpp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	cpp = cp_list_head;
	do {
		disp_kp_alloc(&cpp->cp_kp_queue, npri);
		cpp = cpp->cp_next;
	} while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
	cpupart_t *cp;
	int i;

	ASSERT(nelem >= 0);
	ASSERT(nelem <= LOADAVG_NSTATS);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cp = cpupart_find(psid);
	if (cp == NULL)
		return (EINVAL);
	for (i = 0; i < nelem; i++)
		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

	return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
	uint_t	numpart = 0;
	cpupart_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

	if (list != NULL) {
		cp = cp_list_head;
		do {
			if (((flag == CP_ALL) && (cp != &cp_default)) ||
			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
				if (numpart == nelem)
					break;
				list[numpart++] = CPTOPS(cp->cp_id);
			}
			cp = cp->cp_next;
		} while (cp != cp_list_head);
	}

	ASSERT(numpart < cp_numparts);

	if (flag == CP_ALL)
		numpart = cp_numparts - 1; /* leave out default partition */
	else if (flag == CP_NONEMPTY)
		numpart = cp_numparts_nonempty;

	return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
	cpupart_t *cp;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	/*
	 * PSET_NOESCAPE attribute for default cpu partition is always set
	 */
	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	cp->cp_attr = attr;
	mutex_exit(&cpu_lock);
	return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
	cpupart_t *cp;

	mutex_enter(&cpu_lock);
	if ((cp = cpupart_find(psid)) == NULL) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}
	*attrp = cp->cp_attr;
	mutex_exit(&cpu_lock);
	return (0);
}