/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cpupart.h>
#include <sys/pset.h>
#include <sys/var.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/chip.h>
#include <sys/loadavg.h>
#include <sys/class.h>
#include <sys/fss.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/policy.h>

/*
 * Calling pool_lock() protects the pools configuration, which includes
 * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
 * partitions from being created or destroyed while the lock is held.
 * The lock ordering with respect to related locks is:
 *
 *	pool_lock() ---> cpu_lock ---> pidlock ---> p_lock
 *
 * Blocking memory allocations may be made while holding "pool_lock"
 * or cpu_lock.
 */

/*
 * The cp_default partition is allocated statically, but its lgroup load
 * average (lpl) list is allocated dynamically after the kmem subsystem is
 * initialized.  This saves some memory since the space allocated reflects
 * the actual number of lgroups supported by the platform.  The lgrp
 * facility provides a temporary space to hold lpl information during
 * system bootstrap.
 */

cpupart_t *cp_list_head;
cpupart_t cp_default;
static cpupartid_t cp_id_next;
uint_t cp_numparts;
uint_t cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 */
static uint_t cp_max_numparts;

/*
 * Processor sets and CPU partitions are different but related concepts.
 * A processor set is a user-level abstraction allowing users to create
 * sets of CPUs and bind threads exclusively to those sets.  A CPU
 * partition is a kernel dispatcher object consisting of a set of CPUs
 * and a global dispatch queue.  The processor set abstraction is
 * implemented via a CPU partition, and currently there is a 1-1
 * mapping between processor sets and partitions (excluding the default
 * partition, which is not visible as a processor set).  Hence, the
 * numbering for processor sets and CPU partitions is identical.  This
 * may not always be true in the future, and these macros could become
 * less trivial if we support e.g. a processor set containing multiple
 * CPU partitions.
 */
#define PSTOCP(psid) ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define CPTOPS(cpid) ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
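
/*
 * For example, with the current 1-1 numbering the conversions above are
 * symmetric: PSTOCP(PS_NONE) yields CP_DEFAULT, CPTOPS(CP_DEFAULT) yields
 * PS_NONE, and any other ID maps to itself (PSTOCP(1) == 1, CPTOPS(1) == 1).
 */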

/*
 * Find a CPU partition given a processor set ID.
 */
static cpupart_t *
cpupart_find_all(psetid_t psid)
{
        cpupart_t *cp;
        cpupartid_t cpid = PSTOCP(psid);

        ASSERT(MUTEX_HELD(&cpu_lock));

        /* default partition not visible as a processor set */
        if (psid == CP_DEFAULT)
                return (NULL);

        if (psid == PS_MYID)
                return (curthread->t_cpupart);

        cp = cp_list_head;
        do {
                if (cp->cp_id == cpid)
                        return (cp);
                cp = cp->cp_next;
        } while (cp != cp_list_head);
        return (NULL);
}

/*
 * Find a CPU partition given a processor set ID if the processor set
 * should be visible from the calling zone.
 */
cpupart_t *
cpupart_find(psetid_t psid)
{
        cpupart_t *cp;

        ASSERT(MUTEX_HELD(&cpu_lock));
        cp = cpupart_find_all(psid);
        if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
            zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
                return (NULL);
        return (cp);
}

static int
cpupart_kstat_update(kstat_t *ksp, int rw)
{
        cpupart_t *cp = (cpupart_t *)ksp->ks_private;
        cpupart_kstat_t *cpksp = ksp->ks_data;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        cpksp->cpk_updates.value.ui64 = cp->cp_updates;
        cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
        cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
        cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
        cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
            (16 - FSHIFT);
        cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
            (16 - FSHIFT);
        cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
            (16 - FSHIFT);
        return (0);
}

static void
cpupart_kstat_create(cpupart_t *cp)
{
        kstat_t *ksp;
        zoneid_t zoneid;

        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * We have a bit of a chicken-egg problem since this code will
         * get called to create the kstats for CP_DEFAULT before the
         * pools framework gets initialized.  We circumvent the problem
         * by special-casing cp_default.
         */
        if (cp != &cp_default && pool_pset_enabled())
                zoneid = GLOBAL_ZONEID;
        else
                zoneid = ALL_ZONES;
        ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
            KSTAT_TYPE_NAMED,
            sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
        if (ksp != NULL) {
                cpupart_kstat_t *cpksp = ksp->ks_data;

                kstat_named_init(&cpksp->cpk_updates, "updates",
                    KSTAT_DATA_UINT64);
                kstat_named_init(&cpksp->cpk_runnable, "runnable",
                    KSTAT_DATA_UINT64);
                kstat_named_init(&cpksp->cpk_waiting, "waiting",
                    KSTAT_DATA_UINT64);
                kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
                    KSTAT_DATA_UINT32);
                kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
                    KSTAT_DATA_UINT32);
                kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
                    KSTAT_DATA_UINT32);
                kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
                    KSTAT_DATA_UINT32);

                ksp->ks_update = cpupart_kstat_update;
                ksp->ks_private = cp;

                kstat_install(ksp);
        }
        cp->cp_kstat = ksp;
}

/*
 * Initialize the default partition and kpreempt disp queue.
 */
void
cpupart_initialize_default(void)
{
        lgrp_id_t i;

        cp_list_head = &cp_default;
        cp_default.cp_next = &cp_default;
        cp_default.cp_prev = &cp_default;
        cp_default.cp_id = CP_DEFAULT;
        cp_default.cp_kp_queue.disp_maxrunpri = -1;
        cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
        cp_default.cp_kp_queue.disp_cpu = NULL;
        cp_default.cp_gen = 0;
        cp_default.cp_loadavg.lg_cur = 0;
        cp_default.cp_loadavg.lg_len = 0;
        cp_default.cp_loadavg.lg_total = 0;
        for (i = 0; i < S_LOADAVG_SZ; i++) {
                cp_default.cp_loadavg.lg_loads[i] = 0;
        }
        CPUSET_ZERO(cp_default.cp_haltset);
        DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
        cp_id_next = CP_DEFAULT + 1;
        cpupart_kstat_create(&cp_default);
        cp_numparts = 1;
        if (cp_max_numparts == 0)       /* allow for /etc/system tuning */
                cp_max_numparts = max_ncpus * 2 + 1;
        /*
         * Allocate space for cp_default list of lgrploads
         */
        cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
        cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
            cp_default.cp_nlgrploads, KM_SLEEP);

        /*
         * The initial lpl topology is created in a special lpl list
         * lpl_bootstrap.  It should be copied to cp_default.
         * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to
         *       point to the correct lpl in the cp_default.cp_lgrploads list.
         */
        lpl_topo_bootstrap(cp_default.cp_lgrploads,
            cp_default.cp_nlgrploads);

        for (i = 0; i < cp_default.cp_nlgrploads; i++) {
                cp_default.cp_lgrploads[i].lpl_lgrpid = i;
        }
        cp_default.cp_attr = PSET_NOESCAPE;
        cp_numparts_nonempty = 1;
        /*
         * Set t0's home
         */
        t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
}


static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
        cpupart_t *oldpp;
        cpu_t *ncp, *newlist;
        kthread_t *t;
        int move_threads = 1;
        lgrp_id_t lgrpid;
        proc_t *p;
        int lgrp_diff_lpl;
        lpl_t *cpu_lpl;
        int ret;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(newpp != NULL);

        oldpp = cp->cpu_part;
        ASSERT(oldpp != NULL);
        ASSERT(oldpp->cp_ncpus > 0);

        if (newpp == oldpp) {
                /*
                 * Don't need to do anything.
                 */
                return (0);
        }

        cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

        if (!disp_bound_partition(cp, 0)) {
                /*
                 * Don't need to move threads if there are no threads in
                 * the partition.  Note that threads can't enter the
                 * partition while we're holding cpu_lock.
                 */
                move_threads = 0;
        } else if (oldpp->cp_ncpus == 1) {
                cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
                return (EBUSY);
        }

        if (forced && (ret = cpu_unbind(cp->cpu_id)) != 0) {
                cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
                return (ret);
        }

        /*
         * Stop further threads weak binding to this cpu.
         */
        cpu_inmotion = cp;
        membar_enter();

again:
        if (move_threads) {
                int loop_count;
                /*
                 * Check for threads strong or weak bound to this CPU.
                 */
                for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
                        if (loop_count >= 5) {
                                cpu_state_change_notify(cp->cpu_id,
                                    CPU_CPUPART_IN);
                                cpu_inmotion = NULL;
                                return (EBUSY); /* some threads still bound */
                        }
                        delay(1);
                }
        }

        /*
         * Before we actually start changing data structures, notify
         * the cyclic subsystem that we want to move this CPU out of its
         * partition.
         */
        if (!cyclic_move_out(cp)) {
                /*
                 * This CPU must be the last CPU in a processor set with
                 * a bound cyclic.
                 */
                cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
                cpu_inmotion = NULL;
                return (EBUSY);
        }

        pause_cpus(cp);

        if (move_threads) {
                /*
                 * The thread on cpu before the pause thread may have read
                 * cpu_inmotion before we raised the barrier above.  Check
                 * again.
                 */
                if (disp_bound_threads(cp, 1)) {
                        start_cpus();
                        goto again;
                }

        }

        /*
         * Update the set of chips being spanned
         */
        chip_cpu_move_part(cp, oldpp, newpp);

        /* save this cpu's lgroup -- it'll be the same in the new partition */
        lgrpid = cp->cpu_lpl->lpl_lgrpid;

        cpu_lpl = cp->cpu_lpl;
        /*
         * let the lgroup framework know cp has left the partition
         */
        lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

        /* move out of old partition */
        oldpp->cp_ncpus--;
        if (oldpp->cp_ncpus > 0) {

                ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
                cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
                if (oldpp->cp_cpulist == cp) {
                        oldpp->cp_cpulist = ncp;
                }
        } else {
                ncp = oldpp->cp_cpulist = NULL;
                cp_numparts_nonempty--;
                ASSERT(cp_numparts_nonempty != 0);
        }
        oldpp->cp_gen++;

        /* move into new partition */
        newlist = newpp->cp_cpulist;
        if (newlist == NULL) {
                newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
                cp_numparts_nonempty++;
                ASSERT(cp_numparts_nonempty != 0);
        } else {
                cp->cpu_next_part = newlist;
                cp->cpu_prev_part = newlist->cpu_prev_part;
                newlist->cpu_prev_part->cpu_next_part = cp;
                newlist->cpu_prev_part = cp;
        }
        cp->cpu_part = newpp;
        newpp->cp_ncpus++;
        newpp->cp_gen++;

        ASSERT(CPUSET_ISNULL(newpp->cp_haltset));
        ASSERT(CPUSET_ISNULL(oldpp->cp_haltset));

        /*
         * let the lgroup framework know cp has entered the partition
         */
        lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

        /*
         * If necessary, move threads off processor.
         */
        if (move_threads) {
                ASSERT(ncp != NULL);

                /*
                 * Walk through the active process list looking for threads
                 * that need a new home lgroup, or whose last CPU is the one
                 * being moved out of the partition.
                 */

                for (p = practive; p != NULL; p = p->p_next) {

                        t = p->p_tlist;

                        if (t == NULL)
                                continue;

                        lgrp_diff_lpl = 0;

                        do {

                                ASSERT(t->t_lpl != NULL);

                                /*
                                 * Update the count of how many threads are
                                 * in this CPU's lgroup but have a different
                                 * lpl.
                                 */

                                if (t->t_lpl != cpu_lpl &&
                                    t->t_lpl->lpl_lgrpid == lgrpid)
                                        lgrp_diff_lpl++;
                                /*
                                 * If the lgroup that t is assigned to no
                                 * longer has any CPUs in t's partition,
                                 * we'll have to choose a new lgroup for t.
                                 */

                                if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
                                    t->t_cpupart)) {
                                        lgrp_move_thread(t,
                                            lgrp_choose(t, t->t_cpupart), 0);
                                }

                                /*
                                 * make sure lpl points to our own partition
                                 */
                                ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
                                    (t->t_lpl < t->t_cpupart->cp_lgrploads +
                                    t->t_cpupart->cp_nlgrploads));

                                ASSERT(t->t_lpl->lpl_ncpu > 0);

                                /* Update CPU last ran on if it was this CPU */
                                if (t->t_cpu == cp && t->t_cpupart == oldpp &&
                                    t->t_bound_cpu != cp) {
                                        t->t_cpu = disp_lowpri_cpu(ncp,
                                            t->t_lpl, t->t_pri, NULL);
                                }
                                t = t->t_forw;
                        } while (t != p->p_tlist);

                        /*
                         * Didn't find any threads in the same lgroup as this
                         * CPU with a different lpl, so remove the lgroup from
                         * the process lgroup bitmask.
                         */

                        if (lgrp_diff_lpl == 0)
                                klgrpset_del(p->p_lgrpset, lgrpid);
                }

                /*
                 * Walk thread list looking for threads that need to be
                 * rehomed, since there are some threads that are not in
                 * their process's p_tlist.
                 */

                t = curthread;

                do {
                        ASSERT(t != NULL && t->t_lpl != NULL);

                        /*
                         * If the lgroup that t is assigned to no
                         * longer has any CPUs in t's partition,
                         * we'll have to choose a new lgroup for t.
                         * Also, choose best lgroup for home when
                         * thread has specified lgroup affinities,
                         * since there may be an lgroup with more
                         * affinity available after moving CPUs
                         * around.
                         */
                        if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
                            t->t_cpupart) || t->t_lgrp_affinity) {
                                lgrp_move_thread(t,
                                    lgrp_choose(t, t->t_cpupart), 1);
                        }

                        /* make sure lpl points to our own partition */
                        ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
                            (t->t_lpl < t->t_cpupart->cp_lgrploads +
                            t->t_cpupart->cp_nlgrploads));

                        ASSERT(t->t_lpl->lpl_ncpu > 0);

                        /* Update CPU last ran on if it was this CPU */
                        if (t->t_cpu == cp && t->t_cpupart == oldpp &&
                            t->t_bound_cpu != cp) {
                                t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
                                    t->t_pri, NULL);
                        }

                        t = t->t_next;
                } while (t != curthread);

                /*
                 * Clear off the CPU's run queue, and the kp queue if the
                 * partition is now empty.
                 */
                disp_cpu_inactive(cp);

                /*
                 * Make cp switch to a thread from the new partition.
                 */
                cp->cpu_runrun = 1;
                cp->cpu_kprunrun = 1;
        }

        cpu_inmotion = NULL;
        start_cpus();

        /*
         * Let anyone interested know that cpu has been added to the set.
         */
        cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

        /*
         * Now let the cyclic subsystem know that it can reshuffle cyclics
         * bound to the new processor set.
         */
        cyclic_move_in(cp);

        return (0);
}

/*
 * Check if thread can be moved to a new cpu partition.  Called by
 * cpupart_move_thread() and pset_bind_start().
 */
int
cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
{
        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
        ASSERT(cp != NULL);
        ASSERT(THREAD_LOCK_HELD(tp));

        /*
         * CPU-bound threads can't be moved.
         */
        if (!ignore) {
                cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
                    tp->t_weakbound_cpu;
                if (boundcpu != NULL && boundcpu->cpu_part != cp)
                        return (EBUSY);
        }
        return (0);
}

/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
        cpupart_t *oldpp = tp->t_cpupart;
        int ret;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(MUTEX_HELD(&pidlock));
        ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
        ASSERT(newpp != NULL);

        if (newpp->cp_cpulist == NULL)
                return (EINVAL);

        /*
         * Check for errors first.
         */
        thread_lock(tp);
        if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
                thread_unlock(tp);
                return (ret);
        }

        /* move the thread */
        if (oldpp != newpp) {
                /*
                 * Make the thread switch to the new partition.
                 */
                tp->t_cpupart = newpp;
                ASSERT(tp->t_lpl != NULL);
                /*
                 * Leave the thread on the same lgroup if possible; otherwise
                 * choose a new lgroup for it.  In either case, update its
                 * t_lpl.
                 */
                if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
                    tp->t_lgrp_affinity == NULL) {
                        /*
                         * The thread's lgroup has CPUs in the thread's new
                         * partition, so the thread can stay assigned to the
                         * same lgroup.  Update its t_lpl to point to the
                         * lpl_t for its lgroup in its new partition.
                         */
                        lgrp_move_thread(tp, &tp->t_cpupart->
                            cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
                } else {
                        /*
                         * The thread's lgroup has no cpus in its new
                         * partition or it has specified lgroup affinities,
                         * so choose the best lgroup for the thread and
                         * assign it to that lgroup.
                         */
                        lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
                            1);
                }
                /*
                 * make sure lpl points to our own partition
                 */
                ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
                    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
                    tp->t_cpupart->cp_nlgrploads));

                ASSERT(tp->t_lpl->lpl_ncpu > 0);

                if (tp->t_state == TS_ONPROC) {
                        cpu_surrender(tp);
                } else if (tp->t_state == TS_RUN) {
                        (void) dispdeq(tp);
                        setbackdq(tp);
                }
        }

        /*
         * Our binding has changed; set TP_CHANGEBIND.
         */
        tp->t_proc_flag |= TP_CHANGEBIND;
        aston(tp);

        thread_unlock(tp);
        fss_changepset(tp, newpp, projbuf, zonebuf);

        return (0);             /* success */
}


/*
 * This function binds a thread to a partition.  Must be called with the
 * p_lock of the containing process held (to keep the thread from going
 * away), and thus also with cpu_lock held (since cpu_lock must be
 * acquired before p_lock).  If ignore is non-zero, then CPU bindings
 * should be ignored (this is used when destroying a partition).
 */
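/*
 * Illustrative caller sketch (assumed usage, mirroring cpupart_destroy()
 * below and the lock ordering documented at the top of this file), with
 * projbuf/zonebuf preallocated via fss_allocbuf():
 *
 *	pool_lock();
 *	mutex_enter(&cpu_lock);
 *	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
 *	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
 *	mutex_enter(&pidlock);
 *	mutex_enter(&ttoproc(tp)->p_lock);
 *	error = cpupart_bind_thread(tp, psid, 0, projbuf, zonebuf);
 *	mutex_exit(&ttoproc(tp)->p_lock);
 *	mutex_exit(&pidlock);
 *	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
 *	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 */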
int
cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
    void *zonebuf)
{
        cpupart_t *newpp;

        ASSERT(pool_lock_held());
        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(MUTEX_HELD(&pidlock));
        ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));

        if (psid == PS_NONE)
                newpp = &cp_default;
        else {
                newpp = cpupart_find(psid);
                if (newpp == NULL) {
                        return (EINVAL);
                }
        }
        return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
}


/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 */
int
cpupart_create(psetid_t *psid)
{
        cpupart_t *pp;
        lgrp_id_t i;

        ASSERT(pool_lock_held());

        pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
        pp->cp_nlgrploads = lgrp_plat_max_lgrps();
        pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
            KM_SLEEP);

        mutex_enter(&cpu_lock);
        if (cp_numparts == cp_max_numparts) {
                mutex_exit(&cpu_lock);
                kmem_free(pp->cp_lgrploads,
                    sizeof (lpl_t) * pp->cp_nlgrploads);
                pp->cp_lgrploads = NULL;
                kmem_free(pp, sizeof (cpupart_t));
                return (ENOMEM);
        }
        cp_numparts++;
        /* find the next free partition ID */
        while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
                cp_id_next++;
        pp->cp_id = cp_id_next++;
        pp->cp_ncpus = 0;
        pp->cp_cpulist = NULL;
        pp->cp_attr = 0;
        klgrpset_clear(pp->cp_lgrpset);
        pp->cp_kp_queue.disp_maxrunpri = -1;
        pp->cp_kp_queue.disp_max_unbound_pri = -1;
        pp->cp_kp_queue.disp_cpu = NULL;
        pp->cp_gen = 0;
        CPUSET_ZERO(pp->cp_haltset);
        DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
        *psid = CPTOPS(pp->cp_id);
        disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
        cpupart_kstat_create(pp);
        for (i = 0; i < pp->cp_nlgrploads; i++) {
                pp->cp_lgrploads[i].lpl_lgrpid = i;
        }
        CHIP_SET_ZERO(pp->cp_chipset);

        /*
         * Pause all CPUs while changing the partition list, to make sure
         * the clock thread (which traverses the list without holding
         * cpu_lock) isn't running.
         */
        pause_cpus(NULL);
        pp->cp_next = cp_list_head;
        pp->cp_prev = cp_list_head->cp_prev;
        cp_list_head->cp_prev->cp_next = pp;
        cp_list_head->cp_prev = pp;
        start_cpus();
        mutex_exit(&cpu_lock);

        return (0);
}


/*
 * Destroy a partition.
 */
int
cpupart_destroy(psetid_t psid)
{
        cpu_t *cp, *first_cp;
        cpupart_t *pp, *newpp;
        int err = 0;
        void *projbuf, *zonebuf;
        kthread_t *t;
        proc_t *p;

        ASSERT(pool_lock_held());
        mutex_enter(&cpu_lock);

        pp = cpupart_find(psid);
        if (pp == NULL || pp == &cp_default) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }

        /*
         * Pre-allocate enough buffers for FSS for all active projects and
         * for all active zones on the system.  Unused buffers will be
         * freed later by fss_freebuf().
         */
        projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
        zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

        /*
         * First need to unbind all the threads currently bound to the
         * partition.  Then do the actual destroy (which moves the CPUs).
         */
        mutex_enter(&pidlock);
        t = curthread;
        do {
                if (t->t_bind_pset == psid) {
again:                  p = ttoproc(t);
                        mutex_enter(&p->p_lock);
                        if (ttoproc(t) != p) {
                                /*
                                 * lwp_exit has changed this thread's process
                                 * pointer before we grabbed its p_lock.
                                 */
                                mutex_exit(&p->p_lock);
                                goto again;
                        }
                        err = cpupart_bind_thread(t, PS_NONE, 1,
                            projbuf, zonebuf);
                        if (err) {
                                mutex_exit(&p->p_lock);
                                mutex_exit(&pidlock);
                                mutex_exit(&cpu_lock);
                                fss_freebuf(projbuf, FSS_ALLOC_PROJ);
                                fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
                                return (err);
                        }
                        t->t_bind_pset = PS_NONE;
                        mutex_exit(&p->p_lock);
                }
                t = t->t_next;
        } while (t != curthread);

        mutex_exit(&pidlock);
        fss_freebuf(projbuf, FSS_ALLOC_PROJ);
        fss_freebuf(zonebuf, FSS_ALLOC_ZONE);

        newpp = &cp_default;
        while ((cp = pp->cp_cpulist) != NULL) {
                if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
                        mutex_exit(&cpu_lock);
                        return (err);
                }
        }

        ASSERT(CHIP_SET_ISNULL(pp->cp_chipset));
        ASSERT(CPUSET_ISNULL(pp->cp_haltset));

        /*
         * Reset the pointers in any offline processors so they won't
         * try to rejoin the destroyed partition when they're turned
         * online.
         */
        first_cp = cp = CPU;
        do {
                if (cp->cpu_part == pp) {
                        ASSERT(cp->cpu_flags & CPU_OFFLINE);
                        cp->cpu_part = newpp;
                }
                cp = cp->cpu_next;
        } while (cp != first_cp);

        /*
         * Pause all CPUs while changing the partition list, to make sure
         * the clock thread (which traverses the list without holding
         * cpu_lock) isn't running.
         */
        pause_cpus(NULL);
        pp->cp_prev->cp_next = pp->cp_next;
        pp->cp_next->cp_prev = pp->cp_prev;
        if (cp_list_head == pp)
                cp_list_head = pp->cp_next;
        start_cpus();

        if (cp_id_next > pp->cp_id)
                cp_id_next = pp->cp_id;

        if (pp->cp_kstat)
                kstat_delete(pp->cp_kstat);

        cp_numparts--;

        disp_kp_free(&pp->cp_kp_queue);
        kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
        pp->cp_lgrploads = NULL;
        kmem_free(pp, sizeof (cpupart_t));
        mutex_exit(&cpu_lock);

        return (err);
}


/*
 * Return the ID of the partition to which the specified processor belongs.
 */
psetid_t
cpupart_query_cpu(cpu_t *cp)
{
        ASSERT(MUTEX_HELD(&cpu_lock));

        return (CPTOPS(cp->cpu_part->cp_id));
}


/*
 * Attach a processor to an existing partition.
 */
int
cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
{
        cpupart_t *pp;
        int err;

        ASSERT(pool_lock_held());
        ASSERT(MUTEX_HELD(&cpu_lock));

        pp = cpupart_find(psid);
        if (pp == NULL)
                return (EINVAL);
        if (cp->cpu_flags & CPU_OFFLINE)
                return (EINVAL);

        err = cpupart_move_cpu(cp, pp, forced);
        return (err);
}

/*
 * Get a list of cpus belonging to the partition.  If numcpus is NULL,
 * this just checks for a valid partition.  If numcpus is non-NULL but
 * cpulist is NULL, the current number of cpus is stored in *numcpus.
 * If both are non-NULL, the current number of cpus is stored in *numcpus,
 * and a list of those cpus up to the size originally in *numcpus is
 * stored in cpulist[].  Also, store the processor set id in *psid.
 * This is useful in case the processor set id passed in was PS_MYID.
 */
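/*
 * Illustrative caller sketch (assumed usage, not a requirement of this
 * interface): a caller that does not know the set size in advance can
 * query the count first and then fetch the list:
 *
 *	uint_t ncpus = 0;
 *	psetid_t psid = PS_MYID;
 *	processorid_t *ids;
 *
 *	if (cpupart_get_cpus(&psid, NULL, &ncpus) != 0)
 *		return;
 *	ids = kmem_alloc(ncpus * sizeof (processorid_t), KM_SLEEP);
 *	(void) cpupart_get_cpus(&psid, ids, &ncpus);
 *
 * If CPUs are added between the two calls, *numcpus comes back larger
 * than the buffer; only as many entries as were asked for are filled in.
 */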
int
cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
{
        cpupart_t *pp;
        uint_t ncpus;
        cpu_t *c;
        int i;

        mutex_enter(&cpu_lock);
        pp = cpupart_find(*psid);
        if (pp == NULL) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }
        *psid = CPTOPS(pp->cp_id);
        ncpus = pp->cp_ncpus;
        if (numcpus) {
                if (ncpus > *numcpus) {
                        /*
                         * Only copy as many cpus as were passed in, but
                         * pass back the real number.
                         */
                        uint_t t = ncpus;
                        ncpus = *numcpus;
                        *numcpus = t;
                } else
                        *numcpus = ncpus;

                if (cpulist) {
                        c = pp->cp_cpulist;
                        for (i = 0; i < ncpus; i++) {
                                ASSERT(c != NULL);
                                cpulist[i] = c->cpu_id;
                                c = c->cpu_next_part;
                        }
                }
        }
        mutex_exit(&cpu_lock);
        return (0);
}

/*
 * Reallocate kpreempt queues for each CPU partition.  Called from
 * disp_setup when a new scheduling class is loaded that increases the
 * number of priorities in the system.
 */
void
cpupart_kpqalloc(pri_t npri)
{
        cpupart_t *cpp;

        ASSERT(MUTEX_HELD(&cpu_lock));
        cpp = cp_list_head;
        do {
                disp_kp_alloc(&cpp->cp_kp_queue, npri);
                cpp = cpp->cp_next;
        } while (cpp != cp_list_head);
}

int
cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
{
        cpupart_t *cp;
        int i;

        ASSERT(nelem >= 0);
        ASSERT(nelem <= LOADAVG_NSTATS);
        ASSERT(MUTEX_HELD(&cpu_lock));

        cp = cpupart_find(psid);
        if (cp == NULL)
                return (EINVAL);
        for (i = 0; i < nelem; i++)
                buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);

        return (0);
}


uint_t
cpupart_list(psetid_t *list, uint_t nelem, int flag)
{
        uint_t numpart = 0;
        cpupart_t *cp;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);

        if (list != NULL) {
                cp = cp_list_head;
                do {
                        if (((flag == CP_ALL) && (cp != &cp_default)) ||
                            ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
                                if (numpart == nelem)
                                        break;
                                list[numpart++] = CPTOPS(cp->cp_id);
                        }
                        cp = cp->cp_next;
                } while (cp != cp_list_head);
        }

        ASSERT(numpart < cp_numparts);

        if (flag == CP_ALL)
                numpart = cp_numparts - 1; /* leave out default partition */
        else if (flag == CP_NONEMPTY)
                numpart = cp_numparts_nonempty;

        return (numpart);
}

int
cpupart_setattr(psetid_t psid, uint_t attr)
{
        cpupart_t *cp;

        ASSERT(pool_lock_held());

        mutex_enter(&cpu_lock);
        if ((cp = cpupart_find(psid)) == NULL) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }
        /*
         * The PSET_NOESCAPE attribute for the default cpu partition is
         * always set.
         */
        if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }
        cp->cp_attr = attr;
        mutex_exit(&cpu_lock);
        return (0);
}

int
cpupart_getattr(psetid_t psid, uint_t *attrp)
{
        cpupart_t *cp;

        mutex_enter(&cpu_lock);
        if ((cp = cpupart_find(psid)) == NULL) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }
        *attrp = cp->cp_attr;
        mutex_exit(&cpu_lock);
        return (0);
}