1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/pool.h> 30 #include <sys/pool_impl.h> 31 #include <sys/pool_pset.h> 32 #include <sys/id_space.h> 33 #include <sys/mutex.h> 34 #include <sys/nvpair.h> 35 #include <sys/cpuvar.h> 36 #include <sys/errno.h> 37 #include <sys/cmn_err.h> 38 #include <sys/systm.h> 39 #include <sys/proc.h> 40 #include <sys/fss.h> 41 #include <sys/class.h> 42 #include <sys/exacct.h> 43 #include <sys/utsname.h> 44 #include <sys/procset.h> 45 #include <sys/atomic.h> 46 #include <sys/zone.h> 47 #include <sys/policy.h> 48 #include <sys/schedctl.h> 49 50 /* 51 * RESOURCE POOLS 52 * 53 * The resource pools facility brings together process-bindable resource into 54 * a common abstraction called a pool. Processor sets and other entities can 55 * be configured, grouped, and labelled such that workload components can be 56 * associated with a subset of a system's total resources. 57 * 58 * When disabled, the pools facility is "invisible". 
All processes belong
 * to the same pool (pool_default), and processor sets can be managed through
 * the old pset() system call.  When enabled, processor sets can only be
 * managed via the pools facility.  New pools can be created and associated
 * with processor sets.  Processes can be bound to pools which have non-empty
 * resource sets.
 *
 * Locking: pool_lock() protects global pools state and must be called
 * before modifying the configuration, or when taking a snapshot of the
 * configuration.  If pool_lock_intr() is used, the operation may be
 * interrupted by a signal or a request.
 *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit().  This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
 *
 * Lock ordering with respect to other locks is as follows:
 *
 * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
 *
 * Most static and global variables defined in this file are protected
 * by calling pool_lock().
 *
 * The operation that binds tasks and projects to pools is atomic.  That is,
 * either all processes in a given task or a project will be bound to a
 * new pool, or (in case of an error) they will be all left bound to the
 * old pool.  Processes in a given task or a given project can only be bound to
 * different pools if they were rebound individually one by one as single
 * processes.  Threads or LWPs of the same process do not have pool bindings,
 * and are bound to the same resource sets associated with the resource pool
 * of that process.
93 * 94 * The following picture shows one possible pool configuration with three 95 * pools and three processor sets. Note that processor set "foo" is not 96 * associated with any pools and therefore cannot have any processes 97 * bound to it. Two pools (default and foo) are associated with the 98 * same processor set (default). Also, note that processes in Task 2 99 * are bound to different pools. 100 * 101 * 102 * Processor Sets 103 * +---------+ 104 * +--------------+========================>| default | 105 * a| | +---------+ 106 * s| | || 107 * s| | +---------+ 108 * o| | | foo | 109 * c| | +---------+ 110 * i| | || 111 * a| | +---------+ 112 * t| | +------>| bar | 113 * e| | | +---------+ 114 * d| | | 115 * | | | 116 * +---------+ +---------+ +---------+ 117 * Pools | default |======| foo |======| bar | 118 * +---------+ +---------+ +---------+ 119 * @ @ @ @ @ @ 120 * b| | | | | | 121 * o| | | | | | 122 * u| +-----+ | +-------+ | +---+ 123 * n| | | | | | 124 * ....d|........|......|......|.........|.......|.... 125 * : | :: | | | :: | | : 126 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 127 * Processes : | p | :: | p | | p | | p | :: | p |...| p | : 128 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 129 * :........::......................::...............: 130 * Task 1 Task 2 Task N 131 * | | | 132 * | | | 133 * | +-----------+ | +-----------+ 134 * +--| Project 1 |--+ | Project N | 135 * +-----------+ +-----------+ 136 * 137 * This is just an illustration of relationships between processes, tasks, 138 * projects, pools, and processor sets. New types of resource sets will be 139 * added in the future. 
 */

/*
 * Global pools state.  Unless noted otherwise below, all of these are
 * protected by holding the pools "lock" (see pool_lock() below).
 */
pool_t		*pool_default;	/* default pool which always exists */
int		pool_count;	/* number of pools created on this system */
int		pool_state;	/* pools state -- enabled/disabled */
void		*pool_buf;	/* pre-commit snapshot of the pools state */
size_t		pool_bufsz;	/* size of pool_buf */
static hrtime_t	pool_pool_mod;	/* last modification time for pools */
static hrtime_t	pool_sys_mod;	/* last modification time for system */
static nvlist_t	*pool_sys_prop;	/* system properties */
static id_space_t *pool_ids;	/* pool ID space */
static list_t	pool_list;	/* doubly-linked list of pools */
static kmutex_t	pool_mutex;	/* protects pool_busy_* */
static kcondvar_t pool_busy_cv;	/* waiting for "pool_lock" */
static kthread_t *pool_busy_thread;	/* thread holding "pool_lock" */
static kmutex_t	pool_barrier_lock;	/* synch. with pool_barrier_* */
static kcondvar_t pool_barrier_cv;	/* synch. with pool_barrier_* */
static int	pool_barrier_count;	/* synch. with pool_barrier_* */

/*
 * Boot-time pool initialization.
 *
 * Creates the always-present default pool (id POOL_DEFAULT), inserts it at
 * the head of the pool list, initializes the pset plugin, and binds both
 * process 0 and the global zone to the default pool.  Runs single-threaded
 * at boot, so no locking is required here.
 */
void
pool_init(void)
{
	/* IDs for dynamically created pools start just above POOL_DEFAULT. */
	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);

	/*
	 * Initialize default pool.
	 */
	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
	pool_default->pool_id = POOL_DEFAULT;
	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
	list_insert_head(&pool_list, pool_default);

	/*
	 * Initialize plugins for resource sets.
	 */
	pool_pset_init();
	pool_count = 1;
	p0.p_pool = pool_default;	/* bind process 0 to the default pool */
	global_zone->zone_pool = pool_default;
	pool_default->pool_ref = 1;	/* reference held by p0's binding */
}

/*
 * Synchronization routines.
 *
 * pool_lock is only called from syscall-level routines (processor_bind(),
 * pset_*(), and /dev/pool ioctls).
The pool "lock" may be held for long
 * periods of time, including across sleeping operations, so we allow its
 * acquisition to be interruptible.
 *
 * The current thread that owns the "lock" is stored in the variable
 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
 */

/*
 * Acquire the pools "lock" uninterruptibly.  Blocks until no other thread
 * holds it.  The short-term pool_mutex only guards the pool_busy_* state;
 * the logical lock itself is represented by pool_busy_thread being non-NULL.
 */
void
pool_lock(void)
{
	mutex_enter(&pool_mutex);
	ASSERT(!pool_lock_held());	/* the "lock" is not recursive */
	while (pool_busy_thread != NULL)
		cv_wait(&pool_busy_cv, &pool_mutex);
	pool_busy_thread = curthread;
	mutex_exit(&pool_mutex);
}

/*
 * Interruptible variant of pool_lock().  Returns 0 if the "lock" was
 * acquired, or 1 if the wait was interrupted by a signal.
 */
int
pool_lock_intr(void)
{
	mutex_enter(&pool_mutex);
	ASSERT(!pool_lock_held());
	while (pool_busy_thread != NULL) {
		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
			/*
			 * Interrupted.  We may have consumed a wakeup from
			 * pool_unlock(); pass it along so another waiter
			 * does not miss it.
			 */
			cv_signal(&pool_busy_cv);
			mutex_exit(&pool_mutex);
			return (1);
		}
	}
	pool_busy_thread = curthread;
	mutex_exit(&pool_mutex);
	return (0);
}

/*
 * Return non-zero if the calling thread owns the pools "lock".
 */
int
pool_lock_held(void)
{
	return (pool_busy_thread == curthread);
}

/*
 * Release the pools "lock" and wake up one waiter, if any.
 */
void
pool_unlock(void)
{
	mutex_enter(&pool_mutex);
	ASSERT(pool_lock_held());
	pool_busy_thread = NULL;
	cv_signal(&pool_busy_cv);
	mutex_exit(&pool_mutex);
}

/*
 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
 * with pool_do_bind().
 *
 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
 * operations which modify pool or pset associations.  They can be called
 * while the process is multi-threaded.  In the common case, when current
 * process is not being rebound (PBWAIT flag is not set), these functions
 * will be just incrementing and decrementing reference counts.
 */

/*
 * Enter the pool barrier.  Caller must hold its own p_lock.  If a rebind of
 * this process is in progress (PBWAIT set), block until it completes before
 * entering; otherwise just count ourselves in via p_poolcnt.
 */
void
pool_barrier_enter(void)
{
	proc_t *p = curproc;

	ASSERT(MUTEX_HELD(&p->p_lock));
	while (p->p_poolflag & PBWAIT)
		cv_wait(&p->p_poolcv, &p->p_lock);
	p->p_poolcnt++;
}

/*
 * Leave the pool barrier.  Caller must hold its own p_lock.  If a rebind is
 * pending on this process (PBWAIT set), also decrement the global barrier
 * count on the rebinder's behalf -- the thread doing the final decrement
 * wakes the rebinder -- and then wait here until the rebind finishes.
 */
void
pool_barrier_exit(void)
{
	proc_t *p = curproc;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(p->p_poolcnt > 0);
	p->p_poolcnt--;
	if (p->p_poolflag & PBWAIT) {
		mutex_enter(&pool_barrier_lock);
		ASSERT(pool_barrier_count > 0);
		pool_barrier_count--;
		if (pool_barrier_count == 0)
			cv_signal(&pool_barrier_cv);
		mutex_exit(&pool_barrier_lock);
		while (p->p_poolflag & PBWAIT)
			cv_wait(&p->p_poolcv, &p->p_lock);
	}
}

/*
 * Enable pools facility.
 *
 * Called with the pools lock held and only while the default pool is the
 * sole pool.  Enables the pset plugin, then populates the "system" and
 * default pool property nvlists with their initial values.
 */
static int
pool_enable(void)
{
	int ret;

	ASSERT(pool_lock_held());
	ASSERT(pool_count == 1);

	ret = pool_pset_enable();
	if (ret != 0)
		return (ret);
	/* KM_SLEEP allocations below cannot fail, hence the (void) casts. */
	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
	(void) nvlist_add_string(pool_sys_prop, "system.name",
	    "default");
	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
	(void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
	    "wt-load");

	(void) nvlist_alloc(&pool_default->pool_props,
	    NV_UNIQUE_NAME, KM_SLEEP);
	(void) nvlist_add_string(pool_default->pool_props,
	    "pool.name", "pool_default");
	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
	(void) nvlist_add_int64(pool_default->pool_props,
	    "pool.importance", 1);
	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
	    pool_default->pool_id);

	/* Both system and pool configurations changed; stamp them. */
	pool_sys_mod = pool_pool_mod = gethrtime();

	return (ret);
}

/*
 * Disable pools facility.
 *
 * Fails with EBUSY unless all dynamically created pools have already been
 * destroyed.  Tears down the pset plugin state and frees the property
 * nvlists created by pool_enable().
 */
static int
pool_disable(void)
{
	int ret;

	ASSERT(pool_lock_held());

	if (pool_count > 1)	/* must destroy all pools first */
		return (EBUSY);

	ret = pool_pset_disable();
	if (ret != 0)
		return (ret);
	if (pool_sys_prop != NULL) {
		nvlist_free(pool_sys_prop);
		pool_sys_prop = NULL;
	}
	if (pool_default->pool_props != NULL) {
		nvlist_free(pool_default->pool_props);
		pool_default->pool_props = NULL;
	}
	return (0);
}

/*
 * Look up a pool by its "pool.name" property.  Returns NULL if no pool
 * with that name exists.  Caller must hold the pools lock.
 */
pool_t *
pool_lookup_pool_by_name(char *name)
{
	/* NOTE(review): initializer is dead; overwritten by the for loop. */
	pool_t *pool = pool_default;
	char *p;

	ASSERT(pool_lock_held());
	for (pool = list_head(&pool_list); pool;
	    pool = list_next(&pool_list, pool)) {
		if (nvlist_lookup_string(pool->pool_props,
		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
			return (pool);
	}
	return (NULL);
}

/*
 * Look up a pool by its ID.  Returns NULL if no pool with that ID exists.
 * Caller must hold the pools lock.
 */
pool_t *
pool_lookup_pool_by_id(poolid_t poolid)
{
	/* NOTE(review): initializer is dead; overwritten by the for loop. */
	pool_t *pool = pool_default;

	ASSERT(pool_lock_held());
	for (pool = list_head(&pool_list); pool;
	    pool = list_next(&pool_list, pool)) {
		if (pool->pool_id == poolid)
			return (pool);
	}
	return (NULL);
}

/*
 * Create new pool, associate it with default resource sets, and give
 * it a temporary name.
 */

/*
 * Allocate and register a new pool.  The new pool starts out associated
 * with the default pset and is given a unique temporary name derived from
 * the modification timestamp.  On success, returns 0 and stores the new
 * pool's ID in *poolid.  Caller must hold the pools lock.
 */
static int
pool_pool_create(poolid_t *poolid)
{
	pool_t *pool;
	char pool_name[40];

	ASSERT(pool_lock_held());

	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
	pool->pool_id = *poolid = id_alloc(pool_ids);
	/* New pools initially reference the default processor set. */
	pool->pool_pset = pool_pset_default;
	pool_pset_default->pset_npools++;
	list_insert_tail(&pool_list, pool);
	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
	pool_pool_mod = gethrtime();
	/* Temporary name; userland is expected to rename the pool. */
	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
	    pool_pool_mod);
	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
	pool_count++;
	return (0);
}

/* Argument bundle for the zone_walk() callback below. */
struct destroy_zone_arg {
	pool_t *old;	/* pool being destroyed */
	pool_t *new;	/* pool to rebind affected zones to */
};

/*
 * Update pool pointers for zones that are currently bound to pool "old"
 * to be bound to pool "new".
 */
static int
pool_destroy_zone_cb(zone_t *zone, void *arg)
{
	struct destroy_zone_arg *dza = arg;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (zone_pool_get(zone) == dza->old)
		zone_pool_set(zone, dza->new);
	return (0);
}

/*
 * Destroy specified pool, and rebind all processes in it
 * to the default pool.
 */

/*
 * Destroy the pool identified by poolid.  The default pool cannot be
 * destroyed (EINVAL); an unknown ID yields ESRCH.  All processes bound to
 * the doomed pool are first rebound to the default pool, then any zones
 * pointing at it are redirected, and finally the pool itself is freed.
 * Caller must hold the pools lock.
 */
static int
pool_pool_destroy(poolid_t poolid)
{
	pool_t *pool;
	int ret;

	ASSERT(pool_lock_held());

	if (poolid == POOL_DEFAULT)
		return (EINVAL);
	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
		return (ESRCH);
	/* Move every process in this pool back to the default pool. */
	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
	if (ret == 0) {
		struct destroy_zone_arg dzarg;

		dzarg.old = pool;
		dzarg.new = pool_default;
		/* cpu_lock is required by the zone walk callback. */
		mutex_enter(&cpu_lock);
		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
		mutex_exit(&cpu_lock);
		ASSERT(ret == 0);
		ASSERT(pool->pool_ref == 0);	/* nothing may still be bound */
		(void) nvlist_free(pool->pool_props);
		id_free(pool_ids, pool->pool_id);
		pool->pool_pset->pset_npools--;
		list_remove(&pool_list, pool);
		pool_count--;
		pool_pool_mod = gethrtime();
		kmem_free(pool, sizeof (pool_t));
	}
	return (ret);
}

/*
 * Create new pool or resource set.
 *
 * Dispatches on (class, subclass); currently only pools (PEC_POOL) and
 * processor sets (PEC_RES_COMP/PREC_PSET) can be created.  The new entity's
 * ID is returned through *id.
 */
int
pool_create(int class, int subclass, id_t *id)
{
	int ret;

	ASSERT(pool_lock_held());
	if (pool_state == POOL_DISABLED)
		return (ENOTACTIVE);
	switch (class) {
	case PEC_POOL:
		ret = pool_pool_create((poolid_t *)id);
		break;
	case PEC_RES_COMP:
		switch (subclass) {
		case PREC_PSET:
			ret = pool_pset_create((psetid_t *)id);
			break;
		default:
			ret = EINVAL;
		}
		break;
	case PEC_RES_AGG:
		ret = ENOTSUP;	/* resource aggregates not yet implemented */
		break;
	default:
		ret = EINVAL;
	}
	return (ret);
}

/*
 * Destroy an existing pool or resource set.
501 */ 502 int 503 pool_destroy(int class, int subclass, id_t id) 504 { 505 int ret; 506 507 ASSERT(pool_lock_held()); 508 if (pool_state == POOL_DISABLED) 509 return (ENOTACTIVE); 510 switch (class) { 511 case PEC_POOL: 512 ret = pool_pool_destroy((poolid_t)id); 513 break; 514 case PEC_RES_COMP: 515 switch (subclass) { 516 case PREC_PSET: 517 ret = pool_pset_destroy((psetid_t)id); 518 break; 519 default: 520 ret = EINVAL; 521 } 522 break; 523 case PEC_RES_AGG: 524 ret = ENOTSUP; 525 break; 526 default: 527 ret = EINVAL; 528 } 529 return (ret); 530 } 531 532 /* 533 * Enable or disable pools. 534 */ 535 int 536 pool_status(int status) 537 { 538 int ret = 0; 539 540 ASSERT(pool_lock_held()); 541 542 if (pool_state == status) 543 return (0); 544 switch (status) { 545 case POOL_ENABLED: 546 ret = pool_enable(); 547 if (ret != 0) 548 return (ret); 549 pool_state = POOL_ENABLED; 550 break; 551 case POOL_DISABLED: 552 ret = pool_disable(); 553 if (ret != 0) 554 return (ret); 555 pool_state = POOL_DISABLED; 556 break; 557 default: 558 ret = EINVAL; 559 } 560 return (ret); 561 } 562 563 /* 564 * Associate pool with resource set. 565 */ 566 int 567 pool_assoc(poolid_t poolid, int idtype, id_t id) 568 { 569 int ret; 570 571 ASSERT(pool_lock_held()); 572 if (pool_state == POOL_DISABLED) 573 return (ENOTACTIVE); 574 switch (idtype) { 575 case PREC_PSET: 576 ret = pool_pset_assoc(poolid, (psetid_t)id); 577 break; 578 default: 579 ret = EINVAL; 580 } 581 if (ret == 0) 582 pool_pool_mod = gethrtime(); 583 return (ret); 584 } 585 586 /* 587 * Disassociate resource set from pool. 
588 */ 589 int 590 pool_dissoc(poolid_t poolid, int idtype) 591 { 592 int ret; 593 594 ASSERT(pool_lock_held()); 595 if (pool_state == POOL_DISABLED) 596 return (ENOTACTIVE); 597 switch (idtype) { 598 case PREC_PSET: 599 ret = pool_pset_assoc(poolid, PS_NONE); 600 break; 601 default: 602 ret = EINVAL; 603 } 604 if (ret == 0) 605 pool_pool_mod = gethrtime(); 606 return (ret); 607 } 608 609 /* 610 * Transfer specified quantity of resources between resource sets. 611 */ 612 /*ARGSUSED*/ 613 int 614 pool_transfer(int type, id_t src, id_t dst, uint64_t qty) 615 { 616 int ret = EINVAL; 617 return (ret); 618 } 619 620 /* 621 * Transfer resources specified by their IDs between resource sets. 622 */ 623 int 624 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids) 625 { 626 int ret; 627 628 ASSERT(pool_lock_held()); 629 if (pool_state == POOL_DISABLED) 630 return (ENOTACTIVE); 631 switch (type) { 632 case PREC_PSET: 633 ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst, 634 size, ids); 635 break; 636 default: 637 ret = EINVAL; 638 } 639 return (ret); 640 } 641 642 /* 643 * Bind processes to pools. 644 */ 645 int 646 pool_bind(poolid_t poolid, idtype_t idtype, id_t id) 647 { 648 pool_t *pool; 649 650 ASSERT(pool_lock_held()); 651 652 if (pool_state == POOL_DISABLED) 653 return (ENOTACTIVE); 654 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 655 return (ESRCH); 656 657 switch (idtype) { 658 case P_PID: 659 case P_TASKID: 660 case P_PROJID: 661 case P_ZONEID: 662 break; 663 default: 664 return (EINVAL); 665 } 666 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); 667 } 668 669 /* 670 * Query pool binding of the specifed process. 
 */

/*
 * Return (in *poolid) the ID of the pool the given process is bound to.
 * Only P_PID lookups are supported; P_MYID means the calling process.
 * Returns ESRCH if the pid does not exist.  Caller must hold the pools
 * lock; pidlock and the target's p_lock are taken here (in that order,
 * per the documented lock ordering).
 */
int
pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
{
	proc_t *p;

	if (idtype != P_PID)
		return (ENOTSUP);
	if (id == P_MYID)
		id = curproc->p_pid;

	ASSERT(pool_lock_held());

	mutex_enter(&pidlock);
	if ((p = prfind((pid_t)id)) == NULL) {
		mutex_exit(&pidlock);
		return (ESRCH);
	}
	mutex_enter(&p->p_lock);
	/*
	 * In local zones, lie about pool bindings of processes from
	 * the global zone.
	 */
	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
		pool_t *pool;

		/* Report the caller's zone's pool instead of the real one. */
		pool = zone_pool_get(curproc->p_zone);
		*poolid = pool->pool_id;
	} else {
		*poolid = p->p_pool->pool_id;
	}
	mutex_exit(&p->p_lock);
	mutex_exit(&pidlock);
	return (0);
}

/*
 * Pack the "system" information into a new exacct group object: the
 * system/pool/pset/cpu modification timestamps plus the packed system
 * property nvlist.  Non-global zones see their own zone's pool timestamp
 * rather than the global one.  Caller must hold the pools lock; caller
 * owns (and must free) the returned object.
 */
static ea_object_t *
pool_system_pack(void)
{
	ea_object_t *eo_system;
	size_t bufsz = 0;
	char *buf = NULL;

	ASSERT(pool_lock_held());

	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
	if (INGLOBALZONE(curproc))
		(void) ea_attach_item(eo_system, &pool_pool_mod,
		    sizeof (hrtime_t),
		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
	else
		(void) ea_attach_item(eo_system,
		    &curproc->p_zone->zone_pool_mod,
		    sizeof (hrtime_t),
		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
	/* nvlist_pack() allocates buf; attach copies it, so free it here. */
	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
	(void) ea_attach_item(eo_system, buf, bufsz,
	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
	kmem_free(buf, bufsz);
	return (eo_system);
}

/*
 * Pack information about pools and attach it to specified exacct group.
 */

/*
 * Attach one exacct subgroup per visible pool to eo_system.  Each subgroup
 * carries the pool's ID, its pset's ID, and the packed pool property
 * nvlist.  Processes in non-global zones only see their own zone's pool.
 * Caller must hold the pools lock.
 */
static int
pool_pool_pack(ea_object_t *eo_system)
{
	ea_object_t *eo_pool;
	pool_t *pool;
	size_t bufsz;
	char *buf;
	pool_t *myzonepool;

	ASSERT(pool_lock_held());
	myzonepool = zone_pool_get(curproc->p_zone);
	for (pool = list_head(&pool_list); pool;
	    pool = list_next(&pool_list, pool)) {
		/* Non-global zones may only see their own pool. */
		if (!INGLOBALZONE(curproc) && myzonepool != pool)
			continue;
		bufsz = 0;
		buf = NULL;
		eo_pool = ea_alloc_group(EXT_GROUP |
		    EXC_LOCAL | EXD_GROUP_POOL);
		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
		    NV_ENCODE_NATIVE, 0);
		(void) ea_attach_item(eo_pool, buf, bufsz,
		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
		kmem_free(buf, bufsz);
		(void) ea_attach_to_group(eo_system, eo_pool);
	}
	return (0);
}

/*
 * Pack the whole pool configuration in the specified buffer.
 *
 * With kbuf == NULL (or kbufsz == 0) this is a size probe: the required
 * size is returned in *asize.  Otherwise the configuration is packed into
 * kbuf, failing with ENOMEM if it does not fit.  Caller must hold the
 * pools lock.
 */
int
pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
{
	ea_object_t *eo_system;
	size_t ksize;
	int ret = 0;

	ASSERT(pool_lock_held());

	eo_system = pool_system_pack();		/* 1. pack system */
	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
	ksize = ea_pack_object(eo_system, NULL, 0);	/* size probe */
	if (kbuf == NULL || kbufsz == 0)
		*asize = ksize;
	else if (ksize > kbufsz)
		ret = ENOMEM;
	else
		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
	ea_free_object(eo_system, EUP_ALLOC);
	return (ret);
}

/*
 * Start/end the commit transaction. If commit transaction is currently
 * in progress, then all POOL_QUERY ioctls will return pools configuration
 * at the beginning of transaction.
 */

/*
 * Begin (state == 1) or end (state == 0) a commit transaction.  Beginning
 * a transaction snapshots the packed configuration into pool_buf; ending
 * it discards the snapshot.  Caller must hold the pools lock.
 */
int
pool_commit(int state)
{
	ea_object_t *eo_system;
	int ret = 0;

	ASSERT(pool_lock_held());

	if (pool_state == POOL_DISABLED)
		return (ENOTACTIVE);
	switch (state) {
	case 1:
		/*
		 * Beginning commit transaction.
		 */
		if (pool_buf != NULL)		/* transaction in progress */
			return (EBUSY);
		eo_system = pool_system_pack();		/* 1. pack system */
		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
		ea_free_object(eo_system, EUP_ALLOC);
		break;
	case 0:
		/*
		 * Finishing commit transaction.
		 */
		if (pool_buf != NULL) {
			kmem_free(pool_buf, pool_bufsz);
			pool_buf = NULL;
			pool_bufsz = 0;
		}
		break;
	default:
		ret = EINVAL;
	}
	return (ret);
}

/*
 * Check if the specified property is special, i.e. known to the kernel.
 * Returns the matching entry from the given table, or NULL if the name
 * is not listed (in which case the property is free-form).
 */
static pool_property_t *
pool_property_find(char *name, pool_property_t *list)
{
	pool_property_t *prop;

	for (prop = list; prop->pp_name != NULL; prop++)
		if (strcmp(prop->pp_name, name) == 0)
			return (prop);
	return (NULL);
}

/* Known "system" properties with their types and access permissions. */
static pool_property_t pool_prop_sys[] = {
	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
	{ "system.allocate-method",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.log-level",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.log-location",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.history-file",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ "system.poold.objectives",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ NULL,				0,			0 }
};

/* Known "pool" properties with their types and access permissions. */
static pool_property_t pool_prop_pool[] = {
	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
	{ "pool.scheduler",		DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ NULL,				0,			0 }
};

/*
 * Common routine to put new property on the specified list.
 * Special (table-listed) properties must be writable and carry the
 * expected nvpair type; free-form properties are accepted as-is.
 */
int
pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
{
	pool_property_t *prop;

	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
		/*
		 * No read-only properties or properties with bad types
		 */
		if (!(prop->pp_perm & PP_WRITE) ||
		    prop->pp_type != nvpair_type(pair))
			return (EINVAL);
	}
	return (nvlist_add_nvpair(nvlist, pair));
}

/*
 * Common routine to remove property from the given list.
 * Special properties may only be removed if they are optional.
 */
int
pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
{
	pool_property_t *prop;

	if ((prop = pool_property_find(name, props)) != NULL) {
		if (!(prop->pp_perm & PP_OPTIONAL))
			return (EINVAL);
	}
	return (nvlist_remove_all(nvlist, name));
}

/* Set a "system" property; stamps pool_sys_mod on success. */
static int
pool_system_propput(nvpair_t *pair)
{
	int ret;

	ASSERT(pool_lock_held());
	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
	if (ret == 0)
		pool_sys_mod = gethrtime();
	return (ret);
}

/* Remove a "system" property; stamps pool_sys_mod on success. */
static int
pool_system_proprm(char *name)
{
	int ret;

	ASSERT(pool_lock_held());
	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
	if (ret == 0)
		pool_sys_mod = gethrtime();
	return (ret);
}

/* Set a property on the given pool; stamps pool_pool_mod on success. */
static int
pool_pool_propput(poolid_t poolid, nvpair_t *pair)
{
	pool_t *pool;
	int ret;

	ASSERT(pool_lock_held());
	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
		return (ESRCH);
	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
	if (ret == 0)
		pool_pool_mod = gethrtime();
	return (ret);
}

/* Remove a property from the given pool; stamps pool_pool_mod on success. */
static int
pool_pool_proprm(poolid_t poolid, char *name)
{
	int ret;
	pool_t *pool;

	ASSERT(pool_lock_held());
	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
		return (ESRCH);
	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
	if (ret == 0)
		pool_pool_mod = gethrtime();
	return (ret);
}

/*
 * Top-level property-set entry point: dispatch on the entity class
 * (system, pool, pset, or cpu) to the appropriate propput routine.
 */
int
pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
{
	int ret;

	ASSERT(pool_lock_held());
	if (pool_state == POOL_DISABLED)
		return (ENOTACTIVE);
	switch (class) {
	case PEC_SYSTEM:
		ret = pool_system_propput(pair);
		break;
	case PEC_POOL:
		ret = pool_pool_propput((poolid_t)id, pair);
		break;
	case PEC_RES_COMP:
		switch (subclass) {
		case PREC_PSET:
			ret = pool_pset_propput((psetid_t)id, pair);
			break;
		default:
			ret = EINVAL;
		}
		break;
	case PEC_RES_AGG:
		ret = ENOTSUP;
		break;
	case PEC_COMP:
		switch (subclass) {
		case PCEC_CPU:
			ret = pool_cpu_propput((processorid_t)id, pair);
			break;
		default:
			ret = EINVAL;
		}
		break;
	default:
		ret = EINVAL;
	}
	return (ret);
}

/*
 * Top-level property-remove entry point: dispatch on the entity class
 * (system, pool, pset, or cpu) to the appropriate proprm routine.
 */
int
pool_proprm(int class, int subclass, id_t id, char *name)
{
	int ret;

	ASSERT(pool_lock_held());
	if (pool_state == POOL_DISABLED)
		return (ENOTACTIVE);
	switch (class) {
	case PEC_SYSTEM:
		ret = pool_system_proprm(name);
		break;
	case PEC_POOL:
		ret = pool_pool_proprm((poolid_t)id, name);
		break;
	case PEC_RES_COMP:
		switch (subclass) {
		case PREC_PSET:
			ret = pool_pset_proprm((psetid_t)id, name);
			break;
		default:
			ret = EINVAL;
		}
		break;
	case PEC_RES_AGG:
		ret = ENOTSUP;
		break;
	case PEC_COMP:
		switch (subclass) {
		case PCEC_CPU:
			ret = pool_cpu_proprm((processorid_t)id, name);
			break;
		default:
			ret = EINVAL;
		}
		break;
	default:
		ret = EINVAL;
	}
	return (ret);
}

/*
 * Look up the named property of the specified entity and return it in a
 * freshly allocated nvlist (*nvlp), which the caller must free.  Only pset
 * and cpu entities support per-property lookup here.
 */
int
pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
{
	int ret;
	nvlist_t *nvl;

	ASSERT(pool_lock_held());
	if (pool_state == POOL_DISABLED)
		return (ENOTACTIVE);

	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);

	switch (class) {
	case PEC_SYSTEM:
	case PEC_POOL:
		ret = EINVAL;
		break;
	case PEC_RES_COMP:
		switch (subclass) {
		case PREC_PSET:
			ret = pool_pset_propget((psetid_t)id, name, nvl);
			break;
		default:
			ret = EINVAL;
		}
		break;
	case PEC_RES_AGG:
		ret = ENOTSUP;
		break;
	case PEC_COMP:
		switch (subclass) {
		case PCEC_CPU:
			ret = pool_cpu_propget((processorid_t)id, name, nvl);
			break;
		default:
			ret = EINVAL;
		}
		break;
	default:
		ret = EINVAL;
	}
	if (ret == 0)
		*nvlp = nvl;
	else
		nvlist_free(nvl);
	return (ret);
}

/*
 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
 * in case of failure in pool_do_bind().
 */

/*
 * Clear PBWAIT on a single process and wake its barrier waiters, giving
 * back to the global barrier count any contribution this process made.
 */
static void
pool_bind_wake(proc_t *p)
{
	ASSERT(pool_lock_held());

	mutex_enter(&p->p_lock);
	ASSERT(p->p_poolflag & PBWAIT);
	if (p->p_poolcnt > 0) {
		mutex_enter(&pool_barrier_lock);
		pool_barrier_count -= p->p_poolcnt;
		mutex_exit(&pool_barrier_lock);
	}
	p->p_poolflag &= ~PBWAIT;
	cv_signal(&p->p_poolcv);
	mutex_exit(&p->p_lock);
}

/*
 * Wake every process in the NULL-terminated array built by pool_do_bind().
 */
static void
pool_bind_wakeall(proc_t **procs)
{
	proc_t *p, **pp;

	ASSERT(pool_lock_held());
	for (pp = procs; (p = *pp) != NULL; pp++)
		pool_bind_wake(p);
}

/*
 * Return the scheduling class for this pool, or
 * POOL_CLASS_UNSET if not set
 * POOL_CLASS_INVAL if set to an invalid class ID.
 */
id_t
pool_get_class(pool_t *pool)
{
	char *name;
	id_t cid;

	ASSERT(pool_lock_held());

	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
	    &name) == 0) {
		if (getcidbyname(name, &cid) == 0)
			return (cid);
		else
			return (POOL_CLASS_INVAL);
	}
	return (POOL_CLASS_UNSET);
}

/*
 * Move process to the new scheduling class.
 *
 * Pre-allocates one class-specific buffer per (possible) thread before
 * taking p_lock, then walks the thread list moving each thread whose
 * class differs from cid.  Any buffers left unused (threads already in
 * the target class, or zombies) are freed afterwards.
 */
static void
pool_change_class(proc_t *p, id_t cid)
{
	kthread_t *t;
	void *cldata;
	id_t oldcid;
	void **bufs;
	void **buf;
	int nlwp;
	int ret;
	int i;

	/*
	 * Do not move kernel processes (such as zsched).
	 */
	if (p->p_flag & SSYS)
		return;
	/*
	 * This process is in the pool barrier, so it can't possibly be
	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
	 * (for possible agent LWP which doesn't use pool barrier) as
	 * our upper bound.
	 */
	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;

	/*
	 * Pre-allocate scheduling class specific buffers before
	 * grabbing p_lock.
	 */
	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
		ret = CL_ALLOC(buf, cid, KM_SLEEP);
		ASSERT(ret == 0);	/* cannot fail with KM_SLEEP */
	}

	/*
	 * Move threads one by one to the new scheduling class.
	 * This never fails because we have all the right
	 * privileges here.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_poolflag & PBWAIT);
	buf = bufs;
	t = p->p_tlist;
	ASSERT(t != NULL);
	do {
		if (t->t_cid != cid) {
			oldcid = t->t_cid;
			cldata = t->t_cldata;
			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
			ASSERT(ret == 0);
			CL_EXITCLASS(oldcid, cldata);
			schedctl_set_cidpri(t);
			*buf++ = NULL;	/* buffer consumed by enterclass */
		}
	} while ((t = t->t_forw) != p->p_tlist);
	mutex_exit(&p->p_lock);
	/*
	 * Free unused scheduling class specific buffers.
	 */
	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
		if (*buf != NULL) {
			CL_FREE(cid, *buf);
			*buf = NULL;
		}
	}
	kmem_free(bufs, nlwp * sizeof (void *));
}

/*
 * The meat of the bind operation.  The steps in pool_do_bind are:
 *
 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
 *    such processes to an array.  For any interesting process that has
 *    threads inside the pool barrier set, increment a counter by the
 *    count of such threads.  Once PBWAIT is set on a process, that process
 *    will not disappear.
 *
 * 2) Wait for the counter from step 1 to drop to zero.  Any process which
 *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
 *    will decrement that counter before going to sleep, and the process
 *    calling pool_barrier_exit() which does the final decrement will wake us.
 *
 * 3) For each interesting process, perform a calculation on it to see if
This uses the following three 1255 * resource-set-specific functions: 1256 * 1257 * - int set_bind_start(procs, pool) 1258 * 1259 * Determine whether the given array of processes can be bound to the 1260 * resource set associated with the given pool. If it can, take and hold 1261 * any locks necessary to ensure that the operation will succeed, and 1262 * make any necessary reservations in the target resource set. If it 1263 * can't, return failure with no reservations made and no new locks held. 1264 * 1265 * - void set_bind_abort(procs, pool) 1266 * 1267 * set_bind_start() has completed successfully, but another resource set's 1268 * set_bind_start() has failed, and we haven't begun the bind yet. Undo 1269 * any reservations made and drop any locks acquired by our 1270 * set_bind_start(). 1271 * 1272 * - void set_bind_finish(void) 1273 * 1274 * The bind has completed successfully. The processes have been released, 1275 * and the reservation acquired in set_bind_start() has been depleted as 1276 * the processes have finished their bindings. Drop any locks acquired by 1277 * set_bind_start(). 1278 * 1279 * 4) If we've decided that we can proceed with the bind, iterate through 1280 * the list of interesting processes, grab the necessary locks (which 1281 * may differ per resource set), perform the bind, and ASSERT that it 1282 * succeeds. Once a process has been rebound, it can be awakened. 1283 * 1284 * The operations from step 4 must be kept in sync with anything which might 1285 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and 1286 * are thus located in the same source files as the associated bind operations. 
 */
/*
 * Bind all processes matching (idtype, id) to "pool".  "flags" selects
 * which resource sets are rebound (e.g. POOL_BIND_PSET).  The caller must
 * hold pool_lock().  Returns 0 on success or an errno value (EINVAL,
 * ESRCH, EBUSY, EAGAIN, or the error from pset_bind_start()).
 */
int
pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
{
	extern uint_t nproc;
	klwp_t *lwp = ttolwp(curthread);
	proc_t **pp, **procs;
	proc_t *prstart;
	int procs_count = 0;
	kproject_t *kpj;
	procset_t set;
	zone_t *zone;
	int procs_size;
	int rv = 0;
	proc_t *p;
	id_t cid = -1;

	ASSERT(pool_lock_held());

	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
		return (EINVAL);

	if (idtype == P_ZONEID) {
		zone = zone_find_by_id(id);
		if (zone == NULL)
			return (ESRCH);
		/* refuse to bind a zone that is already shutting down */
		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
			zone_rele(zone);
			return (EBUSY);
		}
	}

	if (idtype == P_PROJID) {
		kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
		if (kpj == NULL)
			return (ESRCH);
		/* serialize pool rebinds of this project */
		mutex_enter(&kpj->kpj_poolbind);
	}

	if (idtype == P_PID) {
		/*
		 * Fast-path for a single process case.
		 */
		procs_size = 2;	/* procs is NULL-terminated */
		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
		mutex_enter(&pidlock);
	} else {
		/*
		 * We will need enough slots for proc_t pointers for as many as
		 * twice the number of currently running processes (assuming
		 * that each one could be in fork() creating a new child).
		 */
		for (;;) {
			procs_size = nproc * 2;
			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
			    KM_SLEEP);
			mutex_enter(&pidlock);

			/* re-check under pidlock that the array still fits */
			if (nproc * 2 <= procs_size)
				break;
			/*
			 * If nproc has changed, try again.
			 */
			mutex_exit(&pidlock);
			kmem_free(procs, procs_size * sizeof (proc_t *));
		}
	}

	if (id == P_MYID)
		id = getmyid(idtype);
	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);

	/*
	 * Do a first scan, and select target processes.
	 */
	if (idtype == P_PID)
		prstart = prfind(id);
	else
		prstart = practive;
	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
		mutex_enter(&p->p_lock);
		/*
		 * Skip processes that don't match our (id, idtype) set or
		 * are on the way of becoming zombies. Skip kernel processes
		 * from the global zone.
		 */
		if (procinset(p, &set) == 0 ||
		    p->p_poolflag & PEXITED ||
		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
			mutex_exit(&p->p_lock);
			continue;
		}
		if (!INGLOBALZONE(p)) {
			switch (idtype) {
			case P_PID:
			case P_TASKID:
				/*
				 * Can't bind processes or tasks
				 * in local zones to pools.
				 */
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				pool_bind_wakeall(procs);
				rv = EINVAL;
				goto out;
			case P_PROJID:
				/*
				 * Only projects in the global
				 * zone can be rebound.
				 */
				mutex_exit(&p->p_lock);
				continue;
			case P_POOLID:
				/*
				 * When rebinding pools, processes can be
				 * in different zones.
				 */
				break;
			}
		}

		p->p_poolflag |= PBWAIT;
		/*
		 * If some threads in this process are inside the pool
		 * barrier, add them to pool_barrier_count, as we have
		 * to wait for all of them to exit the barrier.
		 */
		if (p->p_poolcnt > 0) {
			mutex_enter(&pool_barrier_lock);
			pool_barrier_count += p->p_poolcnt;
			mutex_exit(&pool_barrier_lock);
		}
		ASSERT(pp < &procs[procs_size]);
		*pp++ = p;
		procs_count++;
		mutex_exit(&p->p_lock);

		/*
		 * We just found our process, so if we're only rebinding a
		 * single process then get out of this loop.
		 */
		if (idtype == P_PID)
			break;
	}
	*pp = NULL;	/* cap off the end of the array */
	mutex_exit(&pidlock);

	/*
	 * Wait for relevant processes to stop before they try to enter the
	 * barrier or at the exit from the barrier. Make sure that we do
	 * not get stopped here while we're holding pool_lock. If we were
	 * requested to stop, or got a signal then return EAGAIN to let the
	 * library know that it needs to retry.
	 */
	mutex_enter(&pool_barrier_lock);
	lwp->lwp_nostop++;
	while (pool_barrier_count > 0) {
		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
		if (pool_barrier_count > 0) {
			/*
			 * We either got a signal or were requested to
			 * stop by /proc. Bail out with EAGAIN. If we were
			 * requested to stop, we'll stop in post_syscall()
			 * on our way back to userland.
			 */
			mutex_exit(&pool_barrier_lock);
			pool_bind_wakeall(procs);
			lwp->lwp_nostop--;
			rv = EAGAIN;
			goto out;
		}
	}
	lwp->lwp_nostop--;
	mutex_exit(&pool_barrier_lock);

	/* the single-process fast path needs no second scan */
	if (idtype == P_PID)
		goto skip;

	/*
	 * Do another run, and drop processes that were inside the barrier
	 * in exit(), but when they have dropped to pool_barrier_exit
	 * they have become of no interest to us. Pick up child processes that
	 * were created by fork() but didn't exist during our first scan.
	 * Their parents are now stopped at pool_barrier_exit in cfork().
	 */
	mutex_enter(&pidlock);
	for (pp = procs; (p = *pp) != NULL; pp++) {
		if (p->p_poolflag & PEXITED) {
			ASSERT(p->p_lwpcnt == 0);
			pool_bind_wake(p);
			/* flip w/last non-NULL slot */
			*pp = procs[procs_count - 1];
			procs[procs_count - 1] = NULL;
			procs_count--;
			pp--;	/* try this slot again */
			continue;
		}
		/*
		 * Look at the child and check if it should be rebound also.
		 * We're holding pidlock, so it is safe to reference p_child.
		 */
		if ((p = p->p_child) == NULL)
			continue;

		mutex_enter(&p->p_lock);
		/*
		 * Skip processes in local zones if we're not binding
		 * zones to pools (P_ZONEID). Skip kernel processes also.
		 */
		if ((!INGLOBALZONE(p) && idtype != P_ZONEID) ||
		    p->p_flag & SSYS) {
			mutex_exit(&p->p_lock);
			continue;
		}

		/*
		 * If the child process has been already created by fork(), has
		 * not exited, and has not been added to the list already,
		 * then add it now. We will hit this process again (since we
		 * stick it at the end of the procs list) but it will be
		 * ignored because it will have the PBWAIT flag set.
		 */
		if (procinset(p, &set) &&
		    !(p->p_poolflag & PEXITED) &&
		    !(p->p_poolflag & PBWAIT)) {
			ASSERT(p->p_child == NULL); /* no child of a child */
			procs[procs_count] = p;
			procs[procs_count + 1] = NULL;
			procs_count++;
			p->p_poolflag |= PBWAIT;
		}
		mutex_exit(&p->p_lock);
	}
	mutex_exit(&pidlock);
skip:
	/*
	 * If there's no processes to rebind then return ESRCH, unless
	 * we're associating a pool with new resource set, destroying it,
	 * or binding a zone to a pool.
	 */
	if (procs_count == 0) {
		if (idtype == P_POOLID || idtype == P_ZONEID)
			rv = 0;
		else
			rv = ESRCH;
		goto out;
	}

#ifdef DEBUG
	/*
	 * All processes in the array should have PBWAIT set, and none should
	 * be in the critical section. Even though p_poolflag is protected by
	 * the p_lock, these assertions should be stable across the dropping of
	 * p_lock.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		ASSERT(p->p_poolflag & PBWAIT);
		ASSERT(p->p_poolcnt == 0);
		ASSERT(procinset(p, &set));
	}
#endif

	/*
	 * Do the check if processor set rebinding is going to succeed or not.
	 */
	if ((flags & POOL_BIND_PSET) &&
	    (rv = pset_bind_start(procs, pool)) != 0) {
		pool_bind_wakeall(procs);
		goto out;
	}

	/*
	 * At this point, all bind operations should succeed.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		if (flags & POOL_BIND_PSET) {
			psetid_t psetid = pool->pool_pset->pset_id;
			void *zonebuf;
			void *projbuf;

			/*
			 * Pre-allocate one buffer for FSS (per-project
			 * buffer for a new pset) in case this is the
			 * first thread from its current project getting
			 * bound to this processor set.
			 */
			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);

			mutex_enter(&pidlock);
			mutex_enter(&p->p_lock);
			pool_pset_bind(p, psetid, projbuf, zonebuf);
			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);
			/*
			 * Free the buffers pre-allocated above if they
			 * weren't actually used.
			 */
			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
		}
		/*
		 * Now let's change the scheduling class of this
		 * process if our target pool has it defined.
		 */
		if (cid != POOL_CLASS_UNSET)
			pool_change_class(p, cid);

		/*
		 * It is safe to reference p_pool here without holding
		 * p_lock because it cannot change underneath of us.
		 * We're holding pool_lock here, so nobody else can be
		 * moving this process between pools. If process "p"
		 * would be exiting, we're guaranteed that it would be blocked
		 * at pool_barrier_enter() in exit(). Otherwise, it would've
		 * been skipped by one of our scans of the practive list
		 * as a process with PEXITED flag set.
		 */
		if (p->p_pool != pool) {
			ASSERT(p->p_pool->pool_ref > 0);
			atomic_add_32(&p->p_pool->pool_ref, -1);
			p->p_pool = pool;
			atomic_add_32(&p->p_pool->pool_ref, 1);
		}
		/*
		 * Okay, we've tortured this guy enough.
		 * Let this poor process go now.
		 */
		pool_bind_wake(p);
	}
	if (flags & POOL_BIND_PSET)
		pset_bind_finish();

out:	switch (idtype) {
	case P_PROJID:
		ASSERT(kpj != NULL);
		mutex_exit(&kpj->kpj_poolbind);
		project_rele(kpj);
		break;
	case P_ZONEID:
		if (rv == 0) {
			mutex_enter(&cpu_lock);
			zone_pool_set(zone, pool);
			mutex_exit(&cpu_lock);
		}
		/* record the time of the last pool-binding attempt */
		zone->zone_pool_mod = gethrtime();
		zone_rele(zone);
		break;
	}

	kmem_free(procs, procs_size * sizeof (proc_t *));
	ASSERT(pool_barrier_count == 0);
	return (rv);
}