/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/pool.h>
#include <sys/pool_impl.h>
#include <sys/pool_pset.h>
#include <sys/id_space.h>
#include <sys/mutex.h>
#include <sys/nvpair.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fss.h>
#include <sys/class.h>
#include <sys/exacct.h>
#include <sys/utsname.h>
#include <sys/procset.h>
#include <sys/atomic.h>
#include <sys/zone.h>
#include <sys/policy.h>
#include <sys/schedctl.h>

/*
 * RESOURCE POOLS
 *
 * The resource pools facility brings together process-bindable resources into
 * a common abstraction called a pool. Processor sets and other entities can
 * be configured, grouped, and labelled such that workload components can be
 * associated with a subset of a system's total resources.
 *
 * When disabled, the pools facility is "invisible". All processes belong
 * to the same pool (pool_default), and processor sets can be managed through
 * the old pset() system call. When enabled, processor sets can only be
 * managed via the pools facility. New pools can be created and associated
 * with processor sets. Processes can be bound to pools which have non-empty
 * resource sets.
 *
 * Locking: pool_lock() protects global pools state and must be called
 * before modifying the configuration, or when taking a snapshot of the
 * configuration. If pool_lock_intr() is used, the operation may be
 * interrupted by a signal or a request.
 *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit(). This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
 *
 * Lock ordering with respect to other locks is as follows:
 *
 * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
 *
 * Most static and global variables defined in this file are protected
 * by calling pool_lock().
 *
 * The operation that binds tasks and projects to pools is atomic. That is,
 * either all processes in a given task or a project will be bound to a
 * new pool, or (in case of an error) they will all be left bound to the
 * old pool.
 * Processes in a given task or a given project can only be bound to
 * different pools if they were rebound individually one by one as single
 * processes. Threads or LWPs of the same process do not have pool bindings,
 * and are bound to the same resource sets associated with the resource pool
 * of that process.
 *
 * The following picture shows one possible pool configuration with three
 * pools and three processor sets. Note that processor set "foo" is not
 * associated with any pools and therefore cannot have any processes
 * bound to it. Two pools (default and foo) are associated with the
 * same processor set (default). Also, note that processes in Task 2
 * are bound to different pools.
 *
 *
 *                                                            Processor Sets
 *                                                              +---------+
 *                  +----------------+========================>| default |
 *                 a|                |                         +---------+
 *                 s|                |                             ||
 *                 s|                |                         +---------+
 *                 o|                |                         |   foo   |
 *                 c|                |                         +---------+
 *                 i|                |                             ||
 *                 a|                |                         +---------+
 *                 t|                |                +------->|   bar   |
 *                 e|                |                |        +---------+
 *                 d|                |                |
 *                  |                |                |
 *              +---------+      +---------+      +---------+
 * Pools        | default |======|   foo   |======|   bar   |
 *              +---------+      +---------+      +---------+
 *               @     @          @     @          @     @
 *              b|     |          |     |          |     |
 *              o|     |          |     |          |     |
 *              u|     +----+     |     +-+        |     +-+
 *              n|          |     |       |        |       |
 *          ....d|..........|.....|.......|........|.......|....
 *          :    |   ::     |     |       |   ::   |       |   :
 *          :  +---+ ::   +---+ +---+   +---+ :: +---+   +---+ :
 * Processes:  | p | ::   | p | | p |   | p | :: | p |...| p | :
 *          :  +---+ ::   +---+ +---+   +---+ :: +---+   +---+ :
 *          :........::.......................::...............:
 *             Task 1           Task 2              Task N
 *               |                 |                   |
 *               |                 |                   |
 *               |  +-----------+  |              +-----------+
 *               +--| Project 1 |--+              | Project N |
 *                  +-----------+                 +-----------+
 *
 * This is just an illustration of relationships between processes, tasks,
 * projects, pools, and processor sets. New types of resource sets will be
 * added in the future.
 */

pool_t *pool_default;			/* default pool which always exists */
int pool_count;				/* number of pools created on this system */
int pool_state;				/* pools state -- enabled/disabled */
void *pool_buf;				/* pre-commit snapshot of the pools state */
size_t pool_bufsz;			/* size of pool_buf */
static hrtime_t pool_pool_mod;		/* last modification time for pools */
static hrtime_t pool_sys_mod;		/* last modification time for system */
static nvlist_t *pool_sys_prop;		/* system properties */
static id_space_t *pool_ids;		/* pool ID space */
static list_t pool_list;		/* doubly-linked list of pools */
static kmutex_t pool_mutex;		/* protects pool_busy_* */
static kcondvar_t pool_busy_cv;		/* waiting for "pool_lock" */
static kthread_t *pool_busy_thread;	/* thread holding "pool_lock" */
static kmutex_t pool_barrier_lock;	/* synch. with pool_barrier_* */
static kcondvar_t pool_barrier_cv;	/* synch. with pool_barrier_* */
static int pool_barrier_count;		/* synch. with pool_barrier_* */

/*
 * Boot-time pool initialization.
 */
void
pool_init(void)
{
    pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);

    /*
     * Initialize default pool.
     */
    pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
    pool_default->pool_id = POOL_DEFAULT;
    list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
    list_insert_head(&pool_list, pool_default);

    /*
     * Initialize plugins for resource sets.
     */
    pool_pset_init();
    pool_count = 1;
    p0.p_pool = pool_default;
    global_zone->zone_pool = pool_default;
    pool_default->pool_ref = 1;
}

/*
 * Synchronization routines.
 *
 * pool_lock is only called from syscall-level routines (processor_bind(),
 * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long
 * periods of time, including across sleeping operations, so we allow its
 * acquisition to be interruptible.
 *
 * The current thread that owns the "lock" is stored in the variable
 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
 */
void
pool_lock(void)
{
    mutex_enter(&pool_mutex);
    ASSERT(!pool_lock_held());
    while (pool_busy_thread != NULL)
        cv_wait(&pool_busy_cv, &pool_mutex);
    pool_busy_thread = curthread;
    mutex_exit(&pool_mutex);
}

int
pool_lock_intr(void)
{
    mutex_enter(&pool_mutex);
    ASSERT(!pool_lock_held());
    while (pool_busy_thread != NULL) {
        if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
            cv_signal(&pool_busy_cv);
            mutex_exit(&pool_mutex);
            return (1);
        }
    }
    pool_busy_thread = curthread;
    mutex_exit(&pool_mutex);
    return (0);
}

int
pool_lock_held(void)
{
    return (pool_busy_thread == curthread);
}

void
pool_unlock(void)
{
    mutex_enter(&pool_mutex);
    ASSERT(pool_lock_held());
    pool_busy_thread = NULL;
    cv_signal(&pool_busy_cv);
    mutex_exit(&pool_mutex);
}

/*
 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
 * with pool_do_bind().
 *
 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
 * operations which modify pool or pset associations. They can be called
 * while the process is multi-threaded. In the common case, when the current
 * process is not being rebound (the PBWAIT flag is not set), these functions
 * just increment and decrement a reference count.
 */
void
pool_barrier_enter(void)
{
    proc_t *p = curproc;

    ASSERT(MUTEX_HELD(&p->p_lock));
    while (p->p_poolflag & PBWAIT)
        cv_wait(&p->p_poolcv, &p->p_lock);
    p->p_poolcnt++;
}

void
pool_barrier_exit(void)
{
    proc_t *p = curproc;

    ASSERT(MUTEX_HELD(&p->p_lock));
    ASSERT(p->p_poolcnt > 0);
    p->p_poolcnt--;
    if (p->p_poolflag & PBWAIT) {
        mutex_enter(&pool_barrier_lock);
        ASSERT(pool_barrier_count > 0);
        pool_barrier_count--;
        if (pool_barrier_count == 0)
            cv_signal(&pool_barrier_cv);
        mutex_exit(&pool_barrier_lock);
        while (p->p_poolflag & PBWAIT)
            cv_wait(&p->p_poolcv, &p->p_lock);
    }
}
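
/*
 * Illustrative sketch (not part of the original file): the typical shape of
 * a configuration-changing caller of the synchronization routines above.
 * pool_lock_intr() is used so that a signal delivered while another thread
 * holds the pools "lock" aborts the wait. The helper my_reconfigure() and
 * the EINTR mapping are hypothetical. Compiled out with #if 0.
 */
#if 0
static int
pool_example_configure(void)
{
    int ret;

    if (pool_lock_intr() != 0)      /* wait was interrupted by a signal */
        return (EINTR);
    /* configuration is now frozen; safe to modify or snapshot it */
    ret = my_reconfigure();         /* hypothetical operation */
    pool_unlock();
    return (ret);
}
#endif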
281 */ 282 static int 283 pool_enable(void) 284 { 285 int ret; 286 287 ASSERT(pool_lock_held()); 288 ASSERT(pool_count == 1); 289 290 ret = pool_pset_enable(); 291 if (ret != 0) 292 return (ret); 293 (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP); 294 (void) nvlist_add_string(pool_sys_prop, "system.name", 295 "default"); 296 (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); 297 (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); 298 (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); 299 (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", 300 "wt-load"); 301 302 (void) nvlist_alloc(&pool_default->pool_props, 303 NV_UNIQUE_NAME, KM_SLEEP); 304 (void) nvlist_add_string(pool_default->pool_props, 305 "pool.name", "pool_default"); 306 (void) nvlist_add_string(pool_default->pool_props, "pool.comment", ""); 307 (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1); 308 (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1); 309 (void) nvlist_add_int64(pool_default->pool_props, 310 "pool.importance", 1); 311 (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id", 312 pool_default->pool_id); 313 314 pool_sys_mod = pool_pool_mod = gethrtime(); 315 316 return (ret); 317 } 318 319 /* 320 * Disable pools facility. 321 */ 322 static int 323 pool_disable(void) 324 { 325 int ret; 326 327 ASSERT(pool_lock_held()); 328 329 if (pool_count > 1) /* must destroy all pools first */ 330 return (EBUSY); 331 332 ret = pool_pset_disable(); 333 if (ret != 0) 334 return (ret); 335 if (pool_sys_prop != NULL) { 336 nvlist_free(pool_sys_prop); 337 pool_sys_prop = NULL; 338 } 339 if (pool_default->pool_props != NULL) { 340 nvlist_free(pool_default->pool_props); 341 pool_default->pool_props = NULL; 342 } 343 return (0); 344 } 345 346 pool_t * 347 pool_lookup_pool_by_name(char *name) 348 { 349 pool_t *pool = pool_default; 350 char *p; 351 352 ASSERT(pool_lock_held()); 353 for (pool = list_head(&pool_list); pool; 354 pool = list_next(&pool_list, pool)) { 355 if (nvlist_lookup_string(pool->pool_props, 356 "pool.name", &p) == 0 && strcmp(name, p) == 0) 357 return (pool); 358 } 359 return (NULL); 360 } 361 362 pool_t * 363 pool_lookup_pool_by_id(poolid_t poolid) 364 { 365 pool_t *pool = pool_default; 366 367 ASSERT(pool_lock_held()); 368 for (pool = list_head(&pool_list); pool; 369 pool = list_next(&pool_list, pool)) { 370 if (pool->pool_id == poolid) 371 return (pool); 372 } 373 return (NULL); 374 } 375 376 /* 377 * Create new pool, associate it with default resource sets, and give 378 * it a temporary name. 
379 */ 380 static int 381 pool_pool_create(poolid_t *poolid) 382 { 383 pool_t *pool; 384 char pool_name[40]; 385 386 ASSERT(pool_lock_held()); 387 388 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 389 pool->pool_id = *poolid = id_alloc(pool_ids); 390 pool->pool_pset = pool_pset_default; 391 pool_pset_default->pset_npools++; 392 list_insert_tail(&pool_list, pool); 393 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP); 394 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id); 395 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0); 396 pool_pool_mod = gethrtime(); 397 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld", 398 pool_pool_mod); 399 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name); 400 pool_count++; 401 return (0); 402 } 403 404 struct destroy_zone_arg { 405 pool_t *old; 406 pool_t *new; 407 }; 408 409 /* 410 * Update pool pointers for zones that are currently bound to pool "old" 411 * to be bound to pool "new". 412 */ 413 static int 414 pool_destroy_zone_cb(zone_t *zone, void *arg) 415 { 416 struct destroy_zone_arg *dza = arg; 417 418 ASSERT(pool_lock_held()); 419 ASSERT(MUTEX_HELD(&cpu_lock)); 420 421 if (zone_pool_get(zone) == dza->old) 422 zone_pool_set(zone, dza->new); 423 return (0); 424 } 425 426 /* 427 * Destroy specified pool, and rebind all processes in it 428 * to the default pool. 429 */ 430 static int 431 pool_pool_destroy(poolid_t poolid) 432 { 433 pool_t *pool; 434 int ret; 435 436 ASSERT(pool_lock_held()); 437 438 if (poolid == POOL_DEFAULT) 439 return (EINVAL); 440 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 441 return (ESRCH); 442 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL); 443 if (ret == 0) { 444 struct destroy_zone_arg dzarg; 445 446 dzarg.old = pool; 447 dzarg.new = pool_default; 448 mutex_enter(&cpu_lock); 449 ret = zone_walk(pool_destroy_zone_cb, &dzarg); 450 mutex_exit(&cpu_lock); 451 ASSERT(ret == 0); 452 ASSERT(pool->pool_ref == 0); 453 (void) nvlist_free(pool->pool_props); 454 id_free(pool_ids, pool->pool_id); 455 pool->pool_pset->pset_npools--; 456 list_remove(&pool_list, pool); 457 pool_count--; 458 pool_pool_mod = gethrtime(); 459 kmem_free(pool, sizeof (pool_t)); 460 } 461 return (ret); 462 } 463 464 /* 465 * Create new pool or resource set. 466 */ 467 int 468 pool_create(int class, int subclass, id_t *id) 469 { 470 int ret; 471 472 ASSERT(pool_lock_held()); 473 if (pool_state == POOL_DISABLED) 474 return (ENOTACTIVE); 475 switch (class) { 476 case PEC_POOL: 477 ret = pool_pool_create((poolid_t *)id); 478 break; 479 case PEC_RES_COMP: 480 switch (subclass) { 481 case PREC_PSET: 482 ret = pool_pset_create((psetid_t *)id); 483 break; 484 default: 485 ret = EINVAL; 486 } 487 break; 488 case PEC_RES_AGG: 489 ret = ENOTSUP; 490 break; 491 default: 492 ret = EINVAL; 493 } 494 return (ret); 495 } 496 497 /* 498 * Destroy an existing pool or resource set. 499 */ 500 int 501 pool_destroy(int class, int subclass, id_t id) 502 { 503 int ret; 504 505 ASSERT(pool_lock_held()); 506 if (pool_state == POOL_DISABLED) 507 return (ENOTACTIVE); 508 switch (class) { 509 case PEC_POOL: 510 ret = pool_pool_destroy((poolid_t)id); 511 break; 512 case PEC_RES_COMP: 513 switch (subclass) { 514 case PREC_PSET: 515 ret = pool_pset_destroy((psetid_t)id); 516 break; 517 default: 518 ret = EINVAL; 519 } 520 break; 521 case PEC_RES_AGG: 522 ret = ENOTSUP; 523 break; 524 default: 525 ret = EINVAL; 526 } 527 return (ret); 528 } 529 530 /* 531 * Enable or disable pools. 
532 */ 533 int 534 pool_status(int status) 535 { 536 int ret = 0; 537 538 ASSERT(pool_lock_held()); 539 540 if (pool_state == status) 541 return (0); 542 switch (status) { 543 case POOL_ENABLED: 544 ret = pool_enable(); 545 if (ret != 0) 546 return (ret); 547 pool_state = POOL_ENABLED; 548 break; 549 case POOL_DISABLED: 550 ret = pool_disable(); 551 if (ret != 0) 552 return (ret); 553 pool_state = POOL_DISABLED; 554 break; 555 default: 556 ret = EINVAL; 557 } 558 return (ret); 559 } 560 561 /* 562 * Associate pool with resource set. 563 */ 564 int 565 pool_assoc(poolid_t poolid, int idtype, id_t id) 566 { 567 int ret; 568 569 ASSERT(pool_lock_held()); 570 if (pool_state == POOL_DISABLED) 571 return (ENOTACTIVE); 572 switch (idtype) { 573 case PREC_PSET: 574 ret = pool_pset_assoc(poolid, (psetid_t)id); 575 break; 576 default: 577 ret = EINVAL; 578 } 579 if (ret == 0) 580 pool_pool_mod = gethrtime(); 581 return (ret); 582 } 583 584 /* 585 * Disassociate resource set from pool. 586 */ 587 int 588 pool_dissoc(poolid_t poolid, int idtype) 589 { 590 int ret; 591 592 ASSERT(pool_lock_held()); 593 if (pool_state == POOL_DISABLED) 594 return (ENOTACTIVE); 595 switch (idtype) { 596 case PREC_PSET: 597 ret = pool_pset_assoc(poolid, PS_NONE); 598 break; 599 default: 600 ret = EINVAL; 601 } 602 if (ret == 0) 603 pool_pool_mod = gethrtime(); 604 return (ret); 605 } 606 607 /* 608 * Transfer specified quantity of resources between resource sets. 609 */ 610 /*ARGSUSED*/ 611 int 612 pool_transfer(int type, id_t src, id_t dst, uint64_t qty) 613 { 614 int ret = EINVAL; 615 return (ret); 616 } 617 618 /* 619 * Transfer resources specified by their IDs between resource sets. 620 */ 621 int 622 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids) 623 { 624 int ret; 625 626 ASSERT(pool_lock_held()); 627 if (pool_state == POOL_DISABLED) 628 return (ENOTACTIVE); 629 switch (type) { 630 case PREC_PSET: 631 ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst, 632 size, ids); 633 break; 634 default: 635 ret = EINVAL; 636 } 637 return (ret); 638 } 639 640 /* 641 * Bind processes to pools. 642 */ 643 int 644 pool_bind(poolid_t poolid, idtype_t idtype, id_t id) 645 { 646 pool_t *pool; 647 648 ASSERT(pool_lock_held()); 649 650 if (pool_state == POOL_DISABLED) 651 return (ENOTACTIVE); 652 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 653 return (ESRCH); 654 655 switch (idtype) { 656 case P_PID: 657 case P_TASKID: 658 case P_PROJID: 659 case P_ZONEID: 660 break; 661 default: 662 return (EINVAL); 663 } 664 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); 665 } 666 667 /* 668 * Query pool binding of the specifed process. 669 */ 670 int 671 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid) 672 { 673 proc_t *p; 674 675 if (idtype != P_PID) 676 return (ENOTSUP); 677 if (id == P_MYID) 678 id = curproc->p_pid; 679 680 ASSERT(pool_lock_held()); 681 682 mutex_enter(&pidlock); 683 if ((p = prfind((pid_t)id)) == NULL) { 684 mutex_exit(&pidlock); 685 return (ESRCH); 686 } 687 mutex_enter(&p->p_lock); 688 /* 689 * In local zones, lie about pool bindings of processes from 690 * the global zone. 
691 */ 692 if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) { 693 pool_t *pool; 694 695 pool = zone_pool_get(curproc->p_zone); 696 *poolid = pool->pool_id; 697 } else { 698 *poolid = p->p_pool->pool_id; 699 } 700 mutex_exit(&p->p_lock); 701 mutex_exit(&pidlock); 702 return (0); 703 } 704 705 static ea_object_t * 706 pool_system_pack(void) 707 { 708 ea_object_t *eo_system; 709 size_t bufsz = 0; 710 char *buf = NULL; 711 712 ASSERT(pool_lock_held()); 713 714 eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM); 715 (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t), 716 EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64); 717 if (INGLOBALZONE(curproc)) 718 (void) ea_attach_item(eo_system, &pool_pool_mod, 719 sizeof (hrtime_t), 720 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 721 else 722 (void) ea_attach_item(eo_system, 723 &curproc->p_zone->zone_pool_mod, 724 sizeof (hrtime_t), 725 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 726 (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t), 727 EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64); 728 (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t), 729 EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64); 730 (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0); 731 (void) ea_attach_item(eo_system, buf, bufsz, 732 EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW); 733 kmem_free(buf, bufsz); 734 return (eo_system); 735 } 736 737 /* 738 * Pack information about pools and attach it to specified exacct group. 739 */ 740 static int 741 pool_pool_pack(ea_object_t *eo_system) 742 { 743 ea_object_t *eo_pool; 744 pool_t *pool; 745 size_t bufsz; 746 char *buf; 747 pool_t *myzonepool; 748 749 ASSERT(pool_lock_held()); 750 myzonepool = zone_pool_get(curproc->p_zone); 751 for (pool = list_head(&pool_list); pool; 752 pool = list_next(&pool_list, pool)) { 753 if (!INGLOBALZONE(curproc) && myzonepool != pool) 754 continue; 755 bufsz = 0; 756 buf = NULL; 757 eo_pool = ea_alloc_group(EXT_GROUP | 758 EXC_LOCAL | EXD_GROUP_POOL); 759 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t), 760 EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32); 761 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id, 762 sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32); 763 (void) nvlist_pack(pool->pool_props, &buf, &bufsz, 764 NV_ENCODE_NATIVE, 0); 765 (void) ea_attach_item(eo_pool, buf, bufsz, 766 EXC_LOCAL | EXD_POOL_PROP | EXT_RAW); 767 kmem_free(buf, bufsz); 768 (void) ea_attach_to_group(eo_system, eo_pool); 769 } 770 return (0); 771 } 772 773 /* 774 * Pack the whole pool configuration in the specified buffer. 775 */ 776 int 777 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize) 778 { 779 ea_object_t *eo_system; 780 size_t ksize; 781 int ret = 0; 782 783 ASSERT(pool_lock_held()); 784 785 eo_system = pool_system_pack(); /* 1. pack system */ 786 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 787 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 788 ksize = ea_pack_object(eo_system, NULL, 0); 789 if (kbuf == NULL || kbufsz == 0) 790 *asize = ksize; 791 else if (ksize > kbufsz) 792 ret = ENOMEM; 793 else 794 *asize = ea_pack_object(eo_system, kbuf, kbufsz); 795 ea_free_object(eo_system, EUP_ALLOC); 796 return (ret); 797 } 798 799 /* 800 * Start/end the commit transaction. If commit transaction is currently 801 * in progress, then all POOL_QUERY ioctls will return pools configuration 802 * at the beginning of transaction. 
803 */ 804 int 805 pool_commit(int state) 806 { 807 ea_object_t *eo_system; 808 int ret = 0; 809 810 ASSERT(pool_lock_held()); 811 812 if (pool_state == POOL_DISABLED) 813 return (ENOTACTIVE); 814 switch (state) { 815 case 1: 816 /* 817 * Beginning commit transation. 818 */ 819 if (pool_buf != NULL) /* transaction in progress */ 820 return (EBUSY); 821 eo_system = pool_system_pack(); /* 1. pack system */ 822 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 823 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 824 pool_bufsz = ea_pack_object(eo_system, NULL, 0); 825 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP); 826 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz); 827 ea_free_object(eo_system, EUP_ALLOC); 828 break; 829 case 0: 830 /* 831 * Finishing commit transaction. 832 */ 833 if (pool_buf != NULL) { 834 kmem_free(pool_buf, pool_bufsz); 835 pool_buf = NULL; 836 pool_bufsz = 0; 837 } 838 break; 839 default: 840 ret = EINVAL; 841 } 842 return (ret); 843 } 844 845 /* 846 * Check is the specified property is special 847 */ 848 static pool_property_t * 849 pool_property_find(char *name, pool_property_t *list) 850 { 851 pool_property_t *prop; 852 853 for (prop = list; prop->pp_name != NULL; prop++) 854 if (strcmp(prop->pp_name, name) == 0) 855 return (prop); 856 return (NULL); 857 } 858 859 static pool_property_t pool_prop_sys[] = { 860 { "system.name", DATA_TYPE_STRING, PP_RDWR }, 861 { "system.comment", DATA_TYPE_STRING, PP_RDWR }, 862 { "system.version", DATA_TYPE_UINT64, PP_READ }, 863 { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR }, 864 { "system.allocate-method", DATA_TYPE_STRING, 865 PP_RDWR | PP_OPTIONAL }, 866 { "system.poold.log-level", DATA_TYPE_STRING, 867 PP_RDWR | PP_OPTIONAL }, 868 { "system.poold.log-location", DATA_TYPE_STRING, 869 PP_RDWR | PP_OPTIONAL }, 870 { "system.poold.monitor-interval", DATA_TYPE_UINT64, 871 PP_RDWR | PP_OPTIONAL }, 872 { "system.poold.history-file", DATA_TYPE_STRING, 873 PP_RDWR | PP_OPTIONAL }, 874 { "system.poold.objectives", DATA_TYPE_STRING, 875 PP_RDWR | PP_OPTIONAL }, 876 { NULL, 0, 0 } 877 }; 878 879 static pool_property_t pool_prop_pool[] = { 880 { "pool.sys_id", DATA_TYPE_UINT64, PP_READ }, 881 { "pool.name", DATA_TYPE_STRING, PP_RDWR }, 882 { "pool.default", DATA_TYPE_BYTE, PP_READ }, 883 { "pool.active", DATA_TYPE_BYTE, PP_RDWR }, 884 { "pool.importance", DATA_TYPE_INT64, PP_RDWR }, 885 { "pool.comment", DATA_TYPE_STRING, PP_RDWR }, 886 { "pool.scheduler", DATA_TYPE_STRING, 887 PP_RDWR | PP_OPTIONAL }, 888 { NULL, 0, 0 } 889 }; 890 891 /* 892 * Common routine to put new property on the specified list 893 */ 894 int 895 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props) 896 { 897 pool_property_t *prop; 898 899 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) { 900 /* 901 * No read-only properties or properties with bad types 902 */ 903 if (!(prop->pp_perm & PP_WRITE) || 904 prop->pp_type != nvpair_type(pair)) 905 return (EINVAL); 906 } 907 return (nvlist_add_nvpair(nvlist, pair)); 908 } 909 910 /* 911 * Common routine to remove property from the given list 912 */ 913 int 914 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props) 915 { 916 pool_property_t *prop; 917 918 if ((prop = pool_property_find(name, props)) != NULL) { 919 if (!(prop->pp_perm & PP_OPTIONAL)) 920 return (EINVAL); 921 } 922 return (nvlist_remove_all(nvlist, name)); 923 } 924 925 static int 926 pool_system_propput(nvpair_t *pair) 927 { 928 int ret; 929 930 

    ASSERT(pool_lock_held());
    ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
    if (ret == 0)
        pool_sys_mod = gethrtime();
    return (ret);
}

static int
pool_system_proprm(char *name)
{
    int ret;

    ASSERT(pool_lock_held());
    ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
    if (ret == 0)
        pool_sys_mod = gethrtime();
    return (ret);
}

static int
pool_pool_propput(poolid_t poolid, nvpair_t *pair)
{
    pool_t *pool;
    int ret;

    ASSERT(pool_lock_held());
    if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
        return (ESRCH);
    ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
    if (ret == 0)
        pool_pool_mod = gethrtime();
    return (ret);
}

static int
pool_pool_proprm(poolid_t poolid, char *name)
{
    int ret;
    pool_t *pool;

    ASSERT(pool_lock_held());
    if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
        return (ESRCH);
    ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
    if (ret == 0)
        pool_pool_mod = gethrtime();
    return (ret);
}

int
pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
{
    int ret;

    ASSERT(pool_lock_held());
    if (pool_state == POOL_DISABLED)
        return (ENOTACTIVE);
    switch (class) {
    case PEC_SYSTEM:
        ret = pool_system_propput(pair);
        break;
    case PEC_POOL:
        ret = pool_pool_propput((poolid_t)id, pair);
        break;
    case PEC_RES_COMP:
        switch (subclass) {
        case PREC_PSET:
            ret = pool_pset_propput((psetid_t)id, pair);
            break;
        default:
            ret = EINVAL;
        }
        break;
    case PEC_RES_AGG:
        ret = ENOTSUP;
        break;
    case PEC_COMP:
        switch (subclass) {
        case PCEC_CPU:
            ret = pool_cpu_propput((processorid_t)id, pair);
            break;
        default:
            ret = EINVAL;
        }
        break;
    default:
        ret = EINVAL;
    }
    return (ret);
}

int
pool_proprm(int class, int subclass, id_t id, char *name)
{
    int ret;

    ASSERT(pool_lock_held());
    if (pool_state == POOL_DISABLED)
        return (ENOTACTIVE);
    switch (class) {
    case PEC_SYSTEM:
        ret = pool_system_proprm(name);
        break;
    case PEC_POOL:
        ret = pool_pool_proprm((poolid_t)id, name);
        break;
    case PEC_RES_COMP:
        switch (subclass) {
        case PREC_PSET:
            ret = pool_pset_proprm((psetid_t)id, name);
            break;
        default:
            ret = EINVAL;
        }
        break;
    case PEC_RES_AGG:
        ret = ENOTSUP;
        break;
    case PEC_COMP:
        switch (subclass) {
        case PCEC_CPU:
            ret = pool_cpu_proprm((processorid_t)id, name);
            break;
        default:
            ret = EINVAL;
        }
        break;
    default:
        ret = EINVAL;
    }
    return (ret);
}
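
/*
 * Illustrative sketch (not part of the original file): setting a writable
 * pool property through the dispatcher above. A caller builds an nvlist
 * holding the property, takes its single nvpair, and hands it to
 * pool_propput(); nvlist_add_nvpair() copies the pair, so the temporary
 * list can be freed afterwards. The helper itself is hypothetical.
 * Compiled out with #if 0.
 */
#if 0
static int
pool_example_set_comment(poolid_t poolid, const char *comment)
{
    nvlist_t *nvl;
    int ret;

    ASSERT(pool_lock_held());
    (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
    (void) nvlist_add_string(nvl, "pool.comment", (char *)comment);
    /* pool_propput() rejects read-only or wrongly-typed properties */
    ret = pool_propput(PEC_POOL, 0, (id_t)poolid,
        nvlist_next_nvpair(nvl, NULL));
    nvlist_free(nvl);
    return (ret);
}
#endif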

int
pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
{
    int ret;
    nvlist_t *nvl;

    ASSERT(pool_lock_held());
    if (pool_state == POOL_DISABLED)
        return (ENOTACTIVE);

    (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);

    switch (class) {
    case PEC_SYSTEM:
    case PEC_POOL:
        ret = EINVAL;
        break;
    case PEC_RES_COMP:
        switch (subclass) {
        case PREC_PSET:
            ret = pool_pset_propget((psetid_t)id, name, nvl);
            break;
        default:
            ret = EINVAL;
        }
        break;
    case PEC_RES_AGG:
        ret = ENOTSUP;
        break;
    case PEC_COMP:
        switch (subclass) {
        case PCEC_CPU:
            ret = pool_cpu_propget((processorid_t)id, name, nvl);
            break;
        default:
            ret = EINVAL;
        }
        break;
    default:
        ret = EINVAL;
    }
    if (ret == 0)
        *nvlp = nvl;
    else
        nvlist_free(nvl);
    return (ret);
}

/*
 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
 * in case of failure in pool_do_bind().
 */
static void
pool_bind_wake(proc_t *p)
{
    ASSERT(pool_lock_held());

    mutex_enter(&p->p_lock);
    ASSERT(p->p_poolflag & PBWAIT);
    if (p->p_poolcnt > 0) {
        mutex_enter(&pool_barrier_lock);
        pool_barrier_count -= p->p_poolcnt;
        mutex_exit(&pool_barrier_lock);
    }
    p->p_poolflag &= ~PBWAIT;
    cv_signal(&p->p_poolcv);
    mutex_exit(&p->p_lock);
}

static void
pool_bind_wakeall(proc_t **procs)
{
    proc_t *p, **pp;

    ASSERT(pool_lock_held());
    for (pp = procs; (p = *pp) != NULL; pp++)
        pool_bind_wake(p);
}

/*
 * Return the scheduling class for this pool, or
 *	POOL_CLASS_UNSET if not set, or
 *	POOL_CLASS_INVAL if set to an invalid class ID.
 */
id_t
pool_get_class(pool_t *pool)
{
    char *name;
    id_t cid;

    ASSERT(pool_lock_held());

    if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
        &name) == 0) {
        if (getcidbyname(name, &cid) == 0)
            return (cid);
        else
            return (POOL_CLASS_INVAL);
    }
    return (POOL_CLASS_UNSET);
}

/*
 * Move process to the new scheduling class.
 */
static void
pool_change_class(proc_t *p, id_t cid)
{
    kthread_t *t;
    void *cldata;
    id_t oldcid;
    void **bufs;
    void **buf;
    int nlwp;
    int ret;
    int i;

    /*
     * Do not move kernel processes (such as zsched).
     */
    if (p->p_flag & SSYS)
        return;
    /*
     * This process is in the pool barrier, so it can't possibly be
     * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
     * (for a possible agent LWP, which doesn't use the pool barrier)
     * as our upper bound.
     */
    nlwp = p->p_lwpcnt + p->p_zombcnt + 1;

    /*
     * Pre-allocate scheduling class specific buffers before
     * grabbing p_lock.
     */
    bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
    for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
        ret = CL_ALLOC(buf, cid, KM_SLEEP);
        ASSERT(ret == 0);
    }

    /*
     * Move threads one by one to the new scheduling class.
     * This never fails because we have all the right
     * privileges here.
     */
    mutex_enter(&p->p_lock);
    ASSERT(p->p_poolflag & PBWAIT);
    buf = bufs;
    t = p->p_tlist;
    ASSERT(t != NULL);
    do {
        if (t->t_cid != cid) {
            oldcid = t->t_cid;
            cldata = t->t_cldata;
            ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
            ASSERT(ret == 0);
            CL_EXITCLASS(oldcid, cldata);
            schedctl_set_cidpri(t);
            *buf++ = NULL;
        }
    } while ((t = t->t_forw) != p->p_tlist);
    mutex_exit(&p->p_lock);
    /*
     * Free unused scheduling class specific buffers.
     */
    for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
        if (*buf != NULL) {
            CL_FREE(cid, *buf);
            *buf = NULL;
        }
    }
    kmem_free(bufs, nlwp * sizeof (void *));
}
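
/*
 * Illustrative sketch (not part of the original file): the allocate-
 * outside-the-lock pattern used by pool_change_class() above, reduced to
 * its skeleton. All names below are hypothetical. Buffers are preallocated
 * with KM_SLEEP before the lock is taken, consumed under the lock, and any
 * leftovers are freed afterwards, so no sleeping allocation ever happens
 * while the lock is held. Compiled out with #if 0.
 */
#if 0
static void
example_change_all(example_obj_t **objs, int nobj, kmutex_t *lock)
{
    void **bufs = kmem_zalloc(nobj * sizeof (void *), KM_SLEEP);
    int i;

    for (i = 0; i < nobj; i++)           /* 1. preallocate; may sleep */
        bufs[i] = kmem_zalloc(EXAMPLE_BUFSZ, KM_SLEEP);
    mutex_enter(lock);
    for (i = 0; i < nobj; i++) {
        if (example_needs_change(objs[i])) {  /* 2. commit under lock */
            example_apply(objs[i], bufs[i]);
            bufs[i] = NULL;              /* buffer consumed */
        }
    }
    mutex_exit(lock);
    for (i = 0; i < nobj; i++)           /* 3. free unused buffers */
        if (bufs[i] != NULL)
            kmem_free(bufs[i], EXAMPLE_BUFSZ);
    kmem_free(bufs, nobj * sizeof (void *));
}
#endif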

/*
 * The meat of the bind operation. The steps in pool_do_bind are:
 *
 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
 *    such processes to an array. For any interesting process that has
 *    threads inside the pool barrier set, increment a counter by the
 *    count of such threads. Once PBWAIT is set on a process, that process
 *    will not disappear.
 *
 * 2) Wait for the counter from step 1 to drop to zero. Any process which
 *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
 *    will decrement that counter before going to sleep, and the process
 *    calling pool_barrier_exit() which does the final decrement will wake us.
 *
 * 3) For each interesting process, perform a calculation on it to see if
 *    the bind will actually succeed. This uses the following three
 *    resource-set-specific functions:
 *
 *    - int set_bind_start(procs, pool)
 *
 *      Determine whether the given array of processes can be bound to the
 *      resource set associated with the given pool. If it can, take and hold
 *      any locks necessary to ensure that the operation will succeed, and
 *      make any necessary reservations in the target resource set. If it
 *      can't, return failure with no reservations made and no new locks held.
 *
 *    - void set_bind_abort(procs, pool)
 *
 *      set_bind_start() has completed successfully, but another resource
 *      set's set_bind_start() has failed, and we haven't begun the bind yet.
 *      Undo any reservations made and drop any locks acquired by our
 *      set_bind_start().
 *
 *    - void set_bind_finish(void)
 *
 *      The bind has completed successfully. The processes have been released,
 *      and the reservation acquired in set_bind_start() has been depleted as
 *      the processes have finished their bindings. Drop any locks acquired by
 *      set_bind_start().
 *
 * 4) If we've decided that we can proceed with the bind, iterate through
 *    the list of interesting processes, grab the necessary locks (which
 *    may differ per resource set), perform the bind, and ASSERT that it
 *    succeeds. Once a process has been rebound, it can be awakened.
 *
 * The operations from step 4 must be kept in sync with anything which might
 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and are
 * thus located in the same source files as the associated bind operations.
 */
int
pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
{
    extern uint_t nproc;
    klwp_t *lwp = ttolwp(curthread);
    proc_t **pp, **procs;
    proc_t *prstart;
    int procs_count = 0;
    kproject_t *kpj;
    procset_t set;
    zone_t *zone;
    int procs_size;
    int rv = 0;
    proc_t *p;
    id_t cid = -1;

    ASSERT(pool_lock_held());

    if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
        return (EINVAL);

    if (idtype == P_ZONEID) {
        zone = zone_find_by_id(id);
        if (zone == NULL)
            return (ESRCH);
        if (zone_status_get(zone) > ZONE_IS_RUNNING) {
            zone_rele(zone);
            return (EBUSY);
        }
    }

    if (idtype == P_PROJID) {
        kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
        if (kpj == NULL)
            return (ESRCH);
        mutex_enter(&kpj->kpj_poolbind);
    }

    if (idtype == P_PID) {
        /*
         * Fast-path for a single process case.
1327 */ 1328 procs_size = 2; /* procs is NULL-terminated */ 1329 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); 1330 mutex_enter(&pidlock); 1331 } else { 1332 /* 1333 * We will need enough slots for proc_t pointers for as many as 1334 * twice the number of currently running processes (assuming 1335 * that each one could be in fork() creating a new child). 1336 */ 1337 for (;;) { 1338 procs_size = nproc * 2; 1339 procs = kmem_zalloc(procs_size * sizeof (proc_t *), 1340 KM_SLEEP); 1341 mutex_enter(&pidlock); 1342 1343 if (nproc * 2 <= procs_size) 1344 break; 1345 /* 1346 * If nproc has changed, try again. 1347 */ 1348 mutex_exit(&pidlock); 1349 kmem_free(procs, procs_size * sizeof (proc_t *)); 1350 } 1351 } 1352 1353 if (id == P_MYID) 1354 id = getmyid(idtype); 1355 setprocset(&set, POP_AND, idtype, id, P_ALL, 0); 1356 1357 /* 1358 * Do a first scan, and select target processes. 1359 */ 1360 if (idtype == P_PID) 1361 prstart = prfind(id); 1362 else 1363 prstart = practive; 1364 for (p = prstart, pp = procs; p != NULL; p = p->p_next) { 1365 mutex_enter(&p->p_lock); 1366 /* 1367 * Skip processes that don't match our (id, idtype) set or 1368 * on the way of becoming zombies. Skip kernel processes 1369 * from the global zone. 1370 */ 1371 if (procinset(p, &set) == 0 || 1372 p->p_poolflag & PEXITED || 1373 ((p->p_flag & SSYS) && INGLOBALZONE(p))) { 1374 mutex_exit(&p->p_lock); 1375 continue; 1376 } 1377 if (!INGLOBALZONE(p)) { 1378 switch (idtype) { 1379 case P_PID: 1380 case P_TASKID: 1381 /* 1382 * Can't bind processes or tasks 1383 * in local zones to pools. 1384 */ 1385 mutex_exit(&p->p_lock); 1386 mutex_exit(&pidlock); 1387 pool_bind_wakeall(procs); 1388 rv = EINVAL; 1389 goto out; 1390 case P_PROJID: 1391 /* 1392 * Only projects in the global 1393 * zone can be rebound. 1394 */ 1395 mutex_exit(&p->p_lock); 1396 continue; 1397 case P_POOLID: 1398 /* 1399 * When rebinding pools, processes can be 1400 * in different zones. 1401 */ 1402 break; 1403 } 1404 } 1405 1406 p->p_poolflag |= PBWAIT; 1407 /* 1408 * If some threads in this process are inside the pool 1409 * barrier, add them to pool_barrier_count, as we have 1410 * to wait for all of them to exit the barrier. 1411 */ 1412 if (p->p_poolcnt > 0) { 1413 mutex_enter(&pool_barrier_lock); 1414 pool_barrier_count += p->p_poolcnt; 1415 mutex_exit(&pool_barrier_lock); 1416 } 1417 ASSERT(pp < &procs[procs_size]); 1418 *pp++ = p; 1419 procs_count++; 1420 mutex_exit(&p->p_lock); 1421 1422 /* 1423 * We just found our process, so if we're only rebinding a 1424 * single process then get out of this loop. 1425 */ 1426 if (idtype == P_PID) 1427 break; 1428 } 1429 *pp = NULL; /* cap off the end of the array */ 1430 mutex_exit(&pidlock); 1431 1432 /* 1433 * Wait for relevant processes to stop before they try to enter the 1434 * barrier or at the exit from the barrier. Make sure that we do 1435 * not get stopped here while we're holding pool_lock. If we were 1436 * requested to stop, or got a signal then return EAGAIN to let the 1437 * library know that it needs to retry. 1438 */ 1439 mutex_enter(&pool_barrier_lock); 1440 lwp->lwp_nostop++; 1441 while (pool_barrier_count > 0) { 1442 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock); 1443 if (pool_barrier_count > 0) { 1444 /* 1445 * We either got a signal or were requested to 1446 * stop by /proc. Bail out with EAGAIN. If we were 1447 * requested to stop, we'll stop in post_syscall() 1448 * on our way back to userland. 
1449 */ 1450 mutex_exit(&pool_barrier_lock); 1451 pool_bind_wakeall(procs); 1452 lwp->lwp_nostop--; 1453 rv = EAGAIN; 1454 goto out; 1455 } 1456 } 1457 lwp->lwp_nostop--; 1458 mutex_exit(&pool_barrier_lock); 1459 1460 if (idtype == P_PID) { 1461 if ((p = *procs) == NULL) 1462 goto skip; 1463 mutex_enter(&p->p_lock); 1464 /* Drop the process if it is exiting */ 1465 if (p->p_poolflag & PEXITED) { 1466 mutex_exit(&p->p_lock); 1467 pool_bind_wake(p); 1468 procs_count--; 1469 } else 1470 mutex_exit(&p->p_lock); 1471 goto skip; 1472 } 1473 1474 /* 1475 * Do another run, and drop processes that were inside the barrier 1476 * in exit(), but when they have dropped to pool_barrier_exit 1477 * they have become of no interest to us. Pick up child processes that 1478 * were created by fork() but didn't exist during our first scan. 1479 * Their parents are now stopped at pool_barrier_exit in cfork(). 1480 */ 1481 mutex_enter(&pidlock); 1482 for (pp = procs; (p = *pp) != NULL; pp++) { 1483 mutex_enter(&p->p_lock); 1484 if (p->p_poolflag & PEXITED) { 1485 ASSERT(p->p_lwpcnt == 0); 1486 mutex_exit(&p->p_lock); 1487 pool_bind_wake(p); 1488 /* flip w/last non-NULL slot */ 1489 *pp = procs[procs_count - 1]; 1490 procs[procs_count - 1] = NULL; 1491 procs_count--; 1492 pp--; /* try this slot again */ 1493 continue; 1494 } else 1495 mutex_exit(&p->p_lock); 1496 /* 1497 * Look at the child and check if it should be rebound also. 1498 * We're holding pidlock, so it is safe to reference p_child. 1499 */ 1500 if ((p = p->p_child) == NULL) 1501 continue; 1502 1503 mutex_enter(&p->p_lock); 1504 1505 /* 1506 * Skip system processes and make sure that the child is in 1507 * the same task/project/pool/zone as the parent. 1508 */ 1509 if ((!INGLOBALZONE(p) && idtype != P_ZONEID && 1510 idtype != P_POOLID) || p->p_flag & SSYS) { 1511 mutex_exit(&p->p_lock); 1512 continue; 1513 } 1514 1515 /* 1516 * If the child process has been already created by fork(), has 1517 * not exited, and has not been added to the list already, 1518 * then add it now. We will hit this process again (since we 1519 * stick it at the end of the procs list) but it will ignored 1520 * because it will have the PBWAIT flag set. 1521 */ 1522 if (procinset(p, &set) && 1523 !(p->p_poolflag & PEXITED) && 1524 !(p->p_poolflag & PBWAIT)) { 1525 ASSERT(p->p_child == NULL); /* no child of a child */ 1526 procs[procs_count] = p; 1527 procs[procs_count + 1] = NULL; 1528 procs_count++; 1529 p->p_poolflag |= PBWAIT; 1530 } 1531 mutex_exit(&p->p_lock); 1532 } 1533 mutex_exit(&pidlock); 1534 skip: 1535 /* 1536 * If there's no processes to rebind then return ESRCH, unless 1537 * we're associating a pool with new resource set, destroying it, 1538 * or binding a zone to a pool. 1539 */ 1540 if (procs_count == 0) { 1541 if (idtype == P_POOLID || idtype == P_ZONEID) 1542 rv = 0; 1543 else 1544 rv = ESRCH; 1545 goto out; 1546 } 1547 1548 #ifdef DEBUG 1549 /* 1550 * All processes in the array should have PBWAIT set, and none 1551 * should be in the critical section. Thus, although p_poolflag 1552 * and p_poolcnt are protected by p_lock, their ASSERTions below 1553 * should be stable without it. procinset(), however, ASSERTs that 1554 * the p_lock is held upon entry. 
1555 */ 1556 for (pp = procs; (p = *pp) != NULL; pp++) { 1557 int in_set; 1558 1559 mutex_enter(&p->p_lock); 1560 in_set = procinset(p, &set); 1561 mutex_exit(&p->p_lock); 1562 1563 ASSERT(in_set); 1564 ASSERT(p->p_poolflag & PBWAIT); 1565 ASSERT(p->p_poolcnt == 0); 1566 } 1567 #endif 1568 1569 /* 1570 * Do the check if processor set rebinding is going to succeed or not. 1571 */ 1572 if ((flags & POOL_BIND_PSET) && 1573 (rv = pset_bind_start(procs, pool)) != 0) { 1574 pool_bind_wakeall(procs); 1575 goto out; 1576 } 1577 1578 /* 1579 * At this point, all bind operations should succeed. 1580 */ 1581 for (pp = procs; (p = *pp) != NULL; pp++) { 1582 if (flags & POOL_BIND_PSET) { 1583 psetid_t psetid = pool->pool_pset->pset_id; 1584 void *zonebuf; 1585 void *projbuf; 1586 1587 /* 1588 * Pre-allocate one buffer for FSS (per-project 1589 * buffer for a new pset) in case if this is the 1590 * first thread from its current project getting 1591 * bound to this processor set. 1592 */ 1593 projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ); 1594 zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE); 1595 1596 mutex_enter(&pidlock); 1597 mutex_enter(&p->p_lock); 1598 pool_pset_bind(p, psetid, projbuf, zonebuf); 1599 mutex_exit(&p->p_lock); 1600 mutex_exit(&pidlock); 1601 /* 1602 * Free buffers pre-allocated above if it 1603 * wasn't actually used. 1604 */ 1605 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 1606 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 1607 } 1608 /* 1609 * Now let's change the scheduling class of this 1610 * process if our target pool has it defined. 1611 */ 1612 if (cid != POOL_CLASS_UNSET) 1613 pool_change_class(p, cid); 1614 1615 /* 1616 * It is safe to reference p_pool here without holding 1617 * p_lock because it cannot change underneath of us. 1618 * We're holding pool_lock here, so nobody else can be 1619 * moving this process between pools. If process "p" 1620 * would be exiting, we're guaranteed that it would be blocked 1621 * at pool_barrier_enter() in exit(). Otherwise, it would've 1622 * been skipped by one of our scans of the practive list 1623 * as a process with PEXITED flag set. 1624 */ 1625 if (p->p_pool != pool) { 1626 ASSERT(p->p_pool->pool_ref > 0); 1627 atomic_add_32(&p->p_pool->pool_ref, -1); 1628 p->p_pool = pool; 1629 atomic_add_32(&p->p_pool->pool_ref, 1); 1630 } 1631 /* 1632 * Okay, we've tortured this guy enough. 1633 * Let this poor process go now. 1634 */ 1635 pool_bind_wake(p); 1636 } 1637 if (flags & POOL_BIND_PSET) 1638 pset_bind_finish(); 1639 1640 out: switch (idtype) { 1641 case P_PROJID: 1642 ASSERT(kpj != NULL); 1643 mutex_exit(&kpj->kpj_poolbind); 1644 project_rele(kpj); 1645 break; 1646 case P_ZONEID: 1647 if (rv == 0) { 1648 mutex_enter(&cpu_lock); 1649 zone_pool_set(zone, pool); 1650 mutex_exit(&cpu_lock); 1651 } 1652 zone->zone_pool_mod = gethrtime(); 1653 zone_rele(zone); 1654 break; 1655 } 1656 1657 kmem_free(procs, procs_size * sizeof (proc_t *)); 1658 ASSERT(pool_barrier_count == 0); 1659 return (rv); 1660 } 1661