1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/pool.h> 29 #include <sys/pool_impl.h> 30 #include <sys/pool_pset.h> 31 #include <sys/id_space.h> 32 #include <sys/mutex.h> 33 #include <sys/nvpair.h> 34 #include <sys/cpuvar.h> 35 #include <sys/errno.h> 36 #include <sys/cmn_err.h> 37 #include <sys/systm.h> 38 #include <sys/proc.h> 39 #include <sys/fss.h> 40 #include <sys/class.h> 41 #include <sys/exacct.h> 42 #include <sys/utsname.h> 43 #include <sys/procset.h> 44 #include <sys/atomic.h> 45 #include <sys/zone.h> 46 #include <sys/policy.h> 47 48 /* 49 * RESOURCE POOLS 50 * 51 * The resource pools facility brings together process-bindable resource into 52 * a common abstraction called a pool. Processor sets and other entities can 53 * be configured, grouped, and labelled such that workload components can be 54 * associated with a subset of a system's total resources. 55 * 56 * When disabled, the pools facility is "invisible". 
All processes belong 57 * to the same pool (pool_default), and processor sets can be managed through 58 * the old pset() system call. When enabled, processor sets can only be 59 * managed via the pools facility. New pools can be created and associated 60 * with processor sets. Processes can be bound to pools which have non-empty 61 * resource sets. 62 * 63 * Locking: pool_lock() protects global pools state and must be called 64 * before modifying the configuration, or when taking a snapshot of the 65 * configuration. If pool_lock_intr() is used, the operation may be 66 * interrupted by a signal or a request. 67 * 68 * To prevent processes from being rebound between pools while they are 69 * in the middle of an operation which affects resource set bindings, such 70 * operations must be surrounded by calls to pool_barrier_enter() and 71 * pool_barrier_exit(). This mechanism guarantees that such processes will 72 * be stopped either at the beginning or at the end of the barrier so that 73 * the rebind operation can atomically bind the process and its threads 74 * to new resource sets, and then let the process run again. 75 * 76 * Lock ordering with respect to other locks is as follows: 77 * 78 * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock 79 * 80 * Most static and global variables defined in this file are protected 81 * by calling pool_lock(). 82 * 83 * The operation that binds tasks and projects to pools is atomic. That is, 84 * either all processes in a given task or a project will be bound to a 85 * new pool, or (in case of an error) they will be all left bound to the 86 * old pool. Processes in a given task or a given project can only be bound to 87 * different pools if they were rebound individually one by one as single 88 * processes. Threads or LWPs of the same process do not have pool bindings, 89 * and are bound to the same resource sets associated with the resource pool 90 * of that process. 
91 * 92 * The following picture shows one possible pool configuration with three 93 * pools and three processor sets. Note that processor set "foo" is not 94 * associated with any pools and therefore cannot have any processes 95 * bound to it. Two pools (default and foo) are associated with the 96 * same processor set (default). Also, note that processes in Task 2 97 * are bound to different pools. 98 * 99 * 100 * Processor Sets 101 * +---------+ 102 * +--------------+========================>| default | 103 * a| | +---------+ 104 * s| | || 105 * s| | +---------+ 106 * o| | | foo | 107 * c| | +---------+ 108 * i| | || 109 * a| | +---------+ 110 * t| | +------>| bar | 111 * e| | | +---------+ 112 * d| | | 113 * | | | 114 * +---------+ +---------+ +---------+ 115 * Pools | default |======| foo |======| bar | 116 * +---------+ +---------+ +---------+ 117 * @ @ @ @ @ @ 118 * b| | | | | | 119 * o| | | | | | 120 * u| +-----+ | +-------+ | +---+ 121 * n| | | | | | 122 * ....d|........|......|......|.........|.......|.... 123 * : | :: | | | :: | | : 124 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 125 * Processes : | p | :: | p | | p | | p | :: | p |...| p | : 126 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 127 * :........::......................::...............: 128 * Task 1 Task 2 Task N 129 * | | | 130 * | | | 131 * | +-----------+ | +-----------+ 132 * +--| Project 1 |--+ | Project N | 133 * +-----------+ +-----------+ 134 * 135 * This is just an illustration of relationships between processes, tasks, 136 * projects, pools, and processor sets. New types of resource sets will be 137 * added in the future. 
138 */ 139 140 pool_t *pool_default; /* default pool which always exists */ 141 int pool_count; /* number of pools created on this system */ 142 int pool_state; /* pools state -- enabled/disabled */ 143 void *pool_buf; /* pre-commit snapshot of the pools state */ 144 size_t pool_bufsz; /* size of pool_buf */ 145 static hrtime_t pool_pool_mod; /* last modification time for pools */ 146 static hrtime_t pool_sys_mod; /* last modification time for system */ 147 static nvlist_t *pool_sys_prop; /* system properties */ 148 static id_space_t *pool_ids; /* pool ID space */ 149 static list_t pool_list; /* doubly-linked list of pools */ 150 static kmutex_t pool_mutex; /* protects pool_busy_* */ 151 static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */ 152 static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */ 153 static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */ 154 static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */ 155 static int pool_barrier_count; /* synch. with pool_barrier_* */ 156 157 /* 158 * Boot-time pool initialization. 159 */ 160 void 161 pool_init(void) 162 { 163 pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID); 164 165 /* 166 * Initialize default pool. 167 */ 168 pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 169 pool_default->pool_id = POOL_DEFAULT; 170 list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link)); 171 list_insert_head(&pool_list, pool_default); 172 173 /* 174 * Initialize plugins for resource sets. 175 */ 176 pool_pset_init(); 177 pool_count = 1; 178 p0.p_pool = pool_default; 179 global_zone->zone_pool = pool_default; 180 pool_default->pool_ref = 1; 181 } 182 183 /* 184 * Synchronization routines. 185 * 186 * pool_lock is only called from syscall-level routines (processor_bind(), 187 * pset_*(), and /dev/pool ioctls). 
The pool "lock" may be held for long 188 * periods of time, including across sleeping operations, so we allow its 189 * acquisition to be interruptible. 190 * 191 * The current thread that owns the "lock" is stored in the variable 192 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging. 193 */ 194 void 195 pool_lock(void) 196 { 197 mutex_enter(&pool_mutex); 198 while (pool_busy_thread != NULL) 199 cv_wait(&pool_busy_cv, &pool_mutex); 200 pool_busy_thread = curthread; 201 mutex_exit(&pool_mutex); 202 } 203 204 int 205 pool_lock_intr(void) 206 { 207 mutex_enter(&pool_mutex); 208 while (pool_busy_thread != NULL) { 209 if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) { 210 cv_signal(&pool_busy_cv); 211 mutex_exit(&pool_mutex); 212 return (1); 213 } 214 } 215 pool_busy_thread = curthread; 216 mutex_exit(&pool_mutex); 217 return (0); 218 } 219 220 int 221 pool_lock_held(void) 222 { 223 return (pool_busy_thread == curthread); 224 } 225 226 void 227 pool_unlock(void) 228 { 229 mutex_enter(&pool_mutex); 230 pool_busy_thread = NULL; 231 cv_signal(&pool_busy_cv); 232 mutex_exit(&pool_mutex); 233 } 234 235 /* 236 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize 237 * with pool_do_bind(). 238 * 239 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all 240 * operations which modify pool or pset associations. They can be called 241 * while the process is multi-threaded. In the common case, when current 242 * process is not being rebound (PBWAIT flag is not set), these functions 243 * will be just incrementing and decrementing reference counts. 
244 */ 245 void 246 pool_barrier_enter(void) 247 { 248 proc_t *p = curproc; 249 250 ASSERT(MUTEX_HELD(&p->p_lock)); 251 while (p->p_poolflag & PBWAIT) 252 cv_wait(&p->p_poolcv, &p->p_lock); 253 p->p_poolcnt++; 254 } 255 256 void 257 pool_barrier_exit(void) 258 { 259 proc_t *p = curproc; 260 261 ASSERT(MUTEX_HELD(&p->p_lock)); 262 ASSERT(p->p_poolcnt > 0); 263 p->p_poolcnt--; 264 if (p->p_poolflag & PBWAIT) { 265 mutex_enter(&pool_barrier_lock); 266 ASSERT(pool_barrier_count > 0); 267 pool_barrier_count--; 268 if (pool_barrier_count == 0) 269 cv_signal(&pool_barrier_cv); 270 mutex_exit(&pool_barrier_lock); 271 while (p->p_poolflag & PBWAIT) 272 cv_wait(&p->p_poolcv, &p->p_lock); 273 } 274 } 275 276 /* 277 * Enable pools facility. 278 */ 279 static int 280 pool_enable(void) 281 { 282 int ret; 283 284 ASSERT(pool_lock_held()); 285 ASSERT(pool_count == 1); 286 287 ret = pool_pset_enable(); 288 if (ret != 0) 289 return (ret); 290 (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP); 291 (void) nvlist_add_string(pool_sys_prop, "system.name", 292 "default"); 293 (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); 294 (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); 295 (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); 296 297 (void) nvlist_alloc(&pool_default->pool_props, 298 NV_UNIQUE_NAME, KM_SLEEP); 299 (void) nvlist_add_string(pool_default->pool_props, 300 "pool.name", "pool_default"); 301 (void) nvlist_add_string(pool_default->pool_props, "pool.comment", ""); 302 (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1); 303 (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1); 304 (void) nvlist_add_int64(pool_default->pool_props, 305 "pool.importance", 1); 306 (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id", 307 pool_default->pool_id); 308 309 pool_sys_mod = pool_pool_mod = gethrtime(); 310 311 return (ret); 312 } 313 314 /* 315 * Disable pools facility. 
316 */ 317 static int 318 pool_disable(void) 319 { 320 int ret; 321 322 ASSERT(pool_lock_held()); 323 324 if (pool_count > 1) /* must destroy all pools first */ 325 return (EBUSY); 326 327 ret = pool_pset_disable(); 328 if (ret != 0) 329 return (ret); 330 if (pool_sys_prop != NULL) { 331 nvlist_free(pool_sys_prop); 332 pool_sys_prop = NULL; 333 } 334 if (pool_default->pool_props != NULL) { 335 nvlist_free(pool_default->pool_props); 336 pool_default->pool_props = NULL; 337 } 338 return (0); 339 } 340 341 pool_t * 342 pool_lookup_pool_by_name(char *name) 343 { 344 pool_t *pool = pool_default; 345 char *p; 346 347 ASSERT(pool_lock_held()); 348 for (pool = list_head(&pool_list); pool; 349 pool = list_next(&pool_list, pool)) { 350 if (nvlist_lookup_string(pool->pool_props, 351 "pool.name", &p) == 0 && strcmp(name, p) == 0) 352 return (pool); 353 } 354 return (NULL); 355 } 356 357 pool_t * 358 pool_lookup_pool_by_id(poolid_t poolid) 359 { 360 pool_t *pool = pool_default; 361 362 ASSERT(pool_lock_held()); 363 for (pool = list_head(&pool_list); pool; 364 pool = list_next(&pool_list, pool)) { 365 if (pool->pool_id == poolid) 366 return (pool); 367 } 368 return (NULL); 369 } 370 371 /* 372 * Create new pool, associate it with default resource sets, and give 373 * it a temporary name. 
374 */ 375 static int 376 pool_pool_create(poolid_t *poolid) 377 { 378 pool_t *pool; 379 char pool_name[40]; 380 381 ASSERT(pool_lock_held()); 382 383 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 384 pool->pool_id = *poolid = id_alloc(pool_ids); 385 pool->pool_pset = pool_pset_default; 386 pool_pset_default->pset_npools++; 387 list_insert_tail(&pool_list, pool); 388 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP); 389 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id); 390 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0); 391 pool_pool_mod = gethrtime(); 392 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld", 393 pool_pool_mod); 394 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name); 395 pool_count++; 396 return (0); 397 } 398 399 struct destroy_zone_arg { 400 pool_t *old; 401 pool_t *new; 402 }; 403 404 /* 405 * Update pool pointers for zones that are currently bound to pool "old" 406 * to be bound to pool "new". 407 */ 408 static int 409 pool_destroy_zone_cb(zone_t *zone, void *arg) 410 { 411 struct destroy_zone_arg *dza = arg; 412 413 ASSERT(pool_lock_held()); 414 ASSERT(MUTEX_HELD(&cpu_lock)); 415 416 if (zone_pool_get(zone) == dza->old) 417 zone_pool_set(zone, dza->new); 418 return (0); 419 } 420 421 /* 422 * Destroy specified pool, and rebind all processes in it 423 * to the default pool. 
424 */ 425 static int 426 pool_pool_destroy(poolid_t poolid) 427 { 428 pool_t *pool; 429 int ret; 430 431 ASSERT(pool_lock_held()); 432 433 if (poolid == POOL_DEFAULT) 434 return (EINVAL); 435 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 436 return (ESRCH); 437 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL); 438 if (ret == 0) { 439 struct destroy_zone_arg dzarg; 440 441 dzarg.old = pool; 442 dzarg.new = pool_default; 443 mutex_enter(&cpu_lock); 444 ret = zone_walk(pool_destroy_zone_cb, &dzarg); 445 mutex_exit(&cpu_lock); 446 ASSERT(ret == 0); 447 ASSERT(pool->pool_ref == 0); 448 (void) nvlist_free(pool->pool_props); 449 id_free(pool_ids, pool->pool_id); 450 pool->pool_pset->pset_npools--; 451 list_remove(&pool_list, pool); 452 pool_count--; 453 pool_pool_mod = gethrtime(); 454 kmem_free(pool, sizeof (pool_t)); 455 } 456 return (ret); 457 } 458 459 /* 460 * Create new pool or resource set. 461 */ 462 int 463 pool_create(int class, int subclass, id_t *id) 464 { 465 int ret; 466 467 ASSERT(pool_lock_held()); 468 if (pool_state == POOL_DISABLED) 469 return (ENOTACTIVE); 470 switch (class) { 471 case PEC_POOL: 472 ret = pool_pool_create((poolid_t *)id); 473 break; 474 case PEC_RES_COMP: 475 switch (subclass) { 476 case PREC_PSET: 477 ret = pool_pset_create((psetid_t *)id); 478 break; 479 default: 480 ret = EINVAL; 481 } 482 break; 483 case PEC_RES_AGG: 484 ret = ENOTSUP; 485 break; 486 default: 487 ret = EINVAL; 488 } 489 return (ret); 490 } 491 492 /* 493 * Destroy an existing pool or resource set. 
494 */ 495 int 496 pool_destroy(int class, int subclass, id_t id) 497 { 498 int ret; 499 500 ASSERT(pool_lock_held()); 501 if (pool_state == POOL_DISABLED) 502 return (ENOTACTIVE); 503 switch (class) { 504 case PEC_POOL: 505 ret = pool_pool_destroy((poolid_t)id); 506 break; 507 case PEC_RES_COMP: 508 switch (subclass) { 509 case PREC_PSET: 510 ret = pool_pset_destroy((psetid_t)id); 511 break; 512 default: 513 ret = EINVAL; 514 } 515 break; 516 case PEC_RES_AGG: 517 ret = ENOTSUP; 518 break; 519 default: 520 ret = EINVAL; 521 } 522 return (ret); 523 } 524 525 /* 526 * Enable or disable pools. 527 */ 528 int 529 pool_status(int status) 530 { 531 int ret = 0; 532 533 ASSERT(pool_lock_held()); 534 535 if (pool_state == status) 536 return (0); 537 switch (status) { 538 case POOL_ENABLED: 539 ret = pool_enable(); 540 if (ret != 0) 541 return (ret); 542 pool_state = POOL_ENABLED; 543 break; 544 case POOL_DISABLED: 545 ret = pool_disable(); 546 if (ret != 0) 547 return (ret); 548 pool_state = POOL_DISABLED; 549 break; 550 default: 551 ret = EINVAL; 552 } 553 return (ret); 554 } 555 556 /* 557 * Associate pool with resource set. 558 */ 559 int 560 pool_assoc(poolid_t poolid, int idtype, id_t id) 561 { 562 int ret; 563 564 ASSERT(pool_lock_held()); 565 if (pool_state == POOL_DISABLED) 566 return (ENOTACTIVE); 567 switch (idtype) { 568 case PREC_PSET: 569 ret = pool_pset_assoc(poolid, (psetid_t)id); 570 break; 571 default: 572 ret = EINVAL; 573 } 574 if (ret == 0) 575 pool_pool_mod = gethrtime(); 576 return (ret); 577 } 578 579 /* 580 * Disassociate resource set from pool. 
581 */ 582 int 583 pool_dissoc(poolid_t poolid, int idtype) 584 { 585 int ret; 586 587 ASSERT(pool_lock_held()); 588 if (pool_state == POOL_DISABLED) 589 return (ENOTACTIVE); 590 switch (idtype) { 591 case PREC_PSET: 592 ret = pool_pset_assoc(poolid, PS_NONE); 593 break; 594 default: 595 ret = EINVAL; 596 } 597 if (ret == 0) 598 pool_pool_mod = gethrtime(); 599 return (ret); 600 } 601 602 /* 603 * Transfer specified quantity of resources between resource sets. 604 */ 605 /*ARGSUSED*/ 606 int 607 pool_transfer(int type, id_t src, id_t dst, uint64_t qty) 608 { 609 int ret = EINVAL; 610 return (ret); 611 } 612 613 /* 614 * Transfer resources specified by their IDs between resource sets. 615 */ 616 int 617 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids) 618 { 619 int ret; 620 621 ASSERT(pool_lock_held()); 622 if (pool_state == POOL_DISABLED) 623 return (ENOTACTIVE); 624 switch (type) { 625 case PREC_PSET: 626 ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst, 627 size, ids); 628 break; 629 default: 630 ret = EINVAL; 631 } 632 return (ret); 633 } 634 635 /* 636 * Bind processes to pools. 637 */ 638 int 639 pool_bind(poolid_t poolid, idtype_t idtype, id_t id) 640 { 641 pool_t *pool; 642 643 ASSERT(pool_lock_held()); 644 645 if (pool_state == POOL_DISABLED) 646 return (ENOTACTIVE); 647 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 648 return (ESRCH); 649 650 switch (idtype) { 651 case P_PID: 652 case P_TASKID: 653 case P_PROJID: 654 case P_ZONEID: 655 break; 656 default: 657 return (EINVAL); 658 } 659 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); 660 } 661 662 /* 663 * Query pool binding of the specifed process. 
664 */ 665 int 666 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid) 667 { 668 proc_t *p; 669 670 if (idtype != P_PID) 671 return (ENOTSUP); 672 if (id == P_MYID) 673 id = curproc->p_pid; 674 675 ASSERT(pool_lock_held()); 676 677 mutex_enter(&pidlock); 678 if ((p = prfind((pid_t)id)) == NULL) { 679 mutex_exit(&pidlock); 680 return (ESRCH); 681 } 682 mutex_enter(&p->p_lock); 683 /* 684 * In local zones, lie about pool bindings of processes from 685 * the global zone. 686 */ 687 if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) { 688 pool_t *pool; 689 690 pool = zone_pool_get(curproc->p_zone); 691 *poolid = pool->pool_id; 692 } else { 693 *poolid = p->p_pool->pool_id; 694 } 695 mutex_exit(&p->p_lock); 696 mutex_exit(&pidlock); 697 return (0); 698 } 699 700 static ea_object_t * 701 pool_system_pack(void) 702 { 703 ea_object_t *eo_system; 704 size_t bufsz = 0; 705 char *buf = NULL; 706 707 ASSERT(pool_lock_held()); 708 709 eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM); 710 (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t), 711 EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64); 712 if (INGLOBALZONE(curproc)) 713 (void) ea_attach_item(eo_system, &pool_pool_mod, 714 sizeof (hrtime_t), 715 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 716 else 717 (void) ea_attach_item(eo_system, 718 &curproc->p_zone->zone_pool_mod, 719 sizeof (hrtime_t), 720 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 721 (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t), 722 EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64); 723 (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t), 724 EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64); 725 (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0); 726 (void) ea_attach_item(eo_system, buf, bufsz, 727 EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW); 728 kmem_free(buf, bufsz); 729 return (eo_system); 730 } 731 732 /* 733 * Pack information about pools and attach it to specified exacct group. 
734 */ 735 static int 736 pool_pool_pack(ea_object_t *eo_system) 737 { 738 ea_object_t *eo_pool; 739 pool_t *pool; 740 size_t bufsz; 741 char *buf; 742 pool_t *myzonepool; 743 744 ASSERT(pool_lock_held()); 745 myzonepool = zone_pool_get(curproc->p_zone); 746 for (pool = list_head(&pool_list); pool; 747 pool = list_next(&pool_list, pool)) { 748 if (!INGLOBALZONE(curproc) && myzonepool != pool) 749 continue; 750 bufsz = 0; 751 buf = NULL; 752 eo_pool = ea_alloc_group(EXT_GROUP | 753 EXC_LOCAL | EXD_GROUP_POOL); 754 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t), 755 EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32); 756 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id, 757 sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32); 758 (void) nvlist_pack(pool->pool_props, &buf, &bufsz, 759 NV_ENCODE_NATIVE, 0); 760 (void) ea_attach_item(eo_pool, buf, bufsz, 761 EXC_LOCAL | EXD_POOL_PROP | EXT_RAW); 762 kmem_free(buf, bufsz); 763 (void) ea_attach_to_group(eo_system, eo_pool); 764 } 765 return (0); 766 } 767 768 /* 769 * Pack the whole pool configuration in the specified buffer. 770 */ 771 int 772 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize) 773 { 774 ea_object_t *eo_system; 775 size_t ksize; 776 int ret = 0; 777 778 ASSERT(pool_lock_held()); 779 780 eo_system = pool_system_pack(); /* 1. pack system */ 781 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 782 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 783 ksize = ea_pack_object(eo_system, NULL, 0); 784 if (kbuf == NULL || kbufsz == 0) 785 *asize = ksize; 786 else if (ksize > kbufsz) 787 ret = ENOMEM; 788 else 789 *asize = ea_pack_object(eo_system, kbuf, kbufsz); 790 ea_free_object(eo_system, EUP_ALLOC); 791 return (ret); 792 } 793 794 /* 795 * Start/end the commit transaction. If commit transaction is currently 796 * in progress, then all POOL_QUERY ioctls will return pools configuration 797 * at the beginning of transaction. 
798 */ 799 int 800 pool_commit(int state) 801 { 802 ea_object_t *eo_system; 803 int ret = 0; 804 805 ASSERT(pool_lock_held()); 806 807 if (pool_state == POOL_DISABLED) 808 return (ENOTACTIVE); 809 switch (state) { 810 case 1: 811 /* 812 * Beginning commit transation. 813 */ 814 if (pool_buf != NULL) /* transaction in progress */ 815 return (EBUSY); 816 eo_system = pool_system_pack(); /* 1. pack system */ 817 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 818 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 819 pool_bufsz = ea_pack_object(eo_system, NULL, 0); 820 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP); 821 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz); 822 ea_free_object(eo_system, EUP_ALLOC); 823 break; 824 case 0: 825 /* 826 * Finishing commit transaction. 827 */ 828 if (pool_buf != NULL) { 829 kmem_free(pool_buf, pool_bufsz); 830 pool_buf = NULL; 831 pool_bufsz = 0; 832 } 833 break; 834 default: 835 ret = EINVAL; 836 } 837 return (ret); 838 } 839 840 /* 841 * Check is the specified property is special 842 */ 843 static pool_property_t * 844 pool_property_find(char *name, pool_property_t *list) 845 { 846 pool_property_t *prop; 847 848 for (prop = list; prop->pp_name != NULL; prop++) 849 if (strcmp(prop->pp_name, name) == 0) 850 return (prop); 851 return (NULL); 852 } 853 854 static pool_property_t pool_prop_sys[] = { 855 { "system.name", DATA_TYPE_STRING, PP_RDWR }, 856 { "system.comment", DATA_TYPE_STRING, PP_RDWR }, 857 { "system.version", DATA_TYPE_UINT64, PP_READ }, 858 { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR }, 859 { "system.allocate-method", DATA_TYPE_STRING, 860 PP_RDWR | PP_OPTIONAL }, 861 { "system.poold.log-level", DATA_TYPE_STRING, 862 PP_RDWR | PP_OPTIONAL }, 863 { "system.poold.log-location", DATA_TYPE_STRING, 864 PP_RDWR | PP_OPTIONAL }, 865 { "system.poold.monitor-interval", DATA_TYPE_UINT64, 866 PP_RDWR | PP_OPTIONAL }, 867 { "system.poold.history-file", DATA_TYPE_STRING, 868 PP_RDWR | PP_OPTIONAL }, 
869 { "system.poold.objectives", DATA_TYPE_STRING, 870 PP_RDWR | PP_OPTIONAL }, 871 { NULL, 0, 0 } 872 }; 873 874 static pool_property_t pool_prop_pool[] = { 875 { "pool.sys_id", DATA_TYPE_UINT64, PP_READ }, 876 { "pool.name", DATA_TYPE_STRING, PP_RDWR }, 877 { "pool.default", DATA_TYPE_BYTE, PP_READ }, 878 { "pool.active", DATA_TYPE_BYTE, PP_RDWR }, 879 { "pool.importance", DATA_TYPE_INT64, PP_RDWR }, 880 { "pool.comment", DATA_TYPE_STRING, PP_RDWR }, 881 { "pool.scheduler", DATA_TYPE_STRING, 882 PP_RDWR | PP_OPTIONAL }, 883 { NULL, 0, 0 } 884 }; 885 886 /* 887 * Common routine to put new property on the specified list 888 */ 889 int 890 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props) 891 { 892 pool_property_t *prop; 893 894 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) { 895 /* 896 * No read-only properties or properties with bad types 897 */ 898 if (!(prop->pp_perm & PP_WRITE) || 899 prop->pp_type != nvpair_type(pair)) 900 return (EINVAL); 901 } 902 return (nvlist_add_nvpair(nvlist, pair)); 903 } 904 905 /* 906 * Common routine to remove property from the given list 907 */ 908 int 909 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props) 910 { 911 pool_property_t *prop; 912 913 if ((prop = pool_property_find(name, props)) != NULL) { 914 if (!(prop->pp_perm & PP_OPTIONAL)) 915 return (EINVAL); 916 } 917 return (nvlist_remove_all(nvlist, name)); 918 } 919 920 static int 921 pool_system_propput(nvpair_t *pair) 922 { 923 int ret; 924 925 ASSERT(pool_lock_held()); 926 ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys); 927 if (ret == 0) 928 pool_sys_mod = gethrtime(); 929 return (ret); 930 } 931 932 static int 933 pool_system_proprm(char *name) 934 { 935 int ret; 936 937 ASSERT(pool_lock_held()); 938 ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys); 939 if (ret == 0) 940 pool_sys_mod = gethrtime(); 941 return (ret); 942 } 943 944 static int 945 pool_pool_propput(poolid_t 
poolid, nvpair_t *pair) 946 { 947 pool_t *pool; 948 int ret; 949 950 ASSERT(pool_lock_held()); 951 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 952 return (ESRCH); 953 ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool); 954 if (ret == 0) 955 pool_pool_mod = gethrtime(); 956 return (ret); 957 } 958 959 static int 960 pool_pool_proprm(poolid_t poolid, char *name) 961 { 962 int ret; 963 pool_t *pool; 964 965 ASSERT(pool_lock_held()); 966 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 967 return (ESRCH); 968 ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool); 969 if (ret == 0) 970 pool_pool_mod = gethrtime(); 971 return (ret); 972 } 973 974 int 975 pool_propput(int class, int subclass, id_t id, nvpair_t *pair) 976 { 977 int ret; 978 979 ASSERT(pool_lock_held()); 980 if (pool_state == POOL_DISABLED) 981 return (ENOTACTIVE); 982 switch (class) { 983 case PEC_SYSTEM: 984 ret = pool_system_propput(pair); 985 break; 986 case PEC_POOL: 987 ret = pool_pool_propput((poolid_t)id, pair); 988 break; 989 case PEC_RES_COMP: 990 switch (subclass) { 991 case PREC_PSET: 992 ret = pool_pset_propput((psetid_t)id, pair); 993 break; 994 default: 995 ret = EINVAL; 996 } 997 break; 998 case PEC_RES_AGG: 999 ret = ENOTSUP; 1000 break; 1001 case PEC_COMP: 1002 switch (subclass) { 1003 case PCEC_CPU: 1004 ret = pool_cpu_propput((processorid_t)id, pair); 1005 break; 1006 default: 1007 ret = EINVAL; 1008 } 1009 break; 1010 default: 1011 ret = EINVAL; 1012 } 1013 return (ret); 1014 } 1015 1016 int 1017 pool_proprm(int class, int subclass, id_t id, char *name) 1018 { 1019 int ret; 1020 1021 ASSERT(pool_lock_held()); 1022 if (pool_state == POOL_DISABLED) 1023 return (ENOTACTIVE); 1024 switch (class) { 1025 case PEC_SYSTEM: 1026 ret = pool_system_proprm(name); 1027 break; 1028 case PEC_POOL: 1029 ret = pool_pool_proprm((poolid_t)id, name); 1030 break; 1031 case PEC_RES_COMP: 1032 switch (subclass) { 1033 case PREC_PSET: 1034 ret = 
pool_pset_proprm((psetid_t)id, name); 1035 break; 1036 default: 1037 ret = EINVAL; 1038 } 1039 break; 1040 case PEC_RES_AGG: 1041 ret = ENOTSUP; 1042 break; 1043 case PEC_COMP: 1044 switch (subclass) { 1045 case PCEC_CPU: 1046 ret = pool_cpu_proprm((processorid_t)id, name); 1047 break; 1048 default: 1049 ret = EINVAL; 1050 } 1051 break; 1052 default: 1053 ret = EINVAL; 1054 } 1055 return (ret); 1056 } 1057 1058 int 1059 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp) 1060 { 1061 int ret; 1062 nvlist_t *nvl; 1063 1064 ASSERT(pool_lock_held()); 1065 if (pool_state == POOL_DISABLED) 1066 return (ENOTACTIVE); 1067 1068 (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); 1069 1070 switch (class) { 1071 case PEC_SYSTEM: 1072 case PEC_POOL: 1073 ret = EINVAL; 1074 break; 1075 case PEC_RES_COMP: 1076 switch (subclass) { 1077 case PREC_PSET: 1078 ret = pool_pset_propget((psetid_t)id, name, nvl); 1079 break; 1080 default: 1081 ret = EINVAL; 1082 } 1083 break; 1084 case PEC_RES_AGG: 1085 ret = ENOTSUP; 1086 break; 1087 case PEC_COMP: 1088 switch (subclass) { 1089 case PCEC_CPU: 1090 ret = pool_cpu_propget((processorid_t)id, name, nvl); 1091 break; 1092 default: 1093 ret = EINVAL; 1094 } 1095 break; 1096 default: 1097 ret = EINVAL; 1098 } 1099 if (ret == 0) 1100 *nvlp = nvl; 1101 else 1102 nvlist_free(nvl); 1103 return (ret); 1104 } 1105 1106 /* 1107 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs 1108 * in case of failure in pool_do_bind(). 
1109 */ 1110 static void 1111 pool_bind_wake(proc_t *p) 1112 { 1113 ASSERT(pool_lock_held()); 1114 1115 mutex_enter(&p->p_lock); 1116 ASSERT(p->p_poolflag & PBWAIT); 1117 if (p->p_poolcnt > 0) { 1118 mutex_enter(&pool_barrier_lock); 1119 pool_barrier_count -= p->p_poolcnt; 1120 mutex_exit(&pool_barrier_lock); 1121 } 1122 p->p_poolflag &= ~PBWAIT; 1123 cv_signal(&p->p_poolcv); 1124 mutex_exit(&p->p_lock); 1125 } 1126 1127 static void 1128 pool_bind_wakeall(proc_t **procs) 1129 { 1130 proc_t *p, **pp; 1131 1132 ASSERT(pool_lock_held()); 1133 for (pp = procs; (p = *pp) != NULL; pp++) 1134 pool_bind_wake(p); 1135 } 1136 1137 /* 1138 * Return the scheduling class for this pool, or 1139 * POOL_CLASS_UNSET if not set 1140 * POOL_CLASS_INVAL if set to an invalid class ID. 1141 */ 1142 id_t 1143 pool_get_class(pool_t *pool) 1144 { 1145 char *name; 1146 id_t cid; 1147 1148 ASSERT(pool_lock_held()); 1149 1150 if (nvlist_lookup_string(pool->pool_props, "pool.scheduler", 1151 &name) == 0) { 1152 if (getcidbyname(name, &cid) == 0) 1153 return (cid); 1154 else 1155 return (POOL_CLASS_INVAL); 1156 } 1157 return (POOL_CLASS_UNSET); 1158 } 1159 1160 /* 1161 * Move process to the new scheduling class. 1162 */ 1163 static void 1164 pool_change_class(proc_t *p, id_t cid) 1165 { 1166 kthread_t *t; 1167 void *cldata; 1168 id_t oldcid; 1169 void **bufs; 1170 void **buf; 1171 int nlwp; 1172 int ret; 1173 int i; 1174 1175 /* 1176 * Do not move kernel processes (such as zsched). 1177 */ 1178 if (p->p_flag & SSYS) 1179 return; 1180 /* 1181 * This process is in the pool barrier, so it can't possibly be 1182 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1 1183 * (for possible agent LWP which doesn't use pool barrier) as 1184 * our upper bound. 1185 */ 1186 nlwp = p->p_lwpcnt + p->p_zombcnt + 1; 1187 1188 /* 1189 * Pre-allocate scheduling class specific buffers before 1190 * grabbing p_lock. 
1191 */ 1192 bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP); 1193 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1194 ret = CL_ALLOC(buf, cid, KM_SLEEP); 1195 ASSERT(ret == 0); 1196 } 1197 1198 /* 1199 * Move threads one by one to the new scheduling class. 1200 * This never fails because we have all the right 1201 * privileges here. 1202 */ 1203 mutex_enter(&p->p_lock); 1204 ASSERT(p->p_poolflag & PBWAIT); 1205 buf = bufs; 1206 t = p->p_tlist; 1207 ASSERT(t != NULL); 1208 do { 1209 if (t->t_cid != cid) { 1210 oldcid = t->t_cid; 1211 cldata = t->t_cldata; 1212 ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf); 1213 ASSERT(ret == 0); 1214 CL_EXITCLASS(oldcid, cldata); 1215 *buf++ = NULL; 1216 } 1217 } while ((t = t->t_forw) != p->p_tlist); 1218 mutex_exit(&p->p_lock); 1219 /* 1220 * Free unused scheduling class specific buffers. 1221 */ 1222 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1223 if (*buf != NULL) { 1224 CL_FREE(cid, *buf); 1225 *buf = NULL; 1226 } 1227 } 1228 kmem_free(bufs, nlwp * sizeof (void *)); 1229 } 1230 1231 /* 1232 * The meat of the bind operation. The steps in pool_do_bind are: 1233 * 1234 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all 1235 * such processes to an array. For any interesting process that has 1236 * threads inside the pool barrier set, increment a counter by the 1237 * count of such threads. Once PBWAIT is set on a process, that process 1238 * will not disappear. 1239 * 1240 * 2) Wait for the counter from step 2 to drop to zero. Any process which 1241 * calls pool_barrier_exit() and notices that PBWAIT has been set on it 1242 * will decrement that counter before going to sleep, and the process 1243 * calling pool_barrier_exit() which does the final decrement will wake us. 1244 * 1245 * 3) For each interesting process, perform a calculation on it to see if 1246 * the bind will actually succeed. 
This uses the following three 1247 * resource-set-specific functions: 1248 * 1249 * - int set_bind_start(procs, pool) 1250 * 1251 * Determine whether the given array of processes can be bound to the 1252 * resource set associated with the given pool. If it can, take and hold 1253 * any locks necessary to ensure that the operation will succeed, and 1254 * make any necessary reservations in the target resource set. If it 1255 * can't, return failure with no reservations made and no new locks held. 1256 * 1257 * - void set_bind_abort(procs, pool) 1258 * 1259 * set_bind_start() has completed successfully, but another resource set's 1260 * set_bind_start() has failed, and we haven't begun the bind yet. Undo 1261 * any reservations made and drop any locks acquired by our 1262 * set_bind_start(). 1263 * 1264 * - void set_bind_finish(void) 1265 * 1266 * The bind has completed successfully. The processes have been released, 1267 * and the reservation acquired in set_bind_start() has been depleted as 1268 * the processes have finished their bindings. Drop any locks acquired by 1269 * set_bind_start(). 1270 * 1271 * 4) If we've decided that we can proceed with the bind, iterate through 1272 * the list of interesting processes, grab the necessary locks (which 1273 * may differ per resource set), perform the bind, and ASSERT that it 1274 * succeeds. Once a process has been rebound, it can be awakened. 1275 * 1276 * The operations from step 4 must be kept in sync with anything which might 1277 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and 1278 * are thus located in the same source files as the associated bind operations. 
1279 */ 1280 int 1281 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) 1282 { 1283 extern uint_t nproc; 1284 klwp_t *lwp = ttolwp(curthread); 1285 proc_t **pp, **procs; 1286 proc_t *prstart; 1287 int procs_count = 0; 1288 kproject_t *kpj; 1289 procset_t set; 1290 zone_t *zone; 1291 int procs_size; 1292 int rv = 0; 1293 proc_t *p; 1294 id_t cid = -1; 1295 1296 ASSERT(pool_lock_held()); 1297 1298 if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL) 1299 return (EINVAL); 1300 1301 if (idtype == P_ZONEID) { 1302 zone = zone_find_by_id(id); 1303 if (zone == NULL) 1304 return (ESRCH); 1305 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 1306 zone_rele(zone); 1307 return (EBUSY); 1308 } 1309 } 1310 1311 if (idtype == P_PROJID) { 1312 kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND); 1313 if (kpj == NULL) 1314 return (ESRCH); 1315 mutex_enter(&kpj->kpj_poolbind); 1316 } 1317 1318 if (idtype == P_PID) { 1319 /* 1320 * Fast-path for a single process case. 1321 */ 1322 procs_size = 2; /* procs is NULL-terminated */ 1323 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); 1324 mutex_enter(&pidlock); 1325 } else { 1326 /* 1327 * We will need enough slots for proc_t pointers for as many as 1328 * twice the number of currently running processes (assuming 1329 * that each one could be in fork() creating a new child). 1330 */ 1331 for (;;) { 1332 procs_size = nproc * 2; 1333 procs = kmem_zalloc(procs_size * sizeof (proc_t *), 1334 KM_SLEEP); 1335 mutex_enter(&pidlock); 1336 1337 if (nproc * 2 <= procs_size) 1338 break; 1339 /* 1340 * If nproc has changed, try again. 1341 */ 1342 mutex_exit(&pidlock); 1343 kmem_free(procs, procs_size * sizeof (proc_t *)); 1344 } 1345 } 1346 1347 if (id == P_MYID) 1348 id = getmyid(idtype); 1349 setprocset(&set, POP_AND, idtype, id, P_ALL, 0); 1350 1351 /* 1352 * Do a first scan, and select target processes. 
1353 */ 1354 if (idtype == P_PID) 1355 prstart = prfind(id); 1356 else 1357 prstart = practive; 1358 for (p = prstart, pp = procs; p != NULL; p = p->p_next) { 1359 mutex_enter(&p->p_lock); 1360 /* 1361 * Skip processes that don't match our (id, idtype) set or 1362 * on the way of becoming zombies. Skip kernel processes 1363 * from the global zone. 1364 */ 1365 if (procinset(p, &set) == 0 || 1366 p->p_poolflag & PEXITED || 1367 ((p->p_flag & SSYS) && INGLOBALZONE(p))) { 1368 mutex_exit(&p->p_lock); 1369 continue; 1370 } 1371 if (!INGLOBALZONE(p)) { 1372 switch (idtype) { 1373 case P_PID: 1374 case P_TASKID: 1375 /* 1376 * Can't bind processes or tasks 1377 * in local zones to pools. 1378 */ 1379 mutex_exit(&p->p_lock); 1380 mutex_exit(&pidlock); 1381 pool_bind_wakeall(procs); 1382 rv = EINVAL; 1383 goto out; 1384 case P_PROJID: 1385 /* 1386 * Only projects in the global 1387 * zone can be rebound. 1388 */ 1389 mutex_exit(&p->p_lock); 1390 continue; 1391 case P_POOLID: 1392 /* 1393 * When rebinding pools, processes can be 1394 * in different zones. 1395 */ 1396 break; 1397 } 1398 } 1399 1400 p->p_poolflag |= PBWAIT; 1401 /* 1402 * If some threads in this process are inside the pool 1403 * barrier, add them to pool_barrier_count, as we have 1404 * to wait for all of them to exit the barrier. 1405 */ 1406 if (p->p_poolcnt > 0) { 1407 mutex_enter(&pool_barrier_lock); 1408 pool_barrier_count += p->p_poolcnt; 1409 mutex_exit(&pool_barrier_lock); 1410 } 1411 ASSERT(pp < &procs[procs_size]); 1412 *pp++ = p; 1413 procs_count++; 1414 mutex_exit(&p->p_lock); 1415 1416 /* 1417 * We just found our process, so if we're only rebinding a 1418 * single process then get out of this loop. 1419 */ 1420 if (idtype == P_PID) 1421 break; 1422 } 1423 *pp = NULL; /* cap off the end of the array */ 1424 mutex_exit(&pidlock); 1425 1426 /* 1427 * Wait for relevant processes to stop before they try to enter the 1428 * barrier or at the exit from the barrier. 
Make sure that we do 1429 * not get stopped here while we're holding pool_lock. If we were 1430 * requested to stop, or got a signal then return EAGAIN to let the 1431 * library know that it needs to retry. 1432 */ 1433 mutex_enter(&pool_barrier_lock); 1434 lwp->lwp_nostop++; 1435 while (pool_barrier_count > 0) { 1436 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock); 1437 if (pool_barrier_count > 0) { 1438 /* 1439 * We either got a signal or were requested to 1440 * stop by /proc. Bail out with EAGAIN. If we were 1441 * requested to stop, we'll stop in post_syscall() 1442 * on our way back to userland. 1443 */ 1444 mutex_exit(&pool_barrier_lock); 1445 pool_bind_wakeall(procs); 1446 lwp->lwp_nostop--; 1447 rv = EAGAIN; 1448 goto out; 1449 } 1450 } 1451 lwp->lwp_nostop--; 1452 mutex_exit(&pool_barrier_lock); 1453 1454 if (idtype == P_PID) 1455 goto skip; 1456 1457 /* 1458 * Do another run, and drop processes that were inside the barrier 1459 * in exit(), but when they have dropped to pool_barrier_exit 1460 * they have become of no interest to us. Pick up child processes that 1461 * were created by fork() but didn't exist during our first scan. 1462 * Their parents are now stopped at pool_barrier_exit in cfork(). 1463 */ 1464 mutex_enter(&pidlock); 1465 for (pp = procs; (p = *pp) != NULL; pp++) { 1466 if (p->p_poolflag & PEXITED) { 1467 ASSERT(p->p_lwpcnt == 0); 1468 pool_bind_wake(p); 1469 /* flip w/last non-NULL slot */ 1470 *pp = procs[procs_count - 1]; 1471 procs[procs_count - 1] = NULL; 1472 procs_count--; 1473 pp--; /* try this slot again */ 1474 continue; 1475 } 1476 /* 1477 * Look at the child and check if it should be rebound also. 1478 * We're holding pidlock, so it is safe to reference p_child. 1479 */ 1480 if ((p = p->p_child) == NULL) 1481 continue; 1482 1483 mutex_enter(&p->p_lock); 1484 /* 1485 * Skip processes in local zones if we're not binding 1486 * zones to pools (P_ZONEID). Skip kernel processes also. 
1487 */ 1488 if ((!INGLOBALZONE(p) && idtype != P_ZONEID) || 1489 p->p_flag & SSYS) { 1490 mutex_exit(&p->p_lock); 1491 continue; 1492 } 1493 1494 /* 1495 * If the child process has been already created by fork(), has 1496 * not exited, and has not been added to the list already, 1497 * then add it now. We will hit this process again (since we 1498 * stick it at the end of the procs list) but it will ignored 1499 * because it will have the PBWAIT flag set. 1500 */ 1501 if (procinset(p, &set) && 1502 !(p->p_poolflag & PEXITED) && 1503 !(p->p_poolflag & PBWAIT)) { 1504 ASSERT(p->p_child == NULL); /* no child of a child */ 1505 procs[procs_count] = p; 1506 procs[procs_count + 1] = NULL; 1507 procs_count++; 1508 p->p_poolflag |= PBWAIT; 1509 } 1510 mutex_exit(&p->p_lock); 1511 } 1512 mutex_exit(&pidlock); 1513 skip: 1514 /* 1515 * If there's no processes to rebind then return ESRCH, unless 1516 * we're associating a pool with new resource set, destroying it, 1517 * or binding a zone to a pool. 1518 */ 1519 if (procs_count == 0) { 1520 if (idtype == P_POOLID || idtype == P_ZONEID) 1521 rv = 0; 1522 else 1523 rv = ESRCH; 1524 goto out; 1525 } 1526 1527 #ifdef DEBUG 1528 /* 1529 * All processes in the array should have PBWAIT set, and none should 1530 * be in the critical section. Even though p_poolflag is protected by 1531 * the p_lock, these assertions should be stable across the dropping of 1532 * p_lock. 1533 */ 1534 for (pp = procs; (p = *pp) != NULL; pp++) { 1535 ASSERT(p->p_poolflag & PBWAIT); 1536 ASSERT(p->p_poolcnt == 0); 1537 ASSERT(procinset(p, &set)); 1538 } 1539 #endif 1540 1541 /* 1542 * Do the check if processor set rebinding is going to succeed or not. 1543 */ 1544 if ((flags & POOL_BIND_PSET) && 1545 (rv = pset_bind_start(procs, pool)) != 0) { 1546 pool_bind_wakeall(procs); 1547 goto out; 1548 } 1549 1550 /* 1551 * At this point, all bind operations should succeed. 
1552 */ 1553 for (pp = procs; (p = *pp) != NULL; pp++) { 1554 if (flags & POOL_BIND_PSET) { 1555 psetid_t psetid = pool->pool_pset->pset_id; 1556 void *zonebuf; 1557 void *projbuf; 1558 1559 /* 1560 * Pre-allocate one buffer for FSS (per-project 1561 * buffer for a new pset) in case if this is the 1562 * first thread from its current project getting 1563 * bound to this processor set. 1564 */ 1565 projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ); 1566 zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE); 1567 1568 mutex_enter(&pidlock); 1569 mutex_enter(&p->p_lock); 1570 pool_pset_bind(p, psetid, projbuf, zonebuf); 1571 mutex_exit(&p->p_lock); 1572 mutex_exit(&pidlock); 1573 /* 1574 * Free buffers pre-allocated above if it 1575 * wasn't actually used. 1576 */ 1577 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 1578 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 1579 } 1580 /* 1581 * Now let's change the scheduling class of this 1582 * process if our target pool has it defined. 1583 */ 1584 if (cid != POOL_CLASS_UNSET) 1585 pool_change_class(p, cid); 1586 1587 /* 1588 * It is safe to reference p_pool here without holding 1589 * p_lock because it cannot change underneath of us. 1590 * We're holding pool_lock here, so nobody else can be 1591 * moving this process between pools. If process "p" 1592 * would be exiting, we're guaranteed that it would be blocked 1593 * at pool_barrier_enter() in exit(). Otherwise, it would've 1594 * been skipped by one of our scans of the practive list 1595 * as a process with PEXITED flag set. 1596 */ 1597 if (p->p_pool != pool) { 1598 ASSERT(p->p_pool->pool_ref > 0); 1599 atomic_add_32(&p->p_pool->pool_ref, -1); 1600 p->p_pool = pool; 1601 atomic_add_32(&p->p_pool->pool_ref, 1); 1602 } 1603 /* 1604 * Okay, we've tortured this guy enough. 1605 * Let this poor process go now. 
1606 */ 1607 pool_bind_wake(p); 1608 } 1609 if (flags & POOL_BIND_PSET) 1610 pset_bind_finish(); 1611 1612 out: switch (idtype) { 1613 case P_PROJID: 1614 ASSERT(kpj != NULL); 1615 mutex_exit(&kpj->kpj_poolbind); 1616 project_rele(kpj); 1617 break; 1618 case P_ZONEID: 1619 if (rv == 0) { 1620 mutex_enter(&cpu_lock); 1621 zone_pool_set(zone, pool); 1622 mutex_exit(&cpu_lock); 1623 } 1624 zone->zone_pool_mod = gethrtime(); 1625 zone_rele(zone); 1626 break; 1627 } 1628 1629 kmem_free(procs, procs_size * sizeof (proc_t *)); 1630 ASSERT(pool_barrier_count == 0); 1631 return (rv); 1632 } 1633