1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/pool.h> 29 #include <sys/pool_impl.h> 30 #include <sys/pool_pset.h> 31 #include <sys/id_space.h> 32 #include <sys/mutex.h> 33 #include <sys/nvpair.h> 34 #include <sys/cpuvar.h> 35 #include <sys/errno.h> 36 #include <sys/cmn_err.h> 37 #include <sys/systm.h> 38 #include <sys/proc.h> 39 #include <sys/fss.h> 40 #include <sys/class.h> 41 #include <sys/exacct.h> 42 #include <sys/utsname.h> 43 #include <sys/procset.h> 44 #include <sys/atomic.h> 45 #include <sys/zone.h> 46 #include <sys/policy.h> 47 48 /* 49 * RESOURCE POOLS 50 * 51 * The resource pools facility brings together process-bindable resource into 52 * a common abstraction called a pool. Processor sets and other entities can 53 * be configured, grouped, and labelled such that workload components can be 54 * associated with a subset of a system's total resources. 55 * 56 * When disabled, the pools facility is "invisible". 
 * All processes belong
 * to the same pool (pool_default), and processor sets can be managed through
 * the old pset() system call.  When enabled, processor sets can only be
 * managed via the pools facility.  New pools can be created and associated
 * with processor sets.  Processes can be bound to pools which have non-empty
 * resource sets.
 *
 * Locking: pool_lock() protects global pools state and must be called
 * before modifying the configuration, or when taking a snapshot of the
 * configuration.  If pool_lock_intr() is used, the operation may be
 * interrupted by a signal or a request.
 *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit().  This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
 *
 * Lock ordering with respect to other locks is as follows:
 *
 *	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
 *
 * Most static and global variables defined in this file are protected
 * by calling pool_lock().
 *
 * The operation that binds tasks and projects to pools is atomic.  That is,
 * either all processes in a given task or a project will be bound to a
 * new pool, or (in case of an error) they will be all left bound to the
 * old pool.  Processes in a given task or a given project can only be bound
 * to different pools if they were rebound individually one by one as single
 * processes.  Threads or LWPs of the same process do not have pool bindings,
 * and are bound to the same resource sets associated with the resource pool
 * of that process.
 *
 * The following picture shows one possible pool configuration with three
 * pools and three processor sets.  Note that processor set "foo" is not
 * associated with any pools and therefore cannot have any processes
 * bound to it.  Two pools (default and foo) are associated with the
 * same processor set (default).  Also, note that processes in Task 2
 * are bound to different pools.
 *
 *
 *                                                           Processor Sets
 *                                                             +---------+
 *                    +--------------+========================>| default |
 *                   a|              |                         +---------+
 *                   s|              |                             ||
 *                   s|              |                         +---------+
 *                   o|              |                         |   foo   |
 *                   c|              |                         +---------+
 *                   i|              |                             ||
 *                   a|              |                         +---------+
 *                   t|              |                 +------>|   bar   |
 *                   e|              |                 |       +---------+
 *                   d|              |                 |
 *                    |              |                 |
 *               +---------+      +---------+      +---------+
 *     Pools     | default |======|   foo   |======|   bar   |
 *               +---------+      +---------+      +---------+
 *                   @  @            @              @ @   @
 *                  b|  |            |              | |   |
 *                  o|  |            |              | |   |
 *                  u|  +-----+      |      +-------+ |   +---+
 *                  n|        |      |      |         |       |
 *              ....d|........|......|......|.........|.......|....
 *              :    |   ::   |      |      |    ::   |       |   :
 *              :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 *  Processes   :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
 *              :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 *              :........::......................::...............:
 *                Task 1            Task 2             Task N
 *                   |                 |                  |
 *                   |                 |                  |
 *                   |  +-----------+  |            +-----------+
 *                   +--| Project 1 |--+            | Project N |
 *                      +-----------+               +-----------+
 *
 * This is just an illustration of relationships between processes, tasks,
 * projects, pools, and processor sets.  New types of resource sets will be
 * added in the future.
138 */ 139 140 pool_t *pool_default; /* default pool which always exists */ 141 int pool_count; /* number of pools created on this system */ 142 int pool_state; /* pools state -- enabled/disabled */ 143 void *pool_buf; /* pre-commit snapshot of the pools state */ 144 size_t pool_bufsz; /* size of pool_buf */ 145 static hrtime_t pool_pool_mod; /* last modification time for pools */ 146 static hrtime_t pool_sys_mod; /* last modification time for system */ 147 static nvlist_t *pool_sys_prop; /* system properties */ 148 static id_space_t *pool_ids; /* pool ID space */ 149 static list_t pool_list; /* doubly-linked list of pools */ 150 static kmutex_t pool_mutex; /* protects pool_busy_* */ 151 static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */ 152 static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */ 153 static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */ 154 static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */ 155 static int pool_barrier_count; /* synch. with pool_barrier_* */ 156 157 /* 158 * Boot-time pool initialization. 159 */ 160 void 161 pool_init(void) 162 { 163 pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID); 164 165 /* 166 * Initialize default pool. 167 */ 168 pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 169 pool_default->pool_id = POOL_DEFAULT; 170 list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link)); 171 list_insert_head(&pool_list, pool_default); 172 173 /* 174 * Initialize plugins for resource sets. 175 */ 176 pool_pset_init(); 177 pool_count = 1; 178 p0.p_pool = pool_default; 179 global_zone->zone_pool = pool_default; 180 pool_default->pool_ref = 1; 181 } 182 183 /* 184 * Synchronization routines. 185 * 186 * pool_lock is only called from syscall-level routines (processor_bind(), 187 * pset_*(), and /dev/pool ioctls). 
The pool "lock" may be held for long 188 * periods of time, including across sleeping operations, so we allow its 189 * acquisition to be interruptible. 190 * 191 * The current thread that owns the "lock" is stored in the variable 192 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging. 193 */ 194 void 195 pool_lock(void) 196 { 197 mutex_enter(&pool_mutex); 198 while (pool_busy_thread != NULL) 199 cv_wait(&pool_busy_cv, &pool_mutex); 200 pool_busy_thread = curthread; 201 mutex_exit(&pool_mutex); 202 } 203 204 int 205 pool_lock_intr(void) 206 { 207 mutex_enter(&pool_mutex); 208 while (pool_busy_thread != NULL) { 209 if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) { 210 cv_signal(&pool_busy_cv); 211 mutex_exit(&pool_mutex); 212 return (1); 213 } 214 } 215 pool_busy_thread = curthread; 216 mutex_exit(&pool_mutex); 217 return (0); 218 } 219 220 int 221 pool_lock_held(void) 222 { 223 return (pool_busy_thread == curthread); 224 } 225 226 void 227 pool_unlock(void) 228 { 229 mutex_enter(&pool_mutex); 230 pool_busy_thread = NULL; 231 cv_signal(&pool_busy_cv); 232 mutex_exit(&pool_mutex); 233 } 234 235 /* 236 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize 237 * with pool_do_bind(). 238 * 239 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all 240 * operations which modify pool or pset associations. They can be called 241 * while the process is multi-threaded. In the common case, when current 242 * process is not being rebound (PBWAIT flag is not set), these functions 243 * will be just incrementing and decrementing reference counts. 
244 */ 245 void 246 pool_barrier_enter(void) 247 { 248 proc_t *p = curproc; 249 250 ASSERT(MUTEX_HELD(&p->p_lock)); 251 while (p->p_poolflag & PBWAIT) 252 cv_wait(&p->p_poolcv, &p->p_lock); 253 p->p_poolcnt++; 254 } 255 256 void 257 pool_barrier_exit(void) 258 { 259 proc_t *p = curproc; 260 261 ASSERT(MUTEX_HELD(&p->p_lock)); 262 ASSERT(p->p_poolcnt > 0); 263 p->p_poolcnt--; 264 if (p->p_poolflag & PBWAIT) { 265 mutex_enter(&pool_barrier_lock); 266 ASSERT(pool_barrier_count > 0); 267 pool_barrier_count--; 268 if (pool_barrier_count == 0) 269 cv_signal(&pool_barrier_cv); 270 mutex_exit(&pool_barrier_lock); 271 while (p->p_poolflag & PBWAIT) 272 cv_wait(&p->p_poolcv, &p->p_lock); 273 } 274 } 275 276 /* 277 * Enable pools facility. 278 */ 279 static int 280 pool_enable(void) 281 { 282 int ret; 283 284 ASSERT(pool_lock_held()); 285 ASSERT(pool_count == 1); 286 287 ret = pool_pset_enable(); 288 if (ret != 0) 289 return (ret); 290 (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP); 291 (void) nvlist_add_string(pool_sys_prop, "system.name", 292 "default"); 293 (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); 294 (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); 295 (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); 296 (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", 297 "wt-load"); 298 299 (void) nvlist_alloc(&pool_default->pool_props, 300 NV_UNIQUE_NAME, KM_SLEEP); 301 (void) nvlist_add_string(pool_default->pool_props, 302 "pool.name", "pool_default"); 303 (void) nvlist_add_string(pool_default->pool_props, "pool.comment", ""); 304 (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1); 305 (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1); 306 (void) nvlist_add_int64(pool_default->pool_props, 307 "pool.importance", 1); 308 (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id", 309 pool_default->pool_id); 310 311 pool_sys_mod = pool_pool_mod = gethrtime(); 312 313 
return (ret); 314 } 315 316 /* 317 * Disable pools facility. 318 */ 319 static int 320 pool_disable(void) 321 { 322 int ret; 323 324 ASSERT(pool_lock_held()); 325 326 if (pool_count > 1) /* must destroy all pools first */ 327 return (EBUSY); 328 329 ret = pool_pset_disable(); 330 if (ret != 0) 331 return (ret); 332 if (pool_sys_prop != NULL) { 333 nvlist_free(pool_sys_prop); 334 pool_sys_prop = NULL; 335 } 336 if (pool_default->pool_props != NULL) { 337 nvlist_free(pool_default->pool_props); 338 pool_default->pool_props = NULL; 339 } 340 return (0); 341 } 342 343 pool_t * 344 pool_lookup_pool_by_name(char *name) 345 { 346 pool_t *pool = pool_default; 347 char *p; 348 349 ASSERT(pool_lock_held()); 350 for (pool = list_head(&pool_list); pool; 351 pool = list_next(&pool_list, pool)) { 352 if (nvlist_lookup_string(pool->pool_props, 353 "pool.name", &p) == 0 && strcmp(name, p) == 0) 354 return (pool); 355 } 356 return (NULL); 357 } 358 359 pool_t * 360 pool_lookup_pool_by_id(poolid_t poolid) 361 { 362 pool_t *pool = pool_default; 363 364 ASSERT(pool_lock_held()); 365 for (pool = list_head(&pool_list); pool; 366 pool = list_next(&pool_list, pool)) { 367 if (pool->pool_id == poolid) 368 return (pool); 369 } 370 return (NULL); 371 } 372 373 /* 374 * Create new pool, associate it with default resource sets, and give 375 * it a temporary name. 
376 */ 377 static int 378 pool_pool_create(poolid_t *poolid) 379 { 380 pool_t *pool; 381 char pool_name[40]; 382 383 ASSERT(pool_lock_held()); 384 385 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 386 pool->pool_id = *poolid = id_alloc(pool_ids); 387 pool->pool_pset = pool_pset_default; 388 pool_pset_default->pset_npools++; 389 list_insert_tail(&pool_list, pool); 390 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP); 391 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id); 392 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0); 393 pool_pool_mod = gethrtime(); 394 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld", 395 pool_pool_mod); 396 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name); 397 pool_count++; 398 return (0); 399 } 400 401 struct destroy_zone_arg { 402 pool_t *old; 403 pool_t *new; 404 }; 405 406 /* 407 * Update pool pointers for zones that are currently bound to pool "old" 408 * to be bound to pool "new". 409 */ 410 static int 411 pool_destroy_zone_cb(zone_t *zone, void *arg) 412 { 413 struct destroy_zone_arg *dza = arg; 414 415 ASSERT(pool_lock_held()); 416 ASSERT(MUTEX_HELD(&cpu_lock)); 417 418 if (zone_pool_get(zone) == dza->old) 419 zone_pool_set(zone, dza->new); 420 return (0); 421 } 422 423 /* 424 * Destroy specified pool, and rebind all processes in it 425 * to the default pool. 
426 */ 427 static int 428 pool_pool_destroy(poolid_t poolid) 429 { 430 pool_t *pool; 431 int ret; 432 433 ASSERT(pool_lock_held()); 434 435 if (poolid == POOL_DEFAULT) 436 return (EINVAL); 437 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 438 return (ESRCH); 439 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL); 440 if (ret == 0) { 441 struct destroy_zone_arg dzarg; 442 443 dzarg.old = pool; 444 dzarg.new = pool_default; 445 mutex_enter(&cpu_lock); 446 ret = zone_walk(pool_destroy_zone_cb, &dzarg); 447 mutex_exit(&cpu_lock); 448 ASSERT(ret == 0); 449 ASSERT(pool->pool_ref == 0); 450 (void) nvlist_free(pool->pool_props); 451 id_free(pool_ids, pool->pool_id); 452 pool->pool_pset->pset_npools--; 453 list_remove(&pool_list, pool); 454 pool_count--; 455 pool_pool_mod = gethrtime(); 456 kmem_free(pool, sizeof (pool_t)); 457 } 458 return (ret); 459 } 460 461 /* 462 * Create new pool or resource set. 463 */ 464 int 465 pool_create(int class, int subclass, id_t *id) 466 { 467 int ret; 468 469 ASSERT(pool_lock_held()); 470 if (pool_state == POOL_DISABLED) 471 return (ENOTACTIVE); 472 switch (class) { 473 case PEC_POOL: 474 ret = pool_pool_create((poolid_t *)id); 475 break; 476 case PEC_RES_COMP: 477 switch (subclass) { 478 case PREC_PSET: 479 ret = pool_pset_create((psetid_t *)id); 480 break; 481 default: 482 ret = EINVAL; 483 } 484 break; 485 case PEC_RES_AGG: 486 ret = ENOTSUP; 487 break; 488 default: 489 ret = EINVAL; 490 } 491 return (ret); 492 } 493 494 /* 495 * Destroy an existing pool or resource set. 
496 */ 497 int 498 pool_destroy(int class, int subclass, id_t id) 499 { 500 int ret; 501 502 ASSERT(pool_lock_held()); 503 if (pool_state == POOL_DISABLED) 504 return (ENOTACTIVE); 505 switch (class) { 506 case PEC_POOL: 507 ret = pool_pool_destroy((poolid_t)id); 508 break; 509 case PEC_RES_COMP: 510 switch (subclass) { 511 case PREC_PSET: 512 ret = pool_pset_destroy((psetid_t)id); 513 break; 514 default: 515 ret = EINVAL; 516 } 517 break; 518 case PEC_RES_AGG: 519 ret = ENOTSUP; 520 break; 521 default: 522 ret = EINVAL; 523 } 524 return (ret); 525 } 526 527 /* 528 * Enable or disable pools. 529 */ 530 int 531 pool_status(int status) 532 { 533 int ret = 0; 534 535 ASSERT(pool_lock_held()); 536 537 if (pool_state == status) 538 return (0); 539 switch (status) { 540 case POOL_ENABLED: 541 ret = pool_enable(); 542 if (ret != 0) 543 return (ret); 544 pool_state = POOL_ENABLED; 545 break; 546 case POOL_DISABLED: 547 ret = pool_disable(); 548 if (ret != 0) 549 return (ret); 550 pool_state = POOL_DISABLED; 551 break; 552 default: 553 ret = EINVAL; 554 } 555 return (ret); 556 } 557 558 /* 559 * Associate pool with resource set. 560 */ 561 int 562 pool_assoc(poolid_t poolid, int idtype, id_t id) 563 { 564 int ret; 565 566 ASSERT(pool_lock_held()); 567 if (pool_state == POOL_DISABLED) 568 return (ENOTACTIVE); 569 switch (idtype) { 570 case PREC_PSET: 571 ret = pool_pset_assoc(poolid, (psetid_t)id); 572 break; 573 default: 574 ret = EINVAL; 575 } 576 if (ret == 0) 577 pool_pool_mod = gethrtime(); 578 return (ret); 579 } 580 581 /* 582 * Disassociate resource set from pool. 
583 */ 584 int 585 pool_dissoc(poolid_t poolid, int idtype) 586 { 587 int ret; 588 589 ASSERT(pool_lock_held()); 590 if (pool_state == POOL_DISABLED) 591 return (ENOTACTIVE); 592 switch (idtype) { 593 case PREC_PSET: 594 ret = pool_pset_assoc(poolid, PS_NONE); 595 break; 596 default: 597 ret = EINVAL; 598 } 599 if (ret == 0) 600 pool_pool_mod = gethrtime(); 601 return (ret); 602 } 603 604 /* 605 * Transfer specified quantity of resources between resource sets. 606 */ 607 /*ARGSUSED*/ 608 int 609 pool_transfer(int type, id_t src, id_t dst, uint64_t qty) 610 { 611 int ret = EINVAL; 612 return (ret); 613 } 614 615 /* 616 * Transfer resources specified by their IDs between resource sets. 617 */ 618 int 619 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids) 620 { 621 int ret; 622 623 ASSERT(pool_lock_held()); 624 if (pool_state == POOL_DISABLED) 625 return (ENOTACTIVE); 626 switch (type) { 627 case PREC_PSET: 628 ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst, 629 size, ids); 630 break; 631 default: 632 ret = EINVAL; 633 } 634 return (ret); 635 } 636 637 /* 638 * Bind processes to pools. 639 */ 640 int 641 pool_bind(poolid_t poolid, idtype_t idtype, id_t id) 642 { 643 pool_t *pool; 644 645 ASSERT(pool_lock_held()); 646 647 if (pool_state == POOL_DISABLED) 648 return (ENOTACTIVE); 649 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 650 return (ESRCH); 651 652 switch (idtype) { 653 case P_PID: 654 case P_TASKID: 655 case P_PROJID: 656 case P_ZONEID: 657 break; 658 default: 659 return (EINVAL); 660 } 661 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); 662 } 663 664 /* 665 * Query pool binding of the specifed process. 
666 */ 667 int 668 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid) 669 { 670 proc_t *p; 671 672 if (idtype != P_PID) 673 return (ENOTSUP); 674 if (id == P_MYID) 675 id = curproc->p_pid; 676 677 ASSERT(pool_lock_held()); 678 679 mutex_enter(&pidlock); 680 if ((p = prfind((pid_t)id)) == NULL) { 681 mutex_exit(&pidlock); 682 return (ESRCH); 683 } 684 mutex_enter(&p->p_lock); 685 /* 686 * In local zones, lie about pool bindings of processes from 687 * the global zone. 688 */ 689 if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) { 690 pool_t *pool; 691 692 pool = zone_pool_get(curproc->p_zone); 693 *poolid = pool->pool_id; 694 } else { 695 *poolid = p->p_pool->pool_id; 696 } 697 mutex_exit(&p->p_lock); 698 mutex_exit(&pidlock); 699 return (0); 700 } 701 702 static ea_object_t * 703 pool_system_pack(void) 704 { 705 ea_object_t *eo_system; 706 size_t bufsz = 0; 707 char *buf = NULL; 708 709 ASSERT(pool_lock_held()); 710 711 eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM); 712 (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t), 713 EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64); 714 if (INGLOBALZONE(curproc)) 715 (void) ea_attach_item(eo_system, &pool_pool_mod, 716 sizeof (hrtime_t), 717 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 718 else 719 (void) ea_attach_item(eo_system, 720 &curproc->p_zone->zone_pool_mod, 721 sizeof (hrtime_t), 722 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 723 (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t), 724 EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64); 725 (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t), 726 EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64); 727 (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0); 728 (void) ea_attach_item(eo_system, buf, bufsz, 729 EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW); 730 kmem_free(buf, bufsz); 731 return (eo_system); 732 } 733 734 /* 735 * Pack information about pools and attach it to specified exacct group. 
736 */ 737 static int 738 pool_pool_pack(ea_object_t *eo_system) 739 { 740 ea_object_t *eo_pool; 741 pool_t *pool; 742 size_t bufsz; 743 char *buf; 744 pool_t *myzonepool; 745 746 ASSERT(pool_lock_held()); 747 myzonepool = zone_pool_get(curproc->p_zone); 748 for (pool = list_head(&pool_list); pool; 749 pool = list_next(&pool_list, pool)) { 750 if (!INGLOBALZONE(curproc) && myzonepool != pool) 751 continue; 752 bufsz = 0; 753 buf = NULL; 754 eo_pool = ea_alloc_group(EXT_GROUP | 755 EXC_LOCAL | EXD_GROUP_POOL); 756 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t), 757 EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32); 758 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id, 759 sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32); 760 (void) nvlist_pack(pool->pool_props, &buf, &bufsz, 761 NV_ENCODE_NATIVE, 0); 762 (void) ea_attach_item(eo_pool, buf, bufsz, 763 EXC_LOCAL | EXD_POOL_PROP | EXT_RAW); 764 kmem_free(buf, bufsz); 765 (void) ea_attach_to_group(eo_system, eo_pool); 766 } 767 return (0); 768 } 769 770 /* 771 * Pack the whole pool configuration in the specified buffer. 772 */ 773 int 774 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize) 775 { 776 ea_object_t *eo_system; 777 size_t ksize; 778 int ret = 0; 779 780 ASSERT(pool_lock_held()); 781 782 eo_system = pool_system_pack(); /* 1. pack system */ 783 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 784 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 785 ksize = ea_pack_object(eo_system, NULL, 0); 786 if (kbuf == NULL || kbufsz == 0) 787 *asize = ksize; 788 else if (ksize > kbufsz) 789 ret = ENOMEM; 790 else 791 *asize = ea_pack_object(eo_system, kbuf, kbufsz); 792 ea_free_object(eo_system, EUP_ALLOC); 793 return (ret); 794 } 795 796 /* 797 * Start/end the commit transaction. If commit transaction is currently 798 * in progress, then all POOL_QUERY ioctls will return pools configuration 799 * at the beginning of transaction. 
800 */ 801 int 802 pool_commit(int state) 803 { 804 ea_object_t *eo_system; 805 int ret = 0; 806 807 ASSERT(pool_lock_held()); 808 809 if (pool_state == POOL_DISABLED) 810 return (ENOTACTIVE); 811 switch (state) { 812 case 1: 813 /* 814 * Beginning commit transation. 815 */ 816 if (pool_buf != NULL) /* transaction in progress */ 817 return (EBUSY); 818 eo_system = pool_system_pack(); /* 1. pack system */ 819 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 820 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 821 pool_bufsz = ea_pack_object(eo_system, NULL, 0); 822 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP); 823 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz); 824 ea_free_object(eo_system, EUP_ALLOC); 825 break; 826 case 0: 827 /* 828 * Finishing commit transaction. 829 */ 830 if (pool_buf != NULL) { 831 kmem_free(pool_buf, pool_bufsz); 832 pool_buf = NULL; 833 pool_bufsz = 0; 834 } 835 break; 836 default: 837 ret = EINVAL; 838 } 839 return (ret); 840 } 841 842 /* 843 * Check is the specified property is special 844 */ 845 static pool_property_t * 846 pool_property_find(char *name, pool_property_t *list) 847 { 848 pool_property_t *prop; 849 850 for (prop = list; prop->pp_name != NULL; prop++) 851 if (strcmp(prop->pp_name, name) == 0) 852 return (prop); 853 return (NULL); 854 } 855 856 static pool_property_t pool_prop_sys[] = { 857 { "system.name", DATA_TYPE_STRING, PP_RDWR }, 858 { "system.comment", DATA_TYPE_STRING, PP_RDWR }, 859 { "system.version", DATA_TYPE_UINT64, PP_READ }, 860 { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR }, 861 { "system.allocate-method", DATA_TYPE_STRING, 862 PP_RDWR | PP_OPTIONAL }, 863 { "system.poold.log-level", DATA_TYPE_STRING, 864 PP_RDWR | PP_OPTIONAL }, 865 { "system.poold.log-location", DATA_TYPE_STRING, 866 PP_RDWR | PP_OPTIONAL }, 867 { "system.poold.monitor-interval", DATA_TYPE_UINT64, 868 PP_RDWR | PP_OPTIONAL }, 869 { "system.poold.history-file", DATA_TYPE_STRING, 870 PP_RDWR | PP_OPTIONAL }, 
871 { "system.poold.objectives", DATA_TYPE_STRING, 872 PP_RDWR | PP_OPTIONAL }, 873 { NULL, 0, 0 } 874 }; 875 876 static pool_property_t pool_prop_pool[] = { 877 { "pool.sys_id", DATA_TYPE_UINT64, PP_READ }, 878 { "pool.name", DATA_TYPE_STRING, PP_RDWR }, 879 { "pool.default", DATA_TYPE_BYTE, PP_READ }, 880 { "pool.active", DATA_TYPE_BYTE, PP_RDWR }, 881 { "pool.importance", DATA_TYPE_INT64, PP_RDWR }, 882 { "pool.comment", DATA_TYPE_STRING, PP_RDWR }, 883 { "pool.scheduler", DATA_TYPE_STRING, 884 PP_RDWR | PP_OPTIONAL }, 885 { NULL, 0, 0 } 886 }; 887 888 /* 889 * Common routine to put new property on the specified list 890 */ 891 int 892 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props) 893 { 894 pool_property_t *prop; 895 896 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) { 897 /* 898 * No read-only properties or properties with bad types 899 */ 900 if (!(prop->pp_perm & PP_WRITE) || 901 prop->pp_type != nvpair_type(pair)) 902 return (EINVAL); 903 } 904 return (nvlist_add_nvpair(nvlist, pair)); 905 } 906 907 /* 908 * Common routine to remove property from the given list 909 */ 910 int 911 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props) 912 { 913 pool_property_t *prop; 914 915 if ((prop = pool_property_find(name, props)) != NULL) { 916 if (!(prop->pp_perm & PP_OPTIONAL)) 917 return (EINVAL); 918 } 919 return (nvlist_remove_all(nvlist, name)); 920 } 921 922 static int 923 pool_system_propput(nvpair_t *pair) 924 { 925 int ret; 926 927 ASSERT(pool_lock_held()); 928 ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys); 929 if (ret == 0) 930 pool_sys_mod = gethrtime(); 931 return (ret); 932 } 933 934 static int 935 pool_system_proprm(char *name) 936 { 937 int ret; 938 939 ASSERT(pool_lock_held()); 940 ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys); 941 if (ret == 0) 942 pool_sys_mod = gethrtime(); 943 return (ret); 944 } 945 946 static int 947 pool_pool_propput(poolid_t 
poolid, nvpair_t *pair) 948 { 949 pool_t *pool; 950 int ret; 951 952 ASSERT(pool_lock_held()); 953 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 954 return (ESRCH); 955 ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool); 956 if (ret == 0) 957 pool_pool_mod = gethrtime(); 958 return (ret); 959 } 960 961 static int 962 pool_pool_proprm(poolid_t poolid, char *name) 963 { 964 int ret; 965 pool_t *pool; 966 967 ASSERT(pool_lock_held()); 968 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 969 return (ESRCH); 970 ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool); 971 if (ret == 0) 972 pool_pool_mod = gethrtime(); 973 return (ret); 974 } 975 976 int 977 pool_propput(int class, int subclass, id_t id, nvpair_t *pair) 978 { 979 int ret; 980 981 ASSERT(pool_lock_held()); 982 if (pool_state == POOL_DISABLED) 983 return (ENOTACTIVE); 984 switch (class) { 985 case PEC_SYSTEM: 986 ret = pool_system_propput(pair); 987 break; 988 case PEC_POOL: 989 ret = pool_pool_propput((poolid_t)id, pair); 990 break; 991 case PEC_RES_COMP: 992 switch (subclass) { 993 case PREC_PSET: 994 ret = pool_pset_propput((psetid_t)id, pair); 995 break; 996 default: 997 ret = EINVAL; 998 } 999 break; 1000 case PEC_RES_AGG: 1001 ret = ENOTSUP; 1002 break; 1003 case PEC_COMP: 1004 switch (subclass) { 1005 case PCEC_CPU: 1006 ret = pool_cpu_propput((processorid_t)id, pair); 1007 break; 1008 default: 1009 ret = EINVAL; 1010 } 1011 break; 1012 default: 1013 ret = EINVAL; 1014 } 1015 return (ret); 1016 } 1017 1018 int 1019 pool_proprm(int class, int subclass, id_t id, char *name) 1020 { 1021 int ret; 1022 1023 ASSERT(pool_lock_held()); 1024 if (pool_state == POOL_DISABLED) 1025 return (ENOTACTIVE); 1026 switch (class) { 1027 case PEC_SYSTEM: 1028 ret = pool_system_proprm(name); 1029 break; 1030 case PEC_POOL: 1031 ret = pool_pool_proprm((poolid_t)id, name); 1032 break; 1033 case PEC_RES_COMP: 1034 switch (subclass) { 1035 case PREC_PSET: 1036 ret = 
pool_pset_proprm((psetid_t)id, name); 1037 break; 1038 default: 1039 ret = EINVAL; 1040 } 1041 break; 1042 case PEC_RES_AGG: 1043 ret = ENOTSUP; 1044 break; 1045 case PEC_COMP: 1046 switch (subclass) { 1047 case PCEC_CPU: 1048 ret = pool_cpu_proprm((processorid_t)id, name); 1049 break; 1050 default: 1051 ret = EINVAL; 1052 } 1053 break; 1054 default: 1055 ret = EINVAL; 1056 } 1057 return (ret); 1058 } 1059 1060 int 1061 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp) 1062 { 1063 int ret; 1064 nvlist_t *nvl; 1065 1066 ASSERT(pool_lock_held()); 1067 if (pool_state == POOL_DISABLED) 1068 return (ENOTACTIVE); 1069 1070 (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); 1071 1072 switch (class) { 1073 case PEC_SYSTEM: 1074 case PEC_POOL: 1075 ret = EINVAL; 1076 break; 1077 case PEC_RES_COMP: 1078 switch (subclass) { 1079 case PREC_PSET: 1080 ret = pool_pset_propget((psetid_t)id, name, nvl); 1081 break; 1082 default: 1083 ret = EINVAL; 1084 } 1085 break; 1086 case PEC_RES_AGG: 1087 ret = ENOTSUP; 1088 break; 1089 case PEC_COMP: 1090 switch (subclass) { 1091 case PCEC_CPU: 1092 ret = pool_cpu_propget((processorid_t)id, name, nvl); 1093 break; 1094 default: 1095 ret = EINVAL; 1096 } 1097 break; 1098 default: 1099 ret = EINVAL; 1100 } 1101 if (ret == 0) 1102 *nvlp = nvl; 1103 else 1104 nvlist_free(nvl); 1105 return (ret); 1106 } 1107 1108 /* 1109 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs 1110 * in case of failure in pool_do_bind(). 
1111 */ 1112 static void 1113 pool_bind_wake(proc_t *p) 1114 { 1115 ASSERT(pool_lock_held()); 1116 1117 mutex_enter(&p->p_lock); 1118 ASSERT(p->p_poolflag & PBWAIT); 1119 if (p->p_poolcnt > 0) { 1120 mutex_enter(&pool_barrier_lock); 1121 pool_barrier_count -= p->p_poolcnt; 1122 mutex_exit(&pool_barrier_lock); 1123 } 1124 p->p_poolflag &= ~PBWAIT; 1125 cv_signal(&p->p_poolcv); 1126 mutex_exit(&p->p_lock); 1127 } 1128 1129 static void 1130 pool_bind_wakeall(proc_t **procs) 1131 { 1132 proc_t *p, **pp; 1133 1134 ASSERT(pool_lock_held()); 1135 for (pp = procs; (p = *pp) != NULL; pp++) 1136 pool_bind_wake(p); 1137 } 1138 1139 /* 1140 * Return the scheduling class for this pool, or 1141 * POOL_CLASS_UNSET if not set 1142 * POOL_CLASS_INVAL if set to an invalid class ID. 1143 */ 1144 id_t 1145 pool_get_class(pool_t *pool) 1146 { 1147 char *name; 1148 id_t cid; 1149 1150 ASSERT(pool_lock_held()); 1151 1152 if (nvlist_lookup_string(pool->pool_props, "pool.scheduler", 1153 &name) == 0) { 1154 if (getcidbyname(name, &cid) == 0) 1155 return (cid); 1156 else 1157 return (POOL_CLASS_INVAL); 1158 } 1159 return (POOL_CLASS_UNSET); 1160 } 1161 1162 /* 1163 * Move process to the new scheduling class. 1164 */ 1165 static void 1166 pool_change_class(proc_t *p, id_t cid) 1167 { 1168 kthread_t *t; 1169 void *cldata; 1170 id_t oldcid; 1171 void **bufs; 1172 void **buf; 1173 int nlwp; 1174 int ret; 1175 int i; 1176 1177 /* 1178 * Do not move kernel processes (such as zsched). 1179 */ 1180 if (p->p_flag & SSYS) 1181 return; 1182 /* 1183 * This process is in the pool barrier, so it can't possibly be 1184 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1 1185 * (for possible agent LWP which doesn't use pool barrier) as 1186 * our upper bound. 1187 */ 1188 nlwp = p->p_lwpcnt + p->p_zombcnt + 1; 1189 1190 /* 1191 * Pre-allocate scheduling class specific buffers before 1192 * grabbing p_lock. 
1193 */ 1194 bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP); 1195 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1196 ret = CL_ALLOC(buf, cid, KM_SLEEP); 1197 ASSERT(ret == 0); 1198 } 1199 1200 /* 1201 * Move threads one by one to the new scheduling class. 1202 * This never fails because we have all the right 1203 * privileges here. 1204 */ 1205 mutex_enter(&p->p_lock); 1206 ASSERT(p->p_poolflag & PBWAIT); 1207 buf = bufs; 1208 t = p->p_tlist; 1209 ASSERT(t != NULL); 1210 do { 1211 if (t->t_cid != cid) { 1212 oldcid = t->t_cid; 1213 cldata = t->t_cldata; 1214 ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf); 1215 ASSERT(ret == 0); 1216 CL_EXITCLASS(oldcid, cldata); 1217 *buf++ = NULL; 1218 } 1219 } while ((t = t->t_forw) != p->p_tlist); 1220 mutex_exit(&p->p_lock); 1221 /* 1222 * Free unused scheduling class specific buffers. 1223 */ 1224 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1225 if (*buf != NULL) { 1226 CL_FREE(cid, *buf); 1227 *buf = NULL; 1228 } 1229 } 1230 kmem_free(bufs, nlwp * sizeof (void *)); 1231 } 1232 1233 /* 1234 * The meat of the bind operation. The steps in pool_do_bind are: 1235 * 1236 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all 1237 * such processes to an array. For any interesting process that has 1238 * threads inside the pool barrier set, increment a counter by the 1239 * count of such threads. Once PBWAIT is set on a process, that process 1240 * will not disappear. 1241 * 1242 * 2) Wait for the counter from step 2 to drop to zero. Any process which 1243 * calls pool_barrier_exit() and notices that PBWAIT has been set on it 1244 * will decrement that counter before going to sleep, and the process 1245 * calling pool_barrier_exit() which does the final decrement will wake us. 1246 * 1247 * 3) For each interesting process, perform a calculation on it to see if 1248 * the bind will actually succeed. 
 *    This uses the following three resource-set-specific functions:
 *
 *    - int set_bind_start(procs, pool)
 *
 *      Determine whether the given array of processes can be bound to the
 *      resource set associated with the given pool.  If it can, take and hold
 *      any locks necessary to ensure that the operation will succeed, and
 *      make any necessary reservations in the target resource set.  If it
 *      can't, return failure with no reservations made and no new locks held.
 *
 *    - void set_bind_abort(procs, pool)
 *
 *      set_bind_start() has completed successfully, but another resource set's
 *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
 *      any reservations made and drop any locks acquired by our
 *      set_bind_start().
 *
 *    - void set_bind_finish(void)
 *
 *      The bind has completed successfully.  The processes have been released,
 *      and the reservation acquired in set_bind_start() has been depleted as
 *      the processes have finished their bindings.  Drop any locks acquired by
 *      set_bind_start().
 *
 * 4) If we've decided that we can proceed with the bind, iterate through
 *    the list of interesting processes, grab the necessary locks (which
 *    may differ per resource set), perform the bind, and ASSERT that it
 *    succeeds.  Once a process has been rebound, it can be awakened.
 *
 * The operations from step 4 must be kept in sync with anything which might
 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
 * are thus located in the same source files as the associated bind operations.
1281 */ 1282 int 1283 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) 1284 { 1285 extern uint_t nproc; 1286 klwp_t *lwp = ttolwp(curthread); 1287 proc_t **pp, **procs; 1288 proc_t *prstart; 1289 int procs_count = 0; 1290 kproject_t *kpj; 1291 procset_t set; 1292 zone_t *zone; 1293 int procs_size; 1294 int rv = 0; 1295 proc_t *p; 1296 id_t cid = -1; 1297 1298 ASSERT(pool_lock_held()); 1299 1300 if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL) 1301 return (EINVAL); 1302 1303 if (idtype == P_ZONEID) { 1304 zone = zone_find_by_id(id); 1305 if (zone == NULL) 1306 return (ESRCH); 1307 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 1308 zone_rele(zone); 1309 return (EBUSY); 1310 } 1311 } 1312 1313 if (idtype == P_PROJID) { 1314 kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); 1315 if (kpj == NULL) 1316 return (ESRCH); 1317 mutex_enter(&kpj->kpj_poolbind); 1318 } 1319 1320 if (idtype == P_PID) { 1321 /* 1322 * Fast-path for a single process case. 1323 */ 1324 procs_size = 2; /* procs is NULL-terminated */ 1325 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); 1326 mutex_enter(&pidlock); 1327 } else { 1328 /* 1329 * We will need enough slots for proc_t pointers for as many as 1330 * twice the number of currently running processes (assuming 1331 * that each one could be in fork() creating a new child). 1332 */ 1333 for (;;) { 1334 procs_size = nproc * 2; 1335 procs = kmem_zalloc(procs_size * sizeof (proc_t *), 1336 KM_SLEEP); 1337 mutex_enter(&pidlock); 1338 1339 if (nproc * 2 <= procs_size) 1340 break; 1341 /* 1342 * If nproc has changed, try again. 1343 */ 1344 mutex_exit(&pidlock); 1345 kmem_free(procs, procs_size * sizeof (proc_t *)); 1346 } 1347 } 1348 1349 if (id == P_MYID) 1350 id = getmyid(idtype); 1351 setprocset(&set, POP_AND, idtype, id, P_ALL, 0); 1352 1353 /* 1354 * Do a first scan, and select target processes. 
1355 */ 1356 if (idtype == P_PID) 1357 prstart = prfind(id); 1358 else 1359 prstart = practive; 1360 for (p = prstart, pp = procs; p != NULL; p = p->p_next) { 1361 mutex_enter(&p->p_lock); 1362 /* 1363 * Skip processes that don't match our (id, idtype) set or 1364 * on the way of becoming zombies. Skip kernel processes 1365 * from the global zone. 1366 */ 1367 if (procinset(p, &set) == 0 || 1368 p->p_poolflag & PEXITED || 1369 ((p->p_flag & SSYS) && INGLOBALZONE(p))) { 1370 mutex_exit(&p->p_lock); 1371 continue; 1372 } 1373 if (!INGLOBALZONE(p)) { 1374 switch (idtype) { 1375 case P_PID: 1376 case P_TASKID: 1377 /* 1378 * Can't bind processes or tasks 1379 * in local zones to pools. 1380 */ 1381 mutex_exit(&p->p_lock); 1382 mutex_exit(&pidlock); 1383 pool_bind_wakeall(procs); 1384 rv = EINVAL; 1385 goto out; 1386 case P_PROJID: 1387 /* 1388 * Only projects in the global 1389 * zone can be rebound. 1390 */ 1391 mutex_exit(&p->p_lock); 1392 continue; 1393 case P_POOLID: 1394 /* 1395 * When rebinding pools, processes can be 1396 * in different zones. 1397 */ 1398 break; 1399 } 1400 } 1401 1402 p->p_poolflag |= PBWAIT; 1403 /* 1404 * If some threads in this process are inside the pool 1405 * barrier, add them to pool_barrier_count, as we have 1406 * to wait for all of them to exit the barrier. 1407 */ 1408 if (p->p_poolcnt > 0) { 1409 mutex_enter(&pool_barrier_lock); 1410 pool_barrier_count += p->p_poolcnt; 1411 mutex_exit(&pool_barrier_lock); 1412 } 1413 ASSERT(pp < &procs[procs_size]); 1414 *pp++ = p; 1415 procs_count++; 1416 mutex_exit(&p->p_lock); 1417 1418 /* 1419 * We just found our process, so if we're only rebinding a 1420 * single process then get out of this loop. 1421 */ 1422 if (idtype == P_PID) 1423 break; 1424 } 1425 *pp = NULL; /* cap off the end of the array */ 1426 mutex_exit(&pidlock); 1427 1428 /* 1429 * Wait for relevant processes to stop before they try to enter the 1430 * barrier or at the exit from the barrier. 
Make sure that we do 1431 * not get stopped here while we're holding pool_lock. If we were 1432 * requested to stop, or got a signal then return EAGAIN to let the 1433 * library know that it needs to retry. 1434 */ 1435 mutex_enter(&pool_barrier_lock); 1436 lwp->lwp_nostop++; 1437 while (pool_barrier_count > 0) { 1438 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock); 1439 if (pool_barrier_count > 0) { 1440 /* 1441 * We either got a signal or were requested to 1442 * stop by /proc. Bail out with EAGAIN. If we were 1443 * requested to stop, we'll stop in post_syscall() 1444 * on our way back to userland. 1445 */ 1446 mutex_exit(&pool_barrier_lock); 1447 pool_bind_wakeall(procs); 1448 lwp->lwp_nostop--; 1449 rv = EAGAIN; 1450 goto out; 1451 } 1452 } 1453 lwp->lwp_nostop--; 1454 mutex_exit(&pool_barrier_lock); 1455 1456 if (idtype == P_PID) 1457 goto skip; 1458 1459 /* 1460 * Do another run, and drop processes that were inside the barrier 1461 * in exit(), but when they have dropped to pool_barrier_exit 1462 * they have become of no interest to us. Pick up child processes that 1463 * were created by fork() but didn't exist during our first scan. 1464 * Their parents are now stopped at pool_barrier_exit in cfork(). 1465 */ 1466 mutex_enter(&pidlock); 1467 for (pp = procs; (p = *pp) != NULL; pp++) { 1468 if (p->p_poolflag & PEXITED) { 1469 ASSERT(p->p_lwpcnt == 0); 1470 pool_bind_wake(p); 1471 /* flip w/last non-NULL slot */ 1472 *pp = procs[procs_count - 1]; 1473 procs[procs_count - 1] = NULL; 1474 procs_count--; 1475 pp--; /* try this slot again */ 1476 continue; 1477 } 1478 /* 1479 * Look at the child and check if it should be rebound also. 1480 * We're holding pidlock, so it is safe to reference p_child. 1481 */ 1482 if ((p = p->p_child) == NULL) 1483 continue; 1484 1485 mutex_enter(&p->p_lock); 1486 /* 1487 * Skip processes in local zones if we're not binding 1488 * zones to pools (P_ZONEID). Skip kernel processes also. 
1489 */ 1490 if ((!INGLOBALZONE(p) && idtype != P_ZONEID) || 1491 p->p_flag & SSYS) { 1492 mutex_exit(&p->p_lock); 1493 continue; 1494 } 1495 1496 /* 1497 * If the child process has been already created by fork(), has 1498 * not exited, and has not been added to the list already, 1499 * then add it now. We will hit this process again (since we 1500 * stick it at the end of the procs list) but it will ignored 1501 * because it will have the PBWAIT flag set. 1502 */ 1503 if (procinset(p, &set) && 1504 !(p->p_poolflag & PEXITED) && 1505 !(p->p_poolflag & PBWAIT)) { 1506 ASSERT(p->p_child == NULL); /* no child of a child */ 1507 procs[procs_count] = p; 1508 procs[procs_count + 1] = NULL; 1509 procs_count++; 1510 p->p_poolflag |= PBWAIT; 1511 } 1512 mutex_exit(&p->p_lock); 1513 } 1514 mutex_exit(&pidlock); 1515 skip: 1516 /* 1517 * If there's no processes to rebind then return ESRCH, unless 1518 * we're associating a pool with new resource set, destroying it, 1519 * or binding a zone to a pool. 1520 */ 1521 if (procs_count == 0) { 1522 if (idtype == P_POOLID || idtype == P_ZONEID) 1523 rv = 0; 1524 else 1525 rv = ESRCH; 1526 goto out; 1527 } 1528 1529 #ifdef DEBUG 1530 /* 1531 * All processes in the array should have PBWAIT set, and none should 1532 * be in the critical section. Even though p_poolflag is protected by 1533 * the p_lock, these assertions should be stable across the dropping of 1534 * p_lock. 1535 */ 1536 for (pp = procs; (p = *pp) != NULL; pp++) { 1537 ASSERT(p->p_poolflag & PBWAIT); 1538 ASSERT(p->p_poolcnt == 0); 1539 ASSERT(procinset(p, &set)); 1540 } 1541 #endif 1542 1543 /* 1544 * Do the check if processor set rebinding is going to succeed or not. 1545 */ 1546 if ((flags & POOL_BIND_PSET) && 1547 (rv = pset_bind_start(procs, pool)) != 0) { 1548 pool_bind_wakeall(procs); 1549 goto out; 1550 } 1551 1552 /* 1553 * At this point, all bind operations should succeed. 
1554 */ 1555 for (pp = procs; (p = *pp) != NULL; pp++) { 1556 if (flags & POOL_BIND_PSET) { 1557 psetid_t psetid = pool->pool_pset->pset_id; 1558 void *zonebuf; 1559 void *projbuf; 1560 1561 /* 1562 * Pre-allocate one buffer for FSS (per-project 1563 * buffer for a new pset) in case if this is the 1564 * first thread from its current project getting 1565 * bound to this processor set. 1566 */ 1567 projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ); 1568 zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE); 1569 1570 mutex_enter(&pidlock); 1571 mutex_enter(&p->p_lock); 1572 pool_pset_bind(p, psetid, projbuf, zonebuf); 1573 mutex_exit(&p->p_lock); 1574 mutex_exit(&pidlock); 1575 /* 1576 * Free buffers pre-allocated above if it 1577 * wasn't actually used. 1578 */ 1579 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 1580 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 1581 } 1582 /* 1583 * Now let's change the scheduling class of this 1584 * process if our target pool has it defined. 1585 */ 1586 if (cid != POOL_CLASS_UNSET) 1587 pool_change_class(p, cid); 1588 1589 /* 1590 * It is safe to reference p_pool here without holding 1591 * p_lock because it cannot change underneath of us. 1592 * We're holding pool_lock here, so nobody else can be 1593 * moving this process between pools. If process "p" 1594 * would be exiting, we're guaranteed that it would be blocked 1595 * at pool_barrier_enter() in exit(). Otherwise, it would've 1596 * been skipped by one of our scans of the practive list 1597 * as a process with PEXITED flag set. 1598 */ 1599 if (p->p_pool != pool) { 1600 ASSERT(p->p_pool->pool_ref > 0); 1601 atomic_add_32(&p->p_pool->pool_ref, -1); 1602 p->p_pool = pool; 1603 atomic_add_32(&p->p_pool->pool_ref, 1); 1604 } 1605 /* 1606 * Okay, we've tortured this guy enough. 1607 * Let this poor process go now. 
1608 */ 1609 pool_bind_wake(p); 1610 } 1611 if (flags & POOL_BIND_PSET) 1612 pset_bind_finish(); 1613 1614 out: switch (idtype) { 1615 case P_PROJID: 1616 ASSERT(kpj != NULL); 1617 mutex_exit(&kpj->kpj_poolbind); 1618 project_rele(kpj); 1619 break; 1620 case P_ZONEID: 1621 if (rv == 0) { 1622 mutex_enter(&cpu_lock); 1623 zone_pool_set(zone, pool); 1624 mutex_exit(&cpu_lock); 1625 } 1626 zone->zone_pool_mod = gethrtime(); 1627 zone_rele(zone); 1628 break; 1629 } 1630 1631 kmem_free(procs, procs_size * sizeof (proc_t *)); 1632 ASSERT(pool_barrier_count == 0); 1633 return (rv); 1634 } 1635