1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/pool.h> 30 #include <sys/pool_impl.h> 31 #include <sys/pool_pset.h> 32 #include <sys/id_space.h> 33 #include <sys/mutex.h> 34 #include <sys/nvpair.h> 35 #include <sys/cpuvar.h> 36 #include <sys/errno.h> 37 #include <sys/cmn_err.h> 38 #include <sys/systm.h> 39 #include <sys/proc.h> 40 #include <sys/fss.h> 41 #include <sys/class.h> 42 #include <sys/exacct.h> 43 #include <sys/utsname.h> 44 #include <sys/procset.h> 45 #include <sys/atomic.h> 46 #include <sys/zone.h> 47 #include <sys/policy.h> 48 49 /* 50 * RESOURCE POOLS 51 * 52 * The resource pools facility brings together process-bindable resource into 53 * a common abstraction called a pool. Processor sets and other entities can 54 * be configured, grouped, and labelled such that workload components can be 55 * associated with a subset of a system's total resources. 56 * 57 * When disabled, the pools facility is "invisible". All processes belong 58 * to the same pool (pool_default), and processor sets can be managed through 59 * the old pset() system call. When enabled, processor sets can only be 60 * managed via the pools facility. New pools can be created and associated 61 * with processor sets. Processes can be bound to pools which have non-empty 62 * resource sets. 63 * 64 * Locking: pool_lock() protects global pools state and must be called 65 * before modifying the configuration, or when taking a snapshot of the 66 * configuration. If pool_lock_intr() is used, the operation may be 67 * interrupted by a signal or a request. 68 * 69 * To prevent processes from being rebound between pools while they are 70 * the middle of an operation which affects resource set bindings, such 71 * operations must be surrounded by calls to pool_barrier_enter() and 72 * pool_barrier_exit(). This mechanism guarantees that such processes will 73 * be stopped either at the beginning or at the end of the barrier so that 74 * the rebind operation can atomically bind the process and its threads 75 * to new resource sets, and then let process run again. 76 * 77 * Lock ordering with respect to other locks is as follows: 78 * 79 * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock 80 * 81 * Most static and global variables defined in this file are protected 82 * by calling pool_lock(). 83 * 84 * The operation that binds tasks and projects to pools is atomic. That is, 85 * either all processes in a given task or a project will be bound to a 86 * new pool, or (in case of an error) they will be all left bound to the 87 * old pool. Processes in a given task or a given project can only be bound to 88 * different pools if they were rebound individually one by one as single 89 * processes. Threads or LWPs of the same process do not have pool bindings, 90 * and are bound to the same resource sets associated with the resource pool 91 * of that process. 92 * 93 * The following picture shows one possible pool configuration with three 94 * pools and three processor sets. Note that processor set "foo" is not 95 * associated with any pools and therefore cannot have any processes 96 * bound to it. Two pools (default and foo) are associated with the 97 * same processor set (default). Also, note that processes in Task 2 98 * are bound to different pools. 99 * 100 * 101 * Processor Sets 102 * +---------+ 103 * +--------------+========================>| default | 104 * a| | +---------+ 105 * s| | || 106 * s| | +---------+ 107 * o| | | foo | 108 * c| | +---------+ 109 * i| | || 110 * a| | +---------+ 111 * t| | +------>| bar | 112 * e| | | +---------+ 113 * d| | | 114 * | | | 115 * +---------+ +---------+ +---------+ 116 * Pools | default |======| foo |======| bar | 117 * +---------+ +---------+ +---------+ 118 * @ @ @ @ @ @ 119 * b| | | | | | 120 * o| | | | | | 121 * u| +-----+ | +-------+ | +---+ 122 * n| | | | | | 123 * ....d|........|......|......|.........|.......|.... 124 * : | :: | | | :: | | : 125 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 126 * Processes : | p | :: | p | | p | | p | :: | p |...| p | : 127 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 128 * :........::......................::...............: 129 * Task 1 Task 2 Task N 130 * | | | 131 * | | | 132 * | +-----------+ | +-----------+ 133 * +--| Project 1 |--+ | Project N | 134 * +-----------+ +-----------+ 135 * 136 * This is just an illustration of relationships between processes, tasks, 137 * projects, pools, and processor sets. New types of resource sets will be 138 * added in the future. 139 */ 140 141 pool_t *pool_default; /* default pool which always exists */ 142 int pool_count; /* number of pools created on this system */ 143 int pool_state; /* pools state -- enabled/disabled */ 144 void *pool_buf; /* pre-commit snapshot of the pools state */ 145 size_t pool_bufsz; /* size of pool_buf */ 146 static hrtime_t pool_pool_mod; /* last modification time for pools */ 147 static hrtime_t pool_sys_mod; /* last modification time for system */ 148 static nvlist_t *pool_sys_prop; /* system properties */ 149 static id_space_t *pool_ids; /* pool ID space */ 150 static list_t pool_list; /* doubly-linked list of pools */ 151 static kmutex_t pool_mutex; /* protects pool_busy_* */ 152 static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */ 153 static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */ 154 static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */ 155 static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */ 156 static int pool_barrier_count; /* synch. with pool_barrier_* */ 157 158 /* 159 * Boot-time pool initialization. 160 */ 161 void 162 pool_init(void) 163 { 164 pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID); 165 166 /* 167 * Initialize default pool. 168 */ 169 pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 170 pool_default->pool_id = POOL_DEFAULT; 171 list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link)); 172 list_insert_head(&pool_list, pool_default); 173 174 /* 175 * Initialize plugins for resource sets. 176 */ 177 pool_pset_init(); 178 pool_count = 1; 179 p0.p_pool = pool_default; 180 global_zone->zone_pool = pool_default; 181 pool_default->pool_ref = 1; 182 } 183 184 /* 185 * Synchronization routines. 186 * 187 * pool_lock is only called from syscall-level routines (processor_bind(), 188 * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long 189 * periods of time, including across sleeping operations, so we allow its 190 * acquisition to be interruptible. 191 * 192 * The current thread that owns the "lock" is stored in the variable 193 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging. 194 */ 195 void 196 pool_lock(void) 197 { 198 mutex_enter(&pool_mutex); 199 while (pool_busy_thread != NULL) 200 cv_wait(&pool_busy_cv, &pool_mutex); 201 pool_busy_thread = curthread; 202 mutex_exit(&pool_mutex); 203 } 204 205 int 206 pool_lock_intr(void) 207 { 208 mutex_enter(&pool_mutex); 209 while (pool_busy_thread != NULL) { 210 if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) { 211 cv_signal(&pool_busy_cv); 212 mutex_exit(&pool_mutex); 213 return (1); 214 } 215 } 216 pool_busy_thread = curthread; 217 mutex_exit(&pool_mutex); 218 return (0); 219 } 220 221 int 222 pool_lock_held(void) 223 { 224 return (pool_busy_thread == curthread); 225 } 226 227 void 228 pool_unlock(void) 229 { 230 mutex_enter(&pool_mutex); 231 pool_busy_thread = NULL; 232 cv_signal(&pool_busy_cv); 233 mutex_exit(&pool_mutex); 234 } 235 236 /* 237 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize 238 * with pool_do_bind(). 239 * 240 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all 241 * operations which modify pool or pset associations. They can be called 242 * while the process is multi-threaded. In the common case, when current 243 * process is not being rebound (PBWAIT flag is not set), these functions 244 * will be just incrementing and decrementing reference counts. 245 */ 246 void 247 pool_barrier_enter(void) 248 { 249 proc_t *p = curproc; 250 251 ASSERT(MUTEX_HELD(&p->p_lock)); 252 while (p->p_poolflag & PBWAIT) 253 cv_wait(&p->p_poolcv, &p->p_lock); 254 p->p_poolcnt++; 255 } 256 257 void 258 pool_barrier_exit(void) 259 { 260 proc_t *p = curproc; 261 262 ASSERT(MUTEX_HELD(&p->p_lock)); 263 ASSERT(p->p_poolcnt > 0); 264 p->p_poolcnt--; 265 if (p->p_poolflag & PBWAIT) { 266 mutex_enter(&pool_barrier_lock); 267 ASSERT(pool_barrier_count > 0); 268 pool_barrier_count--; 269 if (pool_barrier_count == 0) 270 cv_signal(&pool_barrier_cv); 271 mutex_exit(&pool_barrier_lock); 272 while (p->p_poolflag & PBWAIT) 273 cv_wait(&p->p_poolcv, &p->p_lock); 274 } 275 } 276 277 /* 278 * Enable pools facility. 279 */ 280 static int 281 pool_enable(void) 282 { 283 int ret; 284 285 ASSERT(pool_lock_held()); 286 ASSERT(pool_count == 1); 287 288 ret = pool_pset_enable(); 289 if (ret != 0) 290 return (ret); 291 (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP); 292 (void) nvlist_add_string(pool_sys_prop, "system.name", 293 utsname.nodename); 294 (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); 295 (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); 296 (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); 297 298 (void) nvlist_alloc(&pool_default->pool_props, 299 NV_UNIQUE_NAME, KM_SLEEP); 300 (void) nvlist_add_string(pool_default->pool_props, 301 "pool.name", "pool_default"); 302 (void) nvlist_add_string(pool_default->pool_props, "pool.comment", ""); 303 (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1); 304 (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1); 305 (void) nvlist_add_int64(pool_default->pool_props, 306 "pool.importance", 1); 307 (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id", 308 pool_default->pool_id); 309 310 pool_sys_mod = pool_pool_mod = gethrtime(); 311 312 return (ret); 313 } 314 315 /* 316 * Disable pools facility. 317 */ 318 static int 319 pool_disable(void) 320 { 321 int ret; 322 323 ASSERT(pool_lock_held()); 324 325 if (pool_count > 1) /* must destroy all pools first */ 326 return (EBUSY); 327 328 ret = pool_pset_disable(); 329 if (ret != 0) 330 return (ret); 331 if (pool_sys_prop != NULL) { 332 nvlist_free(pool_sys_prop); 333 pool_sys_prop = NULL; 334 } 335 if (pool_default->pool_props != NULL) { 336 nvlist_free(pool_default->pool_props); 337 pool_default->pool_props = NULL; 338 } 339 return (0); 340 } 341 342 pool_t * 343 pool_lookup_pool_by_name(char *name) 344 { 345 pool_t *pool = pool_default; 346 char *p; 347 348 ASSERT(pool_lock_held()); 349 for (pool = list_head(&pool_list); pool; 350 pool = list_next(&pool_list, pool)) { 351 if (nvlist_lookup_string(pool->pool_props, 352 "pool.name", &p) == 0 && strcmp(name, p) == 0) 353 return (pool); 354 } 355 return (NULL); 356 } 357 358 pool_t * 359 pool_lookup_pool_by_id(poolid_t poolid) 360 { 361 pool_t *pool = pool_default; 362 363 ASSERT(pool_lock_held()); 364 for (pool = list_head(&pool_list); pool; 365 pool = list_next(&pool_list, pool)) { 366 if (pool->pool_id == poolid) 367 return (pool); 368 } 369 return (NULL); 370 } 371 372 /* 373 * Create new pool, associate it with default resource sets, and give 374 * it a temporary name. 375 */ 376 static int 377 pool_pool_create(poolid_t *poolid) 378 { 379 pool_t *pool; 380 char pool_name[40]; 381 382 ASSERT(pool_lock_held()); 383 384 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 385 pool->pool_id = *poolid = id_alloc(pool_ids); 386 pool->pool_pset = pool_pset_default; 387 pool_pset_default->pset_npools++; 388 list_insert_tail(&pool_list, pool); 389 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP); 390 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id); 391 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0); 392 pool_pool_mod = gethrtime(); 393 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld", 394 pool_pool_mod); 395 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name); 396 pool_count++; 397 return (0); 398 } 399 400 struct destroy_zone_arg { 401 pool_t *old; 402 pool_t *new; 403 }; 404 405 /* 406 * Update pool pointers for zones that are currently bound to pool "old" 407 * to be bound to pool "new". 408 */ 409 static int 410 pool_destroy_zone_cb(zone_t *zone, void *arg) 411 { 412 struct destroy_zone_arg *dza = arg; 413 414 ASSERT(pool_lock_held()); 415 ASSERT(MUTEX_HELD(&cpu_lock)); 416 417 if (zone_pool_get(zone) == dza->old) 418 zone_pool_set(zone, dza->new); 419 return (0); 420 } 421 422 /* 423 * Destroy specified pool, and rebind all processes in it 424 * to the default pool. 425 */ 426 static int 427 pool_pool_destroy(poolid_t poolid) 428 { 429 pool_t *pool; 430 int ret; 431 432 ASSERT(pool_lock_held()); 433 434 if (poolid == POOL_DEFAULT) 435 return (EINVAL); 436 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 437 return (ESRCH); 438 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL); 439 if (ret == 0) { 440 struct destroy_zone_arg dzarg; 441 442 dzarg.old = pool; 443 dzarg.new = pool_default; 444 mutex_enter(&cpu_lock); 445 ret = zone_walk(pool_destroy_zone_cb, &dzarg); 446 mutex_exit(&cpu_lock); 447 ASSERT(ret == 0); 448 ASSERT(pool->pool_ref == 0); 449 (void) nvlist_free(pool->pool_props); 450 id_free(pool_ids, pool->pool_id); 451 pool->pool_pset->pset_npools--; 452 list_remove(&pool_list, pool); 453 pool_count--; 454 pool_pool_mod = gethrtime(); 455 kmem_free(pool, sizeof (pool_t)); 456 } 457 return (ret); 458 } 459 460 /* 461 * Create new pool or resource set. 462 */ 463 int 464 pool_create(int class, int subclass, id_t *id) 465 { 466 int ret; 467 468 ASSERT(pool_lock_held()); 469 if (pool_state == POOL_DISABLED) 470 return (ENOTACTIVE); 471 switch (class) { 472 case PEC_POOL: 473 ret = pool_pool_create((poolid_t *)id); 474 break; 475 case PEC_RES_COMP: 476 switch (subclass) { 477 case PREC_PSET: 478 ret = pool_pset_create((psetid_t *)id); 479 break; 480 default: 481 ret = EINVAL; 482 } 483 break; 484 case PEC_RES_AGG: 485 ret = ENOTSUP; 486 break; 487 default: 488 ret = EINVAL; 489 } 490 return (ret); 491 } 492 493 /* 494 * Destroy an existing pool or resource set. 495 */ 496 int 497 pool_destroy(int class, int subclass, id_t id) 498 { 499 int ret; 500 501 ASSERT(pool_lock_held()); 502 if (pool_state == POOL_DISABLED) 503 return (ENOTACTIVE); 504 switch (class) { 505 case PEC_POOL: 506 ret = pool_pool_destroy((poolid_t)id); 507 break; 508 case PEC_RES_COMP: 509 switch (subclass) { 510 case PREC_PSET: 511 ret = pool_pset_destroy((psetid_t)id); 512 break; 513 default: 514 ret = EINVAL; 515 } 516 break; 517 case PEC_RES_AGG: 518 ret = ENOTSUP; 519 break; 520 default: 521 ret = EINVAL; 522 } 523 return (ret); 524 } 525 526 /* 527 * Enable or disable pools. 528 */ 529 int 530 pool_status(int status) 531 { 532 int ret = 0; 533 534 ASSERT(pool_lock_held()); 535 536 if (pool_state == status) 537 return (0); 538 switch (status) { 539 case POOL_ENABLED: 540 ret = pool_enable(); 541 if (ret != 0) 542 return (ret); 543 pool_state = POOL_ENABLED; 544 break; 545 case POOL_DISABLED: 546 ret = pool_disable(); 547 if (ret != 0) 548 return (ret); 549 pool_state = POOL_DISABLED; 550 break; 551 default: 552 ret = EINVAL; 553 } 554 return (ret); 555 } 556 557 /* 558 * Associate pool with resource set. 559 */ 560 int 561 pool_assoc(poolid_t poolid, int idtype, id_t id) 562 { 563 int ret; 564 565 ASSERT(pool_lock_held()); 566 if (pool_state == POOL_DISABLED) 567 return (ENOTACTIVE); 568 switch (idtype) { 569 case PREC_PSET: 570 ret = pool_pset_assoc(poolid, (psetid_t)id); 571 break; 572 default: 573 ret = EINVAL; 574 } 575 if (ret == 0) 576 pool_pool_mod = gethrtime(); 577 return (ret); 578 } 579 580 /* 581 * Disassociate resource set from pool. 582 */ 583 int 584 pool_dissoc(poolid_t poolid, int idtype) 585 { 586 int ret; 587 588 ASSERT(pool_lock_held()); 589 if (pool_state == POOL_DISABLED) 590 return (ENOTACTIVE); 591 switch (idtype) { 592 case PREC_PSET: 593 ret = pool_pset_assoc(poolid, PS_NONE); 594 break; 595 default: 596 ret = EINVAL; 597 } 598 if (ret == 0) 599 pool_pool_mod = gethrtime(); 600 return (ret); 601 } 602 603 /* 604 * Transfer specified quantity of resources between resource sets. 605 */ 606 /*ARGSUSED*/ 607 int 608 pool_transfer(int type, id_t src, id_t dst, uint64_t qty) 609 { 610 int ret = EINVAL; 611 return (ret); 612 } 613 614 /* 615 * Transfer resources specified by their IDs between resource sets. 616 */ 617 int 618 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids) 619 { 620 int ret; 621 622 ASSERT(pool_lock_held()); 623 if (pool_state == POOL_DISABLED) 624 return (ENOTACTIVE); 625 switch (type) { 626 case PREC_PSET: 627 ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst, 628 size, ids); 629 break; 630 default: 631 ret = EINVAL; 632 } 633 return (ret); 634 } 635 636 /* 637 * Bind processes to pools. 638 */ 639 int 640 pool_bind(poolid_t poolid, idtype_t idtype, id_t id) 641 { 642 pool_t *pool; 643 644 ASSERT(pool_lock_held()); 645 646 if (pool_state == POOL_DISABLED) 647 return (ENOTACTIVE); 648 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 649 return (ESRCH); 650 651 switch (idtype) { 652 case P_PID: 653 case P_TASKID: 654 case P_PROJID: 655 case P_ZONEID: 656 break; 657 default: 658 return (EINVAL); 659 } 660 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); 661 } 662 663 /* 664 * Query pool binding of the specifed process. 665 */ 666 int 667 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid) 668 { 669 proc_t *p; 670 671 if (idtype != P_PID) 672 return (ENOTSUP); 673 if (id == P_MYID) 674 id = curproc->p_pid; 675 676 ASSERT(pool_lock_held()); 677 678 mutex_enter(&pidlock); 679 if ((p = prfind((pid_t)id)) == NULL) { 680 mutex_exit(&pidlock); 681 return (ESRCH); 682 } 683 mutex_enter(&p->p_lock); 684 /* 685 * In local zones, lie about pool bindings of processes from 686 * the global zone. 687 */ 688 if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) { 689 pool_t *pool; 690 691 pool = zone_pool_get(curproc->p_zone); 692 *poolid = pool->pool_id; 693 } else { 694 *poolid = p->p_pool->pool_id; 695 } 696 mutex_exit(&p->p_lock); 697 mutex_exit(&pidlock); 698 return (0); 699 } 700 701 static ea_object_t * 702 pool_system_pack(void) 703 { 704 ea_object_t *eo_system; 705 size_t bufsz = 0; 706 char *buf = NULL; 707 708 ASSERT(pool_lock_held()); 709 710 eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM); 711 (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t), 712 EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64); 713 if (INGLOBALZONE(curproc)) 714 (void) ea_attach_item(eo_system, &pool_pool_mod, 715 sizeof (hrtime_t), 716 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 717 else 718 (void) ea_attach_item(eo_system, 719 &curproc->p_zone->zone_pool_mod, 720 sizeof (hrtime_t), 721 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 722 (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t), 723 EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64); 724 (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t), 725 EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64); 726 (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0); 727 (void) ea_attach_item(eo_system, buf, bufsz, 728 EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW); 729 kmem_free(buf, bufsz); 730 return (eo_system); 731 } 732 733 /* 734 * Pack information about pools and attach it to specified exacct group. 735 */ 736 static int 737 pool_pool_pack(ea_object_t *eo_system) 738 { 739 ea_object_t *eo_pool; 740 pool_t *pool; 741 size_t bufsz; 742 char *buf; 743 pool_t *myzonepool; 744 745 ASSERT(pool_lock_held()); 746 myzonepool = zone_pool_get(curproc->p_zone); 747 for (pool = list_head(&pool_list); pool; 748 pool = list_next(&pool_list, pool)) { 749 if (!INGLOBALZONE(curproc) && myzonepool != pool) 750 continue; 751 bufsz = 0; 752 buf = NULL; 753 eo_pool = ea_alloc_group(EXT_GROUP | 754 EXC_LOCAL | EXD_GROUP_POOL); 755 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t), 756 EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32); 757 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id, 758 sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32); 759 (void) nvlist_pack(pool->pool_props, &buf, &bufsz, 760 NV_ENCODE_NATIVE, 0); 761 (void) ea_attach_item(eo_pool, buf, bufsz, 762 EXC_LOCAL | EXD_POOL_PROP | EXT_RAW); 763 kmem_free(buf, bufsz); 764 (void) ea_attach_to_group(eo_system, eo_pool); 765 } 766 return (0); 767 } 768 769 /* 770 * Pack the whole pool configuration in the specified buffer. 771 */ 772 int 773 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize) 774 { 775 ea_object_t *eo_system; 776 size_t ksize; 777 int ret = 0; 778 779 ASSERT(pool_lock_held()); 780 781 eo_system = pool_system_pack(); /* 1. pack system */ 782 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 783 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 784 ksize = ea_pack_object(eo_system, NULL, 0); 785 if (kbuf == NULL || kbufsz == 0) 786 *asize = ksize; 787 else if (ksize > kbufsz) 788 ret = ENOMEM; 789 else 790 *asize = ea_pack_object(eo_system, kbuf, kbufsz); 791 ea_free_object(eo_system, EUP_ALLOC); 792 return (ret); 793 } 794 795 /* 796 * Start/end the commit transaction. If commit transaction is currently 797 * in progress, then all POOL_QUERY ioctls will return pools configuration 798 * at the beginning of transaction. 799 */ 800 int 801 pool_commit(int state) 802 { 803 ea_object_t *eo_system; 804 int ret = 0; 805 806 ASSERT(pool_lock_held()); 807 808 if (pool_state == POOL_DISABLED) 809 return (ENOTACTIVE); 810 switch (state) { 811 case 1: 812 /* 813 * Beginning commit transation. 814 */ 815 if (pool_buf != NULL) /* transaction in progress */ 816 return (EBUSY); 817 eo_system = pool_system_pack(); /* 1. pack system */ 818 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 819 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 820 pool_bufsz = ea_pack_object(eo_system, NULL, 0); 821 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP); 822 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz); 823 ea_free_object(eo_system, EUP_ALLOC); 824 break; 825 case 0: 826 /* 827 * Finishing commit transaction. 828 */ 829 if (pool_buf != NULL) { 830 kmem_free(pool_buf, pool_bufsz); 831 pool_buf = NULL; 832 pool_bufsz = 0; 833 } 834 break; 835 default: 836 ret = EINVAL; 837 } 838 return (ret); 839 } 840 841 /* 842 * Check is the specified property is special 843 */ 844 static pool_property_t * 845 pool_property_find(char *name, pool_property_t *list) 846 { 847 pool_property_t *prop; 848 849 for (prop = list; prop->pp_name != NULL; prop++) 850 if (strcmp(prop->pp_name, name) == 0) 851 return (prop); 852 return (NULL); 853 } 854 855 static pool_property_t pool_prop_sys[] = { 856 { "system.name", DATA_TYPE_STRING, PP_RDWR }, 857 { "system.comment", DATA_TYPE_STRING, PP_RDWR }, 858 { "system.version", DATA_TYPE_UINT64, PP_READ }, 859 { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR }, 860 { "system.allocate-method", DATA_TYPE_STRING, 861 PP_RDWR | PP_OPTIONAL }, 862 { "system.poold.log-level", DATA_TYPE_STRING, 863 PP_RDWR | PP_OPTIONAL }, 864 { "system.poold.log-location", DATA_TYPE_STRING, 865 PP_RDWR | PP_OPTIONAL }, 866 { "system.poold.monitor-interval", DATA_TYPE_UINT64, 867 PP_RDWR | PP_OPTIONAL }, 868 { "system.poold.history-file", DATA_TYPE_STRING, 869 PP_RDWR | PP_OPTIONAL }, 870 { "system.poold.objectives", DATA_TYPE_STRING, 871 PP_RDWR | PP_OPTIONAL }, 872 { NULL, 0, 0 } 873 }; 874 875 static pool_property_t pool_prop_pool[] = { 876 { "pool.sys_id", DATA_TYPE_UINT64, PP_READ }, 877 { "pool.name", DATA_TYPE_STRING, PP_RDWR }, 878 { "pool.default", DATA_TYPE_BYTE, PP_READ }, 879 { "pool.active", DATA_TYPE_BYTE, PP_RDWR }, 880 { "pool.importance", DATA_TYPE_INT64, PP_RDWR }, 881 { "pool.comment", DATA_TYPE_STRING, PP_RDWR }, 882 { "pool.scheduler", DATA_TYPE_STRING, 883 PP_RDWR | PP_OPTIONAL }, 884 { NULL, 0, 0 } 885 }; 886 887 /* 888 * Common routine to put new property on the specified list 889 */ 890 int 891 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props) 892 { 893 pool_property_t *prop; 894 895 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) { 896 /* 897 * No read-only properties or properties with bad types 898 */ 899 if (!(prop->pp_perm & PP_WRITE) || 900 prop->pp_type != nvpair_type(pair)) 901 return (EINVAL); 902 } 903 return (nvlist_add_nvpair(nvlist, pair)); 904 } 905 906 /* 907 * Common routine to remove property from the given list 908 */ 909 int 910 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props) 911 { 912 pool_property_t *prop; 913 914 if ((prop = pool_property_find(name, props)) != NULL) { 915 if (!(prop->pp_perm & PP_OPTIONAL)) 916 return (EINVAL); 917 } 918 return (nvlist_remove_all(nvlist, name)); 919 } 920 921 static int 922 pool_system_propput(nvpair_t *pair) 923 { 924 int ret; 925 926 ASSERT(pool_lock_held()); 927 ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys); 928 if (ret == 0) 929 pool_sys_mod = gethrtime(); 930 return (ret); 931 } 932 933 static int 934 pool_system_proprm(char *name) 935 { 936 int ret; 937 938 ASSERT(pool_lock_held()); 939 ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys); 940 if (ret == 0) 941 pool_sys_mod = gethrtime(); 942 return (ret); 943 } 944 945 static int 946 pool_pool_propput(poolid_t poolid, nvpair_t *pair) 947 { 948 pool_t *pool; 949 int ret; 950 951 ASSERT(pool_lock_held()); 952 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 953 return (ESRCH); 954 ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool); 955 if (ret == 0) 956 pool_pool_mod = gethrtime(); 957 return (ret); 958 } 959 960 static int 961 pool_pool_proprm(poolid_t poolid, char *name) 962 { 963 int ret; 964 pool_t *pool; 965 966 ASSERT(pool_lock_held()); 967 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 968 return (ESRCH); 969 ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool); 970 if (ret == 0) 971 pool_pool_mod = gethrtime(); 972 return (ret); 973 } 974 975 int 976 pool_propput(int class, int subclass, id_t id, nvpair_t *pair) 977 { 978 int ret; 979 980 ASSERT(pool_lock_held()); 981 if (pool_state == POOL_DISABLED) 982 return (ENOTACTIVE); 983 switch (class) { 984 case PEC_SYSTEM: 985 ret = pool_system_propput(pair); 986 break; 987 case PEC_POOL: 988 ret = pool_pool_propput((poolid_t)id, pair); 989 break; 990 case PEC_RES_COMP: 991 switch (subclass) { 992 case PREC_PSET: 993 ret = pool_pset_propput((psetid_t)id, pair); 994 break; 995 default: 996 ret = EINVAL; 997 } 998 break; 999 case PEC_RES_AGG: 1000 ret = ENOTSUP; 1001 break; 1002 case PEC_COMP: 1003 switch (subclass) { 1004 case PCEC_CPU: 1005 ret = pool_cpu_propput((processorid_t)id, pair); 1006 break; 1007 default: 1008 ret = EINVAL; 1009 } 1010 break; 1011 default: 1012 ret = EINVAL; 1013 } 1014 return (ret); 1015 } 1016 1017 int 1018 pool_proprm(int class, int subclass, id_t id, char *name) 1019 { 1020 int ret; 1021 1022 ASSERT(pool_lock_held()); 1023 if (pool_state == POOL_DISABLED) 1024 return (ENOTACTIVE); 1025 switch (class) { 1026 case PEC_SYSTEM: 1027 ret = pool_system_proprm(name); 1028 break; 1029 case PEC_POOL: 1030 ret = pool_pool_proprm((poolid_t)id, name); 1031 break; 1032 case PEC_RES_COMP: 1033 switch (subclass) { 1034 case PREC_PSET: 1035 ret = pool_pset_proprm((psetid_t)id, name); 1036 break; 1037 default: 1038 ret = EINVAL; 1039 } 1040 break; 1041 case PEC_RES_AGG: 1042 ret = ENOTSUP; 1043 break; 1044 case PEC_COMP: 1045 switch (subclass) { 1046 case PCEC_CPU: 1047 ret = pool_cpu_proprm((processorid_t)id, name); 1048 break; 1049 default: 1050 ret = EINVAL; 1051 } 1052 break; 1053 default: 1054 ret = EINVAL; 1055 } 1056 return (ret); 1057 } 1058 1059 int 1060 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp) 1061 { 1062 int ret; 1063 nvlist_t *nvl; 1064 1065 ASSERT(pool_lock_held()); 1066 if (pool_state == POOL_DISABLED) 1067 return (ENOTACTIVE); 1068 1069 (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); 1070 1071 switch (class) { 1072 case PEC_SYSTEM: 1073 case PEC_POOL: 1074 ret = EINVAL; 1075 break; 1076 case PEC_RES_COMP: 1077 switch (subclass) { 1078 case PREC_PSET: 1079 ret = pool_pset_propget((psetid_t)id, name, nvl); 1080 break; 1081 default: 1082 ret = EINVAL; 1083 } 1084 break; 1085 case PEC_RES_AGG: 1086 ret = ENOTSUP; 1087 break; 1088 case PEC_COMP: 1089 switch (subclass) { 1090 case PCEC_CPU: 1091 ret = pool_cpu_propget((processorid_t)id, name, nvl); 1092 break; 1093 default: 1094 ret = EINVAL; 1095 } 1096 break; 1097 default: 1098 ret = EINVAL; 1099 } 1100 if (ret == 0) 1101 *nvlp = nvl; 1102 else 1103 nvlist_free(nvl); 1104 return (ret); 1105 } 1106 1107 /* 1108 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs 1109 * in case of failure in pool_do_bind(). 1110 */ 1111 static void 1112 pool_bind_wake(proc_t *p) 1113 { 1114 ASSERT(pool_lock_held()); 1115 1116 mutex_enter(&p->p_lock); 1117 ASSERT(p->p_poolflag & PBWAIT); 1118 if (p->p_poolcnt > 0) { 1119 mutex_enter(&pool_barrier_lock); 1120 pool_barrier_count -= p->p_poolcnt; 1121 mutex_exit(&pool_barrier_lock); 1122 } 1123 p->p_poolflag &= ~PBWAIT; 1124 cv_signal(&p->p_poolcv); 1125 mutex_exit(&p->p_lock); 1126 } 1127 1128 static void 1129 pool_bind_wakeall(proc_t **procs) 1130 { 1131 proc_t *p, **pp; 1132 1133 ASSERT(pool_lock_held()); 1134 for (pp = procs; (p = *pp) != NULL; pp++) 1135 pool_bind_wake(p); 1136 } 1137 1138 /* 1139 * Return the scheduling class for this pool, or 1140 * POOL_CLASS_UNSET if not set 1141 * POOL_CLASS_INVAL if set to an invalid class ID. 1142 */ 1143 id_t 1144 pool_get_class(pool_t *pool) 1145 { 1146 char *name; 1147 id_t cid; 1148 1149 ASSERT(pool_lock_held()); 1150 1151 if (nvlist_lookup_string(pool->pool_props, "pool.scheduler", 1152 &name) == 0) { 1153 if (getcidbyname(name, &cid) == 0) 1154 return (cid); 1155 else 1156 return (POOL_CLASS_INVAL); 1157 } 1158 return (POOL_CLASS_UNSET); 1159 } 1160 1161 /* 1162 * Move process to the new scheduling class. 1163 */ 1164 static void 1165 pool_change_class(proc_t *p, id_t cid) 1166 { 1167 kthread_t *t; 1168 void *cldata; 1169 id_t oldcid; 1170 void **bufs; 1171 void **buf; 1172 int nlwp; 1173 int ret; 1174 int i; 1175 1176 /* 1177 * Do not move kernel processes (such as zsched). 1178 */ 1179 if (p->p_flag & SSYS) 1180 return; 1181 /* 1182 * This process is in the pool barrier, so it can't possibly be 1183 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1 1184 * (for possible agent LWP which doesn't use pool barrier) as 1185 * our upper bound. 1186 */ 1187 nlwp = p->p_lwpcnt + p->p_zombcnt + 1; 1188 1189 /* 1190 * Pre-allocate scheduling class specific buffers before 1191 * grabbing p_lock. 1192 */ 1193 bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP); 1194 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1195 ret = CL_ALLOC(buf, cid, KM_SLEEP); 1196 ASSERT(ret == 0); 1197 } 1198 1199 /* 1200 * Move threads one by one to the new scheduling class. 1201 * This never fails because we have all the right 1202 * privileges here. 1203 */ 1204 mutex_enter(&p->p_lock); 1205 ASSERT(p->p_poolflag & PBWAIT); 1206 buf = bufs; 1207 t = p->p_tlist; 1208 ASSERT(t != NULL); 1209 do { 1210 if (t->t_cid != cid) { 1211 oldcid = t->t_cid; 1212 cldata = t->t_cldata; 1213 ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf); 1214 ASSERT(ret == 0); 1215 CL_EXITCLASS(oldcid, cldata); 1216 *buf++ = NULL; 1217 } 1218 } while ((t = t->t_forw) != p->p_tlist); 1219 mutex_exit(&p->p_lock); 1220 /* 1221 * Free unused scheduling class specific buffers. 1222 */ 1223 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1224 if (*buf != NULL) { 1225 CL_FREE(cid, *buf); 1226 *buf = NULL; 1227 } 1228 } 1229 kmem_free(bufs, nlwp * sizeof (void *)); 1230 } 1231 1232 /* 1233 * The meat of the bind operation. The steps in pool_do_bind are: 1234 * 1235 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all 1236 * such processes to an array. For any interesting process that has 1237 * threads inside the pool barrier set, increment a counter by the 1238 * count of such threads. Once PBWAIT is set on a process, that process 1239 * will not disappear. 1240 * 1241 * 2) Wait for the counter from step 2 to drop to zero. Any process which 1242 * calls pool_barrier_exit() and notices that PBWAIT has been set on it 1243 * will decrement that counter before going to sleep, and the process 1244 * calling pool_barrier_exit() which does the final decrement will wake us. 1245 * 1246 * 3) For each interesting process, perform a calculation on it to see if 1247 * the bind will actually succeed. This uses the following three 1248 * resource-set-specific functions: 1249 * 1250 * - int set_bind_start(procs, pool) 1251 * 1252 * Determine whether the given array of processes can be bound to the 1253 * resource set associated with the given pool. If it can, take and hold 1254 * any locks necessary to ensure that the operation will succeed, and 1255 * make any necessary reservations in the target resource set. If it 1256 * can't, return failure with no reservations made and no new locks held. 1257 * 1258 * - void set_bind_abort(procs, pool) 1259 * 1260 * set_bind_start() has completed successfully, but another resource set's 1261 * set_bind_start() has failed, and we haven't begun the bind yet. Undo 1262 * any reservations made and drop any locks acquired by our 1263 * set_bind_start(). 1264 * 1265 * - void set_bind_finish(void) 1266 * 1267 * The bind has completed successfully. The processes have been released, 1268 * and the reservation acquired in set_bind_start() has been depleted as 1269 * the processes have finished their bindings. Drop any locks acquired by 1270 * set_bind_start(). 1271 * 1272 * 4) If we've decided that we can proceed with the bind, iterate through 1273 * the list of interesting processes, grab the necessary locks (which 1274 * may differ per resource set), perform the bind, and ASSERT that it 1275 * succeeds. Once a process has been rebound, it can be awakened. 1276 * 1277 * The operations from step 4 must be kept in sync with anything which might 1278 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and 1279 * are thus located in the same source files as the associated bind operations. 1280 */ 1281 int 1282 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) 1283 { 1284 extern uint_t nproc; 1285 klwp_t *lwp = ttolwp(curthread); 1286 proc_t **pp, **procs; 1287 proc_t *prstart; 1288 int procs_count = 0; 1289 kproject_t *kpj; 1290 procset_t set; 1291 zone_t *zone; 1292 int procs_size; 1293 int rv = 0; 1294 proc_t *p; 1295 id_t cid = -1; 1296 1297 ASSERT(pool_lock_held()); 1298 1299 if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL) 1300 return (EINVAL); 1301 1302 if (idtype == P_ZONEID) { 1303 zone = zone_find_by_id(id); 1304 if (zone == NULL) 1305 return (ESRCH); 1306 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 1307 zone_rele(zone); 1308 return (EBUSY); 1309 } 1310 } 1311 1312 if (idtype == P_PROJID) { 1313 kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND); 1314 if (kpj == NULL) 1315 return (ESRCH); 1316 mutex_enter(&kpj->kpj_poolbind); 1317 } 1318 1319 if (idtype == P_PID) { 1320 /* 1321 * Fast-path for a single process case. 1322 */ 1323 procs_size = 2; /* procs is NULL-terminated */ 1324 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); 1325 mutex_enter(&pidlock); 1326 } else { 1327 /* 1328 * We will need enough slots for proc_t pointers for as many as 1329 * twice the number of currently running processes (assuming 1330 * that each one could be in fork() creating a new child). 1331 */ 1332 for (;;) { 1333 procs_size = nproc * 2; 1334 procs = kmem_zalloc(procs_size * sizeof (proc_t *), 1335 KM_SLEEP); 1336 mutex_enter(&pidlock); 1337 1338 if (nproc * 2 <= procs_size) 1339 break; 1340 /* 1341 * If nproc has changed, try again. 1342 */ 1343 mutex_exit(&pidlock); 1344 kmem_free(procs, procs_size * sizeof (proc_t *)); 1345 } 1346 } 1347 1348 if (id == P_MYID) 1349 id = getmyid(idtype); 1350 setprocset(&set, POP_AND, idtype, id, P_ALL, 0); 1351 1352 /* 1353 * Do a first scan, and select target processes. 1354 */ 1355 if (idtype == P_PID) 1356 prstart = prfind(id); 1357 else 1358 prstart = practive; 1359 for (p = prstart, pp = procs; p != NULL; p = p->p_next) { 1360 mutex_enter(&p->p_lock); 1361 /* 1362 * Skip processes that don't match our (id, idtype) set or 1363 * on the way of becoming zombies. Skip kernel processes 1364 * from the global zone. 1365 */ 1366 if (procinset(p, &set) == 0 || 1367 p->p_poolflag & PEXITED || 1368 ((p->p_flag & SSYS) && INGLOBALZONE(p))) { 1369 mutex_exit(&p->p_lock); 1370 continue; 1371 } 1372 if (!INGLOBALZONE(p)) { 1373 switch (idtype) { 1374 case P_PID: 1375 case P_TASKID: 1376 /* 1377 * Can't bind processes or tasks 1378 * in local zones to pools. 1379 */ 1380 mutex_exit(&p->p_lock); 1381 mutex_exit(&pidlock); 1382 pool_bind_wakeall(procs); 1383 rv = EINVAL; 1384 goto out; 1385 case P_PROJID: 1386 /* 1387 * Only projects in the global 1388 * zone can be rebound. 1389 */ 1390 mutex_exit(&p->p_lock); 1391 continue; 1392 case P_POOLID: 1393 /* 1394 * When rebinding pools, processes can be 1395 * in different zones. 1396 */ 1397 break; 1398 } 1399 } 1400 1401 p->p_poolflag |= PBWAIT; 1402 /* 1403 * If some threads in this process are inside the pool 1404 * barrier, add them to pool_barrier_count, as we have 1405 * to wait for all of them to exit the barrier. 1406 */ 1407 if (p->p_poolcnt > 0) { 1408 mutex_enter(&pool_barrier_lock); 1409 pool_barrier_count += p->p_poolcnt; 1410 mutex_exit(&pool_barrier_lock); 1411 } 1412 ASSERT(pp < &procs[procs_size]); 1413 *pp++ = p; 1414 procs_count++; 1415 mutex_exit(&p->p_lock); 1416 1417 /* 1418 * We just found our process, so if we're only rebinding a 1419 * single process then get out of this loop. 1420 */ 1421 if (idtype == P_PID) 1422 break; 1423 } 1424 *pp = NULL; /* cap off the end of the array */ 1425 mutex_exit(&pidlock); 1426 1427 /* 1428 * Wait for relevant processes to stop before they try to enter the 1429 * barrier or at the exit from the barrier. Make sure that we do 1430 * not get stopped here while we're holding pool_lock. If we were 1431 * requested to stop, or got a signal then return EAGAIN to let the 1432 * library know that it needs to retry. 1433 */ 1434 mutex_enter(&pool_barrier_lock); 1435 lwp->lwp_nostop++; 1436 while (pool_barrier_count > 0) { 1437 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock); 1438 if (pool_barrier_count > 0) { 1439 /* 1440 * We either got a signal or were requested to 1441 * stop by /proc. Bail out with EAGAIN. If we were 1442 * requested to stop, we'll stop in post_syscall() 1443 * on our way back to userland. 1444 */ 1445 mutex_exit(&pool_barrier_lock); 1446 pool_bind_wakeall(procs); 1447 lwp->lwp_nostop--; 1448 rv = EAGAIN; 1449 goto out; 1450 } 1451 } 1452 lwp->lwp_nostop--; 1453 mutex_exit(&pool_barrier_lock); 1454 1455 if (idtype == P_PID) 1456 goto skip; 1457 1458 /* 1459 * Do another run, and drop processes that were inside the barrier 1460 * in exit(), but when they have dropped to pool_barrier_exit 1461 * they have become of no interest to us. Pick up child processes that 1462 * were created by fork() but didn't exist during our first scan. 1463 * Their parents are now stopped at pool_barrier_exit in cfork(). 1464 */ 1465 mutex_enter(&pidlock); 1466 for (pp = procs; (p = *pp) != NULL; pp++) { 1467 if (p->p_poolflag & PEXITED) { 1468 ASSERT(p->p_lwpcnt == 0); 1469 pool_bind_wake(p); 1470 /* flip w/last non-NULL slot */ 1471 *pp = procs[procs_count - 1]; 1472 procs[procs_count - 1] = NULL; 1473 procs_count--; 1474 pp--; /* try this slot again */ 1475 continue; 1476 } 1477 /* 1478 * Look at the child and check if it should be rebound also. 1479 * We're holding pidlock, so it is safe to reference p_child. 1480 */ 1481 if ((p = p->p_child) == NULL) 1482 continue; 1483 1484 mutex_enter(&p->p_lock); 1485 /* 1486 * Skip processes in local zones if we're not binding 1487 * zones to pools (P_ZONEID). Skip kernel processes also. 1488 */ 1489 if ((!INGLOBALZONE(p) && idtype != P_ZONEID) || 1490 p->p_flag & SSYS) { 1491 mutex_exit(&p->p_lock); 1492 continue; 1493 } 1494 1495 /* 1496 * If the child process has been already created by fork(), has 1497 * not exited, and has not been added to the list already, 1498 * then add it now. We will hit this process again (since we 1499 * stick it at the end of the procs list) but it will ignored 1500 * because it will have the PBWAIT flag set. 1501 */ 1502 if (procinset(p, &set) && 1503 !(p->p_poolflag & PEXITED) && 1504 !(p->p_poolflag & PBWAIT)) { 1505 ASSERT(p->p_child == NULL); /* no child of a child */ 1506 procs[procs_count] = p; 1507 procs[procs_count + 1] = NULL; 1508 procs_count++; 1509 p->p_poolflag |= PBWAIT; 1510 } 1511 mutex_exit(&p->p_lock); 1512 } 1513 mutex_exit(&pidlock); 1514 skip: 1515 /* 1516 * If there's no processes to rebind then return ESRCH, unless 1517 * we're associating a pool with new resource set, destroying it, 1518 * or binding a zone to a pool. 1519 */ 1520 if (procs_count == 0) { 1521 if (idtype == P_POOLID || idtype == P_ZONEID) 1522 rv = 0; 1523 else 1524 rv = ESRCH; 1525 goto out; 1526 } 1527 1528 #ifdef DEBUG 1529 /* 1530 * All processes in the array should have PBWAIT set, and none should 1531 * be in the critical section. Even though p_poolflag is protected by 1532 * the p_lock, these assertions should be stable across the dropping of 1533 * p_lock. 1534 */ 1535 for (pp = procs; (p = *pp) != NULL; pp++) { 1536 ASSERT(p->p_poolflag & PBWAIT); 1537 ASSERT(p->p_poolcnt == 0); 1538 ASSERT(procinset(p, &set)); 1539 } 1540 #endif 1541 1542 /* 1543 * Do the check if processor set rebinding is going to succeed or not. 1544 */ 1545 if ((flags & POOL_BIND_PSET) && 1546 (rv = pset_bind_start(procs, pool)) != 0) { 1547 pool_bind_wakeall(procs); 1548 goto out; 1549 } 1550 1551 /* 1552 * At this point, all bind operations should succeed. 1553 */ 1554 for (pp = procs; (p = *pp) != NULL; pp++) { 1555 if (flags & POOL_BIND_PSET) { 1556 psetid_t psetid = pool->pool_pset->pset_id; 1557 void *zonebuf; 1558 void *projbuf; 1559 1560 /* 1561 * Pre-allocate one buffer for FSS (per-project 1562 * buffer for a new pset) in case if this is the 1563 * first thread from its current project getting 1564 * bound to this processor set. 1565 */ 1566 projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ); 1567 zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE); 1568 1569 mutex_enter(&pidlock); 1570 mutex_enter(&p->p_lock); 1571 pool_pset_bind(p, psetid, projbuf, zonebuf); 1572 mutex_exit(&p->p_lock); 1573 mutex_exit(&pidlock); 1574 /* 1575 * Free buffers pre-allocated above if it 1576 * wasn't actually used. 1577 */ 1578 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 1579 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 1580 } 1581 /* 1582 * Now let's change the scheduling class of this 1583 * process if our target pool has it defined. 1584 */ 1585 if (cid != POOL_CLASS_UNSET) 1586 pool_change_class(p, cid); 1587 1588 /* 1589 * It is safe to reference p_pool here without holding 1590 * p_lock because it cannot change underneath of us. 1591 * We're holding pool_lock here, so nobody else can be 1592 * moving this process between pools. If process "p" 1593 * would be exiting, we're guaranteed that it would be blocked 1594 * at pool_barrier_enter() in exit(). Otherwise, it would've 1595 * been skipped by one of our scans of the practive list 1596 * as a process with PEXITED flag set. 1597 */ 1598 if (p->p_pool != pool) { 1599 ASSERT(p->p_pool->pool_ref > 0); 1600 atomic_add_32(&p->p_pool->pool_ref, -1); 1601 p->p_pool = pool; 1602 atomic_add_32(&p->p_pool->pool_ref, 1); 1603 } 1604 /* 1605 * Okay, we've tortured this guy enough. 1606 * Let this poor process go now. 1607 */ 1608 pool_bind_wake(p); 1609 } 1610 if (flags & POOL_BIND_PSET) 1611 pset_bind_finish(); 1612 1613 out: switch (idtype) { 1614 case P_PROJID: 1615 ASSERT(kpj != NULL); 1616 mutex_exit(&kpj->kpj_poolbind); 1617 project_rele(kpj); 1618 break; 1619 case P_ZONEID: 1620 if (rv == 0) { 1621 mutex_enter(&cpu_lock); 1622 zone_pool_set(zone, pool); 1623 mutex_exit(&cpu_lock); 1624 } 1625 zone->zone_pool_mod = gethrtime(); 1626 zone_rele(zone); 1627 break; 1628 } 1629 1630 kmem_free(procs, procs_size * sizeof (proc_t *)); 1631 ASSERT(pool_barrier_count == 0); 1632 return (rv); 1633 } 1634