1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/pool.h> 28 #include <sys/pool_impl.h> 29 #include <sys/pool_pset.h> 30 #include <sys/id_space.h> 31 #include <sys/mutex.h> 32 #include <sys/nvpair.h> 33 #include <sys/cpuvar.h> 34 #include <sys/errno.h> 35 #include <sys/cmn_err.h> 36 #include <sys/systm.h> 37 #include <sys/proc.h> 38 #include <sys/fss.h> 39 #include <sys/class.h> 40 #include <sys/exacct.h> 41 #include <sys/utsname.h> 42 #include <sys/procset.h> 43 #include <sys/atomic.h> 44 #include <sys/zone.h> 45 #include <sys/policy.h> 46 #include <sys/schedctl.h> 47 #include <sys/taskq.h> 48 49 /* 50 * RESOURCE POOLS 51 * 52 * The resource pools facility brings together process-bindable resource into 53 * a common abstraction called a pool. Processor sets and other entities can 54 * be configured, grouped, and labelled such that workload components can be 55 * associated with a subset of a system's total resources. 56 * 57 * When disabled, the pools facility is "invisible". 
All processes belong
 * to the same pool (pool_default), and processor sets can be managed through
 * the old pset() system call.  When enabled, processor sets can only be
 * managed via the pools facility.  New pools can be created and associated
 * with processor sets.  Processes can be bound to pools which have non-empty
 * resource sets.
 *
 * Locking: pool_lock() protects global pools state and must be called
 * before modifying the configuration, or when taking a snapshot of the
 * configuration.  If pool_lock_intr() is used, the operation may be
 * interrupted by a signal or a request.
 *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit().  This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
 *
 * Lock ordering with respect to other locks is as follows:
 *
 *	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
 *
 * Most static and global variables defined in this file are protected
 * by calling pool_lock().
 *
 * The operation that binds tasks and projects to pools is atomic.  That is,
 * either all processes in a given task or a project will be bound to a
 * new pool, or (in case of an error) they will be all left bound to the
 * old pool.  Processes in a given task or a given project can only be bound to
 * different pools if they were rebound individually one by one as single
 * processes.  Threads or LWPs of the same process do not have pool bindings,
 * and are bound to the same resource sets associated with the resource pool
 * of that process.
92 * 93 * The following picture shows one possible pool configuration with three 94 * pools and three processor sets. Note that processor set "foo" is not 95 * associated with any pools and therefore cannot have any processes 96 * bound to it. Two pools (default and foo) are associated with the 97 * same processor set (default). Also, note that processes in Task 2 98 * are bound to different pools. 99 * 100 * 101 * Processor Sets 102 * +---------+ 103 * +--------------+========================>| default | 104 * a| | +---------+ 105 * s| | || 106 * s| | +---------+ 107 * o| | | foo | 108 * c| | +---------+ 109 * i| | || 110 * a| | +---------+ 111 * t| | +------>| bar | 112 * e| | | +---------+ 113 * d| | | 114 * | | | 115 * +---------+ +---------+ +---------+ 116 * Pools | default |======| foo |======| bar | 117 * +---------+ +---------+ +---------+ 118 * @ @ @ @ @ @ 119 * b| | | | | | 120 * o| | | | | | 121 * u| +-----+ | +-------+ | +---+ 122 * n| | | | | | 123 * ....d|........|......|......|.........|.......|.... 124 * : | :: | | | :: | | : 125 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 126 * Processes : | p | :: | p | | p | | p | :: | p |...| p | : 127 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ : 128 * :........::......................::...............: 129 * Task 1 Task 2 Task N 130 * | | | 131 * | | | 132 * | +-----------+ | +-----------+ 133 * +--| Project 1 |--+ | Project N | 134 * +-----------+ +-----------+ 135 * 136 * This is just an illustration of relationships between processes, tasks, 137 * projects, pools, and processor sets. New types of resource sets will be 138 * added in the future. 
139 */ 140 141 pool_t *pool_default; /* default pool which always exists */ 142 int pool_count; /* number of pools created on this system */ 143 int pool_state; /* pools state -- enabled/disabled */ 144 void *pool_buf; /* pre-commit snapshot of the pools state */ 145 size_t pool_bufsz; /* size of pool_buf */ 146 static hrtime_t pool_pool_mod; /* last modification time for pools */ 147 static hrtime_t pool_sys_mod; /* last modification time for system */ 148 static nvlist_t *pool_sys_prop; /* system properties */ 149 static id_space_t *pool_ids; /* pool ID space */ 150 static list_t pool_list; /* doubly-linked list of pools */ 151 static kmutex_t pool_mutex; /* protects pool_busy_* */ 152 static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */ 153 static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */ 154 static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */ 155 static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */ 156 static int pool_barrier_count; /* synch. with pool_barrier_* */ 157 static list_t pool_event_cb_list; /* pool event callbacks */ 158 static boolean_t pool_event_cb_init = B_FALSE; 159 static kmutex_t pool_event_cb_lock; 160 static taskq_t *pool_event_cb_taskq = NULL; 161 162 void pool_event_dispatch(pool_event_t, poolid_t); 163 164 /* 165 * Boot-time pool initialization. 166 */ 167 void 168 pool_init(void) 169 { 170 pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID); 171 172 /* 173 * Initialize default pool. 174 */ 175 pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 176 pool_default->pool_id = POOL_DEFAULT; 177 list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link)); 178 list_insert_head(&pool_list, pool_default); 179 180 /* 181 * Initialize plugins for resource sets. 
182 */ 183 pool_pset_init(); 184 pool_count = 1; 185 p0.p_pool = pool_default; 186 global_zone->zone_pool = pool_default; 187 pool_default->pool_ref = 1; 188 } 189 190 /* 191 * Synchronization routines. 192 * 193 * pool_lock is only called from syscall-level routines (processor_bind(), 194 * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long 195 * periods of time, including across sleeping operations, so we allow its 196 * acquisition to be interruptible. 197 * 198 * The current thread that owns the "lock" is stored in the variable 199 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging. 200 */ 201 void 202 pool_lock(void) 203 { 204 mutex_enter(&pool_mutex); 205 ASSERT(!pool_lock_held()); 206 while (pool_busy_thread != NULL) 207 cv_wait(&pool_busy_cv, &pool_mutex); 208 pool_busy_thread = curthread; 209 mutex_exit(&pool_mutex); 210 } 211 212 int 213 pool_lock_intr(void) 214 { 215 mutex_enter(&pool_mutex); 216 ASSERT(!pool_lock_held()); 217 while (pool_busy_thread != NULL) { 218 if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) { 219 cv_signal(&pool_busy_cv); 220 mutex_exit(&pool_mutex); 221 return (1); 222 } 223 } 224 pool_busy_thread = curthread; 225 mutex_exit(&pool_mutex); 226 return (0); 227 } 228 229 int 230 pool_lock_held(void) 231 { 232 return (pool_busy_thread == curthread); 233 } 234 235 void 236 pool_unlock(void) 237 { 238 mutex_enter(&pool_mutex); 239 ASSERT(pool_lock_held()); 240 pool_busy_thread = NULL; 241 cv_signal(&pool_busy_cv); 242 mutex_exit(&pool_mutex); 243 } 244 245 /* 246 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize 247 * with pool_do_bind(). 248 * 249 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all 250 * operations which modify pool or pset associations. They can be called 251 * while the process is multi-threaded. 
In the common case, when current 252 * process is not being rebound (PBWAIT flag is not set), these functions 253 * will be just incrementing and decrementing reference counts. 254 */ 255 void 256 pool_barrier_enter(void) 257 { 258 proc_t *p = curproc; 259 260 ASSERT(MUTEX_HELD(&p->p_lock)); 261 while (p->p_poolflag & PBWAIT) 262 cv_wait(&p->p_poolcv, &p->p_lock); 263 p->p_poolcnt++; 264 } 265 266 void 267 pool_barrier_exit(void) 268 { 269 proc_t *p = curproc; 270 271 ASSERT(MUTEX_HELD(&p->p_lock)); 272 ASSERT(p->p_poolcnt > 0); 273 p->p_poolcnt--; 274 if (p->p_poolflag & PBWAIT) { 275 mutex_enter(&pool_barrier_lock); 276 ASSERT(pool_barrier_count > 0); 277 pool_barrier_count--; 278 if (pool_barrier_count == 0) 279 cv_signal(&pool_barrier_cv); 280 mutex_exit(&pool_barrier_lock); 281 while (p->p_poolflag & PBWAIT) 282 cv_wait(&p->p_poolcv, &p->p_lock); 283 } 284 } 285 286 /* 287 * Enable pools facility. 288 */ 289 static int 290 pool_enable(void) 291 { 292 int ret; 293 294 ASSERT(pool_lock_held()); 295 ASSERT(pool_count == 1); 296 297 ret = pool_pset_enable(); 298 if (ret != 0) 299 return (ret); 300 (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP); 301 (void) nvlist_add_string(pool_sys_prop, "system.name", 302 "default"); 303 (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); 304 (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); 305 (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); 306 (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", 307 "wt-load"); 308 309 (void) nvlist_alloc(&pool_default->pool_props, 310 NV_UNIQUE_NAME, KM_SLEEP); 311 (void) nvlist_add_string(pool_default->pool_props, 312 "pool.name", "pool_default"); 313 (void) nvlist_add_string(pool_default->pool_props, "pool.comment", ""); 314 (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1); 315 (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1); 316 (void) nvlist_add_int64(pool_default->pool_props, 317 
"pool.importance", 1); 318 (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id", 319 pool_default->pool_id); 320 321 pool_sys_mod = pool_pool_mod = gethrtime(); 322 323 return (ret); 324 } 325 326 /* 327 * Disable pools facility. 328 */ 329 static int 330 pool_disable(void) 331 { 332 int ret; 333 334 ASSERT(pool_lock_held()); 335 336 if (pool_count > 1) /* must destroy all pools first */ 337 return (EBUSY); 338 339 ret = pool_pset_disable(); 340 if (ret != 0) 341 return (ret); 342 if (pool_sys_prop != NULL) { 343 nvlist_free(pool_sys_prop); 344 pool_sys_prop = NULL; 345 } 346 if (pool_default->pool_props != NULL) { 347 nvlist_free(pool_default->pool_props); 348 pool_default->pool_props = NULL; 349 } 350 return (0); 351 } 352 353 pool_t * 354 pool_lookup_pool_by_name(char *name) 355 { 356 pool_t *pool = pool_default; 357 char *p; 358 359 ASSERT(pool_lock_held()); 360 for (pool = list_head(&pool_list); pool; 361 pool = list_next(&pool_list, pool)) { 362 if (nvlist_lookup_string(pool->pool_props, 363 "pool.name", &p) == 0 && strcmp(name, p) == 0) 364 return (pool); 365 } 366 return (NULL); 367 } 368 369 pool_t * 370 pool_lookup_pool_by_id(poolid_t poolid) 371 { 372 pool_t *pool = pool_default; 373 374 ASSERT(pool_lock_held()); 375 for (pool = list_head(&pool_list); pool; 376 pool = list_next(&pool_list, pool)) { 377 if (pool->pool_id == poolid) 378 return (pool); 379 } 380 return (NULL); 381 } 382 383 pool_t * 384 pool_lookup_pool_by_pset(int id) 385 { 386 pool_t *pool = pool_default; 387 psetid_t psetid = (psetid_t)id; 388 389 ASSERT(pool_lock_held()); 390 for (pool = list_head(&pool_list); pool != NULL; 391 pool = list_next(&pool_list, pool)) { 392 if (pool->pool_pset->pset_id == psetid) 393 return (pool); 394 } 395 return (NULL); 396 } 397 398 /* 399 * Create new pool, associate it with default resource sets, and give 400 * it a temporary name. 
401 */ 402 static int 403 pool_pool_create(poolid_t *poolid) 404 { 405 pool_t *pool; 406 char pool_name[40]; 407 408 ASSERT(pool_lock_held()); 409 410 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 411 pool->pool_id = *poolid = id_alloc(pool_ids); 412 pool->pool_pset = pool_pset_default; 413 pool_pset_default->pset_npools++; 414 list_insert_tail(&pool_list, pool); 415 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP); 416 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id); 417 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0); 418 pool_pool_mod = gethrtime(); 419 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld", 420 pool_pool_mod); 421 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name); 422 pool_count++; 423 return (0); 424 } 425 426 struct destroy_zone_arg { 427 pool_t *old; 428 pool_t *new; 429 }; 430 431 /* 432 * Update pool pointers for zones that are currently bound to pool "old" 433 * to be bound to pool "new". 434 */ 435 static int 436 pool_destroy_zone_cb(zone_t *zone, void *arg) 437 { 438 struct destroy_zone_arg *dza = arg; 439 440 ASSERT(pool_lock_held()); 441 ASSERT(MUTEX_HELD(&cpu_lock)); 442 443 if (zone_pool_get(zone) == dza->old) 444 zone_pool_set(zone, dza->new); 445 return (0); 446 } 447 448 /* 449 * Destroy specified pool, and rebind all processes in it 450 * to the default pool. 
451 */ 452 static int 453 pool_pool_destroy(poolid_t poolid) 454 { 455 pool_t *pool; 456 int ret; 457 458 ASSERT(pool_lock_held()); 459 460 if (poolid == POOL_DEFAULT) 461 return (EINVAL); 462 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 463 return (ESRCH); 464 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL); 465 if (ret == 0) { 466 struct destroy_zone_arg dzarg; 467 468 dzarg.old = pool; 469 dzarg.new = pool_default; 470 mutex_enter(&cpu_lock); 471 ret = zone_walk(pool_destroy_zone_cb, &dzarg); 472 mutex_exit(&cpu_lock); 473 ASSERT(ret == 0); 474 ASSERT(pool->pool_ref == 0); 475 (void) nvlist_free(pool->pool_props); 476 id_free(pool_ids, pool->pool_id); 477 pool->pool_pset->pset_npools--; 478 list_remove(&pool_list, pool); 479 pool_count--; 480 pool_pool_mod = gethrtime(); 481 kmem_free(pool, sizeof (pool_t)); 482 } 483 return (ret); 484 } 485 486 /* 487 * Create new pool or resource set. 488 */ 489 int 490 pool_create(int class, int subclass, id_t *id) 491 { 492 int ret; 493 494 ASSERT(pool_lock_held()); 495 if (pool_state == POOL_DISABLED) 496 return (ENOTACTIVE); 497 switch (class) { 498 case PEC_POOL: 499 ret = pool_pool_create((poolid_t *)id); 500 break; 501 case PEC_RES_COMP: 502 switch (subclass) { 503 case PREC_PSET: 504 ret = pool_pset_create((psetid_t *)id); 505 break; 506 default: 507 ret = EINVAL; 508 } 509 break; 510 case PEC_RES_AGG: 511 ret = ENOTSUP; 512 break; 513 default: 514 ret = EINVAL; 515 } 516 return (ret); 517 } 518 519 /* 520 * Destroy an existing pool or resource set. 
521 */ 522 int 523 pool_destroy(int class, int subclass, id_t id) 524 { 525 int ret; 526 527 ASSERT(pool_lock_held()); 528 if (pool_state == POOL_DISABLED) 529 return (ENOTACTIVE); 530 switch (class) { 531 case PEC_POOL: 532 ret = pool_pool_destroy((poolid_t)id); 533 break; 534 case PEC_RES_COMP: 535 switch (subclass) { 536 case PREC_PSET: 537 ret = pool_pset_destroy((psetid_t)id); 538 break; 539 default: 540 ret = EINVAL; 541 } 542 break; 543 case PEC_RES_AGG: 544 ret = ENOTSUP; 545 break; 546 default: 547 ret = EINVAL; 548 } 549 return (ret); 550 } 551 552 /* 553 * Enable or disable pools. 554 */ 555 int 556 pool_status(int status) 557 { 558 int ret = 0; 559 560 ASSERT(pool_lock_held()); 561 562 if (pool_state == status) 563 return (0); 564 switch (status) { 565 case POOL_ENABLED: 566 ret = pool_enable(); 567 if (ret != 0) 568 return (ret); 569 pool_state = POOL_ENABLED; 570 pool_event_dispatch(POOL_E_ENABLE, NULL); 571 break; 572 case POOL_DISABLED: 573 ret = pool_disable(); 574 if (ret != 0) 575 return (ret); 576 pool_state = POOL_DISABLED; 577 pool_event_dispatch(POOL_E_DISABLE, NULL); 578 break; 579 default: 580 ret = EINVAL; 581 } 582 return (ret); 583 } 584 585 /* 586 * Associate pool with resource set. 587 */ 588 int 589 pool_assoc(poolid_t poolid, int idtype, id_t id) 590 { 591 int ret; 592 593 ASSERT(pool_lock_held()); 594 if (pool_state == POOL_DISABLED) 595 return (ENOTACTIVE); 596 switch (idtype) { 597 case PREC_PSET: 598 ret = pool_pset_assoc(poolid, (psetid_t)id); 599 if (ret == 0) 600 pool_event_dispatch(POOL_E_CHANGE, poolid); 601 break; 602 default: 603 ret = EINVAL; 604 } 605 if (ret == 0) 606 pool_pool_mod = gethrtime(); 607 return (ret); 608 } 609 610 /* 611 * Disassociate resource set from pool. 
612 */ 613 int 614 pool_dissoc(poolid_t poolid, int idtype) 615 { 616 int ret; 617 618 ASSERT(pool_lock_held()); 619 if (pool_state == POOL_DISABLED) 620 return (ENOTACTIVE); 621 switch (idtype) { 622 case PREC_PSET: 623 ret = pool_pset_assoc(poolid, PS_NONE); 624 if (ret == 0) 625 pool_event_dispatch(POOL_E_CHANGE, poolid); 626 break; 627 default: 628 ret = EINVAL; 629 } 630 if (ret == 0) 631 pool_pool_mod = gethrtime(); 632 return (ret); 633 } 634 635 /* 636 * Transfer specified quantity of resources between resource sets. 637 */ 638 /*ARGSUSED*/ 639 int 640 pool_transfer(int type, id_t src, id_t dst, uint64_t qty) 641 { 642 int ret = EINVAL; 643 644 return (ret); 645 } 646 647 static poolid_t 648 pool_lookup_id_by_pset(int id) 649 { 650 pool_t *pool = pool_default; 651 psetid_t psetid = (psetid_t)id; 652 653 ASSERT(pool_lock_held()); 654 for (pool = list_head(&pool_list); pool != NULL; 655 pool = list_next(&pool_list, pool)) { 656 if (pool->pool_pset->pset_id == psetid) 657 return (pool->pool_id); 658 } 659 return (POOL_INVALID); 660 } 661 662 /* 663 * Transfer resources specified by their IDs between resource sets. 664 */ 665 int 666 pool_xtransfer(int type, id_t src_pset, id_t dst_pset, uint_t size, id_t *ids) 667 { 668 int ret; 669 poolid_t src_pool, dst_pool; 670 671 ASSERT(pool_lock_held()); 672 if (pool_state == POOL_DISABLED) 673 return (ENOTACTIVE); 674 switch (type) { 675 case PREC_PSET: 676 ret = pool_pset_xtransfer((psetid_t)src_pset, 677 (psetid_t)dst_pset, size, ids); 678 679 if ((src_pool = pool_lookup_id_by_pset(src_pset)) == -1) 680 return (EINVAL); 681 if ((dst_pool = pool_lookup_id_by_pset(dst_pset)) == -1) 682 return (EINVAL); 683 pool_event_dispatch(POOL_E_CHANGE, src_pool); 684 pool_event_dispatch(POOL_E_CHANGE, dst_pool); 685 break; 686 default: 687 ret = EINVAL; 688 } 689 return (ret); 690 } 691 692 /* 693 * Bind processes to pools. 
694 */ 695 int 696 pool_bind(poolid_t poolid, idtype_t idtype, id_t id) 697 { 698 pool_t *pool; 699 700 ASSERT(pool_lock_held()); 701 702 if (pool_state == POOL_DISABLED) 703 return (ENOTACTIVE); 704 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 705 return (ESRCH); 706 707 switch (idtype) { 708 case P_PID: 709 case P_TASKID: 710 case P_PROJID: 711 case P_ZONEID: 712 break; 713 default: 714 return (EINVAL); 715 } 716 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); 717 } 718 719 /* 720 * Query pool binding of the specifed process. 721 */ 722 int 723 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid) 724 { 725 proc_t *p; 726 727 if (idtype != P_PID) 728 return (ENOTSUP); 729 if (id == P_MYID) 730 id = curproc->p_pid; 731 732 ASSERT(pool_lock_held()); 733 734 mutex_enter(&pidlock); 735 if ((p = prfind((pid_t)id)) == NULL) { 736 mutex_exit(&pidlock); 737 return (ESRCH); 738 } 739 mutex_enter(&p->p_lock); 740 /* 741 * In local zones, lie about pool bindings of processes from 742 * the global zone. 
743 */ 744 if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) { 745 pool_t *pool; 746 747 pool = zone_pool_get(curproc->p_zone); 748 *poolid = pool->pool_id; 749 } else { 750 *poolid = p->p_pool->pool_id; 751 } 752 mutex_exit(&p->p_lock); 753 mutex_exit(&pidlock); 754 return (0); 755 } 756 757 static ea_object_t * 758 pool_system_pack(void) 759 { 760 ea_object_t *eo_system; 761 size_t bufsz = 0; 762 char *buf = NULL; 763 764 ASSERT(pool_lock_held()); 765 766 eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM); 767 (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t), 768 EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64); 769 if (INGLOBALZONE(curproc)) 770 (void) ea_attach_item(eo_system, &pool_pool_mod, 771 sizeof (hrtime_t), 772 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 773 else 774 (void) ea_attach_item(eo_system, 775 &curproc->p_zone->zone_pool_mod, 776 sizeof (hrtime_t), 777 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 778 (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t), 779 EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64); 780 (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t), 781 EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64); 782 (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0); 783 (void) ea_attach_item(eo_system, buf, bufsz, 784 EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW); 785 kmem_free(buf, bufsz); 786 return (eo_system); 787 } 788 789 /* 790 * Pack information about pools and attach it to specified exacct group. 
791 */ 792 static int 793 pool_pool_pack(ea_object_t *eo_system) 794 { 795 ea_object_t *eo_pool; 796 pool_t *pool; 797 size_t bufsz; 798 char *buf; 799 pool_t *myzonepool; 800 801 ASSERT(pool_lock_held()); 802 myzonepool = zone_pool_get(curproc->p_zone); 803 for (pool = list_head(&pool_list); pool; 804 pool = list_next(&pool_list, pool)) { 805 if (!INGLOBALZONE(curproc) && myzonepool != pool) 806 continue; 807 bufsz = 0; 808 buf = NULL; 809 eo_pool = ea_alloc_group(EXT_GROUP | 810 EXC_LOCAL | EXD_GROUP_POOL); 811 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t), 812 EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32); 813 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id, 814 sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32); 815 (void) nvlist_pack(pool->pool_props, &buf, &bufsz, 816 NV_ENCODE_NATIVE, 0); 817 (void) ea_attach_item(eo_pool, buf, bufsz, 818 EXC_LOCAL | EXD_POOL_PROP | EXT_RAW); 819 kmem_free(buf, bufsz); 820 (void) ea_attach_to_group(eo_system, eo_pool); 821 } 822 return (0); 823 } 824 825 /* 826 * Pack the whole pool configuration in the specified buffer. 827 */ 828 int 829 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize) 830 { 831 ea_object_t *eo_system; 832 size_t ksize; 833 int ret = 0; 834 835 ASSERT(pool_lock_held()); 836 837 eo_system = pool_system_pack(); /* 1. pack system */ 838 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 839 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 840 ksize = ea_pack_object(eo_system, NULL, 0); 841 if (kbuf == NULL || kbufsz == 0) 842 *asize = ksize; 843 else if (ksize > kbufsz) 844 ret = ENOMEM; 845 else 846 *asize = ea_pack_object(eo_system, kbuf, kbufsz); 847 ea_free_object(eo_system, EUP_ALLOC); 848 return (ret); 849 } 850 851 /* 852 * Start/end the commit transaction. If commit transaction is currently 853 * in progress, then all POOL_QUERY ioctls will return pools configuration 854 * at the beginning of transaction. 
855 */ 856 int 857 pool_commit(int state) 858 { 859 ea_object_t *eo_system; 860 int ret = 0; 861 862 ASSERT(pool_lock_held()); 863 864 if (pool_state == POOL_DISABLED) 865 return (ENOTACTIVE); 866 switch (state) { 867 case 1: 868 /* 869 * Beginning commit transation. 870 */ 871 if (pool_buf != NULL) /* transaction in progress */ 872 return (EBUSY); 873 eo_system = pool_system_pack(); /* 1. pack system */ 874 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 875 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 876 pool_bufsz = ea_pack_object(eo_system, NULL, 0); 877 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP); 878 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz); 879 ea_free_object(eo_system, EUP_ALLOC); 880 break; 881 case 0: 882 /* 883 * Finishing commit transaction. 884 */ 885 if (pool_buf != NULL) { 886 kmem_free(pool_buf, pool_bufsz); 887 pool_buf = NULL; 888 pool_bufsz = 0; 889 } 890 break; 891 default: 892 ret = EINVAL; 893 } 894 return (ret); 895 } 896 897 /* 898 * Check is the specified property is special 899 */ 900 static pool_property_t * 901 pool_property_find(char *name, pool_property_t *list) 902 { 903 pool_property_t *prop; 904 905 for (prop = list; prop->pp_name != NULL; prop++) 906 if (strcmp(prop->pp_name, name) == 0) 907 return (prop); 908 return (NULL); 909 } 910 911 static pool_property_t pool_prop_sys[] = { 912 { "system.name", DATA_TYPE_STRING, PP_RDWR }, 913 { "system.comment", DATA_TYPE_STRING, PP_RDWR }, 914 { "system.version", DATA_TYPE_UINT64, PP_READ }, 915 { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR }, 916 { "system.allocate-method", DATA_TYPE_STRING, 917 PP_RDWR | PP_OPTIONAL }, 918 { "system.poold.log-level", DATA_TYPE_STRING, 919 PP_RDWR | PP_OPTIONAL }, 920 { "system.poold.log-location", DATA_TYPE_STRING, 921 PP_RDWR | PP_OPTIONAL }, 922 { "system.poold.monitor-interval", DATA_TYPE_UINT64, 923 PP_RDWR | PP_OPTIONAL }, 924 { "system.poold.history-file", DATA_TYPE_STRING, 925 PP_RDWR | PP_OPTIONAL }, 
926 { "system.poold.objectives", DATA_TYPE_STRING, 927 PP_RDWR | PP_OPTIONAL }, 928 { NULL, 0, 0 } 929 }; 930 931 static pool_property_t pool_prop_pool[] = { 932 { "pool.sys_id", DATA_TYPE_UINT64, PP_READ }, 933 { "pool.name", DATA_TYPE_STRING, PP_RDWR }, 934 { "pool.default", DATA_TYPE_BYTE, PP_READ }, 935 { "pool.active", DATA_TYPE_BYTE, PP_RDWR }, 936 { "pool.importance", DATA_TYPE_INT64, PP_RDWR }, 937 { "pool.comment", DATA_TYPE_STRING, PP_RDWR }, 938 { "pool.scheduler", DATA_TYPE_STRING, 939 PP_RDWR | PP_OPTIONAL }, 940 { NULL, 0, 0 } 941 }; 942 943 /* 944 * Common routine to put new property on the specified list 945 */ 946 int 947 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props) 948 { 949 pool_property_t *prop; 950 951 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) { 952 /* 953 * No read-only properties or properties with bad types 954 */ 955 if (!(prop->pp_perm & PP_WRITE) || 956 prop->pp_type != nvpair_type(pair)) 957 return (EINVAL); 958 } 959 return (nvlist_add_nvpair(nvlist, pair)); 960 } 961 962 /* 963 * Common routine to remove property from the given list 964 */ 965 int 966 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props) 967 { 968 pool_property_t *prop; 969 970 if ((prop = pool_property_find(name, props)) != NULL) { 971 if (!(prop->pp_perm & PP_OPTIONAL)) 972 return (EINVAL); 973 } 974 return (nvlist_remove_all(nvlist, name)); 975 } 976 977 static int 978 pool_system_propput(nvpair_t *pair) 979 { 980 int ret; 981 982 ASSERT(pool_lock_held()); 983 ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys); 984 if (ret == 0) 985 pool_sys_mod = gethrtime(); 986 return (ret); 987 } 988 989 static int 990 pool_system_proprm(char *name) 991 { 992 int ret; 993 994 ASSERT(pool_lock_held()); 995 ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys); 996 if (ret == 0) 997 pool_sys_mod = gethrtime(); 998 return (ret); 999 } 1000 1001 static int 1002 pool_pool_propput(poolid_t 
poolid, nvpair_t *pair) 1003 { 1004 pool_t *pool; 1005 int ret; 1006 1007 ASSERT(pool_lock_held()); 1008 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 1009 return (ESRCH); 1010 ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool); 1011 if (ret == 0) 1012 pool_pool_mod = gethrtime(); 1013 return (ret); 1014 } 1015 1016 static int 1017 pool_pool_proprm(poolid_t poolid, char *name) 1018 { 1019 int ret; 1020 pool_t *pool; 1021 1022 ASSERT(pool_lock_held()); 1023 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 1024 return (ESRCH); 1025 ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool); 1026 if (ret == 0) 1027 pool_pool_mod = gethrtime(); 1028 return (ret); 1029 } 1030 1031 int 1032 pool_propput(int class, int subclass, id_t id, nvpair_t *pair) 1033 { 1034 int ret; 1035 1036 ASSERT(pool_lock_held()); 1037 if (pool_state == POOL_DISABLED) 1038 return (ENOTACTIVE); 1039 switch (class) { 1040 case PEC_SYSTEM: 1041 ret = pool_system_propput(pair); 1042 break; 1043 case PEC_POOL: 1044 ret = pool_pool_propput((poolid_t)id, pair); 1045 break; 1046 case PEC_RES_COMP: 1047 switch (subclass) { 1048 case PREC_PSET: 1049 ret = pool_pset_propput((psetid_t)id, pair); 1050 break; 1051 default: 1052 ret = EINVAL; 1053 } 1054 break; 1055 case PEC_RES_AGG: 1056 ret = ENOTSUP; 1057 break; 1058 case PEC_COMP: 1059 switch (subclass) { 1060 case PCEC_CPU: 1061 ret = pool_cpu_propput((processorid_t)id, pair); 1062 break; 1063 default: 1064 ret = EINVAL; 1065 } 1066 break; 1067 default: 1068 ret = EINVAL; 1069 } 1070 return (ret); 1071 } 1072 1073 int 1074 pool_proprm(int class, int subclass, id_t id, char *name) 1075 { 1076 int ret; 1077 1078 ASSERT(pool_lock_held()); 1079 if (pool_state == POOL_DISABLED) 1080 return (ENOTACTIVE); 1081 switch (class) { 1082 case PEC_SYSTEM: 1083 ret = pool_system_proprm(name); 1084 break; 1085 case PEC_POOL: 1086 ret = pool_pool_proprm((poolid_t)id, name); 1087 break; 1088 case PEC_RES_COMP: 1089 switch (subclass) { 1090 
case PREC_PSET: 1091 ret = pool_pset_proprm((psetid_t)id, name); 1092 break; 1093 default: 1094 ret = EINVAL; 1095 } 1096 break; 1097 case PEC_RES_AGG: 1098 ret = ENOTSUP; 1099 break; 1100 case PEC_COMP: 1101 switch (subclass) { 1102 case PCEC_CPU: 1103 ret = pool_cpu_proprm((processorid_t)id, name); 1104 break; 1105 default: 1106 ret = EINVAL; 1107 } 1108 break; 1109 default: 1110 ret = EINVAL; 1111 } 1112 return (ret); 1113 } 1114 1115 int 1116 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp) 1117 { 1118 int ret; 1119 nvlist_t *nvl; 1120 1121 ASSERT(pool_lock_held()); 1122 if (pool_state == POOL_DISABLED) 1123 return (ENOTACTIVE); 1124 1125 (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); 1126 1127 switch (class) { 1128 case PEC_SYSTEM: 1129 case PEC_POOL: 1130 ret = EINVAL; 1131 break; 1132 case PEC_RES_COMP: 1133 switch (subclass) { 1134 case PREC_PSET: 1135 ret = pool_pset_propget((psetid_t)id, name, nvl); 1136 break; 1137 default: 1138 ret = EINVAL; 1139 } 1140 break; 1141 case PEC_RES_AGG: 1142 ret = ENOTSUP; 1143 break; 1144 case PEC_COMP: 1145 switch (subclass) { 1146 case PCEC_CPU: 1147 ret = pool_cpu_propget((processorid_t)id, name, nvl); 1148 break; 1149 default: 1150 ret = EINVAL; 1151 } 1152 break; 1153 default: 1154 ret = EINVAL; 1155 } 1156 if (ret == 0) 1157 *nvlp = nvl; 1158 else 1159 nvlist_free(nvl); 1160 return (ret); 1161 } 1162 1163 /* 1164 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs 1165 * in case of failure in pool_do_bind(). 
1166 */ 1167 static void 1168 pool_bind_wake(proc_t *p) 1169 { 1170 ASSERT(pool_lock_held()); 1171 1172 mutex_enter(&p->p_lock); 1173 ASSERT(p->p_poolflag & PBWAIT); 1174 if (p->p_poolcnt > 0) { 1175 mutex_enter(&pool_barrier_lock); 1176 pool_barrier_count -= p->p_poolcnt; 1177 mutex_exit(&pool_barrier_lock); 1178 } 1179 p->p_poolflag &= ~PBWAIT; 1180 cv_signal(&p->p_poolcv); 1181 mutex_exit(&p->p_lock); 1182 } 1183 1184 static void 1185 pool_bind_wakeall(proc_t **procs) 1186 { 1187 proc_t *p, **pp; 1188 1189 ASSERT(pool_lock_held()); 1190 for (pp = procs; (p = *pp) != NULL; pp++) 1191 pool_bind_wake(p); 1192 } 1193 1194 /* 1195 * Return the scheduling class for this pool, or 1196 * POOL_CLASS_UNSET if not set 1197 * POOL_CLASS_INVAL if set to an invalid class ID. 1198 */ 1199 id_t 1200 pool_get_class(pool_t *pool) 1201 { 1202 char *name; 1203 id_t cid; 1204 1205 ASSERT(pool_lock_held()); 1206 1207 if (nvlist_lookup_string(pool->pool_props, "pool.scheduler", 1208 &name) == 0) { 1209 if (getcidbyname(name, &cid) == 0) 1210 return (cid); 1211 else 1212 return (POOL_CLASS_INVAL); 1213 } 1214 return (POOL_CLASS_UNSET); 1215 } 1216 1217 /* 1218 * Move process to the new scheduling class. 1219 */ 1220 static void 1221 pool_change_class(proc_t *p, id_t cid) 1222 { 1223 kthread_t *t; 1224 void *cldata; 1225 id_t oldcid; 1226 void **bufs; 1227 void **buf; 1228 int nlwp; 1229 int ret; 1230 int i; 1231 1232 /* 1233 * Do not move kernel processes (such as zsched). 1234 */ 1235 if (p->p_flag & SSYS) 1236 return; 1237 /* 1238 * This process is in the pool barrier, so it can't possibly be 1239 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1 1240 * (for possible agent LWP which doesn't use pool barrier) as 1241 * our upper bound. 1242 */ 1243 nlwp = p->p_lwpcnt + p->p_zombcnt + 1; 1244 1245 /* 1246 * Pre-allocate scheduling class specific buffers before 1247 * grabbing p_lock. 
1248 */ 1249 bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP); 1250 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1251 ret = CL_ALLOC(buf, cid, KM_SLEEP); 1252 ASSERT(ret == 0); 1253 } 1254 1255 /* 1256 * Move threads one by one to the new scheduling class. 1257 * This never fails because we have all the right 1258 * privileges here. 1259 */ 1260 mutex_enter(&p->p_lock); 1261 ASSERT(p->p_poolflag & PBWAIT); 1262 buf = bufs; 1263 t = p->p_tlist; 1264 ASSERT(t != NULL); 1265 do { 1266 if (t->t_cid != cid) { 1267 oldcid = t->t_cid; 1268 cldata = t->t_cldata; 1269 ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf); 1270 ASSERT(ret == 0); 1271 CL_EXITCLASS(oldcid, cldata); 1272 schedctl_set_cidpri(t); 1273 *buf++ = NULL; 1274 } 1275 } while ((t = t->t_forw) != p->p_tlist); 1276 mutex_exit(&p->p_lock); 1277 /* 1278 * Free unused scheduling class specific buffers. 1279 */ 1280 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1281 if (*buf != NULL) { 1282 CL_FREE(cid, *buf); 1283 *buf = NULL; 1284 } 1285 } 1286 kmem_free(bufs, nlwp * sizeof (void *)); 1287 } 1288 1289 void 1290 pool_get_name(pool_t *pool, char **name) 1291 { 1292 ASSERT(pool_lock_held()); 1293 1294 (void) nvlist_lookup_string(pool->pool_props, "pool.name", name); 1295 1296 ASSERT(strlen(*name) != 0); 1297 } 1298 1299 1300 /* 1301 * The meat of the bind operation. The steps in pool_do_bind are: 1302 * 1303 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all 1304 * such processes to an array. For any interesting process that has 1305 * threads inside the pool barrier set, increment a counter by the 1306 * count of such threads. Once PBWAIT is set on a process, that process 1307 * will not disappear. 1308 * 1309 * 2) Wait for the counter from step 2 to drop to zero. 
Any process which 1310 * calls pool_barrier_exit() and notices that PBWAIT has been set on it 1311 * will decrement that counter before going to sleep, and the process 1312 * calling pool_barrier_exit() which does the final decrement will wake us. 1313 * 1314 * 3) For each interesting process, perform a calculation on it to see if 1315 * the bind will actually succeed. This uses the following three 1316 * resource-set-specific functions: 1317 * 1318 * - int set_bind_start(procs, pool) 1319 * 1320 * Determine whether the given array of processes can be bound to the 1321 * resource set associated with the given pool. If it can, take and hold 1322 * any locks necessary to ensure that the operation will succeed, and 1323 * make any necessary reservations in the target resource set. If it 1324 * can't, return failure with no reservations made and no new locks held. 1325 * 1326 * - void set_bind_abort(procs, pool) 1327 * 1328 * set_bind_start() has completed successfully, but another resource set's 1329 * set_bind_start() has failed, and we haven't begun the bind yet. Undo 1330 * any reservations made and drop any locks acquired by our 1331 * set_bind_start(). 1332 * 1333 * - void set_bind_finish(void) 1334 * 1335 * The bind has completed successfully. The processes have been released, 1336 * and the reservation acquired in set_bind_start() has been depleted as 1337 * the processes have finished their bindings. Drop any locks acquired by 1338 * set_bind_start(). 1339 * 1340 * 4) If we've decided that we can proceed with the bind, iterate through 1341 * the list of interesting processes, grab the necessary locks (which 1342 * may differ per resource set), perform the bind, and ASSERT that it 1343 * succeeds. Once a process has been rebound, it can be awakened. 
1344 * 1345 * The operations from step 4 must be kept in sync with anything which might 1346 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and 1347 * are thus located in the same source files as the associated bind operations. 1348 */ 1349 int 1350 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) 1351 { 1352 extern uint_t nproc; 1353 klwp_t *lwp = ttolwp(curthread); 1354 proc_t **pp, **procs; 1355 proc_t *prstart; 1356 int procs_count = 0; 1357 kproject_t *kpj; 1358 procset_t set; 1359 zone_t *zone; 1360 int procs_size; 1361 int rv = 0; 1362 proc_t *p; 1363 id_t cid = -1; 1364 1365 ASSERT(pool_lock_held()); 1366 1367 if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL) 1368 return (EINVAL); 1369 1370 if (idtype == P_ZONEID) { 1371 zone = zone_find_by_id(id); 1372 if (zone == NULL) 1373 return (ESRCH); 1374 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 1375 zone_rele(zone); 1376 return (EBUSY); 1377 } 1378 } 1379 1380 if (idtype == P_PROJID) { 1381 kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); 1382 if (kpj == NULL) 1383 return (ESRCH); 1384 mutex_enter(&kpj->kpj_poolbind); 1385 } 1386 1387 if (idtype == P_PID) { 1388 /* 1389 * Fast-path for a single process case. 1390 */ 1391 procs_size = 2; /* procs is NULL-terminated */ 1392 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); 1393 mutex_enter(&pidlock); 1394 } else { 1395 /* 1396 * We will need enough slots for proc_t pointers for as many as 1397 * twice the number of currently running processes (assuming 1398 * that each one could be in fork() creating a new child). 1399 */ 1400 for (;;) { 1401 procs_size = nproc * 2; 1402 procs = kmem_zalloc(procs_size * sizeof (proc_t *), 1403 KM_SLEEP); 1404 mutex_enter(&pidlock); 1405 1406 if (nproc * 2 <= procs_size) 1407 break; 1408 /* 1409 * If nproc has changed, try again. 
1410 */ 1411 mutex_exit(&pidlock); 1412 kmem_free(procs, procs_size * sizeof (proc_t *)); 1413 } 1414 } 1415 1416 if (id == P_MYID) 1417 id = getmyid(idtype); 1418 setprocset(&set, POP_AND, idtype, id, P_ALL, 0); 1419 1420 /* 1421 * Do a first scan, and select target processes. 1422 */ 1423 if (idtype == P_PID) 1424 prstart = prfind(id); 1425 else 1426 prstart = practive; 1427 for (p = prstart, pp = procs; p != NULL; p = p->p_next) { 1428 mutex_enter(&p->p_lock); 1429 /* 1430 * Skip processes that don't match our (id, idtype) set or 1431 * on the way of becoming zombies. Skip kernel processes 1432 * from the global zone. 1433 */ 1434 if (procinset(p, &set) == 0 || 1435 p->p_poolflag & PEXITED || 1436 ((p->p_flag & SSYS) && INGLOBALZONE(p))) { 1437 mutex_exit(&p->p_lock); 1438 continue; 1439 } 1440 if (!INGLOBALZONE(p)) { 1441 switch (idtype) { 1442 case P_PID: 1443 case P_TASKID: 1444 /* 1445 * Can't bind processes or tasks 1446 * in local zones to pools. 1447 */ 1448 mutex_exit(&p->p_lock); 1449 mutex_exit(&pidlock); 1450 pool_bind_wakeall(procs); 1451 rv = EINVAL; 1452 goto out; 1453 case P_PROJID: 1454 /* 1455 * Only projects in the global 1456 * zone can be rebound. 1457 */ 1458 mutex_exit(&p->p_lock); 1459 continue; 1460 case P_POOLID: 1461 /* 1462 * When rebinding pools, processes can be 1463 * in different zones. 1464 */ 1465 break; 1466 } 1467 } 1468 1469 p->p_poolflag |= PBWAIT; 1470 /* 1471 * If some threads in this process are inside the pool 1472 * barrier, add them to pool_barrier_count, as we have 1473 * to wait for all of them to exit the barrier. 1474 */ 1475 if (p->p_poolcnt > 0) { 1476 mutex_enter(&pool_barrier_lock); 1477 pool_barrier_count += p->p_poolcnt; 1478 mutex_exit(&pool_barrier_lock); 1479 } 1480 ASSERT(pp < &procs[procs_size]); 1481 *pp++ = p; 1482 procs_count++; 1483 mutex_exit(&p->p_lock); 1484 1485 /* 1486 * We just found our process, so if we're only rebinding a 1487 * single process then get out of this loop. 
1488 */ 1489 if (idtype == P_PID) 1490 break; 1491 } 1492 *pp = NULL; /* cap off the end of the array */ 1493 mutex_exit(&pidlock); 1494 1495 /* 1496 * Wait for relevant processes to stop before they try to enter the 1497 * barrier or at the exit from the barrier. Make sure that we do 1498 * not get stopped here while we're holding pool_lock. If we were 1499 * requested to stop, or got a signal then return EAGAIN to let the 1500 * library know that it needs to retry. 1501 */ 1502 mutex_enter(&pool_barrier_lock); 1503 lwp->lwp_nostop++; 1504 while (pool_barrier_count > 0) { 1505 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock); 1506 if (pool_barrier_count > 0) { 1507 /* 1508 * We either got a signal or were requested to 1509 * stop by /proc. Bail out with EAGAIN. If we were 1510 * requested to stop, we'll stop in post_syscall() 1511 * on our way back to userland. 1512 */ 1513 mutex_exit(&pool_barrier_lock); 1514 pool_bind_wakeall(procs); 1515 lwp->lwp_nostop--; 1516 rv = EAGAIN; 1517 goto out; 1518 } 1519 } 1520 lwp->lwp_nostop--; 1521 mutex_exit(&pool_barrier_lock); 1522 1523 if (idtype == P_PID) { 1524 if ((p = *procs) == NULL) 1525 goto skip; 1526 mutex_enter(&p->p_lock); 1527 /* Drop the process if it is exiting */ 1528 if (p->p_poolflag & PEXITED) { 1529 mutex_exit(&p->p_lock); 1530 pool_bind_wake(p); 1531 procs_count--; 1532 } else 1533 mutex_exit(&p->p_lock); 1534 goto skip; 1535 } 1536 1537 /* 1538 * Do another run, and drop processes that were inside the barrier 1539 * in exit(), but when they have dropped to pool_barrier_exit 1540 * they have become of no interest to us. Pick up child processes that 1541 * were created by fork() but didn't exist during our first scan. 1542 * Their parents are now stopped at pool_barrier_exit in cfork(). 
1543 */ 1544 mutex_enter(&pidlock); 1545 for (pp = procs; (p = *pp) != NULL; pp++) { 1546 mutex_enter(&p->p_lock); 1547 if (p->p_poolflag & PEXITED) { 1548 ASSERT(p->p_lwpcnt == 0); 1549 mutex_exit(&p->p_lock); 1550 pool_bind_wake(p); 1551 /* flip w/last non-NULL slot */ 1552 *pp = procs[procs_count - 1]; 1553 procs[procs_count - 1] = NULL; 1554 procs_count--; 1555 pp--; /* try this slot again */ 1556 continue; 1557 } else 1558 mutex_exit(&p->p_lock); 1559 /* 1560 * Look at the child and check if it should be rebound also. 1561 * We're holding pidlock, so it is safe to reference p_child. 1562 */ 1563 if ((p = p->p_child) == NULL) 1564 continue; 1565 1566 mutex_enter(&p->p_lock); 1567 1568 /* 1569 * Skip system processes and make sure that the child is in 1570 * the same task/project/pool/zone as the parent. 1571 */ 1572 if ((!INGLOBALZONE(p) && idtype != P_ZONEID && 1573 idtype != P_POOLID) || p->p_flag & SSYS) { 1574 mutex_exit(&p->p_lock); 1575 continue; 1576 } 1577 1578 /* 1579 * If the child process has been already created by fork(), has 1580 * not exited, and has not been added to the list already, 1581 * then add it now. We will hit this process again (since we 1582 * stick it at the end of the procs list) but it will ignored 1583 * because it will have the PBWAIT flag set. 1584 */ 1585 if (procinset(p, &set) && 1586 !(p->p_poolflag & PEXITED) && 1587 !(p->p_poolflag & PBWAIT)) { 1588 ASSERT(p->p_child == NULL); /* no child of a child */ 1589 procs[procs_count] = p; 1590 procs[procs_count + 1] = NULL; 1591 procs_count++; 1592 p->p_poolflag |= PBWAIT; 1593 } 1594 mutex_exit(&p->p_lock); 1595 } 1596 mutex_exit(&pidlock); 1597 skip: 1598 /* 1599 * If there's no processes to rebind then return ESRCH, unless 1600 * we're associating a pool with new resource set, destroying it, 1601 * or binding a zone to a pool. 
1602 */ 1603 if (procs_count == 0) { 1604 if (idtype == P_POOLID || idtype == P_ZONEID) 1605 rv = 0; 1606 else 1607 rv = ESRCH; 1608 goto out; 1609 } 1610 1611 #ifdef DEBUG 1612 /* 1613 * All processes in the array should have PBWAIT set, and none 1614 * should be in the critical section. Thus, although p_poolflag 1615 * and p_poolcnt are protected by p_lock, their ASSERTions below 1616 * should be stable without it. procinset(), however, ASSERTs that 1617 * the p_lock is held upon entry. 1618 */ 1619 for (pp = procs; (p = *pp) != NULL; pp++) { 1620 int in_set; 1621 1622 mutex_enter(&p->p_lock); 1623 in_set = procinset(p, &set); 1624 mutex_exit(&p->p_lock); 1625 1626 ASSERT(in_set); 1627 ASSERT(p->p_poolflag & PBWAIT); 1628 ASSERT(p->p_poolcnt == 0); 1629 } 1630 #endif 1631 1632 /* 1633 * Do the check if processor set rebinding is going to succeed or not. 1634 */ 1635 if ((flags & POOL_BIND_PSET) && 1636 (rv = pset_bind_start(procs, pool)) != 0) { 1637 pool_bind_wakeall(procs); 1638 goto out; 1639 } 1640 1641 /* 1642 * At this point, all bind operations should succeed. 1643 */ 1644 for (pp = procs; (p = *pp) != NULL; pp++) { 1645 if (flags & POOL_BIND_PSET) { 1646 psetid_t psetid = pool->pool_pset->pset_id; 1647 void *zonebuf; 1648 void *projbuf; 1649 1650 /* 1651 * Pre-allocate one buffer for FSS (per-project 1652 * buffer for a new pset) in case if this is the 1653 * first thread from its current project getting 1654 * bound to this processor set. 1655 */ 1656 projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ); 1657 zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE); 1658 1659 mutex_enter(&pidlock); 1660 mutex_enter(&p->p_lock); 1661 pool_pset_bind(p, psetid, projbuf, zonebuf); 1662 mutex_exit(&p->p_lock); 1663 mutex_exit(&pidlock); 1664 /* 1665 * Free buffers pre-allocated above if it 1666 * wasn't actually used. 
1667 */ 1668 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 1669 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 1670 } 1671 /* 1672 * Now let's change the scheduling class of this 1673 * process if our target pool has it defined. 1674 */ 1675 if (cid != POOL_CLASS_UNSET) 1676 pool_change_class(p, cid); 1677 1678 /* 1679 * It is safe to reference p_pool here without holding 1680 * p_lock because it cannot change underneath of us. 1681 * We're holding pool_lock here, so nobody else can be 1682 * moving this process between pools. If process "p" 1683 * would be exiting, we're guaranteed that it would be blocked 1684 * at pool_barrier_enter() in exit(). Otherwise, it would've 1685 * been skipped by one of our scans of the practive list 1686 * as a process with PEXITED flag set. 1687 */ 1688 if (p->p_pool != pool) { 1689 ASSERT(p->p_pool->pool_ref > 0); 1690 atomic_add_32(&p->p_pool->pool_ref, -1); 1691 p->p_pool = pool; 1692 atomic_add_32(&p->p_pool->pool_ref, 1); 1693 } 1694 /* 1695 * Okay, we've tortured this guy enough. 1696 * Let this poor process go now. 
1697 */ 1698 pool_bind_wake(p); 1699 } 1700 if (flags & POOL_BIND_PSET) 1701 pset_bind_finish(); 1702 1703 out: switch (idtype) { 1704 case P_PROJID: 1705 ASSERT(kpj != NULL); 1706 mutex_exit(&kpj->kpj_poolbind); 1707 project_rele(kpj); 1708 break; 1709 case P_ZONEID: 1710 if (rv == 0) { 1711 mutex_enter(&cpu_lock); 1712 zone_pool_set(zone, pool); 1713 mutex_exit(&cpu_lock); 1714 } 1715 zone->zone_pool_mod = gethrtime(); 1716 zone_rele(zone); 1717 break; 1718 } 1719 1720 kmem_free(procs, procs_size * sizeof (proc_t *)); 1721 ASSERT(pool_barrier_count == 0); 1722 return (rv); 1723 } 1724 1725 void 1726 pool_event_cb_register(pool_event_cb_t *cb) 1727 { 1728 ASSERT(!pool_lock_held() || panicstr); 1729 ASSERT(cb->pec_func != NULL); 1730 1731 mutex_enter(&pool_event_cb_lock); 1732 if (!pool_event_cb_init) { 1733 list_create(&pool_event_cb_list, sizeof (pool_event_cb_t), 1734 offsetof(pool_event_cb_t, pec_list)); 1735 pool_event_cb_init = B_TRUE; 1736 } 1737 list_insert_tail(&pool_event_cb_list, cb); 1738 mutex_exit(&pool_event_cb_lock); 1739 } 1740 1741 void 1742 pool_event_cb_unregister(pool_event_cb_t *cb) 1743 { 1744 ASSERT(!pool_lock_held() || panicstr); 1745 1746 mutex_enter(&pool_event_cb_lock); 1747 list_remove(&pool_event_cb_list, cb); 1748 mutex_exit(&pool_event_cb_lock); 1749 } 1750 1751 typedef struct { 1752 pool_event_t tqd_what; 1753 poolid_t tqd_id; 1754 } pool_tqd_t; 1755 1756 void 1757 pool_event_notify(void *arg) 1758 { 1759 pool_tqd_t *tqd = (pool_tqd_t *)arg; 1760 pool_event_cb_t *cb; 1761 1762 ASSERT(!pool_lock_held() || panicstr); 1763 1764 mutex_enter(&pool_event_cb_lock); 1765 for (cb = list_head(&pool_event_cb_list); cb != NULL; 1766 cb = list_next(&pool_event_cb_list, cb)) { 1767 cb->pec_func(tqd->tqd_what, tqd->tqd_id, cb->pec_arg); 1768 } 1769 mutex_exit(&pool_event_cb_lock); 1770 kmem_free(tqd, sizeof (*tqd)); 1771 } 1772 1773 void 1774 pool_event_dispatch(pool_event_t what, poolid_t id) 1775 { 1776 pool_tqd_t *tqd = NULL; 1777 1778 
ASSERT(pool_lock_held()); 1779 1780 if (pool_event_cb_taskq == NULL) { 1781 pool_event_cb_taskq = taskq_create("pool_event_cb_taskq", 1, 1782 -1, 1, 1, TASKQ_PREPOPULATE); 1783 } 1784 1785 tqd = kmem_alloc(sizeof (*tqd), KM_SLEEP); 1786 tqd->tqd_what = what; 1787 tqd->tqd_id = id; 1788 1789 (void) taskq_dispatch(pool_event_cb_taskq, pool_event_notify, tqd, 1790 KM_SLEEP); 1791 } 1792