1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/pool.h> 27 #include <sys/pool_impl.h> 28 #include <sys/pool_pset.h> 29 #include <sys/id_space.h> 30 #include <sys/mutex.h> 31 #include <sys/nvpair.h> 32 #include <sys/cpuvar.h> 33 #include <sys/errno.h> 34 #include <sys/cmn_err.h> 35 #include <sys/systm.h> 36 #include <sys/proc.h> 37 #include <sys/fss.h> 38 #include <sys/class.h> 39 #include <sys/exacct.h> 40 #include <sys/utsname.h> 41 #include <sys/procset.h> 42 #include <sys/atomic.h> 43 #include <sys/zone.h> 44 #include <sys/policy.h> 45 #include <sys/schedctl.h> 46 #include <sys/taskq.h> 47 48 /* 49 * RESOURCE POOLS 50 * 51 * The resource pools facility brings together process-bindable resource into 52 * a common abstraction called a pool. Processor sets and other entities can 53 * be configured, grouped, and labelled such that workload components can be 54 * associated with a subset of a system's total resources. 55 * 56 * When disabled, the pools facility is "invisible". 
All processes belong
 * to the same pool (pool_default), and processor sets can be managed through
 * the old pset() system call.  When enabled, processor sets can only be
 * managed via the pools facility.  New pools can be created and associated
 * with processor sets.  Processes can be bound to pools which have non-empty
 * resource sets.
 *
 * Locking: pool_lock() protects global pools state and must be called
 * before modifying the configuration, or when taking a snapshot of the
 * configuration.  If pool_lock_intr() is used, the operation may be
 * interrupted by a signal or a request.
 *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit().  This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
 *
 * Lock ordering with respect to other locks is as follows:
 *
 *	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
 *
 * Most static and global variables defined in this file are protected
 * by calling pool_lock().
 *
 * The operation that binds tasks and projects to pools is atomic.  That is,
 * either all processes in a given task or a project will be bound to a
 * new pool, or (in case of an error) they will be all left bound to the
 * old pool.  Processes in a given task or a given project can only be bound to
 * different pools if they were rebound individually one by one as single
 * processes.  Threads or LWPs of the same process do not have pool bindings,
 * and are bound to the same resource sets associated with the resource pool
 * of that process.
 *
 * The following picture shows one possible pool configuration with three
 * pools and three processor sets.  Note that processor set "foo" is not
 * associated with any pools and therefore cannot have any processes
 * bound to it.  Two pools (default and foo) are associated with the
 * same processor set (default).  Also, note that processes in Task 2
 * are bound to different pools.
 *
 *
 *                                                           Processor Sets
 *                                                             +---------+
 *                    +--------------+========================>| default |
 *                   a|              |                         +---------+
 *                   s|              |                             ||
 *                   s|              |                         +---------+
 *                   o|              |                         |   foo   |
 *                   c|              |                         +---------+
 *                   i|              |                             ||
 *                   a|              |                         +---------+
 *                   t|              |                 +------>|   bar   |
 *                   e|              |                 |       +---------+
 *                   d|              |                 |
 *                    |              |                 |
 *               +---------+      +---------+      +---------+
 *     Pools     | default |======|   foo   |======|   bar   |
 *               +---------+      +---------+      +---------+
 *                   @  @            @              @ @   @
 *                  b|  |            |              | |   |
 *                  o|  |            |              | |   |
 *                  u|  +-----+      |      +-------+ |   +---+
 *                  n|        |      |      |         |       |
 *              ....d|........|......|......|.........|.......|....
 *              :    |   ::   |      |      |    ::   |       |   :
 *              :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 *   Processes  :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
 *              :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 *              :........::......................::...............:
 *                Task 1           Task 2              Task N
 *                  |                 |                   |
 *                  |                 |                   |
 *                  |  +-----------+  |             +-----------+
 *                  +--| Project 1 |--+             | Project N |
 *                     +-----------+                +-----------+
 *
 * This is just an illustration of relationships between processes, tasks,
 * projects, pools, and processor sets.  New types of resource sets will be
 * added in the future.
138 */ 139 140 pool_t *pool_default; /* default pool which always exists */ 141 int pool_count; /* number of pools created on this system */ 142 int pool_state; /* pools state -- enabled/disabled */ 143 void *pool_buf; /* pre-commit snapshot of the pools state */ 144 size_t pool_bufsz; /* size of pool_buf */ 145 static hrtime_t pool_pool_mod; /* last modification time for pools */ 146 static hrtime_t pool_sys_mod; /* last modification time for system */ 147 static nvlist_t *pool_sys_prop; /* system properties */ 148 static id_space_t *pool_ids; /* pool ID space */ 149 static list_t pool_list; /* doubly-linked list of pools */ 150 static kmutex_t pool_mutex; /* protects pool_busy_* */ 151 static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */ 152 static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */ 153 static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */ 154 static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */ 155 static int pool_barrier_count; /* synch. with pool_barrier_* */ 156 static list_t pool_event_cb_list; /* pool event callbacks */ 157 static boolean_t pool_event_cb_init = B_FALSE; 158 static kmutex_t pool_event_cb_lock; 159 static taskq_t *pool_event_cb_taskq = NULL; 160 161 void pool_event_dispatch(pool_event_t, poolid_t); 162 163 /* 164 * Boot-time pool initialization. 165 */ 166 void 167 pool_init(void) 168 { 169 pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID); 170 171 /* 172 * Initialize default pool. 173 */ 174 pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 175 pool_default->pool_id = POOL_DEFAULT; 176 list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link)); 177 list_insert_head(&pool_list, pool_default); 178 179 /* 180 * Initialize plugins for resource sets. 
181 */ 182 pool_pset_init(); 183 pool_count = 1; 184 p0.p_pool = pool_default; 185 global_zone->zone_pool = pool_default; 186 pool_default->pool_ref = 1; 187 } 188 189 /* 190 * Synchronization routines. 191 * 192 * pool_lock is only called from syscall-level routines (processor_bind(), 193 * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long 194 * periods of time, including across sleeping operations, so we allow its 195 * acquisition to be interruptible. 196 * 197 * The current thread that owns the "lock" is stored in the variable 198 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging. 199 */ 200 void 201 pool_lock(void) 202 { 203 mutex_enter(&pool_mutex); 204 ASSERT(!pool_lock_held()); 205 while (pool_busy_thread != NULL) 206 cv_wait(&pool_busy_cv, &pool_mutex); 207 pool_busy_thread = curthread; 208 mutex_exit(&pool_mutex); 209 } 210 211 int 212 pool_lock_intr(void) 213 { 214 mutex_enter(&pool_mutex); 215 ASSERT(!pool_lock_held()); 216 while (pool_busy_thread != NULL) { 217 if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) { 218 cv_signal(&pool_busy_cv); 219 mutex_exit(&pool_mutex); 220 return (1); 221 } 222 } 223 pool_busy_thread = curthread; 224 mutex_exit(&pool_mutex); 225 return (0); 226 } 227 228 int 229 pool_lock_held(void) 230 { 231 return (pool_busy_thread == curthread); 232 } 233 234 void 235 pool_unlock(void) 236 { 237 mutex_enter(&pool_mutex); 238 ASSERT(pool_lock_held()); 239 pool_busy_thread = NULL; 240 cv_signal(&pool_busy_cv); 241 mutex_exit(&pool_mutex); 242 } 243 244 /* 245 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize 246 * with pool_do_bind(). 247 * 248 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all 249 * operations which modify pool or pset associations. They can be called 250 * while the process is multi-threaded. 
In the common case, when current 251 * process is not being rebound (PBWAIT flag is not set), these functions 252 * will be just incrementing and decrementing reference counts. 253 */ 254 void 255 pool_barrier_enter(void) 256 { 257 proc_t *p = curproc; 258 259 ASSERT(MUTEX_HELD(&p->p_lock)); 260 while (p->p_poolflag & PBWAIT) 261 cv_wait(&p->p_poolcv, &p->p_lock); 262 p->p_poolcnt++; 263 } 264 265 void 266 pool_barrier_exit(void) 267 { 268 proc_t *p = curproc; 269 270 ASSERT(MUTEX_HELD(&p->p_lock)); 271 ASSERT(p->p_poolcnt > 0); 272 p->p_poolcnt--; 273 if (p->p_poolflag & PBWAIT) { 274 mutex_enter(&pool_barrier_lock); 275 ASSERT(pool_barrier_count > 0); 276 pool_barrier_count--; 277 if (pool_barrier_count == 0) 278 cv_signal(&pool_barrier_cv); 279 mutex_exit(&pool_barrier_lock); 280 while (p->p_poolflag & PBWAIT) 281 cv_wait(&p->p_poolcv, &p->p_lock); 282 } 283 } 284 285 /* 286 * Enable pools facility. 287 */ 288 static int 289 pool_enable(void) 290 { 291 int ret; 292 293 ASSERT(pool_lock_held()); 294 ASSERT(pool_count == 1); 295 296 ret = pool_pset_enable(); 297 if (ret != 0) 298 return (ret); 299 (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP); 300 (void) nvlist_add_string(pool_sys_prop, "system.name", 301 "default"); 302 (void) nvlist_add_string(pool_sys_prop, "system.comment", ""); 303 (void) nvlist_add_int64(pool_sys_prop, "system.version", 1); 304 (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1); 305 (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives", 306 "wt-load"); 307 308 (void) nvlist_alloc(&pool_default->pool_props, 309 NV_UNIQUE_NAME, KM_SLEEP); 310 (void) nvlist_add_string(pool_default->pool_props, 311 "pool.name", "pool_default"); 312 (void) nvlist_add_string(pool_default->pool_props, "pool.comment", ""); 313 (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1); 314 (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1); 315 (void) nvlist_add_int64(pool_default->pool_props, 316 
"pool.importance", 1); 317 (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id", 318 pool_default->pool_id); 319 320 pool_sys_mod = pool_pool_mod = gethrtime(); 321 322 return (ret); 323 } 324 325 /* 326 * Disable pools facility. 327 */ 328 static int 329 pool_disable(void) 330 { 331 int ret; 332 333 ASSERT(pool_lock_held()); 334 335 if (pool_count > 1) /* must destroy all pools first */ 336 return (EBUSY); 337 338 ret = pool_pset_disable(); 339 if (ret != 0) 340 return (ret); 341 if (pool_sys_prop != NULL) { 342 nvlist_free(pool_sys_prop); 343 pool_sys_prop = NULL; 344 } 345 if (pool_default->pool_props != NULL) { 346 nvlist_free(pool_default->pool_props); 347 pool_default->pool_props = NULL; 348 } 349 return (0); 350 } 351 352 pool_t * 353 pool_lookup_pool_by_name(char *name) 354 { 355 pool_t *pool = pool_default; 356 char *p; 357 358 ASSERT(pool_lock_held()); 359 for (pool = list_head(&pool_list); pool; 360 pool = list_next(&pool_list, pool)) { 361 if (nvlist_lookup_string(pool->pool_props, 362 "pool.name", &p) == 0 && strcmp(name, p) == 0) 363 return (pool); 364 } 365 return (NULL); 366 } 367 368 pool_t * 369 pool_lookup_pool_by_id(poolid_t poolid) 370 { 371 pool_t *pool = pool_default; 372 373 ASSERT(pool_lock_held()); 374 for (pool = list_head(&pool_list); pool; 375 pool = list_next(&pool_list, pool)) { 376 if (pool->pool_id == poolid) 377 return (pool); 378 } 379 return (NULL); 380 } 381 382 pool_t * 383 pool_lookup_pool_by_pset(int id) 384 { 385 pool_t *pool = pool_default; 386 psetid_t psetid = (psetid_t)id; 387 388 ASSERT(pool_lock_held()); 389 for (pool = list_head(&pool_list); pool != NULL; 390 pool = list_next(&pool_list, pool)) { 391 if (pool->pool_pset->pset_id == psetid) 392 return (pool); 393 } 394 return (NULL); 395 } 396 397 /* 398 * Create new pool, associate it with default resource sets, and give 399 * it a temporary name. 
400 */ 401 static int 402 pool_pool_create(poolid_t *poolid) 403 { 404 pool_t *pool; 405 char pool_name[40]; 406 407 ASSERT(pool_lock_held()); 408 409 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP); 410 pool->pool_id = *poolid = id_alloc(pool_ids); 411 pool->pool_pset = pool_pset_default; 412 pool_pset_default->pset_npools++; 413 list_insert_tail(&pool_list, pool); 414 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP); 415 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id); 416 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0); 417 pool_pool_mod = gethrtime(); 418 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld", 419 pool_pool_mod); 420 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name); 421 pool_count++; 422 return (0); 423 } 424 425 struct destroy_zone_arg { 426 pool_t *old; 427 pool_t *new; 428 }; 429 430 /* 431 * Update pool pointers for zones that are currently bound to pool "old" 432 * to be bound to pool "new". 433 */ 434 static int 435 pool_destroy_zone_cb(zone_t *zone, void *arg) 436 { 437 struct destroy_zone_arg *dza = arg; 438 439 ASSERT(pool_lock_held()); 440 ASSERT(MUTEX_HELD(&cpu_lock)); 441 442 if (zone_pool_get(zone) == dza->old) 443 zone_pool_set(zone, dza->new); 444 return (0); 445 } 446 447 /* 448 * Destroy specified pool, and rebind all processes in it 449 * to the default pool. 
450 */ 451 static int 452 pool_pool_destroy(poolid_t poolid) 453 { 454 pool_t *pool; 455 int ret; 456 457 ASSERT(pool_lock_held()); 458 459 if (poolid == POOL_DEFAULT) 460 return (EINVAL); 461 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 462 return (ESRCH); 463 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL); 464 if (ret == 0) { 465 struct destroy_zone_arg dzarg; 466 467 dzarg.old = pool; 468 dzarg.new = pool_default; 469 mutex_enter(&cpu_lock); 470 ret = zone_walk(pool_destroy_zone_cb, &dzarg); 471 mutex_exit(&cpu_lock); 472 ASSERT(ret == 0); 473 ASSERT(pool->pool_ref == 0); 474 (void) nvlist_free(pool->pool_props); 475 id_free(pool_ids, pool->pool_id); 476 pool->pool_pset->pset_npools--; 477 list_remove(&pool_list, pool); 478 pool_count--; 479 pool_pool_mod = gethrtime(); 480 kmem_free(pool, sizeof (pool_t)); 481 } 482 return (ret); 483 } 484 485 /* 486 * Create new pool or resource set. 487 */ 488 int 489 pool_create(int class, int subclass, id_t *id) 490 { 491 int ret; 492 493 ASSERT(pool_lock_held()); 494 if (pool_state == POOL_DISABLED) 495 return (ENOTACTIVE); 496 switch (class) { 497 case PEC_POOL: 498 ret = pool_pool_create((poolid_t *)id); 499 break; 500 case PEC_RES_COMP: 501 switch (subclass) { 502 case PREC_PSET: 503 ret = pool_pset_create((psetid_t *)id); 504 break; 505 default: 506 ret = EINVAL; 507 } 508 break; 509 case PEC_RES_AGG: 510 ret = ENOTSUP; 511 break; 512 default: 513 ret = EINVAL; 514 } 515 return (ret); 516 } 517 518 /* 519 * Destroy an existing pool or resource set. 
520 */ 521 int 522 pool_destroy(int class, int subclass, id_t id) 523 { 524 int ret; 525 526 ASSERT(pool_lock_held()); 527 if (pool_state == POOL_DISABLED) 528 return (ENOTACTIVE); 529 switch (class) { 530 case PEC_POOL: 531 ret = pool_pool_destroy((poolid_t)id); 532 break; 533 case PEC_RES_COMP: 534 switch (subclass) { 535 case PREC_PSET: 536 ret = pool_pset_destroy((psetid_t)id); 537 break; 538 default: 539 ret = EINVAL; 540 } 541 break; 542 case PEC_RES_AGG: 543 ret = ENOTSUP; 544 break; 545 default: 546 ret = EINVAL; 547 } 548 return (ret); 549 } 550 551 /* 552 * Enable or disable pools. 553 */ 554 int 555 pool_status(int status) 556 { 557 int ret = 0; 558 559 ASSERT(pool_lock_held()); 560 561 if (pool_state == status) 562 return (0); 563 switch (status) { 564 case POOL_ENABLED: 565 ret = pool_enable(); 566 if (ret != 0) 567 return (ret); 568 pool_state = POOL_ENABLED; 569 pool_event_dispatch(POOL_E_ENABLE, NULL); 570 break; 571 case POOL_DISABLED: 572 ret = pool_disable(); 573 if (ret != 0) 574 return (ret); 575 pool_state = POOL_DISABLED; 576 pool_event_dispatch(POOL_E_DISABLE, NULL); 577 break; 578 default: 579 ret = EINVAL; 580 } 581 return (ret); 582 } 583 584 /* 585 * Associate pool with resource set. 586 */ 587 int 588 pool_assoc(poolid_t poolid, int idtype, id_t id) 589 { 590 int ret; 591 592 ASSERT(pool_lock_held()); 593 if (pool_state == POOL_DISABLED) 594 return (ENOTACTIVE); 595 switch (idtype) { 596 case PREC_PSET: 597 ret = pool_pset_assoc(poolid, (psetid_t)id); 598 if (ret == 0) 599 pool_event_dispatch(POOL_E_CHANGE, poolid); 600 break; 601 default: 602 ret = EINVAL; 603 } 604 if (ret == 0) 605 pool_pool_mod = gethrtime(); 606 return (ret); 607 } 608 609 /* 610 * Disassociate resource set from pool. 
611 */ 612 int 613 pool_dissoc(poolid_t poolid, int idtype) 614 { 615 int ret; 616 617 ASSERT(pool_lock_held()); 618 if (pool_state == POOL_DISABLED) 619 return (ENOTACTIVE); 620 switch (idtype) { 621 case PREC_PSET: 622 ret = pool_pset_assoc(poolid, PS_NONE); 623 if (ret == 0) 624 pool_event_dispatch(POOL_E_CHANGE, poolid); 625 break; 626 default: 627 ret = EINVAL; 628 } 629 if (ret == 0) 630 pool_pool_mod = gethrtime(); 631 return (ret); 632 } 633 634 /* 635 * Transfer specified quantity of resources between resource sets. 636 */ 637 /*ARGSUSED*/ 638 int 639 pool_transfer(int type, id_t src, id_t dst, uint64_t qty) 640 { 641 int ret = EINVAL; 642 643 return (ret); 644 } 645 646 static poolid_t 647 pool_lookup_id_by_pset(int id) 648 { 649 pool_t *pool = pool_default; 650 psetid_t psetid = (psetid_t)id; 651 652 ASSERT(pool_lock_held()); 653 for (pool = list_head(&pool_list); pool != NULL; 654 pool = list_next(&pool_list, pool)) { 655 if (pool->pool_pset->pset_id == psetid) 656 return (pool->pool_id); 657 } 658 return (POOL_INVALID); 659 } 660 661 /* 662 * Transfer resources specified by their IDs between resource sets. 663 */ 664 int 665 pool_xtransfer(int type, id_t src_pset, id_t dst_pset, uint_t size, id_t *ids) 666 { 667 int ret; 668 poolid_t src_pool, dst_pool; 669 670 ASSERT(pool_lock_held()); 671 if (pool_state == POOL_DISABLED) 672 return (ENOTACTIVE); 673 switch (type) { 674 case PREC_PSET: 675 ret = pool_pset_xtransfer((psetid_t)src_pset, 676 (psetid_t)dst_pset, size, ids); 677 if (ret == 0) { 678 if ((src_pool = pool_lookup_id_by_pset(src_pset)) != 679 POOL_INVALID) 680 pool_event_dispatch(POOL_E_CHANGE, src_pool); 681 if ((dst_pool = pool_lookup_id_by_pset(dst_pset)) != 682 POOL_INVALID) 683 pool_event_dispatch(POOL_E_CHANGE, dst_pool); 684 } 685 break; 686 default: 687 ret = EINVAL; 688 } 689 return (ret); 690 } 691 692 /* 693 * Bind processes to pools. 
694 */ 695 int 696 pool_bind(poolid_t poolid, idtype_t idtype, id_t id) 697 { 698 pool_t *pool; 699 700 ASSERT(pool_lock_held()); 701 702 if (pool_state == POOL_DISABLED) 703 return (ENOTACTIVE); 704 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 705 return (ESRCH); 706 707 switch (idtype) { 708 case P_PID: 709 case P_TASKID: 710 case P_PROJID: 711 case P_ZONEID: 712 break; 713 default: 714 return (EINVAL); 715 } 716 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL)); 717 } 718 719 /* 720 * Query pool binding of the specifed process. 721 */ 722 int 723 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid) 724 { 725 proc_t *p; 726 727 if (idtype != P_PID) 728 return (ENOTSUP); 729 if (id == P_MYID) 730 id = curproc->p_pid; 731 732 ASSERT(pool_lock_held()); 733 734 mutex_enter(&pidlock); 735 if ((p = prfind((pid_t)id)) == NULL) { 736 mutex_exit(&pidlock); 737 return (ESRCH); 738 } 739 mutex_enter(&p->p_lock); 740 /* 741 * In local zones, lie about pool bindings of processes from 742 * the global zone. 
743 */ 744 if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) { 745 pool_t *pool; 746 747 pool = zone_pool_get(curproc->p_zone); 748 *poolid = pool->pool_id; 749 } else { 750 *poolid = p->p_pool->pool_id; 751 } 752 mutex_exit(&p->p_lock); 753 mutex_exit(&pidlock); 754 return (0); 755 } 756 757 static ea_object_t * 758 pool_system_pack(void) 759 { 760 ea_object_t *eo_system; 761 size_t bufsz = 0; 762 char *buf = NULL; 763 764 ASSERT(pool_lock_held()); 765 766 eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM); 767 (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t), 768 EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64); 769 if (INGLOBALZONE(curproc)) 770 (void) ea_attach_item(eo_system, &pool_pool_mod, 771 sizeof (hrtime_t), 772 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 773 else 774 (void) ea_attach_item(eo_system, 775 &curproc->p_zone->zone_pool_mod, 776 sizeof (hrtime_t), 777 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64); 778 (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t), 779 EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64); 780 (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t), 781 EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64); 782 (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0); 783 (void) ea_attach_item(eo_system, buf, bufsz, 784 EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW); 785 kmem_free(buf, bufsz); 786 return (eo_system); 787 } 788 789 /* 790 * Pack information about pools and attach it to specified exacct group. 
791 */ 792 static int 793 pool_pool_pack(ea_object_t *eo_system) 794 { 795 ea_object_t *eo_pool; 796 pool_t *pool; 797 size_t bufsz; 798 char *buf; 799 pool_t *myzonepool; 800 801 ASSERT(pool_lock_held()); 802 myzonepool = zone_pool_get(curproc->p_zone); 803 for (pool = list_head(&pool_list); pool; 804 pool = list_next(&pool_list, pool)) { 805 if (!INGLOBALZONE(curproc) && myzonepool != pool) 806 continue; 807 bufsz = 0; 808 buf = NULL; 809 eo_pool = ea_alloc_group(EXT_GROUP | 810 EXC_LOCAL | EXD_GROUP_POOL); 811 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t), 812 EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32); 813 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id, 814 sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32); 815 (void) nvlist_pack(pool->pool_props, &buf, &bufsz, 816 NV_ENCODE_NATIVE, 0); 817 (void) ea_attach_item(eo_pool, buf, bufsz, 818 EXC_LOCAL | EXD_POOL_PROP | EXT_RAW); 819 kmem_free(buf, bufsz); 820 (void) ea_attach_to_group(eo_system, eo_pool); 821 } 822 return (0); 823 } 824 825 /* 826 * Pack the whole pool configuration in the specified buffer. 827 */ 828 int 829 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize) 830 { 831 ea_object_t *eo_system; 832 size_t ksize; 833 int ret = 0; 834 835 ASSERT(pool_lock_held()); 836 837 eo_system = pool_system_pack(); /* 1. pack system */ 838 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 839 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 840 ksize = ea_pack_object(eo_system, NULL, 0); 841 if (kbuf == NULL || kbufsz == 0) 842 *asize = ksize; 843 else if (ksize > kbufsz) 844 ret = ENOMEM; 845 else 846 *asize = ea_pack_object(eo_system, kbuf, kbufsz); 847 ea_free_object(eo_system, EUP_ALLOC); 848 return (ret); 849 } 850 851 /* 852 * Start/end the commit transaction. If commit transaction is currently 853 * in progress, then all POOL_QUERY ioctls will return pools configuration 854 * at the beginning of transaction. 
855 */ 856 int 857 pool_commit(int state) 858 { 859 ea_object_t *eo_system; 860 int ret = 0; 861 862 ASSERT(pool_lock_held()); 863 864 if (pool_state == POOL_DISABLED) 865 return (ENOTACTIVE); 866 switch (state) { 867 case 1: 868 /* 869 * Beginning commit transation. 870 */ 871 if (pool_buf != NULL) /* transaction in progress */ 872 return (EBUSY); 873 eo_system = pool_system_pack(); /* 1. pack system */ 874 (void) pool_pool_pack(eo_system); /* 2. pack all pools */ 875 (void) pool_pset_pack(eo_system); /* 3. pack all psets */ 876 pool_bufsz = ea_pack_object(eo_system, NULL, 0); 877 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP); 878 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz); 879 ea_free_object(eo_system, EUP_ALLOC); 880 break; 881 case 0: 882 /* 883 * Finishing commit transaction. 884 */ 885 if (pool_buf != NULL) { 886 kmem_free(pool_buf, pool_bufsz); 887 pool_buf = NULL; 888 pool_bufsz = 0; 889 } 890 break; 891 default: 892 ret = EINVAL; 893 } 894 return (ret); 895 } 896 897 /* 898 * Check is the specified property is special 899 */ 900 static pool_property_t * 901 pool_property_find(char *name, pool_property_t *list) 902 { 903 pool_property_t *prop; 904 905 for (prop = list; prop->pp_name != NULL; prop++) 906 if (strcmp(prop->pp_name, name) == 0) 907 return (prop); 908 return (NULL); 909 } 910 911 static pool_property_t pool_prop_sys[] = { 912 { "system.name", DATA_TYPE_STRING, PP_RDWR }, 913 { "system.comment", DATA_TYPE_STRING, PP_RDWR }, 914 { "system.version", DATA_TYPE_UINT64, PP_READ }, 915 { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR }, 916 { "system.allocate-method", DATA_TYPE_STRING, 917 PP_RDWR | PP_OPTIONAL }, 918 { "system.poold.log-level", DATA_TYPE_STRING, 919 PP_RDWR | PP_OPTIONAL }, 920 { "system.poold.log-location", DATA_TYPE_STRING, 921 PP_RDWR | PP_OPTIONAL }, 922 { "system.poold.monitor-interval", DATA_TYPE_UINT64, 923 PP_RDWR | PP_OPTIONAL }, 924 { "system.poold.history-file", DATA_TYPE_STRING, 925 PP_RDWR | PP_OPTIONAL }, 
926 { "system.poold.objectives", DATA_TYPE_STRING, 927 PP_RDWR | PP_OPTIONAL }, 928 { NULL, 0, 0 } 929 }; 930 931 static pool_property_t pool_prop_pool[] = { 932 { "pool.sys_id", DATA_TYPE_UINT64, PP_READ }, 933 { "pool.name", DATA_TYPE_STRING, PP_RDWR }, 934 { "pool.default", DATA_TYPE_BYTE, PP_READ }, 935 { "pool.active", DATA_TYPE_BYTE, PP_RDWR }, 936 { "pool.importance", DATA_TYPE_INT64, PP_RDWR }, 937 { "pool.comment", DATA_TYPE_STRING, PP_RDWR }, 938 { "pool.scheduler", DATA_TYPE_STRING, 939 PP_RDWR | PP_OPTIONAL }, 940 { NULL, 0, 0 } 941 }; 942 943 /* 944 * Common routine to put new property on the specified list 945 */ 946 int 947 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props) 948 { 949 pool_property_t *prop; 950 951 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) { 952 /* 953 * No read-only properties or properties with bad types 954 */ 955 if (!(prop->pp_perm & PP_WRITE) || 956 prop->pp_type != nvpair_type(pair)) 957 return (EINVAL); 958 } 959 return (nvlist_add_nvpair(nvlist, pair)); 960 } 961 962 /* 963 * Common routine to remove property from the given list 964 */ 965 int 966 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props) 967 { 968 pool_property_t *prop; 969 970 if ((prop = pool_property_find(name, props)) != NULL) { 971 if (!(prop->pp_perm & PP_OPTIONAL)) 972 return (EINVAL); 973 } 974 return (nvlist_remove_all(nvlist, name)); 975 } 976 977 static int 978 pool_system_propput(nvpair_t *pair) 979 { 980 int ret; 981 982 ASSERT(pool_lock_held()); 983 ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys); 984 if (ret == 0) 985 pool_sys_mod = gethrtime(); 986 return (ret); 987 } 988 989 static int 990 pool_system_proprm(char *name) 991 { 992 int ret; 993 994 ASSERT(pool_lock_held()); 995 ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys); 996 if (ret == 0) 997 pool_sys_mod = gethrtime(); 998 return (ret); 999 } 1000 1001 static int 1002 pool_pool_propput(poolid_t 
poolid, nvpair_t *pair) 1003 { 1004 pool_t *pool; 1005 int ret; 1006 1007 ASSERT(pool_lock_held()); 1008 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 1009 return (ESRCH); 1010 ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool); 1011 if (ret == 0) 1012 pool_pool_mod = gethrtime(); 1013 return (ret); 1014 } 1015 1016 static int 1017 pool_pool_proprm(poolid_t poolid, char *name) 1018 { 1019 int ret; 1020 pool_t *pool; 1021 1022 ASSERT(pool_lock_held()); 1023 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL) 1024 return (ESRCH); 1025 ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool); 1026 if (ret == 0) 1027 pool_pool_mod = gethrtime(); 1028 return (ret); 1029 } 1030 1031 int 1032 pool_propput(int class, int subclass, id_t id, nvpair_t *pair) 1033 { 1034 int ret; 1035 1036 ASSERT(pool_lock_held()); 1037 if (pool_state == POOL_DISABLED) 1038 return (ENOTACTIVE); 1039 switch (class) { 1040 case PEC_SYSTEM: 1041 ret = pool_system_propput(pair); 1042 break; 1043 case PEC_POOL: 1044 ret = pool_pool_propput((poolid_t)id, pair); 1045 break; 1046 case PEC_RES_COMP: 1047 switch (subclass) { 1048 case PREC_PSET: 1049 ret = pool_pset_propput((psetid_t)id, pair); 1050 break; 1051 default: 1052 ret = EINVAL; 1053 } 1054 break; 1055 case PEC_RES_AGG: 1056 ret = ENOTSUP; 1057 break; 1058 case PEC_COMP: 1059 switch (subclass) { 1060 case PCEC_CPU: 1061 ret = pool_cpu_propput((processorid_t)id, pair); 1062 break; 1063 default: 1064 ret = EINVAL; 1065 } 1066 break; 1067 default: 1068 ret = EINVAL; 1069 } 1070 return (ret); 1071 } 1072 1073 int 1074 pool_proprm(int class, int subclass, id_t id, char *name) 1075 { 1076 int ret; 1077 1078 ASSERT(pool_lock_held()); 1079 if (pool_state == POOL_DISABLED) 1080 return (ENOTACTIVE); 1081 switch (class) { 1082 case PEC_SYSTEM: 1083 ret = pool_system_proprm(name); 1084 break; 1085 case PEC_POOL: 1086 ret = pool_pool_proprm((poolid_t)id, name); 1087 break; 1088 case PEC_RES_COMP: 1089 switch (subclass) { 1090 
case PREC_PSET: 1091 ret = pool_pset_proprm((psetid_t)id, name); 1092 break; 1093 default: 1094 ret = EINVAL; 1095 } 1096 break; 1097 case PEC_RES_AGG: 1098 ret = ENOTSUP; 1099 break; 1100 case PEC_COMP: 1101 switch (subclass) { 1102 case PCEC_CPU: 1103 ret = pool_cpu_proprm((processorid_t)id, name); 1104 break; 1105 default: 1106 ret = EINVAL; 1107 } 1108 break; 1109 default: 1110 ret = EINVAL; 1111 } 1112 return (ret); 1113 } 1114 1115 int 1116 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp) 1117 { 1118 int ret; 1119 nvlist_t *nvl; 1120 1121 ASSERT(pool_lock_held()); 1122 if (pool_state == POOL_DISABLED) 1123 return (ENOTACTIVE); 1124 1125 (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); 1126 1127 switch (class) { 1128 case PEC_SYSTEM: 1129 case PEC_POOL: 1130 ret = EINVAL; 1131 break; 1132 case PEC_RES_COMP: 1133 switch (subclass) { 1134 case PREC_PSET: 1135 ret = pool_pset_propget((psetid_t)id, name, nvl); 1136 break; 1137 default: 1138 ret = EINVAL; 1139 } 1140 break; 1141 case PEC_RES_AGG: 1142 ret = ENOTSUP; 1143 break; 1144 case PEC_COMP: 1145 switch (subclass) { 1146 case PCEC_CPU: 1147 ret = pool_cpu_propget((processorid_t)id, name, nvl); 1148 break; 1149 default: 1150 ret = EINVAL; 1151 } 1152 break; 1153 default: 1154 ret = EINVAL; 1155 } 1156 if (ret == 0) 1157 *nvlp = nvl; 1158 else 1159 nvlist_free(nvl); 1160 return (ret); 1161 } 1162 1163 /* 1164 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs 1165 * in case of failure in pool_do_bind(). 
1166 */ 1167 static void 1168 pool_bind_wake(proc_t *p) 1169 { 1170 ASSERT(pool_lock_held()); 1171 1172 mutex_enter(&p->p_lock); 1173 ASSERT(p->p_poolflag & PBWAIT); 1174 if (p->p_poolcnt > 0) { 1175 mutex_enter(&pool_barrier_lock); 1176 pool_barrier_count -= p->p_poolcnt; 1177 mutex_exit(&pool_barrier_lock); 1178 } 1179 p->p_poolflag &= ~PBWAIT; 1180 cv_signal(&p->p_poolcv); 1181 mutex_exit(&p->p_lock); 1182 } 1183 1184 static void 1185 pool_bind_wakeall(proc_t **procs) 1186 { 1187 proc_t *p, **pp; 1188 1189 ASSERT(pool_lock_held()); 1190 for (pp = procs; (p = *pp) != NULL; pp++) 1191 pool_bind_wake(p); 1192 } 1193 1194 /* 1195 * Return the scheduling class for this pool, or 1196 * POOL_CLASS_UNSET if not set 1197 * POOL_CLASS_INVAL if set to an invalid class ID. 1198 */ 1199 id_t 1200 pool_get_class(pool_t *pool) 1201 { 1202 char *name; 1203 id_t cid; 1204 1205 ASSERT(pool_lock_held()); 1206 1207 if (nvlist_lookup_string(pool->pool_props, "pool.scheduler", 1208 &name) == 0) { 1209 if (getcidbyname(name, &cid) == 0) 1210 return (cid); 1211 else 1212 return (POOL_CLASS_INVAL); 1213 } 1214 return (POOL_CLASS_UNSET); 1215 } 1216 1217 /* 1218 * Move process to the new scheduling class. 1219 */ 1220 static void 1221 pool_change_class(proc_t *p, id_t cid) 1222 { 1223 kthread_t *t; 1224 void *cldata; 1225 id_t oldcid; 1226 void **bufs; 1227 void **buf; 1228 int nlwp; 1229 int ret; 1230 int i; 1231 1232 /* 1233 * Do not move kernel processes (such as zsched). 1234 */ 1235 if (p->p_flag & SSYS) 1236 return; 1237 /* 1238 * This process is in the pool barrier, so it can't possibly be 1239 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1 1240 * (for possible agent LWP which doesn't use pool barrier) as 1241 * our upper bound. 1242 */ 1243 nlwp = p->p_lwpcnt + p->p_zombcnt + 1; 1244 1245 /* 1246 * Pre-allocate scheduling class specific buffers before 1247 * grabbing p_lock. 
1248 */ 1249 bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP); 1250 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1251 ret = CL_ALLOC(buf, cid, KM_SLEEP); 1252 ASSERT(ret == 0); 1253 } 1254 1255 /* 1256 * Move threads one by one to the new scheduling class. 1257 * This never fails because we have all the right 1258 * privileges here. 1259 */ 1260 mutex_enter(&p->p_lock); 1261 ASSERT(p->p_poolflag & PBWAIT); 1262 buf = bufs; 1263 t = p->p_tlist; 1264 ASSERT(t != NULL); 1265 do { 1266 if (t->t_cid != cid) { 1267 oldcid = t->t_cid; 1268 cldata = t->t_cldata; 1269 ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf); 1270 ASSERT(ret == 0); 1271 CL_EXITCLASS(oldcid, cldata); 1272 schedctl_set_cidpri(t); 1273 *buf++ = NULL; 1274 } 1275 } while ((t = t->t_forw) != p->p_tlist); 1276 mutex_exit(&p->p_lock); 1277 /* 1278 * Free unused scheduling class specific buffers. 1279 */ 1280 for (i = 0, buf = bufs; i < nlwp; i++, buf++) { 1281 if (*buf != NULL) { 1282 CL_FREE(cid, *buf); 1283 *buf = NULL; 1284 } 1285 } 1286 kmem_free(bufs, nlwp * sizeof (void *)); 1287 } 1288 1289 void 1290 pool_get_name(pool_t *pool, char **name) 1291 { 1292 ASSERT(pool_lock_held()); 1293 1294 (void) nvlist_lookup_string(pool->pool_props, "pool.name", name); 1295 1296 ASSERT(strlen(*name) != 0); 1297 } 1298 1299 1300 /* 1301 * The meat of the bind operation. The steps in pool_do_bind are: 1302 * 1303 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all 1304 * such processes to an array. For any interesting process that has 1305 * threads inside the pool barrier set, increment a counter by the 1306 * count of such threads. Once PBWAIT is set on a process, that process 1307 * will not disappear. 1308 * 1309 * 2) Wait for the counter from step 2 to drop to zero. 
*    Any process which calls pool_barrier_exit() and notices that PBWAIT
 *    has been set on it will decrement that counter before going to sleep,
 *    and the process calling pool_barrier_exit() which does the final
 *    decrement will wake us.
 *
 * 3) For each interesting process, perform a calculation on it to see if
 *    the bind will actually succeed.  This uses the following three
 *    resource-set-specific functions:
 *
 *    - int set_bind_start(procs, pool)
 *
 *      Determine whether the given array of processes can be bound to the
 *      resource set associated with the given pool.  If it can, take and hold
 *      any locks necessary to ensure that the operation will succeed, and
 *      make any necessary reservations in the target resource set.  If it
 *      can't, return failure with no reservations made and no new locks held.
 *
 *    - void set_bind_abort(procs, pool)
 *
 *      set_bind_start() has completed successfully, but another resource set's
 *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
 *      any reservations made and drop any locks acquired by our
 *      set_bind_start().
 *
 *    - void set_bind_finish(void)
 *
 *      The bind has completed successfully.  The processes have been released,
 *      and the reservation acquired in set_bind_start() has been depleted as
 *      the processes have finished their bindings.  Drop any locks acquired by
 *      set_bind_start().
 *
 * 4) If we've decided that we can proceed with the bind, iterate through
 *    the list of interesting processes, grab the necessary locks (which
 *    may differ per resource set), perform the bind, and ASSERT that it
 *    succeeds.  Once a process has been rebound, it can be awakened.
1344 * 1345 * The operations from step 4 must be kept in sync with anything which might 1346 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and 1347 * are thus located in the same source files as the associated bind operations. 1348 */ 1349 int 1350 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) 1351 { 1352 extern uint_t nproc; 1353 klwp_t *lwp = ttolwp(curthread); 1354 proc_t **pp, **procs; 1355 proc_t *prstart; 1356 int procs_count = 0; 1357 kproject_t *kpj; 1358 procset_t set; 1359 zone_t *zone; 1360 int procs_size; 1361 int rv = 0; 1362 proc_t *p; 1363 id_t cid = -1; 1364 1365 ASSERT(pool_lock_held()); 1366 1367 if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL) 1368 return (EINVAL); 1369 1370 if (idtype == P_ZONEID) { 1371 zone = zone_find_by_id(id); 1372 if (zone == NULL) 1373 return (ESRCH); 1374 if (zone_status_get(zone) > ZONE_IS_RUNNING) { 1375 zone_rele(zone); 1376 return (EBUSY); 1377 } 1378 } 1379 1380 if (idtype == P_PROJID) { 1381 kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND); 1382 if (kpj == NULL) 1383 return (ESRCH); 1384 mutex_enter(&kpj->kpj_poolbind); 1385 } 1386 1387 if (idtype == P_PID) { 1388 /* 1389 * Fast-path for a single process case. 1390 */ 1391 procs_size = 2; /* procs is NULL-terminated */ 1392 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP); 1393 mutex_enter(&pidlock); 1394 } else { 1395 /* 1396 * We will need enough slots for proc_t pointers for as many as 1397 * twice the number of currently running processes (assuming 1398 * that each one could be in fork() creating a new child). 1399 */ 1400 for (;;) { 1401 procs_size = nproc * 2; 1402 procs = kmem_zalloc(procs_size * sizeof (proc_t *), 1403 KM_SLEEP); 1404 mutex_enter(&pidlock); 1405 1406 if (nproc * 2 <= procs_size) 1407 break; 1408 /* 1409 * If nproc has changed, try again. 
1410 */ 1411 mutex_exit(&pidlock); 1412 kmem_free(procs, procs_size * sizeof (proc_t *)); 1413 } 1414 } 1415 1416 if (id == P_MYID) 1417 id = getmyid(idtype); 1418 setprocset(&set, POP_AND, idtype, id, P_ALL, 0); 1419 1420 /* 1421 * Do a first scan, and select target processes. 1422 */ 1423 if (idtype == P_PID) 1424 prstart = prfind(id); 1425 else 1426 prstart = practive; 1427 for (p = prstart, pp = procs; p != NULL; p = p->p_next) { 1428 mutex_enter(&p->p_lock); 1429 /* 1430 * Skip processes that don't match our (id, idtype) set or 1431 * on the way of becoming zombies. Skip kernel processes 1432 * from the global zone. 1433 */ 1434 if (procinset(p, &set) == 0 || 1435 p->p_poolflag & PEXITED || 1436 ((p->p_flag & SSYS) && INGLOBALZONE(p))) { 1437 mutex_exit(&p->p_lock); 1438 continue; 1439 } 1440 if (!INGLOBALZONE(p)) { 1441 switch (idtype) { 1442 case P_PID: 1443 case P_TASKID: 1444 /* 1445 * Can't bind processes or tasks 1446 * in local zones to pools. 1447 */ 1448 mutex_exit(&p->p_lock); 1449 mutex_exit(&pidlock); 1450 pool_bind_wakeall(procs); 1451 rv = EINVAL; 1452 goto out; 1453 case P_PROJID: 1454 /* 1455 * Only projects in the global 1456 * zone can be rebound. 1457 */ 1458 mutex_exit(&p->p_lock); 1459 continue; 1460 case P_POOLID: 1461 /* 1462 * When rebinding pools, processes can be 1463 * in different zones. 1464 */ 1465 break; 1466 } 1467 } 1468 1469 p->p_poolflag |= PBWAIT; 1470 /* 1471 * If some threads in this process are inside the pool 1472 * barrier, add them to pool_barrier_count, as we have 1473 * to wait for all of them to exit the barrier. 1474 */ 1475 if (p->p_poolcnt > 0) { 1476 mutex_enter(&pool_barrier_lock); 1477 pool_barrier_count += p->p_poolcnt; 1478 mutex_exit(&pool_barrier_lock); 1479 } 1480 ASSERT(pp < &procs[procs_size]); 1481 *pp++ = p; 1482 procs_count++; 1483 mutex_exit(&p->p_lock); 1484 1485 /* 1486 * We just found our process, so if we're only rebinding a 1487 * single process then get out of this loop. 
1488 */ 1489 if (idtype == P_PID) 1490 break; 1491 } 1492 *pp = NULL; /* cap off the end of the array */ 1493 mutex_exit(&pidlock); 1494 1495 /* 1496 * Wait for relevant processes to stop before they try to enter the 1497 * barrier or at the exit from the barrier. Make sure that we do 1498 * not get stopped here while we're holding pool_lock. If we were 1499 * requested to stop, or got a signal then return EAGAIN to let the 1500 * library know that it needs to retry. 1501 */ 1502 mutex_enter(&pool_barrier_lock); 1503 lwp->lwp_nostop++; 1504 while (pool_barrier_count > 0) { 1505 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock); 1506 if (pool_barrier_count > 0) { 1507 /* 1508 * We either got a signal or were requested to 1509 * stop by /proc. Bail out with EAGAIN. If we were 1510 * requested to stop, we'll stop in post_syscall() 1511 * on our way back to userland. 1512 */ 1513 mutex_exit(&pool_barrier_lock); 1514 pool_bind_wakeall(procs); 1515 lwp->lwp_nostop--; 1516 rv = EAGAIN; 1517 goto out; 1518 } 1519 } 1520 lwp->lwp_nostop--; 1521 mutex_exit(&pool_barrier_lock); 1522 1523 if (idtype == P_PID) { 1524 if ((p = *procs) == NULL) 1525 goto skip; 1526 mutex_enter(&p->p_lock); 1527 /* Drop the process if it is exiting */ 1528 if (p->p_poolflag & PEXITED) { 1529 mutex_exit(&p->p_lock); 1530 pool_bind_wake(p); 1531 procs_count--; 1532 } else 1533 mutex_exit(&p->p_lock); 1534 goto skip; 1535 } 1536 1537 /* 1538 * Do another run, and drop processes that were inside the barrier 1539 * in exit(), but when they have dropped to pool_barrier_exit 1540 * they have become of no interest to us. Pick up child processes that 1541 * were created by fork() but didn't exist during our first scan. 1542 * Their parents are now stopped at pool_barrier_exit in cfork(). 
1543 */ 1544 mutex_enter(&pidlock); 1545 for (pp = procs; (p = *pp) != NULL; pp++) { 1546 mutex_enter(&p->p_lock); 1547 if (p->p_poolflag & PEXITED) { 1548 ASSERT(p->p_lwpcnt == 0); 1549 mutex_exit(&p->p_lock); 1550 pool_bind_wake(p); 1551 /* flip w/last non-NULL slot */ 1552 *pp = procs[procs_count - 1]; 1553 procs[procs_count - 1] = NULL; 1554 procs_count--; 1555 pp--; /* try this slot again */ 1556 continue; 1557 } else 1558 mutex_exit(&p->p_lock); 1559 /* 1560 * Look at the child and check if it should be rebound also. 1561 * We're holding pidlock, so it is safe to reference p_child. 1562 */ 1563 if ((p = p->p_child) == NULL) 1564 continue; 1565 1566 mutex_enter(&p->p_lock); 1567 1568 /* 1569 * Skip system processes and make sure that the child is in 1570 * the same task/project/pool/zone as the parent. 1571 */ 1572 if ((!INGLOBALZONE(p) && idtype != P_ZONEID && 1573 idtype != P_POOLID) || p->p_flag & SSYS) { 1574 mutex_exit(&p->p_lock); 1575 continue; 1576 } 1577 1578 /* 1579 * If the child process has been already created by fork(), has 1580 * not exited, and has not been added to the list already, 1581 * then add it now. We will hit this process again (since we 1582 * stick it at the end of the procs list) but it will ignored 1583 * because it will have the PBWAIT flag set. 1584 */ 1585 if (procinset(p, &set) && 1586 !(p->p_poolflag & PEXITED) && 1587 !(p->p_poolflag & PBWAIT)) { 1588 ASSERT(p->p_child == NULL); /* no child of a child */ 1589 procs[procs_count] = p; 1590 procs[procs_count + 1] = NULL; 1591 procs_count++; 1592 p->p_poolflag |= PBWAIT; 1593 } 1594 mutex_exit(&p->p_lock); 1595 } 1596 mutex_exit(&pidlock); 1597 skip: 1598 /* 1599 * If there's no processes to rebind then return ESRCH, unless 1600 * we're associating a pool with new resource set, destroying it, 1601 * or binding a zone to a pool. 
1602 */ 1603 if (procs_count == 0) { 1604 if (idtype == P_POOLID || idtype == P_ZONEID) 1605 rv = 0; 1606 else 1607 rv = ESRCH; 1608 goto out; 1609 } 1610 1611 #ifdef DEBUG 1612 /* 1613 * All processes in the array should have PBWAIT set, and none 1614 * should be in the critical section. Thus, although p_poolflag 1615 * and p_poolcnt are protected by p_lock, their ASSERTions below 1616 * should be stable without it. procinset(), however, ASSERTs that 1617 * the p_lock is held upon entry. 1618 */ 1619 for (pp = procs; (p = *pp) != NULL; pp++) { 1620 int in_set; 1621 1622 mutex_enter(&p->p_lock); 1623 in_set = procinset(p, &set); 1624 mutex_exit(&p->p_lock); 1625 1626 ASSERT(in_set); 1627 ASSERT(p->p_poolflag & PBWAIT); 1628 ASSERT(p->p_poolcnt == 0); 1629 } 1630 #endif 1631 1632 /* 1633 * Do the check if processor set rebinding is going to succeed or not. 1634 */ 1635 if ((flags & POOL_BIND_PSET) && 1636 (rv = pset_bind_start(procs, pool)) != 0) { 1637 pool_bind_wakeall(procs); 1638 goto out; 1639 } 1640 1641 /* 1642 * At this point, all bind operations should succeed. 1643 */ 1644 for (pp = procs; (p = *pp) != NULL; pp++) { 1645 if (flags & POOL_BIND_PSET) { 1646 psetid_t psetid = pool->pool_pset->pset_id; 1647 void *zonebuf; 1648 void *projbuf; 1649 1650 /* 1651 * Pre-allocate one buffer for FSS (per-project 1652 * buffer for a new pset) in case if this is the 1653 * first thread from its current project getting 1654 * bound to this processor set. 1655 */ 1656 projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ); 1657 zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE); 1658 1659 mutex_enter(&pidlock); 1660 mutex_enter(&p->p_lock); 1661 pool_pset_bind(p, psetid, projbuf, zonebuf); 1662 mutex_exit(&p->p_lock); 1663 mutex_exit(&pidlock); 1664 /* 1665 * Free buffers pre-allocated above if it 1666 * wasn't actually used. 
1667 */ 1668 fss_freebuf(projbuf, FSS_ALLOC_PROJ); 1669 fss_freebuf(zonebuf, FSS_ALLOC_ZONE); 1670 } 1671 /* 1672 * Now let's change the scheduling class of this 1673 * process if our target pool has it defined. 1674 */ 1675 if (cid != POOL_CLASS_UNSET) 1676 pool_change_class(p, cid); 1677 1678 /* 1679 * It is safe to reference p_pool here without holding 1680 * p_lock because it cannot change underneath of us. 1681 * We're holding pool_lock here, so nobody else can be 1682 * moving this process between pools. If process "p" 1683 * would be exiting, we're guaranteed that it would be blocked 1684 * at pool_barrier_enter() in exit(). Otherwise, it would've 1685 * been skipped by one of our scans of the practive list 1686 * as a process with PEXITED flag set. 1687 */ 1688 if (p->p_pool != pool) { 1689 ASSERT(p->p_pool->pool_ref > 0); 1690 atomic_dec_32(&p->p_pool->pool_ref); 1691 p->p_pool = pool; 1692 atomic_inc_32(&p->p_pool->pool_ref); 1693 } 1694 /* 1695 * Okay, we've tortured this guy enough. 1696 * Let this poor process go now. 
1697 */ 1698 pool_bind_wake(p); 1699 } 1700 if (flags & POOL_BIND_PSET) 1701 pset_bind_finish(); 1702 1703 out: switch (idtype) { 1704 case P_PROJID: 1705 ASSERT(kpj != NULL); 1706 mutex_exit(&kpj->kpj_poolbind); 1707 project_rele(kpj); 1708 break; 1709 case P_ZONEID: 1710 if (rv == 0) { 1711 mutex_enter(&cpu_lock); 1712 zone_pool_set(zone, pool); 1713 mutex_exit(&cpu_lock); 1714 } 1715 zone->zone_pool_mod = gethrtime(); 1716 zone_rele(zone); 1717 break; 1718 } 1719 1720 kmem_free(procs, procs_size * sizeof (proc_t *)); 1721 ASSERT(pool_barrier_count == 0); 1722 return (rv); 1723 } 1724 1725 void 1726 pool_event_cb_register(pool_event_cb_t *cb) 1727 { 1728 ASSERT(!pool_lock_held() || panicstr); 1729 ASSERT(cb->pec_func != NULL); 1730 1731 mutex_enter(&pool_event_cb_lock); 1732 if (!pool_event_cb_init) { 1733 list_create(&pool_event_cb_list, sizeof (pool_event_cb_t), 1734 offsetof(pool_event_cb_t, pec_list)); 1735 pool_event_cb_init = B_TRUE; 1736 } 1737 list_insert_tail(&pool_event_cb_list, cb); 1738 mutex_exit(&pool_event_cb_lock); 1739 } 1740 1741 void 1742 pool_event_cb_unregister(pool_event_cb_t *cb) 1743 { 1744 ASSERT(!pool_lock_held() || panicstr); 1745 1746 mutex_enter(&pool_event_cb_lock); 1747 list_remove(&pool_event_cb_list, cb); 1748 mutex_exit(&pool_event_cb_lock); 1749 } 1750 1751 typedef struct { 1752 pool_event_t tqd_what; 1753 poolid_t tqd_id; 1754 } pool_tqd_t; 1755 1756 void 1757 pool_event_notify(void *arg) 1758 { 1759 pool_tqd_t *tqd = (pool_tqd_t *)arg; 1760 pool_event_cb_t *cb; 1761 1762 ASSERT(!pool_lock_held() || panicstr); 1763 1764 mutex_enter(&pool_event_cb_lock); 1765 for (cb = list_head(&pool_event_cb_list); cb != NULL; 1766 cb = list_next(&pool_event_cb_list, cb)) { 1767 cb->pec_func(tqd->tqd_what, tqd->tqd_id, cb->pec_arg); 1768 } 1769 mutex_exit(&pool_event_cb_lock); 1770 kmem_free(tqd, sizeof (*tqd)); 1771 } 1772 1773 void 1774 pool_event_dispatch(pool_event_t what, poolid_t id) 1775 { 1776 pool_tqd_t *tqd = NULL; 1777 1778 
ASSERT(pool_lock_held()); 1779 1780 if (pool_event_cb_taskq == NULL) { 1781 pool_event_cb_taskq = taskq_create("pool_event_cb_taskq", 1, 1782 -1, 1, 1, TASKQ_PREPOPULATE); 1783 } 1784 1785 tqd = kmem_alloc(sizeof (*tqd), KM_SLEEP); 1786 tqd->tqd_what = what; 1787 tqd->tqd_id = id; 1788 1789 (void) taskq_dispatch(pool_event_cb_taskq, pool_event_notify, tqd, 1790 KM_SLEEP); 1791 } 1792