xref: /illumos-gate/usr/src/uts/common/os/pool.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/pool.h>
27 #include <sys/pool_impl.h>
28 #include <sys/pool_pset.h>
29 #include <sys/id_space.h>
30 #include <sys/mutex.h>
31 #include <sys/nvpair.h>
32 #include <sys/cpuvar.h>
33 #include <sys/errno.h>
34 #include <sys/cmn_err.h>
35 #include <sys/systm.h>
36 #include <sys/proc.h>
37 #include <sys/fss.h>
38 #include <sys/class.h>
39 #include <sys/exacct.h>
40 #include <sys/utsname.h>
41 #include <sys/procset.h>
42 #include <sys/atomic.h>
43 #include <sys/zone.h>
44 #include <sys/policy.h>
45 #include <sys/schedctl.h>
46 #include <sys/taskq.h>
47 
48 /*
49  * RESOURCE POOLS
50  *
51  * The resource pools facility brings together process-bindable resources into
52  * a common abstraction called a pool. Processor sets and other entities can
53  * be configured, grouped, and labelled such that workload components can be
54  * associated with a subset of a system's total resources.
55  *
56  * When disabled, the pools facility is "invisible".  All processes belong
57  * to the same pool (pool_default), and processor sets can be managed through
58  * the old pset() system call.  When enabled, processor sets can only be
59  * managed via the pools facility.  New pools can be created and associated
60  * with processor sets.  Processes can be bound to pools which have non-empty
61  * resource sets.
62  *
63  * Locking: pool_lock() protects global pools state and must be called
64  * before modifying the configuration, or when taking a snapshot of the
65  * configuration.  If pool_lock_intr() is used, the operation may be
66  * interrupted by a signal or a request.
67  *
68  * To prevent processes from being rebound between pools while they are
69  * in the middle of an operation which affects resource set bindings, such
70  * operations must be surrounded by calls to pool_barrier_enter() and
71  * pool_barrier_exit().  This mechanism guarantees that such processes will
72  * be stopped either at the beginning or at the end of the barrier so that
73  * the rebind operation can atomically bind the process and its threads
74  * to new resource sets, and then let the process run again.
75  *
76  * Lock ordering with respect to other locks is as follows:
77  *
78  * 	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
79  *
80  * Most static and global variables defined in this file are protected
81  * by calling pool_lock().
82  *
83  * The operation that binds tasks and projects to pools is atomic.  That is,
84  * either all processes in a given task or a project will be bound to a
85  * new pool, or (in case of an error) they will be all left bound to the
86  * old pool. Processes in a given task or a given project can only be bound to
87  * different pools if they were rebound individually one by one as single
88  * processes.  Threads or LWPs of the same process do not have pool bindings,
89  * and are bound to the same resource sets associated with the resource pool
90  * of that process.
91  *
92  * The following picture shows one possible pool configuration with three
93  * pools and three processor sets.  Note that processor set "foo" is not
94  * associated with any pools and therefore cannot have any processes
95  * bound to it.  Two pools (default and foo) are associated with the
96  * same processor set (default).  Also, note that processes in Task 2
97  * are bound to different pools.
98  *
99  *
100  *							       Processor Sets
101  *								+---------+
102  *		       +--------------+========================>| default |
103  *		      a|	      |				+---------+
104  *		      s|	      |				    ||
105  *		      s|	      |				+---------+
106  *		      o|	      |				|   foo   |
107  *		      c|	      |				+---------+
108  *		      i|	      |				    ||
109  *		      a|	      |				+---------+
110  *		      t|	      |			+------>|   bar   |
111  *		      e|	      |			|	+---------+
112  *                    d|              |                 |
113  *                     |              |                 |
114  *	       +---------+      +---------+      +---------+
115  *     Pools   | default |======|   foo   |======|   bar   |
116  *	       +---------+      +---------+      +---------+
117  *	           @  @            @              @ @   @
118  *                b|  |            |              | |   |
119  *                o|  |            |              | |   |
120  *                u|  +-----+      |      +-------+ |   +---+
121  *                n|        |      |      |         |       |
122  *            ....d|........|......|......|.........|.......|....
123  *            :    |   ::   |      |      |    ::   |       |   :
124  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
125  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
126  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
127  *            :........::......................::...............:
128  *              Task 1            Task 2              Task N
129  *                 |                 |                  |
130  *                 |                 |                  |
131  *                 |  +-----------+  |             +-----------+
132  *                 +--| Project 1 |--+             | Project N |
133  *                    +-----------+                +-----------+
134  *
135  * This is just an illustration of relationships between processes, tasks,
136  * projects, pools, and processor sets. New types of resource sets will be
137  * added in the future.
138  */
139 
140 pool_t		*pool_default;	/* default pool which always exists */
141 int		pool_count;	/* number of pools created on this system */
142 int		pool_state;	/* pools state -- enabled/disabled */
143 void		*pool_buf;	/* pre-commit snapshot of the pools state */
144 size_t		pool_bufsz;	/* size of pool_buf */
145 static hrtime_t	pool_pool_mod;	/* last modification time for pools */
146 static hrtime_t	pool_sys_mod;	/* last modification time for system */
147 static nvlist_t	*pool_sys_prop;	/* system properties */
148 static id_space_t *pool_ids;	/* pool ID space */
149 static list_t	pool_list;	/* doubly-linked list of pools */
150 static kmutex_t		pool_mutex;		/* protects pool_busy_* */
151 static kcondvar_t	pool_busy_cv;		/* waiting for "pool_lock" */
152 static kthread_t	*pool_busy_thread;	/* thread holding "pool_lock" */
153 static kmutex_t		pool_barrier_lock;	/* synch. with pool_barrier_* */
154 static kcondvar_t	pool_barrier_cv;	/* synch. with pool_barrier_* */
155 static int		pool_barrier_count;	/* synch. with pool_barrier_* */
156 static list_t		pool_event_cb_list;	/* pool event callbacks */
157 static boolean_t	pool_event_cb_init = B_FALSE;
158 static kmutex_t		pool_event_cb_lock;
159 static taskq_t		*pool_event_cb_taskq = NULL;
160 
161 void pool_event_dispatch(pool_event_t, poolid_t);
162 
163 /*
164  * Boot-time pool initialization.
165  */
166 void
167 pool_init(void)
168 {
169 	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
170 
171 	/*
172 	 * Initialize default pool.
173 	 */
174 	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
175 	pool_default->pool_id = POOL_DEFAULT;
176 	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
177 	list_insert_head(&pool_list, pool_default);
178 
179 	/*
180 	 * Initialize plugins for resource sets.
181 	 */
182 	pool_pset_init();
183 	pool_count = 1;
184 	p0.p_pool = pool_default;
185 	global_zone->zone_pool = pool_default;
186 	pool_default->pool_ref = 1;
187 }
188 
189 /*
190  * Synchronization routines.
191  *
192  * pool_lock is only called from syscall-level routines (processor_bind(),
193  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
194  * periods of time, including across sleeping operations, so we allow its
195  * acquisition to be interruptible.
196  *
197  * The current thread that owns the "lock" is stored in the variable
198  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
199  */
200 void
201 pool_lock(void)
202 {
203 	mutex_enter(&pool_mutex);
204 	ASSERT(!pool_lock_held());
205 	while (pool_busy_thread != NULL)
206 		cv_wait(&pool_busy_cv, &pool_mutex);
207 	pool_busy_thread = curthread;
208 	mutex_exit(&pool_mutex);
209 }
210 
211 int
212 pool_lock_intr(void)
213 {
214 	mutex_enter(&pool_mutex);
215 	ASSERT(!pool_lock_held());
216 	while (pool_busy_thread != NULL) {
217 		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
218 			cv_signal(&pool_busy_cv);
219 			mutex_exit(&pool_mutex);
220 			return (1);
221 		}
222 	}
223 	pool_busy_thread = curthread;
224 	mutex_exit(&pool_mutex);
225 	return (0);
226 }
227 
228 int
229 pool_lock_held(void)
230 {
231 	return (pool_busy_thread == curthread);
232 }
233 
234 void
235 pool_unlock(void)
236 {
237 	mutex_enter(&pool_mutex);
238 	ASSERT(pool_lock_held());
239 	pool_busy_thread = NULL;
240 	cv_signal(&pool_busy_cv);
241 	mutex_exit(&pool_mutex);
242 }
243 
244 /*
245  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
246  * with pool_do_bind().
247  *
248  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
249  * operations which modify pool or pset associations.  They can be called
250  * while the process is multi-threaded.  In the common case, when current
251  * process is not being rebound (PBWAIT flag is not set), these functions
252  * will be just incrementing and decrementing reference counts.
253  */
254 void
255 pool_barrier_enter(void)
256 {
257 	proc_t *p = curproc;
258 
259 	ASSERT(MUTEX_HELD(&p->p_lock));
260 	while (p->p_poolflag & PBWAIT)
261 		cv_wait(&p->p_poolcv, &p->p_lock);
262 	p->p_poolcnt++;
263 }
264 
265 void
266 pool_barrier_exit(void)
267 {
268 	proc_t *p = curproc;
269 
270 	ASSERT(MUTEX_HELD(&p->p_lock));
271 	ASSERT(p->p_poolcnt > 0);
272 	p->p_poolcnt--;
273 	if (p->p_poolflag & PBWAIT) {
274 		mutex_enter(&pool_barrier_lock);
275 		ASSERT(pool_barrier_count > 0);
276 		pool_barrier_count--;
277 		if (pool_barrier_count == 0)
278 			cv_signal(&pool_barrier_cv);
279 		mutex_exit(&pool_barrier_lock);
280 		while (p->p_poolflag & PBWAIT)
281 			cv_wait(&p->p_poolcv, &p->p_lock);
282 	}
283 }
284 
285 /*
286  * Enable pools facility.
287  */
288 static int
289 pool_enable(void)
290 {
291 	int ret;
292 
293 	ASSERT(pool_lock_held());
294 	ASSERT(pool_count == 1);
295 
296 	ret = pool_pset_enable();
297 	if (ret != 0)
298 		return (ret);
299 	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
300 	(void) nvlist_add_string(pool_sys_prop, "system.name",
301 	    "default");
302 	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
303 	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
304 	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
305 	(void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
306 	    "wt-load");
307 
308 	(void) nvlist_alloc(&pool_default->pool_props,
309 	    NV_UNIQUE_NAME, KM_SLEEP);
310 	(void) nvlist_add_string(pool_default->pool_props,
311 	    "pool.name", "pool_default");
312 	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
313 	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
314 	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
315 	(void) nvlist_add_int64(pool_default->pool_props,
316 	    "pool.importance", 1);
317 	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
318 	    pool_default->pool_id);
319 
320 	pool_sys_mod = pool_pool_mod = gethrtime();
321 
322 	return (ret);
323 }
324 
325 /*
326  * Disable pools facility.
327  */
328 static int
329 pool_disable(void)
330 {
331 	int ret;
332 
333 	ASSERT(pool_lock_held());
334 
335 	if (pool_count > 1)	/* must destroy all pools first */
336 		return (EBUSY);
337 
338 	ret = pool_pset_disable();
339 	if (ret != 0)
340 		return (ret);
341 	if (pool_sys_prop != NULL) {
342 		nvlist_free(pool_sys_prop);
343 		pool_sys_prop = NULL;
344 	}
345 	if (pool_default->pool_props != NULL) {
346 		nvlist_free(pool_default->pool_props);
347 		pool_default->pool_props = NULL;
348 	}
349 	return (0);
350 }
351 
352 pool_t *
353 pool_lookup_pool_by_name(char *name)
354 {
355 	pool_t *pool = pool_default;
356 	char *p;
357 
358 	ASSERT(pool_lock_held());
359 	for (pool = list_head(&pool_list); pool;
360 	    pool = list_next(&pool_list, pool)) {
361 		if (nvlist_lookup_string(pool->pool_props,
362 		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
363 			return (pool);
364 	}
365 	return (NULL);
366 }
367 
368 pool_t *
369 pool_lookup_pool_by_id(poolid_t poolid)
370 {
371 	pool_t *pool = pool_default;
372 
373 	ASSERT(pool_lock_held());
374 	for (pool = list_head(&pool_list); pool;
375 	    pool = list_next(&pool_list, pool)) {
376 		if (pool->pool_id == poolid)
377 			return (pool);
378 	}
379 	return (NULL);
380 }
381 
382 pool_t *
383 pool_lookup_pool_by_pset(int id)
384 {
385 	pool_t *pool = pool_default;
386 	psetid_t psetid = (psetid_t)id;
387 
388 	ASSERT(pool_lock_held());
389 	for (pool = list_head(&pool_list); pool != NULL;
390 	    pool = list_next(&pool_list, pool)) {
391 		if (pool->pool_pset->pset_id == psetid)
392 			return (pool);
393 	}
394 	return (NULL);
395 }
396 
397 /*
398  * Create new pool, associate it with default resource sets, and give
399  * it a temporary name.
400  */
401 static int
402 pool_pool_create(poolid_t *poolid)
403 {
404 	pool_t *pool;
405 	char pool_name[40];
406 
407 	ASSERT(pool_lock_held());
408 
409 	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
410 	pool->pool_id = *poolid = id_alloc(pool_ids);
411 	pool->pool_pset = pool_pset_default;
412 	pool_pset_default->pset_npools++;
413 	list_insert_tail(&pool_list, pool);
414 	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
415 	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
416 	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
417 	pool_pool_mod = gethrtime();
418 	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
419 	    pool_pool_mod);
420 	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
421 	pool_count++;
422 	return (0);
423 }
424 
425 struct destroy_zone_arg {
426 	pool_t *old;
427 	pool_t *new;
428 };
429 
430 /*
431  * Update pool pointers for zones that are currently bound to pool "old"
432  * to be bound to pool "new".
433  */
434 static int
435 pool_destroy_zone_cb(zone_t *zone, void *arg)
436 {
437 	struct destroy_zone_arg *dza = arg;
438 
439 	ASSERT(pool_lock_held());
440 	ASSERT(MUTEX_HELD(&cpu_lock));
441 
442 	if (zone_pool_get(zone) == dza->old)
443 		zone_pool_set(zone, dza->new);
444 	return (0);
445 }
446 
447 /*
448  * Destroy specified pool, and rebind all processes in it
449  * to the default pool.
450  */
451 static int
452 pool_pool_destroy(poolid_t poolid)
453 {
454 	pool_t *pool;
455 	int ret;
456 
457 	ASSERT(pool_lock_held());
458 
459 	if (poolid == POOL_DEFAULT)
460 		return (EINVAL);
461 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
462 		return (ESRCH);
463 	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
464 	if (ret == 0) {
465 		struct destroy_zone_arg dzarg;
466 
467 		dzarg.old = pool;
468 		dzarg.new = pool_default;
469 		mutex_enter(&cpu_lock);
470 		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
471 		mutex_exit(&cpu_lock);
472 		ASSERT(ret == 0);
473 		ASSERT(pool->pool_ref == 0);
474 		(void) nvlist_free(pool->pool_props);
475 		id_free(pool_ids, pool->pool_id);
476 		pool->pool_pset->pset_npools--;
477 		list_remove(&pool_list, pool);
478 		pool_count--;
479 		pool_pool_mod = gethrtime();
480 		kmem_free(pool, sizeof (pool_t));
481 	}
482 	return (ret);
483 }
484 
485 /*
486  * Create new pool or resource set.
487  */
488 int
489 pool_create(int class, int subclass, id_t *id)
490 {
491 	int ret;
492 
493 	ASSERT(pool_lock_held());
494 	if (pool_state == POOL_DISABLED)
495 		return (ENOTACTIVE);
496 	switch (class) {
497 	case PEC_POOL:
498 		ret = pool_pool_create((poolid_t *)id);
499 		break;
500 	case PEC_RES_COMP:
501 		switch (subclass) {
502 		case PREC_PSET:
503 			ret = pool_pset_create((psetid_t *)id);
504 			break;
505 		default:
506 			ret = EINVAL;
507 		}
508 		break;
509 	case PEC_RES_AGG:
510 		ret = ENOTSUP;
511 		break;
512 	default:
513 		ret = EINVAL;
514 	}
515 	return (ret);
516 }
517 
518 /*
519  * Destroy an existing pool or resource set.
520  */
521 int
522 pool_destroy(int class, int subclass, id_t id)
523 {
524 	int ret;
525 
526 	ASSERT(pool_lock_held());
527 	if (pool_state == POOL_DISABLED)
528 		return (ENOTACTIVE);
529 	switch (class) {
530 	case PEC_POOL:
531 		ret = pool_pool_destroy((poolid_t)id);
532 		break;
533 	case PEC_RES_COMP:
534 		switch (subclass) {
535 		case PREC_PSET:
536 			ret = pool_pset_destroy((psetid_t)id);
537 			break;
538 		default:
539 			ret = EINVAL;
540 		}
541 		break;
542 	case PEC_RES_AGG:
543 		ret = ENOTSUP;
544 		break;
545 	default:
546 		ret = EINVAL;
547 	}
548 	return (ret);
549 }
550 
551 /*
552  * Enable or disable pools.
553  */
554 int
555 pool_status(int status)
556 {
557 	int ret = 0;
558 
559 	ASSERT(pool_lock_held());
560 
561 	if (pool_state == status)
562 		return (0);
563 	switch (status) {
564 	case POOL_ENABLED:
565 		ret = pool_enable();
566 		if (ret != 0)
567 			return (ret);
568 		pool_state = POOL_ENABLED;
569 		pool_event_dispatch(POOL_E_ENABLE, 0);
570 		break;
571 	case POOL_DISABLED:
572 		ret = pool_disable();
573 		if (ret != 0)
574 			return (ret);
575 		pool_state = POOL_DISABLED;
576 		pool_event_dispatch(POOL_E_DISABLE, 0);
577 		break;
578 	default:
579 		ret = EINVAL;
580 	}
581 	return (ret);
582 }
583 
584 /*
585  * Associate pool with resource set.
586  */
587 int
588 pool_assoc(poolid_t poolid, int idtype, id_t id)
589 {
590 	int ret;
591 
592 	ASSERT(pool_lock_held());
593 	if (pool_state == POOL_DISABLED)
594 		return (ENOTACTIVE);
595 	switch (idtype) {
596 	case PREC_PSET:
597 		ret = pool_pset_assoc(poolid, (psetid_t)id);
598 		if (ret == 0)
599 			pool_event_dispatch(POOL_E_CHANGE, poolid);
600 		break;
601 	default:
602 		ret = EINVAL;
603 	}
604 	if (ret == 0)
605 		pool_pool_mod = gethrtime();
606 	return (ret);
607 }
608 
609 /*
610  * Disassociate resource set from pool.
611  */
612 int
613 pool_dissoc(poolid_t poolid, int idtype)
614 {
615 	int ret;
616 
617 	ASSERT(pool_lock_held());
618 	if (pool_state == POOL_DISABLED)
619 		return (ENOTACTIVE);
620 	switch (idtype) {
621 	case PREC_PSET:
622 		ret = pool_pset_assoc(poolid, PS_NONE);
623 		if (ret == 0)
624 			pool_event_dispatch(POOL_E_CHANGE, poolid);
625 		break;
626 	default:
627 		ret = EINVAL;
628 	}
629 	if (ret == 0)
630 		pool_pool_mod = gethrtime();
631 	return (ret);
632 }
633 
634 /*
635  * Transfer specified quantity of resources between resource sets.
636  */
637 /*ARGSUSED*/
638 int
639 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
640 {
641 	int ret = EINVAL;
642 
643 	return (ret);
644 }
645 
646 static poolid_t
647 pool_lookup_id_by_pset(int id)
648 {
649 	pool_t *pool = pool_default;
650 	psetid_t psetid = (psetid_t)id;
651 
652 	ASSERT(pool_lock_held());
653 	for (pool = list_head(&pool_list); pool != NULL;
654 	    pool = list_next(&pool_list, pool)) {
655 		if (pool->pool_pset->pset_id == psetid)
656 			return (pool->pool_id);
657 	}
658 	return (POOL_INVALID);
659 }
660 
661 /*
662  * Transfer resources specified by their IDs between resource sets.
663  */
664 int
665 pool_xtransfer(int type, id_t src_pset, id_t dst_pset, uint_t size, id_t *ids)
666 {
667 	int ret;
668 	poolid_t src_pool, dst_pool;
669 
670 	ASSERT(pool_lock_held());
671 	if (pool_state == POOL_DISABLED)
672 		return (ENOTACTIVE);
673 	switch (type) {
674 	case PREC_PSET:
675 		ret = pool_pset_xtransfer((psetid_t)src_pset,
676 		    (psetid_t)dst_pset, size, ids);
677 		if (ret == 0) {
678 			if ((src_pool =  pool_lookup_id_by_pset(src_pset)) !=
679 			    POOL_INVALID)
680 				pool_event_dispatch(POOL_E_CHANGE, src_pool);
681 			if ((dst_pool =  pool_lookup_id_by_pset(dst_pset)) !=
682 			    POOL_INVALID)
683 				pool_event_dispatch(POOL_E_CHANGE, dst_pool);
684 		}
685 		break;
686 	default:
687 		ret = EINVAL;
688 	}
689 	return (ret);
690 }
691 
692 /*
693  * Bind processes to pools.
694  */
695 int
696 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
697 {
698 	pool_t	*pool;
699 
700 	ASSERT(pool_lock_held());
701 
702 	if (pool_state == POOL_DISABLED)
703 		return (ENOTACTIVE);
704 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
705 		return (ESRCH);
706 
707 	switch (idtype) {
708 	case P_PID:
709 	case P_TASKID:
710 	case P_PROJID:
711 	case P_ZONEID:
712 		break;
713 	default:
714 		return (EINVAL);
715 	}
716 	return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
717 }
718 
719 /*
720  * Query pool binding of the specifed process.
721  */
722 int
723 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
724 {
725 	proc_t *p;
726 
727 	if (idtype != P_PID)
728 		return (ENOTSUP);
729 	if (id == P_MYID)
730 		id = curproc->p_pid;
731 
732 	ASSERT(pool_lock_held());
733 
734 	mutex_enter(&pidlock);
735 	if ((p = prfind((pid_t)id)) == NULL) {
736 		mutex_exit(&pidlock);
737 		return (ESRCH);
738 	}
739 	mutex_enter(&p->p_lock);
740 	/*
741 	 * In local zones, lie about pool bindings of processes from
742 	 * the global zone.
743 	 */
744 	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
745 		pool_t *pool;
746 
747 		pool = zone_pool_get(curproc->p_zone);
748 		*poolid = pool->pool_id;
749 	} else {
750 		*poolid = p->p_pool->pool_id;
751 	}
752 	mutex_exit(&p->p_lock);
753 	mutex_exit(&pidlock);
754 	return (0);
755 }
756 
757 static ea_object_t *
758 pool_system_pack(void)
759 {
760 	ea_object_t *eo_system;
761 	size_t bufsz = 0;
762 	char *buf = NULL;
763 
764 	ASSERT(pool_lock_held());
765 
766 	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
767 	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
768 	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
769 	if (INGLOBALZONE(curproc))
770 		(void) ea_attach_item(eo_system, &pool_pool_mod,
771 		    sizeof (hrtime_t),
772 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
773 	else
774 		(void) ea_attach_item(eo_system,
775 		    &curproc->p_zone->zone_pool_mod,
776 		    sizeof (hrtime_t),
777 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
778 	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
779 	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
780 	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
781 	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
782 	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
783 	(void) ea_attach_item(eo_system, buf, bufsz,
784 	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
785 	kmem_free(buf, bufsz);
786 	return (eo_system);
787 }
788 
789 /*
790  * Pack information about pools and attach it to specified exacct group.
791  */
792 static int
793 pool_pool_pack(ea_object_t *eo_system)
794 {
795 	ea_object_t *eo_pool;
796 	pool_t *pool;
797 	size_t bufsz;
798 	char *buf;
799 	pool_t *myzonepool;
800 
801 	ASSERT(pool_lock_held());
802 	myzonepool = zone_pool_get(curproc->p_zone);
803 	for (pool = list_head(&pool_list); pool;
804 	    pool = list_next(&pool_list, pool)) {
805 		if (!INGLOBALZONE(curproc) && myzonepool != pool)
806 			continue;
807 		bufsz = 0;
808 		buf = NULL;
809 		eo_pool = ea_alloc_group(EXT_GROUP |
810 		    EXC_LOCAL | EXD_GROUP_POOL);
811 		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
812 		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
813 		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
814 		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
815 		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
816 		    NV_ENCODE_NATIVE, 0);
817 		(void) ea_attach_item(eo_pool, buf, bufsz,
818 		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
819 		kmem_free(buf, bufsz);
820 		(void) ea_attach_to_group(eo_system, eo_pool);
821 	}
822 	return (0);
823 }
824 
825 /*
826  * Pack the whole pool configuration in the specified buffer.
827  */
828 int
829 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
830 {
831 	ea_object_t *eo_system;
832 	size_t ksize;
833 	int ret = 0;
834 
835 	ASSERT(pool_lock_held());
836 
837 	eo_system = pool_system_pack();		/* 1. pack system */
838 	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
839 	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
840 	ksize = ea_pack_object(eo_system, NULL, 0);
841 	if (kbuf == NULL || kbufsz == 0)
842 		*asize = ksize;
843 	else if (ksize > kbufsz)
844 		ret = ENOMEM;
845 	else
846 		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
847 	ea_free_object(eo_system, EUP_ALLOC);
848 	return (ret);
849 }
850 
851 /*
852  * Start/end the commit transaction.  If commit transaction is currently
853  * in progress, then all POOL_QUERY ioctls will return pools configuration
854  * at the beginning of transaction.
855  */
856 int
857 pool_commit(int state)
858 {
859 	ea_object_t *eo_system;
860 	int ret = 0;
861 
862 	ASSERT(pool_lock_held());
863 
864 	if (pool_state == POOL_DISABLED)
865 		return (ENOTACTIVE);
866 	switch (state) {
867 	case 1:
868 		/*
869 		 * Beginning commit transation.
870 		 */
871 		if (pool_buf != NULL)		/* transaction in progress */
872 			return (EBUSY);
873 		eo_system = pool_system_pack();		/* 1. pack system */
874 		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
875 		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
876 		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
877 		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
878 		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
879 		ea_free_object(eo_system, EUP_ALLOC);
880 		break;
881 	case 0:
882 		/*
883 		 * Finishing commit transaction.
884 		 */
885 		if (pool_buf != NULL) {
886 			kmem_free(pool_buf, pool_bufsz);
887 			pool_buf = NULL;
888 			pool_bufsz = 0;
889 		}
890 		break;
891 	default:
892 		ret = EINVAL;
893 	}
894 	return (ret);
895 }
896 
897 /*
898  * Check is the specified property is special
899  */
900 static pool_property_t *
901 pool_property_find(char *name, pool_property_t *list)
902 {
903 	pool_property_t *prop;
904 
905 	for (prop = list; prop->pp_name != NULL; prop++)
906 		if (strcmp(prop->pp_name, name) == 0)
907 			return (prop);
908 	return (NULL);
909 }
910 
911 static pool_property_t pool_prop_sys[] = {
912 	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
913 	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
914 	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
915 	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
916 	{ "system.allocate-method",	DATA_TYPE_STRING,
917 	    PP_RDWR | PP_OPTIONAL },
918 	{ "system.poold.log-level",	DATA_TYPE_STRING,
919 	    PP_RDWR | PP_OPTIONAL },
920 	{ "system.poold.log-location",	DATA_TYPE_STRING,
921 	    PP_RDWR | PP_OPTIONAL },
922 	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
923 	    PP_RDWR | PP_OPTIONAL },
924 	{ "system.poold.history-file",	DATA_TYPE_STRING,
925 	    PP_RDWR | PP_OPTIONAL },
926 	{ "system.poold.objectives",	DATA_TYPE_STRING,
927 	    PP_RDWR | PP_OPTIONAL },
928 	{ NULL,				0,			0 }
929 };
930 
931 static pool_property_t pool_prop_pool[] = {
932 	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
933 	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
934 	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
935 	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
936 	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
937 	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
938 	{ "pool.scheduler",		DATA_TYPE_STRING,
939 	    PP_RDWR | PP_OPTIONAL },
940 	{ NULL,				0,			0 }
941 };
942 
943 /*
944  * Common routine to put new property on the specified list
945  */
946 int
947 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
948 {
949 	pool_property_t *prop;
950 
951 	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
952 		/*
953 		 * No read-only properties or properties with bad types
954 		 */
955 		if (!(prop->pp_perm & PP_WRITE) ||
956 		    prop->pp_type != nvpair_type(pair))
957 			return (EINVAL);
958 	}
959 	return (nvlist_add_nvpair(nvlist, pair));
960 }
961 
962 /*
963  * Common routine to remove property from the given list
964  */
965 int
966 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
967 {
968 	pool_property_t *prop;
969 
970 	if ((prop = pool_property_find(name, props)) != NULL) {
971 		if (!(prop->pp_perm & PP_OPTIONAL))
972 			return (EINVAL);
973 	}
974 	return (nvlist_remove_all(nvlist, name));
975 }
976 
977 static int
978 pool_system_propput(nvpair_t *pair)
979 {
980 	int ret;
981 
982 	ASSERT(pool_lock_held());
983 	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
984 	if (ret == 0)
985 		pool_sys_mod = gethrtime();
986 	return (ret);
987 }
988 
989 static int
990 pool_system_proprm(char *name)
991 {
992 	int ret;
993 
994 	ASSERT(pool_lock_held());
995 	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
996 	if (ret == 0)
997 		pool_sys_mod = gethrtime();
998 	return (ret);
999 }
1000 
1001 static int
1002 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
1003 {
1004 	pool_t *pool;
1005 	int ret;
1006 
1007 	ASSERT(pool_lock_held());
1008 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1009 		return (ESRCH);
1010 	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
1011 	if (ret == 0)
1012 		pool_pool_mod = gethrtime();
1013 	return (ret);
1014 }
1015 
1016 static int
1017 pool_pool_proprm(poolid_t poolid, char *name)
1018 {
1019 	int ret;
1020 	pool_t *pool;
1021 
1022 	ASSERT(pool_lock_held());
1023 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1024 		return (ESRCH);
1025 	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
1026 	if (ret == 0)
1027 		pool_pool_mod = gethrtime();
1028 	return (ret);
1029 }
1030 
1031 int
1032 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
1033 {
1034 	int ret;
1035 
1036 	ASSERT(pool_lock_held());
1037 	if (pool_state == POOL_DISABLED)
1038 		return (ENOTACTIVE);
1039 	switch (class) {
1040 	case PEC_SYSTEM:
1041 		ret = pool_system_propput(pair);
1042 		break;
1043 	case PEC_POOL:
1044 		ret = pool_pool_propput((poolid_t)id, pair);
1045 		break;
1046 	case PEC_RES_COMP:
1047 		switch (subclass) {
1048 		case PREC_PSET:
1049 			ret = pool_pset_propput((psetid_t)id, pair);
1050 			break;
1051 		default:
1052 			ret = EINVAL;
1053 		}
1054 		break;
1055 	case PEC_RES_AGG:
1056 		ret = ENOTSUP;
1057 		break;
1058 	case PEC_COMP:
1059 		switch (subclass) {
1060 		case PCEC_CPU:
1061 			ret = pool_cpu_propput((processorid_t)id, pair);
1062 			break;
1063 		default:
1064 			ret = EINVAL;
1065 		}
1066 		break;
1067 	default:
1068 		ret = EINVAL;
1069 	}
1070 	return (ret);
1071 }
1072 
1073 int
1074 pool_proprm(int class, int subclass, id_t id, char *name)
1075 {
1076 	int ret;
1077 
1078 	ASSERT(pool_lock_held());
1079 	if (pool_state == POOL_DISABLED)
1080 		return (ENOTACTIVE);
1081 	switch (class) {
1082 	case PEC_SYSTEM:
1083 		ret = pool_system_proprm(name);
1084 		break;
1085 	case PEC_POOL:
1086 		ret = pool_pool_proprm((poolid_t)id, name);
1087 		break;
1088 	case PEC_RES_COMP:
1089 		switch (subclass) {
1090 		case PREC_PSET:
1091 			ret = pool_pset_proprm((psetid_t)id, name);
1092 			break;
1093 		default:
1094 			ret = EINVAL;
1095 		}
1096 		break;
1097 	case PEC_RES_AGG:
1098 		ret = ENOTSUP;
1099 		break;
1100 	case PEC_COMP:
1101 		switch (subclass) {
1102 		case PCEC_CPU:
1103 			ret = pool_cpu_proprm((processorid_t)id, name);
1104 			break;
1105 		default:
1106 			ret = EINVAL;
1107 		}
1108 		break;
1109 	default:
1110 		ret = EINVAL;
1111 	}
1112 	return (ret);
1113 }
1114 
1115 int
1116 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1117 {
1118 	int ret;
1119 	nvlist_t *nvl;
1120 
1121 	ASSERT(pool_lock_held());
1122 	if (pool_state == POOL_DISABLED)
1123 		return (ENOTACTIVE);
1124 
1125 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1126 
1127 	switch (class) {
1128 	case PEC_SYSTEM:
1129 	case PEC_POOL:
1130 		ret = EINVAL;
1131 		break;
1132 	case PEC_RES_COMP:
1133 		switch (subclass) {
1134 		case PREC_PSET:
1135 			ret = pool_pset_propget((psetid_t)id, name, nvl);
1136 			break;
1137 		default:
1138 			ret = EINVAL;
1139 		}
1140 		break;
1141 	case PEC_RES_AGG:
1142 		ret = ENOTSUP;
1143 		break;
1144 	case PEC_COMP:
1145 		switch (subclass) {
1146 		case PCEC_CPU:
1147 			ret = pool_cpu_propget((processorid_t)id, name, nvl);
1148 			break;
1149 		default:
1150 			ret = EINVAL;
1151 		}
1152 		break;
1153 	default:
1154 		ret = EINVAL;
1155 	}
1156 	if (ret == 0)
1157 		*nvlp = nvl;
1158 	else
1159 		nvlist_free(nvl);
1160 	return (ret);
1161 }
1162 
1163 /*
1164  * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1165  * in case of failure in pool_do_bind().
1166  */
1167 static void
1168 pool_bind_wake(proc_t *p)
1169 {
1170 	ASSERT(pool_lock_held());
1171 
1172 	mutex_enter(&p->p_lock);
1173 	ASSERT(p->p_poolflag & PBWAIT);
1174 	if (p->p_poolcnt > 0) {
1175 		mutex_enter(&pool_barrier_lock);
1176 		pool_barrier_count -= p->p_poolcnt;
1177 		mutex_exit(&pool_barrier_lock);
1178 	}
1179 	p->p_poolflag &= ~PBWAIT;
1180 	cv_signal(&p->p_poolcv);
1181 	mutex_exit(&p->p_lock);
1182 }
1183 
1184 static void
1185 pool_bind_wakeall(proc_t **procs)
1186 {
1187 	proc_t *p, **pp;
1188 
1189 	ASSERT(pool_lock_held());
1190 	for (pp = procs; (p = *pp) != NULL; pp++)
1191 		pool_bind_wake(p);
1192 }
1193 
1194 /*
1195  * Return the scheduling class for this pool, or
1196  * 	POOL_CLASS_UNSET if not set
1197  * 	POOL_CLASS_INVAL if set to an invalid class ID.
1198  */
1199 id_t
1200 pool_get_class(pool_t *pool)
1201 {
1202 	char *name;
1203 	id_t cid;
1204 
1205 	ASSERT(pool_lock_held());
1206 
1207 	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1208 	    &name) == 0) {
1209 		if (getcidbyname(name, &cid) == 0)
1210 			return (cid);
1211 		else
1212 			return (POOL_CLASS_INVAL);
1213 	}
1214 	return (POOL_CLASS_UNSET);
1215 }
1216 
1217 /*
1218  * Move process to the new scheduling class.
1219  */
1220 static void
1221 pool_change_class(proc_t *p, id_t cid)
1222 {
1223 	kthread_t *t;
1224 	void *cldata;
1225 	id_t oldcid;
1226 	void **bufs;
1227 	void **buf;
1228 	int nlwp;
1229 	int ret;
1230 	int i;
1231 
1232 	/*
1233 	 * Do not move kernel processes (such as zsched).
1234 	 */
1235 	if (p->p_flag & SSYS)
1236 		return;
1237 	/*
1238 	 * This process is in the pool barrier, so it can't possibly be
1239 	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1240 	 * (for possible agent LWP which doesn't use pool barrier) as
1241 	 * our upper bound.
1242 	 */
1243 	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1244 
1245 	/*
1246 	 * Pre-allocate scheduling class specific buffers before
1247 	 * grabbing p_lock.
1248 	 */
1249 	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1250 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1251 		ret = CL_ALLOC(buf, cid, KM_SLEEP);
1252 		ASSERT(ret == 0);
1253 	}
1254 
1255 	/*
1256 	 * Move threads one by one to the new scheduling class.
1257 	 * This never fails because we have all the right
1258 	 * privileges here.
1259 	 */
1260 	mutex_enter(&p->p_lock);
1261 	ASSERT(p->p_poolflag & PBWAIT);
1262 	buf = bufs;
1263 	t = p->p_tlist;
1264 	ASSERT(t != NULL);
1265 	do {
1266 		if (t->t_cid != cid) {
1267 			oldcid = t->t_cid;
1268 			cldata = t->t_cldata;
1269 			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1270 			ASSERT(ret == 0);
1271 			CL_EXITCLASS(oldcid, cldata);
1272 			schedctl_set_cidpri(t);
1273 			*buf++ = NULL;
1274 		}
1275 	} while ((t = t->t_forw) != p->p_tlist);
1276 	mutex_exit(&p->p_lock);
1277 	/*
1278 	 * Free unused scheduling class specific buffers.
1279 	 */
1280 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1281 		if (*buf != NULL) {
1282 			CL_FREE(cid, *buf);
1283 			*buf = NULL;
1284 		}
1285 	}
1286 	kmem_free(bufs, nlwp * sizeof (void *));
1287 }
1288 
1289 void
1290 pool_get_name(pool_t *pool, char **name)
1291 {
1292 	ASSERT(pool_lock_held());
1293 
1294 	(void) nvlist_lookup_string(pool->pool_props, "pool.name", name);
1295 
1296 	ASSERT(strlen(*name) != 0);
1297 }
1298 
1299 
/*
 * The meat of the bind operation.  The steps in pool_do_bind are:
 *
 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
 *    such processes to an array.  For any interesting process that has
 *    threads inside the pool barrier set, increment a counter by the
 *    count of such threads.  Once PBWAIT is set on a process, that process
 *    will not disappear.
 *
 * 2) Wait for the counter from step 1 to drop to zero.  Any process which
 *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
 *    will decrement that counter before going to sleep, and the process
 *    calling pool_barrier_exit() which does the final decrement will wake us.
 *
 * 3) For each interesting process, perform a calculation on it to see if
 *    the bind will actually succeed.  This uses the following three
 *    resource-set-specific functions:
 *
 *    - int set_bind_start(procs, pool)
 *
 *      Determine whether the given array of processes can be bound to the
 *      resource set associated with the given pool.  If it can, take and hold
 *      any locks necessary to ensure that the operation will succeed, and
 *      make any necessary reservations in the target resource set.  If it
 *      can't, return failure with no reservations made and no new locks held.
 *
 *    - void set_bind_abort(procs, pool)
 *
 *      set_bind_start() has completed successfully, but another resource set's
 *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
 *      any reservations made and drop any locks acquired by our
 *      set_bind_start().
 *
 *    - void set_bind_finish(void)
 *
 *      The bind has completed successfully.  The processes have been released,
 *      and the reservation acquired in set_bind_start() has been depleted as
 *      the processes have finished their bindings.  Drop any locks acquired by
 *      set_bind_start().
 *
 * 4) If we've decided that we can proceed with the bind, iterate through
 *    the list of interesting processes, grab the necessary locks (which
 *    may differ per resource set), perform the bind, and ASSERT that it
 *    succeeds.  Once a process has been rebound, it can be awakened.
 *
 * The operations from step 4 must be kept in sync with anything which might
 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
 * are thus located in the same source files as the associated bind operations.
 *
 * Arguments:
 *	pool	target pool
 *	idtype	how "id" selects processes; the code below handles P_PID,
 *		P_TASKID, P_PROJID, P_ZONEID and P_POOLID specially
 *	id	identifier of that type; P_MYID is resolved via getmyid()
 *	flags	POOL_BIND_PSET requests processor set rebinding
 *
 * Returns 0 on success, or:
 *	EINVAL	the pool's scheduling class is invalid, or a P_PID/P_TASKID
 *		request matched a process outside the global zone
 *	ESRCH	the zone or project was not found, or no process was left to
 *		rebind (not an error for P_POOLID and P_ZONEID)
 *	EBUSY	the zone's status is past ZONE_IS_RUNNING
 *	EAGAIN	the barrier wait returned with threads still inside the
 *		barrier
 *	any non-zero value returned by pset_bind_start()
 *
 * Caller must hold pool_lock.
 */
int
pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
{
	extern uint_t nproc;
	klwp_t *lwp = ttolwp(curthread);
	proc_t **pp, **procs;
	proc_t *prstart;
	int procs_count = 0;
	kproject_t *kpj;
	procset_t set;
	zone_t *zone;
	int procs_size;
	int rv = 0;
	proc_t *p;
	id_t cid = -1;

	ASSERT(pool_lock_held());

	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
		return (EINVAL);

	/*
	 * The zone found here is released at "out" below.
	 */
	if (idtype == P_ZONEID) {
		zone = zone_find_by_id(id);
		if (zone == NULL)
			return (ESRCH);
		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
			zone_rele(zone);
			return (EBUSY);
		}
	}

	/*
	 * The project hold and kpj_poolbind taken here are both dropped at
	 * "out" below.
	 */
	if (idtype == P_PROJID) {
		kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
		if (kpj == NULL)
			return (ESRCH);
		mutex_enter(&kpj->kpj_poolbind);
	}

	/*
	 * Both branches below leave with "procs" allocated (zero-filled)
	 * and pidlock held.
	 */
	if (idtype == P_PID) {
		/*
		 * Fast-path for a single process case.
		 */
		procs_size = 2;	/* procs is NULL-terminated */
		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
		mutex_enter(&pidlock);
	} else {
		/*
		 * We will need enough slots for proc_t pointers for as many as
		 * twice the number of currently running processes (assuming
		 * that each one could be in fork() creating a new child).
		 */
		for (;;) {
			procs_size = nproc * 2;
			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
			    KM_SLEEP);
			mutex_enter(&pidlock);

			if (nproc * 2 <= procs_size)
				break;
			/*
			 * If nproc has changed, try again.
			 */
			mutex_exit(&pidlock);
			kmem_free(procs, procs_size * sizeof (proc_t *));
		}
	}

	if (id == P_MYID)
		id = getmyid(idtype);
	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);

	/*
	 * Do a first scan, and select target processes.
	 */
	if (idtype == P_PID)
		prstart = prfind(id);
	else
		prstart = practive;
	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
		mutex_enter(&p->p_lock);
		/*
		 * Skip processes that don't match our (id, idtype) set or
		 * on the way of becoming zombies.  Skip kernel processes
		 * from the global zone.
		 */
		if (procinset(p, &set) == 0 ||
		    p->p_poolflag & PEXITED ||
		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
			mutex_exit(&p->p_lock);
			continue;
		}
		if (!INGLOBALZONE(p)) {
			switch (idtype) {
			case P_PID:
			case P_TASKID:
				/*
				 * Can't bind processes or tasks
				 * in local zones to pools.
				 *
				 * "procs" was zero-filled, so it is already
				 * NULL-terminated for pool_bind_wakeall().
				 */
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				pool_bind_wakeall(procs);
				rv = EINVAL;
				goto out;
			case P_PROJID:
				/*
				 * Only projects in the global
				 * zone can be rebound.
				 */
				mutex_exit(&p->p_lock);
				continue;
			case P_POOLID:
				/*
				 * When rebinding pools, processes can be
				 * in different zones.
				 */
				break;
			}
		}

		p->p_poolflag |= PBWAIT;
		/*
		 * If some threads in this process are inside the pool
		 * barrier, add them to pool_barrier_count, as we have
		 * to wait for all of them to exit the barrier.
		 */
		if (p->p_poolcnt > 0) {
			mutex_enter(&pool_barrier_lock);
			pool_barrier_count += p->p_poolcnt;
			mutex_exit(&pool_barrier_lock);
		}
		ASSERT(pp < &procs[procs_size]);
		*pp++ = p;
		procs_count++;
		mutex_exit(&p->p_lock);

		/*
		 * We just found our process, so if we're only rebinding a
		 * single process then get out of this loop.
		 */
		if (idtype == P_PID)
			break;
	}
	*pp = NULL;	/* cap off the end of the array */
	mutex_exit(&pidlock);

	/*
	 * Wait for relevant processes to stop before they try to enter the
	 * barrier or at the exit from the barrier.  Make sure that we do
	 * not get stopped here while we're holding pool_lock.  If we were
	 * requested to stop, or got a signal then return EAGAIN to let the
	 * library know that it needs to retry.
	 */
	mutex_enter(&pool_barrier_lock);
	lwp->lwp_nostop++;
	while (pool_barrier_count > 0) {
		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
		if (pool_barrier_count > 0) {
			/*
			 * We either got a signal or were requested to
			 * stop by /proc.  Bail out with EAGAIN.  If we were
			 * requested to stop, we'll stop in post_syscall()
			 * on our way back to userland.
			 */
			mutex_exit(&pool_barrier_lock);
			pool_bind_wakeall(procs);
			lwp->lwp_nostop--;
			rv = EAGAIN;
			goto out;
		}
	}
	lwp->lwp_nostop--;
	mutex_exit(&pool_barrier_lock);

	/*
	 * Single-process case: only re-check for PEXITED; the child scan
	 * below is bypassed.
	 */
	if (idtype == P_PID) {
		if ((p = *procs) == NULL)
			goto skip;
		mutex_enter(&p->p_lock);
		/* Drop the process if it is exiting */
		if (p->p_poolflag & PEXITED) {
			mutex_exit(&p->p_lock);
			pool_bind_wake(p);
			procs_count--;
		} else
			mutex_exit(&p->p_lock);
		goto skip;
	}

	/*
	 * Do another run, and drop processes that were inside the barrier
	 * in exit(), but when they have dropped to pool_barrier_exit
	 * they have become of no interest to us.  Pick up child processes that
	 * were created by fork() but didn't exist during our first scan.
	 * Their parents are now stopped at pool_barrier_exit in cfork().
	 */
	mutex_enter(&pidlock);
	for (pp = procs; (p = *pp) != NULL; pp++) {
		mutex_enter(&p->p_lock);
		if (p->p_poolflag & PEXITED) {
			ASSERT(p->p_lwpcnt == 0);
			mutex_exit(&p->p_lock);
			pool_bind_wake(p);
			/* flip w/last non-NULL slot */
			*pp = procs[procs_count - 1];
			procs[procs_count - 1] = NULL;
			procs_count--;
			pp--;			/* try this slot again */
			continue;
		} else
			mutex_exit(&p->p_lock);
		/*
		 * Look at the child and check if it should be rebound also.
		 * We're holding pidlock, so it is safe to reference p_child.
		 */
		if ((p = p->p_child) == NULL)
			continue;

		mutex_enter(&p->p_lock);

		/*
		 * Skip system processes and make sure that the child is in
		 * the same task/project/pool/zone as the parent.
		 */
		if ((!INGLOBALZONE(p) && idtype != P_ZONEID &&
		    idtype != P_POOLID) || p->p_flag & SSYS) {
			mutex_exit(&p->p_lock);
			continue;
		}

		/*
		 * If the child process has been already created by fork(), has
		 * not exited, and has not been added to the list already,
		 * then add it now.  We will hit this process again (since we
		 * stick it at the end of the procs list) but it will ignored
		 * because it will have the PBWAIT flag set.
		 */
		if (procinset(p, &set) &&
		    !(p->p_poolflag & PEXITED) &&
		    !(p->p_poolflag & PBWAIT)) {
			ASSERT(p->p_child == NULL); /* no child of a child */
			procs[procs_count] = p;
			procs[procs_count + 1] = NULL;
			procs_count++;
			p->p_poolflag |= PBWAIT;
		}
		mutex_exit(&p->p_lock);
	}
	mutex_exit(&pidlock);
skip:
	/*
	 * If there's no processes to rebind then return ESRCH, unless
	 * we're associating a pool with new resource set, destroying it,
	 * or binding a zone to a pool.
	 */
	if (procs_count == 0) {
		if (idtype == P_POOLID || idtype == P_ZONEID)
			rv = 0;
		else
			rv = ESRCH;
		goto out;
	}

#ifdef DEBUG
	/*
	 * All processes in the array should have PBWAIT set, and none
	 * should be in the critical section. Thus, although p_poolflag
	 * and p_poolcnt are protected by p_lock, their ASSERTions below
	 * should be stable without it. procinset(), however, ASSERTs that
	 * the p_lock is held upon entry.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		int in_set;

		mutex_enter(&p->p_lock);
		in_set = procinset(p, &set);
		mutex_exit(&p->p_lock);

		ASSERT(in_set);
		ASSERT(p->p_poolflag & PBWAIT);
		ASSERT(p->p_poolcnt == 0);
	}
#endif

	/*
	 * Do the check if processor set rebinding is going to succeed or not.
	 */
	if ((flags & POOL_BIND_PSET) &&
	    (rv = pset_bind_start(procs, pool)) != 0) {
		pool_bind_wakeall(procs);
		goto out;
	}

	/*
	 * At this point, all bind operations should succeed.
	 */
	for (pp = procs; (p = *pp) != NULL; pp++) {
		if (flags & POOL_BIND_PSET) {
			psetid_t psetid = pool->pool_pset->pset_id;
			void *zonebuf;
			void *projbuf;

			/*
			 * Pre-allocate one buffer for FSS (per-project
			 * buffer for a new pset) in case if this is the
			 * first thread from its current project getting
			 * bound to this processor set.
			 */
			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);

			mutex_enter(&pidlock);
			mutex_enter(&p->p_lock);
			pool_pset_bind(p, psetid, projbuf, zonebuf);
			mutex_exit(&p->p_lock);
			mutex_exit(&pidlock);
			/*
			 * Free buffers pre-allocated above if it
			 * wasn't actually used.
			 */
			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
		}
		/*
		 * Now let's change the scheduling class of this
		 * process if our target pool has it defined.
		 */
		if (cid != POOL_CLASS_UNSET)
			pool_change_class(p, cid);

		/*
		 * It is safe to reference p_pool here without holding
		 * p_lock because it cannot change underneath of us.
		 * We're holding pool_lock here, so nobody else can be
		 * moving this process between pools.  If process "p"
		 * would be exiting, we're guaranteed that it would be blocked
		 * at pool_barrier_enter() in exit().  Otherwise, it would've
		 * been skipped by one of our scans of the practive list
		 * as a process with PEXITED flag set.
		 */
		if (p->p_pool != pool) {
			ASSERT(p->p_pool->pool_ref > 0);
			atomic_dec_32(&p->p_pool->pool_ref);
			p->p_pool = pool;
			atomic_inc_32(&p->p_pool->pool_ref);
		}
		/*
		 * Okay, we've tortured this guy enough.
		 * Let this poor process go now.
		 */
		pool_bind_wake(p);
	}
	if (flags & POOL_BIND_PSET)
		pset_bind_finish();

	/*
	 * Common exit: undo the project/zone setup done at the top.  The
	 * zone's pool is only switched when the bind succeeded, but
	 * zone_pool_mod is updated either way.
	 */
out:	switch (idtype) {
	case P_PROJID:
		ASSERT(kpj != NULL);
		mutex_exit(&kpj->kpj_poolbind);
		project_rele(kpj);
		break;
	case P_ZONEID:
		if (rv == 0) {
			mutex_enter(&cpu_lock);
			zone_pool_set(zone, pool);
			mutex_exit(&cpu_lock);
		}
		zone->zone_pool_mod = gethrtime();
		zone_rele(zone);
		break;
	}

	kmem_free(procs, procs_size * sizeof (proc_t *));
	ASSERT(pool_barrier_count == 0);
	return (rv);
}
1724 
1725 void
1726 pool_event_cb_register(pool_event_cb_t *cb)
1727 {
1728 	ASSERT(!pool_lock_held() || panicstr);
1729 	ASSERT(cb->pec_func != NULL);
1730 
1731 	mutex_enter(&pool_event_cb_lock);
1732 	if (!pool_event_cb_init) {
1733 		list_create(&pool_event_cb_list,  sizeof (pool_event_cb_t),
1734 		    offsetof(pool_event_cb_t, pec_list));
1735 		pool_event_cb_init = B_TRUE;
1736 	}
1737 	list_insert_tail(&pool_event_cb_list, cb);
1738 	mutex_exit(&pool_event_cb_lock);
1739 }
1740 
1741 void
1742 pool_event_cb_unregister(pool_event_cb_t *cb)
1743 {
1744 	ASSERT(!pool_lock_held() || panicstr);
1745 
1746 	mutex_enter(&pool_event_cb_lock);
1747 	list_remove(&pool_event_cb_list, cb);
1748 	mutex_exit(&pool_event_cb_lock);
1749 }
1750 
/*
 * Argument block handed from pool_event_dispatch() to pool_event_notify()
 * through the taskq.  Allocated by the dispatcher, freed by the notifier.
 */
typedef struct {
	pool_event_t	tqd_what;	/* event passed to each callback */
	poolid_t	tqd_id;		/* pool id passed to each callback */
} pool_tqd_t;
1755 
1756 void
1757 pool_event_notify(void *arg)
1758 {
1759 	pool_tqd_t	*tqd = (pool_tqd_t *)arg;
1760 	pool_event_cb_t	*cb;
1761 
1762 	ASSERT(!pool_lock_held() || panicstr);
1763 
1764 	mutex_enter(&pool_event_cb_lock);
1765 	for (cb = list_head(&pool_event_cb_list); cb != NULL;
1766 	    cb = list_next(&pool_event_cb_list, cb)) {
1767 		cb->pec_func(tqd->tqd_what, tqd->tqd_id, cb->pec_arg);
1768 	}
1769 	mutex_exit(&pool_event_cb_lock);
1770 	kmem_free(tqd, sizeof (*tqd));
1771 }
1772 
1773 void
1774 pool_event_dispatch(pool_event_t what, poolid_t id)
1775 {
1776 	pool_tqd_t *tqd = NULL;
1777 
1778 	ASSERT(pool_lock_held());
1779 
1780 	if (pool_event_cb_taskq == NULL) {
1781 		pool_event_cb_taskq = taskq_create("pool_event_cb_taskq", 1,
1782 		    -1, 1, 1, TASKQ_PREPOPULATE);
1783 	}
1784 
1785 	tqd = kmem_alloc(sizeof (*tqd), KM_SLEEP);
1786 	tqd->tqd_what = what;
1787 	tqd->tqd_id = id;
1788 
1789 	(void) taskq_dispatch(pool_event_cb_taskq, pool_event_notify, tqd,
1790 	    KM_SLEEP);
1791 }
1792