xref: /titanic_51/usr/src/uts/common/os/pool.c (revision fba5460f9ff1a4800cb5456354cb51962a5c092a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/pool.h>
29 #include <sys/pool_impl.h>
30 #include <sys/pool_pset.h>
31 #include <sys/id_space.h>
32 #include <sys/mutex.h>
33 #include <sys/nvpair.h>
34 #include <sys/cpuvar.h>
35 #include <sys/errno.h>
36 #include <sys/cmn_err.h>
37 #include <sys/systm.h>
38 #include <sys/proc.h>
39 #include <sys/fss.h>
40 #include <sys/class.h>
41 #include <sys/exacct.h>
42 #include <sys/utsname.h>
43 #include <sys/procset.h>
44 #include <sys/atomic.h>
45 #include <sys/zone.h>
46 #include <sys/policy.h>
47 
48 /*
49  * RESOURCE POOLS
50  *
 * The resource pools facility brings together process-bindable resources into
52  * a common abstraction called a pool. Processor sets and other entities can
53  * be configured, grouped, and labelled such that workload components can be
54  * associated with a subset of a system's total resources.
55  *
56  * When disabled, the pools facility is "invisible".  All processes belong
57  * to the same pool (pool_default), and processor sets can be managed through
58  * the old pset() system call.  When enabled, processor sets can only be
59  * managed via the pools facility.  New pools can be created and associated
60  * with processor sets.  Processes can be bound to pools which have non-empty
61  * resource sets.
62  *
63  * Locking: pool_lock() protects global pools state and must be called
64  * before modifying the configuration, or when taking a snapshot of the
65  * configuration.  If pool_lock_intr() is used, the operation may be
66  * interrupted by a signal or a request.
67  *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit().  This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
75  *
76  * Lock ordering with respect to other locks is as follows:
77  *
78  * 	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
79  *
80  * Most static and global variables defined in this file are protected
81  * by calling pool_lock().
82  *
83  * The operation that binds tasks and projects to pools is atomic.  That is,
84  * either all processes in a given task or a project will be bound to a
85  * new pool, or (in case of an error) they will be all left bound to the
86  * old pool. Processes in a given task or a given project can only be bound to
87  * different pools if they were rebound individually one by one as single
88  * processes.  Threads or LWPs of the same process do not have pool bindings,
89  * and are bound to the same resource sets associated with the resource pool
90  * of that process.
91  *
92  * The following picture shows one possible pool configuration with three
93  * pools and three processor sets.  Note that processor set "foo" is not
94  * associated with any pools and therefore cannot have any processes
95  * bound to it.  Two pools (default and foo) are associated with the
96  * same processor set (default).  Also, note that processes in Task 2
97  * are bound to different pools.
98  *
99  *
100  *							       Processor Sets
101  *								+---------+
102  *		       +--------------+========================>| default |
103  *		      a|	      |				+---------+
104  *		      s|	      |				    ||
105  *		      s|	      |				+---------+
106  *		      o|	      |				|   foo   |
107  *		      c|	      |				+---------+
108  *		      i|	      |				    ||
109  *		      a|	      |				+---------+
110  *		      t|	      |			+------>|   bar   |
111  *		      e|	      |			|	+---------+
112  *                    d|              |                 |
113  *                     |              |                 |
114  *	       +---------+      +---------+      +---------+
115  *     Pools   | default |======|   foo   |======|   bar   |
116  *	       +---------+      +---------+      +---------+
117  *	           @  @            @              @ @   @
118  *                b|  |            |              | |   |
119  *                o|  |            |              | |   |
120  *                u|  +-----+      |      +-------+ |   +---+
121  *                n|        |      |      |         |       |
122  *            ....d|........|......|......|.........|.......|....
123  *            :    |   ::   |      |      |    ::   |       |   :
124  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
125  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
126  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
127  *            :........::......................::...............:
128  *              Task 1            Task 2              Task N
129  *                 |                 |                  |
130  *                 |                 |                  |
131  *                 |  +-----------+  |             +-----------+
132  *                 +--| Project 1 |--+             | Project N |
133  *                    +-----------+                +-----------+
134  *
135  * This is just an illustration of relationships between processes, tasks,
136  * projects, pools, and processor sets. New types of resource sets will be
137  * added in the future.
138  */
139 
140 pool_t		*pool_default;	/* default pool which always exists */
141 int		pool_count;	/* number of pools created on this system */
142 int		pool_state;	/* pools state -- enabled/disabled */
143 void		*pool_buf;	/* pre-commit snapshot of the pools state */
144 size_t		pool_bufsz;	/* size of pool_buf */
145 static hrtime_t	pool_pool_mod;	/* last modification time for pools */
146 static hrtime_t	pool_sys_mod;	/* last modification time for system */
147 static nvlist_t	*pool_sys_prop;	/* system properties */
148 static id_space_t *pool_ids;	/* pool ID space */
149 static list_t	pool_list;	/* doubly-linked list of pools */
150 static kmutex_t		pool_mutex;		/* protects pool_busy_* */
151 static kcondvar_t	pool_busy_cv;		/* waiting for "pool_lock" */
152 static kthread_t	*pool_busy_thread;	/* thread holding "pool_lock" */
153 static kmutex_t		pool_barrier_lock;	/* synch. with pool_barrier_* */
154 static kcondvar_t	pool_barrier_cv;	/* synch. with pool_barrier_* */
155 static int		pool_barrier_count;	/* synch. with pool_barrier_* */
156 
157 /*
158  * Boot-time pool initialization.
159  */
160 void
161 pool_init(void)
162 {
163 	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
164 
165 	/*
166 	 * Initialize default pool.
167 	 */
168 	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
169 	pool_default->pool_id = POOL_DEFAULT;
170 	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
171 	list_insert_head(&pool_list, pool_default);
172 
173 	/*
174 	 * Initialize plugins for resource sets.
175 	 */
176 	pool_pset_init();
177 	pool_count = 1;
178 	p0.p_pool = pool_default;
179 	global_zone->zone_pool = pool_default;
180 	pool_default->pool_ref = 1;
181 }
182 
183 /*
184  * Synchronization routines.
185  *
186  * pool_lock is only called from syscall-level routines (processor_bind(),
187  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
188  * periods of time, including across sleeping operations, so we allow its
189  * acquisition to be interruptible.
190  *
191  * The current thread that owns the "lock" is stored in the variable
192  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
193  */
194 void
195 pool_lock(void)
196 {
197 	mutex_enter(&pool_mutex);
198 	while (pool_busy_thread != NULL)
199 		cv_wait(&pool_busy_cv, &pool_mutex);
200 	pool_busy_thread = curthread;
201 	mutex_exit(&pool_mutex);
202 }
203 
204 int
205 pool_lock_intr(void)
206 {
207 	mutex_enter(&pool_mutex);
208 	while (pool_busy_thread != NULL) {
209 		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
210 			cv_signal(&pool_busy_cv);
211 			mutex_exit(&pool_mutex);
212 			return (1);
213 		}
214 	}
215 	pool_busy_thread = curthread;
216 	mutex_exit(&pool_mutex);
217 	return (0);
218 }
219 
220 int
221 pool_lock_held(void)
222 {
223 	return (pool_busy_thread == curthread);
224 }
225 
226 void
227 pool_unlock(void)
228 {
229 	mutex_enter(&pool_mutex);
230 	pool_busy_thread = NULL;
231 	cv_signal(&pool_busy_cv);
232 	mutex_exit(&pool_mutex);
233 }
234 
235 /*
236  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
237  * with pool_do_bind().
238  *
239  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
240  * operations which modify pool or pset associations.  They can be called
241  * while the process is multi-threaded.  In the common case, when current
242  * process is not being rebound (PBWAIT flag is not set), these functions
243  * will be just incrementing and decrementing reference counts.
244  */
245 void
246 pool_barrier_enter(void)
247 {
248 	proc_t *p = curproc;
249 
250 	ASSERT(MUTEX_HELD(&p->p_lock));
251 	while (p->p_poolflag & PBWAIT)
252 		cv_wait(&p->p_poolcv, &p->p_lock);
253 	p->p_poolcnt++;
254 }
255 
256 void
257 pool_barrier_exit(void)
258 {
259 	proc_t *p = curproc;
260 
261 	ASSERT(MUTEX_HELD(&p->p_lock));
262 	ASSERT(p->p_poolcnt > 0);
263 	p->p_poolcnt--;
264 	if (p->p_poolflag & PBWAIT) {
265 		mutex_enter(&pool_barrier_lock);
266 		ASSERT(pool_barrier_count > 0);
267 		pool_barrier_count--;
268 		if (pool_barrier_count == 0)
269 			cv_signal(&pool_barrier_cv);
270 		mutex_exit(&pool_barrier_lock);
271 		while (p->p_poolflag & PBWAIT)
272 			cv_wait(&p->p_poolcv, &p->p_lock);
273 	}
274 }
275 
276 /*
277  * Enable pools facility.
278  */
279 static int
280 pool_enable(void)
281 {
282 	int ret;
283 
284 	ASSERT(pool_lock_held());
285 	ASSERT(pool_count == 1);
286 
287 	ret = pool_pset_enable();
288 	if (ret != 0)
289 		return (ret);
290 	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
291 	(void) nvlist_add_string(pool_sys_prop, "system.name",
292 	    "default");
293 	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
294 	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
295 	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
296 	(void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
297 	    "wt-load");
298 
299 	(void) nvlist_alloc(&pool_default->pool_props,
300 	    NV_UNIQUE_NAME, KM_SLEEP);
301 	(void) nvlist_add_string(pool_default->pool_props,
302 	    "pool.name", "pool_default");
303 	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
304 	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
305 	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
306 	(void) nvlist_add_int64(pool_default->pool_props,
307 	    "pool.importance", 1);
308 	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
309 	    pool_default->pool_id);
310 
311 	pool_sys_mod = pool_pool_mod = gethrtime();
312 
313 	return (ret);
314 }
315 
316 /*
317  * Disable pools facility.
318  */
319 static int
320 pool_disable(void)
321 {
322 	int ret;
323 
324 	ASSERT(pool_lock_held());
325 
326 	if (pool_count > 1)	/* must destroy all pools first */
327 		return (EBUSY);
328 
329 	ret = pool_pset_disable();
330 	if (ret != 0)
331 		return (ret);
332 	if (pool_sys_prop != NULL) {
333 		nvlist_free(pool_sys_prop);
334 		pool_sys_prop = NULL;
335 	}
336 	if (pool_default->pool_props != NULL) {
337 		nvlist_free(pool_default->pool_props);
338 		pool_default->pool_props = NULL;
339 	}
340 	return (0);
341 }
342 
343 pool_t *
344 pool_lookup_pool_by_name(char *name)
345 {
346 	pool_t *pool = pool_default;
347 	char *p;
348 
349 	ASSERT(pool_lock_held());
350 	for (pool = list_head(&pool_list); pool;
351 	    pool = list_next(&pool_list, pool)) {
352 		if (nvlist_lookup_string(pool->pool_props,
353 		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
354 			return (pool);
355 	}
356 	return (NULL);
357 }
358 
359 pool_t *
360 pool_lookup_pool_by_id(poolid_t poolid)
361 {
362 	pool_t *pool = pool_default;
363 
364 	ASSERT(pool_lock_held());
365 	for (pool = list_head(&pool_list); pool;
366 	    pool = list_next(&pool_list, pool)) {
367 		if (pool->pool_id == poolid)
368 			return (pool);
369 	}
370 	return (NULL);
371 }
372 
373 /*
374  * Create new pool, associate it with default resource sets, and give
375  * it a temporary name.
376  */
377 static int
378 pool_pool_create(poolid_t *poolid)
379 {
380 	pool_t *pool;
381 	char pool_name[40];
382 
383 	ASSERT(pool_lock_held());
384 
385 	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
386 	pool->pool_id = *poolid = id_alloc(pool_ids);
387 	pool->pool_pset = pool_pset_default;
388 	pool_pset_default->pset_npools++;
389 	list_insert_tail(&pool_list, pool);
390 	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
391 	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
392 	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
393 	pool_pool_mod = gethrtime();
394 	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
395 	    pool_pool_mod);
396 	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
397 	pool_count++;
398 	return (0);
399 }
400 
401 struct destroy_zone_arg {
402 	pool_t *old;
403 	pool_t *new;
404 };
405 
406 /*
407  * Update pool pointers for zones that are currently bound to pool "old"
408  * to be bound to pool "new".
409  */
410 static int
411 pool_destroy_zone_cb(zone_t *zone, void *arg)
412 {
413 	struct destroy_zone_arg *dza = arg;
414 
415 	ASSERT(pool_lock_held());
416 	ASSERT(MUTEX_HELD(&cpu_lock));
417 
418 	if (zone_pool_get(zone) == dza->old)
419 		zone_pool_set(zone, dza->new);
420 	return (0);
421 }
422 
423 /*
424  * Destroy specified pool, and rebind all processes in it
425  * to the default pool.
426  */
427 static int
428 pool_pool_destroy(poolid_t poolid)
429 {
430 	pool_t *pool;
431 	int ret;
432 
433 	ASSERT(pool_lock_held());
434 
435 	if (poolid == POOL_DEFAULT)
436 		return (EINVAL);
437 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
438 		return (ESRCH);
439 	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
440 	if (ret == 0) {
441 		struct destroy_zone_arg dzarg;
442 
443 		dzarg.old = pool;
444 		dzarg.new = pool_default;
445 		mutex_enter(&cpu_lock);
446 		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
447 		mutex_exit(&cpu_lock);
448 		ASSERT(ret == 0);
449 		ASSERT(pool->pool_ref == 0);
450 		(void) nvlist_free(pool->pool_props);
451 		id_free(pool_ids, pool->pool_id);
452 		pool->pool_pset->pset_npools--;
453 		list_remove(&pool_list, pool);
454 		pool_count--;
455 		pool_pool_mod = gethrtime();
456 		kmem_free(pool, sizeof (pool_t));
457 	}
458 	return (ret);
459 }
460 
461 /*
462  * Create new pool or resource set.
463  */
464 int
465 pool_create(int class, int subclass, id_t *id)
466 {
467 	int ret;
468 
469 	ASSERT(pool_lock_held());
470 	if (pool_state == POOL_DISABLED)
471 		return (ENOTACTIVE);
472 	switch (class) {
473 	case PEC_POOL:
474 		ret = pool_pool_create((poolid_t *)id);
475 		break;
476 	case PEC_RES_COMP:
477 		switch (subclass) {
478 		case PREC_PSET:
479 			ret = pool_pset_create((psetid_t *)id);
480 			break;
481 		default:
482 			ret = EINVAL;
483 		}
484 		break;
485 	case PEC_RES_AGG:
486 		ret = ENOTSUP;
487 		break;
488 	default:
489 		ret = EINVAL;
490 	}
491 	return (ret);
492 }
493 
494 /*
495  * Destroy an existing pool or resource set.
496  */
497 int
498 pool_destroy(int class, int subclass, id_t id)
499 {
500 	int ret;
501 
502 	ASSERT(pool_lock_held());
503 	if (pool_state == POOL_DISABLED)
504 		return (ENOTACTIVE);
505 	switch (class) {
506 	case PEC_POOL:
507 		ret = pool_pool_destroy((poolid_t)id);
508 		break;
509 	case PEC_RES_COMP:
510 		switch (subclass) {
511 		case PREC_PSET:
512 			ret = pool_pset_destroy((psetid_t)id);
513 			break;
514 		default:
515 			ret = EINVAL;
516 		}
517 		break;
518 	case PEC_RES_AGG:
519 		ret = ENOTSUP;
520 		break;
521 	default:
522 		ret = EINVAL;
523 	}
524 	return (ret);
525 }
526 
527 /*
528  * Enable or disable pools.
529  */
530 int
531 pool_status(int status)
532 {
533 	int ret = 0;
534 
535 	ASSERT(pool_lock_held());
536 
537 	if (pool_state == status)
538 		return (0);
539 	switch (status) {
540 	case POOL_ENABLED:
541 		ret = pool_enable();
542 		if (ret != 0)
543 			return (ret);
544 		pool_state = POOL_ENABLED;
545 		break;
546 	case POOL_DISABLED:
547 		ret = pool_disable();
548 		if (ret != 0)
549 			return (ret);
550 		pool_state = POOL_DISABLED;
551 		break;
552 	default:
553 		ret = EINVAL;
554 	}
555 	return (ret);
556 }
557 
558 /*
559  * Associate pool with resource set.
560  */
561 int
562 pool_assoc(poolid_t poolid, int idtype, id_t id)
563 {
564 	int ret;
565 
566 	ASSERT(pool_lock_held());
567 	if (pool_state == POOL_DISABLED)
568 		return (ENOTACTIVE);
569 	switch (idtype) {
570 	case PREC_PSET:
571 		ret = pool_pset_assoc(poolid, (psetid_t)id);
572 		break;
573 	default:
574 		ret = EINVAL;
575 	}
576 	if (ret == 0)
577 		pool_pool_mod = gethrtime();
578 	return (ret);
579 }
580 
581 /*
582  * Disassociate resource set from pool.
583  */
584 int
585 pool_dissoc(poolid_t poolid, int idtype)
586 {
587 	int ret;
588 
589 	ASSERT(pool_lock_held());
590 	if (pool_state == POOL_DISABLED)
591 		return (ENOTACTIVE);
592 	switch (idtype) {
593 	case PREC_PSET:
594 		ret = pool_pset_assoc(poolid, PS_NONE);
595 		break;
596 	default:
597 		ret = EINVAL;
598 	}
599 	if (ret == 0)
600 		pool_pool_mod = gethrtime();
601 	return (ret);
602 }
603 
604 /*
605  * Transfer specified quantity of resources between resource sets.
606  */
607 /*ARGSUSED*/
608 int
609 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
610 {
611 	int ret = EINVAL;
612 	return (ret);
613 }
614 
615 /*
616  * Transfer resources specified by their IDs between resource sets.
617  */
618 int
619 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids)
620 {
621 	int ret;
622 
623 	ASSERT(pool_lock_held());
624 	if (pool_state == POOL_DISABLED)
625 		return (ENOTACTIVE);
626 	switch (type) {
627 	case PREC_PSET:
628 		ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst,
629 		    size, ids);
630 		break;
631 	default:
632 		ret = EINVAL;
633 	}
634 	return (ret);
635 }
636 
637 /*
638  * Bind processes to pools.
639  */
640 int
641 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
642 {
643 	pool_t *pool;
644 
645 	ASSERT(pool_lock_held());
646 
647 	if (pool_state == POOL_DISABLED)
648 		return (ENOTACTIVE);
649 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
650 		return (ESRCH);
651 
652 	switch (idtype) {
653 	case P_PID:
654 	case P_TASKID:
655 	case P_PROJID:
656 	case P_ZONEID:
657 		break;
658 	default:
659 		return (EINVAL);
660 	}
661 	return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
662 }
663 
664 /*
665  * Query pool binding of the specifed process.
666  */
667 int
668 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
669 {
670 	proc_t *p;
671 
672 	if (idtype != P_PID)
673 		return (ENOTSUP);
674 	if (id == P_MYID)
675 		id = curproc->p_pid;
676 
677 	ASSERT(pool_lock_held());
678 
679 	mutex_enter(&pidlock);
680 	if ((p = prfind((pid_t)id)) == NULL) {
681 		mutex_exit(&pidlock);
682 		return (ESRCH);
683 	}
684 	mutex_enter(&p->p_lock);
685 	/*
686 	 * In local zones, lie about pool bindings of processes from
687 	 * the global zone.
688 	 */
689 	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
690 		pool_t *pool;
691 
692 		pool = zone_pool_get(curproc->p_zone);
693 		*poolid = pool->pool_id;
694 	} else {
695 		*poolid = p->p_pool->pool_id;
696 	}
697 	mutex_exit(&p->p_lock);
698 	mutex_exit(&pidlock);
699 	return (0);
700 }
701 
702 static ea_object_t *
703 pool_system_pack(void)
704 {
705 	ea_object_t *eo_system;
706 	size_t bufsz = 0;
707 	char *buf = NULL;
708 
709 	ASSERT(pool_lock_held());
710 
711 	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
712 	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
713 	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
714 	if (INGLOBALZONE(curproc))
715 		(void) ea_attach_item(eo_system, &pool_pool_mod,
716 		    sizeof (hrtime_t),
717 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
718 	else
719 		(void) ea_attach_item(eo_system,
720 		    &curproc->p_zone->zone_pool_mod,
721 		    sizeof (hrtime_t),
722 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
723 	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
724 	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
725 	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
726 	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
727 	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
728 	(void) ea_attach_item(eo_system, buf, bufsz,
729 	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
730 	kmem_free(buf, bufsz);
731 	return (eo_system);
732 }
733 
734 /*
735  * Pack information about pools and attach it to specified exacct group.
736  */
737 static int
738 pool_pool_pack(ea_object_t *eo_system)
739 {
740 	ea_object_t *eo_pool;
741 	pool_t *pool;
742 	size_t bufsz;
743 	char *buf;
744 	pool_t *myzonepool;
745 
746 	ASSERT(pool_lock_held());
747 	myzonepool = zone_pool_get(curproc->p_zone);
748 	for (pool = list_head(&pool_list); pool;
749 	    pool = list_next(&pool_list, pool)) {
750 		if (!INGLOBALZONE(curproc) && myzonepool != pool)
751 			continue;
752 		bufsz = 0;
753 		buf = NULL;
754 		eo_pool = ea_alloc_group(EXT_GROUP |
755 		    EXC_LOCAL | EXD_GROUP_POOL);
756 		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
757 		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
758 		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
759 		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
760 		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
761 		    NV_ENCODE_NATIVE, 0);
762 		(void) ea_attach_item(eo_pool, buf, bufsz,
763 		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
764 		kmem_free(buf, bufsz);
765 		(void) ea_attach_to_group(eo_system, eo_pool);
766 	}
767 	return (0);
768 }
769 
770 /*
771  * Pack the whole pool configuration in the specified buffer.
772  */
773 int
774 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
775 {
776 	ea_object_t *eo_system;
777 	size_t ksize;
778 	int ret = 0;
779 
780 	ASSERT(pool_lock_held());
781 
782 	eo_system = pool_system_pack();		/* 1. pack system */
783 	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
784 	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
785 	ksize = ea_pack_object(eo_system, NULL, 0);
786 	if (kbuf == NULL || kbufsz == 0)
787 		*asize = ksize;
788 	else if (ksize > kbufsz)
789 		ret = ENOMEM;
790 	else
791 		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
792 	ea_free_object(eo_system, EUP_ALLOC);
793 	return (ret);
794 }
795 
796 /*
797  * Start/end the commit transaction.  If commit transaction is currently
798  * in progress, then all POOL_QUERY ioctls will return pools configuration
799  * at the beginning of transaction.
800  */
801 int
802 pool_commit(int state)
803 {
804 	ea_object_t *eo_system;
805 	int ret = 0;
806 
807 	ASSERT(pool_lock_held());
808 
809 	if (pool_state == POOL_DISABLED)
810 		return (ENOTACTIVE);
811 	switch (state) {
812 	case 1:
813 		/*
814 		 * Beginning commit transation.
815 		 */
816 		if (pool_buf != NULL)		/* transaction in progress */
817 			return (EBUSY);
818 		eo_system = pool_system_pack();		/* 1. pack system */
819 		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
820 		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
821 		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
822 		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
823 		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
824 		ea_free_object(eo_system, EUP_ALLOC);
825 		break;
826 	case 0:
827 		/*
828 		 * Finishing commit transaction.
829 		 */
830 		if (pool_buf != NULL) {
831 			kmem_free(pool_buf, pool_bufsz);
832 			pool_buf = NULL;
833 			pool_bufsz = 0;
834 		}
835 		break;
836 	default:
837 		ret = EINVAL;
838 	}
839 	return (ret);
840 }
841 
842 /*
843  * Check is the specified property is special
844  */
845 static pool_property_t *
846 pool_property_find(char *name, pool_property_t *list)
847 {
848 	pool_property_t *prop;
849 
850 	for (prop = list; prop->pp_name != NULL; prop++)
851 		if (strcmp(prop->pp_name, name) == 0)
852 			return (prop);
853 	return (NULL);
854 }
855 
856 static pool_property_t pool_prop_sys[] = {
857 	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
858 	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
859 	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
860 	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
861 	{ "system.allocate-method",	DATA_TYPE_STRING,
862 	    PP_RDWR | PP_OPTIONAL },
863 	{ "system.poold.log-level",	DATA_TYPE_STRING,
864 	    PP_RDWR | PP_OPTIONAL },
865 	{ "system.poold.log-location",	DATA_TYPE_STRING,
866 	    PP_RDWR | PP_OPTIONAL },
867 	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
868 	    PP_RDWR | PP_OPTIONAL },
869 	{ "system.poold.history-file",	DATA_TYPE_STRING,
870 	    PP_RDWR | PP_OPTIONAL },
871 	{ "system.poold.objectives",	DATA_TYPE_STRING,
872 	    PP_RDWR | PP_OPTIONAL },
873 	{ NULL,				0,			0 }
874 };
875 
876 static pool_property_t pool_prop_pool[] = {
877 	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
878 	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
879 	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
880 	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
881 	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
882 	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
883 	{ "pool.scheduler",		DATA_TYPE_STRING,
884 	    PP_RDWR | PP_OPTIONAL },
885 	{ NULL,				0,			0 }
886 };
887 
888 /*
889  * Common routine to put new property on the specified list
890  */
891 int
892 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
893 {
894 	pool_property_t *prop;
895 
896 	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
897 		/*
898 		 * No read-only properties or properties with bad types
899 		 */
900 		if (!(prop->pp_perm & PP_WRITE) ||
901 		    prop->pp_type != nvpair_type(pair))
902 			return (EINVAL);
903 	}
904 	return (nvlist_add_nvpair(nvlist, pair));
905 }
906 
907 /*
908  * Common routine to remove property from the given list
909  */
910 int
911 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
912 {
913 	pool_property_t *prop;
914 
915 	if ((prop = pool_property_find(name, props)) != NULL) {
916 		if (!(prop->pp_perm & PP_OPTIONAL))
917 			return (EINVAL);
918 	}
919 	return (nvlist_remove_all(nvlist, name));
920 }
921 
922 static int
923 pool_system_propput(nvpair_t *pair)
924 {
925 	int ret;
926 
927 	ASSERT(pool_lock_held());
928 	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
929 	if (ret == 0)
930 		pool_sys_mod = gethrtime();
931 	return (ret);
932 }
933 
934 static int
935 pool_system_proprm(char *name)
936 {
937 	int ret;
938 
939 	ASSERT(pool_lock_held());
940 	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
941 	if (ret == 0)
942 		pool_sys_mod = gethrtime();
943 	return (ret);
944 }
945 
946 static int
947 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
948 {
949 	pool_t *pool;
950 	int ret;
951 
952 	ASSERT(pool_lock_held());
953 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
954 		return (ESRCH);
955 	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
956 	if (ret == 0)
957 		pool_pool_mod = gethrtime();
958 	return (ret);
959 }
960 
961 static int
962 pool_pool_proprm(poolid_t poolid, char *name)
963 {
964 	int ret;
965 	pool_t *pool;
966 
967 	ASSERT(pool_lock_held());
968 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
969 		return (ESRCH);
970 	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
971 	if (ret == 0)
972 		pool_pool_mod = gethrtime();
973 	return (ret);
974 }
975 
976 int
977 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
978 {
979 	int ret;
980 
981 	ASSERT(pool_lock_held());
982 	if (pool_state == POOL_DISABLED)
983 		return (ENOTACTIVE);
984 	switch (class) {
985 	case PEC_SYSTEM:
986 		ret = pool_system_propput(pair);
987 		break;
988 	case PEC_POOL:
989 		ret = pool_pool_propput((poolid_t)id, pair);
990 		break;
991 	case PEC_RES_COMP:
992 		switch (subclass) {
993 		case PREC_PSET:
994 			ret = pool_pset_propput((psetid_t)id, pair);
995 			break;
996 		default:
997 			ret = EINVAL;
998 		}
999 		break;
1000 	case PEC_RES_AGG:
1001 		ret = ENOTSUP;
1002 		break;
1003 	case PEC_COMP:
1004 		switch (subclass) {
1005 		case PCEC_CPU:
1006 			ret = pool_cpu_propput((processorid_t)id, pair);
1007 			break;
1008 		default:
1009 			ret = EINVAL;
1010 		}
1011 		break;
1012 	default:
1013 		ret = EINVAL;
1014 	}
1015 	return (ret);
1016 }
1017 
1018 int
1019 pool_proprm(int class, int subclass, id_t id, char *name)
1020 {
1021 	int ret;
1022 
1023 	ASSERT(pool_lock_held());
1024 	if (pool_state == POOL_DISABLED)
1025 		return (ENOTACTIVE);
1026 	switch (class) {
1027 	case PEC_SYSTEM:
1028 		ret = pool_system_proprm(name);
1029 		break;
1030 	case PEC_POOL:
1031 		ret = pool_pool_proprm((poolid_t)id, name);
1032 		break;
1033 	case PEC_RES_COMP:
1034 		switch (subclass) {
1035 		case PREC_PSET:
1036 			ret = pool_pset_proprm((psetid_t)id, name);
1037 			break;
1038 		default:
1039 			ret = EINVAL;
1040 		}
1041 		break;
1042 	case PEC_RES_AGG:
1043 		ret = ENOTSUP;
1044 		break;
1045 	case PEC_COMP:
1046 		switch (subclass) {
1047 		case PCEC_CPU:
1048 			ret = pool_cpu_proprm((processorid_t)id, name);
1049 			break;
1050 		default:
1051 			ret = EINVAL;
1052 		}
1053 		break;
1054 	default:
1055 		ret = EINVAL;
1056 	}
1057 	return (ret);
1058 }
1059 
1060 int
1061 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1062 {
1063 	int ret;
1064 	nvlist_t *nvl;
1065 
1066 	ASSERT(pool_lock_held());
1067 	if (pool_state == POOL_DISABLED)
1068 		return (ENOTACTIVE);
1069 
1070 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1071 
1072 	switch (class) {
1073 	case PEC_SYSTEM:
1074 	case PEC_POOL:
1075 		ret = EINVAL;
1076 		break;
1077 	case PEC_RES_COMP:
1078 		switch (subclass) {
1079 		case PREC_PSET:
1080 			ret = pool_pset_propget((psetid_t)id, name, nvl);
1081 			break;
1082 		default:
1083 			ret = EINVAL;
1084 		}
1085 		break;
1086 	case PEC_RES_AGG:
1087 		ret = ENOTSUP;
1088 		break;
1089 	case PEC_COMP:
1090 		switch (subclass) {
1091 		case PCEC_CPU:
1092 			ret = pool_cpu_propget((processorid_t)id, name, nvl);
1093 			break;
1094 		default:
1095 			ret = EINVAL;
1096 		}
1097 		break;
1098 	default:
1099 		ret = EINVAL;
1100 	}
1101 	if (ret == 0)
1102 		*nvlp = nvl;
1103 	else
1104 		nvlist_free(nvl);
1105 	return (ret);
1106 }
1107 
1108 /*
1109  * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1110  * in case of failure in pool_do_bind().
1111  */
1112 static void
1113 pool_bind_wake(proc_t *p)
1114 {
1115 	ASSERT(pool_lock_held());
1116 
1117 	mutex_enter(&p->p_lock);
1118 	ASSERT(p->p_poolflag & PBWAIT);
1119 	if (p->p_poolcnt > 0) {
1120 		mutex_enter(&pool_barrier_lock);
1121 		pool_barrier_count -= p->p_poolcnt;
1122 		mutex_exit(&pool_barrier_lock);
1123 	}
1124 	p->p_poolflag &= ~PBWAIT;
1125 	cv_signal(&p->p_poolcv);
1126 	mutex_exit(&p->p_lock);
1127 }
1128 
1129 static void
1130 pool_bind_wakeall(proc_t **procs)
1131 {
1132 	proc_t *p, **pp;
1133 
1134 	ASSERT(pool_lock_held());
1135 	for (pp = procs; (p = *pp) != NULL; pp++)
1136 		pool_bind_wake(p);
1137 }
1138 
1139 /*
1140  * Return the scheduling class for this pool, or
1141  * 	POOL_CLASS_UNSET if not set
1142  * 	POOL_CLASS_INVAL if set to an invalid class ID.
1143  */
1144 id_t
1145 pool_get_class(pool_t *pool)
1146 {
1147 	char *name;
1148 	id_t cid;
1149 
1150 	ASSERT(pool_lock_held());
1151 
1152 	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1153 	    &name) == 0) {
1154 		if (getcidbyname(name, &cid) == 0)
1155 			return (cid);
1156 		else
1157 			return (POOL_CLASS_INVAL);
1158 	}
1159 	return (POOL_CLASS_UNSET);
1160 }
1161 
1162 /*
1163  * Move process to the new scheduling class.
1164  */
1165 static void
1166 pool_change_class(proc_t *p, id_t cid)
1167 {
1168 	kthread_t *t;
1169 	void *cldata;
1170 	id_t oldcid;
1171 	void **bufs;
1172 	void **buf;
1173 	int nlwp;
1174 	int ret;
1175 	int i;
1176 
1177 	/*
1178 	 * Do not move kernel processes (such as zsched).
1179 	 */
1180 	if (p->p_flag & SSYS)
1181 		return;
1182 	/*
1183 	 * This process is in the pool barrier, so it can't possibly be
1184 	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1185 	 * (for possible agent LWP which doesn't use pool barrier) as
1186 	 * our upper bound.
1187 	 */
1188 	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1189 
1190 	/*
1191 	 * Pre-allocate scheduling class specific buffers before
1192 	 * grabbing p_lock.
1193 	 */
1194 	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1195 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1196 		ret = CL_ALLOC(buf, cid, KM_SLEEP);
1197 		ASSERT(ret == 0);
1198 	}
1199 
1200 	/*
1201 	 * Move threads one by one to the new scheduling class.
1202 	 * This never fails because we have all the right
1203 	 * privileges here.
1204 	 */
1205 	mutex_enter(&p->p_lock);
1206 	ASSERT(p->p_poolflag & PBWAIT);
1207 	buf = bufs;
1208 	t = p->p_tlist;
1209 	ASSERT(t != NULL);
1210 	do {
1211 		if (t->t_cid != cid) {
1212 			oldcid = t->t_cid;
1213 			cldata = t->t_cldata;
1214 			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1215 			ASSERT(ret == 0);
1216 			CL_EXITCLASS(oldcid, cldata);
1217 			*buf++ = NULL;
1218 		}
1219 	} while ((t = t->t_forw) != p->p_tlist);
1220 	mutex_exit(&p->p_lock);
1221 	/*
1222 	 * Free unused scheduling class specific buffers.
1223 	 */
1224 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1225 		if (*buf != NULL) {
1226 			CL_FREE(cid, *buf);
1227 			*buf = NULL;
1228 		}
1229 	}
1230 	kmem_free(bufs, nlwp * sizeof (void *));
1231 }
1232 
1233 /*
1234  * The meat of the bind operation.  The steps in pool_do_bind are:
1235  *
1236  * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1237  *    such processes to an array.  For any interesting process that has
1238  *    threads inside the pool barrier set, increment a counter by the
1239  *    count of such threads.  Once PBWAIT is set on a process, that process
1240  *    will not disappear.
1241  *
1242  * 2) Wait for the counter from step 2 to drop to zero.  Any process which
1243  *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
1244  *    will decrement that counter before going to sleep, and the process
1245  *    calling pool_barrier_exit() which does the final decrement will wake us.
1246  *
1247  * 3) For each interesting process, perform a calculation on it to see if
1248  *    the bind will actually succeed.  This uses the following three
1249  *    resource-set-specific functions:
1250  *
1251  *    - int set_bind_start(procs, pool)
1252  *
1253  *      Determine whether the given array of processes can be bound to the
1254  *      resource set associated with the given pool.  If it can, take and hold
1255  *      any locks necessary to ensure that the operation will succeed, and
1256  *      make any necessary reservations in the target resource set.  If it
1257  *      can't, return failure with no reservations made and no new locks held.
1258  *
1259  *    - void set_bind_abort(procs, pool)
1260  *
1261  *      set_bind_start() has completed successfully, but another resource set's
1262  *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
1263  *      any reservations made and drop any locks acquired by our
1264  *      set_bind_start().
1265  *
1266  *    - void set_bind_finish(void)
1267  *
1268  *      The bind has completed successfully.  The processes have been released,
1269  *      and the reservation acquired in set_bind_start() has been depleted as
1270  *      the processes have finished their bindings.  Drop any locks acquired by
1271  *      set_bind_start().
1272  *
1273  * 4) If we've decided that we can proceed with the bind, iterate through
1274  *    the list of interesting processes, grab the necessary locks (which
1275  *    may differ per resource set), perform the bind, and ASSERT that it
1276  *    succeeds.  Once a process has been rebound, it can be awakened.
1277  *
1278  * The operations from step 4 must be kept in sync with anything which might
1279  * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1280  * are thus located in the same source files as the associated bind operations.
1281  */
1282 int
1283 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
1284 {
1285 	extern uint_t nproc;
1286 	klwp_t *lwp = ttolwp(curthread);
1287 	proc_t **pp, **procs;
1288 	proc_t *prstart;
1289 	int procs_count = 0;
1290 	kproject_t *kpj;
1291 	procset_t set;
1292 	zone_t *zone;
1293 	int procs_size;
1294 	int rv = 0;
1295 	proc_t *p;
1296 	id_t cid = -1;
1297 
1298 	ASSERT(pool_lock_held());
1299 
1300 	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
1301 		return (EINVAL);
1302 
1303 	if (idtype == P_ZONEID) {
1304 		zone = zone_find_by_id(id);
1305 		if (zone == NULL)
1306 			return (ESRCH);
1307 		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
1308 			zone_rele(zone);
1309 			return (EBUSY);
1310 		}
1311 	}
1312 
1313 	if (idtype == P_PROJID) {
1314 		kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
1315 		if (kpj == NULL)
1316 			return (ESRCH);
1317 		mutex_enter(&kpj->kpj_poolbind);
1318 	}
1319 
1320 	if (idtype == P_PID) {
1321 		/*
1322 		 * Fast-path for a single process case.
1323 		 */
1324 		procs_size = 2;	/* procs is NULL-terminated */
1325 		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
1326 		mutex_enter(&pidlock);
1327 	} else {
1328 		/*
1329 		 * We will need enough slots for proc_t pointers for as many as
1330 		 * twice the number of currently running processes (assuming
1331 		 * that each one could be in fork() creating a new child).
1332 		 */
1333 		for (;;) {
1334 			procs_size = nproc * 2;
1335 			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
1336 			    KM_SLEEP);
1337 			mutex_enter(&pidlock);
1338 
1339 			if (nproc * 2 <= procs_size)
1340 				break;
1341 			/*
1342 			 * If nproc has changed, try again.
1343 			 */
1344 			mutex_exit(&pidlock);
1345 			kmem_free(procs, procs_size * sizeof (proc_t *));
1346 		}
1347 	}
1348 
1349 	if (id == P_MYID)
1350 		id = getmyid(idtype);
1351 	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);
1352 
1353 	/*
1354 	 * Do a first scan, and select target processes.
1355 	 */
1356 	if (idtype == P_PID)
1357 		prstart = prfind(id);
1358 	else
1359 		prstart = practive;
1360 	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
1361 		mutex_enter(&p->p_lock);
1362 		/*
1363 		 * Skip processes that don't match our (id, idtype) set or
1364 		 * on the way of becoming zombies.  Skip kernel processes
1365 		 * from the global zone.
1366 		 */
1367 		if (procinset(p, &set) == 0 ||
1368 		    p->p_poolflag & PEXITED ||
1369 		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
1370 			mutex_exit(&p->p_lock);
1371 			continue;
1372 		}
1373 		if (!INGLOBALZONE(p)) {
1374 			switch (idtype) {
1375 			case P_PID:
1376 			case P_TASKID:
1377 				/*
1378 				 * Can't bind processes or tasks
1379 				 * in local zones to pools.
1380 				 */
1381 				mutex_exit(&p->p_lock);
1382 				mutex_exit(&pidlock);
1383 				pool_bind_wakeall(procs);
1384 				rv = EINVAL;
1385 				goto out;
1386 			case P_PROJID:
1387 				/*
1388 				 * Only projects in the global
1389 				 * zone can be rebound.
1390 				 */
1391 				mutex_exit(&p->p_lock);
1392 				continue;
1393 			case P_POOLID:
1394 				/*
1395 				 * When rebinding pools, processes can be
1396 				 * in different zones.
1397 				 */
1398 				break;
1399 			}
1400 		}
1401 
1402 		p->p_poolflag |= PBWAIT;
1403 		/*
1404 		 * If some threads in this process are inside the pool
1405 		 * barrier, add them to pool_barrier_count, as we have
1406 		 * to wait for all of them to exit the barrier.
1407 		 */
1408 		if (p->p_poolcnt > 0) {
1409 			mutex_enter(&pool_barrier_lock);
1410 			pool_barrier_count += p->p_poolcnt;
1411 			mutex_exit(&pool_barrier_lock);
1412 		}
1413 		ASSERT(pp < &procs[procs_size]);
1414 		*pp++ = p;
1415 		procs_count++;
1416 		mutex_exit(&p->p_lock);
1417 
1418 		/*
1419 		 * We just found our process, so if we're only rebinding a
1420 		 * single process then get out of this loop.
1421 		 */
1422 		if (idtype == P_PID)
1423 			break;
1424 	}
1425 	*pp = NULL;	/* cap off the end of the array */
1426 	mutex_exit(&pidlock);
1427 
1428 	/*
1429 	 * Wait for relevant processes to stop before they try to enter the
1430 	 * barrier or at the exit from the barrier.  Make sure that we do
1431 	 * not get stopped here while we're holding pool_lock.  If we were
1432 	 * requested to stop, or got a signal then return EAGAIN to let the
1433 	 * library know that it needs to retry.
1434 	 */
1435 	mutex_enter(&pool_barrier_lock);
1436 	lwp->lwp_nostop++;
1437 	while (pool_barrier_count > 0) {
1438 		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
1439 		if (pool_barrier_count > 0) {
1440 			/*
1441 			 * We either got a signal or were requested to
1442 			 * stop by /proc.  Bail out with EAGAIN.  If we were
1443 			 * requested to stop, we'll stop in post_syscall()
1444 			 * on our way back to userland.
1445 			 */
1446 			mutex_exit(&pool_barrier_lock);
1447 			pool_bind_wakeall(procs);
1448 			lwp->lwp_nostop--;
1449 			rv = EAGAIN;
1450 			goto out;
1451 		}
1452 	}
1453 	lwp->lwp_nostop--;
1454 	mutex_exit(&pool_barrier_lock);
1455 
1456 	if (idtype == P_PID)
1457 		goto skip;
1458 
1459 	/*
1460 	 * Do another run, and drop processes that were inside the barrier
1461 	 * in exit(), but when they have dropped to pool_barrier_exit
1462 	 * they have become of no interest to us.  Pick up child processes that
1463 	 * were created by fork() but didn't exist during our first scan.
1464 	 * Their parents are now stopped at pool_barrier_exit in cfork().
1465 	 */
1466 	mutex_enter(&pidlock);
1467 	for (pp = procs; (p = *pp) != NULL; pp++) {
1468 		if (p->p_poolflag & PEXITED) {
1469 			ASSERT(p->p_lwpcnt == 0);
1470 			pool_bind_wake(p);
1471 			/* flip w/last non-NULL slot */
1472 			*pp = procs[procs_count - 1];
1473 			procs[procs_count - 1] = NULL;
1474 			procs_count--;
1475 			pp--;			/* try this slot again */
1476 			continue;
1477 		}
1478 		/*
1479 		 * Look at the child and check if it should be rebound also.
1480 		 * We're holding pidlock, so it is safe to reference p_child.
1481 		 */
1482 		if ((p = p->p_child) == NULL)
1483 			continue;
1484 
1485 		mutex_enter(&p->p_lock);
1486 		/*
1487 		 * Skip processes in local zones if we're not binding
1488 		 * zones to pools (P_ZONEID).  Skip kernel processes also.
1489 		 */
1490 		if ((!INGLOBALZONE(p) && idtype != P_ZONEID) ||
1491 		    p->p_flag & SSYS) {
1492 			mutex_exit(&p->p_lock);
1493 			continue;
1494 		}
1495 
1496 		/*
1497 		 * If the child process has been already created by fork(), has
1498 		 * not exited, and has not been added to the list already,
1499 		 * then add it now.  We will hit this process again (since we
1500 		 * stick it at the end of the procs list) but it will ignored
1501 		 * because it will have the PBWAIT flag set.
1502 		 */
1503 		if (procinset(p, &set) &&
1504 		    !(p->p_poolflag & PEXITED) &&
1505 		    !(p->p_poolflag & PBWAIT)) {
1506 			ASSERT(p->p_child == NULL); /* no child of a child */
1507 			procs[procs_count] = p;
1508 			procs[procs_count + 1] = NULL;
1509 			procs_count++;
1510 			p->p_poolflag |= PBWAIT;
1511 		}
1512 		mutex_exit(&p->p_lock);
1513 	}
1514 	mutex_exit(&pidlock);
1515 skip:
1516 	/*
1517 	 * If there's no processes to rebind then return ESRCH, unless
1518 	 * we're associating a pool with new resource set, destroying it,
1519 	 * or binding a zone to a pool.
1520 	 */
1521 	if (procs_count == 0) {
1522 		if (idtype == P_POOLID || idtype == P_ZONEID)
1523 			rv = 0;
1524 		else
1525 			rv = ESRCH;
1526 		goto out;
1527 	}
1528 
1529 #ifdef DEBUG
1530 	/*
1531 	 * All processes in the array should have PBWAIT set, and none should
1532 	 * be in the critical section.  Even though p_poolflag is protected by
1533 	 * the p_lock, these assertions should be stable across the dropping of
1534 	 * p_lock.
1535 	 */
1536 	for (pp = procs; (p = *pp) != NULL; pp++) {
1537 		ASSERT(p->p_poolflag & PBWAIT);
1538 		ASSERT(p->p_poolcnt == 0);
1539 		ASSERT(procinset(p, &set));
1540 	}
1541 #endif
1542 
1543 	/*
1544 	 * Do the check if processor set rebinding is going to succeed or not.
1545 	 */
1546 	if ((flags & POOL_BIND_PSET) &&
1547 	    (rv = pset_bind_start(procs, pool)) != 0) {
1548 		pool_bind_wakeall(procs);
1549 		goto out;
1550 	}
1551 
1552 	/*
1553 	 * At this point, all bind operations should succeed.
1554 	 */
1555 	for (pp = procs; (p = *pp) != NULL; pp++) {
1556 		if (flags & POOL_BIND_PSET) {
1557 			psetid_t psetid = pool->pool_pset->pset_id;
1558 			void *zonebuf;
1559 			void *projbuf;
1560 
1561 			/*
1562 			 * Pre-allocate one buffer for FSS (per-project
1563 			 * buffer for a new pset) in case if this is the
1564 			 * first thread from its current project getting
1565 			 * bound to this processor set.
1566 			 */
1567 			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
1568 			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);
1569 
1570 			mutex_enter(&pidlock);
1571 			mutex_enter(&p->p_lock);
1572 			pool_pset_bind(p, psetid, projbuf, zonebuf);
1573 			mutex_exit(&p->p_lock);
1574 			mutex_exit(&pidlock);
1575 			/*
1576 			 * Free buffers pre-allocated above if it
1577 			 * wasn't actually used.
1578 			 */
1579 			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
1580 			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
1581 		}
1582 		/*
1583 		 * Now let's change the scheduling class of this
1584 		 * process if our target pool has it defined.
1585 		 */
1586 		if (cid != POOL_CLASS_UNSET)
1587 			pool_change_class(p, cid);
1588 
1589 		/*
1590 		 * It is safe to reference p_pool here without holding
1591 		 * p_lock because it cannot change underneath of us.
1592 		 * We're holding pool_lock here, so nobody else can be
1593 		 * moving this process between pools.  If process "p"
1594 		 * would be exiting, we're guaranteed that it would be blocked
1595 		 * at pool_barrier_enter() in exit().  Otherwise, it would've
1596 		 * been skipped by one of our scans of the practive list
1597 		 * as a process with PEXITED flag set.
1598 		 */
1599 		if (p->p_pool != pool) {
1600 			ASSERT(p->p_pool->pool_ref > 0);
1601 			atomic_add_32(&p->p_pool->pool_ref, -1);
1602 			p->p_pool = pool;
1603 			atomic_add_32(&p->p_pool->pool_ref, 1);
1604 		}
1605 		/*
1606 		 * Okay, we've tortured this guy enough.
1607 		 * Let this poor process go now.
1608 		 */
1609 		pool_bind_wake(p);
1610 	}
1611 	if (flags & POOL_BIND_PSET)
1612 		pset_bind_finish();
1613 
1614 out:	switch (idtype) {
1615 	case P_PROJID:
1616 		ASSERT(kpj != NULL);
1617 		mutex_exit(&kpj->kpj_poolbind);
1618 		project_rele(kpj);
1619 		break;
1620 	case P_ZONEID:
1621 		if (rv == 0) {
1622 			mutex_enter(&cpu_lock);
1623 			zone_pool_set(zone, pool);
1624 			mutex_exit(&cpu_lock);
1625 		}
1626 		zone->zone_pool_mod = gethrtime();
1627 		zone_rele(zone);
1628 		break;
1629 	}
1630 
1631 	kmem_free(procs, procs_size * sizeof (proc_t *));
1632 	ASSERT(pool_barrier_count == 0);
1633 	return (rv);
1634 }
1635