xref: /illumos-gate/usr/src/uts/common/os/pool.c (revision f498645a3eecf2ddd304b4ea9c7f1b4c155ff79e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/pool.h>
29 #include <sys/pool_impl.h>
30 #include <sys/pool_pset.h>
31 #include <sys/id_space.h>
32 #include <sys/mutex.h>
33 #include <sys/nvpair.h>
34 #include <sys/cpuvar.h>
35 #include <sys/errno.h>
36 #include <sys/cmn_err.h>
37 #include <sys/systm.h>
38 #include <sys/proc.h>
39 #include <sys/fss.h>
40 #include <sys/class.h>
41 #include <sys/exacct.h>
42 #include <sys/utsname.h>
43 #include <sys/procset.h>
44 #include <sys/atomic.h>
45 #include <sys/zone.h>
46 #include <sys/policy.h>
47 
48 /*
49  * RESOURCE POOLS
50  *
 * The resource pools facility brings together process-bindable resources into
52  * a common abstraction called a pool. Processor sets and other entities can
53  * be configured, grouped, and labelled such that workload components can be
54  * associated with a subset of a system's total resources.
55  *
56  * When disabled, the pools facility is "invisible".  All processes belong
57  * to the same pool (pool_default), and processor sets can be managed through
58  * the old pset() system call.  When enabled, processor sets can only be
59  * managed via the pools facility.  New pools can be created and associated
60  * with processor sets.  Processes can be bound to pools which have non-empty
61  * resource sets.
62  *
63  * Locking: pool_lock() protects global pools state and must be called
64  * before modifying the configuration, or when taking a snapshot of the
65  * configuration.  If pool_lock_intr() is used, the operation may be
66  * interrupted by a signal or a request.
67  *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
70  * operations must be surrounded by calls to pool_barrier_enter() and
71  * pool_barrier_exit().  This mechanism guarantees that such processes will
72  * be stopped either at the beginning or at the end of the barrier so that
73  * the rebind operation can atomically bind the process and its threads
74  * to new resource sets, and then let process run again.
75  *
76  * Lock ordering with respect to other locks is as follows:
77  *
78  * 	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
79  *
80  * Most static and global variables defined in this file are protected
81  * by calling pool_lock().
82  *
83  * The operation that binds tasks and projects to pools is atomic.  That is,
84  * either all processes in a given task or a project will be bound to a
85  * new pool, or (in case of an error) they will be all left bound to the
86  * old pool. Processes in a given task or a given project can only be bound to
87  * different pools if they were rebound individually one by one as single
88  * processes.  Threads or LWPs of the same process do not have pool bindings,
89  * and are bound to the same resource sets associated with the resource pool
90  * of that process.
91  *
92  * The following picture shows one possible pool configuration with three
93  * pools and three processor sets.  Note that processor set "foo" is not
94  * associated with any pools and therefore cannot have any processes
95  * bound to it.  Two pools (default and foo) are associated with the
96  * same processor set (default).  Also, note that processes in Task 2
97  * are bound to different pools.
98  *
99  *
100  *							       Processor Sets
101  *								+---------+
102  *		       +--------------+========================>| default |
103  *		      a|	      |				+---------+
104  *		      s|	      |				    ||
105  *		      s|	      |				+---------+
106  *		      o|	      |				|   foo   |
107  *		      c|	      |				+---------+
108  *		      i|	      |				    ||
109  *		      a|	      |				+---------+
110  *		      t|	      |			+------>|   bar   |
111  *		      e|	      |			|	+---------+
112  *                    d|              |                 |
113  *                     |              |                 |
114  *	       +---------+      +---------+      +---------+
115  *     Pools   | default |======|   foo   |======|   bar   |
116  *	       +---------+      +---------+      +---------+
117  *	           @  @            @              @ @   @
118  *                b|  |            |              | |   |
119  *                o|  |            |              | |   |
120  *                u|  +-----+      |      +-------+ |   +---+
121  *                n|        |      |      |         |       |
122  *            ....d|........|......|......|.........|.......|....
123  *            :    |   ::   |      |      |    ::   |       |   :
124  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
125  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
126  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
127  *            :........::......................::...............:
128  *              Task 1            Task 2              Task N
129  *                 |                 |                  |
130  *                 |                 |                  |
131  *                 |  +-----------+  |             +-----------+
132  *                 +--| Project 1 |--+             | Project N |
133  *                    +-----------+                +-----------+
134  *
135  * This is just an illustration of relationships between processes, tasks,
136  * projects, pools, and processor sets. New types of resource sets will be
137  * added in the future.
138  */
139 
140 pool_t		*pool_default;	/* default pool which always exists */
141 int		pool_count;	/* number of pools created on this system */
142 int		pool_state;	/* pools state -- enabled/disabled */
143 void		*pool_buf;	/* pre-commit snapshot of the pools state */
144 size_t		pool_bufsz;	/* size of pool_buf */
145 static hrtime_t	pool_pool_mod;	/* last modification time for pools */
146 static hrtime_t	pool_sys_mod;	/* last modification time for system */
147 static nvlist_t	*pool_sys_prop;	/* system properties */
148 static id_space_t *pool_ids;	/* pool ID space */
149 static list_t	pool_list;	/* doubly-linked list of pools */
150 static kmutex_t		pool_mutex;		/* protects pool_busy_* */
151 static kcondvar_t	pool_busy_cv;		/* waiting for "pool_lock" */
152 static kthread_t	*pool_busy_thread;	/* thread holding "pool_lock" */
153 static kmutex_t		pool_barrier_lock;	/* synch. with pool_barrier_* */
154 static kcondvar_t	pool_barrier_cv;	/* synch. with pool_barrier_* */
155 static int		pool_barrier_count;	/* synch. with pool_barrier_* */
156 
157 /*
158  * Boot-time pool initialization.
159  */
160 void
161 pool_init(void)
162 {
163 	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
164 
165 	/*
166 	 * Initialize default pool.
167 	 */
168 	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
169 	pool_default->pool_id = POOL_DEFAULT;
170 	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
171 	list_insert_head(&pool_list, pool_default);
172 
173 	/*
174 	 * Initialize plugins for resource sets.
175 	 */
176 	pool_pset_init();
177 	pool_count = 1;
178 	p0.p_pool = pool_default;
179 	global_zone->zone_pool = pool_default;
180 	pool_default->pool_ref = 1;
181 }
182 
183 /*
184  * Synchronization routines.
185  *
186  * pool_lock is only called from syscall-level routines (processor_bind(),
187  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
188  * periods of time, including across sleeping operations, so we allow its
189  * acquisition to be interruptible.
190  *
191  * The current thread that owns the "lock" is stored in the variable
192  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
193  */
194 void
195 pool_lock(void)
196 {
197 	mutex_enter(&pool_mutex);
198 	while (pool_busy_thread != NULL)
199 		cv_wait(&pool_busy_cv, &pool_mutex);
200 	pool_busy_thread = curthread;
201 	mutex_exit(&pool_mutex);
202 }
203 
204 int
205 pool_lock_intr(void)
206 {
207 	mutex_enter(&pool_mutex);
208 	while (pool_busy_thread != NULL) {
209 		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
210 			cv_signal(&pool_busy_cv);
211 			mutex_exit(&pool_mutex);
212 			return (1);
213 		}
214 	}
215 	pool_busy_thread = curthread;
216 	mutex_exit(&pool_mutex);
217 	return (0);
218 }
219 
220 int
221 pool_lock_held(void)
222 {
223 	return (pool_busy_thread == curthread);
224 }
225 
226 void
227 pool_unlock(void)
228 {
229 	mutex_enter(&pool_mutex);
230 	pool_busy_thread = NULL;
231 	cv_signal(&pool_busy_cv);
232 	mutex_exit(&pool_mutex);
233 }
234 
235 /*
236  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
237  * with pool_do_bind().
238  *
239  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
240  * operations which modify pool or pset associations.  They can be called
241  * while the process is multi-threaded.  In the common case, when current
242  * process is not being rebound (PBWAIT flag is not set), these functions
243  * will be just incrementing and decrementing reference counts.
244  */
245 void
246 pool_barrier_enter(void)
247 {
248 	proc_t *p = curproc;
249 
250 	ASSERT(MUTEX_HELD(&p->p_lock));
251 	while (p->p_poolflag & PBWAIT)
252 		cv_wait(&p->p_poolcv, &p->p_lock);
253 	p->p_poolcnt++;
254 }
255 
256 void
257 pool_barrier_exit(void)
258 {
259 	proc_t *p = curproc;
260 
261 	ASSERT(MUTEX_HELD(&p->p_lock));
262 	ASSERT(p->p_poolcnt > 0);
263 	p->p_poolcnt--;
264 	if (p->p_poolflag & PBWAIT) {
265 		mutex_enter(&pool_barrier_lock);
266 		ASSERT(pool_barrier_count > 0);
267 		pool_barrier_count--;
268 		if (pool_barrier_count == 0)
269 			cv_signal(&pool_barrier_cv);
270 		mutex_exit(&pool_barrier_lock);
271 		while (p->p_poolflag & PBWAIT)
272 			cv_wait(&p->p_poolcv, &p->p_lock);
273 	}
274 }
275 
276 /*
277  * Enable pools facility.
278  */
279 static int
280 pool_enable(void)
281 {
282 	int ret;
283 
284 	ASSERT(pool_lock_held());
285 	ASSERT(pool_count == 1);
286 
287 	ret = pool_pset_enable();
288 	if (ret != 0)
289 		return (ret);
290 	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
291 	(void) nvlist_add_string(pool_sys_prop, "system.name",
292 	    "default");
293 	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
294 	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
295 	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
296 
297 	(void) nvlist_alloc(&pool_default->pool_props,
298 	    NV_UNIQUE_NAME, KM_SLEEP);
299 	(void) nvlist_add_string(pool_default->pool_props,
300 	    "pool.name", "pool_default");
301 	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
302 	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
303 	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
304 	(void) nvlist_add_int64(pool_default->pool_props,
305 	    "pool.importance", 1);
306 	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
307 	    pool_default->pool_id);
308 
309 	pool_sys_mod = pool_pool_mod = gethrtime();
310 
311 	return (ret);
312 }
313 
314 /*
315  * Disable pools facility.
316  */
317 static int
318 pool_disable(void)
319 {
320 	int ret;
321 
322 	ASSERT(pool_lock_held());
323 
324 	if (pool_count > 1)	/* must destroy all pools first */
325 		return (EBUSY);
326 
327 	ret = pool_pset_disable();
328 	if (ret != 0)
329 		return (ret);
330 	if (pool_sys_prop != NULL) {
331 		nvlist_free(pool_sys_prop);
332 		pool_sys_prop = NULL;
333 	}
334 	if (pool_default->pool_props != NULL) {
335 		nvlist_free(pool_default->pool_props);
336 		pool_default->pool_props = NULL;
337 	}
338 	return (0);
339 }
340 
341 pool_t *
342 pool_lookup_pool_by_name(char *name)
343 {
344 	pool_t *pool = pool_default;
345 	char *p;
346 
347 	ASSERT(pool_lock_held());
348 	for (pool = list_head(&pool_list); pool;
349 	    pool = list_next(&pool_list, pool)) {
350 		if (nvlist_lookup_string(pool->pool_props,
351 		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
352 			return (pool);
353 	}
354 	return (NULL);
355 }
356 
357 pool_t *
358 pool_lookup_pool_by_id(poolid_t poolid)
359 {
360 	pool_t *pool = pool_default;
361 
362 	ASSERT(pool_lock_held());
363 	for (pool = list_head(&pool_list); pool;
364 	    pool = list_next(&pool_list, pool)) {
365 		if (pool->pool_id == poolid)
366 			return (pool);
367 	}
368 	return (NULL);
369 }
370 
371 /*
372  * Create new pool, associate it with default resource sets, and give
373  * it a temporary name.
374  */
375 static int
376 pool_pool_create(poolid_t *poolid)
377 {
378 	pool_t *pool;
379 	char pool_name[40];
380 
381 	ASSERT(pool_lock_held());
382 
383 	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
384 	pool->pool_id = *poolid = id_alloc(pool_ids);
385 	pool->pool_pset = pool_pset_default;
386 	pool_pset_default->pset_npools++;
387 	list_insert_tail(&pool_list, pool);
388 	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
389 	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
390 	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
391 	pool_pool_mod = gethrtime();
392 	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
393 	    pool_pool_mod);
394 	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
395 	pool_count++;
396 	return (0);
397 }
398 
399 struct destroy_zone_arg {
400 	pool_t *old;
401 	pool_t *new;
402 };
403 
404 /*
405  * Update pool pointers for zones that are currently bound to pool "old"
406  * to be bound to pool "new".
407  */
408 static int
409 pool_destroy_zone_cb(zone_t *zone, void *arg)
410 {
411 	struct destroy_zone_arg *dza = arg;
412 
413 	ASSERT(pool_lock_held());
414 	ASSERT(MUTEX_HELD(&cpu_lock));
415 
416 	if (zone_pool_get(zone) == dza->old)
417 		zone_pool_set(zone, dza->new);
418 	return (0);
419 }
420 
421 /*
422  * Destroy specified pool, and rebind all processes in it
423  * to the default pool.
424  */
425 static int
426 pool_pool_destroy(poolid_t poolid)
427 {
428 	pool_t *pool;
429 	int ret;
430 
431 	ASSERT(pool_lock_held());
432 
433 	if (poolid == POOL_DEFAULT)
434 		return (EINVAL);
435 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
436 		return (ESRCH);
437 	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
438 	if (ret == 0) {
439 		struct destroy_zone_arg dzarg;
440 
441 		dzarg.old = pool;
442 		dzarg.new = pool_default;
443 		mutex_enter(&cpu_lock);
444 		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
445 		mutex_exit(&cpu_lock);
446 		ASSERT(ret == 0);
447 		ASSERT(pool->pool_ref == 0);
448 		(void) nvlist_free(pool->pool_props);
449 		id_free(pool_ids, pool->pool_id);
450 		pool->pool_pset->pset_npools--;
451 		list_remove(&pool_list, pool);
452 		pool_count--;
453 		pool_pool_mod = gethrtime();
454 		kmem_free(pool, sizeof (pool_t));
455 	}
456 	return (ret);
457 }
458 
459 /*
460  * Create new pool or resource set.
461  */
462 int
463 pool_create(int class, int subclass, id_t *id)
464 {
465 	int ret;
466 
467 	ASSERT(pool_lock_held());
468 	if (pool_state == POOL_DISABLED)
469 		return (ENOTACTIVE);
470 	switch (class) {
471 	case PEC_POOL:
472 		ret = pool_pool_create((poolid_t *)id);
473 		break;
474 	case PEC_RES_COMP:
475 		switch (subclass) {
476 		case PREC_PSET:
477 			ret = pool_pset_create((psetid_t *)id);
478 			break;
479 		default:
480 			ret = EINVAL;
481 		}
482 		break;
483 	case PEC_RES_AGG:
484 		ret = ENOTSUP;
485 		break;
486 	default:
487 		ret = EINVAL;
488 	}
489 	return (ret);
490 }
491 
492 /*
493  * Destroy an existing pool or resource set.
494  */
495 int
496 pool_destroy(int class, int subclass, id_t id)
497 {
498 	int ret;
499 
500 	ASSERT(pool_lock_held());
501 	if (pool_state == POOL_DISABLED)
502 		return (ENOTACTIVE);
503 	switch (class) {
504 	case PEC_POOL:
505 		ret = pool_pool_destroy((poolid_t)id);
506 		break;
507 	case PEC_RES_COMP:
508 		switch (subclass) {
509 		case PREC_PSET:
510 			ret = pool_pset_destroy((psetid_t)id);
511 			break;
512 		default:
513 			ret = EINVAL;
514 		}
515 		break;
516 	case PEC_RES_AGG:
517 		ret = ENOTSUP;
518 		break;
519 	default:
520 		ret = EINVAL;
521 	}
522 	return (ret);
523 }
524 
525 /*
526  * Enable or disable pools.
527  */
528 int
529 pool_status(int status)
530 {
531 	int ret = 0;
532 
533 	ASSERT(pool_lock_held());
534 
535 	if (pool_state == status)
536 		return (0);
537 	switch (status) {
538 	case POOL_ENABLED:
539 		ret = pool_enable();
540 		if (ret != 0)
541 			return (ret);
542 		pool_state = POOL_ENABLED;
543 		break;
544 	case POOL_DISABLED:
545 		ret = pool_disable();
546 		if (ret != 0)
547 			return (ret);
548 		pool_state = POOL_DISABLED;
549 		break;
550 	default:
551 		ret = EINVAL;
552 	}
553 	return (ret);
554 }
555 
556 /*
557  * Associate pool with resource set.
558  */
559 int
560 pool_assoc(poolid_t poolid, int idtype, id_t id)
561 {
562 	int ret;
563 
564 	ASSERT(pool_lock_held());
565 	if (pool_state == POOL_DISABLED)
566 		return (ENOTACTIVE);
567 	switch (idtype) {
568 	case PREC_PSET:
569 		ret = pool_pset_assoc(poolid, (psetid_t)id);
570 		break;
571 	default:
572 		ret = EINVAL;
573 	}
574 	if (ret == 0)
575 		pool_pool_mod = gethrtime();
576 	return (ret);
577 }
578 
579 /*
580  * Disassociate resource set from pool.
581  */
582 int
583 pool_dissoc(poolid_t poolid, int idtype)
584 {
585 	int ret;
586 
587 	ASSERT(pool_lock_held());
588 	if (pool_state == POOL_DISABLED)
589 		return (ENOTACTIVE);
590 	switch (idtype) {
591 	case PREC_PSET:
592 		ret = pool_pset_assoc(poolid, PS_NONE);
593 		break;
594 	default:
595 		ret = EINVAL;
596 	}
597 	if (ret == 0)
598 		pool_pool_mod = gethrtime();
599 	return (ret);
600 }
601 
602 /*
603  * Transfer specified quantity of resources between resource sets.
604  */
605 /*ARGSUSED*/
606 int
607 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
608 {
609 	int ret = EINVAL;
610 	return (ret);
611 }
612 
613 /*
614  * Transfer resources specified by their IDs between resource sets.
615  */
616 int
617 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids)
618 {
619 	int ret;
620 
621 	ASSERT(pool_lock_held());
622 	if (pool_state == POOL_DISABLED)
623 		return (ENOTACTIVE);
624 	switch (type) {
625 	case PREC_PSET:
626 		ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst,
627 		    size, ids);
628 		break;
629 	default:
630 		ret = EINVAL;
631 	}
632 	return (ret);
633 }
634 
635 /*
636  * Bind processes to pools.
637  */
638 int
639 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
640 {
641 	pool_t *pool;
642 
643 	ASSERT(pool_lock_held());
644 
645 	if (pool_state == POOL_DISABLED)
646 		return (ENOTACTIVE);
647 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
648 		return (ESRCH);
649 
650 	switch (idtype) {
651 	case P_PID:
652 	case P_TASKID:
653 	case P_PROJID:
654 	case P_ZONEID:
655 		break;
656 	default:
657 		return (EINVAL);
658 	}
659 	return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
660 }
661 
662 /*
663  * Query pool binding of the specifed process.
664  */
665 int
666 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
667 {
668 	proc_t *p;
669 
670 	if (idtype != P_PID)
671 		return (ENOTSUP);
672 	if (id == P_MYID)
673 		id = curproc->p_pid;
674 
675 	ASSERT(pool_lock_held());
676 
677 	mutex_enter(&pidlock);
678 	if ((p = prfind((pid_t)id)) == NULL) {
679 		mutex_exit(&pidlock);
680 		return (ESRCH);
681 	}
682 	mutex_enter(&p->p_lock);
683 	/*
684 	 * In local zones, lie about pool bindings of processes from
685 	 * the global zone.
686 	 */
687 	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
688 		pool_t *pool;
689 
690 		pool = zone_pool_get(curproc->p_zone);
691 		*poolid = pool->pool_id;
692 	} else {
693 		*poolid = p->p_pool->pool_id;
694 	}
695 	mutex_exit(&p->p_lock);
696 	mutex_exit(&pidlock);
697 	return (0);
698 }
699 
700 static ea_object_t *
701 pool_system_pack(void)
702 {
703 	ea_object_t *eo_system;
704 	size_t bufsz = 0;
705 	char *buf = NULL;
706 
707 	ASSERT(pool_lock_held());
708 
709 	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
710 	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
711 	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
712 	if (INGLOBALZONE(curproc))
713 		(void) ea_attach_item(eo_system, &pool_pool_mod,
714 		    sizeof (hrtime_t),
715 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
716 	else
717 		(void) ea_attach_item(eo_system,
718 		    &curproc->p_zone->zone_pool_mod,
719 		    sizeof (hrtime_t),
720 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
721 	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
722 	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
723 	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
724 	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
725 	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
726 	(void) ea_attach_item(eo_system, buf, bufsz,
727 	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
728 	kmem_free(buf, bufsz);
729 	return (eo_system);
730 }
731 
732 /*
733  * Pack information about pools and attach it to specified exacct group.
734  */
735 static int
736 pool_pool_pack(ea_object_t *eo_system)
737 {
738 	ea_object_t *eo_pool;
739 	pool_t *pool;
740 	size_t bufsz;
741 	char *buf;
742 	pool_t *myzonepool;
743 
744 	ASSERT(pool_lock_held());
745 	myzonepool = zone_pool_get(curproc->p_zone);
746 	for (pool = list_head(&pool_list); pool;
747 	    pool = list_next(&pool_list, pool)) {
748 		if (!INGLOBALZONE(curproc) && myzonepool != pool)
749 			continue;
750 		bufsz = 0;
751 		buf = NULL;
752 		eo_pool = ea_alloc_group(EXT_GROUP |
753 		    EXC_LOCAL | EXD_GROUP_POOL);
754 		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
755 		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
756 		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
757 		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
758 		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
759 		    NV_ENCODE_NATIVE, 0);
760 		(void) ea_attach_item(eo_pool, buf, bufsz,
761 		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
762 		kmem_free(buf, bufsz);
763 		(void) ea_attach_to_group(eo_system, eo_pool);
764 	}
765 	return (0);
766 }
767 
768 /*
769  * Pack the whole pool configuration in the specified buffer.
770  */
771 int
772 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
773 {
774 	ea_object_t *eo_system;
775 	size_t ksize;
776 	int ret = 0;
777 
778 	ASSERT(pool_lock_held());
779 
780 	eo_system = pool_system_pack();		/* 1. pack system */
781 	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
782 	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
783 	ksize = ea_pack_object(eo_system, NULL, 0);
784 	if (kbuf == NULL || kbufsz == 0)
785 		*asize = ksize;
786 	else if (ksize > kbufsz)
787 		ret = ENOMEM;
788 	else
789 		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
790 	ea_free_object(eo_system, EUP_ALLOC);
791 	return (ret);
792 }
793 
794 /*
795  * Start/end the commit transaction.  If commit transaction is currently
796  * in progress, then all POOL_QUERY ioctls will return pools configuration
797  * at the beginning of transaction.
798  */
799 int
800 pool_commit(int state)
801 {
802 	ea_object_t *eo_system;
803 	int ret = 0;
804 
805 	ASSERT(pool_lock_held());
806 
807 	if (pool_state == POOL_DISABLED)
808 		return (ENOTACTIVE);
809 	switch (state) {
810 	case 1:
811 		/*
812 		 * Beginning commit transation.
813 		 */
814 		if (pool_buf != NULL)		/* transaction in progress */
815 			return (EBUSY);
816 		eo_system = pool_system_pack();		/* 1. pack system */
817 		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
818 		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
819 		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
820 		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
821 		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
822 		ea_free_object(eo_system, EUP_ALLOC);
823 		break;
824 	case 0:
825 		/*
826 		 * Finishing commit transaction.
827 		 */
828 		if (pool_buf != NULL) {
829 			kmem_free(pool_buf, pool_bufsz);
830 			pool_buf = NULL;
831 			pool_bufsz = 0;
832 		}
833 		break;
834 	default:
835 		ret = EINVAL;
836 	}
837 	return (ret);
838 }
839 
840 /*
841  * Check is the specified property is special
842  */
843 static pool_property_t *
844 pool_property_find(char *name, pool_property_t *list)
845 {
846 	pool_property_t *prop;
847 
848 	for (prop = list; prop->pp_name != NULL; prop++)
849 		if (strcmp(prop->pp_name, name) == 0)
850 			return (prop);
851 	return (NULL);
852 }
853 
854 static pool_property_t pool_prop_sys[] = {
855 	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
856 	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
857 	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
858 	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
859 	{ "system.allocate-method",	DATA_TYPE_STRING,
860 	    PP_RDWR | PP_OPTIONAL },
861 	{ "system.poold.log-level",	DATA_TYPE_STRING,
862 	    PP_RDWR | PP_OPTIONAL },
863 	{ "system.poold.log-location",	DATA_TYPE_STRING,
864 	    PP_RDWR | PP_OPTIONAL },
865 	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
866 	    PP_RDWR | PP_OPTIONAL },
867 	{ "system.poold.history-file",	DATA_TYPE_STRING,
868 	    PP_RDWR | PP_OPTIONAL },
869 	{ "system.poold.objectives",	DATA_TYPE_STRING,
870 	    PP_RDWR | PP_OPTIONAL },
871 	{ NULL,				0,			0 }
872 };
873 
874 static pool_property_t pool_prop_pool[] = {
875 	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
876 	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
877 	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
878 	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
879 	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
880 	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
881 	{ "pool.scheduler",		DATA_TYPE_STRING,
882 	    PP_RDWR | PP_OPTIONAL },
883 	{ NULL,				0,			0 }
884 };
885 
886 /*
887  * Common routine to put new property on the specified list
888  */
889 int
890 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
891 {
892 	pool_property_t *prop;
893 
894 	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
895 		/*
896 		 * No read-only properties or properties with bad types
897 		 */
898 		if (!(prop->pp_perm & PP_WRITE) ||
899 		    prop->pp_type != nvpair_type(pair))
900 			return (EINVAL);
901 	}
902 	return (nvlist_add_nvpair(nvlist, pair));
903 }
904 
905 /*
906  * Common routine to remove property from the given list
907  */
908 int
909 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
910 {
911 	pool_property_t *prop;
912 
913 	if ((prop = pool_property_find(name, props)) != NULL) {
914 		if (!(prop->pp_perm & PP_OPTIONAL))
915 			return (EINVAL);
916 	}
917 	return (nvlist_remove_all(nvlist, name));
918 }
919 
920 static int
921 pool_system_propput(nvpair_t *pair)
922 {
923 	int ret;
924 
925 	ASSERT(pool_lock_held());
926 	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
927 	if (ret == 0)
928 		pool_sys_mod = gethrtime();
929 	return (ret);
930 }
931 
932 static int
933 pool_system_proprm(char *name)
934 {
935 	int ret;
936 
937 	ASSERT(pool_lock_held());
938 	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
939 	if (ret == 0)
940 		pool_sys_mod = gethrtime();
941 	return (ret);
942 }
943 
944 static int
945 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
946 {
947 	pool_t *pool;
948 	int ret;
949 
950 	ASSERT(pool_lock_held());
951 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
952 		return (ESRCH);
953 	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
954 	if (ret == 0)
955 		pool_pool_mod = gethrtime();
956 	return (ret);
957 }
958 
959 static int
960 pool_pool_proprm(poolid_t poolid, char *name)
961 {
962 	int ret;
963 	pool_t *pool;
964 
965 	ASSERT(pool_lock_held());
966 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
967 		return (ESRCH);
968 	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
969 	if (ret == 0)
970 		pool_pool_mod = gethrtime();
971 	return (ret);
972 }
973 
974 int
975 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
976 {
977 	int ret;
978 
979 	ASSERT(pool_lock_held());
980 	if (pool_state == POOL_DISABLED)
981 		return (ENOTACTIVE);
982 	switch (class) {
983 	case PEC_SYSTEM:
984 		ret = pool_system_propput(pair);
985 		break;
986 	case PEC_POOL:
987 		ret = pool_pool_propput((poolid_t)id, pair);
988 		break;
989 	case PEC_RES_COMP:
990 		switch (subclass) {
991 		case PREC_PSET:
992 			ret = pool_pset_propput((psetid_t)id, pair);
993 			break;
994 		default:
995 			ret = EINVAL;
996 		}
997 		break;
998 	case PEC_RES_AGG:
999 		ret = ENOTSUP;
1000 		break;
1001 	case PEC_COMP:
1002 		switch (subclass) {
1003 		case PCEC_CPU:
1004 			ret = pool_cpu_propput((processorid_t)id, pair);
1005 			break;
1006 		default:
1007 			ret = EINVAL;
1008 		}
1009 		break;
1010 	default:
1011 		ret = EINVAL;
1012 	}
1013 	return (ret);
1014 }
1015 
1016 int
1017 pool_proprm(int class, int subclass, id_t id, char *name)
1018 {
1019 	int ret;
1020 
1021 	ASSERT(pool_lock_held());
1022 	if (pool_state == POOL_DISABLED)
1023 		return (ENOTACTIVE);
1024 	switch (class) {
1025 	case PEC_SYSTEM:
1026 		ret = pool_system_proprm(name);
1027 		break;
1028 	case PEC_POOL:
1029 		ret = pool_pool_proprm((poolid_t)id, name);
1030 		break;
1031 	case PEC_RES_COMP:
1032 		switch (subclass) {
1033 		case PREC_PSET:
1034 			ret = pool_pset_proprm((psetid_t)id, name);
1035 			break;
1036 		default:
1037 			ret = EINVAL;
1038 		}
1039 		break;
1040 	case PEC_RES_AGG:
1041 		ret = ENOTSUP;
1042 		break;
1043 	case PEC_COMP:
1044 		switch (subclass) {
1045 		case PCEC_CPU:
1046 			ret = pool_cpu_proprm((processorid_t)id, name);
1047 			break;
1048 		default:
1049 			ret = EINVAL;
1050 		}
1051 		break;
1052 	default:
1053 		ret = EINVAL;
1054 	}
1055 	return (ret);
1056 }
1057 
1058 int
1059 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1060 {
1061 	int ret;
1062 	nvlist_t *nvl;
1063 
1064 	ASSERT(pool_lock_held());
1065 	if (pool_state == POOL_DISABLED)
1066 		return (ENOTACTIVE);
1067 
1068 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1069 
1070 	switch (class) {
1071 	case PEC_SYSTEM:
1072 	case PEC_POOL:
1073 		ret = EINVAL;
1074 		break;
1075 	case PEC_RES_COMP:
1076 		switch (subclass) {
1077 		case PREC_PSET:
1078 			ret = pool_pset_propget((psetid_t)id, name, nvl);
1079 			break;
1080 		default:
1081 			ret = EINVAL;
1082 		}
1083 		break;
1084 	case PEC_RES_AGG:
1085 		ret = ENOTSUP;
1086 		break;
1087 	case PEC_COMP:
1088 		switch (subclass) {
1089 		case PCEC_CPU:
1090 			ret = pool_cpu_propget((processorid_t)id, name, nvl);
1091 			break;
1092 		default:
1093 			ret = EINVAL;
1094 		}
1095 		break;
1096 	default:
1097 		ret = EINVAL;
1098 	}
1099 	if (ret == 0)
1100 		*nvlp = nvl;
1101 	else
1102 		nvlist_free(nvl);
1103 	return (ret);
1104 }
1105 
1106 /*
1107  * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1108  * in case of failure in pool_do_bind().
1109  */
1110 static void
1111 pool_bind_wake(proc_t *p)
1112 {
1113 	ASSERT(pool_lock_held());
1114 
1115 	mutex_enter(&p->p_lock);
1116 	ASSERT(p->p_poolflag & PBWAIT);
1117 	if (p->p_poolcnt > 0) {
1118 		mutex_enter(&pool_barrier_lock);
1119 		pool_barrier_count -= p->p_poolcnt;
1120 		mutex_exit(&pool_barrier_lock);
1121 	}
1122 	p->p_poolflag &= ~PBWAIT;
1123 	cv_signal(&p->p_poolcv);
1124 	mutex_exit(&p->p_lock);
1125 }
1126 
1127 static void
1128 pool_bind_wakeall(proc_t **procs)
1129 {
1130 	proc_t *p, **pp;
1131 
1132 	ASSERT(pool_lock_held());
1133 	for (pp = procs; (p = *pp) != NULL; pp++)
1134 		pool_bind_wake(p);
1135 }
1136 
1137 /*
1138  * Return the scheduling class for this pool, or
1139  * 	POOL_CLASS_UNSET if not set
1140  * 	POOL_CLASS_INVAL if set to an invalid class ID.
1141  */
1142 id_t
1143 pool_get_class(pool_t *pool)
1144 {
1145 	char *name;
1146 	id_t cid;
1147 
1148 	ASSERT(pool_lock_held());
1149 
1150 	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1151 	    &name) == 0) {
1152 		if (getcidbyname(name, &cid) == 0)
1153 			return (cid);
1154 		else
1155 			return (POOL_CLASS_INVAL);
1156 	}
1157 	return (POOL_CLASS_UNSET);
1158 }
1159 
1160 /*
1161  * Move process to the new scheduling class.
1162  */
1163 static void
1164 pool_change_class(proc_t *p, id_t cid)
1165 {
1166 	kthread_t *t;
1167 	void *cldata;
1168 	id_t oldcid;
1169 	void **bufs;
1170 	void **buf;
1171 	int nlwp;
1172 	int ret;
1173 	int i;
1174 
1175 	/*
1176 	 * Do not move kernel processes (such as zsched).
1177 	 */
1178 	if (p->p_flag & SSYS)
1179 		return;
1180 	/*
1181 	 * This process is in the pool barrier, so it can't possibly be
1182 	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1183 	 * (for possible agent LWP which doesn't use pool barrier) as
1184 	 * our upper bound.
1185 	 */
1186 	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1187 
1188 	/*
1189 	 * Pre-allocate scheduling class specific buffers before
1190 	 * grabbing p_lock.
1191 	 */
1192 	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1193 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1194 		ret = CL_ALLOC(buf, cid, KM_SLEEP);
1195 		ASSERT(ret == 0);
1196 	}
1197 
1198 	/*
1199 	 * Move threads one by one to the new scheduling class.
1200 	 * This never fails because we have all the right
1201 	 * privileges here.
1202 	 */
1203 	mutex_enter(&p->p_lock);
1204 	ASSERT(p->p_poolflag & PBWAIT);
1205 	buf = bufs;
1206 	t = p->p_tlist;
1207 	ASSERT(t != NULL);
1208 	do {
1209 		if (t->t_cid != cid) {
1210 			oldcid = t->t_cid;
1211 			cldata = t->t_cldata;
1212 			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1213 			ASSERT(ret == 0);
1214 			CL_EXITCLASS(oldcid, cldata);
1215 			*buf++ = NULL;
1216 		}
1217 	} while ((t = t->t_forw) != p->p_tlist);
1218 	mutex_exit(&p->p_lock);
1219 	/*
1220 	 * Free unused scheduling class specific buffers.
1221 	 */
1222 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1223 		if (*buf != NULL) {
1224 			CL_FREE(cid, *buf);
1225 			*buf = NULL;
1226 		}
1227 	}
1228 	kmem_free(bufs, nlwp * sizeof (void *));
1229 }
1230 
1231 /*
1232  * The meat of the bind operation.  The steps in pool_do_bind are:
1233  *
1234  * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1235  *    such processes to an array.  For any interesting process that has
1236  *    threads inside the pool barrier set, increment a counter by the
1237  *    count of such threads.  Once PBWAIT is set on a process, that process
1238  *    will not disappear.
1239  *
1240  * 2) Wait for the counter from step 2 to drop to zero.  Any process which
1241  *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
1242  *    will decrement that counter before going to sleep, and the process
1243  *    calling pool_barrier_exit() which does the final decrement will wake us.
1244  *
1245  * 3) For each interesting process, perform a calculation on it to see if
1246  *    the bind will actually succeed.  This uses the following three
1247  *    resource-set-specific functions:
1248  *
1249  *    - int set_bind_start(procs, pool)
1250  *
1251  *      Determine whether the given array of processes can be bound to the
1252  *      resource set associated with the given pool.  If it can, take and hold
1253  *      any locks necessary to ensure that the operation will succeed, and
1254  *      make any necessary reservations in the target resource set.  If it
1255  *      can't, return failure with no reservations made and no new locks held.
1256  *
1257  *    - void set_bind_abort(procs, pool)
1258  *
1259  *      set_bind_start() has completed successfully, but another resource set's
1260  *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
1261  *      any reservations made and drop any locks acquired by our
1262  *      set_bind_start().
1263  *
1264  *    - void set_bind_finish(void)
1265  *
1266  *      The bind has completed successfully.  The processes have been released,
1267  *      and the reservation acquired in set_bind_start() has been depleted as
1268  *      the processes have finished their bindings.  Drop any locks acquired by
1269  *      set_bind_start().
1270  *
1271  * 4) If we've decided that we can proceed with the bind, iterate through
1272  *    the list of interesting processes, grab the necessary locks (which
1273  *    may differ per resource set), perform the bind, and ASSERT that it
1274  *    succeeds.  Once a process has been rebound, it can be awakened.
1275  *
1276  * The operations from step 4 must be kept in sync with anything which might
1277  * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1278  * are thus located in the same source files as the associated bind operations.
1279  */
1280 int
1281 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
1282 {
1283 	extern uint_t nproc;
1284 	klwp_t *lwp = ttolwp(curthread);
1285 	proc_t **pp, **procs;
1286 	proc_t *prstart;
1287 	int procs_count = 0;
1288 	kproject_t *kpj;
1289 	procset_t set;
1290 	zone_t *zone;
1291 	int procs_size;
1292 	int rv = 0;
1293 	proc_t *p;
1294 	id_t cid = -1;
1295 
1296 	ASSERT(pool_lock_held());
1297 
1298 	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
1299 		return (EINVAL);
1300 
1301 	if (idtype == P_ZONEID) {
1302 		zone = zone_find_by_id(id);
1303 		if (zone == NULL)
1304 			return (ESRCH);
1305 		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
1306 			zone_rele(zone);
1307 			return (EBUSY);
1308 		}
1309 	}
1310 
1311 	if (idtype == P_PROJID) {
1312 		kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND);
1313 		if (kpj == NULL)
1314 			return (ESRCH);
1315 		mutex_enter(&kpj->kpj_poolbind);
1316 	}
1317 
1318 	if (idtype == P_PID) {
1319 		/*
1320 		 * Fast-path for a single process case.
1321 		 */
1322 		procs_size = 2;	/* procs is NULL-terminated */
1323 		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
1324 		mutex_enter(&pidlock);
1325 	} else {
1326 		/*
1327 		 * We will need enough slots for proc_t pointers for as many as
1328 		 * twice the number of currently running processes (assuming
1329 		 * that each one could be in fork() creating a new child).
1330 		 */
1331 		for (;;) {
1332 			procs_size = nproc * 2;
1333 			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
1334 			    KM_SLEEP);
1335 			mutex_enter(&pidlock);
1336 
1337 			if (nproc * 2 <= procs_size)
1338 				break;
1339 			/*
1340 			 * If nproc has changed, try again.
1341 			 */
1342 			mutex_exit(&pidlock);
1343 			kmem_free(procs, procs_size * sizeof (proc_t *));
1344 		}
1345 	}
1346 
1347 	if (id == P_MYID)
1348 		id = getmyid(idtype);
1349 	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);
1350 
1351 	/*
1352 	 * Do a first scan, and select target processes.
1353 	 */
1354 	if (idtype == P_PID)
1355 		prstart = prfind(id);
1356 	else
1357 		prstart = practive;
1358 	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
1359 		mutex_enter(&p->p_lock);
1360 		/*
1361 		 * Skip processes that don't match our (id, idtype) set or
1362 		 * on the way of becoming zombies.  Skip kernel processes
1363 		 * from the global zone.
1364 		 */
1365 		if (procinset(p, &set) == 0 ||
1366 		    p->p_poolflag & PEXITED ||
1367 		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
1368 			mutex_exit(&p->p_lock);
1369 			continue;
1370 		}
1371 		if (!INGLOBALZONE(p)) {
1372 			switch (idtype) {
1373 			case P_PID:
1374 			case P_TASKID:
1375 				/*
1376 				 * Can't bind processes or tasks
1377 				 * in local zones to pools.
1378 				 */
1379 				mutex_exit(&p->p_lock);
1380 				mutex_exit(&pidlock);
1381 				pool_bind_wakeall(procs);
1382 				rv = EINVAL;
1383 				goto out;
1384 			case P_PROJID:
1385 				/*
1386 				 * Only projects in the global
1387 				 * zone can be rebound.
1388 				 */
1389 				mutex_exit(&p->p_lock);
1390 				continue;
1391 			case P_POOLID:
1392 				/*
1393 				 * When rebinding pools, processes can be
1394 				 * in different zones.
1395 				 */
1396 				break;
1397 			}
1398 		}
1399 
1400 		p->p_poolflag |= PBWAIT;
1401 		/*
1402 		 * If some threads in this process are inside the pool
1403 		 * barrier, add them to pool_barrier_count, as we have
1404 		 * to wait for all of them to exit the barrier.
1405 		 */
1406 		if (p->p_poolcnt > 0) {
1407 			mutex_enter(&pool_barrier_lock);
1408 			pool_barrier_count += p->p_poolcnt;
1409 			mutex_exit(&pool_barrier_lock);
1410 		}
1411 		ASSERT(pp < &procs[procs_size]);
1412 		*pp++ = p;
1413 		procs_count++;
1414 		mutex_exit(&p->p_lock);
1415 
1416 		/*
1417 		 * We just found our process, so if we're only rebinding a
1418 		 * single process then get out of this loop.
1419 		 */
1420 		if (idtype == P_PID)
1421 			break;
1422 	}
1423 	*pp = NULL;	/* cap off the end of the array */
1424 	mutex_exit(&pidlock);
1425 
1426 	/*
1427 	 * Wait for relevant processes to stop before they try to enter the
1428 	 * barrier or at the exit from the barrier.  Make sure that we do
1429 	 * not get stopped here while we're holding pool_lock.  If we were
1430 	 * requested to stop, or got a signal then return EAGAIN to let the
1431 	 * library know that it needs to retry.
1432 	 */
1433 	mutex_enter(&pool_barrier_lock);
1434 	lwp->lwp_nostop++;
1435 	while (pool_barrier_count > 0) {
1436 		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
1437 		if (pool_barrier_count > 0) {
1438 			/*
1439 			 * We either got a signal or were requested to
1440 			 * stop by /proc.  Bail out with EAGAIN.  If we were
1441 			 * requested to stop, we'll stop in post_syscall()
1442 			 * on our way back to userland.
1443 			 */
1444 			mutex_exit(&pool_barrier_lock);
1445 			pool_bind_wakeall(procs);
1446 			lwp->lwp_nostop--;
1447 			rv = EAGAIN;
1448 			goto out;
1449 		}
1450 	}
1451 	lwp->lwp_nostop--;
1452 	mutex_exit(&pool_barrier_lock);
1453 
1454 	if (idtype == P_PID)
1455 		goto skip;
1456 
1457 	/*
1458 	 * Do another run, and drop processes that were inside the barrier
1459 	 * in exit(), but when they have dropped to pool_barrier_exit
1460 	 * they have become of no interest to us.  Pick up child processes that
1461 	 * were created by fork() but didn't exist during our first scan.
1462 	 * Their parents are now stopped at pool_barrier_exit in cfork().
1463 	 */
1464 	mutex_enter(&pidlock);
1465 	for (pp = procs; (p = *pp) != NULL; pp++) {
1466 		if (p->p_poolflag & PEXITED) {
1467 			ASSERT(p->p_lwpcnt == 0);
1468 			pool_bind_wake(p);
1469 			/* flip w/last non-NULL slot */
1470 			*pp = procs[procs_count - 1];
1471 			procs[procs_count - 1] = NULL;
1472 			procs_count--;
1473 			pp--;			/* try this slot again */
1474 			continue;
1475 		}
1476 		/*
1477 		 * Look at the child and check if it should be rebound also.
1478 		 * We're holding pidlock, so it is safe to reference p_child.
1479 		 */
1480 		if ((p = p->p_child) == NULL)
1481 			continue;
1482 
1483 		mutex_enter(&p->p_lock);
1484 		/*
1485 		 * Skip processes in local zones if we're not binding
1486 		 * zones to pools (P_ZONEID).  Skip kernel processes also.
1487 		 */
1488 		if ((!INGLOBALZONE(p) && idtype != P_ZONEID) ||
1489 		    p->p_flag & SSYS) {
1490 			mutex_exit(&p->p_lock);
1491 			continue;
1492 		}
1493 
1494 		/*
1495 		 * If the child process has been already created by fork(), has
1496 		 * not exited, and has not been added to the list already,
1497 		 * then add it now.  We will hit this process again (since we
1498 		 * stick it at the end of the procs list) but it will ignored
1499 		 * because it will have the PBWAIT flag set.
1500 		 */
1501 		if (procinset(p, &set) &&
1502 		    !(p->p_poolflag & PEXITED) &&
1503 		    !(p->p_poolflag & PBWAIT)) {
1504 			ASSERT(p->p_child == NULL); /* no child of a child */
1505 			procs[procs_count] = p;
1506 			procs[procs_count + 1] = NULL;
1507 			procs_count++;
1508 			p->p_poolflag |= PBWAIT;
1509 		}
1510 		mutex_exit(&p->p_lock);
1511 	}
1512 	mutex_exit(&pidlock);
1513 skip:
1514 	/*
1515 	 * If there's no processes to rebind then return ESRCH, unless
1516 	 * we're associating a pool with new resource set, destroying it,
1517 	 * or binding a zone to a pool.
1518 	 */
1519 	if (procs_count == 0) {
1520 		if (idtype == P_POOLID || idtype == P_ZONEID)
1521 			rv = 0;
1522 		else
1523 			rv = ESRCH;
1524 		goto out;
1525 	}
1526 
1527 #ifdef DEBUG
1528 	/*
1529 	 * All processes in the array should have PBWAIT set, and none should
1530 	 * be in the critical section.  Even though p_poolflag is protected by
1531 	 * the p_lock, these assertions should be stable across the dropping of
1532 	 * p_lock.
1533 	 */
1534 	for (pp = procs; (p = *pp) != NULL; pp++) {
1535 		ASSERT(p->p_poolflag & PBWAIT);
1536 		ASSERT(p->p_poolcnt == 0);
1537 		ASSERT(procinset(p, &set));
1538 	}
1539 #endif
1540 
1541 	/*
1542 	 * Do the check if processor set rebinding is going to succeed or not.
1543 	 */
1544 	if ((flags & POOL_BIND_PSET) &&
1545 	    (rv = pset_bind_start(procs, pool)) != 0) {
1546 		pool_bind_wakeall(procs);
1547 		goto out;
1548 	}
1549 
1550 	/*
1551 	 * At this point, all bind operations should succeed.
1552 	 */
1553 	for (pp = procs; (p = *pp) != NULL; pp++) {
1554 		if (flags & POOL_BIND_PSET) {
1555 			psetid_t psetid = pool->pool_pset->pset_id;
1556 			void *zonebuf;
1557 			void *projbuf;
1558 
1559 			/*
1560 			 * Pre-allocate one buffer for FSS (per-project
1561 			 * buffer for a new pset) in case if this is the
1562 			 * first thread from its current project getting
1563 			 * bound to this processor set.
1564 			 */
1565 			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
1566 			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);
1567 
1568 			mutex_enter(&pidlock);
1569 			mutex_enter(&p->p_lock);
1570 			pool_pset_bind(p, psetid, projbuf, zonebuf);
1571 			mutex_exit(&p->p_lock);
1572 			mutex_exit(&pidlock);
1573 			/*
1574 			 * Free buffers pre-allocated above if it
1575 			 * wasn't actually used.
1576 			 */
1577 			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
1578 			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
1579 		}
1580 		/*
1581 		 * Now let's change the scheduling class of this
1582 		 * process if our target pool has it defined.
1583 		 */
1584 		if (cid != POOL_CLASS_UNSET)
1585 			pool_change_class(p, cid);
1586 
1587 		/*
1588 		 * It is safe to reference p_pool here without holding
1589 		 * p_lock because it cannot change underneath of us.
1590 		 * We're holding pool_lock here, so nobody else can be
1591 		 * moving this process between pools.  If process "p"
1592 		 * would be exiting, we're guaranteed that it would be blocked
1593 		 * at pool_barrier_enter() in exit().  Otherwise, it would've
1594 		 * been skipped by one of our scans of the practive list
1595 		 * as a process with PEXITED flag set.
1596 		 */
1597 		if (p->p_pool != pool) {
1598 			ASSERT(p->p_pool->pool_ref > 0);
1599 			atomic_add_32(&p->p_pool->pool_ref, -1);
1600 			p->p_pool = pool;
1601 			atomic_add_32(&p->p_pool->pool_ref, 1);
1602 		}
1603 		/*
1604 		 * Okay, we've tortured this guy enough.
1605 		 * Let this poor process go now.
1606 		 */
1607 		pool_bind_wake(p);
1608 	}
1609 	if (flags & POOL_BIND_PSET)
1610 		pset_bind_finish();
1611 
1612 out:	switch (idtype) {
1613 	case P_PROJID:
1614 		ASSERT(kpj != NULL);
1615 		mutex_exit(&kpj->kpj_poolbind);
1616 		project_rele(kpj);
1617 		break;
1618 	case P_ZONEID:
1619 		if (rv == 0) {
1620 			mutex_enter(&cpu_lock);
1621 			zone_pool_set(zone, pool);
1622 			mutex_exit(&cpu_lock);
1623 		}
1624 		zone->zone_pool_mod = gethrtime();
1625 		zone_rele(zone);
1626 		break;
1627 	}
1628 
1629 	kmem_free(procs, procs_size * sizeof (proc_t *));
1630 	ASSERT(pool_barrier_count == 0);
1631 	return (rv);
1632 }
1633