xref: /titanic_51/usr/src/uts/common/os/pool.c (revision fd9e7635fa85e33de5aff912b955d797589f6f87)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/pool.h>
30 #include <sys/pool_impl.h>
31 #include <sys/pool_pset.h>
32 #include <sys/id_space.h>
33 #include <sys/mutex.h>
34 #include <sys/nvpair.h>
35 #include <sys/cpuvar.h>
36 #include <sys/errno.h>
37 #include <sys/cmn_err.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 #include <sys/fss.h>
41 #include <sys/class.h>
42 #include <sys/exacct.h>
43 #include <sys/utsname.h>
44 #include <sys/procset.h>
45 #include <sys/atomic.h>
46 #include <sys/zone.h>
47 #include <sys/policy.h>
48 #include <sys/schedctl.h>
49 
50 /*
51  * RESOURCE POOLS
52  *
53  * The resource pools facility brings together process-bindable resources into
54  * a common abstraction called a pool. Processor sets and other entities can
55  * be configured, grouped, and labelled such that workload components can be
56  * associated with a subset of a system's total resources.
57  *
58  * When disabled, the pools facility is "invisible".  All processes belong
59  * to the same pool (pool_default), and processor sets can be managed through
60  * the old pset() system call.  When enabled, processor sets can only be
61  * managed via the pools facility.  New pools can be created and associated
62  * with processor sets.  Processes can be bound to pools which have non-empty
63  * resource sets.
64  *
65  * Locking: pool_lock() protects global pools state and must be called
66  * before modifying the configuration, or when taking a snapshot of the
67  * configuration.  If pool_lock_intr() is used, the operation may be
68  * interrupted by a signal or a request.
69  *
70  * To prevent processes from being rebound between pools while they are
71  * in the middle of an operation which affects resource set bindings, such
72  * operations must be surrounded by calls to pool_barrier_enter() and
73  * pool_barrier_exit().  This mechanism guarantees that such processes will
74  * be stopped either at the beginning or at the end of the barrier so that
75  * the rebind operation can atomically bind the process and its threads
76  * to new resource sets, and then let the process run again.
77  *
78  * Lock ordering with respect to other locks is as follows:
79  *
80  * 	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
81  *
82  * Most static and global variables defined in this file are protected
83  * by calling pool_lock().
84  *
85  * The operation that binds tasks and projects to pools is atomic.  That is,
86  * either all processes in a given task or a project will be bound to a
87  * new pool, or (in case of an error) they will be all left bound to the
88  * old pool. Processes in a given task or a given project can only be bound to
89  * different pools if they were rebound individually one by one as single
90  * processes.  Threads or LWPs of the same process do not have pool bindings,
91  * and are bound to the same resource sets associated with the resource pool
92  * of that process.
93  *
94  * The following picture shows one possible pool configuration with three
95  * pools and three processor sets.  Note that processor set "foo" is not
96  * associated with any pools and therefore cannot have any processes
97  * bound to it.  Two pools (default and foo) are associated with the
98  * same processor set (default).  Also, note that processes in Task 2
99  * are bound to different pools.
100  *
101  *
102  *							       Processor Sets
103  *								+---------+
104  *		       +--------------+========================>| default |
105  *		      a|	      |				+---------+
106  *		      s|	      |				    ||
107  *		      s|	      |				+---------+
108  *		      o|	      |				|   foo   |
109  *		      c|	      |				+---------+
110  *		      i|	      |				    ||
111  *		      a|	      |				+---------+
112  *		      t|	      |			+------>|   bar   |
113  *		      e|	      |			|	+---------+
114  *                    d|              |                 |
115  *                     |              |                 |
116  *	       +---------+      +---------+      +---------+
117  *     Pools   | default |======|   foo   |======|   bar   |
118  *	       +---------+      +---------+      +---------+
119  *	           @  @            @              @ @   @
120  *                b|  |            |              | |   |
121  *                o|  |            |              | |   |
122  *                u|  +-----+      |      +-------+ |   +---+
123  *                n|        |      |      |         |       |
124  *            ....d|........|......|......|.........|.......|....
125  *            :    |   ::   |      |      |    ::   |       |   :
126  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
127  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
128  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
129  *            :........::......................::...............:
130  *              Task 1            Task 2              Task N
131  *                 |                 |                  |
132  *                 |                 |                  |
133  *                 |  +-----------+  |             +-----------+
134  *                 +--| Project 1 |--+             | Project N |
135  *                    +-----------+                +-----------+
136  *
137  * This is just an illustration of relationships between processes, tasks,
138  * projects, pools, and processor sets. New types of resource sets will be
139  * added in the future.
140  */
141 
142 pool_t		*pool_default;	/* default pool which always exists */
143 int		pool_count;	/* number of pools created on this system */
144 int		pool_state;	/* pools state -- enabled/disabled */
145 void		*pool_buf;	/* pre-commit snapshot of the pools state */
146 size_t		pool_bufsz;	/* size of pool_buf */
147 static hrtime_t	pool_pool_mod;	/* last modification time for pools */
148 static hrtime_t	pool_sys_mod;	/* last modification time for system */
149 static nvlist_t	*pool_sys_prop;	/* system properties */
150 static id_space_t *pool_ids;	/* pool ID space */
151 static list_t	pool_list;	/* doubly-linked list of pools */
152 static kmutex_t		pool_mutex;		/* protects pool_busy_* */
153 static kcondvar_t	pool_busy_cv;		/* waiting for "pool_lock" */
154 static kthread_t	*pool_busy_thread;	/* thread holding "pool_lock" */
155 static kmutex_t		pool_barrier_lock;	/* synch. with pool_barrier_* */
156 static kcondvar_t	pool_barrier_cv;	/* synch. with pool_barrier_* */
157 static int		pool_barrier_count;	/* synch. with pool_barrier_* */
158 
159 /*
160  * Boot-time pool initialization.
161  */
162 void
163 pool_init(void)
164 {
165 	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
166 
167 	/*
168 	 * Initialize default pool.
169 	 */
170 	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
171 	pool_default->pool_id = POOL_DEFAULT;
172 	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
173 	list_insert_head(&pool_list, pool_default);
174 
175 	/*
176 	 * Initialize plugins for resource sets.
177 	 */
178 	pool_pset_init();
179 	pool_count = 1;
180 	p0.p_pool = pool_default;
181 	global_zone->zone_pool = pool_default;
182 	pool_default->pool_ref = 1;
183 }
184 
185 /*
186  * Synchronization routines.
187  *
188  * pool_lock is only called from syscall-level routines (processor_bind(),
189  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
190  * periods of time, including across sleeping operations, so we allow its
191  * acquisition to be interruptible.
192  *
193  * The current thread that owns the "lock" is stored in the variable
194  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
195  */
196 void
197 pool_lock(void)
198 {
199 	mutex_enter(&pool_mutex);
200 	ASSERT(!pool_lock_held());
201 	while (pool_busy_thread != NULL)
202 		cv_wait(&pool_busy_cv, &pool_mutex);
203 	pool_busy_thread = curthread;
204 	mutex_exit(&pool_mutex);
205 }
206 
207 int
208 pool_lock_intr(void)
209 {
210 	mutex_enter(&pool_mutex);
211 	ASSERT(!pool_lock_held());
212 	while (pool_busy_thread != NULL) {
213 		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
214 			cv_signal(&pool_busy_cv);
215 			mutex_exit(&pool_mutex);
216 			return (1);
217 		}
218 	}
219 	pool_busy_thread = curthread;
220 	mutex_exit(&pool_mutex);
221 	return (0);
222 }
223 
224 int
225 pool_lock_held(void)
226 {
227 	return (pool_busy_thread == curthread);
228 }
229 
230 void
231 pool_unlock(void)
232 {
233 	mutex_enter(&pool_mutex);
234 	ASSERT(pool_lock_held());
235 	pool_busy_thread = NULL;
236 	cv_signal(&pool_busy_cv);
237 	mutex_exit(&pool_mutex);
238 }
239 
240 /*
241  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
242  * with pool_do_bind().
243  *
244  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
245  * operations which modify pool or pset associations.  They can be called
246  * while the process is multi-threaded.  In the common case, when current
247  * process is not being rebound (PBWAIT flag is not set), these functions
248  * will be just incrementing and decrementing reference counts.
249  */
250 void
251 pool_barrier_enter(void)
252 {
253 	proc_t *p = curproc;
254 
255 	ASSERT(MUTEX_HELD(&p->p_lock));
256 	while (p->p_poolflag & PBWAIT)
257 		cv_wait(&p->p_poolcv, &p->p_lock);
258 	p->p_poolcnt++;
259 }
260 
261 void
262 pool_barrier_exit(void)
263 {
264 	proc_t *p = curproc;
265 
266 	ASSERT(MUTEX_HELD(&p->p_lock));
267 	ASSERT(p->p_poolcnt > 0);
268 	p->p_poolcnt--;
269 	if (p->p_poolflag & PBWAIT) {
270 		mutex_enter(&pool_barrier_lock);
271 		ASSERT(pool_barrier_count > 0);
272 		pool_barrier_count--;
273 		if (pool_barrier_count == 0)
274 			cv_signal(&pool_barrier_cv);
275 		mutex_exit(&pool_barrier_lock);
276 		while (p->p_poolflag & PBWAIT)
277 			cv_wait(&p->p_poolcv, &p->p_lock);
278 	}
279 }
280 
281 /*
282  * Enable pools facility.
283  */
284 static int
285 pool_enable(void)
286 {
287 	int ret;
288 
289 	ASSERT(pool_lock_held());
290 	ASSERT(pool_count == 1);
291 
292 	ret = pool_pset_enable();
293 	if (ret != 0)
294 		return (ret);
295 	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
296 	(void) nvlist_add_string(pool_sys_prop, "system.name",
297 	    "default");
298 	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
299 	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
300 	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
301 	(void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
302 	    "wt-load");
303 
304 	(void) nvlist_alloc(&pool_default->pool_props,
305 	    NV_UNIQUE_NAME, KM_SLEEP);
306 	(void) nvlist_add_string(pool_default->pool_props,
307 	    "pool.name", "pool_default");
308 	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
309 	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
310 	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
311 	(void) nvlist_add_int64(pool_default->pool_props,
312 	    "pool.importance", 1);
313 	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
314 	    pool_default->pool_id);
315 
316 	pool_sys_mod = pool_pool_mod = gethrtime();
317 
318 	return (ret);
319 }
320 
321 /*
322  * Disable pools facility.
323  */
324 static int
325 pool_disable(void)
326 {
327 	int ret;
328 
329 	ASSERT(pool_lock_held());
330 
331 	if (pool_count > 1)	/* must destroy all pools first */
332 		return (EBUSY);
333 
334 	ret = pool_pset_disable();
335 	if (ret != 0)
336 		return (ret);
337 	if (pool_sys_prop != NULL) {
338 		nvlist_free(pool_sys_prop);
339 		pool_sys_prop = NULL;
340 	}
341 	if (pool_default->pool_props != NULL) {
342 		nvlist_free(pool_default->pool_props);
343 		pool_default->pool_props = NULL;
344 	}
345 	return (0);
346 }
347 
348 pool_t *
349 pool_lookup_pool_by_name(char *name)
350 {
351 	pool_t *pool = pool_default;
352 	char *p;
353 
354 	ASSERT(pool_lock_held());
355 	for (pool = list_head(&pool_list); pool;
356 	    pool = list_next(&pool_list, pool)) {
357 		if (nvlist_lookup_string(pool->pool_props,
358 		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
359 			return (pool);
360 	}
361 	return (NULL);
362 }
363 
364 pool_t *
365 pool_lookup_pool_by_id(poolid_t poolid)
366 {
367 	pool_t *pool = pool_default;
368 
369 	ASSERT(pool_lock_held());
370 	for (pool = list_head(&pool_list); pool;
371 	    pool = list_next(&pool_list, pool)) {
372 		if (pool->pool_id == poolid)
373 			return (pool);
374 	}
375 	return (NULL);
376 }
377 
378 /*
379  * Create new pool, associate it with default resource sets, and give
380  * it a temporary name.
381  */
382 static int
383 pool_pool_create(poolid_t *poolid)
384 {
385 	pool_t *pool;
386 	char pool_name[40];
387 
388 	ASSERT(pool_lock_held());
389 
390 	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
391 	pool->pool_id = *poolid = id_alloc(pool_ids);
392 	pool->pool_pset = pool_pset_default;
393 	pool_pset_default->pset_npools++;
394 	list_insert_tail(&pool_list, pool);
395 	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
396 	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
397 	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
398 	pool_pool_mod = gethrtime();
399 	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
400 	    pool_pool_mod);
401 	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
402 	pool_count++;
403 	return (0);
404 }
405 
406 struct destroy_zone_arg {
407 	pool_t *old;
408 	pool_t *new;
409 };
410 
411 /*
412  * Update pool pointers for zones that are currently bound to pool "old"
413  * to be bound to pool "new".
414  */
415 static int
416 pool_destroy_zone_cb(zone_t *zone, void *arg)
417 {
418 	struct destroy_zone_arg *dza = arg;
419 
420 	ASSERT(pool_lock_held());
421 	ASSERT(MUTEX_HELD(&cpu_lock));
422 
423 	if (zone_pool_get(zone) == dza->old)
424 		zone_pool_set(zone, dza->new);
425 	return (0);
426 }
427 
428 /*
429  * Destroy specified pool, and rebind all processes in it
430  * to the default pool.
431  */
432 static int
433 pool_pool_destroy(poolid_t poolid)
434 {
435 	pool_t *pool;
436 	int ret;
437 
438 	ASSERT(pool_lock_held());
439 
440 	if (poolid == POOL_DEFAULT)
441 		return (EINVAL);
442 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
443 		return (ESRCH);
444 	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
445 	if (ret == 0) {
446 		struct destroy_zone_arg dzarg;
447 
448 		dzarg.old = pool;
449 		dzarg.new = pool_default;
450 		mutex_enter(&cpu_lock);
451 		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
452 		mutex_exit(&cpu_lock);
453 		ASSERT(ret == 0);
454 		ASSERT(pool->pool_ref == 0);
455 		(void) nvlist_free(pool->pool_props);
456 		id_free(pool_ids, pool->pool_id);
457 		pool->pool_pset->pset_npools--;
458 		list_remove(&pool_list, pool);
459 		pool_count--;
460 		pool_pool_mod = gethrtime();
461 		kmem_free(pool, sizeof (pool_t));
462 	}
463 	return (ret);
464 }
465 
466 /*
467  * Create new pool or resource set.
468  */
469 int
470 pool_create(int class, int subclass, id_t *id)
471 {
472 	int ret;
473 
474 	ASSERT(pool_lock_held());
475 	if (pool_state == POOL_DISABLED)
476 		return (ENOTACTIVE);
477 	switch (class) {
478 	case PEC_POOL:
479 		ret = pool_pool_create((poolid_t *)id);
480 		break;
481 	case PEC_RES_COMP:
482 		switch (subclass) {
483 		case PREC_PSET:
484 			ret = pool_pset_create((psetid_t *)id);
485 			break;
486 		default:
487 			ret = EINVAL;
488 		}
489 		break;
490 	case PEC_RES_AGG:
491 		ret = ENOTSUP;
492 		break;
493 	default:
494 		ret = EINVAL;
495 	}
496 	return (ret);
497 }
498 
499 /*
500  * Destroy an existing pool or resource set.
501  */
502 int
503 pool_destroy(int class, int subclass, id_t id)
504 {
505 	int ret;
506 
507 	ASSERT(pool_lock_held());
508 	if (pool_state == POOL_DISABLED)
509 		return (ENOTACTIVE);
510 	switch (class) {
511 	case PEC_POOL:
512 		ret = pool_pool_destroy((poolid_t)id);
513 		break;
514 	case PEC_RES_COMP:
515 		switch (subclass) {
516 		case PREC_PSET:
517 			ret = pool_pset_destroy((psetid_t)id);
518 			break;
519 		default:
520 			ret = EINVAL;
521 		}
522 		break;
523 	case PEC_RES_AGG:
524 		ret = ENOTSUP;
525 		break;
526 	default:
527 		ret = EINVAL;
528 	}
529 	return (ret);
530 }
531 
532 /*
533  * Enable or disable pools.
534  */
535 int
536 pool_status(int status)
537 {
538 	int ret = 0;
539 
540 	ASSERT(pool_lock_held());
541 
542 	if (pool_state == status)
543 		return (0);
544 	switch (status) {
545 	case POOL_ENABLED:
546 		ret = pool_enable();
547 		if (ret != 0)
548 			return (ret);
549 		pool_state = POOL_ENABLED;
550 		break;
551 	case POOL_DISABLED:
552 		ret = pool_disable();
553 		if (ret != 0)
554 			return (ret);
555 		pool_state = POOL_DISABLED;
556 		break;
557 	default:
558 		ret = EINVAL;
559 	}
560 	return (ret);
561 }
562 
563 /*
564  * Associate pool with resource set.
565  */
566 int
567 pool_assoc(poolid_t poolid, int idtype, id_t id)
568 {
569 	int ret;
570 
571 	ASSERT(pool_lock_held());
572 	if (pool_state == POOL_DISABLED)
573 		return (ENOTACTIVE);
574 	switch (idtype) {
575 	case PREC_PSET:
576 		ret = pool_pset_assoc(poolid, (psetid_t)id);
577 		break;
578 	default:
579 		ret = EINVAL;
580 	}
581 	if (ret == 0)
582 		pool_pool_mod = gethrtime();
583 	return (ret);
584 }
585 
586 /*
587  * Disassociate resource set from pool.
588  */
589 int
590 pool_dissoc(poolid_t poolid, int idtype)
591 {
592 	int ret;
593 
594 	ASSERT(pool_lock_held());
595 	if (pool_state == POOL_DISABLED)
596 		return (ENOTACTIVE);
597 	switch (idtype) {
598 	case PREC_PSET:
599 		ret = pool_pset_assoc(poolid, PS_NONE);
600 		break;
601 	default:
602 		ret = EINVAL;
603 	}
604 	if (ret == 0)
605 		pool_pool_mod = gethrtime();
606 	return (ret);
607 }
608 
609 /*
610  * Transfer specified quantity of resources between resource sets.
611  */
612 /*ARGSUSED*/
613 int
614 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
615 {
616 	int ret = EINVAL;
617 	return (ret);
618 }
619 
620 /*
621  * Transfer resources specified by their IDs between resource sets.
622  */
623 int
624 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids)
625 {
626 	int ret;
627 
628 	ASSERT(pool_lock_held());
629 	if (pool_state == POOL_DISABLED)
630 		return (ENOTACTIVE);
631 	switch (type) {
632 	case PREC_PSET:
633 		ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst,
634 		    size, ids);
635 		break;
636 	default:
637 		ret = EINVAL;
638 	}
639 	return (ret);
640 }
641 
642 /*
643  * Bind processes to pools.
644  */
645 int
646 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
647 {
648 	pool_t *pool;
649 
650 	ASSERT(pool_lock_held());
651 
652 	if (pool_state == POOL_DISABLED)
653 		return (ENOTACTIVE);
654 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
655 		return (ESRCH);
656 
657 	switch (idtype) {
658 	case P_PID:
659 	case P_TASKID:
660 	case P_PROJID:
661 	case P_ZONEID:
662 		break;
663 	default:
664 		return (EINVAL);
665 	}
666 	return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
667 }
668 
669 /*
670  * Query pool binding of the specifed process.
671  */
672 int
673 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
674 {
675 	proc_t *p;
676 
677 	if (idtype != P_PID)
678 		return (ENOTSUP);
679 	if (id == P_MYID)
680 		id = curproc->p_pid;
681 
682 	ASSERT(pool_lock_held());
683 
684 	mutex_enter(&pidlock);
685 	if ((p = prfind((pid_t)id)) == NULL) {
686 		mutex_exit(&pidlock);
687 		return (ESRCH);
688 	}
689 	mutex_enter(&p->p_lock);
690 	/*
691 	 * In local zones, lie about pool bindings of processes from
692 	 * the global zone.
693 	 */
694 	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
695 		pool_t *pool;
696 
697 		pool = zone_pool_get(curproc->p_zone);
698 		*poolid = pool->pool_id;
699 	} else {
700 		*poolid = p->p_pool->pool_id;
701 	}
702 	mutex_exit(&p->p_lock);
703 	mutex_exit(&pidlock);
704 	return (0);
705 }
706 
707 static ea_object_t *
708 pool_system_pack(void)
709 {
710 	ea_object_t *eo_system;
711 	size_t bufsz = 0;
712 	char *buf = NULL;
713 
714 	ASSERT(pool_lock_held());
715 
716 	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
717 	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
718 	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
719 	if (INGLOBALZONE(curproc))
720 		(void) ea_attach_item(eo_system, &pool_pool_mod,
721 		    sizeof (hrtime_t),
722 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
723 	else
724 		(void) ea_attach_item(eo_system,
725 		    &curproc->p_zone->zone_pool_mod,
726 		    sizeof (hrtime_t),
727 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
728 	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
729 	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
730 	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
731 	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
732 	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
733 	(void) ea_attach_item(eo_system, buf, bufsz,
734 	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
735 	kmem_free(buf, bufsz);
736 	return (eo_system);
737 }
738 
739 /*
740  * Pack information about pools and attach it to specified exacct group.
741  */
742 static int
743 pool_pool_pack(ea_object_t *eo_system)
744 {
745 	ea_object_t *eo_pool;
746 	pool_t *pool;
747 	size_t bufsz;
748 	char *buf;
749 	pool_t *myzonepool;
750 
751 	ASSERT(pool_lock_held());
752 	myzonepool = zone_pool_get(curproc->p_zone);
753 	for (pool = list_head(&pool_list); pool;
754 	    pool = list_next(&pool_list, pool)) {
755 		if (!INGLOBALZONE(curproc) && myzonepool != pool)
756 			continue;
757 		bufsz = 0;
758 		buf = NULL;
759 		eo_pool = ea_alloc_group(EXT_GROUP |
760 		    EXC_LOCAL | EXD_GROUP_POOL);
761 		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
762 		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
763 		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
764 		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
765 		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
766 		    NV_ENCODE_NATIVE, 0);
767 		(void) ea_attach_item(eo_pool, buf, bufsz,
768 		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
769 		kmem_free(buf, bufsz);
770 		(void) ea_attach_to_group(eo_system, eo_pool);
771 	}
772 	return (0);
773 }
774 
775 /*
776  * Pack the whole pool configuration in the specified buffer.
777  */
778 int
779 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
780 {
781 	ea_object_t *eo_system;
782 	size_t ksize;
783 	int ret = 0;
784 
785 	ASSERT(pool_lock_held());
786 
787 	eo_system = pool_system_pack();		/* 1. pack system */
788 	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
789 	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
790 	ksize = ea_pack_object(eo_system, NULL, 0);
791 	if (kbuf == NULL || kbufsz == 0)
792 		*asize = ksize;
793 	else if (ksize > kbufsz)
794 		ret = ENOMEM;
795 	else
796 		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
797 	ea_free_object(eo_system, EUP_ALLOC);
798 	return (ret);
799 }
800 
801 /*
802  * Start/end the commit transaction.  If commit transaction is currently
803  * in progress, then all POOL_QUERY ioctls will return pools configuration
804  * at the beginning of transaction.
805  */
806 int
807 pool_commit(int state)
808 {
809 	ea_object_t *eo_system;
810 	int ret = 0;
811 
812 	ASSERT(pool_lock_held());
813 
814 	if (pool_state == POOL_DISABLED)
815 		return (ENOTACTIVE);
816 	switch (state) {
817 	case 1:
818 		/*
819 		 * Beginning commit transation.
820 		 */
821 		if (pool_buf != NULL)		/* transaction in progress */
822 			return (EBUSY);
823 		eo_system = pool_system_pack();		/* 1. pack system */
824 		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
825 		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
826 		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
827 		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
828 		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
829 		ea_free_object(eo_system, EUP_ALLOC);
830 		break;
831 	case 0:
832 		/*
833 		 * Finishing commit transaction.
834 		 */
835 		if (pool_buf != NULL) {
836 			kmem_free(pool_buf, pool_bufsz);
837 			pool_buf = NULL;
838 			pool_bufsz = 0;
839 		}
840 		break;
841 	default:
842 		ret = EINVAL;
843 	}
844 	return (ret);
845 }
846 
847 /*
848  * Check is the specified property is special
849  */
850 static pool_property_t *
851 pool_property_find(char *name, pool_property_t *list)
852 {
853 	pool_property_t *prop;
854 
855 	for (prop = list; prop->pp_name != NULL; prop++)
856 		if (strcmp(prop->pp_name, name) == 0)
857 			return (prop);
858 	return (NULL);
859 }
860 
861 static pool_property_t pool_prop_sys[] = {
862 	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
863 	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
864 	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
865 	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
866 	{ "system.allocate-method",	DATA_TYPE_STRING,
867 	    PP_RDWR | PP_OPTIONAL },
868 	{ "system.poold.log-level",	DATA_TYPE_STRING,
869 	    PP_RDWR | PP_OPTIONAL },
870 	{ "system.poold.log-location",	DATA_TYPE_STRING,
871 	    PP_RDWR | PP_OPTIONAL },
872 	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
873 	    PP_RDWR | PP_OPTIONAL },
874 	{ "system.poold.history-file",	DATA_TYPE_STRING,
875 	    PP_RDWR | PP_OPTIONAL },
876 	{ "system.poold.objectives",	DATA_TYPE_STRING,
877 	    PP_RDWR | PP_OPTIONAL },
878 	{ NULL,				0,			0 }
879 };
880 
881 static pool_property_t pool_prop_pool[] = {
882 	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
883 	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
884 	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
885 	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
886 	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
887 	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
888 	{ "pool.scheduler",		DATA_TYPE_STRING,
889 	    PP_RDWR | PP_OPTIONAL },
890 	{ NULL,				0,			0 }
891 };
892 
893 /*
894  * Common routine to put new property on the specified list
895  */
896 int
897 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
898 {
899 	pool_property_t *prop;
900 
901 	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
902 		/*
903 		 * No read-only properties or properties with bad types
904 		 */
905 		if (!(prop->pp_perm & PP_WRITE) ||
906 		    prop->pp_type != nvpair_type(pair))
907 			return (EINVAL);
908 	}
909 	return (nvlist_add_nvpair(nvlist, pair));
910 }
911 
912 /*
913  * Common routine to remove property from the given list
914  */
915 int
916 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
917 {
918 	pool_property_t *prop;
919 
920 	if ((prop = pool_property_find(name, props)) != NULL) {
921 		if (!(prop->pp_perm & PP_OPTIONAL))
922 			return (EINVAL);
923 	}
924 	return (nvlist_remove_all(nvlist, name));
925 }
926 
927 static int
928 pool_system_propput(nvpair_t *pair)
929 {
930 	int ret;
931 
932 	ASSERT(pool_lock_held());
933 	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
934 	if (ret == 0)
935 		pool_sys_mod = gethrtime();
936 	return (ret);
937 }
938 
939 static int
940 pool_system_proprm(char *name)
941 {
942 	int ret;
943 
944 	ASSERT(pool_lock_held());
945 	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
946 	if (ret == 0)
947 		pool_sys_mod = gethrtime();
948 	return (ret);
949 }
950 
951 static int
952 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
953 {
954 	pool_t *pool;
955 	int ret;
956 
957 	ASSERT(pool_lock_held());
958 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
959 		return (ESRCH);
960 	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
961 	if (ret == 0)
962 		pool_pool_mod = gethrtime();
963 	return (ret);
964 }
965 
966 static int
967 pool_pool_proprm(poolid_t poolid, char *name)
968 {
969 	int ret;
970 	pool_t *pool;
971 
972 	ASSERT(pool_lock_held());
973 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
974 		return (ESRCH);
975 	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
976 	if (ret == 0)
977 		pool_pool_mod = gethrtime();
978 	return (ret);
979 }
980 
981 int
982 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
983 {
984 	int ret;
985 
986 	ASSERT(pool_lock_held());
987 	if (pool_state == POOL_DISABLED)
988 		return (ENOTACTIVE);
989 	switch (class) {
990 	case PEC_SYSTEM:
991 		ret = pool_system_propput(pair);
992 		break;
993 	case PEC_POOL:
994 		ret = pool_pool_propput((poolid_t)id, pair);
995 		break;
996 	case PEC_RES_COMP:
997 		switch (subclass) {
998 		case PREC_PSET:
999 			ret = pool_pset_propput((psetid_t)id, pair);
1000 			break;
1001 		default:
1002 			ret = EINVAL;
1003 		}
1004 		break;
1005 	case PEC_RES_AGG:
1006 		ret = ENOTSUP;
1007 		break;
1008 	case PEC_COMP:
1009 		switch (subclass) {
1010 		case PCEC_CPU:
1011 			ret = pool_cpu_propput((processorid_t)id, pair);
1012 			break;
1013 		default:
1014 			ret = EINVAL;
1015 		}
1016 		break;
1017 	default:
1018 		ret = EINVAL;
1019 	}
1020 	return (ret);
1021 }
1022 
1023 int
1024 pool_proprm(int class, int subclass, id_t id, char *name)
1025 {
1026 	int ret;
1027 
1028 	ASSERT(pool_lock_held());
1029 	if (pool_state == POOL_DISABLED)
1030 		return (ENOTACTIVE);
1031 	switch (class) {
1032 	case PEC_SYSTEM:
1033 		ret = pool_system_proprm(name);
1034 		break;
1035 	case PEC_POOL:
1036 		ret = pool_pool_proprm((poolid_t)id, name);
1037 		break;
1038 	case PEC_RES_COMP:
1039 		switch (subclass) {
1040 		case PREC_PSET:
1041 			ret = pool_pset_proprm((psetid_t)id, name);
1042 			break;
1043 		default:
1044 			ret = EINVAL;
1045 		}
1046 		break;
1047 	case PEC_RES_AGG:
1048 		ret = ENOTSUP;
1049 		break;
1050 	case PEC_COMP:
1051 		switch (subclass) {
1052 		case PCEC_CPU:
1053 			ret = pool_cpu_proprm((processorid_t)id, name);
1054 			break;
1055 		default:
1056 			ret = EINVAL;
1057 		}
1058 		break;
1059 	default:
1060 		ret = EINVAL;
1061 	}
1062 	return (ret);
1063 }
1064 
1065 int
1066 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1067 {
1068 	int ret;
1069 	nvlist_t *nvl;
1070 
1071 	ASSERT(pool_lock_held());
1072 	if (pool_state == POOL_DISABLED)
1073 		return (ENOTACTIVE);
1074 
1075 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1076 
1077 	switch (class) {
1078 	case PEC_SYSTEM:
1079 	case PEC_POOL:
1080 		ret = EINVAL;
1081 		break;
1082 	case PEC_RES_COMP:
1083 		switch (subclass) {
1084 		case PREC_PSET:
1085 			ret = pool_pset_propget((psetid_t)id, name, nvl);
1086 			break;
1087 		default:
1088 			ret = EINVAL;
1089 		}
1090 		break;
1091 	case PEC_RES_AGG:
1092 		ret = ENOTSUP;
1093 		break;
1094 	case PEC_COMP:
1095 		switch (subclass) {
1096 		case PCEC_CPU:
1097 			ret = pool_cpu_propget((processorid_t)id, name, nvl);
1098 			break;
1099 		default:
1100 			ret = EINVAL;
1101 		}
1102 		break;
1103 	default:
1104 		ret = EINVAL;
1105 	}
1106 	if (ret == 0)
1107 		*nvlp = nvl;
1108 	else
1109 		nvlist_free(nvl);
1110 	return (ret);
1111 }
1112 
1113 /*
1114  * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1115  * in case of failure in pool_do_bind().
1116  */
1117 static void
1118 pool_bind_wake(proc_t *p)
1119 {
1120 	ASSERT(pool_lock_held());
1121 
1122 	mutex_enter(&p->p_lock);
1123 	ASSERT(p->p_poolflag & PBWAIT);
1124 	if (p->p_poolcnt > 0) {
1125 		mutex_enter(&pool_barrier_lock);
1126 		pool_barrier_count -= p->p_poolcnt;
1127 		mutex_exit(&pool_barrier_lock);
1128 	}
1129 	p->p_poolflag &= ~PBWAIT;
1130 	cv_signal(&p->p_poolcv);
1131 	mutex_exit(&p->p_lock);
1132 }
1133 
1134 static void
1135 pool_bind_wakeall(proc_t **procs)
1136 {
1137 	proc_t *p, **pp;
1138 
1139 	ASSERT(pool_lock_held());
1140 	for (pp = procs; (p = *pp) != NULL; pp++)
1141 		pool_bind_wake(p);
1142 }
1143 
1144 /*
1145  * Return the scheduling class for this pool, or
1146  * 	POOL_CLASS_UNSET if not set
1147  * 	POOL_CLASS_INVAL if set to an invalid class ID.
1148  */
1149 id_t
1150 pool_get_class(pool_t *pool)
1151 {
1152 	char *name;
1153 	id_t cid;
1154 
1155 	ASSERT(pool_lock_held());
1156 
1157 	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1158 	    &name) == 0) {
1159 		if (getcidbyname(name, &cid) == 0)
1160 			return (cid);
1161 		else
1162 			return (POOL_CLASS_INVAL);
1163 	}
1164 	return (POOL_CLASS_UNSET);
1165 }
1166 
1167 /*
1168  * Move process to the new scheduling class.
1169  */
1170 static void
1171 pool_change_class(proc_t *p, id_t cid)
1172 {
1173 	kthread_t *t;
1174 	void *cldata;
1175 	id_t oldcid;
1176 	void **bufs;
1177 	void **buf;
1178 	int nlwp;
1179 	int ret;
1180 	int i;
1181 
1182 	/*
1183 	 * Do not move kernel processes (such as zsched).
1184 	 */
1185 	if (p->p_flag & SSYS)
1186 		return;
1187 	/*
1188 	 * This process is in the pool barrier, so it can't possibly be
1189 	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1190 	 * (for possible agent LWP which doesn't use pool barrier) as
1191 	 * our upper bound.
1192 	 */
1193 	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1194 
1195 	/*
1196 	 * Pre-allocate scheduling class specific buffers before
1197 	 * grabbing p_lock.
1198 	 */
1199 	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1200 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1201 		ret = CL_ALLOC(buf, cid, KM_SLEEP);
1202 		ASSERT(ret == 0);
1203 	}
1204 
1205 	/*
1206 	 * Move threads one by one to the new scheduling class.
1207 	 * This never fails because we have all the right
1208 	 * privileges here.
1209 	 */
1210 	mutex_enter(&p->p_lock);
1211 	ASSERT(p->p_poolflag & PBWAIT);
1212 	buf = bufs;
1213 	t = p->p_tlist;
1214 	ASSERT(t != NULL);
1215 	do {
1216 		if (t->t_cid != cid) {
1217 			oldcid = t->t_cid;
1218 			cldata = t->t_cldata;
1219 			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1220 			ASSERT(ret == 0);
1221 			CL_EXITCLASS(oldcid, cldata);
1222 			schedctl_set_cidpri(t);
1223 			*buf++ = NULL;
1224 		}
1225 	} while ((t = t->t_forw) != p->p_tlist);
1226 	mutex_exit(&p->p_lock);
1227 	/*
1228 	 * Free unused scheduling class specific buffers.
1229 	 */
1230 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1231 		if (*buf != NULL) {
1232 			CL_FREE(cid, *buf);
1233 			*buf = NULL;
1234 		}
1235 	}
1236 	kmem_free(bufs, nlwp * sizeof (void *));
1237 }
1238 
1239 /*
1240  * The meat of the bind operation.  The steps in pool_do_bind are:
1241  *
1242  * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1243  *    such processes to an array.  For any interesting process that has
1244  *    threads inside the pool barrier set, increment a counter by the
1245  *    count of such threads.  Once PBWAIT is set on a process, that process
1246  *    will not disappear.
1247  *
1248  * 2) Wait for the counter from step 2 to drop to zero.  Any process which
1249  *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
1250  *    will decrement that counter before going to sleep, and the process
1251  *    calling pool_barrier_exit() which does the final decrement will wake us.
1252  *
1253  * 3) For each interesting process, perform a calculation on it to see if
1254  *    the bind will actually succeed.  This uses the following three
1255  *    resource-set-specific functions:
1256  *
1257  *    - int set_bind_start(procs, pool)
1258  *
1259  *      Determine whether the given array of processes can be bound to the
1260  *      resource set associated with the given pool.  If it can, take and hold
1261  *      any locks necessary to ensure that the operation will succeed, and
1262  *      make any necessary reservations in the target resource set.  If it
1263  *      can't, return failure with no reservations made and no new locks held.
1264  *
1265  *    - void set_bind_abort(procs, pool)
1266  *
1267  *      set_bind_start() has completed successfully, but another resource set's
1268  *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
1269  *      any reservations made and drop any locks acquired by our
1270  *      set_bind_start().
1271  *
1272  *    - void set_bind_finish(void)
1273  *
1274  *      The bind has completed successfully.  The processes have been released,
1275  *      and the reservation acquired in set_bind_start() has been depleted as
1276  *      the processes have finished their bindings.  Drop any locks acquired by
1277  *      set_bind_start().
1278  *
1279  * 4) If we've decided that we can proceed with the bind, iterate through
1280  *    the list of interesting processes, grab the necessary locks (which
1281  *    may differ per resource set), perform the bind, and ASSERT that it
1282  *    succeeds.  Once a process has been rebound, it can be awakened.
1283  *
1284  * The operations from step 4 must be kept in sync with anything which might
1285  * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1286  * are thus located in the same source files as the associated bind operations.
1287  */
1288 int
1289 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
1290 {
1291 	extern uint_t nproc;
1292 	klwp_t *lwp = ttolwp(curthread);
1293 	proc_t **pp, **procs;
1294 	proc_t *prstart;
1295 	int procs_count = 0;
1296 	kproject_t *kpj;
1297 	procset_t set;
1298 	zone_t *zone;
1299 	int procs_size;
1300 	int rv = 0;
1301 	proc_t *p;
1302 	id_t cid = -1;
1303 
1304 	ASSERT(pool_lock_held());
1305 
1306 	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
1307 		return (EINVAL);
1308 
1309 	if (idtype == P_ZONEID) {
1310 		zone = zone_find_by_id(id);
1311 		if (zone == NULL)
1312 			return (ESRCH);
1313 		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
1314 			zone_rele(zone);
1315 			return (EBUSY);
1316 		}
1317 	}
1318 
1319 	if (idtype == P_PROJID) {
1320 		kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
1321 		if (kpj == NULL)
1322 			return (ESRCH);
1323 		mutex_enter(&kpj->kpj_poolbind);
1324 	}
1325 
1326 	if (idtype == P_PID) {
1327 		/*
1328 		 * Fast-path for a single process case.
1329 		 */
1330 		procs_size = 2;	/* procs is NULL-terminated */
1331 		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
1332 		mutex_enter(&pidlock);
1333 	} else {
1334 		/*
1335 		 * We will need enough slots for proc_t pointers for as many as
1336 		 * twice the number of currently running processes (assuming
1337 		 * that each one could be in fork() creating a new child).
1338 		 */
1339 		for (;;) {
1340 			procs_size = nproc * 2;
1341 			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
1342 			    KM_SLEEP);
1343 			mutex_enter(&pidlock);
1344 
1345 			if (nproc * 2 <= procs_size)
1346 				break;
1347 			/*
1348 			 * If nproc has changed, try again.
1349 			 */
1350 			mutex_exit(&pidlock);
1351 			kmem_free(procs, procs_size * sizeof (proc_t *));
1352 		}
1353 	}
1354 
1355 	if (id == P_MYID)
1356 		id = getmyid(idtype);
1357 	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);
1358 
1359 	/*
1360 	 * Do a first scan, and select target processes.
1361 	 */
1362 	if (idtype == P_PID)
1363 		prstart = prfind(id);
1364 	else
1365 		prstart = practive;
1366 	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
1367 		mutex_enter(&p->p_lock);
1368 		/*
1369 		 * Skip processes that don't match our (id, idtype) set or
1370 		 * on the way of becoming zombies.  Skip kernel processes
1371 		 * from the global zone.
1372 		 */
1373 		if (procinset(p, &set) == 0 ||
1374 		    p->p_poolflag & PEXITED ||
1375 		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
1376 			mutex_exit(&p->p_lock);
1377 			continue;
1378 		}
1379 		if (!INGLOBALZONE(p)) {
1380 			switch (idtype) {
1381 			case P_PID:
1382 			case P_TASKID:
1383 				/*
1384 				 * Can't bind processes or tasks
1385 				 * in local zones to pools.
1386 				 */
1387 				mutex_exit(&p->p_lock);
1388 				mutex_exit(&pidlock);
1389 				pool_bind_wakeall(procs);
1390 				rv = EINVAL;
1391 				goto out;
1392 			case P_PROJID:
1393 				/*
1394 				 * Only projects in the global
1395 				 * zone can be rebound.
1396 				 */
1397 				mutex_exit(&p->p_lock);
1398 				continue;
1399 			case P_POOLID:
1400 				/*
1401 				 * When rebinding pools, processes can be
1402 				 * in different zones.
1403 				 */
1404 				break;
1405 			}
1406 		}
1407 
1408 		p->p_poolflag |= PBWAIT;
1409 		/*
1410 		 * If some threads in this process are inside the pool
1411 		 * barrier, add them to pool_barrier_count, as we have
1412 		 * to wait for all of them to exit the barrier.
1413 		 */
1414 		if (p->p_poolcnt > 0) {
1415 			mutex_enter(&pool_barrier_lock);
1416 			pool_barrier_count += p->p_poolcnt;
1417 			mutex_exit(&pool_barrier_lock);
1418 		}
1419 		ASSERT(pp < &procs[procs_size]);
1420 		*pp++ = p;
1421 		procs_count++;
1422 		mutex_exit(&p->p_lock);
1423 
1424 		/*
1425 		 * We just found our process, so if we're only rebinding a
1426 		 * single process then get out of this loop.
1427 		 */
1428 		if (idtype == P_PID)
1429 			break;
1430 	}
1431 	*pp = NULL;	/* cap off the end of the array */
1432 	mutex_exit(&pidlock);
1433 
1434 	/*
1435 	 * Wait for relevant processes to stop before they try to enter the
1436 	 * barrier or at the exit from the barrier.  Make sure that we do
1437 	 * not get stopped here while we're holding pool_lock.  If we were
1438 	 * requested to stop, or got a signal then return EAGAIN to let the
1439 	 * library know that it needs to retry.
1440 	 */
1441 	mutex_enter(&pool_barrier_lock);
1442 	lwp->lwp_nostop++;
1443 	while (pool_barrier_count > 0) {
1444 		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
1445 		if (pool_barrier_count > 0) {
1446 			/*
1447 			 * We either got a signal or were requested to
1448 			 * stop by /proc.  Bail out with EAGAIN.  If we were
1449 			 * requested to stop, we'll stop in post_syscall()
1450 			 * on our way back to userland.
1451 			 */
1452 			mutex_exit(&pool_barrier_lock);
1453 			pool_bind_wakeall(procs);
1454 			lwp->lwp_nostop--;
1455 			rv = EAGAIN;
1456 			goto out;
1457 		}
1458 	}
1459 	lwp->lwp_nostop--;
1460 	mutex_exit(&pool_barrier_lock);
1461 
1462 	if (idtype == P_PID)
1463 		goto skip;
1464 
1465 	/*
1466 	 * Do another run, and drop processes that were inside the barrier
1467 	 * in exit(), but when they have dropped to pool_barrier_exit
1468 	 * they have become of no interest to us.  Pick up child processes that
1469 	 * were created by fork() but didn't exist during our first scan.
1470 	 * Their parents are now stopped at pool_barrier_exit in cfork().
1471 	 */
1472 	mutex_enter(&pidlock);
1473 	for (pp = procs; (p = *pp) != NULL; pp++) {
1474 		if (p->p_poolflag & PEXITED) {
1475 			ASSERT(p->p_lwpcnt == 0);
1476 			pool_bind_wake(p);
1477 			/* flip w/last non-NULL slot */
1478 			*pp = procs[procs_count - 1];
1479 			procs[procs_count - 1] = NULL;
1480 			procs_count--;
1481 			pp--;			/* try this slot again */
1482 			continue;
1483 		}
1484 		/*
1485 		 * Look at the child and check if it should be rebound also.
1486 		 * We're holding pidlock, so it is safe to reference p_child.
1487 		 */
1488 		if ((p = p->p_child) == NULL)
1489 			continue;
1490 
1491 		mutex_enter(&p->p_lock);
1492 		/*
1493 		 * Skip processes in local zones if we're not binding
1494 		 * zones to pools (P_ZONEID).  Skip kernel processes also.
1495 		 */
1496 		if ((!INGLOBALZONE(p) && idtype != P_ZONEID) ||
1497 		    p->p_flag & SSYS) {
1498 			mutex_exit(&p->p_lock);
1499 			continue;
1500 		}
1501 
1502 		/*
1503 		 * If the child process has been already created by fork(), has
1504 		 * not exited, and has not been added to the list already,
1505 		 * then add it now.  We will hit this process again (since we
1506 		 * stick it at the end of the procs list) but it will ignored
1507 		 * because it will have the PBWAIT flag set.
1508 		 */
1509 		if (procinset(p, &set) &&
1510 		    !(p->p_poolflag & PEXITED) &&
1511 		    !(p->p_poolflag & PBWAIT)) {
1512 			ASSERT(p->p_child == NULL); /* no child of a child */
1513 			procs[procs_count] = p;
1514 			procs[procs_count + 1] = NULL;
1515 			procs_count++;
1516 			p->p_poolflag |= PBWAIT;
1517 		}
1518 		mutex_exit(&p->p_lock);
1519 	}
1520 	mutex_exit(&pidlock);
1521 skip:
1522 	/*
1523 	 * If there's no processes to rebind then return ESRCH, unless
1524 	 * we're associating a pool with new resource set, destroying it,
1525 	 * or binding a zone to a pool.
1526 	 */
1527 	if (procs_count == 0) {
1528 		if (idtype == P_POOLID || idtype == P_ZONEID)
1529 			rv = 0;
1530 		else
1531 			rv = ESRCH;
1532 		goto out;
1533 	}
1534 
1535 #ifdef DEBUG
1536 	/*
1537 	 * All processes in the array should have PBWAIT set, and none
1538 	 * should be in the critical section. Thus, although p_poolflag
1539 	 * and p_poolcnt are protected by p_lock, their ASSERTions below
1540 	 * should be stable without it. procinset(), however, ASSERTs that
1541 	 * the p_lock is held upon entry.
1542 	 */
1543 	for (pp = procs; (p = *pp) != NULL; pp++) {
1544 		int in_set;
1545 
1546 		mutex_enter(&p->p_lock);
1547 		in_set = procinset(p, &set);
1548 		mutex_exit(&p->p_lock);
1549 
1550 		ASSERT(in_set);
1551 		ASSERT(p->p_poolflag & PBWAIT);
1552 		ASSERT(p->p_poolcnt == 0);
1553 	}
1554 #endif
1555 
1556 	/*
1557 	 * Do the check if processor set rebinding is going to succeed or not.
1558 	 */
1559 	if ((flags & POOL_BIND_PSET) &&
1560 	    (rv = pset_bind_start(procs, pool)) != 0) {
1561 		pool_bind_wakeall(procs);
1562 		goto out;
1563 	}
1564 
1565 	/*
1566 	 * At this point, all bind operations should succeed.
1567 	 */
1568 	for (pp = procs; (p = *pp) != NULL; pp++) {
1569 		if (flags & POOL_BIND_PSET) {
1570 			psetid_t psetid = pool->pool_pset->pset_id;
1571 			void *zonebuf;
1572 			void *projbuf;
1573 
1574 			/*
1575 			 * Pre-allocate one buffer for FSS (per-project
1576 			 * buffer for a new pset) in case if this is the
1577 			 * first thread from its current project getting
1578 			 * bound to this processor set.
1579 			 */
1580 			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
1581 			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);
1582 
1583 			mutex_enter(&pidlock);
1584 			mutex_enter(&p->p_lock);
1585 			pool_pset_bind(p, psetid, projbuf, zonebuf);
1586 			mutex_exit(&p->p_lock);
1587 			mutex_exit(&pidlock);
1588 			/*
1589 			 * Free buffers pre-allocated above if it
1590 			 * wasn't actually used.
1591 			 */
1592 			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
1593 			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
1594 		}
1595 		/*
1596 		 * Now let's change the scheduling class of this
1597 		 * process if our target pool has it defined.
1598 		 */
1599 		if (cid != POOL_CLASS_UNSET)
1600 			pool_change_class(p, cid);
1601 
1602 		/*
1603 		 * It is safe to reference p_pool here without holding
1604 		 * p_lock because it cannot change underneath of us.
1605 		 * We're holding pool_lock here, so nobody else can be
1606 		 * moving this process between pools.  If process "p"
1607 		 * would be exiting, we're guaranteed that it would be blocked
1608 		 * at pool_barrier_enter() in exit().  Otherwise, it would've
1609 		 * been skipped by one of our scans of the practive list
1610 		 * as a process with PEXITED flag set.
1611 		 */
1612 		if (p->p_pool != pool) {
1613 			ASSERT(p->p_pool->pool_ref > 0);
1614 			atomic_add_32(&p->p_pool->pool_ref, -1);
1615 			p->p_pool = pool;
1616 			atomic_add_32(&p->p_pool->pool_ref, 1);
1617 		}
1618 		/*
1619 		 * Okay, we've tortured this guy enough.
1620 		 * Let this poor process go now.
1621 		 */
1622 		pool_bind_wake(p);
1623 	}
1624 	if (flags & POOL_BIND_PSET)
1625 		pset_bind_finish();
1626 
1627 out:	switch (idtype) {
1628 	case P_PROJID:
1629 		ASSERT(kpj != NULL);
1630 		mutex_exit(&kpj->kpj_poolbind);
1631 		project_rele(kpj);
1632 		break;
1633 	case P_ZONEID:
1634 		if (rv == 0) {
1635 			mutex_enter(&cpu_lock);
1636 			zone_pool_set(zone, pool);
1637 			mutex_exit(&cpu_lock);
1638 		}
1639 		zone->zone_pool_mod = gethrtime();
1640 		zone_rele(zone);
1641 		break;
1642 	}
1643 
1644 	kmem_free(procs, procs_size * sizeof (proc_t *));
1645 	ASSERT(pool_barrier_count == 0);
1646 	return (rv);
1647 }
1648