1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/pool.h>
30 #include <sys/pool_impl.h>
31 #include <sys/pool_pset.h>
32 #include <sys/id_space.h>
33 #include <sys/mutex.h>
34 #include <sys/nvpair.h>
35 #include <sys/cpuvar.h>
36 #include <sys/errno.h>
37 #include <sys/cmn_err.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 #include <sys/fss.h>
41 #include <sys/class.h>
42 #include <sys/exacct.h>
43 #include <sys/utsname.h>
44 #include <sys/procset.h>
45 #include <sys/atomic.h>
46 #include <sys/zone.h>
47 #include <sys/policy.h>
48 
49 /*
50  * RESOURCE POOLS
51  *
52  * The resource pools facility brings together process-bindable resources into
53  * a common abstraction called a pool. Processor sets and other entities can
54  * be configured, grouped, and labelled such that workload components can be
55  * associated with a subset of a system's total resources.
56  *
57  * When disabled, the pools facility is "invisible".  All processes belong
58  * to the same pool (pool_default), and processor sets can be managed through
59  * the old pset() system call.  When enabled, processor sets can only be
60  * managed via the pools facility.  New pools can be created and associated
61  * with processor sets.  Processes can be bound to pools which have non-empty
62  * resource sets.
63  *
64  * Locking: pool_lock() protects global pools state and must be called
65  * before modifying the configuration, or when taking a snapshot of the
66  * configuration.  If pool_lock_intr() is used, the operation may be
67  * interrupted by a signal or a stop request (e.g., from /proc).
68  *
69  * To prevent processes from being rebound between pools while they are
70  * in the middle of an operation which affects resource set bindings, such
71  * operations must be surrounded by calls to pool_barrier_enter() and
72  * pool_barrier_exit().  This mechanism guarantees that such processes will
73  * be stopped either at the beginning or at the end of the barrier so that
74  * the rebind operation can atomically bind the process and its threads
75  * to new resource sets, and then let the process run again.
76  *
77  * Lock ordering with respect to other locks is as follows:
78  *
79  * 	pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
80  *
81  * Most static and global variables defined in this file are protected
82  * by calling pool_lock().
83  *
84  * The operation that binds tasks and projects to pools is atomic.  That is,
85  * either all processes in a given task or a project will be bound to a
86  * new pool, or (in case of an error) they will all be left bound to the
87  * old pool. Processes in a given task or a given project can only be bound to
88  * different pools if they were rebound individually one by one as single
89  * processes.  Threads or LWPs of the same process do not have pool bindings,
90  * and are bound to the same resource sets associated with the resource pool
91  * of that process.
92  *
93  * The following picture shows one possible pool configuration with three
94  * pools and three processor sets.  Note that processor set "foo" is not
95  * associated with any pools and therefore cannot have any processes
96  * bound to it.  Two pools (default and foo) are associated with the
97  * same processor set (default).  Also, note that processes in Task 2
98  * are bound to different pools.
99  *
100  *
101  *							       Processor Sets
102  *								+---------+
103  *		       +--------------+========================>| default |
104  *		      a|	      |				+---------+
105  *		      s|	      |				    ||
106  *		      s|	      |				+---------+
107  *		      o|	      |				|   foo   |
108  *		      c|	      |				+---------+
109  *		      i|	      |				    ||
110  *		      a|	      |				+---------+
111  *		      t|	      |			+------>|   bar   |
112  *		      e|	      |			|	+---------+
113  *                    d|              |                 |
114  *                     |              |                 |
115  *	       +---------+      +---------+      +---------+
116  *     Pools   | default |======|   foo   |======|   bar   |
117  *	       +---------+      +---------+      +---------+
118  *	           @  @            @              @ @   @
119  *                b|  |            |              | |   |
120  *                o|  |            |              | |   |
121  *                u|  +-----+      |      +-------+ |   +---+
122  *                n|        |      |      |         |       |
123  *            ....d|........|......|......|.........|.......|....
124  *            :    |   ::   |      |      |    ::   |       |   :
125  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
126  *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
127  *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
128  *            :........::......................::...............:
129  *              Task 1            Task 2              Task N
130  *                 |                 |                  |
131  *                 |                 |                  |
132  *                 |  +-----------+  |             +-----------+
133  *                 +--| Project 1 |--+             | Project N |
134  *                    +-----------+                +-----------+
135  *
136  * This is just an illustration of relationships between processes, tasks,
137  * projects, pools, and processor sets. New types of resource sets will be
138  * added in the future.
139  */
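
/*
 * Editorial sketch (not part of the original source): the relationships in
 * the picture above are mirrored in the data structures.  With the facility
 * enabled, a process reaches its CPU resources through its pool; the helper
 * name below is hypothetical.
 *
 *	psetid_t
 *	example_pset_of(proc_t *p)
 *	{
 *		ASSERT(pool_lock_held());
 *		return (p->p_pool->pool_pset->pset_id);
 *	}
 */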
140 
141 pool_t		*pool_default;	/* default pool which always exists */
142 int		pool_count;	/* number of pools created on this system */
143 int		pool_state;	/* pools state -- enabled/disabled */
144 void		*pool_buf;	/* pre-commit snapshot of the pools state */
145 size_t		pool_bufsz;	/* size of pool_buf */
146 static hrtime_t	pool_pool_mod;	/* last modification time for pools */
147 static hrtime_t	pool_sys_mod;	/* last modification time for system */
148 static nvlist_t	*pool_sys_prop;	/* system properties */
149 static id_space_t *pool_ids;	/* pool ID space */
150 static list_t	pool_list;	/* doubly-linked list of pools */
151 static kmutex_t		pool_mutex;		/* protects pool_busy_* */
152 static kcondvar_t	pool_busy_cv;		/* waiting for "pool_lock" */
153 static kthread_t	*pool_busy_thread;	/* thread holding "pool_lock" */
154 static kmutex_t		pool_barrier_lock;	/* synch. with pool_barrier_* */
155 static kcondvar_t	pool_barrier_cv;	/* synch. with pool_barrier_* */
156 static int		pool_barrier_count;	/* synch. with pool_barrier_* */
157 
158 /*
159  * Boot-time pool initialization.
160  */
161 void
162 pool_init(void)
163 {
164 	pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
165 
166 	/*
167 	 * Initialize default pool.
168 	 */
169 	pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
170 	pool_default->pool_id = POOL_DEFAULT;
171 	list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
172 	list_insert_head(&pool_list, pool_default);
173 
174 	/*
175 	 * Initialize plugins for resource sets.
176 	 */
177 	pool_pset_init();
178 	pool_count = 1;
179 	p0.p_pool = pool_default;
180 	global_zone->zone_pool = pool_default;
181 	pool_default->pool_ref = 1;
182 }
183 
184 /*
185  * Synchronization routines.
186  *
187  * pool_lock is only called from syscall-level routines (processor_bind(),
188  * pset_*(), and /dev/pool ioctls).  The pool "lock" may be held for long
189  * periods of time, including across sleeping operations, so we allow its
190  * acquisition to be interruptible.
191  *
192  * The current thread that owns the "lock" is stored in the variable
193  * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
194  */
195 void
196 pool_lock(void)
197 {
198 	mutex_enter(&pool_mutex);
199 	while (pool_busy_thread != NULL)
200 		cv_wait(&pool_busy_cv, &pool_mutex);
201 	pool_busy_thread = curthread;
202 	mutex_exit(&pool_mutex);
203 }
204 
205 int
206 pool_lock_intr(void)
207 {
208 	mutex_enter(&pool_mutex);
209 	while (pool_busy_thread != NULL) {
210 		if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
211 			cv_signal(&pool_busy_cv);
212 			mutex_exit(&pool_mutex);
213 			return (1);
214 		}
215 	}
216 	pool_busy_thread = curthread;
217 	mutex_exit(&pool_mutex);
218 	return (0);
219 }
220 
221 int
222 pool_lock_held(void)
223 {
224 	return (pool_busy_thread == curthread);
225 }
226 
227 void
228 pool_unlock(void)
229 {
230 	mutex_enter(&pool_mutex);
231 	pool_busy_thread = NULL;
232 	cv_signal(&pool_busy_cv);
233 	mutex_exit(&pool_mutex);
234 }
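
/*
 * Editorial sketch (not part of the original source): the typical bracket
 * used by a syscall-level consumer.  The particular operation shown
 * (enabling the facility) is arbitrary; the point is that pool_lock_intr()
 * returns non-zero when interrupted, in which case the "lock" was not
 * acquired and must not be dropped.
 *
 *	int error;
 *
 *	if (pool_lock_intr() != 0)
 *		return (EINTR);
 *	ASSERT(pool_lock_held());
 *	error = pool_status(POOL_ENABLED);
 *	pool_unlock();
 *	return (error);
 */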
235 
236 /*
237  * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
238  * with pool_do_bind().
239  *
240  * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
241  * operations which modify pool or pset associations.  They can be called
242  * while the process is multi-threaded.  In the common case, when current
243  * process is not being rebound (PBWAIT flag is not set), these functions
244  * will be just incrementing and decrementing reference counts.
245  */
246 void
247 pool_barrier_enter(void)
248 {
249 	proc_t *p = curproc;
250 
251 	ASSERT(MUTEX_HELD(&p->p_lock));
252 	while (p->p_poolflag & PBWAIT)
253 		cv_wait(&p->p_poolcv, &p->p_lock);
254 	p->p_poolcnt++;
255 }
256 
257 void
258 pool_barrier_exit(void)
259 {
260 	proc_t *p = curproc;
261 
262 	ASSERT(MUTEX_HELD(&p->p_lock));
263 	ASSERT(p->p_poolcnt > 0);
264 	p->p_poolcnt--;
265 	if (p->p_poolflag & PBWAIT) {
266 		mutex_enter(&pool_barrier_lock);
267 		ASSERT(pool_barrier_count > 0);
268 		pool_barrier_count--;
269 		if (pool_barrier_count == 0)
270 			cv_signal(&pool_barrier_cv);
271 		mutex_exit(&pool_barrier_lock);
272 		while (p->p_poolflag & PBWAIT)
273 			cv_wait(&p->p_poolcv, &p->p_lock);
274 	}
275 }
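
/*
 * Editorial sketch (not part of the original source): the bracket used by
 * callers such as cfork() around work that must not race with a rebind.
 * Both calls are made with p_lock held, per the ASSERTs above; the work in
 * the middle is elided.
 *
 *	mutex_enter(&curproc->p_lock);
 *	pool_barrier_enter();
 *	mutex_exit(&curproc->p_lock);
 *
 *	(binding-sensitive work goes here)
 *
 *	mutex_enter(&curproc->p_lock);
 *	pool_barrier_exit();
 *	mutex_exit(&curproc->p_lock);
 */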
276 
277 /*
278  * Enable pools facility.
279  */
280 static int
281 pool_enable(void)
282 {
283 	int ret;
284 
285 	ASSERT(pool_lock_held());
286 	ASSERT(pool_count == 1);
287 
288 	ret = pool_pset_enable();
289 	if (ret != 0)
290 		return (ret);
291 	(void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
292 	(void) nvlist_add_string(pool_sys_prop, "system.name",
293 	    utsname.nodename);
294 	(void) nvlist_add_string(pool_sys_prop, "system.comment", "");
295 	(void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
296 	(void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
297 
298 	(void) nvlist_alloc(&pool_default->pool_props,
299 	    NV_UNIQUE_NAME, KM_SLEEP);
300 	(void) nvlist_add_string(pool_default->pool_props,
301 	    "pool.name", "pool_default");
302 	(void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
303 	(void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
304 	(void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
305 	(void) nvlist_add_int64(pool_default->pool_props,
306 	    "pool.importance", 1);
307 	(void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
308 	    pool_default->pool_id);
309 
310 	pool_sys_mod = pool_pool_mod = gethrtime();
311 
312 	return (ret);
313 }
314 
315 /*
316  * Disable pools facility.
317  */
318 static int
319 pool_disable(void)
320 {
321 	int ret;
322 
323 	ASSERT(pool_lock_held());
324 
325 	if (pool_count > 1)	/* must destroy all pools first */
326 		return (EBUSY);
327 
328 	ret = pool_pset_disable();
329 	if (ret != 0)
330 		return (ret);
331 	if (pool_sys_prop != NULL) {
332 		nvlist_free(pool_sys_prop);
333 		pool_sys_prop = NULL;
334 	}
335 	if (pool_default->pool_props != NULL) {
336 		nvlist_free(pool_default->pool_props);
337 		pool_default->pool_props = NULL;
338 	}
339 	return (0);
340 }
341 
342 pool_t *
343 pool_lookup_pool_by_name(char *name)
344 {
345 	pool_t *pool = pool_default;
346 	char *p;
347 
348 	ASSERT(pool_lock_held());
349 	for (pool = list_head(&pool_list); pool;
350 	    pool = list_next(&pool_list, pool)) {
351 		if (nvlist_lookup_string(pool->pool_props,
352 		    "pool.name", &p) == 0 && strcmp(name, p) == 0)
353 			return (pool);
354 	}
355 	return (NULL);
356 }
357 
358 pool_t *
359 pool_lookup_pool_by_id(poolid_t poolid)
360 {
361 	pool_t *pool = pool_default;
362 
363 	ASSERT(pool_lock_held());
364 	for (pool = list_head(&pool_list); pool;
365 	    pool = list_next(&pool_list, pool)) {
366 		if (pool->pool_id == poolid)
367 			return (pool);
368 	}
369 	return (NULL);
370 }
371 
372 /*
373  * Create new pool, associate it with default resource sets, and give
374  * it a temporary name.
375  */
376 static int
377 pool_pool_create(poolid_t *poolid)
378 {
379 	pool_t *pool;
380 	char pool_name[40];
381 
382 	ASSERT(pool_lock_held());
383 
384 	pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
385 	pool->pool_id = *poolid = id_alloc(pool_ids);
386 	pool->pool_pset = pool_pset_default;
387 	pool_pset_default->pset_npools++;
388 	list_insert_tail(&pool_list, pool);
389 	(void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
390 	(void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
391 	(void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
392 	pool_pool_mod = gethrtime();
393 	(void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
394 	    pool_pool_mod);
395 	(void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
396 	pool_count++;
397 	return (0);
398 }
399 
400 struct destroy_zone_arg {
401 	pool_t *old;
402 	pool_t *new;
403 };
404 
405 /*
406  * Update pool pointers for zones that are currently bound to pool "old"
407  * to be bound to pool "new".
408  */
409 static int
410 pool_destroy_zone_cb(zone_t *zone, void *arg)
411 {
412 	struct destroy_zone_arg *dza = arg;
413 
414 	ASSERT(pool_lock_held());
415 	ASSERT(MUTEX_HELD(&cpu_lock));
416 
417 	if (zone_pool_get(zone) == dza->old)
418 		zone_pool_set(zone, dza->new);
419 	return (0);
420 }
421 
422 /*
423  * Destroy specified pool, and rebind all processes in it
424  * to the default pool.
425  */
426 static int
427 pool_pool_destroy(poolid_t poolid)
428 {
429 	pool_t *pool;
430 	int ret;
431 
432 	ASSERT(pool_lock_held());
433 
434 	if (poolid == POOL_DEFAULT)
435 		return (EINVAL);
436 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
437 		return (ESRCH);
438 	ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
439 	if (ret == 0) {
440 		struct destroy_zone_arg dzarg;
441 
442 		dzarg.old = pool;
443 		dzarg.new = pool_default;
444 		mutex_enter(&cpu_lock);
445 		ret = zone_walk(pool_destroy_zone_cb, &dzarg);
446 		mutex_exit(&cpu_lock);
447 		ASSERT(ret == 0);
448 		ASSERT(pool->pool_ref == 0);
449 		(void) nvlist_free(pool->pool_props);
450 		id_free(pool_ids, pool->pool_id);
451 		pool->pool_pset->pset_npools--;
452 		list_remove(&pool_list, pool);
453 		pool_count--;
454 		pool_pool_mod = gethrtime();
455 		kmem_free(pool, sizeof (pool_t));
456 	}
457 	return (ret);
458 }
459 
460 /*
461  * Create new pool or resource set.
462  */
463 int
464 pool_create(int class, int subclass, id_t *id)
465 {
466 	int ret;
467 
468 	ASSERT(pool_lock_held());
469 	if (pool_state == POOL_DISABLED)
470 		return (ENOTACTIVE);
471 	switch (class) {
472 	case PEC_POOL:
473 		ret = pool_pool_create((poolid_t *)id);
474 		break;
475 	case PEC_RES_COMP:
476 		switch (subclass) {
477 		case PREC_PSET:
478 			ret = pool_pset_create((psetid_t *)id);
479 			break;
480 		default:
481 			ret = EINVAL;
482 		}
483 		break;
484 	case PEC_RES_AGG:
485 		ret = ENOTSUP;
486 		break;
487 	default:
488 		ret = EINVAL;
489 	}
490 	return (ret);
491 }
492 
493 /*
494  * Destroy an existing pool or resource set.
495  */
496 int
497 pool_destroy(int class, int subclass, id_t id)
498 {
499 	int ret;
500 
501 	ASSERT(pool_lock_held());
502 	if (pool_state == POOL_DISABLED)
503 		return (ENOTACTIVE);
504 	switch (class) {
505 	case PEC_POOL:
506 		ret = pool_pool_destroy((poolid_t)id);
507 		break;
508 	case PEC_RES_COMP:
509 		switch (subclass) {
510 		case PREC_PSET:
511 			ret = pool_pset_destroy((psetid_t)id);
512 			break;
513 		default:
514 			ret = EINVAL;
515 		}
516 		break;
517 	case PEC_RES_AGG:
518 		ret = ENOTSUP;
519 		break;
520 	default:
521 		ret = EINVAL;
522 	}
523 	return (ret);
524 }
525 
526 /*
527  * Enable or disable pools.
528  */
529 int
530 pool_status(int status)
531 {
532 	int ret = 0;
533 
534 	ASSERT(pool_lock_held());
535 
536 	if (pool_state == status)
537 		return (0);
538 	switch (status) {
539 	case POOL_ENABLED:
540 		ret = pool_enable();
541 		if (ret != 0)
542 			return (ret);
543 		pool_state = POOL_ENABLED;
544 		break;
545 	case POOL_DISABLED:
546 		ret = pool_disable();
547 		if (ret != 0)
548 			return (ret);
549 		pool_state = POOL_DISABLED;
550 		break;
551 	default:
552 		ret = EINVAL;
553 	}
554 	return (ret);
555 }
556 
557 /*
558  * Associate pool with resource set.
559  */
560 int
561 pool_assoc(poolid_t poolid, int idtype, id_t id)
562 {
563 	int ret;
564 
565 	ASSERT(pool_lock_held());
566 	if (pool_state == POOL_DISABLED)
567 		return (ENOTACTIVE);
568 	switch (idtype) {
569 	case PREC_PSET:
570 		ret = pool_pset_assoc(poolid, (psetid_t)id);
571 		break;
572 	default:
573 		ret = EINVAL;
574 	}
575 	if (ret == 0)
576 		pool_pool_mod = gethrtime();
577 	return (ret);
578 }
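
/*
 * Editorial sketch (not part of the original source): with the facility
 * enabled and pool_lock() held, a new pool is typically wired to a new
 * processor set with the three calls below.  Error handling is elided.
 *
 *	id_t poolid, psetid;
 *
 *	(void) pool_create(PEC_POOL, 0, &poolid);
 *	(void) pool_create(PEC_RES_COMP, PREC_PSET, &psetid);
 *	(void) pool_assoc((poolid_t)poolid, PREC_PSET, psetid);
 */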
579 
580 /*
581  * Disassociate resource set from pool.
582  */
583 int
584 pool_dissoc(poolid_t poolid, int idtype)
585 {
586 	int ret;
587 
588 	ASSERT(pool_lock_held());
589 	if (pool_state == POOL_DISABLED)
590 		return (ENOTACTIVE);
591 	switch (idtype) {
592 	case PREC_PSET:
593 		ret = pool_pset_assoc(poolid, PS_NONE);
594 		break;
595 	default:
596 		ret = EINVAL;
597 	}
598 	if (ret == 0)
599 		pool_pool_mod = gethrtime();
600 	return (ret);
601 }
602 
603 /*
604  * Transfer specified quantity of resources between resource sets.
605  */
606 /*ARGSUSED*/
607 int
608 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
609 {
610 	int ret = EINVAL;
611 	return (ret);
612 }
613 
614 /*
615  * Transfer resources specified by their IDs between resource sets.
616  */
617 int
618 pool_xtransfer(int type, id_t src, id_t dst, uint_t size, id_t *ids)
619 {
620 	int ret;
621 
622 	ASSERT(pool_lock_held());
623 	if (pool_state == POOL_DISABLED)
624 		return (ENOTACTIVE);
625 	switch (type) {
626 	case PREC_PSET:
627 		ret = pool_pset_xtransfer((psetid_t)src, (psetid_t)dst,
628 		    size, ids);
629 		break;
630 	default:
631 		ret = EINVAL;
632 	}
633 	return (ret);
634 }
635 
636 /*
637  * Bind processes to pools.
638  */
639 int
640 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
641 {
642 	pool_t *pool;
643 
644 	ASSERT(pool_lock_held());
645 
646 	if (pool_state == POOL_DISABLED)
647 		return (ENOTACTIVE);
648 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
649 		return (ESRCH);
650 
651 	switch (idtype) {
652 	case P_PID:
653 	case P_TASKID:
654 	case P_PROJID:
655 	case P_ZONEID:
656 		break;
657 	default:
658 		return (EINVAL);
659 	}
660 	return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
661 }
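
/*
 * Editorial sketch (not part of the original source): rebinding all
 * processes of a zone to a pool looked up by name.  The pool name and the
 * zoneid variable are hypothetical; pool_lock() is held throughout.
 *
 *	pool_t *pool;
 *
 *	if ((pool = pool_lookup_pool_by_name("web_pool")) == NULL)
 *		return (ESRCH);
 *	return (pool_bind(pool->pool_id, P_ZONEID, (id_t)zoneid));
 */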
662 
663 /*
664  * Query pool binding of the specified process.
665  */
666 int
667 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
668 {
669 	proc_t *p;
670 
671 	if (idtype != P_PID)
672 		return (ENOTSUP);
673 	if (id == P_MYID)
674 		id = curproc->p_pid;
675 
676 	ASSERT(pool_lock_held());
677 
678 	mutex_enter(&pidlock);
679 	if ((p = prfind((pid_t)id)) == NULL) {
680 		mutex_exit(&pidlock);
681 		return (ESRCH);
682 	}
683 	mutex_enter(&p->p_lock);
684 	/*
685 	 * In local zones, lie about pool bindings of processes from
686 	 * the global zone.
687 	 */
688 	if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
689 		pool_t *pool;
690 
691 		pool = zone_pool_get(curproc->p_zone);
692 		*poolid = pool->pool_id;
693 	} else {
694 		*poolid = p->p_pool->pool_id;
695 	}
696 	mutex_exit(&p->p_lock);
697 	mutex_exit(&pidlock);
698 	return (0);
699 }
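
/*
 * Editorial sketch (not part of the original source): querying the calling
 * process's own binding.  P_MYID is resolved to curproc->p_pid above.
 *
 *	id_t poolid;
 *	int error;
 *
 *	pool_lock();
 *	error = pool_query_binding(P_PID, P_MYID, &poolid);
 *	pool_unlock();
 */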
700 
701 static ea_object_t *
702 pool_system_pack(void)
703 {
704 	ea_object_t *eo_system;
705 	size_t bufsz = 0;
706 	char *buf = NULL;
707 
708 	ASSERT(pool_lock_held());
709 
710 	eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
711 	(void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
712 	    EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
713 	if (INGLOBALZONE(curproc))
714 		(void) ea_attach_item(eo_system, &pool_pool_mod,
715 		    sizeof (hrtime_t),
716 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
717 	else
718 		(void) ea_attach_item(eo_system,
719 		    &curproc->p_zone->zone_pool_mod,
720 		    sizeof (hrtime_t),
721 		    EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
722 	(void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
723 	    EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
724 	(void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
725 	    EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
726 	(void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
727 	(void) ea_attach_item(eo_system, buf, bufsz,
728 	    EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
729 	kmem_free(buf, bufsz);
730 	return (eo_system);
731 }
732 
733 /*
734  * Pack information about pools and attach it to specified exacct group.
735  */
736 static int
737 pool_pool_pack(ea_object_t *eo_system)
738 {
739 	ea_object_t *eo_pool;
740 	pool_t *pool;
741 	size_t bufsz;
742 	char *buf;
743 	pool_t *myzonepool;
744 
745 	ASSERT(pool_lock_held());
746 	myzonepool = zone_pool_get(curproc->p_zone);
747 	for (pool = list_head(&pool_list); pool;
748 	    pool = list_next(&pool_list, pool)) {
749 		if (!INGLOBALZONE(curproc) && myzonepool != pool)
750 			continue;
751 		bufsz = 0;
752 		buf = NULL;
753 		eo_pool = ea_alloc_group(EXT_GROUP |
754 		    EXC_LOCAL | EXD_GROUP_POOL);
755 		(void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
756 		    EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
757 		(void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
758 		    sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
759 		(void) nvlist_pack(pool->pool_props, &buf, &bufsz,
760 		    NV_ENCODE_NATIVE, 0);
761 		(void) ea_attach_item(eo_pool, buf, bufsz,
762 		    EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
763 		kmem_free(buf, bufsz);
764 		(void) ea_attach_to_group(eo_system, eo_pool);
765 	}
766 	return (0);
767 }
768 
769 /*
770  * Pack the whole pool configuration in the specified buffer.
771  */
772 int
773 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
774 {
775 	ea_object_t *eo_system;
776 	size_t ksize;
777 	int ret = 0;
778 
779 	ASSERT(pool_lock_held());
780 
781 	eo_system = pool_system_pack();		/* 1. pack system */
782 	(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
783 	(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
784 	ksize = ea_pack_object(eo_system, NULL, 0);
785 	if (kbuf == NULL || kbufsz == 0)
786 		*asize = ksize;
787 	else if (ksize > kbufsz)
788 		ret = ENOMEM;
789 	else
790 		*asize = ea_pack_object(eo_system, kbuf, kbufsz);
791 	ea_free_object(eo_system, EUP_ALLOC);
792 	return (ret);
793 }
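
/*
 * Editorial sketch (not part of the original source): the usual two-pass
 * use of pool_pack_conf() -- ask for the required size first, then pack
 * into a buffer of that size.  Error handling is elided.
 *
 *	size_t kbufsz, asize;
 *	void *kbuf;
 *
 *	(void) pool_pack_conf(NULL, 0, &kbufsz);
 *	kbuf = kmem_alloc(kbufsz, KM_SLEEP);
 *	(void) pool_pack_conf(kbuf, kbufsz, &asize);
 *	(consume kbuf/asize, e.g. copy out to userland)
 *	kmem_free(kbuf, kbufsz);
 */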
794 
795 /*
796  * Start/end the commit transaction.  If a commit transaction is currently
797  * in progress, then all POOL_QUERY ioctls will return the pools configuration
798  * as of the beginning of the transaction.
799  */
800 int
801 pool_commit(int state)
802 {
803 	ea_object_t *eo_system;
804 	int ret = 0;
805 
806 	ASSERT(pool_lock_held());
807 
808 	if (pool_state == POOL_DISABLED)
809 		return (ENOTACTIVE);
810 	switch (state) {
811 	case 1:
812 		/*
813 		 * Beginning commit transaction.
814 		 */
815 		if (pool_buf != NULL)		/* transaction in progress */
816 			return (EBUSY);
817 		eo_system = pool_system_pack();		/* 1. pack system */
818 		(void) pool_pool_pack(eo_system);	/* 2. pack all pools */
819 		(void) pool_pset_pack(eo_system);	/* 3. pack all psets */
820 		pool_bufsz = ea_pack_object(eo_system, NULL, 0);
821 		pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
822 		pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
823 		ea_free_object(eo_system, EUP_ALLOC);
824 		break;
825 	case 0:
826 		/*
827 		 * Finishing commit transaction.
828 		 */
829 		if (pool_buf != NULL) {
830 			kmem_free(pool_buf, pool_bufsz);
831 			pool_buf = NULL;
832 			pool_bufsz = 0;
833 		}
834 		break;
835 	default:
836 		ret = EINVAL;
837 	}
838 	return (ret);
839 }
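
/*
 * Editorial sketch (not part of the original source): the commit bracket as
 * a caller would drive it.  While the transaction is open, POOL_QUERY sees
 * the snapshot taken at pool_commit(1) rather than any intermediate state.
 *
 *	int error;
 *
 *	pool_lock();
 *	error = pool_commit(1);
 *	(apply the new configuration: create/destroy/assoc/propput calls)
 *	(void) pool_commit(0);
 *	pool_unlock();
 */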
840 
841 /*
842  * Check if the specified property is special.
843  */
844 static pool_property_t *
845 pool_property_find(char *name, pool_property_t *list)
846 {
847 	pool_property_t *prop;
848 
849 	for (prop = list; prop->pp_name != NULL; prop++)
850 		if (strcmp(prop->pp_name, name) == 0)
851 			return (prop);
852 	return (NULL);
853 }
854 
855 static pool_property_t pool_prop_sys[] = {
856 	{ "system.name",		DATA_TYPE_STRING,	PP_RDWR },
857 	{ "system.comment",		DATA_TYPE_STRING,	PP_RDWR },
858 	{ "system.version",		DATA_TYPE_UINT64,	PP_READ },
859 	{ "system.bind-default",	DATA_TYPE_BYTE,		PP_RDWR },
860 	{ "system.allocate-method",	DATA_TYPE_STRING,
861 	    PP_RDWR | PP_OPTIONAL },
862 	{ "system.poold.log-level",	DATA_TYPE_STRING,
863 	    PP_RDWR | PP_OPTIONAL },
864 	{ "system.poold.log-location",	DATA_TYPE_STRING,
865 	    PP_RDWR | PP_OPTIONAL },
866 	{ "system.poold.monitor-interval",	DATA_TYPE_UINT64,
867 	    PP_RDWR | PP_OPTIONAL },
868 	{ "system.poold.history-file",	DATA_TYPE_STRING,
869 	    PP_RDWR | PP_OPTIONAL },
870 	{ "system.poold.objectives",	DATA_TYPE_STRING,
871 	    PP_RDWR | PP_OPTIONAL },
872 	{ NULL,				0,			0 }
873 };
874 
875 static pool_property_t pool_prop_pool[] = {
876 	{ "pool.sys_id",		DATA_TYPE_UINT64,	PP_READ },
877 	{ "pool.name",			DATA_TYPE_STRING,	PP_RDWR },
878 	{ "pool.default",		DATA_TYPE_BYTE,		PP_READ },
879 	{ "pool.active",		DATA_TYPE_BYTE,		PP_RDWR },
880 	{ "pool.importance",		DATA_TYPE_INT64,	PP_RDWR },
881 	{ "pool.comment",		DATA_TYPE_STRING,	PP_RDWR },
882 	{ "pool.scheduler",		DATA_TYPE_STRING,
883 	    PP_RDWR | PP_OPTIONAL },
884 	{ NULL,				0,			0 }
885 };
886 
887 /*
888  * Common routine to put new property on the specified list
889  */
890 int
891 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
892 {
893 	pool_property_t *prop;
894 
895 	if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
896 		/*
897 		 * No read-only properties or properties with bad types
898 		 */
899 		if (!(prop->pp_perm & PP_WRITE) ||
900 		    prop->pp_type != nvpair_type(pair))
901 			return (EINVAL);
902 	}
903 	return (nvlist_add_nvpair(nvlist, pair));
904 }
905 
906 /*
907  * Common routine to remove property from the given list
908  */
909 int
910 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
911 {
912 	pool_property_t *prop;
913 
914 	if ((prop = pool_property_find(name, props)) != NULL) {
915 		if (!(prop->pp_perm & PP_OPTIONAL))
916 			return (EINVAL);
917 	}
918 	return (nvlist_remove_all(nvlist, name));
919 }
920 
921 static int
922 pool_system_propput(nvpair_t *pair)
923 {
924 	int ret;
925 
926 	ASSERT(pool_lock_held());
927 	ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
928 	if (ret == 0)
929 		pool_sys_mod = gethrtime();
930 	return (ret);
931 }
932 
933 static int
934 pool_system_proprm(char *name)
935 {
936 	int ret;
937 
938 	ASSERT(pool_lock_held());
939 	ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
940 	if (ret == 0)
941 		pool_sys_mod = gethrtime();
942 	return (ret);
943 }
944 
945 static int
946 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
947 {
948 	pool_t *pool;
949 	int ret;
950 
951 	ASSERT(pool_lock_held());
952 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
953 		return (ESRCH);
954 	ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
955 	if (ret == 0)
956 		pool_pool_mod = gethrtime();
957 	return (ret);
958 }
959 
960 static int
961 pool_pool_proprm(poolid_t poolid, char *name)
962 {
963 	int ret;
964 	pool_t *pool;
965 
966 	ASSERT(pool_lock_held());
967 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
968 		return (ESRCH);
969 	ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
970 	if (ret == 0)
971 		pool_pool_mod = gethrtime();
972 	return (ret);
973 }
974 
975 int
976 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
977 {
978 	int ret;
979 
980 	ASSERT(pool_lock_held());
981 	if (pool_state == POOL_DISABLED)
982 		return (ENOTACTIVE);
983 	switch (class) {
984 	case PEC_SYSTEM:
985 		ret = pool_system_propput(pair);
986 		break;
987 	case PEC_POOL:
988 		ret = pool_pool_propput((poolid_t)id, pair);
989 		break;
990 	case PEC_RES_COMP:
991 		switch (subclass) {
992 		case PREC_PSET:
993 			ret = pool_pset_propput((psetid_t)id, pair);
994 			break;
995 		default:
996 			ret = EINVAL;
997 		}
998 		break;
999 	case PEC_RES_AGG:
1000 		ret = ENOTSUP;
1001 		break;
1002 	case PEC_COMP:
1003 		switch (subclass) {
1004 		case PCEC_CPU:
1005 			ret = pool_cpu_propput((processorid_t)id, pair);
1006 			break;
1007 		default:
1008 			ret = EINVAL;
1009 		}
1010 		break;
1011 	default:
1012 		ret = EINVAL;
1013 	}
1014 	return (ret);
1015 }
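
/*
 * Editorial sketch (not part of the original source): setting a writable
 * pool property.  The nvpair is built in a scratch nvlist; the property
 * value and the poolid variable are hypothetical.  A read-only property
 * such as "pool.sys_id" would be rejected with EINVAL by
 * pool_propput_common().
 *
 *	nvlist_t *nvl;
 *	nvpair_t *pair;
 *	int error;
 *
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
 *	(void) nvlist_add_string(nvl, "pool.comment", "batch workloads");
 *	pair = nvlist_next_nvpair(nvl, NULL);
 *	error = pool_propput(PEC_POOL, 0, (id_t)poolid, pair);
 *	nvlist_free(nvl);
 */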
1016 
1017 int
1018 pool_proprm(int class, int subclass, id_t id, char *name)
1019 {
1020 	int ret;
1021 
1022 	ASSERT(pool_lock_held());
1023 	if (pool_state == POOL_DISABLED)
1024 		return (ENOTACTIVE);
1025 	switch (class) {
1026 	case PEC_SYSTEM:
1027 		ret = pool_system_proprm(name);
1028 		break;
1029 	case PEC_POOL:
1030 		ret = pool_pool_proprm((poolid_t)id, name);
1031 		break;
1032 	case PEC_RES_COMP:
1033 		switch (subclass) {
1034 		case PREC_PSET:
1035 			ret = pool_pset_proprm((psetid_t)id, name);
1036 			break;
1037 		default:
1038 			ret = EINVAL;
1039 		}
1040 		break;
1041 	case PEC_RES_AGG:
1042 		ret = ENOTSUP;
1043 		break;
1044 	case PEC_COMP:
1045 		switch (subclass) {
1046 		case PCEC_CPU:
1047 			ret = pool_cpu_proprm((processorid_t)id, name);
1048 			break;
1049 		default:
1050 			ret = EINVAL;
1051 		}
1052 		break;
1053 	default:
1054 		ret = EINVAL;
1055 	}
1056 	return (ret);
1057 }
1058 
1059 int
1060 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1061 {
1062 	int ret;
1063 	nvlist_t *nvl;
1064 
1065 	ASSERT(pool_lock_held());
1066 	if (pool_state == POOL_DISABLED)
1067 		return (ENOTACTIVE);
1068 
1069 	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1070 
1071 	switch (class) {
1072 	case PEC_SYSTEM:
1073 	case PEC_POOL:
1074 		ret = EINVAL;
1075 		break;
1076 	case PEC_RES_COMP:
1077 		switch (subclass) {
1078 		case PREC_PSET:
1079 			ret = pool_pset_propget((psetid_t)id, name, nvl);
1080 			break;
1081 		default:
1082 			ret = EINVAL;
1083 		}
1084 		break;
1085 	case PEC_RES_AGG:
1086 		ret = ENOTSUP;
1087 		break;
1088 	case PEC_COMP:
1089 		switch (subclass) {
1090 		case PCEC_CPU:
1091 			ret = pool_cpu_propget((processorid_t)id, name, nvl);
1092 			break;
1093 		default:
1094 			ret = EINVAL;
1095 		}
1096 		break;
1097 	default:
1098 		ret = EINVAL;
1099 	}
1100 	if (ret == 0)
1101 		*nvlp = nvl;
1102 	else
1103 		nvlist_free(nvl);
1104 	return (ret);
1105 }
1106 
1107 /*
1108  * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1109  * in case of failure in pool_do_bind().
1110  */
1111 static void
1112 pool_bind_wake(proc_t *p)
1113 {
1114 	ASSERT(pool_lock_held());
1115 
1116 	mutex_enter(&p->p_lock);
1117 	ASSERT(p->p_poolflag & PBWAIT);
1118 	if (p->p_poolcnt > 0) {
1119 		mutex_enter(&pool_barrier_lock);
1120 		pool_barrier_count -= p->p_poolcnt;
1121 		mutex_exit(&pool_barrier_lock);
1122 	}
1123 	p->p_poolflag &= ~PBWAIT;
1124 	cv_signal(&p->p_poolcv);
1125 	mutex_exit(&p->p_lock);
1126 }
1127 
1128 static void
1129 pool_bind_wakeall(proc_t **procs)
1130 {
1131 	proc_t *p, **pp;
1132 
1133 	ASSERT(pool_lock_held());
1134 	for (pp = procs; (p = *pp) != NULL; pp++)
1135 		pool_bind_wake(p);
1136 }
1137 
1138 /*
1139  * Return the scheduling class for this pool, or
1140  * 	POOL_CLASS_UNSET if not set, or
1141  * 	POOL_CLASS_INVAL if set to an invalid class ID.
1142  */
1143 id_t
1144 pool_get_class(pool_t *pool)
1145 {
1146 	char *name;
1147 	id_t cid;
1148 
1149 	ASSERT(pool_lock_held());
1150 
1151 	if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1152 	    &name) == 0) {
1153 		if (getcidbyname(name, &cid) == 0)
1154 			return (cid);
1155 		else
1156 			return (POOL_CLASS_INVAL);
1157 	}
1158 	return (POOL_CLASS_UNSET);
1159 }
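
/*
 * Editorial sketch (not part of the original source): how pool_do_bind()
 * consumes the value returned above -- POOL_CLASS_INVAL aborts the bind,
 * POOL_CLASS_UNSET leaves each process in its current class, and any other
 * value is applied with pool_change_class().
 *
 *	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
 *		return (EINVAL);
 *	(later, for each process p being rebound)
 *	if (cid != POOL_CLASS_UNSET)
 *		pool_change_class(p, cid);
 */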
1160 
1161 /*
1162  * Move process to the new scheduling class.
1163  */
1164 static void
1165 pool_change_class(proc_t *p, id_t cid)
1166 {
1167 	kthread_t *t;
1168 	void *cldata;
1169 	id_t oldcid;
1170 	void **bufs;
1171 	void **buf;
1172 	int nlwp;
1173 	int ret;
1174 	int i;
1175 
1176 	/*
1177 	 * Do not move kernel processes (such as zsched).
1178 	 */
1179 	if (p->p_flag & SSYS)
1180 		return;
1181 	/*
1182 	 * This process is in the pool barrier, so it can't possibly be
1183 	 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1184 	 * (for a possible agent LWP, which doesn't use the pool barrier) as
1185 	 * our upper bound.
1186 	 */
1187 	nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1188 
1189 	/*
1190 	 * Pre-allocate scheduling class specific buffers before
1191 	 * grabbing p_lock.
1192 	 */
1193 	bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1194 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1195 		ret = CL_ALLOC(buf, cid, KM_SLEEP);
1196 		ASSERT(ret == 0);
1197 	}
1198 
1199 	/*
1200 	 * Move threads one by one to the new scheduling class.
1201 	 * This never fails because we have all the right
1202 	 * privileges here.
1203 	 */
1204 	mutex_enter(&p->p_lock);
1205 	ASSERT(p->p_poolflag & PBWAIT);
1206 	buf = bufs;
1207 	t = p->p_tlist;
1208 	ASSERT(t != NULL);
1209 	do {
1210 		if (t->t_cid != cid) {
1211 			oldcid = t->t_cid;
1212 			cldata = t->t_cldata;
1213 			ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1214 			ASSERT(ret == 0);
1215 			CL_EXITCLASS(oldcid, cldata);
1216 			*buf++ = NULL;
1217 		}
1218 	} while ((t = t->t_forw) != p->p_tlist);
1219 	mutex_exit(&p->p_lock);
1220 	/*
1221 	 * Free unused scheduling class specific buffers.
1222 	 */
1223 	for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1224 		if (*buf != NULL) {
1225 			CL_FREE(cid, *buf);
1226 			*buf = NULL;
1227 		}
1228 	}
1229 	kmem_free(bufs, nlwp * sizeof (void *));
1230 }
1231 
1232 /*
1233  * The meat of the bind operation.  The steps in pool_do_bind are:
1234  *
1235  * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1236  *    such processes to an array.  For any interesting process that has
1237  *    threads inside the pool barrier set, increment a counter by the
1238  *    count of such threads.  Once PBWAIT is set on a process, that process
1239  *    will not disappear.
1240  *
1241  * 2) Wait for the counter from step 1 to drop to zero.  Any process which
1242  *    calls pool_barrier_exit() and notices that PBWAIT has been set on it
1243  *    will decrement that counter before going to sleep, and the process
1244  *    calling pool_barrier_exit() which does the final decrement will wake us.
1245  *
1246  * 3) For each resource set type involved in the bind, check whether the
1247  *    bind of the gathered processes will actually succeed.  This uses the
1248  *    following three resource-set-specific functions:
1249  *
1250  *    - int set_bind_start(procs, pool)
1251  *
1252  *      Determine whether the given array of processes can be bound to the
1253  *      resource set associated with the given pool.  If it can, take and hold
1254  *      any locks necessary to ensure that the operation will succeed, and
1255  *      make any necessary reservations in the target resource set.  If it
1256  *      can't, return failure with no reservations made and no new locks held.
1257  *
1258  *    - void set_bind_abort(procs, pool)
1259  *
1260  *      set_bind_start() has completed successfully, but another resource set's
1261  *      set_bind_start() has failed, and we haven't begun the bind yet.  Undo
1262  *      any reservations made and drop any locks acquired by our
1263  *      set_bind_start().
1264  *
1265  *    - void set_bind_finish(void)
1266  *
1267  *      The bind has completed successfully.  The processes have been released,
1268  *      and the reservation acquired in set_bind_start() has been depleted as
1269  *      the processes have finished their bindings.  Drop any locks acquired by
1270  *      set_bind_start().
1271  *
1272  * 4) If we've decided that we can proceed with the bind, iterate through
1273  *    the list of interesting processes, grab the necessary locks (which
1274  *    may differ per resource set), perform the bind, and ASSERT that it
1275  *    succeeds.  Once a process has been rebound, it can be awakened.
1276  *
1277  * The operations from step 4 must be kept in sync with anything which might
1278  * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1279  * are thus located in the same source files as the associated bind operations.
1280  */
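
/*
 * Editorial sketch (not part of the original source): the shape of steps 3
 * and 4 in the body below, with the processor set functions as the only
 * resource set type currently wired in:
 *
 *	if ((flags & POOL_BIND_PSET) &&
 *	    (rv = pset_bind_start(procs, pool)) != 0) {
 *		pool_bind_wakeall(procs);
 *		goto out;
 *	}
 *	(rebind each process in procs, then wake it)
 *	if (flags & POOL_BIND_PSET)
 *		pset_bind_finish();
 */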
1281 int
1282 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
1283 {
1284 	extern uint_t nproc;
1285 	klwp_t *lwp = ttolwp(curthread);
1286 	proc_t **pp, **procs;
1287 	proc_t *prstart;
1288 	int procs_count = 0;
1289 	kproject_t *kpj;
1290 	procset_t set;
1291 	zone_t *zone;
1292 	int procs_size;
1293 	int rv = 0;
1294 	proc_t *p;
1295 	id_t cid = -1;
1296 
1297 	ASSERT(pool_lock_held());
1298 
1299 	if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
1300 		return (EINVAL);
1301 
1302 	if (idtype == P_ZONEID) {
1303 		zone = zone_find_by_id(id);
1304 		if (zone == NULL)
1305 			return (ESRCH);
1306 		if (zone_status_get(zone) > ZONE_IS_RUNNING) {
1307 			zone_rele(zone);
1308 			return (EBUSY);
1309 		}
1310 	}
1311 
1312 	if (idtype == P_PROJID) {
1313 		kpj = project_hold_by_id(id, GLOBAL_ZONEID, PROJECT_HOLD_FIND);
1314 		if (kpj == NULL)
1315 			return (ESRCH);
1316 		mutex_enter(&kpj->kpj_poolbind);
1317 	}
1318 
1319 	if (idtype == P_PID) {
1320 		/*
1321 		 * Fast-path for a single process case.
1322 		 */
1323 		procs_size = 2;	/* procs is NULL-terminated */
1324 		procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
1325 		mutex_enter(&pidlock);
1326 	} else {
1327 		/*
1328 		 * We will need enough slots for proc_t pointers for as many as
1329 		 * twice the number of currently running processes (assuming
1330 		 * that each one could be in fork() creating a new child).
1331 		 */
1332 		for (;;) {
1333 			procs_size = nproc * 2;
1334 			procs = kmem_zalloc(procs_size * sizeof (proc_t *),
1335 			    KM_SLEEP);
1336 			mutex_enter(&pidlock);
1337 
1338 			if (nproc * 2 <= procs_size)
1339 				break;
1340 			/*
1341 			 * If nproc has changed, try again.
1342 			 */
1343 			mutex_exit(&pidlock);
1344 			kmem_free(procs, procs_size * sizeof (proc_t *));
1345 		}
1346 	}
1347 
1348 	if (id == P_MYID)
1349 		id = getmyid(idtype);
1350 	setprocset(&set, POP_AND, idtype, id, P_ALL, 0);
1351 
1352 	/*
1353 	 * Do a first scan, and select target processes.
1354 	 */
1355 	if (idtype == P_PID)
1356 		prstart = prfind(id);
1357 	else
1358 		prstart = practive;
1359 	for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
1360 		mutex_enter(&p->p_lock);
1361 		/*
1362 		 * Skip processes that don't match our (id, idtype) set or
1363 		 * on the way of becoming zombies.  Skip kernel processes
1364 		 * from the global zone.
1365 		 */
1366 		if (procinset(p, &set) == 0 ||
1367 		    p->p_poolflag & PEXITED ||
1368 		    ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
1369 			mutex_exit(&p->p_lock);
1370 			continue;
1371 		}
1372 		if (!INGLOBALZONE(p)) {
1373 			switch (idtype) {
1374 			case P_PID:
1375 			case P_TASKID:
1376 				/*
1377 				 * Can't bind processes or tasks
1378 				 * in local zones to pools.
1379 				 */
1380 				mutex_exit(&p->p_lock);
1381 				mutex_exit(&pidlock);
1382 				pool_bind_wakeall(procs);
1383 				rv = EINVAL;
1384 				goto out;
1385 			case P_PROJID:
1386 				/*
1387 				 * Only projects in the global
1388 				 * zone can be rebound.
1389 				 */
1390 				mutex_exit(&p->p_lock);
1391 				continue;
1392 			case P_POOLID:
1393 				/*
1394 				 * When rebinding pools, processes can be
1395 				 * in different zones.
1396 				 */
1397 				break;
1398 			}
1399 		}
1400 
1401 		p->p_poolflag |= PBWAIT;
1402 		/*
1403 		 * If some threads in this process are inside the pool
1404 		 * barrier, add them to pool_barrier_count, as we have
1405 		 * to wait for all of them to exit the barrier.
1406 		 */
1407 		if (p->p_poolcnt > 0) {
1408 			mutex_enter(&pool_barrier_lock);
1409 			pool_barrier_count += p->p_poolcnt;
1410 			mutex_exit(&pool_barrier_lock);
1411 		}
1412 		ASSERT(pp < &procs[procs_size]);
1413 		*pp++ = p;
1414 		procs_count++;
1415 		mutex_exit(&p->p_lock);
1416 
1417 		/*
1418 		 * We just found our process, so if we're only rebinding a
1419 		 * single process then get out of this loop.
1420 		 */
1421 		if (idtype == P_PID)
1422 			break;
1423 	}
1424 	*pp = NULL;	/* cap off the end of the array */
1425 	mutex_exit(&pidlock);
1426 
1427 	/*
1428 	 * Wait for relevant processes to stop before they try to enter the
1429 	 * barrier or at the exit from the barrier.  Make sure that we do
1430 	 * not get stopped here while we're holding pool_lock.  If we were
1431 	 * requested to stop or got a signal, return EAGAIN to let the
1432 	 * library know that it needs to retry.
1433 	 */
1434 	mutex_enter(&pool_barrier_lock);
1435 	lwp->lwp_nostop++;
1436 	while (pool_barrier_count > 0) {
1437 		(void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
1438 		if (pool_barrier_count > 0) {
1439 			/*
1440 			 * We either got a signal or were requested to
1441 			 * stop by /proc.  Bail out with EAGAIN.  If we were
1442 			 * requested to stop, we'll stop in post_syscall()
1443 			 * on our way back to userland.
1444 			 */
1445 			mutex_exit(&pool_barrier_lock);
1446 			pool_bind_wakeall(procs);
1447 			lwp->lwp_nostop--;
1448 			rv = EAGAIN;
1449 			goto out;
1450 		}
1451 	}
1452 	lwp->lwp_nostop--;
1453 	mutex_exit(&pool_barrier_lock);
1454 
1455 	if (idtype == P_PID)
1456 		goto skip;
1457 
1458 	/*
1459 	 * Do another pass, and drop processes that were inside the barrier
1460 	 * in exit(); by the time they reached pool_barrier_exit() they had
1461 	 * become of no interest to us.  Pick up child processes that
1462 	 * were created by fork() but didn't exist during our first scan.
1463 	 * Their parents are now stopped at pool_barrier_exit() in cfork().
1464 	 */
1465 	mutex_enter(&pidlock);
1466 	for (pp = procs; (p = *pp) != NULL; pp++) {
1467 		if (p->p_poolflag & PEXITED) {
1468 			ASSERT(p->p_lwpcnt == 0);
1469 			pool_bind_wake(p);
1470 			/* flip w/last non-NULL slot */
1471 			*pp = procs[procs_count - 1];
1472 			procs[procs_count - 1] = NULL;
1473 			procs_count--;
1474 			pp--;			/* try this slot again */
1475 			continue;
1476 		}
1477 		/*
1478 		 * Look at the child and check if it should be rebound also.
1479 		 * We're holding pidlock, so it is safe to reference p_child.
1480 		 */
1481 		if ((p = p->p_child) == NULL)
1482 			continue;
1483 
1484 		mutex_enter(&p->p_lock);
1485 		/*
1486 		 * Skip processes in local zones if we're not binding
1487 		 * zones to pools (P_ZONEID).  Skip kernel processes also.
1488 		 */
1489 		if ((!INGLOBALZONE(p) && idtype != P_ZONEID) ||
1490 		    p->p_flag & SSYS) {
1491 			mutex_exit(&p->p_lock);
1492 			continue;
1493 		}
1494 
1495 		/*
1496 		 * If the child process has already been created by fork(), has
1497 		 * not exited, and has not been added to the list already,
1498 		 * then add it now.  We will hit this process again (since we
1499 		 * stick it at the end of the procs list) but it will be ignored
1500 		 * because it will have the PBWAIT flag set.
1501 		 */
1502 		if (procinset(p, &set) &&
1503 		    !(p->p_poolflag & PEXITED) &&
1504 		    !(p->p_poolflag & PBWAIT)) {
1505 			ASSERT(p->p_child == NULL); /* no child of a child */
1506 			procs[procs_count] = p;
1507 			procs[procs_count + 1] = NULL;
1508 			procs_count++;
1509 			p->p_poolflag |= PBWAIT;
1510 		}
1511 		mutex_exit(&p->p_lock);
1512 	}
1513 	mutex_exit(&pidlock);
1514 skip:
1515 	/*
1516 	 * If there are no processes to rebind, return ESRCH, unless
1517 	 * we're associating a pool with a new resource set, destroying it,
1518 	 * or binding a zone to a pool.
1519 	 */
1520 	if (procs_count == 0) {
1521 		if (idtype == P_POOLID || idtype == P_ZONEID)
1522 			rv = 0;
1523 		else
1524 			rv = ESRCH;
1525 		goto out;
1526 	}
1527 
1528 #ifdef DEBUG
1529 	/*
1530 	 * All processes in the array should have PBWAIT set, and none should
1531 	 * be in the critical section.  Even though p_poolflag is protected by
1532 	 * the p_lock, these assertions should be stable across the dropping of
1533 	 * p_lock.
1534 	 */
1535 	for (pp = procs; (p = *pp) != NULL; pp++) {
1536 		ASSERT(p->p_poolflag & PBWAIT);
1537 		ASSERT(p->p_poolcnt == 0);
1538 		ASSERT(procinset(p, &set));
1539 	}
1540 #endif
1541 
1542 	/*
1543 	 * Check whether the processor set rebinding is going to succeed.
1544 	 */
1545 	if ((flags & POOL_BIND_PSET) &&
1546 	    (rv = pset_bind_start(procs, pool)) != 0) {
1547 		pool_bind_wakeall(procs);
1548 		goto out;
1549 	}
1550 
1551 	/*
1552 	 * At this point, all bind operations should succeed.
1553 	 */
1554 	for (pp = procs; (p = *pp) != NULL; pp++) {
1555 		if (flags & POOL_BIND_PSET) {
1556 			psetid_t psetid = pool->pool_pset->pset_id;
1557 			void *zonebuf;
1558 			void *projbuf;
1559 
1560 			/*
1561 			 * Pre-allocate per-project and per-zone FSS
1562 			 * buffers for the new pset in case this is the
1563 			 * first thread from its project or zone getting
1564 			 * bound to this processor set.
1565 			 */
1566 			projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
1567 			zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);
1568 
1569 			mutex_enter(&pidlock);
1570 			mutex_enter(&p->p_lock);
1571 			pool_pset_bind(p, psetid, projbuf, zonebuf);
1572 			mutex_exit(&p->p_lock);
1573 			mutex_exit(&pidlock);
1574 			/*
1575 			 * Free the buffers pre-allocated above if they
1576 			 * weren't actually used.
1577 			 */
1578 			fss_freebuf(projbuf, FSS_ALLOC_PROJ);
1579 			fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
1580 		}
1581 		/*
1582 		 * Now let's change the scheduling class of this
1583 		 * process if our target pool has it defined.
1584 		 */
1585 		if (cid != POOL_CLASS_UNSET)
1586 			pool_change_class(p, cid);
1587 
1588 		/*
1589 		 * It is safe to reference p_pool here without holding
1590 		 * p_lock because it cannot change underneath us.
1591 		 * We're holding pool_lock here, so nobody else can be
1592 		 * moving this process between pools.  If process "p"
1593 		 * were exiting, we're guaranteed that it would be blocked
1594 		 * at pool_barrier_enter() in exit().  Otherwise, it would've
1595 		 * been skipped by one of our scans of the practive list
1596 		 * as a process with the PEXITED flag set.
1597 		 */
1598 		if (p->p_pool != pool) {
1599 			ASSERT(p->p_pool->pool_ref > 0);
1600 			atomic_add_32(&p->p_pool->pool_ref, -1);
1601 			p->p_pool = pool;
1602 			atomic_add_32(&p->p_pool->pool_ref, 1);
1603 		}
1604 		/*
1605 		 * Okay, we've tortured this guy enough.
1606 		 * Let this poor process go now.
1607 		 */
1608 		pool_bind_wake(p);
1609 	}
1610 	if (flags & POOL_BIND_PSET)
1611 		pset_bind_finish();
1612 
1613 out:	switch (idtype) {
1614 	case P_PROJID:
1615 		ASSERT(kpj != NULL);
1616 		mutex_exit(&kpj->kpj_poolbind);
1617 		project_rele(kpj);
1618 		break;
1619 	case P_ZONEID:
1620 		if (rv == 0) {
1621 			mutex_enter(&cpu_lock);
1622 			zone_pool_set(zone, pool);
1623 			mutex_exit(&cpu_lock);
1624 		}
1625 		zone->zone_pool_mod = gethrtime();
1626 		zone_rele(zone);
1627 		break;
1628 	}
1629 
1630 	kmem_free(procs, procs_size * sizeof (proc_t *));
1631 	ASSERT(pool_barrier_count == 0);
1632 	return (rv);
1633 }
1634