1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/pool.h>
27 #include <sys/pool_impl.h>
28 #include <sys/pool_pset.h>
29 #include <sys/id_space.h>
30 #include <sys/mutex.h>
31 #include <sys/nvpair.h>
32 #include <sys/cpuvar.h>
33 #include <sys/errno.h>
34 #include <sys/cmn_err.h>
35 #include <sys/systm.h>
36 #include <sys/proc.h>
37 #include <sys/fss.h>
38 #include <sys/class.h>
39 #include <sys/exacct.h>
40 #include <sys/utsname.h>
41 #include <sys/procset.h>
42 #include <sys/atomic.h>
43 #include <sys/zone.h>
44 #include <sys/policy.h>
45 #include <sys/schedctl.h>
46 #include <sys/taskq.h>
47
48 /*
49 * RESOURCE POOLS
50 *
51 * The resource pools facility brings together process-bindable resource into
52 * a common abstraction called a pool. Processor sets and other entities can
53 * be configured, grouped, and labelled such that workload components can be
54 * associated with a subset of a system's total resources.
55 *
56 * When disabled, the pools facility is "invisible". All processes belong
57 * to the same pool (pool_default), and processor sets can be managed through
58 * the old pset() system call. When enabled, processor sets can only be
59 * managed via the pools facility. New pools can be created and associated
60 * with processor sets. Processes can be bound to pools which have non-empty
61 * resource sets.
62 *
63 * Locking: pool_lock() protects global pools state and must be called
64 * before modifying the configuration, or when taking a snapshot of the
65 * configuration. If pool_lock_intr() is used, the operation may be
66 * interrupted by a signal or a request.
67 *
 * To prevent processes from being rebound between pools while they are
 * in the middle of an operation which affects resource set bindings, such
 * operations must be surrounded by calls to pool_barrier_enter() and
 * pool_barrier_exit().  This mechanism guarantees that such processes will
 * be stopped either at the beginning or at the end of the barrier so that
 * the rebind operation can atomically bind the process and its threads
 * to new resource sets, and then let the process run again.
75 *
76 * Lock ordering with respect to other locks is as follows:
77 *
78 * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
79 *
80 * Most static and global variables defined in this file are protected
81 * by calling pool_lock().
82 *
83 * The operation that binds tasks and projects to pools is atomic. That is,
84 * either all processes in a given task or a project will be bound to a
85 * new pool, or (in case of an error) they will be all left bound to the
86 * old pool. Processes in a given task or a given project can only be bound to
87 * different pools if they were rebound individually one by one as single
88 * processes. Threads or LWPs of the same process do not have pool bindings,
89 * and are bound to the same resource sets associated with the resource pool
90 * of that process.
91 *
92 * The following picture shows one possible pool configuration with three
93 * pools and three processor sets. Note that processor set "foo" is not
94 * associated with any pools and therefore cannot have any processes
95 * bound to it. Two pools (default and foo) are associated with the
96 * same processor set (default). Also, note that processes in Task 2
97 * are bound to different pools.
98 *
99 *
 *                                                         Processor Sets
 *                                                           +---------+
 *                  +--------------+========================>| default |
 *                 a|              |                         +---------+
 *                 s|              |                             ||
 *                 s|              |                         +---------+
 *                 o|              |                         |   foo   |
 *                 c|              |                         +---------+
 *                 i|              |                             ||
 *                 a|              |                         +---------+
 *                 t|              |                 +------>|   bar   |
 *                 e|              |                 |       +---------+
 *                 d|              |                 |
 *                  |              |                 |
 *             +---------+      +---------+      +---------+
 *     Pools   | default |======|   foo   |======|   bar   |
 *             +---------+      +---------+      +---------+
 *                 @  @            @              @ @   @
 *                b|  |            |              | |   |
 *                o|  |            |              | |   |
 *                u|  +-----+      |      +-------+ |   +---+
 *                n|        |      |      |         |       |
 *            ....d|........|......|......|.........|.......|....
 *            :    |   ::   |      |      |    ::   |       |   :
 *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 *  Processes :  | p | :: | p |  | p |  | p |  :: | p |...| p | :
 *            :  +---+ :: +---+  +---+  +---+  :: +---+   +---+ :
 *            :........::......................::...............:
 *              Task 1            Task 2             Task N
 *                 |                 |                  |
 *                 |                 |                  |
 *                 |  +-----------+  |            +-----------+
 *                 +--| Project 1 |--+            | Project N |
 *                    +-----------+               +-----------+
134 *
135 * This is just an illustration of relationships between processes, tasks,
136 * projects, pools, and processor sets. New types of resource sets will be
137 * added in the future.
138 */
139
140 pool_t *pool_default; /* default pool which always exists */
141 int pool_count; /* number of pools created on this system */
142 int pool_state; /* pools state -- enabled/disabled */
143 void *pool_buf; /* pre-commit snapshot of the pools state */
144 size_t pool_bufsz; /* size of pool_buf */
145 static hrtime_t pool_pool_mod; /* last modification time for pools */
146 static hrtime_t pool_sys_mod; /* last modification time for system */
147 static nvlist_t *pool_sys_prop; /* system properties */
148 static id_space_t *pool_ids; /* pool ID space */
149 static list_t pool_list; /* doubly-linked list of pools */
150 static kmutex_t pool_mutex; /* protects pool_busy_* */
151 static kcondvar_t pool_busy_cv; /* waiting for "pool_lock" */
152 static kthread_t *pool_busy_thread; /* thread holding "pool_lock" */
153 static kmutex_t pool_barrier_lock; /* synch. with pool_barrier_* */
154 static kcondvar_t pool_barrier_cv; /* synch. with pool_barrier_* */
155 static int pool_barrier_count; /* synch. with pool_barrier_* */
156 static list_t pool_event_cb_list; /* pool event callbacks */
157 static boolean_t pool_event_cb_init = B_FALSE;
158 static kmutex_t pool_event_cb_lock;
159 static taskq_t *pool_event_cb_taskq = NULL;
160
161 void pool_event_dispatch(pool_event_t, poolid_t);
162
163 /*
164 * Boot-time pool initialization.
165 */
166 void
pool_init(void)167 pool_init(void)
168 {
169 pool_ids = id_space_create("pool_ids", POOL_DEFAULT + 1, POOL_MAXID);
170
171 /*
172 * Initialize default pool.
173 */
174 pool_default = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
175 pool_default->pool_id = POOL_DEFAULT;
176 list_create(&pool_list, sizeof (pool_t), offsetof(pool_t, pool_link));
177 list_insert_head(&pool_list, pool_default);
178
179 /*
180 * Initialize plugins for resource sets.
181 */
182 pool_pset_init();
183 pool_count = 1;
184 p0.p_pool = pool_default;
185 global_zone->zone_pool = pool_default;
186 pool_default->pool_ref = 1;
187 }
188
189 /*
190 * Synchronization routines.
191 *
192 * pool_lock is only called from syscall-level routines (processor_bind(),
193 * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long
194 * periods of time, including across sleeping operations, so we allow its
195 * acquisition to be interruptible.
196 *
197 * The current thread that owns the "lock" is stored in the variable
198 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
199 */
200 void
pool_lock(void)201 pool_lock(void)
202 {
203 mutex_enter(&pool_mutex);
204 ASSERT(!pool_lock_held());
205 while (pool_busy_thread != NULL)
206 cv_wait(&pool_busy_cv, &pool_mutex);
207 pool_busy_thread = curthread;
208 mutex_exit(&pool_mutex);
209 }
210
211 int
pool_lock_intr(void)212 pool_lock_intr(void)
213 {
214 mutex_enter(&pool_mutex);
215 ASSERT(!pool_lock_held());
216 while (pool_busy_thread != NULL) {
217 if (cv_wait_sig(&pool_busy_cv, &pool_mutex) == 0) {
218 cv_signal(&pool_busy_cv);
219 mutex_exit(&pool_mutex);
220 return (1);
221 }
222 }
223 pool_busy_thread = curthread;
224 mutex_exit(&pool_mutex);
225 return (0);
226 }
227
228 int
pool_lock_held(void)229 pool_lock_held(void)
230 {
231 return (pool_busy_thread == curthread);
232 }
233
234 void
pool_unlock(void)235 pool_unlock(void)
236 {
237 mutex_enter(&pool_mutex);
238 ASSERT(pool_lock_held());
239 pool_busy_thread = NULL;
240 cv_signal(&pool_busy_cv);
241 mutex_exit(&pool_mutex);
242 }
243
244 /*
245 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
246 * with pool_do_bind().
247 *
248 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
249 * operations which modify pool or pset associations. They can be called
250 * while the process is multi-threaded. In the common case, when current
251 * process is not being rebound (PBWAIT flag is not set), these functions
252 * will be just incrementing and decrementing reference counts.
253 */
254 void
pool_barrier_enter(void)255 pool_barrier_enter(void)
256 {
257 proc_t *p = curproc;
258
259 ASSERT(MUTEX_HELD(&p->p_lock));
260 while (p->p_poolflag & PBWAIT)
261 cv_wait(&p->p_poolcv, &p->p_lock);
262 p->p_poolcnt++;
263 }
264
265 void
pool_barrier_exit(void)266 pool_barrier_exit(void)
267 {
268 proc_t *p = curproc;
269
270 ASSERT(MUTEX_HELD(&p->p_lock));
271 ASSERT(p->p_poolcnt > 0);
272 p->p_poolcnt--;
273 if (p->p_poolflag & PBWAIT) {
274 mutex_enter(&pool_barrier_lock);
275 ASSERT(pool_barrier_count > 0);
276 pool_barrier_count--;
277 if (pool_barrier_count == 0)
278 cv_signal(&pool_barrier_cv);
279 mutex_exit(&pool_barrier_lock);
280 while (p->p_poolflag & PBWAIT)
281 cv_wait(&p->p_poolcv, &p->p_lock);
282 }
283 }
284
285 /*
286 * Enable pools facility.
287 */
288 static int
pool_enable(void)289 pool_enable(void)
290 {
291 int ret;
292
293 ASSERT(pool_lock_held());
294 ASSERT(pool_count == 1);
295
296 ret = pool_pset_enable();
297 if (ret != 0)
298 return (ret);
299 (void) nvlist_alloc(&pool_sys_prop, NV_UNIQUE_NAME, KM_SLEEP);
300 (void) nvlist_add_string(pool_sys_prop, "system.name",
301 "default");
302 (void) nvlist_add_string(pool_sys_prop, "system.comment", "");
303 (void) nvlist_add_int64(pool_sys_prop, "system.version", 1);
304 (void) nvlist_add_byte(pool_sys_prop, "system.bind-default", 1);
305 (void) nvlist_add_string(pool_sys_prop, "system.poold.objectives",
306 "wt-load");
307
308 (void) nvlist_alloc(&pool_default->pool_props,
309 NV_UNIQUE_NAME, KM_SLEEP);
310 (void) nvlist_add_string(pool_default->pool_props,
311 "pool.name", "pool_default");
312 (void) nvlist_add_string(pool_default->pool_props, "pool.comment", "");
313 (void) nvlist_add_byte(pool_default->pool_props, "pool.default", 1);
314 (void) nvlist_add_byte(pool_default->pool_props, "pool.active", 1);
315 (void) nvlist_add_int64(pool_default->pool_props,
316 "pool.importance", 1);
317 (void) nvlist_add_int64(pool_default->pool_props, "pool.sys_id",
318 pool_default->pool_id);
319
320 pool_sys_mod = pool_pool_mod = gethrtime();
321
322 return (ret);
323 }
324
325 /*
326 * Disable pools facility.
327 */
328 static int
pool_disable(void)329 pool_disable(void)
330 {
331 int ret;
332
333 ASSERT(pool_lock_held());
334
335 if (pool_count > 1) /* must destroy all pools first */
336 return (EBUSY);
337
338 ret = pool_pset_disable();
339 if (ret != 0)
340 return (ret);
341 if (pool_sys_prop != NULL) {
342 nvlist_free(pool_sys_prop);
343 pool_sys_prop = NULL;
344 }
345 if (pool_default->pool_props != NULL) {
346 nvlist_free(pool_default->pool_props);
347 pool_default->pool_props = NULL;
348 }
349 return (0);
350 }
351
352 pool_t *
pool_lookup_pool_by_name(char * name)353 pool_lookup_pool_by_name(char *name)
354 {
355 pool_t *pool = pool_default;
356 char *p;
357
358 ASSERT(pool_lock_held());
359 for (pool = list_head(&pool_list); pool;
360 pool = list_next(&pool_list, pool)) {
361 if (nvlist_lookup_string(pool->pool_props,
362 "pool.name", &p) == 0 && strcmp(name, p) == 0)
363 return (pool);
364 }
365 return (NULL);
366 }
367
368 pool_t *
pool_lookup_pool_by_id(poolid_t poolid)369 pool_lookup_pool_by_id(poolid_t poolid)
370 {
371 pool_t *pool = pool_default;
372
373 ASSERT(pool_lock_held());
374 for (pool = list_head(&pool_list); pool;
375 pool = list_next(&pool_list, pool)) {
376 if (pool->pool_id == poolid)
377 return (pool);
378 }
379 return (NULL);
380 }
381
382 pool_t *
pool_lookup_pool_by_pset(int id)383 pool_lookup_pool_by_pset(int id)
384 {
385 pool_t *pool = pool_default;
386 psetid_t psetid = (psetid_t)id;
387
388 ASSERT(pool_lock_held());
389 for (pool = list_head(&pool_list); pool != NULL;
390 pool = list_next(&pool_list, pool)) {
391 if (pool->pool_pset->pset_id == psetid)
392 return (pool);
393 }
394 return (NULL);
395 }
396
397 /*
398 * Create new pool, associate it with default resource sets, and give
399 * it a temporary name.
400 */
401 static int
pool_pool_create(poolid_t * poolid)402 pool_pool_create(poolid_t *poolid)
403 {
404 pool_t *pool;
405 char pool_name[40];
406
407 ASSERT(pool_lock_held());
408
409 pool = kmem_zalloc(sizeof (pool_t), KM_SLEEP);
410 pool->pool_id = *poolid = id_alloc(pool_ids);
411 pool->pool_pset = pool_pset_default;
412 pool_pset_default->pset_npools++;
413 list_insert_tail(&pool_list, pool);
414 (void) nvlist_alloc(&pool->pool_props, NV_UNIQUE_NAME, KM_SLEEP);
415 (void) nvlist_add_int64(pool->pool_props, "pool.sys_id", pool->pool_id);
416 (void) nvlist_add_byte(pool->pool_props, "pool.default", 0);
417 pool_pool_mod = gethrtime();
418 (void) snprintf(pool_name, sizeof (pool_name), "pool_%lld",
419 pool_pool_mod);
420 (void) nvlist_add_string(pool->pool_props, "pool.name", pool_name);
421 pool_count++;
422 return (0);
423 }
424
425 struct destroy_zone_arg {
426 pool_t *old;
427 pool_t *new;
428 };
429
430 /*
431 * Update pool pointers for zones that are currently bound to pool "old"
432 * to be bound to pool "new".
433 */
434 static int
pool_destroy_zone_cb(zone_t * zone,void * arg)435 pool_destroy_zone_cb(zone_t *zone, void *arg)
436 {
437 struct destroy_zone_arg *dza = arg;
438
439 ASSERT(pool_lock_held());
440 ASSERT(MUTEX_HELD(&cpu_lock));
441
442 if (zone_pool_get(zone) == dza->old)
443 zone_pool_set(zone, dza->new);
444 return (0);
445 }
446
447 /*
448 * Destroy specified pool, and rebind all processes in it
449 * to the default pool.
450 */
451 static int
pool_pool_destroy(poolid_t poolid)452 pool_pool_destroy(poolid_t poolid)
453 {
454 pool_t *pool;
455 int ret;
456
457 ASSERT(pool_lock_held());
458
459 if (poolid == POOL_DEFAULT)
460 return (EINVAL);
461 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
462 return (ESRCH);
463 ret = pool_do_bind(pool_default, P_POOLID, poolid, POOL_BIND_ALL);
464 if (ret == 0) {
465 struct destroy_zone_arg dzarg;
466
467 dzarg.old = pool;
468 dzarg.new = pool_default;
469 mutex_enter(&cpu_lock);
470 ret = zone_walk(pool_destroy_zone_cb, &dzarg);
471 mutex_exit(&cpu_lock);
472 ASSERT(ret == 0);
473 ASSERT(pool->pool_ref == 0);
474 (void) nvlist_free(pool->pool_props);
475 id_free(pool_ids, pool->pool_id);
476 pool->pool_pset->pset_npools--;
477 list_remove(&pool_list, pool);
478 pool_count--;
479 pool_pool_mod = gethrtime();
480 kmem_free(pool, sizeof (pool_t));
481 }
482 return (ret);
483 }
484
485 /*
486 * Create new pool or resource set.
487 */
488 int
pool_create(int class,int subclass,id_t * id)489 pool_create(int class, int subclass, id_t *id)
490 {
491 int ret;
492
493 ASSERT(pool_lock_held());
494 if (pool_state == POOL_DISABLED)
495 return (ENOTACTIVE);
496 switch (class) {
497 case PEC_POOL:
498 ret = pool_pool_create((poolid_t *)id);
499 break;
500 case PEC_RES_COMP:
501 switch (subclass) {
502 case PREC_PSET:
503 ret = pool_pset_create((psetid_t *)id);
504 break;
505 default:
506 ret = EINVAL;
507 }
508 break;
509 case PEC_RES_AGG:
510 ret = ENOTSUP;
511 break;
512 default:
513 ret = EINVAL;
514 }
515 return (ret);
516 }
517
518 /*
519 * Destroy an existing pool or resource set.
520 */
521 int
pool_destroy(int class,int subclass,id_t id)522 pool_destroy(int class, int subclass, id_t id)
523 {
524 int ret;
525
526 ASSERT(pool_lock_held());
527 if (pool_state == POOL_DISABLED)
528 return (ENOTACTIVE);
529 switch (class) {
530 case PEC_POOL:
531 ret = pool_pool_destroy((poolid_t)id);
532 break;
533 case PEC_RES_COMP:
534 switch (subclass) {
535 case PREC_PSET:
536 ret = pool_pset_destroy((psetid_t)id);
537 break;
538 default:
539 ret = EINVAL;
540 }
541 break;
542 case PEC_RES_AGG:
543 ret = ENOTSUP;
544 break;
545 default:
546 ret = EINVAL;
547 }
548 return (ret);
549 }
550
551 /*
552 * Enable or disable pools.
553 */
554 int
pool_status(int status)555 pool_status(int status)
556 {
557 int ret = 0;
558
559 ASSERT(pool_lock_held());
560
561 if (pool_state == status)
562 return (0);
563 switch (status) {
564 case POOL_ENABLED:
565 ret = pool_enable();
566 if (ret != 0)
567 return (ret);
568 pool_state = POOL_ENABLED;
569 pool_event_dispatch(POOL_E_ENABLE, 0);
570 break;
571 case POOL_DISABLED:
572 ret = pool_disable();
573 if (ret != 0)
574 return (ret);
575 pool_state = POOL_DISABLED;
576 pool_event_dispatch(POOL_E_DISABLE, 0);
577 break;
578 default:
579 ret = EINVAL;
580 }
581 return (ret);
582 }
583
584 /*
585 * Associate pool with resource set.
586 */
587 int
pool_assoc(poolid_t poolid,int idtype,id_t id)588 pool_assoc(poolid_t poolid, int idtype, id_t id)
589 {
590 int ret;
591
592 ASSERT(pool_lock_held());
593 if (pool_state == POOL_DISABLED)
594 return (ENOTACTIVE);
595 switch (idtype) {
596 case PREC_PSET:
597 ret = pool_pset_assoc(poolid, (psetid_t)id);
598 if (ret == 0)
599 pool_event_dispatch(POOL_E_CHANGE, poolid);
600 break;
601 default:
602 ret = EINVAL;
603 }
604 if (ret == 0)
605 pool_pool_mod = gethrtime();
606 return (ret);
607 }
608
609 /*
610 * Disassociate resource set from pool.
611 */
612 int
pool_dissoc(poolid_t poolid,int idtype)613 pool_dissoc(poolid_t poolid, int idtype)
614 {
615 int ret;
616
617 ASSERT(pool_lock_held());
618 if (pool_state == POOL_DISABLED)
619 return (ENOTACTIVE);
620 switch (idtype) {
621 case PREC_PSET:
622 ret = pool_pset_assoc(poolid, PS_NONE);
623 if (ret == 0)
624 pool_event_dispatch(POOL_E_CHANGE, poolid);
625 break;
626 default:
627 ret = EINVAL;
628 }
629 if (ret == 0)
630 pool_pool_mod = gethrtime();
631 return (ret);
632 }
633
634 /*
635 * Transfer specified quantity of resources between resource sets.
636 */
637 /*ARGSUSED*/
638 int
pool_transfer(int type,id_t src,id_t dst,uint64_t qty)639 pool_transfer(int type, id_t src, id_t dst, uint64_t qty)
640 {
641 int ret = EINVAL;
642
643 return (ret);
644 }
645
646 static poolid_t
pool_lookup_id_by_pset(int id)647 pool_lookup_id_by_pset(int id)
648 {
649 pool_t *pool = pool_default;
650 psetid_t psetid = (psetid_t)id;
651
652 ASSERT(pool_lock_held());
653 for (pool = list_head(&pool_list); pool != NULL;
654 pool = list_next(&pool_list, pool)) {
655 if (pool->pool_pset->pset_id == psetid)
656 return (pool->pool_id);
657 }
658 return (POOL_INVALID);
659 }
660
661 /*
662 * Transfer resources specified by their IDs between resource sets.
663 */
664 int
pool_xtransfer(int type,id_t src_pset,id_t dst_pset,uint_t size,id_t * ids)665 pool_xtransfer(int type, id_t src_pset, id_t dst_pset, uint_t size, id_t *ids)
666 {
667 int ret;
668 poolid_t src_pool, dst_pool;
669
670 ASSERT(pool_lock_held());
671 if (pool_state == POOL_DISABLED)
672 return (ENOTACTIVE);
673 switch (type) {
674 case PREC_PSET:
675 ret = pool_pset_xtransfer((psetid_t)src_pset,
676 (psetid_t)dst_pset, size, ids);
677 if (ret == 0) {
678 if ((src_pool = pool_lookup_id_by_pset(src_pset)) !=
679 POOL_INVALID)
680 pool_event_dispatch(POOL_E_CHANGE, src_pool);
681 if ((dst_pool = pool_lookup_id_by_pset(dst_pset)) !=
682 POOL_INVALID)
683 pool_event_dispatch(POOL_E_CHANGE, dst_pool);
684 }
685 break;
686 default:
687 ret = EINVAL;
688 }
689 return (ret);
690 }
691
692 /*
693 * Bind processes to pools.
694 */
695 int
pool_bind(poolid_t poolid,idtype_t idtype,id_t id)696 pool_bind(poolid_t poolid, idtype_t idtype, id_t id)
697 {
698 pool_t *pool;
699
700 ASSERT(pool_lock_held());
701
702 if (pool_state == POOL_DISABLED)
703 return (ENOTACTIVE);
704 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
705 return (ESRCH);
706
707 switch (idtype) {
708 case P_PID:
709 case P_TASKID:
710 case P_PROJID:
711 case P_ZONEID:
712 break;
713 default:
714 return (EINVAL);
715 }
716 return (pool_do_bind(pool, idtype, id, POOL_BIND_ALL));
717 }
718
719 /*
720 * Query pool binding of the specifed process.
721 */
722 int
pool_query_binding(idtype_t idtype,id_t id,id_t * poolid)723 pool_query_binding(idtype_t idtype, id_t id, id_t *poolid)
724 {
725 proc_t *p;
726
727 if (idtype != P_PID)
728 return (ENOTSUP);
729 if (id == P_MYID)
730 id = curproc->p_pid;
731
732 ASSERT(pool_lock_held());
733
734 mutex_enter(&pidlock);
735 if ((p = prfind((pid_t)id)) == NULL) {
736 mutex_exit(&pidlock);
737 return (ESRCH);
738 }
739 mutex_enter(&p->p_lock);
740 /*
741 * In local zones, lie about pool bindings of processes from
742 * the global zone.
743 */
744 if (!INGLOBALZONE(curproc) && INGLOBALZONE(p)) {
745 pool_t *pool;
746
747 pool = zone_pool_get(curproc->p_zone);
748 *poolid = pool->pool_id;
749 } else {
750 *poolid = p->p_pool->pool_id;
751 }
752 mutex_exit(&p->p_lock);
753 mutex_exit(&pidlock);
754 return (0);
755 }
756
757 static ea_object_t *
pool_system_pack(void)758 pool_system_pack(void)
759 {
760 ea_object_t *eo_system;
761 size_t bufsz = 0;
762 char *buf = NULL;
763
764 ASSERT(pool_lock_held());
765
766 eo_system = ea_alloc_group(EXT_GROUP | EXC_LOCAL | EXD_GROUP_SYSTEM);
767 (void) ea_attach_item(eo_system, &pool_sys_mod, sizeof (hrtime_t),
768 EXC_LOCAL | EXD_SYSTEM_TSTAMP | EXT_UINT64);
769 if (INGLOBALZONE(curproc))
770 (void) ea_attach_item(eo_system, &pool_pool_mod,
771 sizeof (hrtime_t),
772 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
773 else
774 (void) ea_attach_item(eo_system,
775 &curproc->p_zone->zone_pool_mod,
776 sizeof (hrtime_t),
777 EXC_LOCAL | EXD_POOL_TSTAMP | EXT_UINT64);
778 (void) ea_attach_item(eo_system, &pool_pset_mod, sizeof (hrtime_t),
779 EXC_LOCAL | EXD_PSET_TSTAMP | EXT_UINT64);
780 (void) ea_attach_item(eo_system, &pool_cpu_mod, sizeof (hrtime_t),
781 EXC_LOCAL | EXD_CPU_TSTAMP | EXT_UINT64);
782 (void) nvlist_pack(pool_sys_prop, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
783 (void) ea_attach_item(eo_system, buf, bufsz,
784 EXC_LOCAL | EXD_SYSTEM_PROP | EXT_RAW);
785 kmem_free(buf, bufsz);
786 return (eo_system);
787 }
788
789 /*
790 * Pack information about pools and attach it to specified exacct group.
791 */
792 static int
pool_pool_pack(ea_object_t * eo_system)793 pool_pool_pack(ea_object_t *eo_system)
794 {
795 ea_object_t *eo_pool;
796 pool_t *pool;
797 size_t bufsz;
798 char *buf;
799 pool_t *myzonepool;
800
801 ASSERT(pool_lock_held());
802 myzonepool = zone_pool_get(curproc->p_zone);
803 for (pool = list_head(&pool_list); pool;
804 pool = list_next(&pool_list, pool)) {
805 if (!INGLOBALZONE(curproc) && myzonepool != pool)
806 continue;
807 bufsz = 0;
808 buf = NULL;
809 eo_pool = ea_alloc_group(EXT_GROUP |
810 EXC_LOCAL | EXD_GROUP_POOL);
811 (void) ea_attach_item(eo_pool, &pool->pool_id, sizeof (id_t),
812 EXC_LOCAL | EXD_POOL_POOLID | EXT_UINT32);
813 (void) ea_attach_item(eo_pool, &pool->pool_pset->pset_id,
814 sizeof (id_t), EXC_LOCAL | EXD_POOL_PSETID | EXT_UINT32);
815 (void) nvlist_pack(pool->pool_props, &buf, &bufsz,
816 NV_ENCODE_NATIVE, 0);
817 (void) ea_attach_item(eo_pool, buf, bufsz,
818 EXC_LOCAL | EXD_POOL_PROP | EXT_RAW);
819 kmem_free(buf, bufsz);
820 (void) ea_attach_to_group(eo_system, eo_pool);
821 }
822 return (0);
823 }
824
825 /*
826 * Pack the whole pool configuration in the specified buffer.
827 */
828 int
pool_pack_conf(void * kbuf,size_t kbufsz,size_t * asize)829 pool_pack_conf(void *kbuf, size_t kbufsz, size_t *asize)
830 {
831 ea_object_t *eo_system;
832 size_t ksize;
833 int ret = 0;
834
835 ASSERT(pool_lock_held());
836
837 eo_system = pool_system_pack(); /* 1. pack system */
838 (void) pool_pool_pack(eo_system); /* 2. pack all pools */
839 (void) pool_pset_pack(eo_system); /* 3. pack all psets */
840 ksize = ea_pack_object(eo_system, NULL, 0);
841 if (kbuf == NULL || kbufsz == 0)
842 *asize = ksize;
843 else if (ksize > kbufsz)
844 ret = ENOMEM;
845 else
846 *asize = ea_pack_object(eo_system, kbuf, kbufsz);
847 ea_free_object(eo_system, EUP_ALLOC);
848 return (ret);
849 }
850
851 /*
852 * Start/end the commit transaction. If commit transaction is currently
853 * in progress, then all POOL_QUERY ioctls will return pools configuration
854 * at the beginning of transaction.
855 */
856 int
pool_commit(int state)857 pool_commit(int state)
858 {
859 ea_object_t *eo_system;
860 int ret = 0;
861
862 ASSERT(pool_lock_held());
863
864 if (pool_state == POOL_DISABLED)
865 return (ENOTACTIVE);
866 switch (state) {
867 case 1:
868 /*
869 * Beginning commit transation.
870 */
871 if (pool_buf != NULL) /* transaction in progress */
872 return (EBUSY);
873 eo_system = pool_system_pack(); /* 1. pack system */
874 (void) pool_pool_pack(eo_system); /* 2. pack all pools */
875 (void) pool_pset_pack(eo_system); /* 3. pack all psets */
876 pool_bufsz = ea_pack_object(eo_system, NULL, 0);
877 pool_buf = kmem_alloc(pool_bufsz, KM_SLEEP);
878 pool_bufsz = ea_pack_object(eo_system, pool_buf, pool_bufsz);
879 ea_free_object(eo_system, EUP_ALLOC);
880 break;
881 case 0:
882 /*
883 * Finishing commit transaction.
884 */
885 if (pool_buf != NULL) {
886 kmem_free(pool_buf, pool_bufsz);
887 pool_buf = NULL;
888 pool_bufsz = 0;
889 }
890 break;
891 default:
892 ret = EINVAL;
893 }
894 return (ret);
895 }
896
897 /*
898 * Check is the specified property is special
899 */
900 static pool_property_t *
pool_property_find(char * name,pool_property_t * list)901 pool_property_find(char *name, pool_property_t *list)
902 {
903 pool_property_t *prop;
904
905 for (prop = list; prop->pp_name != NULL; prop++)
906 if (strcmp(prop->pp_name, name) == 0)
907 return (prop);
908 return (NULL);
909 }
910
911 static pool_property_t pool_prop_sys[] = {
912 { "system.name", DATA_TYPE_STRING, PP_RDWR },
913 { "system.comment", DATA_TYPE_STRING, PP_RDWR },
914 { "system.version", DATA_TYPE_UINT64, PP_READ },
915 { "system.bind-default", DATA_TYPE_BYTE, PP_RDWR },
916 { "system.allocate-method", DATA_TYPE_STRING,
917 PP_RDWR | PP_OPTIONAL },
918 { "system.poold.log-level", DATA_TYPE_STRING,
919 PP_RDWR | PP_OPTIONAL },
920 { "system.poold.log-location", DATA_TYPE_STRING,
921 PP_RDWR | PP_OPTIONAL },
922 { "system.poold.monitor-interval", DATA_TYPE_UINT64,
923 PP_RDWR | PP_OPTIONAL },
924 { "system.poold.history-file", DATA_TYPE_STRING,
925 PP_RDWR | PP_OPTIONAL },
926 { "system.poold.objectives", DATA_TYPE_STRING,
927 PP_RDWR | PP_OPTIONAL },
928 { NULL, 0, 0 }
929 };
930
931 static pool_property_t pool_prop_pool[] = {
932 { "pool.sys_id", DATA_TYPE_UINT64, PP_READ },
933 { "pool.name", DATA_TYPE_STRING, PP_RDWR },
934 { "pool.default", DATA_TYPE_BYTE, PP_READ },
935 { "pool.active", DATA_TYPE_BYTE, PP_RDWR },
936 { "pool.importance", DATA_TYPE_INT64, PP_RDWR },
937 { "pool.comment", DATA_TYPE_STRING, PP_RDWR },
938 { "pool.scheduler", DATA_TYPE_STRING,
939 PP_RDWR | PP_OPTIONAL },
940 { NULL, 0, 0 }
941 };
942
943 /*
944 * Common routine to put new property on the specified list
945 */
946 int
pool_propput_common(nvlist_t * nvlist,nvpair_t * pair,pool_property_t * props)947 pool_propput_common(nvlist_t *nvlist, nvpair_t *pair, pool_property_t *props)
948 {
949 pool_property_t *prop;
950
951 if ((prop = pool_property_find(nvpair_name(pair), props)) != NULL) {
952 /*
953 * No read-only properties or properties with bad types
954 */
955 if (!(prop->pp_perm & PP_WRITE) ||
956 prop->pp_type != nvpair_type(pair))
957 return (EINVAL);
958 }
959 return (nvlist_add_nvpair(nvlist, pair));
960 }
961
962 /*
963 * Common routine to remove property from the given list
964 */
965 int
pool_proprm_common(nvlist_t * nvlist,char * name,pool_property_t * props)966 pool_proprm_common(nvlist_t *nvlist, char *name, pool_property_t *props)
967 {
968 pool_property_t *prop;
969
970 if ((prop = pool_property_find(name, props)) != NULL) {
971 if (!(prop->pp_perm & PP_OPTIONAL))
972 return (EINVAL);
973 }
974 return (nvlist_remove_all(nvlist, name));
975 }
976
977 static int
pool_system_propput(nvpair_t * pair)978 pool_system_propput(nvpair_t *pair)
979 {
980 int ret;
981
982 ASSERT(pool_lock_held());
983 ret = pool_propput_common(pool_sys_prop, pair, pool_prop_sys);
984 if (ret == 0)
985 pool_sys_mod = gethrtime();
986 return (ret);
987 }
988
989 static int
pool_system_proprm(char * name)990 pool_system_proprm(char *name)
991 {
992 int ret;
993
994 ASSERT(pool_lock_held());
995 ret = pool_proprm_common(pool_sys_prop, name, pool_prop_sys);
996 if (ret == 0)
997 pool_sys_mod = gethrtime();
998 return (ret);
999 }
1000
1001 static int
pool_pool_propput(poolid_t poolid,nvpair_t * pair)1002 pool_pool_propput(poolid_t poolid, nvpair_t *pair)
1003 {
1004 pool_t *pool;
1005 int ret;
1006
1007 ASSERT(pool_lock_held());
1008 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1009 return (ESRCH);
1010 ret = pool_propput_common(pool->pool_props, pair, pool_prop_pool);
1011 if (ret == 0)
1012 pool_pool_mod = gethrtime();
1013 return (ret);
1014 }
1015
1016 static int
pool_pool_proprm(poolid_t poolid,char * name)1017 pool_pool_proprm(poolid_t poolid, char *name)
1018 {
1019 int ret;
1020 pool_t *pool;
1021
1022 ASSERT(pool_lock_held());
1023 if ((pool = pool_lookup_pool_by_id(poolid)) == NULL)
1024 return (ESRCH);
1025 ret = pool_proprm_common(pool->pool_props, name, pool_prop_pool);
1026 if (ret == 0)
1027 pool_pool_mod = gethrtime();
1028 return (ret);
1029 }
1030
1031 int
pool_propput(int class,int subclass,id_t id,nvpair_t * pair)1032 pool_propput(int class, int subclass, id_t id, nvpair_t *pair)
1033 {
1034 int ret;
1035
1036 ASSERT(pool_lock_held());
1037 if (pool_state == POOL_DISABLED)
1038 return (ENOTACTIVE);
1039 switch (class) {
1040 case PEC_SYSTEM:
1041 ret = pool_system_propput(pair);
1042 break;
1043 case PEC_POOL:
1044 ret = pool_pool_propput((poolid_t)id, pair);
1045 break;
1046 case PEC_RES_COMP:
1047 switch (subclass) {
1048 case PREC_PSET:
1049 ret = pool_pset_propput((psetid_t)id, pair);
1050 break;
1051 default:
1052 ret = EINVAL;
1053 }
1054 break;
1055 case PEC_RES_AGG:
1056 ret = ENOTSUP;
1057 break;
1058 case PEC_COMP:
1059 switch (subclass) {
1060 case PCEC_CPU:
1061 ret = pool_cpu_propput((processorid_t)id, pair);
1062 break;
1063 default:
1064 ret = EINVAL;
1065 }
1066 break;
1067 default:
1068 ret = EINVAL;
1069 }
1070 return (ret);
1071 }
1072
1073 int
pool_proprm(int class,int subclass,id_t id,char * name)1074 pool_proprm(int class, int subclass, id_t id, char *name)
1075 {
1076 int ret;
1077
1078 ASSERT(pool_lock_held());
1079 if (pool_state == POOL_DISABLED)
1080 return (ENOTACTIVE);
1081 switch (class) {
1082 case PEC_SYSTEM:
1083 ret = pool_system_proprm(name);
1084 break;
1085 case PEC_POOL:
1086 ret = pool_pool_proprm((poolid_t)id, name);
1087 break;
1088 case PEC_RES_COMP:
1089 switch (subclass) {
1090 case PREC_PSET:
1091 ret = pool_pset_proprm((psetid_t)id, name);
1092 break;
1093 default:
1094 ret = EINVAL;
1095 }
1096 break;
1097 case PEC_RES_AGG:
1098 ret = ENOTSUP;
1099 break;
1100 case PEC_COMP:
1101 switch (subclass) {
1102 case PCEC_CPU:
1103 ret = pool_cpu_proprm((processorid_t)id, name);
1104 break;
1105 default:
1106 ret = EINVAL;
1107 }
1108 break;
1109 default:
1110 ret = EINVAL;
1111 }
1112 return (ret);
1113 }
1114
1115 int
pool_propget(char * name,int class,int subclass,id_t id,nvlist_t ** nvlp)1116 pool_propget(char *name, int class, int subclass, id_t id, nvlist_t **nvlp)
1117 {
1118 int ret;
1119 nvlist_t *nvl;
1120
1121 ASSERT(pool_lock_held());
1122 if (pool_state == POOL_DISABLED)
1123 return (ENOTACTIVE);
1124
1125 (void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP);
1126
1127 switch (class) {
1128 case PEC_SYSTEM:
1129 case PEC_POOL:
1130 ret = EINVAL;
1131 break;
1132 case PEC_RES_COMP:
1133 switch (subclass) {
1134 case PREC_PSET:
1135 ret = pool_pset_propget((psetid_t)id, name, nvl);
1136 break;
1137 default:
1138 ret = EINVAL;
1139 }
1140 break;
1141 case PEC_RES_AGG:
1142 ret = ENOTSUP;
1143 break;
1144 case PEC_COMP:
1145 switch (subclass) {
1146 case PCEC_CPU:
1147 ret = pool_cpu_propget((processorid_t)id, name, nvl);
1148 break;
1149 default:
1150 ret = EINVAL;
1151 }
1152 break;
1153 default:
1154 ret = EINVAL;
1155 }
1156 if (ret == 0)
1157 *nvlp = nvl;
1158 else
1159 nvlist_free(nvl);
1160 return (ret);
1161 }
1162
1163 /*
1164 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1165 * in case of failure in pool_do_bind().
1166 */
1167 static void
pool_bind_wake(proc_t * p)1168 pool_bind_wake(proc_t *p)
1169 {
1170 ASSERT(pool_lock_held());
1171
1172 mutex_enter(&p->p_lock);
1173 ASSERT(p->p_poolflag & PBWAIT);
1174 if (p->p_poolcnt > 0) {
1175 mutex_enter(&pool_barrier_lock);
1176 pool_barrier_count -= p->p_poolcnt;
1177 mutex_exit(&pool_barrier_lock);
1178 }
1179 p->p_poolflag &= ~PBWAIT;
1180 cv_signal(&p->p_poolcv);
1181 mutex_exit(&p->p_lock);
1182 }
1183
1184 static void
pool_bind_wakeall(proc_t ** procs)1185 pool_bind_wakeall(proc_t **procs)
1186 {
1187 proc_t *p, **pp;
1188
1189 ASSERT(pool_lock_held());
1190 for (pp = procs; (p = *pp) != NULL; pp++)
1191 pool_bind_wake(p);
1192 }
1193
1194 /*
1195 * Return the scheduling class for this pool, or
1196 * POOL_CLASS_UNSET if not set
1197 * POOL_CLASS_INVAL if set to an invalid class ID.
1198 */
1199 id_t
pool_get_class(pool_t * pool)1200 pool_get_class(pool_t *pool)
1201 {
1202 char *name;
1203 id_t cid;
1204
1205 ASSERT(pool_lock_held());
1206
1207 if (nvlist_lookup_string(pool->pool_props, "pool.scheduler",
1208 &name) == 0) {
1209 if (getcidbyname(name, &cid) == 0)
1210 return (cid);
1211 else
1212 return (POOL_CLASS_INVAL);
1213 }
1214 return (POOL_CLASS_UNSET);
1215 }
1216
1217 /*
1218 * Move process to the new scheduling class.
1219 */
1220 static void
pool_change_class(proc_t * p,id_t cid)1221 pool_change_class(proc_t *p, id_t cid)
1222 {
1223 kthread_t *t;
1224 void *cldata;
1225 id_t oldcid;
1226 void **bufs;
1227 void **buf;
1228 int nlwp;
1229 int ret;
1230 int i;
1231
1232 /*
1233 * Do not move kernel processes (such as zsched).
1234 */
1235 if (p->p_flag & SSYS)
1236 return;
1237 /*
1238 * This process is in the pool barrier, so it can't possibly be
1239 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1240 * (for possible agent LWP which doesn't use pool barrier) as
1241 * our upper bound.
1242 */
1243 nlwp = p->p_lwpcnt + p->p_zombcnt + 1;
1244
1245 /*
1246 * Pre-allocate scheduling class specific buffers before
1247 * grabbing p_lock.
1248 */
1249 bufs = kmem_zalloc(nlwp * sizeof (void *), KM_SLEEP);
1250 for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1251 ret = CL_ALLOC(buf, cid, KM_SLEEP);
1252 ASSERT(ret == 0);
1253 }
1254
1255 /*
1256 * Move threads one by one to the new scheduling class.
1257 * This never fails because we have all the right
1258 * privileges here.
1259 */
1260 mutex_enter(&p->p_lock);
1261 ASSERT(p->p_poolflag & PBWAIT);
1262 buf = bufs;
1263 t = p->p_tlist;
1264 ASSERT(t != NULL);
1265 do {
1266 if (t->t_cid != cid) {
1267 oldcid = t->t_cid;
1268 cldata = t->t_cldata;
1269 ret = CL_ENTERCLASS(t, cid, NULL, NULL, *buf);
1270 ASSERT(ret == 0);
1271 CL_EXITCLASS(oldcid, cldata);
1272 schedctl_set_cidpri(t);
1273 *buf++ = NULL;
1274 }
1275 } while ((t = t->t_forw) != p->p_tlist);
1276 mutex_exit(&p->p_lock);
1277 /*
1278 * Free unused scheduling class specific buffers.
1279 */
1280 for (i = 0, buf = bufs; i < nlwp; i++, buf++) {
1281 if (*buf != NULL) {
1282 CL_FREE(cid, *buf);
1283 *buf = NULL;
1284 }
1285 }
1286 kmem_free(bufs, nlwp * sizeof (void *));
1287 }
1288
1289 void
pool_get_name(pool_t * pool,char ** name)1290 pool_get_name(pool_t *pool, char **name)
1291 {
1292 ASSERT(pool_lock_held());
1293
1294 (void) nvlist_lookup_string(pool->pool_props, "pool.name", name);
1295
1296 ASSERT(strlen(*name) != 0);
1297 }
1298
1299
1300 /*
1301 * The meat of the bind operation. The steps in pool_do_bind are:
1302 *
1303 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1304 * such processes to an array. For any interesting process that has
1305 * threads inside the pool barrier set, increment a counter by the
1306 * count of such threads. Once PBWAIT is set on a process, that process
1307 * will not disappear.
1308 *
1309 * 2) Wait for the counter from step 2 to drop to zero. Any process which
1310 * calls pool_barrier_exit() and notices that PBWAIT has been set on it
1311 * will decrement that counter before going to sleep, and the process
1312 * calling pool_barrier_exit() which does the final decrement will wake us.
1313 *
1314 * 3) For each interesting process, perform a calculation on it to see if
1315 * the bind will actually succeed. This uses the following three
1316 * resource-set-specific functions:
1317 *
1318 * - int set_bind_start(procs, pool)
1319 *
1320 * Determine whether the given array of processes can be bound to the
1321 * resource set associated with the given pool. If it can, take and hold
1322 * any locks necessary to ensure that the operation will succeed, and
1323 * make any necessary reservations in the target resource set. If it
1324 * can't, return failure with no reservations made and no new locks held.
1325 *
1326 * - void set_bind_abort(procs, pool)
1327 *
1328 * set_bind_start() has completed successfully, but another resource set's
1329 * set_bind_start() has failed, and we haven't begun the bind yet. Undo
1330 * any reservations made and drop any locks acquired by our
1331 * set_bind_start().
1332 *
1333 * - void set_bind_finish(void)
1334 *
1335 * The bind has completed successfully. The processes have been released,
1336 * and the reservation acquired in set_bind_start() has been depleted as
1337 * the processes have finished their bindings. Drop any locks acquired by
1338 * set_bind_start().
1339 *
1340 * 4) If we've decided that we can proceed with the bind, iterate through
1341 * the list of interesting processes, grab the necessary locks (which
1342 * may differ per resource set), perform the bind, and ASSERT that it
1343 * succeeds. Once a process has been rebound, it can be awakened.
1344 *
1345 * The operations from step 4 must be kept in sync with anything which might
1346 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1347 * are thus located in the same source files as the associated bind operations.
1348 */
1349 int
pool_do_bind(pool_t * pool,idtype_t idtype,id_t id,int flags)1350 pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags)
1351 {
1352 extern uint_t nproc;
1353 klwp_t *lwp = ttolwp(curthread);
1354 proc_t **pp, **procs;
1355 proc_t *prstart;
1356 int procs_count = 0;
1357 kproject_t *kpj = NULL;
1358 procset_t set;
1359 zone_t *zone = NULL;
1360 int procs_size;
1361 int rv = 0;
1362 proc_t *p;
1363 id_t cid = -1;
1364
1365 ASSERT(pool_lock_held());
1366
1367 if ((cid = pool_get_class(pool)) == POOL_CLASS_INVAL)
1368 return (EINVAL);
1369
1370 if (idtype == P_ZONEID) {
1371 zone = zone_find_by_id(id);
1372 if (zone == NULL)
1373 return (ESRCH);
1374 if (zone_status_get(zone) > ZONE_IS_RUNNING) {
1375 zone_rele(zone);
1376 return (EBUSY);
1377 }
1378 }
1379
1380 if (idtype == P_PROJID) {
1381 kpj = project_hold_by_id(id, global_zone, PROJECT_HOLD_FIND);
1382 if (kpj == NULL)
1383 return (ESRCH);
1384 mutex_enter(&kpj->kpj_poolbind);
1385 }
1386
1387 if (idtype == P_PID) {
1388 /*
1389 * Fast-path for a single process case.
1390 */
1391 procs_size = 2; /* procs is NULL-terminated */
1392 procs = kmem_zalloc(procs_size * sizeof (proc_t *), KM_SLEEP);
1393 mutex_enter(&pidlock);
1394 } else {
1395 /*
1396 * We will need enough slots for proc_t pointers for as many as
1397 * twice the number of currently running processes (assuming
1398 * that each one could be in fork() creating a new child).
1399 */
1400 for (;;) {
1401 procs_size = nproc * 2;
1402 procs = kmem_zalloc(procs_size * sizeof (proc_t *),
1403 KM_SLEEP);
1404 mutex_enter(&pidlock);
1405
1406 if (nproc * 2 <= procs_size)
1407 break;
1408 /*
1409 * If nproc has changed, try again.
1410 */
1411 mutex_exit(&pidlock);
1412 kmem_free(procs, procs_size * sizeof (proc_t *));
1413 }
1414 }
1415
1416 if (id == P_MYID)
1417 id = getmyid(idtype);
1418 setprocset(&set, POP_AND, idtype, id, P_ALL, 0);
1419
1420 /*
1421 * Do a first scan, and select target processes.
1422 */
1423 if (idtype == P_PID)
1424 prstart = prfind(id);
1425 else
1426 prstart = practive;
1427 for (p = prstart, pp = procs; p != NULL; p = p->p_next) {
1428 mutex_enter(&p->p_lock);
1429 /*
1430 * Skip processes that don't match our (id, idtype) set or
1431 * on the way of becoming zombies. Skip kernel processes
1432 * from the global zone.
1433 */
1434 if (procinset(p, &set) == 0 ||
1435 p->p_poolflag & PEXITED ||
1436 ((p->p_flag & SSYS) && INGLOBALZONE(p))) {
1437 mutex_exit(&p->p_lock);
1438 continue;
1439 }
1440 if (!INGLOBALZONE(p)) {
1441 switch (idtype) {
1442 case P_PID:
1443 case P_TASKID:
1444 default:
1445
1446 /*
1447 * Can't bind processes or tasks in local zones
1448 * to pools. Also catch all remaining types of
1449 * idtype_t that should already have been
1450 * filtered out.
1451 */
1452 mutex_exit(&p->p_lock);
1453 mutex_exit(&pidlock);
1454 pool_bind_wakeall(procs);
1455 rv = EINVAL;
1456 goto out;
1457 case P_PROJID:
1458 /*
1459 * Only projects in the global
1460 * zone can be rebound.
1461 */
1462 mutex_exit(&p->p_lock);
1463 continue;
1464 case P_POOLID:
1465 /*
1466 * When rebinding pools, processes can be
1467 * in different zones.
1468 */
1469 break;
1470 }
1471 }
1472
1473 p->p_poolflag |= PBWAIT;
1474 /*
1475 * If some threads in this process are inside the pool
1476 * barrier, add them to pool_barrier_count, as we have
1477 * to wait for all of them to exit the barrier.
1478 */
1479 if (p->p_poolcnt > 0) {
1480 mutex_enter(&pool_barrier_lock);
1481 pool_barrier_count += p->p_poolcnt;
1482 mutex_exit(&pool_barrier_lock);
1483 }
1484 ASSERT(pp < &procs[procs_size]);
1485 *pp++ = p;
1486 procs_count++;
1487 mutex_exit(&p->p_lock);
1488
1489 /*
1490 * We just found our process, so if we're only rebinding a
1491 * single process then get out of this loop.
1492 */
1493 if (idtype == P_PID)
1494 break;
1495 }
1496 *pp = NULL; /* cap off the end of the array */
1497 mutex_exit(&pidlock);
1498
1499 /*
1500 * Wait for relevant processes to stop before they try to enter the
1501 * barrier or at the exit from the barrier. Make sure that we do
1502 * not get stopped here while we're holding pool_lock. If we were
1503 * requested to stop, or got a signal then return EAGAIN to let the
1504 * library know that it needs to retry.
1505 */
1506 mutex_enter(&pool_barrier_lock);
1507 lwp->lwp_nostop++;
1508 while (pool_barrier_count > 0) {
1509 (void) cv_wait_sig(&pool_barrier_cv, &pool_barrier_lock);
1510 if (pool_barrier_count > 0) {
1511 /*
1512 * We either got a signal or were requested to
1513 * stop by /proc. Bail out with EAGAIN. If we were
1514 * requested to stop, we'll stop in post_syscall()
1515 * on our way back to userland.
1516 */
1517 mutex_exit(&pool_barrier_lock);
1518 pool_bind_wakeall(procs);
1519 lwp->lwp_nostop--;
1520 rv = EAGAIN;
1521 goto out;
1522 }
1523 }
1524 lwp->lwp_nostop--;
1525 mutex_exit(&pool_barrier_lock);
1526
1527 if (idtype == P_PID) {
1528 if ((p = *procs) == NULL)
1529 goto skip;
1530 mutex_enter(&p->p_lock);
1531 /* Drop the process if it is exiting */
1532 if (p->p_poolflag & PEXITED) {
1533 mutex_exit(&p->p_lock);
1534 pool_bind_wake(p);
1535 procs_count--;
1536 } else
1537 mutex_exit(&p->p_lock);
1538 goto skip;
1539 }
1540
1541 /*
1542 * Do another run, and drop processes that were inside the barrier
1543 * in exit(), but when they have dropped to pool_barrier_exit
1544 * they have become of no interest to us. Pick up child processes that
1545 * were created by fork() but didn't exist during our first scan.
1546 * Their parents are now stopped at pool_barrier_exit in cfork().
1547 */
1548 mutex_enter(&pidlock);
1549 for (pp = procs; (p = *pp) != NULL; pp++) {
1550 mutex_enter(&p->p_lock);
1551 if (p->p_poolflag & PEXITED) {
1552 ASSERT(p->p_lwpcnt == 0);
1553 mutex_exit(&p->p_lock);
1554 pool_bind_wake(p);
1555 /* flip w/last non-NULL slot */
1556 *pp = procs[procs_count - 1];
1557 procs[procs_count - 1] = NULL;
1558 procs_count--;
1559 pp--; /* try this slot again */
1560 continue;
1561 } else
1562 mutex_exit(&p->p_lock);
1563 /*
1564 * Look at the child and check if it should be rebound also.
1565 * We're holding pidlock, so it is safe to reference p_child.
1566 */
1567 if ((p = p->p_child) == NULL)
1568 continue;
1569
1570 mutex_enter(&p->p_lock);
1571
1572 /*
1573 * Skip system processes and make sure that the child is in
1574 * the same task/project/pool/zone as the parent.
1575 */
1576 if ((!INGLOBALZONE(p) && idtype != P_ZONEID &&
1577 idtype != P_POOLID) || p->p_flag & SSYS) {
1578 mutex_exit(&p->p_lock);
1579 continue;
1580 }
1581
1582 /*
1583 * If the child process has been already created by fork(), has
1584 * not exited, and has not been added to the list already,
1585 * then add it now. We will hit this process again (since we
1586 * stick it at the end of the procs list) but it will ignored
1587 * because it will have the PBWAIT flag set.
1588 */
1589 if (procinset(p, &set) &&
1590 !(p->p_poolflag & PEXITED) &&
1591 !(p->p_poolflag & PBWAIT)) {
1592 ASSERT(p->p_child == NULL); /* no child of a child */
1593 procs[procs_count] = p;
1594 procs[procs_count + 1] = NULL;
1595 procs_count++;
1596 p->p_poolflag |= PBWAIT;
1597 }
1598 mutex_exit(&p->p_lock);
1599 }
1600 mutex_exit(&pidlock);
1601 skip:
1602 /*
1603 * If there's no processes to rebind then return ESRCH, unless
1604 * we're associating a pool with new resource set, destroying it,
1605 * or binding a zone to a pool.
1606 */
1607 if (procs_count == 0) {
1608 if (idtype == P_POOLID || idtype == P_ZONEID)
1609 rv = 0;
1610 else
1611 rv = ESRCH;
1612 goto out;
1613 }
1614
1615 #ifdef DEBUG
1616 /*
1617 * All processes in the array should have PBWAIT set, and none
1618 * should be in the critical section. Thus, although p_poolflag
1619 * and p_poolcnt are protected by p_lock, their ASSERTions below
1620 * should be stable without it. procinset(), however, ASSERTs that
1621 * the p_lock is held upon entry.
1622 */
1623 for (pp = procs; (p = *pp) != NULL; pp++) {
1624 int in_set;
1625
1626 mutex_enter(&p->p_lock);
1627 in_set = procinset(p, &set);
1628 mutex_exit(&p->p_lock);
1629
1630 ASSERT(in_set);
1631 ASSERT(p->p_poolflag & PBWAIT);
1632 ASSERT(p->p_poolcnt == 0);
1633 }
1634 #endif
1635
1636 /*
1637 * Do the check if processor set rebinding is going to succeed or not.
1638 */
1639 if ((flags & POOL_BIND_PSET) &&
1640 (rv = pset_bind_start(procs, pool)) != 0) {
1641 pool_bind_wakeall(procs);
1642 goto out;
1643 }
1644
1645 /*
1646 * At this point, all bind operations should succeed.
1647 */
1648 for (pp = procs; (p = *pp) != NULL; pp++) {
1649 if (flags & POOL_BIND_PSET) {
1650 psetid_t psetid = pool->pool_pset->pset_id;
1651 void *zonebuf;
1652 void *projbuf;
1653
1654 /*
1655 * Pre-allocate one buffer for FSS (per-project
1656 * buffer for a new pset) in case if this is the
1657 * first thread from its current project getting
1658 * bound to this processor set.
1659 */
1660 projbuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_PROJ);
1661 zonebuf = fss_allocbuf(FSS_ONE_BUF, FSS_ALLOC_ZONE);
1662
1663 mutex_enter(&pidlock);
1664 mutex_enter(&p->p_lock);
1665 pool_pset_bind(p, psetid, projbuf, zonebuf);
1666 mutex_exit(&p->p_lock);
1667 mutex_exit(&pidlock);
1668 /*
1669 * Free buffers pre-allocated above if it
1670 * wasn't actually used.
1671 */
1672 fss_freebuf(projbuf, FSS_ALLOC_PROJ);
1673 fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
1674 }
1675 /*
1676 * Now let's change the scheduling class of this
1677 * process if our target pool has it defined.
1678 */
1679 if (cid != POOL_CLASS_UNSET)
1680 pool_change_class(p, cid);
1681
1682 /*
1683 * It is safe to reference p_pool here without holding
1684 * p_lock because it cannot change underneath of us.
1685 * We're holding pool_lock here, so nobody else can be
1686 * moving this process between pools. If process "p"
1687 * would be exiting, we're guaranteed that it would be blocked
1688 * at pool_barrier_enter() in exit(). Otherwise, it would've
1689 * been skipped by one of our scans of the practive list
1690 * as a process with PEXITED flag set.
1691 */
1692 if (p->p_pool != pool) {
1693 ASSERT(p->p_pool->pool_ref > 0);
1694 atomic_dec_32(&p->p_pool->pool_ref);
1695 p->p_pool = pool;
1696 atomic_inc_32(&p->p_pool->pool_ref);
1697 }
1698 /*
1699 * Okay, we've tortured this guy enough.
1700 * Let this poor process go now.
1701 */
1702 pool_bind_wake(p);
1703 }
1704 if (flags & POOL_BIND_PSET)
1705 pset_bind_finish();
1706
1707 out: switch (idtype) {
1708 case P_PROJID:
1709 ASSERT(kpj != NULL);
1710 mutex_exit(&kpj->kpj_poolbind);
1711 project_rele(kpj);
1712 break;
1713 case P_ZONEID:
1714 if (rv == 0) {
1715 mutex_enter(&cpu_lock);
1716 zone_pool_set(zone, pool);
1717 mutex_exit(&cpu_lock);
1718 }
1719 zone->zone_pool_mod = gethrtime();
1720 zone_rele(zone);
1721 break;
1722 default:
1723 break;
1724 }
1725
1726 kmem_free(procs, procs_size * sizeof (proc_t *));
1727 ASSERT(pool_barrier_count == 0);
1728 return (rv);
1729 }
1730
1731 void
pool_event_cb_register(pool_event_cb_t * cb)1732 pool_event_cb_register(pool_event_cb_t *cb)
1733 {
1734 ASSERT(!pool_lock_held() || panicstr);
1735 ASSERT(cb->pec_func != NULL);
1736
1737 mutex_enter(&pool_event_cb_lock);
1738 if (!pool_event_cb_init) {
1739 list_create(&pool_event_cb_list, sizeof (pool_event_cb_t),
1740 offsetof(pool_event_cb_t, pec_list));
1741 pool_event_cb_init = B_TRUE;
1742 }
1743 list_insert_tail(&pool_event_cb_list, cb);
1744 mutex_exit(&pool_event_cb_lock);
1745 }
1746
1747 void
pool_event_cb_unregister(pool_event_cb_t * cb)1748 pool_event_cb_unregister(pool_event_cb_t *cb)
1749 {
1750 ASSERT(!pool_lock_held() || panicstr);
1751
1752 mutex_enter(&pool_event_cb_lock);
1753 list_remove(&pool_event_cb_list, cb);
1754 mutex_exit(&pool_event_cb_lock);
1755 }
1756
1757 typedef struct {
1758 pool_event_t tqd_what;
1759 poolid_t tqd_id;
1760 } pool_tqd_t;
1761
1762 void
pool_event_notify(void * arg)1763 pool_event_notify(void *arg)
1764 {
1765 pool_tqd_t *tqd = (pool_tqd_t *)arg;
1766 pool_event_cb_t *cb;
1767
1768 ASSERT(!pool_lock_held() || panicstr);
1769
1770 mutex_enter(&pool_event_cb_lock);
1771 for (cb = list_head(&pool_event_cb_list); cb != NULL;
1772 cb = list_next(&pool_event_cb_list, cb)) {
1773 cb->pec_func(tqd->tqd_what, tqd->tqd_id, cb->pec_arg);
1774 }
1775 mutex_exit(&pool_event_cb_lock);
1776 kmem_free(tqd, sizeof (*tqd));
1777 }
1778
1779 void
pool_event_dispatch(pool_event_t what,poolid_t id)1780 pool_event_dispatch(pool_event_t what, poolid_t id)
1781 {
1782 pool_tqd_t *tqd = NULL;
1783
1784 ASSERT(pool_lock_held());
1785
1786 if (pool_event_cb_taskq == NULL) {
1787 pool_event_cb_taskq = taskq_create("pool_event_cb_taskq", 1,
1788 -1, 1, 1, TASKQ_PREPOPULATE);
1789 }
1790
1791 tqd = kmem_alloc(sizeof (*tqd), KM_SLEEP);
1792 tqd->tqd_what = what;
1793 tqd->tqd_id = id;
1794
1795 (void) taskq_dispatch(pool_event_cb_taskq, pool_event_notify, tqd,
1796 KM_SLEEP);
1797 }
1798