xref: /illumos-gate/usr/src/uts/common/os/pool_pset.c (revision a1cdd5a67f3bf3e60db3f3a77baef63640ad91a4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2019 Joyent, Inc.
29  */
30 
31 #include <sys/pool.h>
32 #include <sys/pool_impl.h>
33 #include <sys/pool_pset.h>
34 #include <sys/cpuvar.h>
35 #include <sys/cpupart.h>
36 #include <sys/mutex.h>
37 #include <sys/errno.h>
38 #include <sys/systm.h>
39 #include <sys/cmn_err.h>
40 #include <sys/fss.h>
41 #include <sys/exacct.h>
42 #include <sys/time.h>
43 #include <sys/policy.h>
44 #include <sys/class.h>
45 #include <sys/list.h>
46 #include <sys/cred.h>
47 #include <sys/zone.h>
48 
49 /*
50  * Processor set plugin for pools.
51  *
52  * This file contains various routines used by the common pools layer to create,
53  * modify, and destroy processor sets.  All processor sets created by this
54  * plug-in are stored in the pool_pset_list doubly-linked list, which is
55  * guaranteed to always have an entry for the default processor set,
56  * pool_pset_default.
57  *
58  * Interaction with zones:
59  *
60  * If pools are enabled, non-global zones only have visibility into the
61  * pset of the pool to which they are bound.  This is accomplished by
62  * changing the set of processors and processor sets which are visible
63  * through both systemcall interfaces and system kstats.
64  *
65  * To avoid grabbing pool_lock() during cpu change operations, we cache
66  * the pset the zone is currently bound to, and can read this value
67  * while under cpu_lock.  The special psetid_t token ZONE_PS_INVAL means
68  * that pools are disabled, and provides a mechanism for determining the
69  * status of pools without grabbing pool_lock().
70  *
71  * To avoid grabbing any locks to determine the instantaneous value of
72  * the number of configured and online cpus in the zone, we also cache
73  * these values in a zone_t.  If these values are zero, the pools
74  * facility must be disabled, in which case relevant systemcall
75  * interfaces will return the values for the system as a whole.
76  *
77  * The various kstat interfaces are dealt with as follows: if pools are
78  * disabled all cpu-related kstats should be exported to all zones.
79  * When pools are enabled we begin maintaining a list of "permitted
80  * zones" on a per-kstat basis.  There are various hooks throughout the
81  * code to update this list when certain pools- or cpu-related events
82  * occur.
83  */
84 
85 static list_t pool_pset_list;	/* doubly-linked list of psets */
86 pool_pset_t *pool_pset_default;	/* default pset */
87 hrtime_t pool_pset_mod;		/* last modification time for psets */
88 hrtime_t pool_cpu_mod;		/* last modification time for CPUs */
89 
90 static pool_pset_t *
91 pool_lookup_pset_by_id(psetid_t psetid)
92 {
93 	pool_pset_t *pset = pool_pset_default;
94 
95 	ASSERT(pool_lock_held());
96 
97 	for (pset = list_head(&pool_pset_list); pset;
98 	    pset = list_next(&pool_pset_list, pset)) {
99 		if (pset->pset_id == psetid)
100 			return (pset);
101 	}
102 	return (NULL);
103 }
104 
105 struct setup_arg {
106 	psetid_t psetid;
107 	cpu_t *cpu;
108 	cpu_setup_t what;
109 };
110 
111 /*
112  * Callback function used to apply a cpu configuration event to a zone.
113  */
114 static int
115 pool_pset_setup_cb(zone_t *zone, void *arg)
116 {
117 	struct setup_arg *sa = arg;
118 
119 	ASSERT(MUTEX_HELD(&cpu_lock));
120 	ASSERT(INGLOBALZONE(curproc));
121 	ASSERT(zone != NULL);
122 
123 	if (zone == global_zone)
124 		return (0);
125 	if (zone_pset_get(zone) != sa->psetid)
126 		return (0);	/* ignore */
127 	switch (sa->what) {
128 	case CPU_CONFIG:
129 		cpu_visibility_configure(sa->cpu, zone);
130 		break;
131 	case CPU_UNCONFIG:
132 		cpu_visibility_unconfigure(sa->cpu, zone);
133 		break;
134 	case CPU_ON:
135 		cpu_visibility_online(sa->cpu, zone);
136 		break;
137 	case CPU_OFF:
138 		cpu_visibility_offline(sa->cpu, zone);
139 		break;
140 	case CPU_CPUPART_IN:
141 		cpu_visibility_add(sa->cpu, zone);
142 		break;
143 	case CPU_CPUPART_OUT:
144 		cpu_visibility_remove(sa->cpu, zone);
145 		break;
146 	default:
147 		cmn_err(CE_PANIC, "invalid cpu_setup_t value %d", sa->what);
148 	}
149 	return (0);
150 }
151 
152 /*
153  * Callback function to be executed when a noteworthy cpu event takes
154  * place.  Will ensure that the event is reflected by the zones which
155  * were affected by it.
156  */
157 /* ARGSUSED */
158 static int
159 pool_pset_cpu_setup(cpu_setup_t what, int id, void *arg)
160 {
161 	processorid_t cpuid = id;
162 	struct setup_arg sarg;
163 	int error;
164 	cpu_t *c;
165 
166 	ASSERT(MUTEX_HELD(&cpu_lock));
167 	ASSERT(INGLOBALZONE(curproc));
168 
169 	if (!pool_pset_enabled())
170 		return (0);
171 	if (what != CPU_CONFIG && what != CPU_UNCONFIG &&
172 	    what != CPU_ON && what != CPU_OFF &&
173 	    what != CPU_CPUPART_IN && what != CPU_CPUPART_OUT)
174 		return (0);
175 	c = cpu_get(cpuid);
176 	ASSERT(c != NULL);
177 	sarg.psetid = cpupart_query_cpu(c);
178 	sarg.cpu = c;
179 	sarg.what = what;
180 
181 	error = zone_walk(pool_pset_setup_cb, &sarg);
182 	ASSERT(error == 0);
183 	return (0);
184 }
185 
186 /*
187  * Initialize processor set plugin.  Called once at boot time.
188  */
189 void
190 pool_pset_init(void)
191 {
192 	ASSERT(pool_pset_default == NULL);
193 	pool_pset_default = kmem_zalloc(sizeof (pool_pset_t), KM_SLEEP);
194 	pool_pset_default->pset_id = PS_NONE;
195 	pool_pset_default->pset_npools = 1;	/* for pool_default */
196 	pool_default->pool_pset = pool_pset_default;
197 	list_create(&pool_pset_list, sizeof (pool_pset_t),
198 	    offsetof(pool_pset_t, pset_link));
199 	list_insert_head(&pool_pset_list, pool_pset_default);
200 	mutex_enter(&cpu_lock);
201 	register_cpu_setup_func(pool_pset_cpu_setup, NULL);
202 	mutex_exit(&cpu_lock);
203 }
204 
205 /*
206  * Dummy wrapper function that returns 0 to satisfy zone_walk().
207  */
208 static int
209 pool_pset_zone_pset_set(zone_t *zone, void *arg)
210 {
211 	psetid_t psetid = (psetid_t)(uintptr_t)arg;
212 
213 	ASSERT(MUTEX_HELD(&cpu_lock));
214 	zone_pset_set(zone, psetid);
215 	return (0);
216 }
217 
218 /*
219  * Enable processor set plugin.
220  */
221 int
222 pool_pset_enable(void)
223 {
224 	int error;
225 	nvlist_t *props;
226 
227 	ASSERT(pool_lock_held());
228 	ASSERT(INGLOBALZONE(curproc));
229 	/*
230 	 * Can't enable pools if there are existing cpu partitions.
231 	 */
232 	mutex_enter(&cpu_lock);
233 	if (cp_numparts > 1) {
234 		mutex_exit(&cpu_lock);
235 		return (EEXIST);
236 	}
237 
238 	/*
239 	 * We want to switch things such that everything that was tagged with
240 	 * the special ALL_ZONES token now is explicitly visible to all zones:
241 	 * first add individual zones to the visibility list then remove the
242 	 * special "ALL_ZONES" token.  There must only be the default pset
243 	 * (PS_NONE) active if pools are being enabled, so we only need to
244 	 * deal with it.
245 	 *
246 	 * We want to make pool_pset_enabled() start returning B_TRUE before
247 	 * we call any of the visibility update functions.
248 	 */
249 	global_zone->zone_psetid = PS_NONE;
250 	/*
251 	 * We need to explicitly handle the global zone since
252 	 * zone_pset_set() won't modify it.
253 	 */
254 	pool_pset_visibility_add(PS_NONE, global_zone);
255 	/*
256 	 * A NULL argument means the ALL_ZONES token.
257 	 */
258 	pool_pset_visibility_remove(PS_NONE, NULL);
259 	error = zone_walk(pool_pset_zone_pset_set, (void *)PS_NONE);
260 	ASSERT(error == 0);
261 
262 	/*
263 	 * It is safe to drop cpu_lock here.  We're still
264 	 * holding pool_lock so no new cpu partitions can
265 	 * be created while we're here.
266 	 */
267 	mutex_exit(&cpu_lock);
268 	(void) nvlist_alloc(&pool_pset_default->pset_props,
269 	    NV_UNIQUE_NAME, KM_SLEEP);
270 	props = pool_pset_default->pset_props;
271 	(void) nvlist_add_string(props, "pset.name", "pset_default");
272 	(void) nvlist_add_string(props, "pset.comment", "");
273 	(void) nvlist_add_int64(props, "pset.sys_id", PS_NONE);
274 	(void) nvlist_add_string(props, "pset.units", "population");
275 	(void) nvlist_add_byte(props, "pset.default", 1);
276 	(void) nvlist_add_uint64(props, "pset.max", 65536);
277 	(void) nvlist_add_uint64(props, "pset.min", 1);
278 	pool_pset_mod = pool_cpu_mod = gethrtime();
279 	return (0);
280 }
281 
282 /*
283  * Disable processor set plugin.
284  */
285 int
286 pool_pset_disable(void)
287 {
288 	processorid_t cpuid;
289 	cpu_t *cpu;
290 	int error;
291 
292 	ASSERT(pool_lock_held());
293 	ASSERT(INGLOBALZONE(curproc));
294 
295 	mutex_enter(&cpu_lock);
296 	if (cp_numparts > 1) {	/* make sure only default pset is left */
297 		mutex_exit(&cpu_lock);
298 		return (EBUSY);
299 	}
300 	/*
301 	 * Remove all non-system CPU and processor set properties
302 	 */
303 	for (cpuid = 0; cpuid < NCPU; cpuid++) {
304 		if ((cpu = cpu_get(cpuid)) == NULL)
305 			continue;
306 		if (cpu->cpu_props != NULL) {
307 			(void) nvlist_free(cpu->cpu_props);
308 			cpu->cpu_props = NULL;
309 		}
310 	}
311 
312 	/*
313 	 * We want to switch things such that everything is now visible
314 	 * to ALL_ZONES: first add the special "ALL_ZONES" token to the
315 	 * visibility list then remove individual zones.  There must
316 	 * only be the default pset active if pools are being disabled,
317 	 * so we only need to deal with it.
318 	 */
319 	error = zone_walk(pool_pset_zone_pset_set, (void *)ZONE_PS_INVAL);
320 	ASSERT(error == 0);
321 	pool_pset_visibility_add(PS_NONE, NULL);
322 	pool_pset_visibility_remove(PS_NONE, global_zone);
323 	/*
324 	 * pool_pset_enabled() will henceforth return B_FALSE.
325 	 */
326 	global_zone->zone_psetid = ZONE_PS_INVAL;
327 	mutex_exit(&cpu_lock);
328 	if (pool_pset_default->pset_props != NULL) {
329 		nvlist_free(pool_pset_default->pset_props);
330 		pool_pset_default->pset_props = NULL;
331 	}
332 	return (0);
333 }
334 
335 /*
336  * Create new processor set and give it a temporary name.
337  */
338 int
339 pool_pset_create(psetid_t *id)
340 {
341 	char pset_name[40];
342 	pool_pset_t *pset;
343 	psetid_t psetid;
344 	int err;
345 
346 	ASSERT(pool_lock_held());
347 	if ((err = cpupart_create(&psetid)) != 0)
348 		return (err);
349 	pset = kmem_alloc(sizeof (pool_pset_t), KM_SLEEP);
350 	pset->pset_id = *id = psetid;
351 	pset->pset_npools = 0;
352 	(void) nvlist_alloc(&pset->pset_props, NV_UNIQUE_NAME, KM_SLEEP);
353 	(void) nvlist_add_int64(pset->pset_props, "pset.sys_id", psetid);
354 	(void) nvlist_add_byte(pset->pset_props, "pset.default", 0);
355 	pool_pset_mod = gethrtime();
356 	(void) snprintf(pset_name, sizeof (pset_name), "pset_%lld",
357 	    pool_pset_mod);
358 	(void) nvlist_add_string(pset->pset_props, "pset.name", pset_name);
359 	list_insert_tail(&pool_pset_list, pset);
360 	return (0);
361 }
362 
363 /*
364  * Destroy existing processor set.
365  */
366 int
367 pool_pset_destroy(psetid_t psetid)
368 {
369 	pool_pset_t *pset;
370 	int ret;
371 
372 	ASSERT(pool_lock_held());
373 
374 	if (psetid == PS_NONE)
375 		return (EINVAL);
376 	if ((pset = pool_lookup_pset_by_id(psetid)) == NULL)
377 		return (ESRCH);
378 	if (pset->pset_npools > 0) /* can't destroy associated psets */
379 		return (EBUSY);
380 	if ((ret = cpupart_destroy(pset->pset_id)) != 0)
381 		return (ret);
382 	(void) nvlist_free(pset->pset_props);
383 	list_remove(&pool_pset_list, pset);
384 	pool_pset_mod = gethrtime();
385 	kmem_free(pset, sizeof (pool_pset_t));
386 	return (0);
387 }
388 
389 /*
390  * Change the visibility of a pset (and all contained cpus) in a zone.
391  * A NULL zone argument implies the special ALL_ZONES token.
392  */
393 static void
394 pool_pset_visibility_change(psetid_t psetid, zone_t *zone, boolean_t add)
395 {
396 	zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
397 	cpupart_t *cp;
398 	cpu_t *c;
399 
400 	ASSERT(MUTEX_HELD(&cpu_lock));
401 	ASSERT(psetid != ZONE_PS_INVAL);
402 
403 	cp = cpupart_find(psetid);
404 	ASSERT(cp != NULL);
405 	if (cp->cp_kstat != NULL) {
406 		if (add)
407 			kstat_zone_add(cp->cp_kstat, zoneid);
408 		else
409 			kstat_zone_remove(cp->cp_kstat, zoneid);
410 	}
411 
412 	c = cpu_list;
413 	do {
414 		ASSERT(c != NULL);
415 		if (c->cpu_part == cp && !cpu_is_poweredoff(c)) {
416 			if (add)
417 				cpu_visibility_add(c, zone);
418 			else
419 				cpu_visibility_remove(c, zone);
420 		}
421 	} while ((c = c->cpu_next) != cpu_list);
422 }
423 
424 /*
425  * Make the processor set visible to the zone.  A NULL value for
426  * the zone means that the special ALL_ZONES token should be added to
427  * the visibility list.
428  */
429 void
430 pool_pset_visibility_add(psetid_t psetid, zone_t *zone)
431 {
432 	pool_pset_visibility_change(psetid, zone, B_TRUE);
433 }
434 
435 /*
436  * Remove zone's visibility into the processor set.  A NULL value for
437  * the zone means that the special ALL_ZONES token should be removed
438  * from the visibility list.
439  */
440 void
441 pool_pset_visibility_remove(psetid_t psetid, zone_t *zone)
442 {
443 	pool_pset_visibility_change(psetid, zone, B_FALSE);
444 }
445 
446 /*
447  * Quick way of seeing if pools are enabled (as far as processor sets are
448  * concerned) without holding pool_lock().
449  */
450 boolean_t
451 pool_pset_enabled(void)
452 {
453 	ASSERT(MUTEX_HELD(&cpu_lock));
454 
455 	return (zone_pset_get(global_zone) != ZONE_PS_INVAL);
456 }
457 
458 struct assoc_zone_arg {
459 	poolid_t poolid;
460 	psetid_t newpsetid;
461 };
462 
463 /*
464  * Callback function to update a zone's processor set visibility when
465  * a pool is associated with a processor set.
466  */
467 static int
468 pool_pset_assoc_zone_cb(zone_t *zone, void *arg)
469 {
470 	struct assoc_zone_arg *aza = arg;
471 	pool_t *pool;
472 	zoneid_t zoneid = zone->zone_id;
473 
474 	ASSERT(pool_lock_held());
475 	ASSERT(MUTEX_HELD(&cpu_lock));
476 
477 	if (zoneid == GLOBAL_ZONEID)
478 		return (0);
479 	pool = zone_pool_get(zone);
480 	if (pool->pool_id == aza->poolid)
481 		zone_pset_set(zone, aza->newpsetid);
482 	return (0);
483 }
484 
485 /*
486  * Associate pool with new processor set.
487  */
488 int
489 pool_pset_assoc(poolid_t poolid, psetid_t psetid)
490 {
491 	pool_t *pool;
492 	pool_pset_t *pset, *oldpset;
493 	int err = 0;
494 
495 	ASSERT(pool_lock_held());
496 
497 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL ||
498 	    (pset = pool_lookup_pset_by_id(psetid)) == NULL) {
499 		return (ESRCH);
500 	}
501 	if (pool->pool_pset->pset_id == psetid) {
502 		/*
503 		 * Already associated.
504 		 */
505 		return (0);
506 	}
507 
508 	/*
509 	 * Hang the new pset off the pool, and rebind all of the pool's
510 	 * processes to it.  If pool_do_bind fails, all processes will remain
511 	 * bound to the old set.
512 	 */
513 	oldpset = pool->pool_pset;
514 	pool->pool_pset = pset;
515 	err = pool_do_bind(pool, P_POOLID, poolid, POOL_BIND_PSET);
516 	if (err) {
517 		pool->pool_pset = oldpset;
518 	} else {
519 		struct assoc_zone_arg azarg;
520 
521 		/*
522 		 * Update zones' visibility to reflect changes.
523 		 */
524 		azarg.poolid = poolid;
525 		azarg.newpsetid = pset->pset_id;
526 		mutex_enter(&cpu_lock);
527 		err = zone_walk(pool_pset_assoc_zone_cb, &azarg);
528 		ASSERT(err == 0);
529 		mutex_exit(&cpu_lock);
530 
531 		oldpset->pset_npools--;
532 		pset->pset_npools++;
533 	}
534 	return (err);
535 }
536 
537 /*
538  * Transfer specified CPUs between processor sets.
539  */
540 int
541 pool_pset_xtransfer(psetid_t src, psetid_t dst, size_t size, id_t *ids)
542 {
543 	struct cpu *cpu;
544 	int ret = 0;
545 	int id;
546 
547 	ASSERT(pool_lock_held());
548 	ASSERT(INGLOBALZONE(curproc));
549 
550 	if (size == 0 || size > max_ncpus)	/* quick sanity check */
551 		return (EINVAL);
552 
553 	mutex_enter(&cpu_lock);
554 	for (id = 0; id < size; id++) {
555 		if ((cpu = cpu_get((processorid_t)ids[id])) == NULL ||
556 		    cpupart_query_cpu(cpu) != src) {
557 			ret = EINVAL;
558 			break;
559 		}
560 		if ((ret = cpupart_attach_cpu(dst, cpu, 1)) != 0)
561 			break;
562 	}
563 	mutex_exit(&cpu_lock);
564 	if (ret == 0)
565 		pool_pset_mod = gethrtime();
566 	return (ret);
567 }
568 
569 /*
570  * Bind process to processor set.  This should never fail because
571  * we should've done all preliminary checks before calling it.
572  */
573 void
574 pool_pset_bind(proc_t *p, psetid_t psetid, void *projbuf, void *zonebuf)
575 {
576 	kthread_t *t;
577 	int ret;
578 
579 	ASSERT(pool_lock_held());
580 	ASSERT(MUTEX_HELD(&cpu_lock));
581 	ASSERT(MUTEX_HELD(&pidlock));
582 	ASSERT(MUTEX_HELD(&p->p_lock));
583 
584 	if ((t = p->p_tlist) == NULL)
585 		return;
586 	do {
587 		ret = cpupart_bind_thread(t, psetid, 0, projbuf, zonebuf);
588 		ASSERT(ret == 0);
589 		t->t_bind_pset = psetid;
590 	} while ((t = t->t_forw) != p->p_tlist);
591 }
592 
593 /*
594  * See the comment above pool_do_bind() for the semantics of the pset_bind_*()
595  * functions.  These must be kept in sync with cpupart_move_thread, and
596  * anything else that could fail a pool_pset_bind.
597  *
598  * Returns non-zero errno on failure and zero on success.
599  * Iff successful, cpu_lock is held on return.
600  */
601 int
602 pset_bind_start(proc_t **procs, pool_t *pool)
603 {
604 	cred_t *pcred;
605 	proc_t *p, **pp;
606 	kthread_t *t;
607 	cpupart_t *newpp;
608 	int ret;
609 
610 	extern int cpupart_movable_thread(kthread_id_t, cpupart_t *, int);
611 
612 	ASSERT(pool_lock_held());
613 	ASSERT(INGLOBALZONE(curproc));
614 
615 	mutex_enter(&cpu_lock);
616 	weakbinding_stop();
617 
618 	newpp = cpupart_find(pool->pool_pset->pset_id);
619 	ASSERT(newpp != NULL);
620 	if (newpp->cp_cpulist == NULL) {
621 		weakbinding_start();
622 		mutex_exit(&cpu_lock);
623 		return (ENOTSUP);
624 	}
625 
626 	pcred = crgetcred();
627 
628 	/*
629 	 * Check for the PRIV_PROC_PRIOCNTL privilege that is required
630 	 * to enter and exit scheduling classes.  If other privileges
631 	 * are required by CL_ENTERCLASS/CL_CANEXIT types of routines
632 	 * in the future, this code will have to be updated.
633 	 */
634 	if (secpolicy_setpriority(pcred) != 0) {
635 		weakbinding_start();
636 		mutex_exit(&cpu_lock);
637 		crfree(pcred);
638 		return (EPERM);
639 	}
640 
641 	for (pp = procs; (p = *pp) != NULL; pp++) {
642 		mutex_enter(&p->p_lock);
643 		if ((t = p->p_tlist) == NULL) {
644 			mutex_exit(&p->p_lock);
645 			continue;
646 		}
647 		/*
648 		 * Check our basic permissions to control this process.
649 		 */
650 		if (!prochasprocperm(p, curproc, pcred)) {
651 			mutex_exit(&p->p_lock);
652 			weakbinding_start();
653 			mutex_exit(&cpu_lock);
654 			crfree(pcred);
655 			return (EPERM);
656 		}
657 		do {
658 			/*
659 			 * Check that all threads can be moved to
660 			 * a new processor set.
661 			 */
662 			thread_lock(t);
663 			ret = cpupart_movable_thread(t, newpp, 0);
664 			thread_unlock(t);
665 			if (ret != 0) {
666 				mutex_exit(&p->p_lock);
667 				weakbinding_start();
668 				mutex_exit(&cpu_lock);
669 				crfree(pcred);
670 				return (ret);
671 			}
672 		} while ((t = t->t_forw) != p->p_tlist);
673 		mutex_exit(&p->p_lock);
674 	}
675 	crfree(pcred);
676 	return (0);	/* with cpu_lock held and weakbinding stopped */
677 }
678 
679 /*ARGSUSED*/
680 void
681 pset_bind_abort(proc_t **procs, pool_t *pool)
682 {
683 	mutex_exit(&cpu_lock);
684 }
685 
686 void
687 pset_bind_finish(void)
688 {
689 	weakbinding_start();
690 	mutex_exit(&cpu_lock);
691 }
692 
693 static pool_property_t pool_pset_props[] = {
694 	{ "pset.name",			DATA_TYPE_STRING,	PP_RDWR },
695 	{ "pset.comment",		DATA_TYPE_STRING,	PP_RDWR },
696 	{ "pset.sys_id",		DATA_TYPE_UINT64,	PP_READ },
697 	{ "pset.units",			DATA_TYPE_STRING,	PP_RDWR },
698 	{ "pset.default",		DATA_TYPE_BYTE,		PP_READ },
699 	{ "pset.min",			DATA_TYPE_UINT64,	PP_RDWR },
700 	{ "pset.max",			DATA_TYPE_UINT64,	PP_RDWR },
701 	{ "pset.size",			DATA_TYPE_UINT64,	PP_READ },
702 	{ "pset.load",			DATA_TYPE_UINT64,	PP_READ },
703 	{ "pset.poold.objectives",	DATA_TYPE_STRING,
704 	    PP_RDWR | PP_OPTIONAL },
705 	{ NULL,				0,			0 }
706 };
707 
708 static pool_property_t pool_cpu_props[] = {
709 	{ "cpu.sys_id",			DATA_TYPE_UINT64,	PP_READ },
710 	{ "cpu.comment",		DATA_TYPE_STRING,	PP_RDWR },
711 	{ "cpu.status",			DATA_TYPE_STRING,	PP_RDWR },
712 	{ "cpu.pinned",			DATA_TYPE_BYTE,
713 	    PP_RDWR | PP_OPTIONAL },
714 	{ NULL,				0,			0 }
715 };
716 
717 /*
718  * Put property on the specified processor set.
719  */
720 int
721 pool_pset_propput(psetid_t psetid, nvpair_t *pair)
722 {
723 	pool_pset_t *pset;
724 	int ret;
725 
726 	ASSERT(pool_lock_held());
727 
728 	if ((pset = pool_lookup_pset_by_id(psetid)) == NULL)
729 		return (ESRCH);
730 	ret = pool_propput_common(pset->pset_props, pair, pool_pset_props);
731 	if (ret == 0)
732 		pool_pset_mod = gethrtime();
733 	return (ret);
734 }
735 
736 /*
737  * Remove existing processor set property.
738  */
739 int
740 pool_pset_proprm(psetid_t psetid, char *name)
741 {
742 	pool_pset_t *pset;
743 	int ret;
744 
745 	ASSERT(pool_lock_held());
746 
747 	if ((pset = pool_lookup_pset_by_id(psetid)) == NULL)
748 		return (EINVAL);
749 	ret = pool_proprm_common(pset->pset_props, name, pool_pset_props);
750 	if (ret == 0)
751 		pool_pset_mod = gethrtime();
752 	return (ret);
753 }
754 
755 /*
756  * Put new CPU property.
757  * Handle special case of "cpu.status".
758  */
759 int
760 pool_cpu_propput(processorid_t cpuid, nvpair_t *pair)
761 {
762 	int ret = 0;
763 	cpu_t *cpu;
764 
765 	ASSERT(pool_lock_held());
766 	ASSERT(INGLOBALZONE(curproc));
767 
768 	if (nvpair_type(pair) == DATA_TYPE_STRING &&
769 	    strcmp(nvpair_name(pair), "cpu.status") == 0) {
770 		char *val;
771 		int status;
772 		int old_status;
773 		(void) nvpair_value_string(pair, &val);
774 		if (strcmp(val, PS_OFFLINE) == 0)
775 			status = P_OFFLINE;
776 		else if (strcmp(val, PS_ONLINE) == 0)
777 			status = P_ONLINE;
778 		else if (strcmp(val, PS_NOINTR) == 0)
779 			status = P_NOINTR;
780 		else if (strcmp(val, PS_FAULTED) == 0)
781 			status = P_FAULTED;
782 		else if (strcmp(val, PS_SPARE) == 0)
783 			status = P_SPARE;
784 		else
785 			return (EINVAL);
786 		ret = p_online_internal(cpuid, status, &old_status);
787 	} else {
788 		mutex_enter(&cpu_lock);
789 		if ((cpu = cpu_get(cpuid)) == NULL)
790 			ret = EINVAL;
791 		if (cpu->cpu_props == NULL) {
792 			(void) nvlist_alloc(&cpu->cpu_props,
793 			    NV_UNIQUE_NAME, KM_SLEEP);
794 			(void) nvlist_add_string(cpu->cpu_props,
795 			    "cpu.comment", "");
796 		}
797 		ret = pool_propput_common(cpu->cpu_props, pair, pool_cpu_props);
798 		if (ret == 0)
799 			pool_cpu_mod = gethrtime();
800 		mutex_exit(&cpu_lock);
801 	}
802 	return (ret);
803 }
804 
805 /*
806  * Remove existing CPU property.
807  */
808 int
809 pool_cpu_proprm(processorid_t cpuid, char *name)
810 {
811 	int ret;
812 	cpu_t *cpu;
813 
814 	ASSERT(pool_lock_held());
815 	ASSERT(INGLOBALZONE(curproc));
816 
817 	mutex_enter(&cpu_lock);
818 	if ((cpu = cpu_get(cpuid)) == NULL || cpu_is_poweredoff(cpu)) {
819 		ret = EINVAL;
820 	} else {
821 		if (cpu->cpu_props == NULL)
822 			ret = EINVAL;
823 		else
824 			ret = pool_proprm_common(cpu->cpu_props, name,
825 			    pool_cpu_props);
826 	}
827 	if (ret == 0)
828 		pool_cpu_mod = gethrtime();
829 	mutex_exit(&cpu_lock);
830 	return (ret);
831 }
832 
/*
 * Convert a fixed-point load average with 16 fractional bits into the
 * load average multiplied by 1000.  The integer part (f >> 16) and the
 * fractional part (f & 0xffff) are scaled separately so that no precision
 * is lost to an early shift.
 *
 * The argument is fully parenthesized so that expression arguments expand
 * correctly; note that it is evaluated twice, so it must be free of side
 * effects.
 */
#define	PSET_LOAD(f)	\
	((((f) >> 16) * 1000) + ((((f) & 0xffff) * 1000) / 0xffff))
837 
838 /*
839  * Take a snapshot of the current state of processor sets and CPUs,
840  * pack it in the exacct format, and attach it to specified exacct record.
841  */
842 int
843 pool_pset_pack(ea_object_t *eo_system)
844 {
845 	ea_object_t *eo_pset, *eo_cpu;
846 	cpupart_t *cpupart;
847 	psetid_t mypsetid;
848 	pool_pset_t *pset;
849 	nvlist_t *nvl;
850 	size_t bufsz;
851 	cpu_t *cpu;
852 	char *buf;
853 	int ncpu;
854 
855 	ASSERT(pool_lock_held());
856 
857 	mutex_enter(&cpu_lock);
858 	mypsetid = zone_pset_get(curproc->p_zone);
859 	for (pset = list_head(&pool_pset_list); pset;
860 	    pset = list_next(&pool_pset_list, pset)) {
861 		psetid_t psetid = pset->pset_id;
862 
863 		if (!INGLOBALZONE(curproc) && mypsetid != psetid)
864 			continue;
865 		cpupart = cpupart_find(psetid);
866 		ASSERT(cpupart != NULL);
867 		eo_pset = ea_alloc_group(EXT_GROUP |
868 		    EXC_LOCAL | EXD_GROUP_PSET);
869 		(void) ea_attach_item(eo_pset, &psetid, sizeof (id_t),
870 		    EXC_LOCAL | EXD_PSET_PSETID | EXT_UINT32);
871 		/*
872 		 * Pack info for all CPUs in this processor set.
873 		 */
874 		ncpu = 0;
875 		cpu = cpu_list;
876 		do {
877 			if (cpu->cpu_part != cpupart)	/* not our pset */
878 				continue;
879 			ncpu++;
880 			eo_cpu = ea_alloc_group(EXT_GROUP
881 			    | EXC_LOCAL | EXD_GROUP_CPU);
882 			(void) ea_attach_item(eo_cpu, &cpu->cpu_id,
883 			    sizeof (processorid_t),
884 			    EXC_LOCAL | EXD_CPU_CPUID | EXT_UINT32);
885 			if (cpu->cpu_props == NULL) {
886 				(void) nvlist_alloc(&cpu->cpu_props,
887 				    NV_UNIQUE_NAME, KM_SLEEP);
888 				(void) nvlist_add_string(cpu->cpu_props,
889 				    "cpu.comment", "");
890 			}
891 			(void) nvlist_dup(cpu->cpu_props, &nvl, KM_SLEEP);
892 			(void) nvlist_add_int64(nvl, "cpu.sys_id", cpu->cpu_id);
893 			(void) nvlist_add_string(nvl, "cpu.status",
894 			    (char *)cpu_get_state_str(cpu->cpu_flags));
895 			buf = NULL;
896 			bufsz = 0;
897 			(void) nvlist_pack(nvl, &buf, &bufsz,
898 			    NV_ENCODE_NATIVE, 0);
899 			(void) ea_attach_item(eo_cpu, buf, bufsz,
900 			    EXC_LOCAL | EXD_CPU_PROP | EXT_RAW);
901 			(void) nvlist_free(nvl);
902 			kmem_free(buf, bufsz);
903 			(void) ea_attach_to_group(eo_pset, eo_cpu);
904 		} while ((cpu = cpu->cpu_next) != cpu_list);
905 
906 		(void) nvlist_dup(pset->pset_props, &nvl, KM_SLEEP);
907 		(void) nvlist_add_uint64(nvl, "pset.size", ncpu);
908 		(void) nvlist_add_uint64(nvl, "pset.load",
909 		    (uint64_t)PSET_LOAD(cpupart->cp_hp_avenrun[0]));
910 		buf = NULL;
911 		bufsz = 0;
912 		(void) nvlist_pack(nvl, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
913 		(void) ea_attach_item(eo_pset, buf, bufsz,
914 		    EXC_LOCAL | EXD_PSET_PROP | EXT_RAW);
915 		(void) nvlist_free(nvl);
916 		kmem_free(buf, bufsz);
917 
918 		(void) ea_attach_to_group(eo_system, eo_pset);
919 	}
920 	mutex_exit(&cpu_lock);
921 	return (0);
922 }
923 
924 /*
925  * Get dynamic property for processor sets.
926  * The only dynamic property currently implemented is "pset.load".
927  */
928 int
929 pool_pset_propget(psetid_t psetid, char *name, nvlist_t *nvl)
930 {
931 	cpupart_t *cpupart;
932 	pool_pset_t *pset;
933 	int ret = ESRCH;
934 
935 	ASSERT(pool_lock_held());
936 
937 	mutex_enter(&cpu_lock);
938 	pset = pool_lookup_pset_by_id(psetid);
939 	cpupart = cpupart_find(psetid);
940 	if (cpupart == NULL || pset == NULL) {
941 		mutex_exit(&cpu_lock);
942 		return (EINVAL);
943 	}
944 	if (strcmp(name, "pset.load") == 0)
945 		ret = nvlist_add_uint64(nvl, "pset.load",
946 		    (uint64_t)PSET_LOAD(cpupart->cp_hp_avenrun[0]));
947 	else
948 		ret = EINVAL;
949 	mutex_exit(&cpu_lock);
950 	return (ret);
951 }
952 
953 /*
954  * Get dynamic property for CPUs.
955  * The only dynamic property currently implemented is "cpu.status".
956  */
957 int
958 pool_cpu_propget(processorid_t cpuid, char *name, nvlist_t *nvl)
959 {
960 	int ret = ESRCH;
961 	cpu_t *cpu;
962 
963 	ASSERT(pool_lock_held());
964 
965 	mutex_enter(&cpu_lock);
966 	if ((cpu = cpu_get(cpuid)) == NULL) {
967 		mutex_exit(&cpu_lock);
968 		return (ESRCH);
969 	}
970 	if (strcmp(name, "cpu.status") == 0) {
971 		ret = nvlist_add_string(nvl, "cpu.status",
972 		    (char *)cpu_get_state_str(cpu->cpu_flags));
973 	} else {
974 		ret = EINVAL;
975 	}
976 	mutex_exit(&cpu_lock);
977 	return (ret);
978 }
979