xref: /illumos-gate/usr/src/uts/common/os/pool_pset.c (revision eb0cc229f19c437a6b538d3ac0d0443268290b7e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/pool.h>
30 #include <sys/pool_impl.h>
31 #include <sys/pool_pset.h>
32 #include <sys/cpuvar.h>
33 #include <sys/cpupart.h>
34 #include <sys/mutex.h>
35 #include <sys/errno.h>
36 #include <sys/systm.h>
37 #include <sys/cmn_err.h>
38 #include <sys/fss.h>
39 #include <sys/exacct.h>
40 #include <sys/time.h>
41 #include <sys/policy.h>
42 #include <sys/class.h>
43 #include <sys/list.h>
44 #include <sys/cred.h>
45 #include <sys/zone.h>
46 
47 /*
48  * Processor set plugin for pools.
49  *
50  * This file contains various routines used by the common pools layer to create,
51  * modify, and destroy processor sets.  All processor sets created by this
52  * plug-in are stored in the pool_pset_list doubly-linked list, which is
53  * guaranteed to always have an entry for the default processor set,
54  * pool_pset_default.
55  *
56  * Interaction with zones:
57  *
58  * If pools are enabled, non-global zones only have visibility into the
59  * pset of the pool to which they are bound.  This is accomplished by
60  * changing the set of processors and processor sets which are visible
61  * through both systemcall interfaces and system kstats.
62  *
63  * To avoid grabbing pool_lock() during cpu change operations, we cache
64  * the pset the zone is currently bound to, and can read this value
65  * while under cpu_lock.  The special psetid_t token ZONE_PS_INVAL means
 * that pools are disabled, and provides a mechanism for determining the
 * status of pools without grabbing pool_lock().
68  *
69  * To avoid grabbing any locks to determine the instantaneous value of
70  * the number of configured and online cpus in the zone, we also cache
71  * these values in a zone_t.  If these values are zero, the pools
72  * facility must be disabled, in which case relevant systemcall
73  * interfaces will return the values for the system as a whole.
74  *
75  * The various kstat interfaces are dealt with as follows: if pools are
76  * disabled all cpu-related kstats should be exported to all zones.
77  * When pools are enabled we begin maintaining a list of "permitted
78  * zones" on a per-kstat basis.  There are various hooks throughout the
79  * code to update this list when certain pools- or cpu-related events
80  * occur.
81  */
82 
/*
 * Plug-in state.  pool_pset_mod and pool_cpu_mod are stamped with
 * gethrtime() by the routines in this file that change processor sets
 * or CPU properties.
 */
static list_t pool_pset_list;	/* doubly-linked list of psets */
pool_pset_t *pool_pset_default;	/* default pset */
hrtime_t pool_pset_mod;		/* last modification time for psets */
hrtime_t pool_cpu_mod;		/* last modification time for CPUs */
87 
88 static pool_pset_t *
89 pool_lookup_pset_by_id(psetid_t psetid)
90 {
91 	pool_pset_t *pset = pool_pset_default;
92 
93 	ASSERT(pool_lock_held());
94 
95 	for (pset = list_head(&pool_pset_list); pset;
96 	    pset = list_next(&pool_pset_list, pset)) {
97 		if (pset->pset_id == psetid)
98 			return (pset);
99 	}
100 	return (NULL);
101 }
102 
/*
 * Argument bundle handed to pool_pset_setup_cb() through zone_walk().
 */
struct setup_arg {
	psetid_t psetid;	/* pset reported by cpupart_query_cpu() for cpu */
	cpu_t *cpu;		/* cpu the event applies to */
	cpu_setup_t what;	/* which cpu configuration event occurred */
};
108 
109 /*
110  * Callback function used to apply a cpu configuration event to a zone.
111  */
112 static int
113 pool_pset_setup_cb(zone_t *zone, void *arg)
114 {
115 	struct setup_arg *sa = arg;
116 
117 	ASSERT(MUTEX_HELD(&cpu_lock));
118 	ASSERT(INGLOBALZONE(curproc));
119 	ASSERT(zone != NULL);
120 
121 	if (zone == global_zone)
122 		return (0);
123 	if (zone_pset_get(zone) != sa->psetid)
124 		return (0);	/* ignore */
125 	switch (sa->what) {
126 	case CPU_CONFIG:
127 		cpu_visibility_configure(sa->cpu, zone);
128 		break;
129 	case CPU_UNCONFIG:
130 		cpu_visibility_unconfigure(sa->cpu, zone);
131 		break;
132 	case CPU_ON:
133 		cpu_visibility_online(sa->cpu, zone);
134 		break;
135 	case CPU_OFF:
136 		cpu_visibility_offline(sa->cpu, zone);
137 		break;
138 	case CPU_CPUPART_IN:
139 		cpu_visibility_add(sa->cpu, zone);
140 		break;
141 	case CPU_CPUPART_OUT:
142 		cpu_visibility_remove(sa->cpu, zone);
143 		break;
144 	default:
145 		cmn_err(CE_PANIC, "invalid cpu_setup_t value %d", sa->what);
146 	}
147 	return (0);
148 }
149 
150 /*
151  * Callback function to be executed when a noteworthy cpu event takes
152  * place.  Will ensure that the event is reflected by the zones which
153  * were affected by it.
154  */
155 /* ARGSUSED */
156 static int
157 pool_pset_cpu_setup(cpu_setup_t what, int id, void *arg)
158 {
159 	processorid_t cpuid = id;
160 	struct setup_arg sarg;
161 	int error;
162 	cpu_t *c;
163 
164 	ASSERT(MUTEX_HELD(&cpu_lock));
165 	ASSERT(INGLOBALZONE(curproc));
166 
167 	if (!pool_pset_enabled())
168 		return (0);
169 	if (what != CPU_CONFIG && what != CPU_UNCONFIG &&
170 	    what != CPU_ON && what != CPU_OFF &&
171 	    what != CPU_CPUPART_IN && what != CPU_CPUPART_OUT)
172 		return (0);
173 	c = cpu_get(cpuid);
174 	ASSERT(c != NULL);
175 	sarg.psetid = cpupart_query_cpu(c);
176 	sarg.cpu = c;
177 	sarg.what = what;
178 
179 	error = zone_walk(pool_pset_setup_cb, &sarg);
180 	ASSERT(error == 0);
181 	return (0);
182 }
183 
184 /*
185  * Initialize processor set plugin.  Called once at boot time.
186  */
187 void
188 pool_pset_init(void)
189 {
190 	ASSERT(pool_pset_default == NULL);
191 	pool_pset_default = kmem_zalloc(sizeof (pool_pset_t), KM_SLEEP);
192 	pool_pset_default->pset_id = PS_NONE;
193 	pool_pset_default->pset_npools = 1;	/* for pool_default */
194 	pool_default->pool_pset = pool_pset_default;
195 	list_create(&pool_pset_list, sizeof (pool_pset_t),
196 	    offsetof(pool_pset_t, pset_link));
197 	list_insert_head(&pool_pset_list, pool_pset_default);
198 	mutex_enter(&cpu_lock);
199 	register_cpu_setup_func(pool_pset_cpu_setup, NULL);
200 	mutex_exit(&cpu_lock);
201 }
202 
203 /*
204  * Dummy wrapper function that returns 0 to satisfy zone_walk().
205  */
206 static int
207 pool_pset_zone_pset_set(zone_t *zone, void *arg)
208 {
209 	psetid_t psetid = (psetid_t)(uintptr_t)arg;
210 
211 	ASSERT(MUTEX_HELD(&cpu_lock));
212 	zone_pset_set(zone, psetid);
213 	return (0);
214 }
215 
/*
 * Enable processor set plugin.
 *
 * Returns EEXIST if more than one cpu partition exists (cp_numparts > 1).
 * Otherwise replaces the ALL_ZONES visibility token on the default pset
 * with explicit per-zone visibility, calls zone_pset_set(zone, PS_NONE)
 * for every zone visited by zone_walk(), populates the default pset's
 * property list, stamps both modification times, and returns 0.
 *
 * The caller must hold pool_lock and be running in the global zone.
 */
int
pool_pset_enable(void)
{
	int error;
	nvlist_t *props;

	ASSERT(pool_lock_held());
	ASSERT(INGLOBALZONE(curproc));
	/*
	 * Can't enable pools if there are existing cpu partitions.
	 */
	mutex_enter(&cpu_lock);
	if (cp_numparts > 1) {
		mutex_exit(&cpu_lock);
		return (EEXIST);
	}

	/*
	 * We want to switch things such that everything that was tagged with
	 * the special ALL_ZONES token now is explicitly visible to all zones:
	 * first add individual zones to the visibility list then remove the
	 * special "ALL_ZONES" token.  There must only be the default pset
	 * (PS_NONE) active if pools are being enabled, so we only need to
	 * deal with it.
	 *
	 * We want to make pool_pset_enabled() start returning B_TRUE before
	 * we call any of the visibility update functions.
	 */
	global_zone->zone_psetid = PS_NONE;
	/*
	 * We need to explicitly handle the global zone since
	 * zone_pset_set() won't modify it.
	 */
	pool_pset_visibility_add(PS_NONE, global_zone);
	/*
	 * A NULL argument means the ALL_ZONES token.
	 */
	pool_pset_visibility_remove(PS_NONE, NULL);
	error = zone_walk(pool_pset_zone_pset_set, (void *)PS_NONE);
	ASSERT(error == 0);

	/*
	 * It is safe to drop cpu_lock here.  We're still
	 * holding pool_lock so no new cpu partitions can
	 * be created while we're here.
	 */
	mutex_exit(&cpu_lock);
	/*
	 * Build the property list of the default pset.  The results of the
	 * nvlist calls are ignored.
	 */
	(void) nvlist_alloc(&pool_pset_default->pset_props,
	    NV_UNIQUE_NAME, KM_SLEEP);
	props = pool_pset_default->pset_props;
	(void) nvlist_add_string(props, "pset.name", "pset_default");
	(void) nvlist_add_string(props, "pset.comment", "");
	(void) nvlist_add_int64(props, "pset.sys_id", PS_NONE);
	(void) nvlist_add_string(props, "pset.units", "population");
	(void) nvlist_add_byte(props, "pset.default", 1);
	(void) nvlist_add_uint64(props, "pset.max", 65536);
	(void) nvlist_add_uint64(props, "pset.min", 1);
	pool_pset_mod = pool_cpu_mod = gethrtime();
	return (0);
}
279 
/*
 * Disable processor set plugin.
 *
 * Returns EBUSY if more than one cpu partition exists (cp_numparts > 1).
 * Otherwise frees the property list of every cpu returned by cpu_get()
 * for ids in [0, NCPU), restores the ALL_ZONES visibility token on the
 * default pset, marks the global zone's pset as ZONE_PS_INVAL, frees the
 * default pset's property list, and returns 0.
 *
 * The caller must hold pool_lock and be running in the global zone.
 */
int
pool_pset_disable(void)
{
	processorid_t cpuid;
	cpu_t *cpu;
	int error;

	ASSERT(pool_lock_held());
	ASSERT(INGLOBALZONE(curproc));

	mutex_enter(&cpu_lock);
	if (cp_numparts > 1) {	/* make sure only default pset is left */
		mutex_exit(&cpu_lock);
		return (EBUSY);
	}
	/*
	 * Remove all non-system CPU and processor set properties
	 */
	for (cpuid = 0; cpuid < NCPU; cpuid++) {
		if ((cpu = cpu_get(cpuid)) == NULL)
			continue;
		if (cpu->cpu_props != NULL) {
			(void) nvlist_free(cpu->cpu_props);
			cpu->cpu_props = NULL;
		}
	}

	/*
	 * We want to switch things such that everything is now visible
	 * to ALL_ZONES: first add the special "ALL_ZONES" token to the
	 * visibility list then remove individual zones.  There must
	 * only be the default pset active if pools are being disabled,
	 * so we only need to deal with it.
	 */
	error = zone_walk(pool_pset_zone_pset_set, (void *)ZONE_PS_INVAL);
	ASSERT(error == 0);
	pool_pset_visibility_add(PS_NONE, NULL);
	pool_pset_visibility_remove(PS_NONE, global_zone);
	/*
	 * pool_pset_enabled() will henceforth return B_FALSE.
	 */
	global_zone->zone_psetid = ZONE_PS_INVAL;
	mutex_exit(&cpu_lock);
	/* The default pset's properties are released outside cpu_lock. */
	if (pool_pset_default->pset_props != NULL) {
		nvlist_free(pool_pset_default->pset_props);
		pool_pset_default->pset_props = NULL;
	}
	return (0);
}
332 
333 /*
334  * Create new processor set and give it a temporary name.
335  */
336 int
337 pool_pset_create(psetid_t *id)
338 {
339 	char pset_name[40];
340 	pool_pset_t *pset;
341 	psetid_t psetid;
342 	int err;
343 
344 	ASSERT(pool_lock_held());
345 	if ((err = cpupart_create(&psetid)) != 0)
346 		return (err);
347 	pset = kmem_alloc(sizeof (pool_pset_t), KM_SLEEP);
348 	pset->pset_id = *id = psetid;
349 	pset->pset_npools = 0;
350 	(void) nvlist_alloc(&pset->pset_props, NV_UNIQUE_NAME, KM_SLEEP);
351 	(void) nvlist_add_int64(pset->pset_props, "pset.sys_id", psetid);
352 	(void) nvlist_add_byte(pset->pset_props, "pset.default", 0);
353 	pool_pset_mod = gethrtime();
354 	(void) snprintf(pset_name, sizeof (pset_name), "pset_%lld",
355 	    pool_pset_mod);
356 	(void) nvlist_add_string(pset->pset_props, "pset.name", pset_name);
357 	list_insert_tail(&pool_pset_list, pset);
358 	return (0);
359 }
360 
361 /*
362  * Destroy existing processor set.
363  */
364 int
365 pool_pset_destroy(psetid_t psetid)
366 {
367 	pool_pset_t *pset;
368 	int ret;
369 
370 	ASSERT(pool_lock_held());
371 
372 	if (psetid == PS_NONE)
373 		return (EINVAL);
374 	if ((pset = pool_lookup_pset_by_id(psetid)) == NULL)
375 		return (ESRCH);
376 	if (pset->pset_npools > 0) /* can't destroy associated psets */
377 		return (EBUSY);
378 	if ((ret = cpupart_destroy(pset->pset_id)) != 0)
379 		return (ret);
380 	(void) nvlist_free(pset->pset_props);
381 	list_remove(&pool_pset_list, pset);
382 	pool_pset_mod = gethrtime();
383 	kmem_free(pset, sizeof (pool_pset_t));
384 	return (0);
385 }
386 
387 /*
388  * Change the visibility of a pset (and all contained cpus) in a zone.
389  * A NULL zone argument implies the special ALL_ZONES token.
390  */
391 static void
392 pool_pset_visibility_change(psetid_t psetid, zone_t *zone, boolean_t add)
393 {
394 	zoneid_t zoneid = zone ? zone->zone_id : ALL_ZONES;
395 	cpupart_t *cp;
396 	cpu_t *c;
397 
398 	ASSERT(MUTEX_HELD(&cpu_lock));
399 	ASSERT(psetid != ZONE_PS_INVAL);
400 
401 	cp = cpupart_find(psetid);
402 	ASSERT(cp != NULL);
403 	if (cp->cp_kstat != NULL) {
404 		if (add)
405 			kstat_zone_add(cp->cp_kstat, zoneid);
406 		else
407 			kstat_zone_remove(cp->cp_kstat, zoneid);
408 	}
409 
410 	c = cpu_list;
411 	do {
412 		ASSERT(c != NULL);
413 		if (c->cpu_part == cp && !cpu_is_poweredoff(c)) {
414 			if (add)
415 				cpu_visibility_add(c, zone);
416 			else
417 				cpu_visibility_remove(c, zone);
418 		}
419 	} while ((c = c->cpu_next) != cpu_list);
420 }
421 
/*
 * Make the processor set visible to the zone.  A NULL value for
 * the zone means that the special ALL_ZONES token should be added to
 * the visibility list.
 *
 * Thin wrapper around pool_pset_visibility_change(), which asserts that
 * cpu_lock is held and that psetid is not ZONE_PS_INVAL.
 */
void
pool_pset_visibility_add(psetid_t psetid, zone_t *zone)
{
	pool_pset_visibility_change(psetid, zone, B_TRUE);
}
432 
/*
 * Remove zone's visibility into the processor set.  A NULL value for
 * the zone means that the special ALL_ZONES token should be removed
 * from the visibility list.
 *
 * Thin wrapper around pool_pset_visibility_change(), which asserts that
 * cpu_lock is held and that psetid is not ZONE_PS_INVAL.
 */
void
pool_pset_visibility_remove(psetid_t psetid, zone_t *zone)
{
	pool_pset_visibility_change(psetid, zone, B_FALSE);
}
443 
444 /*
445  * Quick way of seeing if pools are enabled (as far as processor sets are
446  * concerned) without holding pool_lock().
447  */
448 boolean_t
449 pool_pset_enabled(void)
450 {
451 	ASSERT(MUTEX_HELD(&cpu_lock));
452 
453 	return (zone_pset_get(global_zone) != ZONE_PS_INVAL);
454 }
455 
/*
 * Argument bundle handed to pool_pset_assoc_zone_cb() through zone_walk().
 */
struct assoc_zone_arg {
	poolid_t poolid;	/* pool whose pset association changed */
	psetid_t newpsetid;	/* pset the pool is now associated with */
};
460 
461 /*
462  * Callback function to update a zone's processor set visibility when
463  * a pool is associated with a processor set.
464  */
465 static int
466 pool_pset_assoc_zone_cb(zone_t *zone, void *arg)
467 {
468 	struct assoc_zone_arg *aza = arg;
469 	pool_t *pool;
470 	zoneid_t zoneid = zone->zone_id;
471 
472 	ASSERT(pool_lock_held());
473 	ASSERT(MUTEX_HELD(&cpu_lock));
474 
475 	if (zoneid == GLOBAL_ZONEID)
476 		return (0);
477 	pool = zone_pool_get(zone);
478 	if (pool->pool_id == aza->poolid)
479 		zone_pset_set(zone, aza->newpsetid);
480 	return (0);
481 }
482 
483 /*
484  * Associate pool with new processor set.
485  */
486 int
487 pool_pset_assoc(poolid_t poolid, psetid_t psetid)
488 {
489 	pool_t *pool;
490 	pool_pset_t *pset, *oldpset;
491 	int err = 0;
492 
493 	ASSERT(pool_lock_held());
494 
495 	if ((pool = pool_lookup_pool_by_id(poolid)) == NULL ||
496 	    (pset = pool_lookup_pset_by_id(psetid)) == NULL) {
497 		return (ESRCH);
498 	}
499 	if (pool->pool_pset->pset_id == psetid) {
500 		/*
501 		 * Already associated.
502 		 */
503 		return (0);
504 	}
505 
506 	/*
507 	 * Hang the new pset off the pool, and rebind all of the pool's
508 	 * processes to it.  If pool_do_bind fails, all processes will remain
509 	 * bound to the old set.
510 	 */
511 	oldpset = pool->pool_pset;
512 	pool->pool_pset = pset;
513 	err = pool_do_bind(pool, P_POOLID, poolid, POOL_BIND_PSET);
514 	if (err) {
515 		pool->pool_pset = oldpset;
516 	} else {
517 		struct assoc_zone_arg azarg;
518 
519 		/*
520 		 * Update zones' visibility to reflect changes.
521 		 */
522 		azarg.poolid = poolid;
523 		azarg.newpsetid = pset->pset_id;
524 		mutex_enter(&cpu_lock);
525 		err = zone_walk(pool_pset_assoc_zone_cb, &azarg);
526 		ASSERT(err == 0);
527 		mutex_exit(&cpu_lock);
528 
529 		oldpset->pset_npools--;
530 		pset->pset_npools++;
531 	}
532 	return (err);
533 }
534 
535 /*
536  * Transfer specified CPUs between processor sets.
537  */
538 int
539 pool_pset_xtransfer(psetid_t src, psetid_t dst, size_t size, id_t *ids)
540 {
541 	struct cpu *cpu;
542 	int ret = 0;
543 	int id;
544 
545 	ASSERT(pool_lock_held());
546 	ASSERT(INGLOBALZONE(curproc));
547 
548 	if (size == 0 || size > max_ncpus)	/* quick sanity check */
549 		return (EINVAL);
550 
551 	mutex_enter(&cpu_lock);
552 	for (id = 0; id < size; id++) {
553 		if ((cpu = cpu_get((processorid_t)ids[id])) == NULL ||
554 		    cpupart_query_cpu(cpu) != src) {
555 			ret = EINVAL;
556 			break;
557 		}
558 		if ((ret = cpupart_attach_cpu(dst, cpu, 1)) != 0)
559 			break;
560 	}
561 	mutex_exit(&cpu_lock);
562 	if (ret == 0)
563 		pool_pset_mod = gethrtime();
564 	return (ret);
565 }
566 
567 /*
568  * Bind process to processor set.  This should never fail because
569  * we should've done all preliminary checks before calling it.
570  */
571 void
572 pool_pset_bind(proc_t *p, psetid_t psetid, void *projbuf, void *zonebuf)
573 {
574 	kthread_t *t;
575 	int ret;
576 
577 	ASSERT(pool_lock_held());
578 	ASSERT(MUTEX_HELD(&cpu_lock));
579 	ASSERT(MUTEX_HELD(&pidlock));
580 	ASSERT(MUTEX_HELD(&p->p_lock));
581 
582 	if ((t = p->p_tlist) == NULL)
583 		return;
584 	do {
585 		ret = cpupart_bind_thread(t, psetid, 0, projbuf, zonebuf);
586 		ASSERT(ret == 0);
587 		t->t_bind_pset = psetid;
588 	} while ((t = t->t_forw) != p->p_tlist);
589 }
590 
/*
 * See the comment above pool_do_bind() for the semantics of the pset_bind_*()
 * functions.  These must be kept in sync with cpupart_move_thread, and
 * anything else that could fail a pool_pset_bind.
 *
 * procs is a NULL-terminated array of processes; pool supplies the target
 * processor set through pool->pool_pset.
 *
 * Returns non-zero errno on failure and zero on success:
 *	ENOTSUP	the target partition has no cpus (cp_cpulist == NULL)
 *	EPERM	secpolicy_setpriority() or prochasprocperm() refused
 *	other	the value returned by cpupart_movable_thread()
 * Iff successful, cpu_lock is held on return and weak binding is stopped.
 * Every failure path restarts weak binding and drops cpu_lock.
 */
int
pset_bind_start(proc_t **procs, pool_t *pool)
{
	cred_t *pcred;
	proc_t *p, **pp;
	kthread_t *t;
	cpupart_t *newpp;
	int ret;

	extern int cpupart_movable_thread(kthread_id_t, cpupart_t *, int);

	ASSERT(pool_lock_held());
	ASSERT(INGLOBALZONE(curproc));

	mutex_enter(&cpu_lock);
	weakbinding_stop();

	newpp = cpupart_find(pool->pool_pset->pset_id);
	ASSERT(newpp != NULL);
	/* Refuse to bind to a partition that has no cpus. */
	if (newpp->cp_cpulist == NULL) {
		weakbinding_start();
		mutex_exit(&cpu_lock);
		return (ENOTSUP);
	}

	pcred = crgetcred();

	/*
	 * Check for the PRIV_PROC_PRIOCNTL privilege that is required
	 * to enter and exit scheduling classes.  If other privileges
	 * are required by CL_ENTERCLASS/CL_CANEXIT types of routines
	 * in the future, this code will have to be updated.
	 */
	if (secpolicy_setpriority(pcred) != 0) {
		weakbinding_start();
		mutex_exit(&cpu_lock);
		crfree(pcred);
		return (EPERM);
	}

	for (pp = procs; (p = *pp) != NULL; pp++) {
		mutex_enter(&p->p_lock);
		/* Processes without threads have nothing to check. */
		if ((t = p->p_tlist) == NULL) {
			mutex_exit(&p->p_lock);
			continue;
		}
		/*
		 * Check our basic permissions to control this process.
		 */
		if (!prochasprocperm(p, curproc, pcred)) {
			mutex_exit(&p->p_lock);
			weakbinding_start();
			mutex_exit(&cpu_lock);
			crfree(pcred);
			return (EPERM);
		}
		do {
			/*
			 * Check that all threads can be moved to
			 * a new processor set.
			 */
			thread_lock(t);
			ret = cpupart_movable_thread(t, newpp, 0);
			thread_unlock(t);
			if (ret != 0) {
				mutex_exit(&p->p_lock);
				weakbinding_start();
				mutex_exit(&cpu_lock);
				crfree(pcred);
				return (ret);
			}
		} while ((t = t->t_forw) != p->p_tlist);
		mutex_exit(&p->p_lock);
	}
	crfree(pcred);
	return (0);	/* with cpu_lock held and weakbinding stopped */
}
676 
677 /*ARGSUSED*/
678 void
679 pset_bind_abort(proc_t **procs, pool_t *pool)
680 {
681 	mutex_exit(&cpu_lock);
682 }
683 
/*
 * Complete a bind operation begun by pset_bind_start(): restart weak
 * binding and drop cpu_lock, both of which pset_bind_start() leaves
 * stopped/held on success.
 */
void
pset_bind_finish(void)
{
	weakbinding_start();
	mutex_exit(&cpu_lock);
}
690 
/*
 * Table of known processor set properties: name, nvpair data type and
 * access flags.  It is passed to pool_propput_common() and
 * pool_proprm_common() by the pset property routines in this file.
 * The entry with a NULL name terminates the table.
 */
static pool_property_t pool_pset_props[] = {
	{ "pset.name",			DATA_TYPE_STRING,	PP_RDWR },
	{ "pset.comment",		DATA_TYPE_STRING,	PP_RDWR },
	{ "pset.sys_id",		DATA_TYPE_UINT64,	PP_READ },
	{ "pset.units",			DATA_TYPE_STRING,	PP_RDWR },
	{ "pset.default",		DATA_TYPE_BYTE,		PP_READ },
	{ "pset.min",			DATA_TYPE_UINT64,	PP_RDWR },
	{ "pset.max",			DATA_TYPE_UINT64,	PP_RDWR },
	{ "pset.size",			DATA_TYPE_UINT64,	PP_READ },
	{ "pset.load",			DATA_TYPE_UINT64,	PP_READ },
	{ "pset.poold.objectives",	DATA_TYPE_STRING,
	    PP_RDWR | PP_OPTIONAL },
	{ NULL,				0,			0 }
};
705 
/*
 * Table of known CPU properties: name, nvpair data type and access flags.
 * It is passed to pool_propput_common() and pool_proprm_common() by the
 * cpu property routines in this file.  The entry with a NULL name
 * terminates the table.
 */
static pool_property_t pool_cpu_props[] = {
	{ "cpu.sys_id",			DATA_TYPE_UINT64,	PP_READ },
	{ "cpu.comment",		DATA_TYPE_STRING,	PP_RDWR },
	{ "cpu.status",			DATA_TYPE_STRING,	PP_RDWR },
	{ "cpu.pinned",			DATA_TYPE_BYTE,
	    PP_RDWR | PP_OPTIONAL },
	{ NULL,				0,			0 }
};
714 
715 /*
716  * Put property on the specified processor set.
717  */
718 int
719 pool_pset_propput(psetid_t psetid, nvpair_t *pair)
720 {
721 	pool_pset_t *pset;
722 	int ret;
723 
724 	ASSERT(pool_lock_held());
725 
726 	if ((pset = pool_lookup_pset_by_id(psetid)) == NULL)
727 		return (ESRCH);
728 	ret = pool_propput_common(pset->pset_props, pair, pool_pset_props);
729 	if (ret == 0)
730 		pool_pset_mod = gethrtime();
731 	return (ret);
732 }
733 
734 /*
735  * Remove existing processor set property.
736  */
737 int
738 pool_pset_proprm(psetid_t psetid, char *name)
739 {
740 	pool_pset_t *pset;
741 	int ret;
742 
743 	ASSERT(pool_lock_held());
744 
745 	if ((pset = pool_lookup_pset_by_id(psetid)) == NULL)
746 		return (EINVAL);
747 	ret = pool_proprm_common(pset->pset_props, name, pool_pset_props);
748 	if (ret == 0)
749 		pool_pset_mod = gethrtime();
750 	return (ret);
751 }
752 
753 /*
754  * Put new CPU property.
755  * Handle special case of "cpu.status".
756  */
757 int
758 pool_cpu_propput(processorid_t cpuid, nvpair_t *pair)
759 {
760 	int ret = 0;
761 	cpu_t *cpu;
762 
763 	ASSERT(pool_lock_held());
764 	ASSERT(INGLOBALZONE(curproc));
765 
766 	if (nvpair_type(pair) == DATA_TYPE_STRING &&
767 	    strcmp(nvpair_name(pair), "cpu.status") == 0) {
768 		char *val;
769 		int status;
770 		int old_status;
771 		(void) nvpair_value_string(pair, &val);
772 		if (strcmp(val, PS_OFFLINE) == 0)
773 			status = P_OFFLINE;
774 		else if (strcmp(val, PS_ONLINE) == 0)
775 			status = P_ONLINE;
776 		else if (strcmp(val, PS_NOINTR) == 0)
777 			status = P_NOINTR;
778 		else if (strcmp(val, PS_FAULTED) == 0)
779 			status = P_FAULTED;
780 		else if (strcmp(val, PS_SPARE) == 0)
781 			status = P_SPARE;
782 		else
783 			return (EINVAL);
784 		ret = p_online_internal(cpuid, status, &old_status);
785 	} else {
786 		mutex_enter(&cpu_lock);
787 		if ((cpu = cpu_get(cpuid)) == NULL)
788 			ret = EINVAL;
789 		if (cpu->cpu_props == NULL) {
790 			(void) nvlist_alloc(&cpu->cpu_props,
791 			    NV_UNIQUE_NAME, KM_SLEEP);
792 			(void) nvlist_add_string(cpu->cpu_props,
793 			    "cpu.comment", "");
794 		}
795 		ret = pool_propput_common(cpu->cpu_props, pair, pool_cpu_props);
796 		if (ret == 0)
797 			pool_cpu_mod = gethrtime();
798 		mutex_exit(&cpu_lock);
799 	}
800 	return (ret);
801 }
802 
803 /*
804  * Remove existing CPU property.
805  */
806 int
807 pool_cpu_proprm(processorid_t cpuid, char *name)
808 {
809 	int ret;
810 	cpu_t *cpu;
811 
812 	ASSERT(pool_lock_held());
813 	ASSERT(INGLOBALZONE(curproc));
814 
815 	mutex_enter(&cpu_lock);
816 	if ((cpu = cpu_get(cpuid)) == NULL || cpu_is_poweredoff(cpu)) {
817 		ret = EINVAL;
818 	} else {
819 		if (cpu->cpu_props == NULL)
820 			ret = EINVAL;
821 		else
822 			ret = pool_proprm_common(cpu->cpu_props, name,
823 			    pool_cpu_props);
824 	}
825 	if (ret == 0)
826 		pool_cpu_mod = gethrtime();
827 	mutex_exit(&cpu_lock);
828 	return (ret);
829 }
830 
/*
 * This macro returns load average multiplied by 1000 w/o losing precision.
 *
 * The bits above the low 16 are scaled by 1000 as the integer part; the
 * low 16 bits are scaled by 1000 / 0xffff as the fractional part.
 *
 * The argument is fully parenthesized so that expressions such as
 * PSET_LOAD(a | b) or PSET_LOAD(x << 16) expand with the intended
 * precedence.  It is evaluated twice, so it must not have side effects.
 */
#define	PSET_LOAD(f)	\
	((((f) >> 16) * 1000) + ((((f) & 0xffff) * 1000) / 0xffff))
835 
/*
 * Take a snapshot of the current state of processor sets and CPUs,
 * pack it in the exacct format, and attach it to specified exacct record.
 *
 * For each pset on pool_pset_list a pset group is attached to eo_system
 * holding the pset id, one cpu group per cpu in the partition (cpu id plus
 * a packed copy of the cpu's properties with "cpu.sys_id" and "cpu.status"
 * added), and a packed copy of the pset's properties with "pset.size" and
 * "pset.load" added.  When called from a non-global zone only the pset
 * the zone is bound to is reported.
 *
 * Side effect: a cpu without a property list gets one created here,
 * seeded with an empty "cpu.comment".
 *
 * The caller must hold pool_lock.  Always returns 0; the results of the
 * nvlist and exacct calls are ignored.
 */
int
pool_pset_pack(ea_object_t *eo_system)
{
	ea_object_t *eo_pset, *eo_cpu;
	cpupart_t *cpupart;
	psetid_t mypsetid;
	pool_pset_t *pset;
	nvlist_t *nvl;
	size_t bufsz;
	cpu_t *cpu;
	char *buf;
	int ncpu;

	ASSERT(pool_lock_held());

	mutex_enter(&cpu_lock);
	mypsetid = zone_pset_get(curproc->p_zone);
	for (pset = list_head(&pool_pset_list); pset;
	    pset = list_next(&pool_pset_list, pset)) {
		psetid_t psetid = pset->pset_id;

		/* Non-global zones only see the pset they are bound to. */
		if (!INGLOBALZONE(curproc) && mypsetid != psetid)
			continue;
		cpupart = cpupart_find(psetid);
		ASSERT(cpupart != NULL);
		eo_pset = ea_alloc_group(EXT_GROUP |
		    EXC_LOCAL | EXD_GROUP_PSET);
		(void) ea_attach_item(eo_pset, &psetid, sizeof (id_t),
		    EXC_LOCAL | EXD_PSET_PSETID | EXT_UINT32);
		/*
		 * Pack info for all CPUs in this processor set.
		 */
		ncpu = 0;
		cpu = cpu_list;
		do {
			/*
			 * continue inside do/while jumps to the loop
			 * condition, so cpu still advances to cpu_next.
			 */
			if (cpu->cpu_part != cpupart)	/* not our pset */
				continue;
			ncpu++;
			eo_cpu = ea_alloc_group(EXT_GROUP
			    | EXC_LOCAL | EXD_GROUP_CPU);
			(void) ea_attach_item(eo_cpu, &cpu->cpu_id,
			    sizeof (processorid_t),
			    EXC_LOCAL | EXD_CPU_CPUID | EXT_UINT32);
			/* Create the cpu's property list on first use. */
			if (cpu->cpu_props == NULL) {
				(void) nvlist_alloc(&cpu->cpu_props,
				    NV_UNIQUE_NAME, KM_SLEEP);
				(void) nvlist_add_string(cpu->cpu_props,
				    "cpu.comment", "");
			}
			/*
			 * Work on a copy so the dynamic properties added
			 * below never land in the cpu's stored list.
			 */
			(void) nvlist_dup(cpu->cpu_props, &nvl, KM_SLEEP);
			(void) nvlist_add_int64(nvl, "cpu.sys_id", cpu->cpu_id);
			(void) nvlist_add_string(nvl, "cpu.status",
			    (char *)cpu_get_state_str(cpu));
			/* buf == NULL asks nvlist_pack() to allocate. */
			buf = NULL;
			bufsz = 0;
			(void) nvlist_pack(nvl, &buf, &bufsz,
			    NV_ENCODE_NATIVE, 0);
			(void) ea_attach_item(eo_cpu, buf, bufsz,
			    EXC_LOCAL | EXD_CPU_PROP | EXT_RAW);
			(void) nvlist_free(nvl);
			kmem_free(buf, bufsz);
			(void) ea_attach_to_group(eo_pset, eo_cpu);
		} while ((cpu = cpu->cpu_next) != cpu_list);

		/* Same copy-then-augment approach for the pset properties. */
		(void) nvlist_dup(pset->pset_props, &nvl, KM_SLEEP);
		(void) nvlist_add_uint64(nvl, "pset.size", ncpu);
		(void) nvlist_add_uint64(nvl, "pset.load",
		    (uint64_t)PSET_LOAD(cpupart->cp_hp_avenrun[0]));
		buf = NULL;
		bufsz = 0;
		(void) nvlist_pack(nvl, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
		(void) ea_attach_item(eo_pset, buf, bufsz,
		    EXC_LOCAL | EXD_PSET_PROP | EXT_RAW);
		(void) nvlist_free(nvl);
		kmem_free(buf, bufsz);

		(void) ea_attach_to_group(eo_system, eo_pset);
	}
	mutex_exit(&cpu_lock);
	return (0);
}
921 
922 /*
923  * Get dynamic property for processor sets.
924  * The only dynamic property currently implemented is "pset.load".
925  */
926 int
927 pool_pset_propget(psetid_t psetid, char *name, nvlist_t *nvl)
928 {
929 	cpupart_t *cpupart;
930 	pool_pset_t *pset;
931 	int ret = ESRCH;
932 
933 	ASSERT(pool_lock_held());
934 
935 	mutex_enter(&cpu_lock);
936 	pset = pool_lookup_pset_by_id(psetid);
937 	cpupart = cpupart_find(psetid);
938 	if (cpupart == NULL || pset == NULL) {
939 		mutex_exit(&cpu_lock);
940 		return (EINVAL);
941 	}
942 	if (strcmp(name, "pset.load") == 0)
943 		ret = nvlist_add_uint64(nvl, "pset.load",
944 		    (uint64_t)PSET_LOAD(cpupart->cp_hp_avenrun[0]));
945 	else
946 		ret = EINVAL;
947 	mutex_exit(&cpu_lock);
948 	return (ret);
949 }
950 
951 /*
952  * Get dynamic property for CPUs.
953  * The only dynamic property currently implemented is "cpu.status".
954  */
955 int
956 pool_cpu_propget(processorid_t cpuid, char *name, nvlist_t *nvl)
957 {
958 	int ret = ESRCH;
959 	cpu_t *cpu;
960 
961 	ASSERT(pool_lock_held());
962 
963 	mutex_enter(&cpu_lock);
964 	if ((cpu = cpu_get(cpuid)) == NULL) {
965 		mutex_exit(&cpu_lock);
966 		return (ESRCH);
967 	}
968 	if (strcmp(name, "cpu.status") == 0) {
969 		ret = nvlist_add_string(nvl, "cpu.status",
970 		    (char *)cpu_get_state_str(cpu));
971 	} else {
972 		ret = EINVAL;
973 	}
974 	mutex_exit(&cpu_lock);
975 	return (ret);
976 }
977