xref: /freebsd/sys/kern/kern_cpuset.c (revision e796cc77c586c2955b2f3940dbf4991b31e8d289)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
5  * All rights reserved.
6  *
7  * Copyright (c) 2008 Nokia Corporation
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice unmodified, this list of conditions, and the following
15  *    disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include "opt_ddb.h"
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/sysproto.h>
41 #include <sys/jail.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/mutex.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/refcount.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
51 #include <sys/syscallsubr.h>
52 #include <sys/capsicum.h>
53 #include <sys/cpuset.h>
54 #include <sys/domainset.h>
55 #include <sys/sx.h>
56 #include <sys/queue.h>
57 #include <sys/libkern.h>
58 #include <sys/limits.h>
59 #include <sys/bus.h>
60 #include <sys/interrupt.h>
61 #include <sys/vmmeter.h>
62 
63 #include <vm/uma.h>
64 #include <vm/vm.h>
65 #include <vm/vm_object.h>
66 #include <vm/vm_page.h>
67 #include <vm/vm_param.h>
68 #include <vm/vm_phys.h>
69 
70 #ifdef DDB
71 #include <ddb/ddb.h>
72 #endif /* DDB */
73 
74 /*
75  * cpusets provide a mechanism for creating and manipulating sets of
76  * processors for the purpose of constraining the scheduling of threads to
77  * specific processors.
78  *
79  * Each process belongs to an identified set; by default this is set 1.  Each
80  * thread may further restrict the cpus it may run on to a subset of this
81  * named set.  This creates an anonymous set which other threads and processes
82  * may not join by number.
83  *
84  * The named set is referred to herein as the 'base' set to avoid ambiguity.
85  * This set is usually a child of a 'root' set while the anonymous set may
86  * simply be referred to as a mask.  In the syscall api these are referred to
87  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
88  *
89  * Threads inherit their set from their creator whether it be anonymous or
90  * not.  This means that anonymous sets are immutable because they may be
91  * shared.  To modify an anonymous set a new set is created with the desired
92  * mask and the same parent as the existing anonymous set.  This gives the
93  * illusion of each thread having a private mask.
94  *
95  * Via the syscall apis a user may ask to retrieve or modify the root, base,
96  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
97  * modifies all numbered and anonymous child sets to comply with the new mask.
98  * Modifying a pid or tid's mask applies only to that tid, and the new mask
99  * must still lie within the assigned parent set.
100  *
101  * A thread may not be assigned to a group separate from other threads in
102  * the process.  This is to remove ambiguity when the setid is queried with
103  * a pid argument.  There is no other technical limitation.
104  *
105  * This somewhat complex arrangement is intended to make it easy for
106  * applications to query available processors and bind their threads to
107  * specific processors while also allowing administrators to dynamically
108  * reprovision by changing sets which apply to groups of processes.
109  *
110  * A simple application should not concern itself with sets at all, but
111  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
112  * meaning 'curthread'.  It may query available cpus for that tid with a
113  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
114  */
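
/*
 * Example (illustrative sketch, not compiled here): the simple application
 * described above, using the userland wrappers documented in
 * cpuset_getaffinity(2) and cpuset_setaffinity(2):
 *
 *	cpuset_t mask;
 *
 *	(query the cpus available to this thread's base set)
 *	cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask);
 *
 *	(pin the current thread to cpu 2, assuming cpu 2 is available)
 *	CPU_ZERO(&mask);
 *	CPU_SET(2, &mask);
 *	cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 */
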
115 static uma_zone_t cpuset_zone;
116 static uma_zone_t domainset_zone;
117 static struct mtx cpuset_lock;
118 static struct setlist cpuset_ids;
119 static struct domainlist cpuset_domains;
120 static struct unrhdr *cpuset_unr;
121 static struct cpuset *cpuset_zero, *cpuset_default;
122 
123 /* Return the size of cpuset_t at the kernel level */
124 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
125     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
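
/*
 * Example (hypothetical userland check; sketch only): the syscalls below
 * reject masks smaller than the kernel's cpuset_t, so a portable caller
 * may size its buffer from this sysctl:
 *
 *	int kernsize;
 *	size_t len = sizeof(kernsize);
 *	sysctlbyname("kern.sched.cpusetsize", &kernsize, &len, NULL, 0);
 */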
126 
127 cpuset_t *cpuset_root;
128 cpuset_t cpuset_domain[MAXMEMDOM];
129 
130 static int domainset_valid(const struct domainset *, const struct domainset *);
131 
132 /*
133  * Find the first non-anonymous set starting from 'set'.
134  */
135 static struct cpuset *
136 cpuset_getbase(struct cpuset *set)
137 {
138 
139 	if (set->cs_id == CPUSET_INVALID)
140 		set = set->cs_parent;
141 	return (set);
142 }
143 
144 /*
145  * Walks up the tree from 'set' to find the root.
146  */
147 static struct cpuset *
148 cpuset_getroot(struct cpuset *set)
149 {
150 
151 	while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
152 		set = set->cs_parent;
153 	return (set);
154 }
155 
156 /*
157  * Acquire a reference to a cpuset; all pointers must be tracked with refs.
158  */
159 struct cpuset *
160 cpuset_ref(struct cpuset *set)
161 {
162 
163 	refcount_acquire(&set->cs_ref);
164 	return (set);
165 }
166 
167 /*
168  * Walks up the tree from 'set' to find the root.  Returns the root
169  * referenced.
170  */
171 static struct cpuset *
172 cpuset_refroot(struct cpuset *set)
173 {
174 
175 	return (cpuset_ref(cpuset_getroot(set)));
176 }
177 
178 /*
179  * Find the first non-anonymous set starting from 'set'.  Returns this set
180  * referenced.  May return the passed in set with an extra ref if it is
181  * not anonymous.
182  */
183 static struct cpuset *
184 cpuset_refbase(struct cpuset *set)
185 {
186 
187 	return (cpuset_ref(cpuset_getbase(set)));
188 }
189 
190 /*
191  * Release a reference in a context where it is safe to allocate.
192  */
193 void
194 cpuset_rel(struct cpuset *set)
195 {
196 	cpusetid_t id;
197 
198 	if (refcount_release(&set->cs_ref) == 0)
199 		return;
200 	mtx_lock_spin(&cpuset_lock);
201 	LIST_REMOVE(set, cs_siblings);
202 	id = set->cs_id;
203 	if (id != CPUSET_INVALID)
204 		LIST_REMOVE(set, cs_link);
205 	mtx_unlock_spin(&cpuset_lock);
206 	cpuset_rel(set->cs_parent);
207 	uma_zfree(cpuset_zone, set);
208 	if (id != CPUSET_INVALID)
209 		free_unr(cpuset_unr, id);
210 }
211 
212 /*
213  * Deferred release must be used in a context where it is not safe to
214  * allocate/free.  This places any unreferenced sets on the list 'head'.
215  */
216 static void
217 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
218 {
219 
220 	if (refcount_release(&set->cs_ref) == 0)
221 		return;
222 	mtx_lock_spin(&cpuset_lock);
223 	LIST_REMOVE(set, cs_siblings);
224 	if (set->cs_id != CPUSET_INVALID)
225 		LIST_REMOVE(set, cs_link);
226 	LIST_INSERT_HEAD(head, set, cs_link);
227 	mtx_unlock_spin(&cpuset_lock);
228 }
229 
230 /*
231  * Complete a deferred release.  Removes the set from the list provided to
232  * cpuset_rel_defer.
233  */
234 static void
235 cpuset_rel_complete(struct cpuset *set)
236 {
237 	LIST_REMOVE(set, cs_link);
238 	cpuset_rel(set->cs_parent);
239 	uma_zfree(cpuset_zone, set);
240 }
241 
242 /*
243  * Find a set based on an id.  Returns it with a ref.
244  */
245 static struct cpuset *
246 cpuset_lookup(cpusetid_t setid, struct thread *td)
247 {
248 	struct cpuset *set;
249 
250 	if (setid == CPUSET_INVALID)
251 		return (NULL);
252 	mtx_lock_spin(&cpuset_lock);
253 	LIST_FOREACH(set, &cpuset_ids, cs_link)
254 		if (set->cs_id == setid)
255 			break;
256 	if (set)
257 		cpuset_ref(set);
258 	mtx_unlock_spin(&cpuset_lock);
259 
260 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
261 	if (set != NULL && jailed(td->td_ucred)) {
262 		struct cpuset *jset, *tset;
263 
264 		jset = td->td_ucred->cr_prison->pr_cpuset;
265 		for (tset = set; tset != NULL; tset = tset->cs_parent)
266 			if (tset == jset)
267 				break;
268 		if (tset == NULL) {
269 			cpuset_rel(set);
270 			set = NULL;
271 		}
272 	}
273 
274 	return (set);
275 }
276 
277 /*
278  * Create a set in the space provided in 'set' with the provided parameters.
279  * The set is returned with a single ref.  May return EDEADLK if the set
280  * will have no valid cpu based on restrictions from the parent.
281  */
282 static int
283 _cpuset_create(struct cpuset *set, struct cpuset *parent,
284     const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
285 {
286 
287 	if (domain == NULL)
288 		domain = parent->cs_domain;
289 	if (mask == NULL)
290 		mask = &parent->cs_mask;
291 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
292 		return (EDEADLK);
293 	/* The domain must be prepared ahead of time. */
294 	if (!domainset_valid(parent->cs_domain, domain))
295 		return (EDEADLK);
296 	CPU_COPY(mask, &set->cs_mask);
297 	LIST_INIT(&set->cs_children);
298 	refcount_init(&set->cs_ref, 1);
299 	set->cs_flags = 0;
300 	mtx_lock_spin(&cpuset_lock);
301 	set->cs_domain = domain;
302 	CPU_AND(&set->cs_mask, &parent->cs_mask);
303 	set->cs_id = id;
304 	set->cs_parent = cpuset_ref(parent);
305 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
306 	if (set->cs_id != CPUSET_INVALID)
307 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
308 	mtx_unlock_spin(&cpuset_lock);
309 
310 	return (0);
311 }
312 
313 /*
314  * Create a new non-anonymous set with the requested parent and mask.  May
315  * return failures if the mask is invalid or a new number cannot be
316  * allocated.
317  */
318 static int
319 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
320 {
321 	struct cpuset *set;
322 	cpusetid_t id;
323 	int error;
324 
325 	id = alloc_unr(cpuset_unr);
326 	if (id == -1)
327 		return (ENFILE);
328 	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
329 	error = _cpuset_create(set, parent, mask, NULL, id);
330 	if (error == 0)
331 		return (0);
332 	free_unr(cpuset_unr, id);
333 	uma_zfree(cpuset_zone, set);
334 
335 	return (error);
336 }
337 
338 static void
339 cpuset_freelist_add(struct setlist *list, int count)
340 {
341 	struct cpuset *set;
342 	int i;
343 
344 	for (i = 0; i < count; i++) {
345 		set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
346 		LIST_INSERT_HEAD(list, set, cs_link);
347 	}
348 }
349 
350 static void
351 cpuset_freelist_init(struct setlist *list, int count)
352 {
353 
354 	LIST_INIT(list);
355 	cpuset_freelist_add(list, count);
356 }
357 
358 static void
359 cpuset_freelist_free(struct setlist *list)
360 {
361 	struct cpuset *set;
362 
363 	while ((set = LIST_FIRST(list)) != NULL) {
364 		LIST_REMOVE(set, cs_link);
365 		uma_zfree(cpuset_zone, set);
366 	}
367 }
368 
369 static void
370 domainset_freelist_add(struct domainlist *list, int count)
371 {
372 	struct domainset *set;
373 	int i;
374 
375 	for (i = 0; i < count; i++) {
376 		set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
377 		LIST_INSERT_HEAD(list, set, ds_link);
378 	}
379 }
380 
381 static void
382 domainset_freelist_init(struct domainlist *list, int count)
383 {
384 
385 	LIST_INIT(list);
386 	domainset_freelist_add(list, count);
387 }
388 
389 static void
390 domainset_freelist_free(struct domainlist *list)
391 {
392 	struct domainset *set;
393 
394 	while ((set = LIST_FIRST(list)) != NULL) {
395 		LIST_REMOVE(set, ds_link);
396 		uma_zfree(domainset_zone, set);
397 	}
398 }
399 
400 /* Copy a domainset preserving mask, policy, and preferred domain. */
401 static void
402 domainset_copy(const struct domainset *from, struct domainset *to)
403 {
404 
405 	DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
406 	to->ds_policy = from->ds_policy;
407 	to->ds_prefer = from->ds_prefer;
408 }
409 
410 /* Return 1 if mask and policy are equal, otherwise 0. */
411 static int
412 domainset_equal(const struct domainset *one, const struct domainset *two)
413 {
414 
415 	return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
416 	    one->ds_policy == two->ds_policy &&
417 	    one->ds_prefer == two->ds_prefer);
418 }
419 
420 /* Return 1 if child is a valid subset of parent. */
421 static int
422 domainset_valid(const struct domainset *parent, const struct domainset *child)
423 {
424 	if (child->ds_policy != DOMAINSET_POLICY_PREFER)
425 		return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
426 	return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
427 }
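
/*
 * Example (illustrative): with a parent mask of {0,1}, a round-robin child
 * with mask {1} is a valid subset, while a PREFER child is valid only when
 * its ds_prefer domain is 0 or 1.
 */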
428 
429 static int
430 domainset_restrict(const struct domainset *parent,
431     const struct domainset *child)
432 {
433 	if (child->ds_policy != DOMAINSET_POLICY_PREFER)
434 		return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
435 	return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
436 }
437 
438 /*
439  * Look up or create a domainset.  The key is provided in ds_mask and
440  * ds_policy.  If the domainset does not yet exist, the storage in
441  * 'domain' is inserted.  Otherwise this storage is freed to the
442  * domainset_zone and the existing domainset is returned.
443  */
444 static struct domainset *
445 _domainset_create(struct domainset *domain, struct domainlist *freelist)
446 {
447 	struct domainset *ndomain;
448 
449 	mtx_lock_spin(&cpuset_lock);
450 	LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
451 		if (domainset_equal(ndomain, domain))
452 			break;
453 	/*
454 	 * If the domain does not yet exist we insert it and initialize
455 	 * various iteration helpers which are not part of the key.
456 	 */
457 	if (ndomain == NULL) {
458 		LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
459 		domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
460 		domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1;
461 	}
462 	mtx_unlock_spin(&cpuset_lock);
463 	if (ndomain == NULL)
464 		return (domain);
465 	if (freelist != NULL)
466 		LIST_INSERT_HEAD(freelist, domain, ds_link);
467 	else
468 		uma_zfree(domainset_zone, domain);
469 	return (ndomain);
471 }
472 
473 /*
474  * Create or look up a domainset based on the key held in 'domain'.
475  */
476 static struct domainset *
477 domainset_create(const struct domainset *domain)
478 {
479 	struct domainset *ndomain;
480 
481 	ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
482 	domainset_copy(domain, ndomain);
483 	return _domainset_create(ndomain, NULL);
484 }
485 
486 /*
487  * Update thread domainset pointers.
488  */
489 static void
490 domainset_notify(void)
491 {
492 	struct thread *td;
493 	struct proc *p;
494 
495 	sx_slock(&allproc_lock);
496 	FOREACH_PROC_IN_SYSTEM(p) {
497 		PROC_LOCK(p);
498 		if (p->p_state == PRS_NEW) {
499 			PROC_UNLOCK(p);
500 			continue;
501 		}
502 		FOREACH_THREAD_IN_PROC(p, td) {
503 			thread_lock(td);
504 			td->td_domain.dr_policy = td->td_cpuset->cs_domain;
505 			thread_unlock(td);
506 		}
507 		PROC_UNLOCK(p);
508 	}
509 	sx_sunlock(&allproc_lock);
510 	kernel_object->domain.dr_policy = cpuset_default->cs_domain;
511 }
512 
513 /*
514  * Create a new set that is a subset of a parent.
515  */
516 static struct domainset *
517 domainset_shadow(const struct domainset *pdomain,
518     const struct domainset *domain, struct domainlist *freelist)
519 {
520 	struct domainset *ndomain;
521 
522 	ndomain = LIST_FIRST(freelist);
523 	LIST_REMOVE(ndomain, ds_link);
524 
525 	/*
526 	 * Initialize the key from the request.
527 	 */
528 	domainset_copy(domain, ndomain);
529 
530 	/*
531 	 * Restrict the key by the parent.
532 	 */
533 	DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
534 
535 	return _domainset_create(ndomain, freelist);
536 }
537 
538 /*
539  * Recursively check for errors that would occur from applying mask to
540  * the tree of sets starting at 'set'.  Checks for sets that would become
541  * empty as well as RDONLY flags.
542  */
543 static int
544 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
545 {
546 	struct cpuset *nset;
547 	cpuset_t newmask;
548 	int error;
549 
550 	mtx_assert(&cpuset_lock, MA_OWNED);
551 	if (set->cs_flags & CPU_SET_RDONLY)
552 		return (EPERM);
553 	if (check_mask) {
554 		if (!CPU_OVERLAP(&set->cs_mask, mask))
555 			return (EDEADLK);
556 		CPU_COPY(&set->cs_mask, &newmask);
557 		CPU_AND(&newmask, mask);
558 	} else
559 		CPU_COPY(mask, &newmask);
560 	error = 0;
561 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
562 		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
563 			break;
564 	return (error);
565 }
566 
567 /*
568  * Applies the mask 'mask' without checking for empty sets or permissions.
569  */
570 static void
571 cpuset_update(struct cpuset *set, cpuset_t *mask)
572 {
573 	struct cpuset *nset;
574 
575 	mtx_assert(&cpuset_lock, MA_OWNED);
576 	CPU_AND(&set->cs_mask, mask);
577 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
578 		cpuset_update(nset, &set->cs_mask);
581 }
582 
583 /*
584  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
585  * mask to restrict all children in the tree.  Checks for validity before
586  * applying the changes.
587  */
588 static int
589 cpuset_modify(struct cpuset *set, cpuset_t *mask)
590 {
591 	struct cpuset *root;
592 	int error;
593 
594 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
595 	if (error)
596 		return (error);
597 	/*
598 	 * If we are called from within a jail we do not
599 	 * allow modifying the dedicated root cpuset of
600 	 * the jail, but child sets may still be changed.
602 	 */
603 	if (jailed(curthread->td_ucred) &&
604 	    set->cs_flags & CPU_SET_ROOT)
605 		return (EPERM);
606 	/*
607 	 * Verify that we have access to this set of
608 	 * cpus.
609 	 */
610 	root = cpuset_getroot(set);
611 	mtx_lock_spin(&cpuset_lock);
612 	if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
613 		error = EINVAL;
614 		goto out;
615 	}
616 	error = cpuset_testupdate(set, mask, 0);
617 	if (error)
618 		goto out;
619 	CPU_COPY(mask, &set->cs_mask);
620 	cpuset_update(set, mask);
621 out:
622 	mtx_unlock_spin(&cpuset_lock);
623 
624 	return (error);
625 }
626 
627 /*
628  * Recursively check for errors that would occur from applying the
629  * domainset to the tree of sets starting at 'set'.  Checks for sets that
630  * would become empty as well as RDONLY flags.
631  */
632 static int
633 cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
634     struct domainset *orig, int *count, int check_mask)
635 {
636 	struct cpuset *nset;
637 	struct domainset *domain;
638 	struct domainset newset;
639 	int error;
640 
641 	mtx_assert(&cpuset_lock, MA_OWNED);
642 	if (set->cs_flags & CPU_SET_RDONLY)
643 		return (EPERM);
644 	domain = set->cs_domain;
645 	domainset_copy(domain, &newset);
646 	if (!domainset_equal(domain, orig)) {
647 		if (!domainset_restrict(domain, dset))
648 			return (EDEADLK);
649 		DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
650 		/* Count the number of domains that are changing. */
651 		(*count)++;
652 	}
653 	error = 0;
654 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
655 		if ((error = cpuset_testupdate_domain(nset, &newset, domain,
656 		    count, 1)) != 0)
657 			break;
658 	return (error);
659 }
660 
661 /*
662  * Applies the domainset 'domain' without checking for empty sets or permissions.
663  */
664 static void
665 cpuset_update_domain(struct cpuset *set, struct domainset *domain,
666     struct domainset *orig, struct domainlist *domains)
667 {
668 	struct cpuset *nset;
669 
670 	mtx_assert(&cpuset_lock, MA_OWNED);
671 	/*
672 	 * If this domainset has changed from the parent we must calculate
673 	 * a new set.  Otherwise it simply inherits from the parent.  When
674 	 * we inherit from the parent we get a new mask and policy.  If the
675 	 * set is modified from the parent we keep the policy and only
676 	 * update the mask.
677 	 */
678 	if (set->cs_domain != orig) {
679 		orig = set->cs_domain;
680 		set->cs_domain = domainset_shadow(domain, orig, domains);
681 	} else
682 		set->cs_domain = domain;
683 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
684 		cpuset_update_domain(nset, set->cs_domain, orig, domains);
687 }
688 
689 /*
690  * Modify the set 'set' to use a copy of the domainset provided.  Apply
691  * this new domainset to restrict all children in the tree.  Checks for
692  * validity before applying the changes.
693  */
694 static int
695 cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
696 {
697 	struct domainlist domains;
698 	struct domainset temp;
699 	struct domainset *dset;
700 	struct cpuset *root;
701 	int ndomains, needed;
702 	int error;
703 
704 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
705 	if (error)
706 		return (error);
707 	/*
708 	 * In case we are called from within the jail
709 	 * we do not allow modifying the dedicated root
710 	 * cpuset of the jail but may still allow to
711 	 * change child sets.
712 	 */
713 	if (jailed(curthread->td_ucred) &&
714 	    set->cs_flags & CPU_SET_ROOT)
715 		return (EPERM);
716 	domainset_freelist_init(&domains, 0);
717 	domain = domainset_create(domain);
718 	ndomains = needed = 0;
719 	do {
720 		if (ndomains < needed) {
721 			domainset_freelist_add(&domains, needed - ndomains);
722 			ndomains = needed;
723 		}
724 		root = cpuset_getroot(set);
725 		mtx_lock_spin(&cpuset_lock);
726 		dset = root->cs_domain;
727 		/*
728 		 * Verify that we have access to this set of domains.
729 		 */
730 		if (root && !domainset_valid(dset, domain)) {
731 			error = EINVAL;
732 			goto out;
733 		}
734 		/*
735 		 * If applying prefer we keep the current set as the fallback.
736 		 */
737 		if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
738 			DOMAINSET_COPY(&set->cs_domain->ds_mask,
739 			    &domain->ds_mask);
740 		/*
741 		 * Determine whether we can apply this set of domains and
742 		 * how many new domain structures it will require.
743 		 */
744 		domainset_copy(domain, &temp);
745 		needed = 0;
746 		error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
747 		    &needed, 0);
748 		if (error)
749 			goto out;
750 	} while (ndomains < needed);
751 	dset = set->cs_domain;
752 	cpuset_update_domain(set, domain, dset, &domains);
753 out:
754 	mtx_unlock_spin(&cpuset_lock);
755 	domainset_freelist_free(&domains);
756 	if (error == 0)
757 		domainset_notify();
758 
759 	return (error);
760 }
761 
762 /*
763  * Resolve the 'which' parameter of several cpuset apis.
764  *
765  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
766  * checks for permission via p_cansched().
767  *
768  * For WHICH_SET returns a valid set with a new reference.
769  *
770  * -1 may be supplied for any argument to mean the current proc/thread or
771  * the base set of the current thread.  May fail with ESRCH/EPERM.
772  */
773 int
774 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
775     struct cpuset **setp)
776 {
777 	struct cpuset *set;
778 	struct thread *td;
779 	struct proc *p;
780 	int error;
781 
782 	*pp = p = NULL;
783 	*tdp = td = NULL;
784 	*setp = set = NULL;
785 	switch (which) {
786 	case CPU_WHICH_PID:
787 		if (id == -1) {
788 			PROC_LOCK(curproc);
789 			p = curproc;
790 			break;
791 		}
792 		if ((p = pfind(id)) == NULL)
793 			return (ESRCH);
794 		break;
795 	case CPU_WHICH_TID:
796 		if (id == -1) {
797 			PROC_LOCK(curproc);
798 			p = curproc;
799 			td = curthread;
800 			break;
801 		}
802 		td = tdfind(id, -1);
803 		if (td == NULL)
804 			return (ESRCH);
805 		p = td->td_proc;
806 		break;
807 	case CPU_WHICH_CPUSET:
808 		if (id == -1) {
809 			thread_lock(curthread);
810 			set = cpuset_refbase(curthread->td_cpuset);
811 			thread_unlock(curthread);
812 		} else
813 			set = cpuset_lookup(id, curthread);
814 		if (set) {
815 			*setp = set;
816 			return (0);
817 		}
818 		return (ESRCH);
819 	case CPU_WHICH_JAIL:
820 	{
821 		/* Find `set' for prison with given id. */
822 		struct prison *pr;
823 
824 		sx_slock(&allprison_lock);
825 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
826 		sx_sunlock(&allprison_lock);
827 		if (pr == NULL)
828 			return (ESRCH);
829 		cpuset_ref(pr->pr_cpuset);
830 		*setp = pr->pr_cpuset;
831 		mtx_unlock(&pr->pr_mtx);
832 		return (0);
833 	}
834 	case CPU_WHICH_IRQ:
835 	case CPU_WHICH_DOMAIN:
836 		return (0);
837 	default:
838 		return (EINVAL);
839 	}
840 	error = p_cansched(curthread, p);
841 	if (error) {
842 		PROC_UNLOCK(p);
843 		return (error);
844 	}
845 	if (td == NULL)
846 		td = FIRST_THREAD_IN_PROC(p);
847 	*pp = p;
848 	*tdp = td;
849 	return (0);
850 }
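
/*
 * Example (sketch of in-kernel use; error handling elided): resolving a
 * pid to a locked process before operating on its set:
 *
 *	error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &set);
 *	if (error == 0) {
 *		(p is returned PROC_LOCK'ed; use it, then drop the lock)
 *		PROC_UNLOCK(p);
 *	}
 */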
851 
852 static int
853 cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
854     const struct domainset *domain)
855 {
856 	struct cpuset *parent;
857 	struct domainset *dset;
858 
859 	parent = cpuset_getbase(set);
860 	/*
861 	 * If we are restricting a cpu mask it must be a subset of the
862 	 * parent or invalid CPUs have been specified.
863 	 */
864 	if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
865 		return (EINVAL);
866 
867 	/*
868 	 * If we are restricting a domain mask it must be a subset of the
869 	 * parent or invalid domains have been specified.
870 	 */
871 	dset = parent->cs_domain;
872 	if (domain != NULL && !domainset_valid(dset, domain))
873 		return (EINVAL);
874 
875 	return (0);
876 }
877 
878 /*
879  * Create an anonymous set with the provided mask in the space provided by
880  * 'nset'.  If the passed in set is anonymous we use its parent otherwise
881  * the new set is a child of 'set'.
882  */
883 static int
884 cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
885    const cpuset_t *mask, const struct domainset *domain,
886    struct setlist *cpusets, struct domainlist *domains)
887 {
888 	struct cpuset *parent;
889 	struct cpuset *nset;
890 	struct domainset *dset;
891 	struct domainset *d;
892 	int error;
893 
894 	error = cpuset_testshadow(set, mask, domain);
895 	if (error)
896 		return (error);
897 
898 	parent = cpuset_getbase(set);
899 	dset = parent->cs_domain;
900 	if (mask == NULL)
901 		mask = &set->cs_mask;
902 	if (domain != NULL)
903 		d = domainset_shadow(dset, domain, domains);
904 	else
905 		d = set->cs_domain;
906 	nset = LIST_FIRST(cpusets);
907 	error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID);
908 	if (error == 0) {
909 		LIST_REMOVE(nset, cs_link);
910 		*nsetp = nset;
911 	}
912 	return (error);
913 }
914 
915 static struct cpuset *
916 cpuset_update_thread(struct thread *td, struct cpuset *nset)
917 {
918 	struct cpuset *tdset;
919 
920 	tdset = td->td_cpuset;
921 	td->td_cpuset = nset;
922 	td->td_domain.dr_policy = nset->cs_domain;
923 	sched_affinity(td);
924 
925 	return (tdset);
926 }
927 
928 static int
929 cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
930     struct domainset *domain)
931 {
932 	struct cpuset *parent;
933 
934 	parent = cpuset_getbase(tdset);
935 	if (mask == NULL)
936 		mask = &tdset->cs_mask;
937 	if (domain == NULL)
938 		domain = tdset->cs_domain;
939 	return cpuset_testshadow(parent, mask, domain);
940 }
941 
942 static int
943 cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
944     struct domainset *domain, struct cpuset **nsetp,
945     struct setlist *freelist, struct domainlist *domainlist)
946 {
947 	struct cpuset *parent;
948 
949 	parent = cpuset_getbase(tdset);
950 	if (mask == NULL)
951 		mask = &tdset->cs_mask;
952 	if (domain == NULL)
953 		domain = tdset->cs_domain;
954 	return cpuset_shadow(parent, nsetp, mask, domain, freelist,
955 	    domainlist);
956 }
957 
958 static int
959 cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
960     cpuset_t *mask, struct domainset *domain)
961 {
962 	struct cpuset *parent;
963 
964 	parent = cpuset_getbase(tdset);
965 
966 	/*
967 	 * If the thread restricted its mask then apply that same
968 	 * restriction to the new set, otherwise take it wholesale.
969 	 */
970 	if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
971 		CPU_COPY(&tdset->cs_mask, mask);
972 		CPU_AND(mask, &set->cs_mask);
973 	} else
974 		CPU_COPY(&set->cs_mask, mask);
975 
976 	/*
977 	 * If the thread restricted the domain then we apply the
978 	 * restriction to the new set but retain the policy.
979 	 */
980 	if (tdset->cs_domain != parent->cs_domain) {
981 		domainset_copy(tdset->cs_domain, domain);
982 		DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
983 	} else
984 		domainset_copy(set->cs_domain, domain);
985 
986 	if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
987 		return (EDEADLK);
988 
989 	return (0);
990 }
991 
992 static int
993 cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
994 {
995 	struct domainset domain;
996 	cpuset_t mask;
997 
998 	if (tdset->cs_id != CPUSET_INVALID)
999 		return (0);
1000 	return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
1001 }
1002 
1003 static int
1004 cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
1005     struct cpuset **nsetp, struct setlist *freelist,
1006     struct domainlist *domainlist)
1007 {
1008 	struct domainset domain;
1009 	cpuset_t mask;
1010 	int error;
1011 
1012 	/*
1013 	 * If we're replacing the set of a thread that has not constrained
1014 	 * the original set, we can simply accept the new set.
1015 	 */
1016 	if (tdset->cs_id != CPUSET_INVALID) {
1017 		*nsetp = cpuset_ref(set);
1018 		return (0);
1019 	}
1020 	error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
1021 	if (error)
1022 		return (error);
1023 
1024 	return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
1025 	    domainlist);
1026 }
1027 
1028 /*
1029  * Handle three cases for updating an entire process.
1030  *
1031  * 1) Set is non-null.  This reparents all anonymous sets to the provided
1032  *    set and replaces all non-anonymous td_cpusets with the provided set.
1033  * 2) Mask is non-null.  This replaces or creates anonymous sets for every
1034  *    thread with the existing base as a parent.
1035  * 3) Domain is non-null.  This creates anonymous sets for every thread
1036  *    and replaces the domain set.
1037  *
1038  * This is overly complicated because we can't allocate while holding a
1039  * spinlock and spinlocks must be held while changing and examining thread
1040  * state.
1041  */
1042 static int
1043 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
1044     struct domainset *domain)
1045 {
1046 	struct setlist freelist;
1047 	struct setlist droplist;
1048 	struct domainlist domainlist;
1049 	struct cpuset *nset;
1050 	struct thread *td;
1051 	struct proc *p;
1052 	int threads;
1053 	int nfree;
1054 	int error;
1055 
1056 	/*
1057 	 * The algorithm requires two passes due to locking considerations.
1058 	 *
1059 	 * 1) Lookup the process and acquire the locks in the required order.
1060 	 * 2) If enough cpusets have not been allocated release the locks and
1061 	 *    allocate them.  Loop.
1062 	 */
1063 	cpuset_freelist_init(&freelist, 1);
1064 	domainset_freelist_init(&domainlist, 1);
1066 	LIST_INIT(&droplist);
1067 	nfree = 0;
1068 	for (;;) {
1069 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
1070 		if (error)
1071 			goto out;
1072 		if (nfree >= p->p_numthreads)
1073 			break;
1074 		threads = p->p_numthreads;
1075 		PROC_UNLOCK(p);
1076 		if (nfree < threads) {
1077 			cpuset_freelist_add(&freelist, threads - nfree);
1078 			domainset_freelist_add(&domainlist, threads - nfree);
1079 			nfree = threads;
1080 		}
1081 	}
1082 	PROC_LOCK_ASSERT(p, MA_OWNED);
1083 	/*
1084 	 * Now that the appropriate locks are held and we have enough cpusets,
1085 	 * make sure the operation will succeed before applying changes. The
1086 	 * proc lock prevents td_cpuset from changing between calls.
1087 	 */
1088 	error = 0;
1089 	FOREACH_THREAD_IN_PROC(p, td) {
1090 		thread_lock(td);
1091 		if (set != NULL)
1092 			error = cpuset_setproc_test_setthread(td->td_cpuset,
1093 			    set);
1094 		else
1095 			error = cpuset_setproc_test_maskthread(td->td_cpuset,
1096 			    mask, domain);
1097 		thread_unlock(td);
1098 		if (error)
1099 			goto unlock_out;
1100 	}
1101 	/*
1102 	 * Replace each thread's cpuset while using deferred release.  We
1103 	 * must do this because the thread lock must be held while operating
1104 	 * on the thread and this limits the type of operations allowed.
1105 	 */
1106 	FOREACH_THREAD_IN_PROC(p, td) {
1107 		thread_lock(td);
1108 		if (set != NULL)
1109 			error = cpuset_setproc_setthread(td->td_cpuset, set,
1110 			    &nset, &freelist, &domainlist);
1111 		else
1112 			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
1113 			    domain, &nset, &freelist, &domainlist);
1114 		if (error) {
1115 			thread_unlock(td);
1116 			break;
1117 		}
1118 		cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
1119 		thread_unlock(td);
1120 	}
1121 unlock_out:
1122 	PROC_UNLOCK(p);
1123 out:
1124 	while ((nset = LIST_FIRST(&droplist)) != NULL)
1125 		cpuset_rel_complete(nset);
1126 	cpuset_freelist_free(&freelist);
1127 	domainset_freelist_free(&domainlist);
1128 	return (error);
1129 }
1130 
1131 /*
1132  * Return a string representing a valid layout for a cpuset_t object.
1133  * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
1134  */
1135 char *
1136 cpusetobj_strprint(char *buf, const cpuset_t *set)
1137 {
1138 	char *tbuf;
1139 	size_t i, bytesp, bufsiz;
1140 
1141 	tbuf = buf;
1142 	bytesp = 0;
1143 	bufsiz = CPUSETBUFSIZ;
1144 
1145 	for (i = 0; i < (_NCPUWORDS - 1); i++) {
1146 		bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
1147 		bufsiz -= bytesp;
1148 		tbuf += bytesp;
1149 	}
1150 	snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
1151 	return (buf);
1152 }
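
/*
 * Example (illustrative, assuming 64-bit words): a 256-bit cpuset_t
 * holding CPUs 0 and 65 prints low word first as "1,2,0,0".
 */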
1153 
1154 /*
1155  * Build a valid cpuset_t object from a string representation.
1156  * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
1157  */
1158 int
1159 cpusetobj_strscan(cpuset_t *set, const char *buf)
1160 {
1161 	u_int nwords;
1162 	int i, ret;
1163 
1164 	if (strlen(buf) > CPUSETBUFSIZ - 1)
1165 		return (-1);
1166 
1167 	/* Allow passing a shorter version of the mask when necessary. */
1168 	nwords = 1;
1169 	for (i = 0; buf[i] != '\0'; i++)
1170 		if (buf[i] == ',')
1171 			nwords++;
1172 	if (nwords > _NCPUWORDS)
1173 		return (-1);
1174 
1175 	CPU_ZERO(set);
1176 	for (i = 0; i < (nwords - 1); i++) {
1177 		ret = sscanf(buf, "%lx,", &set->__bits[i]);
1178 		if (ret == 0 || ret == -1)
1179 			return (-1);
1180 		buf = strstr(buf, ",");
1181 		if (buf == NULL)
1182 			return (-1);
1183 		buf++;
1184 	}
1185 	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
1186 	if (ret == 0 || ret == -1)
1187 		return (-1);
1188 	return (0);
1189 }
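
/*
 * Example (illustrative, assuming 64-bit words): parsing the format
 * produced above; a shorter string is accepted and leaves the high
 * words zeroed:
 *
 *	cpuset_t set;
 *	if (cpusetobj_strscan(&set, "1,2") == 0)
 *		(CPUs 0 and 65 are now set)
 */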
1190 
1191 /*
1192  * Apply an anonymous mask or a domain to a single thread.
1193  */
1194 static int
1195 _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain)
1196 {
1197 	struct setlist cpusets;
1198 	struct domainlist domainlist;
1199 	struct cpuset *nset;
1200 	struct cpuset *set;
1201 	struct thread *td;
1202 	struct proc *p;
1203 	int error;
1204 
1205 	cpuset_freelist_init(&cpusets, 1);
1206 	domainset_freelist_init(&domainlist, domain != NULL);
1207 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
1208 	if (error)
1209 		goto out;
1210 	set = NULL;
1211 	thread_lock(td);
1212 	error = cpuset_shadow(td->td_cpuset, &nset, mask, domain,
1213 	    &cpusets, &domainlist);
1214 	if (error == 0)
1215 		set = cpuset_update_thread(td, nset);
1216 	thread_unlock(td);
1217 	PROC_UNLOCK(p);
1218 	if (set)
1219 		cpuset_rel(set);
1220 out:
1221 	cpuset_freelist_free(&cpusets);
1222 	domainset_freelist_free(&domainlist);
1223 	return (error);
1224 }
1225 
1226 /*
1227  * Apply an anonymous mask to a single thread.
1228  */
1229 int
1230 cpuset_setthread(lwpid_t id, cpuset_t *mask)
1231 {
1232 
1233 	return _cpuset_setthread(id, mask, NULL);
1234 }
1235 
1236 /*
1237  * Apply new cpumask to the ithread.
1238  */
1239 int
1240 cpuset_setithread(lwpid_t id, int cpu)
1241 {
1242 	struct setlist cpusets;
1243 	struct cpuset *nset, *rset;
1244 	struct cpuset *parent, *old_set;
1245 	struct thread *td;
1246 	struct proc *p;
1247 	cpusetid_t cs_id;
1248 	cpuset_t mask;
1249 	int error;
1250 
1251 	cpuset_freelist_init(&cpusets, 1);
1252 	rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
1253 	cs_id = CPUSET_INVALID;
1254 
1255 	CPU_ZERO(&mask);
1256 	if (cpu == NOCPU)
1257 		CPU_COPY(cpuset_root, &mask);
1258 	else
1259 		CPU_SET(cpu, &mask);
1260 
1261 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
1262 	if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID))
1263 		goto out;
1264 
1265 	/* cpuset_which() returns with PROC_LOCK held. */
1266 	old_set = td->td_cpuset;
1267 
1268 	if (cpu == NOCPU) {
1269 		nset = LIST_FIRST(&cpusets);
1270 		LIST_REMOVE(nset, cs_link);
1271 
1272 		/*
1273 		 * Roll back to the default set.  We're not using cpuset_shadow()
1274 		 * here because we could fail the CPU_SUBSET() check.  This can
1275 		 * happen if the default set does not contain all CPUs.
1276 		 */
1277 		error = _cpuset_create(nset, cpuset_default, &mask, NULL,
1278 		    CPUSET_INVALID);
1279 
1280 		goto applyset;
1281 	}
1282 
1283 	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
1284 	    old_set->cs_parent->cs_id == 1)) {
1286 		/*
1287 		 * The current set is either the default (1) or a shadowed
1288 		 * version of the default set.
1289 		 *
1290 		 * Allocate a new root set so that it can be shadowed with
1291 		 * any mask.
1292 		 */
1293 		error = _cpuset_create(rset, cpuset_zero,
1294 		    &cpuset_zero->cs_mask, NULL, cs_id);
1295 		if (error != 0) {
1296 			PROC_UNLOCK(p);
1297 			goto out;
1298 		}
1299 		rset->cs_flags |= CPU_SET_ROOT;
1300 		parent = rset;
1301 		rset = NULL;
1302 		cs_id = CPUSET_INVALID;
1303 	} else {
1304 		/* Assume the existing set was already allocated by a previous call. */
1305 		parent = old_set;
1306 		old_set = NULL;
1307 	}
1308 
1309 	error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL);
1310 applyset:
1311 	if (error == 0) {
1312 		thread_lock(td);
1313 		old_set = cpuset_update_thread(td, nset);
1314 		thread_unlock(td);
1315 	} else
1316 		old_set = NULL;
1317 	PROC_UNLOCK(p);
1318 	if (old_set != NULL)
1319 		cpuset_rel(old_set);
1320 out:
1321 	cpuset_freelist_free(&cpusets);
1322 	if (rset != NULL)
1323 		uma_zfree(cpuset_zone, rset);
1324 	if (cs_id != CPUSET_INVALID)
1325 		free_unr(cpuset_unr, cs_id);
1326 	return (error);
1327 }
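
/*
 * Example (illustrative): binding the ithread with tid 'id' to cpu 3,
 * then rolling it back to the default set:
 *
 *	error = cpuset_setithread(id, 3);
 *	error = cpuset_setithread(id, NOCPU);
 */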
1328 
1329 static struct domainset domainset0;
1330 
1331 void
1332 domainset_zero(void)
1333 {
1334 	struct domainset *dset;
1335 	int i;
1336 
1337 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
1338 
1339 	dset = &domainset0;
1340 	DOMAINSET_ZERO(&dset->ds_mask);
1341 	for (i = 0; i < vm_ndomains; i++)
1342 		DOMAINSET_SET(i, &dset->ds_mask);
1343 	dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
1344 	dset->ds_prefer = -1;
1345 	curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
1346 	kernel_object->domain.dr_policy = curthread->td_domain.dr_policy;
1347 }
1348 
1349 /*
1350  * Creates system-wide cpusets and the cpuset for thread0, including two
1351  * sets:
1352  *
1353  * 0 - The root set which should represent all valid processors in the
1354  *     system.  It is initially created with a mask of all processors
1355  *     because we don't know what processors are valid until cpuset_init()
1356  *     runs.  This set is immutable.
1357  * 1 - The default set which all processes are a member of until changed.
1358  *     This allows an administrator to move all threads off of given cpus to
1359  *     dedicate them to high-priority tasks, save power, etc.
1360  */
1361 struct cpuset *
1362 cpuset_thread0(void)
1363 {
1364 	struct cpuset *set;
1365 	int error;
1366 
1367 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
1368 	    NULL, NULL, UMA_ALIGN_PTR, 0);
1369 	domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
1370 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1371 
1372 	/*
1373 	 * Create the root system set for the whole machine.  Doesn't use
1374 	 * cpuset_create() due to NULL parent.
1375 	 */
1376 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
1377 	CPU_FILL(&set->cs_mask);
1378 	LIST_INIT(&set->cs_children);
1379 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
1380 	set->cs_ref = 1;
1381 	set->cs_flags = CPU_SET_ROOT;
1382 	set->cs_domain = &domainset0;
1383 	cpuset_zero = set;
1384 	cpuset_root = &set->cs_mask;
1385 
1386 	/*
1387 	 * Now derive a default, modifiable set from that to give out.
1388 	 */
1389 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
1390 	error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1);
1391 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
1392 	cpuset_default = set;
1393 
1394 	/*
1395 	 * Initialize the unit allocator. 0 and 1 are allocated above.
1396 	 */
1397 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
1398 
1399 	return (set);
1400 }
1401 
1402 /*
1403  * Create a cpuset as in cpuset_create(), but mark the
1404  * new 'set' as root.
1405  *
1406  * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
1407  * for that.
1408  *
1409  * On success, returns the set in *setp with a single reference.
1410  */
1411 int
1412 cpuset_create_root(struct prison *pr, struct cpuset **setp)
1413 {
1414 	struct cpuset *set;
1415 	int error;
1416 
1417 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
1418 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
1419 
1420 	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
1421 	if (error)
1422 		return (error);
1423 
1424 	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
1425 	    __func__, __LINE__));
1426 
1427 	/* Mark the set as root. */
1428 	set = *setp;
1429 	set->cs_flags |= CPU_SET_ROOT;
1430 
1431 	return (0);
1432 }
1433 
1434 int
1435 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
1436 {
1437 	int error;
1438 
1439 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
1440 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
1441 
1442 	cpuset_ref(set);
1443 	error = cpuset_setproc(p->p_pid, set, NULL, NULL);
1444 	if (error)
1445 		return (error);
1446 	cpuset_rel(set);
1447 	return (0);
1448 }
1449 
1450 /*
1451  * This is called once the final set of system cpus is known.  Modifies
1452  * the root set and all children and marks the root read-only.
1453  */
1454 static void
1455 cpuset_init(void *arg)
1456 {
1457 	cpuset_t mask;
1458 	int i;
1459 
1460 	mask = all_cpus;
1461 	if (cpuset_modify(cpuset_zero, &mask))
1462 		panic("Can't set initial cpuset mask");
1463 	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
1464 
1465 	/*
1466 	 * If MD code has not initialized per-domain cpusets, place all
1467 	 * CPUs in domain 0.
1468 	 */
1469 	for (i = 0; i < MAXMEMDOM; i++)
1470 		if (!CPU_EMPTY(&cpuset_domain[i]))
1471 			goto domains_set;
1472 	CPU_COPY(&all_cpus, &cpuset_domain[0]);
1473 domains_set:
1474 	return;
1475 }
1476 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
1477 
1478 #ifndef _SYS_SYSPROTO_H_
1479 struct cpuset_args {
1480 	cpusetid_t	*setid;
1481 };
1482 #endif
1483 int
1484 sys_cpuset(struct thread *td, struct cpuset_args *uap)
1485 {
1486 	struct cpuset *root;
1487 	struct cpuset *set;
1488 	int error;
1489 
1490 	thread_lock(td);
1491 	root = cpuset_refroot(td->td_cpuset);
1492 	thread_unlock(td);
1493 	error = cpuset_create(&set, root, &root->cs_mask);
1494 	cpuset_rel(root);
1495 	if (error)
1496 		return (error);
1497 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
1498 	if (error == 0)
1499 		error = cpuset_setproc(-1, set, NULL, NULL);
1500 	cpuset_rel(set);
1501 	return (error);
1502 }
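
/*
 * Example (illustrative sketch of the cpuset(2) call implemented above):
 *
 *	cpusetid_t setid;
 *	if (cpuset(&setid) == 0)
 *		(the whole process now runs in the new set 'setid')
 */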
1503 
1504 #ifndef _SYS_SYSPROTO_H_
1505 struct cpuset_setid_args {
1506 	cpuwhich_t	which;
1507 	id_t		id;
1508 	cpusetid_t	setid;
1509 };
1510 #endif
1511 int
1512 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
1513 {
1514 
1515 	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
1516 }
1517 
1518 int
1519 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
1520     id_t id, cpusetid_t setid)
1521 {
1522 	struct cpuset *set;
1523 	int error;
1524 
1525 	/*
1526 	 * Presently we only support per-process sets.
1527 	 */
1528 	if (which != CPU_WHICH_PID)
1529 		return (EINVAL);
1530 	set = cpuset_lookup(setid, td);
1531 	if (set == NULL)
1532 		return (ESRCH);
1533 	error = cpuset_setproc(id, set, NULL, NULL);
1534 	cpuset_rel(set);
1535 	return (error);
1536 }
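
/*
 * Example (illustrative): moving process 'pid' into the existing numbered
 * set 'setid' via cpuset_setid(2):
 *
 *	cpuset_setid(CPU_WHICH_PID, pid, setid);
 */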
1537 
1538 #ifndef _SYS_SYSPROTO_H_
1539 struct cpuset_getid_args {
1540 	cpulevel_t	level;
1541 	cpuwhich_t	which;
1542 	id_t		id;
1543 	cpusetid_t	*setid;
1544 };
1545 #endif
1546 int
1547 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
1548 {
1549 
1550 	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
1551 	    uap->setid));
1552 }
1553 
1554 int
1555 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
1556     id_t id, cpusetid_t *setid)
1557 {
1558 	struct cpuset *nset;
1559 	struct cpuset *set;
1560 	struct thread *ttd;
1561 	struct proc *p;
1562 	cpusetid_t tmpid;
1563 	int error;
1564 
1565 	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
1566 		return (EINVAL);
1567 	error = cpuset_which(which, id, &p, &ttd, &set);
1568 	if (error)
1569 		return (error);
1570 	switch (which) {
1571 	case CPU_WHICH_TID:
1572 	case CPU_WHICH_PID:
1573 		thread_lock(ttd);
1574 		set = cpuset_refbase(ttd->td_cpuset);
1575 		thread_unlock(ttd);
1576 		PROC_UNLOCK(p);
1577 		break;
1578 	case CPU_WHICH_CPUSET:
1579 	case CPU_WHICH_JAIL:
1580 		break;
1581 	case CPU_WHICH_IRQ:
1582 	case CPU_WHICH_DOMAIN:
1583 		return (EINVAL);
1584 	}
1585 	switch (level) {
1586 	case CPU_LEVEL_ROOT:
1587 		nset = cpuset_refroot(set);
1588 		cpuset_rel(set);
1589 		set = nset;
1590 		break;
1591 	case CPU_LEVEL_CPUSET:
1592 		break;
1593 	case CPU_LEVEL_WHICH:
1594 		break;
1595 	}
1596 	tmpid = set->cs_id;
1597 	cpuset_rel(set);
1598 	if (error == 0)
1599 		error = copyout(&tmpid, setid, sizeof(tmpid));
1600 
1601 	return (error);
1602 }
1603 
1604 #ifndef _SYS_SYSPROTO_H_
1605 struct cpuset_getaffinity_args {
1606 	cpulevel_t	level;
1607 	cpuwhich_t	which;
1608 	id_t		id;
1609 	size_t		cpusetsize;
1610 	cpuset_t	*mask;
1611 };
1612 #endif
1613 int
1614 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
1615 {
1616 
1617 	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
1618 	    uap->id, uap->cpusetsize, uap->mask));
1619 }
1620 
1621 int
1622 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
1623     id_t id, size_t cpusetsize, cpuset_t *maskp)
1624 {
1625 	struct thread *ttd;
1626 	struct cpuset *nset;
1627 	struct cpuset *set;
1628 	struct proc *p;
1629 	cpuset_t *mask;
1630 	int error;
1631 	size_t size;
1632 
1633 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
1634 		return (ERANGE);
1635 	/* In Capability mode, you can only get your own CPU set. */
1636 	if (IN_CAPABILITY_MODE(td)) {
1637 		if (level != CPU_LEVEL_WHICH)
1638 			return (ECAPMODE);
1639 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
1640 			return (ECAPMODE);
1641 		if (id != -1)
1642 			return (ECAPMODE);
1643 	}
1644 	size = cpusetsize;
1645 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
1646 	error = cpuset_which(which, id, &p, &ttd, &set);
1647 	if (error)
1648 		goto out;
1649 	switch (level) {
1650 	case CPU_LEVEL_ROOT:
1651 	case CPU_LEVEL_CPUSET:
1652 		switch (which) {
1653 		case CPU_WHICH_TID:
1654 		case CPU_WHICH_PID:
1655 			thread_lock(ttd);
1656 			set = cpuset_ref(ttd->td_cpuset);
1657 			thread_unlock(ttd);
1658 			break;
1659 		case CPU_WHICH_CPUSET:
1660 		case CPU_WHICH_JAIL:
1661 			break;
1662 		case CPU_WHICH_IRQ:
1663 		case CPU_WHICH_INTRHANDLER:
1664 		case CPU_WHICH_ITHREAD:
1665 		case CPU_WHICH_DOMAIN:
1666 			error = EINVAL;
1667 			goto out;
1668 		}
1669 		if (level == CPU_LEVEL_ROOT)
1670 			nset = cpuset_refroot(set);
1671 		else
1672 			nset = cpuset_refbase(set);
1673 		CPU_COPY(&nset->cs_mask, mask);
1674 		cpuset_rel(nset);
1675 		break;
1676 	case CPU_LEVEL_WHICH:
1677 		switch (which) {
1678 		case CPU_WHICH_TID:
1679 			thread_lock(ttd);
1680 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
1681 			thread_unlock(ttd);
1682 			break;
1683 		case CPU_WHICH_PID:
1684 			FOREACH_THREAD_IN_PROC(p, ttd) {
1685 				thread_lock(ttd);
1686 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
1687 				thread_unlock(ttd);
1688 			}
1689 			break;
1690 		case CPU_WHICH_CPUSET:
1691 		case CPU_WHICH_JAIL:
1692 			CPU_COPY(&set->cs_mask, mask);
1693 			break;
1694 		case CPU_WHICH_IRQ:
1695 		case CPU_WHICH_INTRHANDLER:
1696 		case CPU_WHICH_ITHREAD:
1697 			error = intr_getaffinity(id, which, mask);
1698 			break;
1699 		case CPU_WHICH_DOMAIN:
1700 			if (id < 0 || id >= MAXMEMDOM)
1701 				error = ESRCH;
1702 			else
1703 				CPU_COPY(&cpuset_domain[id], mask);
1704 			break;
1705 		}
1706 		break;
1707 	default:
1708 		error = EINVAL;
1709 		break;
1710 	}
1711 	if (set)
1712 		cpuset_rel(set);
1713 	if (p)
1714 		PROC_UNLOCK(p);
1715 	if (error == 0)
1716 		error = copyout(mask, maskp, size);
1717 out:
1718 	free(mask, M_TEMP);
1719 	return (error);
1720 }
1721 
1722 #ifndef _SYS_SYSPROTO_H_
1723 struct cpuset_setaffinity_args {
1724 	cpulevel_t	level;
1725 	cpuwhich_t	which;
1726 	id_t		id;
1727 	size_t		cpusetsize;
1728 	const cpuset_t	*mask;
1729 };
1730 #endif
1731 int
1732 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
1733 {
1734 
1735 	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
1736 	    uap->id, uap->cpusetsize, uap->mask));
1737 }
1738 
1739 int
1740 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
1741     id_t id, size_t cpusetsize, const cpuset_t *maskp)
1742 {
1743 	struct cpuset *nset;
1744 	struct cpuset *set;
1745 	struct thread *ttd;
1746 	struct proc *p;
1747 	cpuset_t *mask;
1748 	int error;
1749 
1750 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
1751 		return (ERANGE);
1752 	/* In Capability mode, you can only set your own CPU set. */
1753 	if (IN_CAPABILITY_MODE(td)) {
1754 		if (level != CPU_LEVEL_WHICH)
1755 			return (ECAPMODE);
1756 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
1757 			return (ECAPMODE);
1758 		if (id != -1)
1759 			return (ECAPMODE);
1760 	}
1761 	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
1762 	error = copyin(maskp, mask, cpusetsize);
1763 	if (error)
1764 		goto out;
1765 	/*
1766 	 * Verify that no high bits are set.
1767 	 */
1768 	if (cpusetsize > sizeof(cpuset_t)) {
1769 		char *end;
1770 		char *cp;
1771 
1772 		end = cp = (char *)&mask->__bits;
1773 		end += cpusetsize;
1774 		cp += sizeof(cpuset_t);
1775 		while (cp != end)
1776 			if (*cp++ != 0) {
1777 				error = EINVAL;
1778 				goto out;
1779 			}
1780 
1781 	}
1782 	switch (level) {
1783 	case CPU_LEVEL_ROOT:
1784 	case CPU_LEVEL_CPUSET:
1785 		error = cpuset_which(which, id, &p, &ttd, &set);
1786 		if (error)
1787 			break;
1788 		switch (which) {
1789 		case CPU_WHICH_TID:
1790 		case CPU_WHICH_PID:
1791 			thread_lock(ttd);
1792 			set = cpuset_ref(ttd->td_cpuset);
1793 			thread_unlock(ttd);
1794 			PROC_UNLOCK(p);
1795 			break;
1796 		case CPU_WHICH_CPUSET:
1797 		case CPU_WHICH_JAIL:
1798 			break;
1799 		case CPU_WHICH_IRQ:
1800 		case CPU_WHICH_INTRHANDLER:
1801 		case CPU_WHICH_ITHREAD:
1802 		case CPU_WHICH_DOMAIN:
1803 			error = EINVAL;
1804 			goto out;
1805 		}
1806 		if (level == CPU_LEVEL_ROOT)
1807 			nset = cpuset_refroot(set);
1808 		else
1809 			nset = cpuset_refbase(set);
1810 		error = cpuset_modify(nset, mask);
1811 		cpuset_rel(nset);
1812 		cpuset_rel(set);
1813 		break;
1814 	case CPU_LEVEL_WHICH:
1815 		switch (which) {
1816 		case CPU_WHICH_TID:
1817 			error = cpuset_setthread(id, mask);
1818 			break;
1819 		case CPU_WHICH_PID:
1820 			error = cpuset_setproc(id, NULL, mask, NULL);
1821 			break;
1822 		case CPU_WHICH_CPUSET:
1823 		case CPU_WHICH_JAIL:
1824 			error = cpuset_which(which, id, &p, &ttd, &set);
1825 			if (error == 0) {
1826 				error = cpuset_modify(set, mask);
1827 				cpuset_rel(set);
1828 			}
1829 			break;
1830 		case CPU_WHICH_IRQ:
1831 		case CPU_WHICH_INTRHANDLER:
1832 		case CPU_WHICH_ITHREAD:
1833 			error = intr_setaffinity(id, which, mask);
1834 			break;
1835 		default:
1836 			error = EINVAL;
1837 			break;
1838 		}
1839 		break;
1840 	default:
1841 		error = EINVAL;
1842 		break;
1843 	}
1844 out:
1845 	free(mask, M_TEMP);
1846 	return (error);
1847 }
1848 
1849 #ifndef _SYS_SYSPROTO_H_
1850 struct cpuset_getdomain_args {
1851 	cpulevel_t	level;
1852 	cpuwhich_t	which;
1853 	id_t		id;
1854 	size_t		domainsetsize;
1855 	domainset_t	*mask;
1856 	int 		*policy;
1857 };
1858 #endif
1859 int
1860 sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap)
1861 {
1862 
1863 	return (kern_cpuset_getdomain(td, uap->level, uap->which,
1864 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
1865 }
1866 
1867 int
1868 kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
1869     id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp)
1870 {
1871 	struct domainset outset;
1872 	struct thread *ttd;
1873 	struct cpuset *nset;
1874 	struct cpuset *set;
1875 	struct domainset *dset;
1876 	struct proc *p;
1877 	domainset_t *mask;
1878 	int error;
1879 
1880 	if (domainsetsize < sizeof(domainset_t) ||
1881 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
1882 		return (ERANGE);
1883 	/* In Capability mode, you can only get your own domain set. */
1884 	if (IN_CAPABILITY_MODE(td)) {
1885 		if (level != CPU_LEVEL_WHICH)
1886 			return (ECAPMODE);
1887 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
1888 			return (ECAPMODE);
1889 		if (id != -1)
1890 			return (ECAPMODE);
1891 	}
1892 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
1893 	bzero(&outset, sizeof(outset));
1894 	error = cpuset_which(which, id, &p, &ttd, &set);
1895 	if (error)
1896 		goto out;
1897 	switch (level) {
1898 	case CPU_LEVEL_ROOT:
1899 	case CPU_LEVEL_CPUSET:
1900 		switch (which) {
1901 		case CPU_WHICH_TID:
1902 		case CPU_WHICH_PID:
1903 			thread_lock(ttd);
1904 			set = cpuset_ref(ttd->td_cpuset);
1905 			thread_unlock(ttd);
1906 			break;
1907 		case CPU_WHICH_CPUSET:
1908 		case CPU_WHICH_JAIL:
1909 			break;
1910 		case CPU_WHICH_IRQ:
1911 		case CPU_WHICH_INTRHANDLER:
1912 		case CPU_WHICH_ITHREAD:
1913 		case CPU_WHICH_DOMAIN:
1914 			error = EINVAL;
1915 			goto out;
1916 		}
1917 		if (level == CPU_LEVEL_ROOT)
1918 			nset = cpuset_refroot(set);
1919 		else
1920 			nset = cpuset_refbase(set);
1921 		domainset_copy(nset->cs_domain, &outset);
1922 		cpuset_rel(nset);
1923 		break;
1924 	case CPU_LEVEL_WHICH:
1925 		switch (which) {
1926 		case CPU_WHICH_TID:
1927 			thread_lock(ttd);
1928 			domainset_copy(ttd->td_cpuset->cs_domain, &outset);
1929 			thread_unlock(ttd);
1930 			break;
1931 		case CPU_WHICH_PID:
1932 			FOREACH_THREAD_IN_PROC(p, ttd) {
1933 				thread_lock(ttd);
1934 				dset = ttd->td_cpuset->cs_domain;
1935 				/* Show all domains in the proc. */
1936 				DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask);
1937 				/* Last policy wins. */
1938 				outset.ds_policy = dset->ds_policy;
1939 				outset.ds_prefer = dset->ds_prefer;
1940 				thread_unlock(ttd);
1941 			}
1942 			break;
1943 		case CPU_WHICH_CPUSET:
1944 		case CPU_WHICH_JAIL:
1945 			domainset_copy(set->cs_domain, &outset);
1946 			break;
1947 		case CPU_WHICH_IRQ:
1948 		case CPU_WHICH_INTRHANDLER:
1949 		case CPU_WHICH_ITHREAD:
1950 		case CPU_WHICH_DOMAIN:
1951 			error = EINVAL;
1952 			break;
1953 		}
1954 		break;
1955 	default:
1956 		error = EINVAL;
1957 		break;
1958 	}
1959 	if (set)
1960 		cpuset_rel(set);
1961 	if (p)
1962 		PROC_UNLOCK(p);
1963 	/*
1964 	 * Translate prefer into a set containing only the preferred domain,
1965 	 * not the entire fallback set.
1966 	 */
1967 	if (outset.ds_policy == DOMAINSET_POLICY_PREFER) {
1968 		DOMAINSET_ZERO(&outset.ds_mask);
1969 		DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask);
1970 	}
1971 	DOMAINSET_COPY(&outset.ds_mask, mask);
1972 	if (error == 0)
1973 		error = copyout(mask, maskp, domainsetsize);
1974 	if (error == 0)
1975 		if (suword32(policyp, outset.ds_policy) != 0)
1976 			error = EFAULT;
1977 out:
1978 	free(mask, M_TEMP);
1979 	return (error);
1980 }
1981 
1982 #ifndef _SYS_SYSPROTO_H_
1983 struct cpuset_setdomain_args {
1984 	cpulevel_t	level;
1985 	cpuwhich_t	which;
1986 	id_t		id;
1987 	size_t		domainsetsize;
1988 	domainset_t	*mask;
1989 	int 		policy;
1990 };
1991 #endif
1992 int
1993 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
1994 {
1995 
1996 	return (kern_cpuset_setdomain(td, uap->level, uap->which,
1997 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
1998 }
1999 
2000 int
2001 kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
2002     id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
2003 {
2004 	struct cpuset *nset;
2005 	struct cpuset *set;
2006 	struct thread *ttd;
2007 	struct proc *p;
2008 	struct domainset domain;
2009 	domainset_t *mask;
2010 	int error;
2011 
2012 	if (domainsetsize < sizeof(domainset_t) ||
2013 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
2014 		return (ERANGE);
2015 	/* In Capability mode, you can only set your own domain set. */
2016 	if (IN_CAPABILITY_MODE(td)) {
2017 		if (level != CPU_LEVEL_WHICH)
2018 			return (ECAPMODE);
2019 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
2020 			return (ECAPMODE);
2021 		if (id != -1)
2022 			return (ECAPMODE);
2023 	}
2024 	memset(&domain, 0, sizeof(domain));
2025 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
2026 	error = copyin(maskp, mask, domainsetsize);
2027 	if (error)
2028 		goto out;
2029 	/*
2030 	 * Verify that no high bits are set.
2031 	 */
2032 	if (domainsetsize > sizeof(domainset_t)) {
2033 		char *end;
2034 		char *cp;
2035 
2036 		end = cp = (char *)&mask->__bits;
2037 		end += domainsetsize;
2038 		cp += sizeof(domainset_t);
2039 		while (cp != end)
2040 			if (*cp++ != 0) {
2041 				error = EINVAL;
2042 				goto out;
2043 			}
2044 
2045 	}
2046 	DOMAINSET_COPY(mask, &domain.ds_mask);
2047 	domain.ds_policy = policy;
2048 	if (policy <= DOMAINSET_POLICY_INVALID ||
2049 	    policy > DOMAINSET_POLICY_MAX) {
2050 		error = EINVAL;
		goto out;
	}
2051 
2052 	/* Translate preferred policy into a mask and fallback. */
2053 	if (policy == DOMAINSET_POLICY_PREFER) {
2054 		/* Only support a single preferred domain. */
2055 		if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
2056 			error = EINVAL;
			goto out;
		}
2057 		domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
2058 		/* This will be constrained by domainset_shadow(). */
2059 		DOMAINSET_FILL(&domain.ds_mask);
2060 	}
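
	/*
	 * Example (illustrative): a request with only domain 2 set and
	 * DOMAINSET_POLICY_PREFER yields ds_prefer == 2 with a full
	 * ds_mask, which is later narrowed by the parent via
	 * domainset_shadow().
	 */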
2061 
2062 	switch (level) {
2063 	case CPU_LEVEL_ROOT:
2064 	case CPU_LEVEL_CPUSET:
2065 		error = cpuset_which(which, id, &p, &ttd, &set);
2066 		if (error)
2067 			break;
2068 		switch (which) {
2069 		case CPU_WHICH_TID:
2070 		case CPU_WHICH_PID:
2071 			thread_lock(ttd);
2072 			set = cpuset_ref(ttd->td_cpuset);
2073 			thread_unlock(ttd);
2074 			PROC_UNLOCK(p);
2075 			break;
2076 		case CPU_WHICH_CPUSET:
2077 		case CPU_WHICH_JAIL:
2078 			break;
2079 		case CPU_WHICH_IRQ:
2080 		case CPU_WHICH_INTRHANDLER:
2081 		case CPU_WHICH_ITHREAD:
2082 		case CPU_WHICH_DOMAIN:
2083 			error = EINVAL;
2084 			goto out;
2085 		}
2086 		if (level == CPU_LEVEL_ROOT)
2087 			nset = cpuset_refroot(set);
2088 		else
2089 			nset = cpuset_refbase(set);
2090 		error = cpuset_modify_domain(nset, &domain);
2091 		cpuset_rel(nset);
2092 		cpuset_rel(set);
2093 		break;
2094 	case CPU_LEVEL_WHICH:
2095 		switch (which) {
2096 		case CPU_WHICH_TID:
2097 			error = _cpuset_setthread(id, NULL, &domain);
2098 			break;
2099 		case CPU_WHICH_PID:
2100 			error = cpuset_setproc(id, NULL, NULL, &domain);
2101 			break;
2102 		case CPU_WHICH_CPUSET:
2103 		case CPU_WHICH_JAIL:
2104 			error = cpuset_which(which, id, &p, &ttd, &set);
2105 			if (error == 0) {
2106 				error = cpuset_modify_domain(set, &domain);
2107 				cpuset_rel(set);
2108 			}
2109 			break;
2110 		case CPU_WHICH_IRQ:
2111 		case CPU_WHICH_INTRHANDLER:
2112 		case CPU_WHICH_ITHREAD:
2113 		default:
2114 			error = EINVAL;
2115 			break;
2116 		}
2117 		break;
2118 	default:
2119 		error = EINVAL;
2120 		break;
2121 	}
2122 out:
2123 	free(mask, M_TEMP);
2124 	return (error);
2125 }
2126 
2127 #ifdef DDB
2128 BITSET_DEFINE(bitset, 1);
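/*
 * Note: callers cast wider set types (cpuset_t, domainset_t) to this
 * one-word 'struct bitset'; the explicit 'size' argument bounds the scan
 * so only the caller's real storage is read.
 */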
2129 static void
2130 ddb_display_bitset(const struct bitset *set, int size)
2131 {
2132 	int bit, once;
2133 
2134 	for (once = 0, bit = 0; bit < size; bit++) {
2135 		if (CPU_ISSET(bit, set)) {
2136 			if (once == 0) {
2137 				db_printf("%d", bit);
2138 				once = 1;
2139 			} else
2140 				db_printf(",%d", bit);
2141 		}
2142 	}
2143 	if (once == 0)
2144 		db_printf("<none>");
2145 }
2146 
2147 void
2148 ddb_display_cpuset(const cpuset_t *set)
2149 {
2150 	ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE);
2151 }
2152 
2153 static void
2154 ddb_display_domainset(const domainset_t *set)
2155 {
2156 	ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE);
2157 }
2158 
2159 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
2160 {
2161 	struct cpuset *set;
2162 
2163 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
2164 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
2165 		    set, set->cs_id, set->cs_ref, set->cs_flags,
2166 		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
2167 		db_printf("  cpu mask=");
2168 		ddb_display_cpuset(&set->cs_mask);
2169 		db_printf("\n");
2170 		db_printf("  domain policy %d prefer %d mask=",
2171 		    set->cs_domain->ds_policy, set->cs_domain->ds_prefer);
2172 		ddb_display_domainset(&set->cs_domain->ds_mask);
2173 		db_printf("\n");
2174 		if (db_pager_quit)
2175 			break;
2176 	}
2177 }
2178 
2179 DB_SHOW_COMMAND(domainsets, db_show_domainsets)
2180 {
2181 	struct domainset *set;
2182 
2183 	LIST_FOREACH(set, &cpuset_domains, ds_link) {
2184 		db_printf("set=%p policy %d prefer %d cnt %d max %d\n",
2185 		    set, set->ds_policy, set->ds_prefer, set->ds_cnt,
2186 		    set->ds_max);
2187 		db_printf("  mask =");
2188 		ddb_display_domainset(&set->ds_mask);
2189 		db_printf("\n");
2190 	}
2191 }
2192 #endif /* DDB */
2193