xref: /freebsd/sys/kern/kern_cpuset.c (revision 50696a6e8cbfdbf4a0d00f2f85f1951aa0d8e23d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
5  * All rights reserved.
6  *
7  * Copyright (c) 2008 Nokia Corporation
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice unmodified, this list of conditions, and the following
15  *    disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include "opt_ddb.h"
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/sysctl.h>
41 #include <sys/ctype.h>
42 #include <sys/sysproto.h>
43 #include <sys/jail.h>
44 #include <sys/kernel.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mutex.h>
48 #include <sys/priv.h>
49 #include <sys/proc.h>
50 #include <sys/refcount.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/syscallsubr.h>
54 #include <sys/capsicum.h>
55 #include <sys/cpuset.h>
56 #include <sys/domainset.h>
57 #include <sys/sx.h>
58 #include <sys/queue.h>
59 #include <sys/libkern.h>
60 #include <sys/limits.h>
61 #include <sys/bus.h>
62 #include <sys/interrupt.h>
63 #include <sys/vmmeter.h>
64 
65 #include <vm/uma.h>
66 #include <vm/vm.h>
67 #include <vm/vm_object.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_pageout.h>
70 #include <vm/vm_extern.h>
71 #include <vm/vm_param.h>
72 #include <vm/vm_phys.h>
73 #include <vm/vm_pagequeue.h>
74 
75 #ifdef DDB
76 #include <ddb/ddb.h>
77 #endif /* DDB */
78 
79 /*
80  * cpusets provide a mechanism for creating and manipulating sets of
81  * processors for the purpose of constraining the scheduling of threads to
82  * specific processors.
83  *
84  * Each process belongs to an identified set; by default this is set 1.  Each
85  * thread may further restrict the cpus it may run on to a subset of this
86  * named set.  This creates an anonymous set which other threads and processes
87  * may not join by number.
88  *
89  * The named set is referred to herein as the 'base' set to avoid ambiguity.
90  * This set is usually a child of a 'root' set while the anonymous set may
91  * simply be referred to as a mask.  In the syscall api these are referred to
92  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
93  *
94  * Threads inherit their set from their creator whether it be anonymous or
95  * not.  This means that anonymous sets are immutable because they may be
96  * shared.  To modify an anonymous set a new set is created with the desired
97  * mask and the same parent as the existing anonymous set.  This gives the
98  * illusion of each thread having a private mask.
99  *
100  * Via the syscall apis a user may ask to retrieve or modify the root, base,
101  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
102  * modifies all numbered and anonymous child sets to comply with the new mask.
103  * Modifying a pid or tid's mask applies only to that tid but must still
104  * exist within the assigned parent set.
105  *
106  * A thread may not be assigned to a group separate from other threads in
107  * the process.  This is to remove ambiguity when the setid is queried with
108  * a pid argument.  There is no other technical limitation.
109  *
110  * This somewhat complex arrangement is intended to make it easy for
111  * applications to query available processors and bind their threads to
112  * specific processors while also allowing administrators to dynamically
113  * reprovision by changing sets which apply to groups of processes.
114  *
115  * A simple application should not concern itself with sets at all and
116  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
117  * meaning 'curthread'.  It may query available cpus for that tid with a
118  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
119  */
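
/*
 * Illustrative userland sketch of the simple case described above.  This is
 * an added example, not part of the original file; error handling is omitted
 * and CPU 2 is assumed to exist in the base set.
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *
 *	cpuset_t mask;
 *
 *	Query the cpus available to this thread's base set:
 *	CPU_ZERO(&mask);
 *	cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask);
 *
 *	Pin the calling thread to CPU 2 with an anonymous mask:
 *	CPU_ZERO(&mask);
 *	CPU_SET(2, &mask);
 *	cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask);
 */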
120 
121 LIST_HEAD(domainlist, domainset);
122 struct domainset __read_mostly domainset_fixed[MAXMEMDOM];
123 struct domainset __read_mostly domainset_prefer[MAXMEMDOM];
124 struct domainset __read_mostly domainset_roundrobin;
125 
126 static uma_zone_t cpuset_zone;
127 static uma_zone_t domainset_zone;
128 static struct mtx cpuset_lock;
129 static struct setlist cpuset_ids;
130 static struct domainlist cpuset_domains;
131 static struct unrhdr *cpuset_unr;
132 static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel;
133 static struct domainset domainset0, domainset2;
134 
135 /* Return the size of cpuset_t at the kernel level */
136 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
137     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
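
/*
 * For example, "sysctl kern.sched.cpusetsize" reports 32 on a 64-bit kernel
 * built with a 256-bit cpuset_t; the value is simply sizeof(cpuset_t) for the
 * running kernel (illustrative number, not part of the original file).
 */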
138 
139 cpuset_t *cpuset_root;
140 cpuset_t cpuset_domain[MAXMEMDOM];
141 
142 static int domainset_valid(const struct domainset *, const struct domainset *);
143 
144 /*
145  * Find the first non-anonymous set starting from 'set'.
146  */
147 static struct cpuset *
148 cpuset_getbase(struct cpuset *set)
149 {
150 
151 	if (set->cs_id == CPUSET_INVALID)
152 		set = set->cs_parent;
153 	return (set);
154 }
155 
156 /*
157  * Walks up the tree from 'set' to find the root.
158  */
159 static struct cpuset *
160 cpuset_getroot(struct cpuset *set)
161 {
162 
163 	while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
164 		set = set->cs_parent;
165 	return (set);
166 }
167 
168 /*
169  * Acquire a reference to a cpuset; all pointers must be tracked with refs.
170  */
171 struct cpuset *
172 cpuset_ref(struct cpuset *set)
173 {
174 
175 	refcount_acquire(&set->cs_ref);
176 	return (set);
177 }
178 
179 /*
180  * Walks up the tree from 'set' to find the root.  Returns the root
181  * referenced.
182  */
183 static struct cpuset *
184 cpuset_refroot(struct cpuset *set)
185 {
186 
187 	return (cpuset_ref(cpuset_getroot(set)));
188 }
189 
190 /*
191  * Find the first non-anonymous set starting from 'set'.  Returns this set
192  * referenced.  May return the passed in set with an extra ref if it is
193  * not anonymous.
194  */
195 static struct cpuset *
196 cpuset_refbase(struct cpuset *set)
197 {
198 
199 	return (cpuset_ref(cpuset_getbase(set)));
200 }
201 
202 /*
203  * Release a reference in a context where it is safe to allocate.
204  */
205 void
206 cpuset_rel(struct cpuset *set)
207 {
208 	cpusetid_t id;
209 
210 	if (refcount_release(&set->cs_ref) == 0)
211 		return;
212 	mtx_lock_spin(&cpuset_lock);
213 	LIST_REMOVE(set, cs_siblings);
214 	id = set->cs_id;
215 	if (id != CPUSET_INVALID)
216 		LIST_REMOVE(set, cs_link);
217 	mtx_unlock_spin(&cpuset_lock);
218 	cpuset_rel(set->cs_parent);
219 	uma_zfree(cpuset_zone, set);
220 	if (id != CPUSET_INVALID)
221 		free_unr(cpuset_unr, id);
222 }
223 
224 /*
225  * Deferred release must be used when in a context that is not safe to
226  * allocate/free.  This places any unreferenced sets on the list 'head'.
227  */
228 static void
229 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
230 {
231 
232 	if (refcount_release(&set->cs_ref) == 0)
233 		return;
234 	mtx_lock_spin(&cpuset_lock);
235 	LIST_REMOVE(set, cs_siblings);
236 	if (set->cs_id != CPUSET_INVALID)
237 		LIST_REMOVE(set, cs_link);
238 	LIST_INSERT_HEAD(head, set, cs_link);
239 	mtx_unlock_spin(&cpuset_lock);
240 }
241 
242 /*
243  * Complete a deferred release.  Removes the set from the list provided to
244  * cpuset_rel_defer.
245  */
246 static void
247 cpuset_rel_complete(struct cpuset *set)
248 {
249 	LIST_REMOVE(set, cs_link);
250 	cpuset_rel(set->cs_parent);
251 	uma_zfree(cpuset_zone, set);
252 }
253 
254 /*
255  * Find a set based on an id.  Returns it with a ref.
256  */
257 static struct cpuset *
258 cpuset_lookup(cpusetid_t setid, struct thread *td)
259 {
260 	struct cpuset *set;
261 
262 	if (setid == CPUSET_INVALID)
263 		return (NULL);
264 	mtx_lock_spin(&cpuset_lock);
265 	LIST_FOREACH(set, &cpuset_ids, cs_link)
266 		if (set->cs_id == setid)
267 			break;
268 	if (set)
269 		cpuset_ref(set);
270 	mtx_unlock_spin(&cpuset_lock);
271 
272 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
273 	if (set != NULL && jailed(td->td_ucred)) {
274 		struct cpuset *jset, *tset;
275 
276 		jset = td->td_ucred->cr_prison->pr_cpuset;
277 		for (tset = set; tset != NULL; tset = tset->cs_parent)
278 			if (tset == jset)
279 				break;
280 		if (tset == NULL) {
281 			cpuset_rel(set);
282 			set = NULL;
283 		}
284 	}
285 
286 	return (set);
287 }
288 
289 /*
290  * Initialize a set in the space provided in 'set' with the provided parameters.
291  * The set is returned with a single ref.  May return EDEADLK if the set
292  * will have no valid cpu based on restrictions from the parent.
293  */
294 static int
295 cpuset_init(struct cpuset *set, struct cpuset *parent,
296     const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
297 {
298 
299 	if (domain == NULL)
300 		domain = parent->cs_domain;
301 	if (mask == NULL)
302 		mask = &parent->cs_mask;
303 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
304 		return (EDEADLK);
305 	/* The domain must be prepared ahead of time. */
306 	if (!domainset_valid(parent->cs_domain, domain))
307 		return (EDEADLK);
308 	CPU_COPY(mask, &set->cs_mask);
309 	LIST_INIT(&set->cs_children);
310 	refcount_init(&set->cs_ref, 1);
311 	set->cs_flags = 0;
312 	mtx_lock_spin(&cpuset_lock);
313 	set->cs_domain = domain;
314 	CPU_AND(&set->cs_mask, &parent->cs_mask);
315 	set->cs_id = id;
316 	set->cs_parent = cpuset_ref(parent);
317 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
318 	if (set->cs_id != CPUSET_INVALID)
319 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
320 	mtx_unlock_spin(&cpuset_lock);
321 
322 	return (0);
323 }
324 
325 /*
326  * Create a new non-anonymous set with the requested parent and mask.  May
327  * return failures if the mask is invalid or a new number cannot be
328  * allocated.
329  *
330  * If *setp is not NULL, then it will be used as-is.  The caller must take
331  * into account that *setp will be inserted at the head of cpuset_ids and
332  * plan any potentially conflicting cs_link usage accordingly.
333  */
334 static int
335 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
336 {
337 	struct cpuset *set;
338 	cpusetid_t id;
339 	int error;
340 	bool dofree;
341 
342 	id = alloc_unr(cpuset_unr);
343 	if (id == -1)
344 		return (ENFILE);
345 	dofree = (*setp == NULL);
346 	if (*setp != NULL)
347 		set = *setp;
348 	else
349 		*setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
350 	error = cpuset_init(set, parent, mask, NULL, id);
351 	if (error == 0)
352 		return (0);
353 	free_unr(cpuset_unr, id);
354 	if (dofree)
355 		uma_zfree(cpuset_zone, set);
356 
357 	return (error);
358 }
359 
360 static void
361 cpuset_freelist_add(struct setlist *list, int count)
362 {
363 	struct cpuset *set;
364 	int i;
365 
366 	for (i = 0; i < count; i++) {
367 		set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
368 		LIST_INSERT_HEAD(list, set, cs_link);
369 	}
370 }
371 
372 static void
373 cpuset_freelist_init(struct setlist *list, int count)
374 {
375 
376 	LIST_INIT(list);
377 	cpuset_freelist_add(list, count);
378 }
379 
380 static void
381 cpuset_freelist_free(struct setlist *list)
382 {
383 	struct cpuset *set;
384 
385 	while ((set = LIST_FIRST(list)) != NULL) {
386 		LIST_REMOVE(set, cs_link);
387 		uma_zfree(cpuset_zone, set);
388 	}
389 }
390 
391 static void
392 domainset_freelist_add(struct domainlist *list, int count)
393 {
394 	struct domainset *set;
395 	int i;
396 
397 	for (i = 0; i < count; i++) {
398 		set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
399 		LIST_INSERT_HEAD(list, set, ds_link);
400 	}
401 }
402 
403 static void
404 domainset_freelist_init(struct domainlist *list, int count)
405 {
406 
407 	LIST_INIT(list);
408 	domainset_freelist_add(list, count);
409 }
410 
411 static void
412 domainset_freelist_free(struct domainlist *list)
413 {
414 	struct domainset *set;
415 
416 	while ((set = LIST_FIRST(list)) != NULL) {
417 		LIST_REMOVE(set, ds_link);
418 		uma_zfree(domainset_zone, set);
419 	}
420 }
421 
422 /* Copy a domainset preserving mask and policy. */
423 static void
424 domainset_copy(const struct domainset *from, struct domainset *to)
425 {
426 
427 	DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
428 	to->ds_policy = from->ds_policy;
429 	to->ds_prefer = from->ds_prefer;
430 }
431 
432 /* Return 1 if mask and policy are equal, otherwise 0. */
433 static int
434 domainset_equal(const struct domainset *one, const struct domainset *two)
435 {
436 
437 	return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
438 	    one->ds_policy == two->ds_policy &&
439 	    one->ds_prefer == two->ds_prefer);
440 }
441 
442 /* Return 1 if child is a valid subset of parent. */
443 static int
444 domainset_valid(const struct domainset *parent, const struct domainset *child)
445 {
446 	if (child->ds_policy != DOMAINSET_POLICY_PREFER)
447 		return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
448 	return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
449 }
450 
451 static int
452 domainset_restrict(const struct domainset *parent,
453     const struct domainset *child)
454 {
455 	if (child->ds_policy != DOMAINSET_POLICY_PREFER)
456 		return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
457 	return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
458 }
459 
460 /*
461  * Lookup or create a domainset.  The key is provided in ds_mask and
462  * ds_policy.  If the domainset does not yet exist the storage in
463  * 'domain' is used to insert.  Otherwise this storage is freed to the
464  * domainset_zone and the existing domainset is returned.
465  */
466 static struct domainset *
467 _domainset_create(struct domainset *domain, struct domainlist *freelist)
468 {
469 	struct domainset *ndomain;
470 	int i, j;
471 
472 	KASSERT(domain->ds_cnt <= vm_ndomains,
473 	    ("invalid domain count in domainset %p", domain));
474 	KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER ||
475 	    domain->ds_prefer < vm_ndomains,
476 	    ("invalid preferred domain in domains %p", domain));
477 
478 	mtx_lock_spin(&cpuset_lock);
479 	LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
480 		if (domainset_equal(ndomain, domain))
481 			break;
482 	/*
483 	 * If the domain does not yet exist we insert it and initialize
484 	 * various iteration helpers which are not part of the key.
485 	 */
486 	if (ndomain == NULL) {
487 		LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
488 		domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
489 		for (i = 0, j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++)
490 			if (DOMAINSET_ISSET(i, &domain->ds_mask))
491 				domain->ds_order[j++] = i;
492 	}
493 	mtx_unlock_spin(&cpuset_lock);
494 	if (ndomain == NULL)
495 		return (domain);
496 	if (freelist != NULL)
497 		LIST_INSERT_HEAD(freelist, domain, ds_link);
498 	else
499 		uma_zfree(domainset_zone, domain);
500 	return (ndomain);
501 
502 }
503 
504 /*
505  * Are any of the domains in the mask empty?  If so, silently
506  * remove them and update the domainset accordingly.  If only empty
507  * domains are present, we must return failure.
508  */
509 static bool
510 domainset_empty_vm(struct domainset *domain)
511 {
512 	domainset_t empty;
513 	int i, j;
514 
515 	DOMAINSET_ZERO(&empty);
516 	for (i = 0; i < vm_ndomains; i++)
517 		if (VM_DOMAIN_EMPTY(i))
518 			DOMAINSET_SET(i, &empty);
519 	if (DOMAINSET_SUBSET(&empty, &domain->ds_mask))
520 		return (true);
521 
522 	/* Remove empty domains from the set and recompute. */
523 	DOMAINSET_ANDNOT(&domain->ds_mask, &empty);
524 	domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
525 	for (i = j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++)
526 		if (DOMAINSET_ISSET(i, &domain->ds_mask))
527 			domain->ds_order[j++] = i;
528 
529 	/* Convert a PREFER policy referencing an empty domain to RR. */
530 	if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
531 	    DOMAINSET_ISSET(domain->ds_prefer, &empty)) {
532 		domain->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
533 		domain->ds_prefer = -1;
534 	}
535 
536 	return (false);
537 }
538 
539 /*
540  * Create or lookup a domainset based on the key held in 'domain'.
541  */
542 struct domainset *
543 domainset_create(const struct domainset *domain)
544 {
545 	struct domainset *ndomain;
546 
547 	/*
548 	 * Validate the policy.  It must specify a usable policy number with
549 	 * only valid domains.  Preferred must include the preferred domain
550 	 * in the mask.
551 	 */
552 	if (domain->ds_policy <= DOMAINSET_POLICY_INVALID ||
553 	    domain->ds_policy > DOMAINSET_POLICY_MAX)
554 		return (NULL);
555 	if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
556 	    !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask))
557 		return (NULL);
558 	if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask))
559 		return (NULL);
560 	ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
561 	domainset_copy(domain, ndomain);
562 	return _domainset_create(ndomain, NULL);
563 }
564 
565 /*
566  * Update thread domainset pointers.
567  */
568 static void
569 domainset_notify(void)
570 {
571 	struct thread *td;
572 	struct proc *p;
573 
574 	sx_slock(&allproc_lock);
575 	FOREACH_PROC_IN_SYSTEM(p) {
576 		PROC_LOCK(p);
577 		if (p->p_state == PRS_NEW) {
578 			PROC_UNLOCK(p);
579 			continue;
580 		}
581 		FOREACH_THREAD_IN_PROC(p, td) {
582 			thread_lock(td);
583 			td->td_domain.dr_policy = td->td_cpuset->cs_domain;
584 			thread_unlock(td);
585 		}
586 		PROC_UNLOCK(p);
587 	}
588 	sx_sunlock(&allproc_lock);
589 	kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
590 }
591 
592 /*
593  * Create a new set that is a subset of a parent.
594  */
595 static struct domainset *
596 domainset_shadow(const struct domainset *pdomain,
597     const struct domainset *domain, struct domainlist *freelist)
598 {
599 	struct domainset *ndomain;
600 
601 	ndomain = LIST_FIRST(freelist);
602 	LIST_REMOVE(ndomain, ds_link);
603 
604 	/*
605 	 * Initialize the key from the request.
606 	 */
607 	domainset_copy(domain, ndomain);
608 
609 	/*
610 	 * Restrict the key by the parent.
611 	 */
612 	DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
613 
614 	return _domainset_create(ndomain, freelist);
615 }
616 
617 /*
618  * Recursively check for errors that would occur from applying mask to
619  * the tree of sets starting at 'set'.  Checks for sets that would become
620  * empty as well as RDONLY flags.
621  */
622 static int
623 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
624 {
625 	struct cpuset *nset;
626 	cpuset_t newmask;
627 	int error;
628 
629 	mtx_assert(&cpuset_lock, MA_OWNED);
630 	if (set->cs_flags & CPU_SET_RDONLY)
631 		return (EPERM);
632 	if (check_mask) {
633 		if (!CPU_OVERLAP(&set->cs_mask, mask))
634 			return (EDEADLK);
635 		CPU_COPY(&set->cs_mask, &newmask);
636 		CPU_AND(&newmask, mask);
637 	} else
638 		CPU_COPY(mask, &newmask);
639 	error = 0;
640 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
641 		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
642 			break;
643 	return (error);
644 }
645 
646 /*
647  * Applies the mask 'mask' without checking for empty sets or permissions.
648  */
649 static void
650 cpuset_update(struct cpuset *set, cpuset_t *mask)
651 {
652 	struct cpuset *nset;
653 
654 	mtx_assert(&cpuset_lock, MA_OWNED);
655 	CPU_AND(&set->cs_mask, mask);
656 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
657 		cpuset_update(nset, &set->cs_mask);
658 
659 	return;
660 }
661 
662 /*
663  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
664  * mask to restrict all children in the tree.  Checks for validity before
665  * applying the changes.
666  */
667 static int
668 cpuset_modify(struct cpuset *set, cpuset_t *mask)
669 {
670 	struct cpuset *root;
671 	int error;
672 
673 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
674 	if (error)
675 		return (error);
676 	/*
677 	 * In case we are called from within the jail
678 	 * we do not allow modifying the dedicated root
679 	 * cpuset of the jail but may still allow changes
680 	 * to child sets.
681 	 */
682 	if (jailed(curthread->td_ucred) &&
683 	    set->cs_flags & CPU_SET_ROOT)
684 		return (EPERM);
685 	/*
686 	 * Verify that we have access to this set of
687 	 * cpus.
688 	 */
689 	root = cpuset_getroot(set);
690 	mtx_lock_spin(&cpuset_lock);
691 	if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
692 		error = EINVAL;
693 		goto out;
694 	}
695 	error = cpuset_testupdate(set, mask, 0);
696 	if (error)
697 		goto out;
698 	CPU_COPY(mask, &set->cs_mask);
699 	cpuset_update(set, mask);
700 out:
701 	mtx_unlock_spin(&cpuset_lock);
702 
703 	return (error);
704 }
705 
706 /*
707  * Recursively check for errors that would occur from applying the domainset to
708  * the tree of sets starting at 'set'.  Checks for sets that would become
709  * empty as well as RDONLY flags.
710  */
711 static int
712 cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
713     struct domainset *orig, int *count, int check_mask)
714 {
715 	struct cpuset *nset;
716 	struct domainset *domain;
717 	struct domainset newset;
718 	int error;
719 
720 	mtx_assert(&cpuset_lock, MA_OWNED);
721 	if (set->cs_flags & CPU_SET_RDONLY)
722 		return (EPERM);
723 	domain = set->cs_domain;
724 	domainset_copy(domain, &newset);
725 	if (!domainset_equal(domain, orig)) {
726 		if (!domainset_restrict(domain, dset))
727 			return (EDEADLK);
728 		DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
729 		/* Count the number of domains that are changing. */
730 		(*count)++;
731 	}
732 	error = 0;
733 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
734 		if ((error = cpuset_testupdate_domain(nset, &newset, domain,
735 		    count, 1)) != 0)
736 			break;
737 	return (error);
738 }
739 
740 /*
741  * Applies the domainset 'domain' without checking for empty sets or permissions.
742  */
743 static void
744 cpuset_update_domain(struct cpuset *set, struct domainset *domain,
745     struct domainset *orig, struct domainlist *domains)
746 {
747 	struct cpuset *nset;
748 
749 	mtx_assert(&cpuset_lock, MA_OWNED);
750 	/*
751 	 * If this domainset has changed from the parent we must calculate
752 	 * a new set.  Otherwise it simply inherits from the parent.  When
753 	 * we inherit from the parent we get a new mask and policy.  If the
754 	 * set is modified from the parent we keep the policy and only
755 	 * update the mask.
756 	 */
757 	if (set->cs_domain != orig) {
758 		orig = set->cs_domain;
759 		set->cs_domain = domainset_shadow(domain, orig, domains);
760 	} else
761 		set->cs_domain = domain;
762 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
763 		cpuset_update_domain(nset, set->cs_domain, orig, domains);
764 
765 	return;
766 }
767 
768 /*
769  * Modify the set 'set' to use a copy of the domainset provided.  Apply this new
770  * mask to restrict all children in the tree.  Checks for validity before
771  * applying the changes.
772  */
773 static int
774 cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
775 {
776 	struct domainlist domains;
777 	struct domainset temp;
778 	struct domainset *dset;
779 	struct cpuset *root;
780 	int ndomains, needed;
781 	int error;
782 
783 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
784 	if (error)
785 		return (error);
786 	/*
787 	 * In case we are called from within the jail
788 	 * we do not allow modifying the dedicated root
789 	 * cpuset of the jail but may still allow changes
790 	 * to child sets.
791 	 */
792 	if (jailed(curthread->td_ucred) &&
793 	    set->cs_flags & CPU_SET_ROOT)
794 		return (EPERM);
795 	domainset_freelist_init(&domains, 0);
796 	domain = domainset_create(domain);
797 	ndomains = needed = 0;
798 	do {
799 		if (ndomains < needed) {
800 			domainset_freelist_add(&domains, needed - ndomains);
801 			ndomains = needed;
802 		}
803 		root = cpuset_getroot(set);
804 		mtx_lock_spin(&cpuset_lock);
805 		dset = root->cs_domain;
806 		/*
807 		 * Verify that we have access to this set of domains.
808 		 */
809 		if (!domainset_valid(dset, domain)) {
810 			error = EINVAL;
811 			goto out;
812 		}
813 		/*
814 		 * If applying prefer we keep the current set as the fallback.
815 		 */
816 		if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
817 			DOMAINSET_COPY(&set->cs_domain->ds_mask,
818 			    &domain->ds_mask);
819 		/*
820 		 * Determine whether we can apply this set of domains and
821 		 * how many new domain structures it will require.
822 		 */
823 		domainset_copy(domain, &temp);
824 		needed = 0;
825 		error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
826 		    &needed, 0);
827 		if (error)
828 			goto out;
829 	} while (ndomains < needed);
830 	dset = set->cs_domain;
831 	cpuset_update_domain(set, domain, dset, &domains);
832 out:
833 	mtx_unlock_spin(&cpuset_lock);
834 	domainset_freelist_free(&domains);
835 	if (error == 0)
836 		domainset_notify();
837 
838 	return (error);
839 }
840 
841 /*
842  * Resolve the 'which' parameter of several cpuset apis.
843  *
844  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
845  * checks for permission via p_cansched().
846  *
847  * For WHICH_SET returns a valid set with a new reference.
848  *
849  * -1 may be supplied for any argument to mean the current proc/thread or
850  * the base set of the current thread.  May fail with ESRCH/EPERM.
851  */
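/*
 * Typical caller pattern (illustrative sketch added here, mirroring the
 * syscall handlers later in this file):
 *
 *	error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &set);
 *	if (error)
 *		return (error);
 *	...examine or reschedule p and td...
 *	PROC_UNLOCK(p);
 *
 * For CPU_WHICH_CPUSET and CPU_WHICH_JAIL only *setp is filled in, with a
 * reference the caller must drop via cpuset_rel().
 */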
852 int
853 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
854     struct cpuset **setp)
855 {
856 	struct cpuset *set;
857 	struct thread *td;
858 	struct proc *p;
859 	int error;
860 
861 	*pp = p = NULL;
862 	*tdp = td = NULL;
863 	*setp = set = NULL;
864 	switch (which) {
865 	case CPU_WHICH_PID:
866 		if (id == -1) {
867 			PROC_LOCK(curproc);
868 			p = curproc;
869 			break;
870 		}
871 		if ((p = pfind(id)) == NULL)
872 			return (ESRCH);
873 		break;
874 	case CPU_WHICH_TID:
875 		if (id == -1) {
876 			PROC_LOCK(curproc);
877 			p = curproc;
878 			td = curthread;
879 			break;
880 		}
881 		td = tdfind(id, -1);
882 		if (td == NULL)
883 			return (ESRCH);
884 		p = td->td_proc;
885 		break;
886 	case CPU_WHICH_CPUSET:
887 		if (id == -1) {
888 			thread_lock(curthread);
889 			set = cpuset_refbase(curthread->td_cpuset);
890 			thread_unlock(curthread);
891 		} else
892 			set = cpuset_lookup(id, curthread);
893 		if (set) {
894 			*setp = set;
895 			return (0);
896 		}
897 		return (ESRCH);
898 	case CPU_WHICH_JAIL:
899 	{
900 		/* Find `set' for prison with given id. */
901 		struct prison *pr;
902 
903 		sx_slock(&allprison_lock);
904 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
905 		sx_sunlock(&allprison_lock);
906 		if (pr == NULL)
907 			return (ESRCH);
908 		cpuset_ref(pr->pr_cpuset);
909 		*setp = pr->pr_cpuset;
910 		mtx_unlock(&pr->pr_mtx);
911 		return (0);
912 	}
913 	case CPU_WHICH_IRQ:
914 	case CPU_WHICH_DOMAIN:
915 		return (0);
916 	default:
917 		return (EINVAL);
918 	}
919 	error = p_cansched(curthread, p);
920 	if (error) {
921 		PROC_UNLOCK(p);
922 		return (error);
923 	}
924 	if (td == NULL)
925 		td = FIRST_THREAD_IN_PROC(p);
926 	*pp = p;
927 	*tdp = td;
928 	return (0);
929 }
930 
931 static int
932 cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
933     const struct domainset *domain)
934 {
935 	struct cpuset *parent;
936 	struct domainset *dset;
937 
938 	parent = cpuset_getbase(set);
939 	/*
940 	 * If we are restricting a cpu mask it must be a subset of the
941 	 * parent or invalid CPUs have been specified.
942 	 */
943 	if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
944 		return (EINVAL);
945 
946 	/*
947 	 * If we are restricting a domain mask it must be a subset of the
948 	 * parent or invalid domains have been specified.
949 	 */
950 	dset = parent->cs_domain;
951 	if (domain != NULL && !domainset_valid(dset, domain))
952 		return (EINVAL);
953 
954 	return (0);
955 }
956 
957 /*
958  * Create an anonymous set with the provided mask in the space provided by
959  * 'nset'.  If the passed in set is anonymous we use its parent otherwise
960  * the new set is a child of 'set'.
961  */
962 static int
963 cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
964    const cpuset_t *mask, const struct domainset *domain,
965    struct setlist *cpusets, struct domainlist *domains)
966 {
967 	struct cpuset *parent;
968 	struct cpuset *nset;
969 	struct domainset *dset;
970 	struct domainset *d;
971 	int error;
972 
973 	error = cpuset_testshadow(set, mask, domain);
974 	if (error)
975 		return (error);
976 
977 	parent = cpuset_getbase(set);
978 	dset = parent->cs_domain;
979 	if (mask == NULL)
980 		mask = &set->cs_mask;
981 	if (domain != NULL)
982 		d = domainset_shadow(dset, domain, domains);
983 	else
984 		d = set->cs_domain;
985 	nset = LIST_FIRST(cpusets);
986 	error = cpuset_init(nset, parent, mask, d, CPUSET_INVALID);
987 	if (error == 0) {
988 		LIST_REMOVE(nset, cs_link);
989 		*nsetp = nset;
990 	}
991 	return (error);
992 }
993 
994 static struct cpuset *
995 cpuset_update_thread(struct thread *td, struct cpuset *nset)
996 {
997 	struct cpuset *tdset;
998 
999 	tdset = td->td_cpuset;
1000 	td->td_cpuset = nset;
1001 	td->td_domain.dr_policy = nset->cs_domain;
1002 	sched_affinity(td);
1003 
1004 	return (tdset);
1005 }
1006 
1007 static int
1008 cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
1009     struct domainset *domain)
1010 {
1011 	struct cpuset *parent;
1012 
1013 	parent = cpuset_getbase(tdset);
1014 	if (mask == NULL)
1015 		mask = &tdset->cs_mask;
1016 	if (domain == NULL)
1017 		domain = tdset->cs_domain;
1018 	return cpuset_testshadow(parent, mask, domain);
1019 }
1020 
1021 static int
1022 cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
1023     struct domainset *domain, struct cpuset **nsetp,
1024     struct setlist *freelist, struct domainlist *domainlist)
1025 {
1026 	struct cpuset *parent;
1027 
1028 	parent = cpuset_getbase(tdset);
1029 	if (mask == NULL)
1030 		mask = &tdset->cs_mask;
1031 	if (domain == NULL)
1032 		domain = tdset->cs_domain;
1033 	return cpuset_shadow(parent, nsetp, mask, domain, freelist,
1034 	    domainlist);
1035 }
1036 
1037 static int
1038 cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
1039     cpuset_t *mask, struct domainset *domain)
1040 {
1041 	struct cpuset *parent;
1042 
1043 	parent = cpuset_getbase(tdset);
1044 
1045 	/*
1046 	 * If the thread restricted its mask then apply that same
1047 	 * restriction to the new set, otherwise take it wholesale.
1048 	 */
1049 	if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
1050 		CPU_COPY(&tdset->cs_mask, mask);
1051 		CPU_AND(mask, &set->cs_mask);
1052 	} else
1053 		CPU_COPY(&set->cs_mask, mask);
1054 
1055 	/*
1056 	 * If the thread restricted the domain then we apply the
1057 	 * restriction to the new set but retain the policy.
1058 	 */
1059 	if (tdset->cs_domain != parent->cs_domain) {
1060 		domainset_copy(tdset->cs_domain, domain);
1061 		DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
1062 	} else
1063 		domainset_copy(set->cs_domain, domain);
1064 
1065 	if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
1066 		return (EDEADLK);
1067 
1068 	return (0);
1069 }
1070 
1071 static int
1072 cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
1073 {
1074 	struct domainset domain;
1075 	cpuset_t mask;
1076 
1077 	if (tdset->cs_id != CPUSET_INVALID)
1078 		return (0);
1079 	return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
1080 }
1081 
1082 static int
1083 cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
1084     struct cpuset **nsetp, struct setlist *freelist,
1085     struct domainlist *domainlist)
1086 {
1087 	struct domainset domain;
1088 	cpuset_t mask;
1089 	int error;
1090 
1091 	/*
1092 	 * If we're replacing on a thread that has not constrained the
1093 	 * original set we can simply accept the new set.
1094 	 */
1095 	if (tdset->cs_id != CPUSET_INVALID) {
1096 		*nsetp = cpuset_ref(set);
1097 		return (0);
1098 	}
1099 	error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
1100 	if (error)
1101 		return (error);
1102 
1103 	return cpuset_shadow(set, nsetp, &mask, &domain, freelist,
1104 	    domainlist);
1105 }
1106 
1107 static int
1108 cpuset_setproc_newbase(struct thread *td, struct cpuset *set,
1109     struct cpuset *nroot, struct cpuset **nsetp,
1110     struct setlist *cpusets, struct domainlist *domainlist)
1111 {
1112 	struct domainset ndomain;
1113 	cpuset_t nmask;
1114 	struct cpuset *pbase;
1115 	int error;
1116 
1117 	pbase = cpuset_getbase(td->td_cpuset);
1118 
1119 	/* Copy process mask, then further apply the new root mask. */
1120 	CPU_COPY(&pbase->cs_mask, &nmask);
1121 	CPU_AND(&nmask, &nroot->cs_mask);
1122 
1123 	domainset_copy(pbase->cs_domain, &ndomain);
1124 	DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask);
1125 
1126 	/* Policy is too restrictive, will not work. */
1127 	if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask))
1128 		return (EDEADLK);
1129 
1130 	/*
1131 	 * Remove pbase from the freelist in advance, it'll be pushed to
1132 	 * cpuset_ids on success.  We assume here that cpuset_create() will not
1133 	 * touch pbase on failure, and we just enqueue it back to the freelist
1134 	 * to remain in a consistent state.
1135 	 */
1136 	pbase = LIST_FIRST(cpusets);
1137 	LIST_REMOVE(pbase, cs_link);
1138 	error = cpuset_create(&pbase, set, &nmask);
1139 	if (error != 0) {
1140 		LIST_INSERT_HEAD(cpusets, pbase, cs_link);
1141 		return (error);
1142 	}
1143 
1144 	/* Duplicates some work from above... oh well. */
1145 	pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain,
1146 	    domainlist);
1147 	*nsetp = pbase;
1148 	return (0);
1149 }
1150 
1151 /*
1152  * Handle four cases for updating an entire process.
1153  *
1154  * 1) Set is non-null and the process is not rebasing onto a new root.  This
1155  *    reparents all anonymous sets to the provided set and replaces all
1156  *    non-anonymous td_cpusets with the provided set.
1157  * 2) Set is non-null and the process is rebasing onto a new root.  This
1158  *    creates a new base set if the process previously had its own base set,
1159  *    then reparents all anonymous sets either to that set or the provided set
1160  *    if one was not created.  Non-anonymous sets are similarly replaced.
1161  * 3) Mask is non-null.  This replaces or creates anonymous sets for every
1162  *    thread with the existing base as a parent.
1163  * 4) domain is non-null.  This creates anonymous sets for every thread
1164  *    and replaces the domain set.
1165  *
1166  * This is overly complicated because we can't allocate while holding a
1167  * spinlock and spinlocks must be held while changing and examining thread
1168  * state.
1169  */
1170 static int
1171 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
1172     struct domainset *domain, bool rebase)
1173 {
1174 	struct setlist freelist;
1175 	struct setlist droplist;
1176 	struct domainlist domainlist;
1177 	struct cpuset *base, *nset, *nroot, *tdroot;
1178 	struct thread *td;
1179 	struct proc *p;
1180 	int needed;
1181 	int nfree;
1182 	int error;
1183 
1184 	/*
1185 	 * The algorithm requires two passes due to locking considerations.
1186 	 *
1187 	 * 1) Lookup the process and acquire the locks in the required order.
1188 	 * 2) If enough cpusets have not been allocated release the locks and
1189 	 *    allocate them.  Loop.
1190 	 */
1191 	cpuset_freelist_init(&freelist, 1);
1192 	domainset_freelist_init(&domainlist, 1);
1193 	nfree = 1;
1194 	LIST_INIT(&droplist);
1195 	nfree = 0;
1196 	base = set;
1197 	nroot = NULL;
1198 	if (set != NULL)
1199 		nroot = cpuset_getroot(set);
1200 	for (;;) {
1201 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
1202 		if (error)
1203 			goto out;
1204 		tdroot = cpuset_getroot(td->td_cpuset);
1205 		needed = p->p_numthreads;
1206 		if (set != NULL && rebase && tdroot != nroot)
1207 			needed++;
1208 		if (nfree >= needed)
1209 			break;
1210 		PROC_UNLOCK(p);
1211 		if (nfree < needed) {
1212 			cpuset_freelist_add(&freelist, needed - nfree);
1213 			domainset_freelist_add(&domainlist, needed - nfree);
1214 			nfree = needed;
1215 		}
1216 	}
1217 	PROC_LOCK_ASSERT(p, MA_OWNED);
1218 
1219 	/*
1220 	 * If we're changing roots and the root set is what has been specified
1221 	 * as the parent, then we'll check if the process was previously using
1222 	 * the root set and, if it wasn't, create a new base with the process's
1223 	 * mask applied to it.
1224 	 */
1225 	if (set != NULL && rebase && nroot != tdroot) {
1226 		cpusetid_t base_id, root_id;
1227 
1228 		root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id;
1229 		base_id = cpuset_getbase(td->td_cpuset)->cs_id;
1230 
1231 		if (base_id != root_id) {
1232 			error = cpuset_setproc_newbase(td, set, nroot, &base,
1233 			    &freelist, &domainlist);
1234 			if (error != 0)
1235 				goto unlock_out;
1236 		}
1237 	}
1238 
1239 	/*
1240 	 * Now that the appropriate locks are held and we have enough cpusets,
1241 	 * make sure the operation will succeed before applying changes. The
1242 	 * proc lock prevents td_cpuset from changing between calls.
1243 	 */
1244 	error = 0;
1245 	FOREACH_THREAD_IN_PROC(p, td) {
1246 		thread_lock(td);
1247 		if (set != NULL)
1248 			error = cpuset_setproc_test_setthread(td->td_cpuset,
1249 			    base);
1250 		else
1251 			error = cpuset_setproc_test_maskthread(td->td_cpuset,
1252 			    mask, domain);
1253 		thread_unlock(td);
1254 		if (error)
1255 			goto unlock_out;
1256 	}
1257 	/*
1258 	 * Replace each thread's cpuset while using deferred release.  We
1259 	 * must do this because the thread lock must be held while operating
1260 	 * on the thread and this limits the type of operations allowed.
1261 	 */
1262 	FOREACH_THREAD_IN_PROC(p, td) {
1263 		thread_lock(td);
1264 		if (set != NULL)
1265 			error = cpuset_setproc_setthread(td->td_cpuset, base,
1266 			    &nset, &freelist, &domainlist);
1267 		else
1268 			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
1269 			    domain, &nset, &freelist, &domainlist);
1270 		if (error) {
1271 			thread_unlock(td);
1272 			break;
1273 		}
1274 		cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
1275 		thread_unlock(td);
1276 	}
1277 unlock_out:
1278 	PROC_UNLOCK(p);
1279 out:
1280 	if (base != NULL && base != set)
1281 		cpuset_rel(base);
1282 	while ((nset = LIST_FIRST(&droplist)) != NULL)
1283 		cpuset_rel_complete(nset);
1284 	cpuset_freelist_free(&freelist);
1285 	domainset_freelist_free(&domainlist);
1286 	return (error);
1287 }
1288 
1289 static int
1290 bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen)
1291 {
1292 	size_t bytes;
1293 	int i, once;
1294 	char *p;
1295 
1296 	once = 0;
1297 	p = buf;
1298 	for (i = 0; i < __bitset_words(setlen); i++) {
1299 		if (once != 0) {
1300 			if (bufsiz < 1)
1301 				return (0);
1302 			*p = ',';
1303 			p++;
1304 			bufsiz--;
1305 		} else
1306 			once = 1;
1307 		if (bufsiz < sizeof(__STRING(ULONG_MAX)))
1308 			return (0);
1309 		bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]);
1310 		p += bytes;
1311 		bufsiz -= bytes;
1312 	}
1313 	return (p - buf);
1314 }
1315 
1316 static int
1317 bitset_strscan(struct bitset *set, int setlen, const char *buf)
1318 {
1319 	int i, ret;
1320 	const char *p;
1321 
1322 	BIT_ZERO(setlen, set);
1323 	p = buf;
1324 	for (i = 0; i < __bitset_words(setlen); i++) {
1325 		if (*p == ',') {
1326 			p++;
1327 			continue;
1328 		}
1329 		ret = sscanf(p, "%lx", &set->__bits[i]);
1330 		if (ret == 0 || ret == -1)
1331 			break;
1332 		while (isxdigit(*p))
1333 			p++;
1334 	}
1335 	return (p - buf);
1336 }
1337 
1338 /*
1339  * Return a string representing a valid layout for a cpuset_t object.
1340  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
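 * For example, on a kernel where cpuset_t occupies four 64-bit words, a set
 * containing CPUs 0-3 renders as "f,0,0,0"; cpusetobj_strscan() below accepts
 * the same comma-separated hex-word form (illustrative value).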
1341  */
1342 char *
1343 cpusetobj_strprint(char *buf, const cpuset_t *set)
1344 {
1345 
1346 	bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set,
1347 	    CPU_SETSIZE);
1348 	return (buf);
1349 }
1350 
1351 /*
1352  * Build a valid cpuset_t object from a string representation.
1353  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
1354  */
1355 int
1356 cpusetobj_strscan(cpuset_t *set, const char *buf)
1357 {
1358 	char p;
1359 
1360 	if (strlen(buf) > CPUSETBUFSIZ - 1)
1361 		return (-1);
1362 
1363 	p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)];
1364 	if (p != '\0')
1365 		return (-1);
1366 
1367 	return (0);
1368 }
1369 
1370 /*
1371  * Handle a domainset specifier in the sysctl tree.  A pointer to a pointer to
1372  * a domainset is in arg1.  If the user specifies a valid domainset the
1373  * pointer is updated.
1374  *
1375  * Format is:
1376  * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred
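 * For example, "1:1:-1" describes a mask containing only domain 0, policy 1,
 * and no preferred domain (illustrative encoding; policy numbers are defined
 * in <sys/domainset.h>).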
1377  */
1378 int
1379 sysctl_handle_domainset(SYSCTL_HANDLER_ARGS)
1380 {
1381 	char buf[DOMAINSETBUFSIZ];
1382 	struct domainset *dset;
1383 	struct domainset key;
1384 	int policy, prefer, error;
1385 	char *p;
1386 
1387 	dset = *(struct domainset **)arg1;
1388 	error = 0;
1389 
1390 	if (dset != NULL) {
1391 		p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ,
1392 		    (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE);
1393 		sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer);
1394 	} else
1395 		sprintf(buf, "<NULL>");
1396 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
1397 	if (error != 0 || req->newptr == NULL)
1398 		return (error);
1399 
1400 	/*
1401 	 * Read in and validate the string.
1402 	 */
1403 	memset(&key, 0, sizeof(key));
1404 	p = &buf[bitset_strscan((struct bitset *)&key.ds_mask,
1405 	    DOMAINSET_SETSIZE, buf)];
1406 	if (p == buf)
1407 		return (EINVAL);
1408 	if (sscanf(p, ":%d:%d", &policy, &prefer) != 2)
1409 		return (EINVAL);
1410 	key.ds_policy = policy;
1411 	key.ds_prefer = prefer;
1412 
1413 	/* domainset_create() validates the policy. */
1414 	dset = domainset_create(&key);
1415 	if (dset == NULL)
1416 		return (EINVAL);
1417 	*(struct domainset **)arg1 = dset;
1418 
1419 	return (error);
1420 }
1421 
1422 /*
1423  * Apply an anonymous mask or a domain to a single thread.
1424  */
1425 static int
1426 _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain)
1427 {
1428 	struct setlist cpusets;
1429 	struct domainlist domainlist;
1430 	struct cpuset *nset;
1431 	struct cpuset *set;
1432 	struct thread *td;
1433 	struct proc *p;
1434 	int error;
1435 
1436 	cpuset_freelist_init(&cpusets, 1);
1437 	domainset_freelist_init(&domainlist, domain != NULL);
1438 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
1439 	if (error)
1440 		goto out;
1441 	set = NULL;
1442 	thread_lock(td);
1443 	error = cpuset_shadow(td->td_cpuset, &nset, mask, domain,
1444 	    &cpusets, &domainlist);
1445 	if (error == 0)
1446 		set = cpuset_update_thread(td, nset);
1447 	thread_unlock(td);
1448 	PROC_UNLOCK(p);
1449 	if (set)
1450 		cpuset_rel(set);
1451 out:
1452 	cpuset_freelist_free(&cpusets);
1453 	domainset_freelist_free(&domainlist);
1454 	return (error);
1455 }
1456 
1457 /*
1458  * Apply an anonymous mask to a single thread.
1459  */
1460 int
1461 cpuset_setthread(lwpid_t id, cpuset_t *mask)
1462 {
1463 
1464 	return _cpuset_setthread(id, mask, NULL);
1465 }
1466 
1467 /*
1468  * Apply new cpumask to the ithread.
1469  */
1470 int
1471 cpuset_setithread(lwpid_t id, int cpu)
1472 {
1473 	cpuset_t mask;
1474 
1475 	CPU_ZERO(&mask);
1476 	if (cpu == NOCPU)
1477 		CPU_COPY(cpuset_root, &mask);
1478 	else
1479 		CPU_SET(cpu, &mask);
1480 	return _cpuset_setthread(id, &mask, NULL);
1481 }
1482 
1483 /*
1484  * Initialize static domainsets after NUMA information is available.  This is
1485  * called before memory allocators are initialized.
1486  */
1487 void
1488 domainset_init(void)
1489 {
1490 	struct domainset *dset;
1491 	int i;
1492 
1493 	dset = &domainset_roundrobin;
1494 	DOMAINSET_COPY(&all_domains, &dset->ds_mask);
1495 	dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
1496 	dset->ds_prefer = -1;
1497 	_domainset_create(dset, NULL);
1498 
1499 	for (i = 0; i < vm_ndomains; i++) {
1500 		dset = &domainset_fixed[i];
1501 		DOMAINSET_ZERO(&dset->ds_mask);
1502 		DOMAINSET_SET(i, &dset->ds_mask);
1503 		dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
1504 		_domainset_create(dset, NULL);
1505 
1506 		dset = &domainset_prefer[i];
1507 		DOMAINSET_COPY(&all_domains, &dset->ds_mask);
1508 		dset->ds_policy = DOMAINSET_POLICY_PREFER;
1509 		dset->ds_prefer = i;
1510 		_domainset_create(dset, NULL);
1511 	}
1512 }
1513 
1514 /*
1515  * Create the domainsets used by cpusets 0 and 1, and by cpuset 2.
1516  */
1517 void
1518 domainset_zero(void)
1519 {
1520 	struct domainset *dset, *tmp;
1521 
1522 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
1523 
1524 	dset = &domainset0;
1525 	DOMAINSET_COPY(&all_domains, &dset->ds_mask);
1526 	dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
1527 	dset->ds_prefer = -1;
1528 	curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
1529 
1530 	domainset_copy(dset, &domainset2);
1531 	domainset2.ds_policy = DOMAINSET_POLICY_INTERLEAVE;
1532 	kernel_object->domain.dr_policy = _domainset_create(&domainset2, NULL);
1533 
1534 	/* Remove empty domains from the global policies. */
1535 	LIST_FOREACH_SAFE(dset, &cpuset_domains, ds_link, tmp)
1536 		if (domainset_empty_vm(dset))
1537 			LIST_REMOVE(dset, ds_link);
1538 }
1539 
1540 /*
1541  * Creates system-wide cpusets and the cpuset for thread0 including three
1542  * sets:
1543  *
1544  * 0 - The root set which should represent all valid processors in the
1545  *     system.  This set is immutable.
1546  * 1 - The default set which all processes are a member of until changed.
1547  *     This allows an administrator to move all threads off of given cpus to
1548  *     dedicate them to high priority tasks or save power etc.
1549  * 2 - The kernel set which allows restriction and policy to be applied only
1550  *     to kernel threads and the kernel_object.
1551  */
1552 struct cpuset *
1553 cpuset_thread0(void)
1554 {
1555 	struct cpuset *set;
1556 	int i;
1557 	int error __unused;
1558 
1559 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
1560 	    NULL, NULL, UMA_ALIGN_CACHE, 0);
1561 	domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
1562 	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
1563 
1564 	/*
1565 	 * Create the root system set (0) for the whole machine.  Doesn't use
1566 	 * cpuset_create() due to NULL parent.
1567 	 */
1568 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
1569 	CPU_COPY(&all_cpus, &set->cs_mask);
1570 	LIST_INIT(&set->cs_children);
1571 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
1572 	refcount_init(&set->cs_ref, 1);
1573 	set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY;
1574 	set->cs_domain = &domainset0;
1575 	cpuset_zero = set;
1576 	cpuset_root = &set->cs_mask;
1577 
1578 	/*
1579 	 * Now derive a default (1), modifiable set from that to give out.
1580 	 */
1581 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
1582 	error = cpuset_init(set, cpuset_zero, NULL, NULL, 1);
1583 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
1584 	cpuset_default = set;
1585 	/*
1586 	 * Create the kernel set (2).
1587 	 */
1588 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
1589 	error = cpuset_init(set, cpuset_zero, NULL, NULL, 2);
1590 	KASSERT(error == 0, ("Error creating kernel set: %d\n", error));
1591 	set->cs_domain = &domainset2;
1592 	cpuset_kernel = set;
1593 
1594 	/*
1595 	 * Initialize the unit allocator. 0, 1, and 2 are allocated above.
1596 	 */
1597 	cpuset_unr = new_unrhdr(3, INT_MAX, NULL);
1598 
1599 	/*
1600 	 * If MD code has not initialized per-domain cpusets, place all
1601 	 * CPUs in domain 0.
1602 	 */
1603 	for (i = 0; i < MAXMEMDOM; i++)
1604 		if (!CPU_EMPTY(&cpuset_domain[i]))
1605 			goto domains_set;
1606 	CPU_COPY(&all_cpus, &cpuset_domain[0]);
1607 domains_set:
1608 
1609 	return (cpuset_default);
1610 }
1611 
1612 void
1613 cpuset_kernthread(struct thread *td)
1614 {
1615 	struct cpuset *set;
1616 
1617 	thread_lock(td);
1618 	set = td->td_cpuset;
1619 	td->td_cpuset = cpuset_ref(cpuset_kernel);
1620 	thread_unlock(td);
1621 	cpuset_rel(set);
1622 }
1623 
1624 /*
1625  * Create a cpuset as cpuset_create() would, but
1626  * mark the new 'set' as root.
1627  *
1628  * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
1629  * for that.
1630  *
1631  * In case of no error, returns the set in *setp locked with a reference.
1632  */
1633 int
1634 cpuset_create_root(struct prison *pr, struct cpuset **setp)
1635 {
1636 	struct cpuset *set;
1637 	int error;
1638 
1639 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
1640 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
1641 
1642 	set = NULL;
1643 	error = cpuset_create(&set, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
1644 	if (error)
1645 		return (error);
1646 
1647 	KASSERT(set != NULL, ("[%s:%d] cpuset_create returned invalid data",
1648 	    __func__, __LINE__));
1649 
1650 	/* Mark the set as root. */
1651 	set->cs_flags |= CPU_SET_ROOT;
1652 	*setp = set;
1653 
1654 	return (0);
1655 }
1656 
1657 int
1658 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
1659 {
1660 	int error;
1661 
1662 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
1663 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
1664 
1665 	cpuset_ref(set);
1666 	error = cpuset_setproc(p->p_pid, set, NULL, NULL, true);
1667 	if (error)
1668 		return (error);
1669 	cpuset_rel(set);
1670 	return (0);
1671 }
1672 
1673 /*
1674  * In Capability mode, the only accesses that are permitted are to the current
1675  * thread and process' CPU and domain sets.
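 * For example, cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, ...)
 * remains permitted in capability mode, while naming another process or a
 * numbered set fails with ECAPMODE (illustrative call; enforced below).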
1676  */
1677 static int
1678 cpuset_check_capabilities(struct thread *td, cpulevel_t level, cpuwhich_t which,
1679     id_t id)
1680 {
1681 	if (IN_CAPABILITY_MODE(td)) {
1682 		if (level != CPU_LEVEL_WHICH)
1683 			return (ECAPMODE);
1684 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
1685 			return (ECAPMODE);
1686 		if (id != -1 &&
1687 		    !(which == CPU_WHICH_TID && id == td->td_tid) &&
1688 		    !(which == CPU_WHICH_PID && id == td->td_proc->p_pid))
1689 			return (ECAPMODE);
1690 	}
1691 	return (0);
1692 }
1693 
1694 #ifndef _SYS_SYSPROTO_H_
1695 struct cpuset_args {
1696 	cpusetid_t	*setid;
1697 };
1698 #endif
1699 int
1700 sys_cpuset(struct thread *td, struct cpuset_args *uap)
1701 {
1702 	struct cpuset *root;
1703 	struct cpuset *set;
1704 	int error;
1705 
1706 	thread_lock(td);
1707 	root = cpuset_refroot(td->td_cpuset);
1708 	thread_unlock(td);
1709 	set = NULL;
1710 	error = cpuset_create(&set, root, &root->cs_mask);
1711 	cpuset_rel(root);
1712 	if (error)
1713 		return (error);
1714 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
1715 	if (error == 0)
1716 		error = cpuset_setproc(-1, set, NULL, NULL, false);
1717 	cpuset_rel(set);
1718 	return (error);
1719 }
1720 
1721 #ifndef _SYS_SYSPROTO_H_
1722 struct cpuset_setid_args {
1723 	cpuwhich_t	which;
1724 	id_t		id;
1725 	cpusetid_t	setid;
1726 };
1727 #endif
1728 int
1729 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
1730 {
1731 
1732 	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
1733 }
1734 
1735 int
1736 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
1737     id_t id, cpusetid_t setid)
1738 {
1739 	struct cpuset *set;
1740 	int error;
1741 
1742 	/*
1743 	 * Presently we only support per-process sets.
1744 	 */
1745 	if (which != CPU_WHICH_PID)
1746 		return (EINVAL);
1747 	set = cpuset_lookup(setid, td);
1748 	if (set == NULL)
1749 		return (ESRCH);
1750 	error = cpuset_setproc(id, set, NULL, NULL, false);
1751 	cpuset_rel(set);
1752 	return (error);
1753 }
1754 
1755 #ifndef _SYS_SYSPROTO_H_
1756 struct cpuset_getid_args {
1757 	cpulevel_t	level;
1758 	cpuwhich_t	which;
1759 	id_t		id;
1760 	cpusetid_t	*setid;
1761 };
1762 #endif
1763 int
1764 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
1765 {
1766 
1767 	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
1768 	    uap->setid));
1769 }
1770 
1771 int
1772 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
1773     id_t id, cpusetid_t *setid)
1774 {
1775 	struct cpuset *nset;
1776 	struct cpuset *set;
1777 	struct thread *ttd;
1778 	struct proc *p;
1779 	cpusetid_t tmpid;
1780 	int error;
1781 
1782 	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
1783 		return (EINVAL);
1784 	error = cpuset_which(which, id, &p, &ttd, &set);
1785 	if (error)
1786 		return (error);
1787 	switch (which) {
1788 	case CPU_WHICH_TID:
1789 	case CPU_WHICH_PID:
1790 		thread_lock(ttd);
1791 		set = cpuset_refbase(ttd->td_cpuset);
1792 		thread_unlock(ttd);
1793 		PROC_UNLOCK(p);
1794 		break;
1795 	case CPU_WHICH_CPUSET:
1796 	case CPU_WHICH_JAIL:
1797 		break;
1798 	case CPU_WHICH_IRQ:
1799 	case CPU_WHICH_DOMAIN:
1800 		return (EINVAL);
1801 	}
1802 	switch (level) {
1803 	case CPU_LEVEL_ROOT:
1804 		nset = cpuset_refroot(set);
1805 		cpuset_rel(set);
1806 		set = nset;
1807 		break;
1808 	case CPU_LEVEL_CPUSET:
1809 		break;
1810 	case CPU_LEVEL_WHICH:
1811 		break;
1812 	}
1813 	tmpid = set->cs_id;
1814 	cpuset_rel(set);
1815 	if (error == 0)
1816 		error = copyout(&tmpid, setid, sizeof(tmpid));
1817 
1818 	return (error);
1819 }
1820 
1821 #ifndef _SYS_SYSPROTO_H_
1822 struct cpuset_getaffinity_args {
1823 	cpulevel_t	level;
1824 	cpuwhich_t	which;
1825 	id_t		id;
1826 	size_t		cpusetsize;
1827 	cpuset_t	*mask;
1828 };
1829 #endif
1830 int
1831 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
1832 {
1833 
1834 	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
1835 	    uap->id, uap->cpusetsize, uap->mask));
1836 }
1837 
1838 int
1839 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
1840     id_t id, size_t cpusetsize, cpuset_t *maskp)
1841 {
1842 	struct thread *ttd;
1843 	struct cpuset *nset;
1844 	struct cpuset *set;
1845 	struct proc *p;
1846 	cpuset_t *mask;
1847 	int error;
1848 	size_t size;
1849 
1850 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
1851 		return (ERANGE);
1852 	error = cpuset_check_capabilities(td, level, which, id);
1853 	if (error != 0)
1854 		return (error);
1855 	size = cpusetsize;
1856 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
1857 	error = cpuset_which(which, id, &p, &ttd, &set);
1858 	if (error)
1859 		goto out;
1860 	switch (level) {
1861 	case CPU_LEVEL_ROOT:
1862 	case CPU_LEVEL_CPUSET:
1863 		switch (which) {
1864 		case CPU_WHICH_TID:
1865 		case CPU_WHICH_PID:
1866 			thread_lock(ttd);
1867 			set = cpuset_ref(ttd->td_cpuset);
1868 			thread_unlock(ttd);
1869 			break;
1870 		case CPU_WHICH_CPUSET:
1871 		case CPU_WHICH_JAIL:
1872 			break;
1873 		case CPU_WHICH_IRQ:
1874 		case CPU_WHICH_INTRHANDLER:
1875 		case CPU_WHICH_ITHREAD:
1876 		case CPU_WHICH_DOMAIN:
1877 			error = EINVAL;
1878 			goto out;
1879 		}
1880 		if (level == CPU_LEVEL_ROOT)
1881 			nset = cpuset_refroot(set);
1882 		else
1883 			nset = cpuset_refbase(set);
1884 		CPU_COPY(&nset->cs_mask, mask);
1885 		cpuset_rel(nset);
1886 		break;
1887 	case CPU_LEVEL_WHICH:
1888 		switch (which) {
1889 		case CPU_WHICH_TID:
1890 			thread_lock(ttd);
1891 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
1892 			thread_unlock(ttd);
1893 			break;
1894 		case CPU_WHICH_PID:
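			/* Report the union of all thread masks in the proc. */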
1895 			FOREACH_THREAD_IN_PROC(p, ttd) {
1896 				thread_lock(ttd);
1897 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
1898 				thread_unlock(ttd);
1899 			}
1900 			break;
1901 		case CPU_WHICH_CPUSET:
1902 		case CPU_WHICH_JAIL:
1903 			CPU_COPY(&set->cs_mask, mask);
1904 			break;
1905 		case CPU_WHICH_IRQ:
1906 		case CPU_WHICH_INTRHANDLER:
1907 		case CPU_WHICH_ITHREAD:
1908 			error = intr_getaffinity(id, which, mask);
1909 			break;
1910 		case CPU_WHICH_DOMAIN:
1911 			if (id < 0 || id >= MAXMEMDOM)
1912 				error = ESRCH;
1913 			else
1914 				CPU_COPY(&cpuset_domain[id], mask);
1915 			break;
1916 		}
1917 		break;
1918 	default:
1919 		error = EINVAL;
1920 		break;
1921 	}
1922 	if (set)
1923 		cpuset_rel(set);
1924 	if (p)
1925 		PROC_UNLOCK(p);
1926 	if (error == 0)
1927 		error = copyout(mask, maskp, size);
1928 out:
1929 	free(mask, M_TEMP);
1930 	return (error);
1931 }
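/*
 * Illustrative userland sketch (an editorial example, not part of this
 * file): retrieving the calling thread's mask via the
 * cpuset_getaffinity(2) wrapper, which enters kern_cpuset_getaffinity()
 * above with CPU_LEVEL_WHICH/CPU_WHICH_TID.  Printing CPU_COUNT() of the
 * result is an assumption for demonstration.
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
	/* id -1 selects the calling thread for CPU_WHICH_TID. */
	if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_getaffinity");
	printf("runnable on %d CPU(s)\n", CPU_COUNT(&mask));
	return (0);
}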
1932 
1933 #ifndef _SYS_SYSPROTO_H_
1934 struct cpuset_setaffinity_args {
1935 	cpulevel_t	level;
1936 	cpuwhich_t	which;
1937 	id_t		id;
1938 	size_t		cpusetsize;
1939 	const cpuset_t	*mask;
1940 };
1941 #endif
1942 int
1943 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
1944 {
1945 
1946 	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
1947 	    uap->id, uap->cpusetsize, uap->mask));
1948 }
1949 
1950 int
1951 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
1952     id_t id, size_t cpusetsize, const cpuset_t *maskp)
1953 {
1954 	struct cpuset *nset;
1955 	struct cpuset *set;
1956 	struct thread *ttd;
1957 	struct proc *p;
1958 	cpuset_t *mask;
1959 	int error;
1960 
1961 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
1962 		return (ERANGE);
1963 	error = cpuset_check_capabilities(td, level, which, id);
1964 	if (error != 0)
1965 		return (error);
1966 	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
1967 	error = copyin(maskp, mask, cpusetsize);
1968 	if (error)
1969 		goto out;
1970 	/*
1971 	 * Verify that no high bits are set.
1972 	 */
1973 	if (cpusetsize > sizeof(cpuset_t)) {
1974 		char *end;
1975 		char *cp;
1976 
1977 		end = cp = (char *)&mask->__bits;
1978 		end += cpusetsize;
1979 		cp += sizeof(cpuset_t);
1980 		while (cp != end)
1981 			if (*cp++ != 0) {
1982 				error = EINVAL;
1983 				goto out;
1984 			}
1985 	}
1986 	switch (level) {
1987 	case CPU_LEVEL_ROOT:
1988 	case CPU_LEVEL_CPUSET:
1989 		error = cpuset_which(which, id, &p, &ttd, &set);
1990 		if (error)
1991 			break;
1992 		switch (which) {
1993 		case CPU_WHICH_TID:
1994 		case CPU_WHICH_PID:
1995 			thread_lock(ttd);
1996 			set = cpuset_ref(ttd->td_cpuset);
1997 			thread_unlock(ttd);
1998 			PROC_UNLOCK(p);
1999 			break;
2000 		case CPU_WHICH_CPUSET:
2001 		case CPU_WHICH_JAIL:
2002 			break;
2003 		case CPU_WHICH_IRQ:
2004 		case CPU_WHICH_INTRHANDLER:
2005 		case CPU_WHICH_ITHREAD:
2006 		case CPU_WHICH_DOMAIN:
2007 			error = EINVAL;
2008 			goto out;
2009 		}
2010 		if (level == CPU_LEVEL_ROOT)
2011 			nset = cpuset_refroot(set);
2012 		else
2013 			nset = cpuset_refbase(set);
2014 		error = cpuset_modify(nset, mask);
2015 		cpuset_rel(nset);
2016 		cpuset_rel(set);
2017 		break;
2018 	case CPU_LEVEL_WHICH:
2019 		switch (which) {
2020 		case CPU_WHICH_TID:
2021 			error = cpuset_setthread(id, mask);
2022 			break;
2023 		case CPU_WHICH_PID:
2024 			error = cpuset_setproc(id, NULL, mask, NULL, false);
2025 			break;
2026 		case CPU_WHICH_CPUSET:
2027 		case CPU_WHICH_JAIL:
2028 			error = cpuset_which(which, id, &p, &ttd, &set);
2029 			if (error == 0) {
2030 				error = cpuset_modify(set, mask);
2031 				cpuset_rel(set);
2032 			}
2033 			break;
2034 		case CPU_WHICH_IRQ:
2035 		case CPU_WHICH_INTRHANDLER:
2036 		case CPU_WHICH_ITHREAD:
2037 			error = intr_setaffinity(id, which, mask);
2038 			break;
2039 		default:
2040 			error = EINVAL;
2041 			break;
2042 		}
2043 		break;
2044 	default:
2045 		error = EINVAL;
2046 		break;
2047 	}
2048 out:
2049 	free(mask, M_TEMP);
2050 	return (error);
2051 }
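/*
 * Illustrative userland sketch (an editorial example, not part of this
 * file): pinning the calling thread with the cpuset_setaffinity(2)
 * wrapper, which reaches the CPU_LEVEL_WHICH/CPU_WHICH_TID branch of
 * kern_cpuset_setaffinity() above.  Pinning to CPU 0 is an assumption;
 * the call fails if that CPU is not in the thread's allowed set.
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <err.h>

int
main(void)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_setaffinity");
	return (0);
}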
2052 
2053 #ifndef _SYS_SYSPROTO_H_
2054 struct cpuset_getdomain_args {
2055 	cpulevel_t	level;
2056 	cpuwhich_t	which;
2057 	id_t		id;
2058 	size_t		domainsetsize;
2059 	domainset_t	*mask;
2060 	int 		*policy;
2061 };
2062 #endif
2063 int
2064 sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap)
2065 {
2066 
2067 	return (kern_cpuset_getdomain(td, uap->level, uap->which,
2068 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
2069 }
2070 
2071 int
2072 kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
2073     id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp)
2074 {
2075 	struct domainset outset;
2076 	struct thread *ttd;
2077 	struct cpuset *nset;
2078 	struct cpuset *set;
2079 	struct domainset *dset;
2080 	struct proc *p;
2081 	domainset_t *mask;
2082 	int error;
2083 
2084 	if (domainsetsize < sizeof(domainset_t) ||
2085 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
2086 		return (ERANGE);
2087 	error = cpuset_check_capabilities(td, level, which, id);
2088 	if (error != 0)
2089 		return (error);
2090 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
2091 	bzero(&outset, sizeof(outset));
2092 	error = cpuset_which(which, id, &p, &ttd, &set);
2093 	if (error)
2094 		goto out;
2095 	switch (level) {
2096 	case CPU_LEVEL_ROOT:
2097 	case CPU_LEVEL_CPUSET:
2098 		switch (which) {
2099 		case CPU_WHICH_TID:
2100 		case CPU_WHICH_PID:
2101 			thread_lock(ttd);
2102 			set = cpuset_ref(ttd->td_cpuset);
2103 			thread_unlock(ttd);
2104 			break;
2105 		case CPU_WHICH_CPUSET:
2106 		case CPU_WHICH_JAIL:
2107 			break;
2108 		case CPU_WHICH_IRQ:
2109 		case CPU_WHICH_INTRHANDLER:
2110 		case CPU_WHICH_ITHREAD:
2111 		case CPU_WHICH_DOMAIN:
2112 			error = EINVAL;
2113 			goto out;
2114 		}
2115 		if (level == CPU_LEVEL_ROOT)
2116 			nset = cpuset_refroot(set);
2117 		else
2118 			nset = cpuset_refbase(set);
2119 		domainset_copy(nset->cs_domain, &outset);
2120 		cpuset_rel(nset);
2121 		break;
2122 	case CPU_LEVEL_WHICH:
2123 		switch (which) {
2124 		case CPU_WHICH_TID:
2125 			thread_lock(ttd);
2126 			domainset_copy(ttd->td_cpuset->cs_domain, &outset);
2127 			thread_unlock(ttd);
2128 			break;
2129 		case CPU_WHICH_PID:
2130 			FOREACH_THREAD_IN_PROC(p, ttd) {
2131 				thread_lock(ttd);
2132 				dset = ttd->td_cpuset->cs_domain;
2133 				/* Show all domains in the proc. */
2134 				DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask);
2135 				/* Last policy wins. */
2136 				outset.ds_policy = dset->ds_policy;
2137 				outset.ds_prefer = dset->ds_prefer;
2138 				thread_unlock(ttd);
2139 			}
2140 			break;
2141 		case CPU_WHICH_CPUSET:
2142 		case CPU_WHICH_JAIL:
2143 			domainset_copy(set->cs_domain, &outset);
2144 			break;
2145 		case CPU_WHICH_IRQ:
2146 		case CPU_WHICH_INTRHANDLER:
2147 		case CPU_WHICH_ITHREAD:
2148 		case CPU_WHICH_DOMAIN:
2149 			error = EINVAL;
2150 			break;
2151 		}
2152 		break;
2153 	default:
2154 		error = EINVAL;
2155 		break;
2156 	}
2157 	if (set)
2158 		cpuset_rel(set);
2159 	if (p)
2160 		PROC_UNLOCK(p);
2161 	/*
2162 	 * Translate prefer into a set containing only the preferred domain,
2163 	 * not the entire fallback set.
2164 	 */
2165 	if (outset.ds_policy == DOMAINSET_POLICY_PREFER) {
2166 		DOMAINSET_ZERO(&outset.ds_mask);
2167 		DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask);
2168 	}
2169 	DOMAINSET_COPY(&outset.ds_mask, mask);
2170 	if (error == 0)
2171 		error = copyout(mask, maskp, domainsetsize);
2172 	if (error == 0)
2173 		if (suword32(policyp, outset.ds_policy) != 0)
2174 			error = EFAULT;
2175 out:
2176 	free(mask, M_TEMP);
2177 	return (error);
2178 }
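/*
 * Illustrative userland sketch (an editorial example, not part of this
 * file): reading the calling process's memory-domain policy through the
 * cpuset_getdomain(2) wrapper, which lands in kern_cpuset_getdomain()
 * above with CPU_LEVEL_WHICH/CPU_WHICH_PID.  The printed summary is an
 * assumption for demonstration.
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	domainset_t mask;
	int policy;

	DOMAINSET_ZERO(&mask);
	if (cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, &policy) != 0)
		err(1, "cpuset_getdomain");
	printf("policy %d, %d domain(s) in mask\n", policy,
	    DOMAINSET_COUNT(&mask));
	return (0);
}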
2179 
2180 #ifndef _SYS_SYSPROTO_H_
2181 struct cpuset_setdomain_args {
2182 	cpulevel_t	level;
2183 	cpuwhich_t	which;
2184 	id_t		id;
2185 	size_t		domainsetsize;
2186 	domainset_t	*mask;
2187 	int 		policy;
2188 };
2189 #endif
2190 int
2191 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
2192 {
2193 
2194 	return (kern_cpuset_setdomain(td, uap->level, uap->which,
2195 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
2196 }
2197 
2198 int
2199 kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
2200     id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
2201 {
2202 	struct cpuset *nset;
2203 	struct cpuset *set;
2204 	struct thread *ttd;
2205 	struct proc *p;
2206 	struct domainset domain;
2207 	domainset_t *mask;
2208 	int error;
2209 
2210 	if (domainsetsize < sizeof(domainset_t) ||
2211 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
2212 		return (ERANGE);
2213 	if (policy <= DOMAINSET_POLICY_INVALID ||
2214 	    policy > DOMAINSET_POLICY_MAX)
2215 		return (EINVAL);
2216 	error = cpuset_check_capabilities(td, level, which, id);
2217 	if (error != 0)
2218 		return (error);
2219 	memset(&domain, 0, sizeof(domain));
2220 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
2221 	error = copyin(maskp, mask, domainsetsize);
2222 	if (error)
2223 		goto out;
2224 	/*
2225 	 * Verify that no high bits are set.
2226 	 */
2227 	if (domainsetsize > sizeof(domainset_t)) {
2228 		char *end;
2229 		char *cp;
2230 
2231 		end = cp = (char *)&mask->__bits;
2232 		end += domainsetsize;
2233 		cp += sizeof(domainset_t);
2234 		while (cp != end)
2235 			if (*cp++ != 0) {
2236 				error = EINVAL;
2237 				goto out;
2238 			}
2239 	}
2240 	DOMAINSET_COPY(mask, &domain.ds_mask);
2241 	domain.ds_policy = policy;
2242 
2243 	/*
2244 	 * Sanitize the provided mask.
2245 	 */
2246 	if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) {
2247 		error = EINVAL;
2248 		goto out;
2249 	}
2250 
2251 	/* Translate preferred policy into a mask and fallback. */
2252 	if (policy == DOMAINSET_POLICY_PREFER) {
2253 		/* Only support a single preferred domain. */
2254 		if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
2255 			error = EINVAL;
2256 			goto out;
2257 		}
2258 		domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
2259 		/* This will be constrained by domainset_shadow(). */
2260 		DOMAINSET_COPY(&all_domains, &domain.ds_mask);
2261 	}
2262 
2263 	/*
2264 	 * When given an impossible policy, fall back to interleaving
2265 	 * across all domains.
2266 	 */
2267 	if (domainset_empty_vm(&domain))
2268 		domainset_copy(&domainset2, &domain);
2269 
2270 	switch (level) {
2271 	case CPU_LEVEL_ROOT:
2272 	case CPU_LEVEL_CPUSET:
2273 		error = cpuset_which(which, id, &p, &ttd, &set);
2274 		if (error)
2275 			break;
2276 		switch (which) {
2277 		case CPU_WHICH_TID:
2278 		case CPU_WHICH_PID:
2279 			thread_lock(ttd);
2280 			set = cpuset_ref(ttd->td_cpuset);
2281 			thread_unlock(ttd);
2282 			PROC_UNLOCK(p);
2283 			break;
2284 		case CPU_WHICH_CPUSET:
2285 		case CPU_WHICH_JAIL:
2286 			break;
2287 		case CPU_WHICH_IRQ:
2288 		case CPU_WHICH_INTRHANDLER:
2289 		case CPU_WHICH_ITHREAD:
2290 		case CPU_WHICH_DOMAIN:
2291 			error = EINVAL;
2292 			goto out;
2293 		}
2294 		if (level == CPU_LEVEL_ROOT)
2295 			nset = cpuset_refroot(set);
2296 		else
2297 			nset = cpuset_refbase(set);
2298 		error = cpuset_modify_domain(nset, &domain);
2299 		cpuset_rel(nset);
2300 		cpuset_rel(set);
2301 		break;
2302 	case CPU_LEVEL_WHICH:
2303 		switch (which) {
2304 		case CPU_WHICH_TID:
2305 			error = _cpuset_setthread(id, NULL, &domain);
2306 			break;
2307 		case CPU_WHICH_PID:
2308 			error = cpuset_setproc(id, NULL, NULL, &domain, false);
2309 			break;
2310 		case CPU_WHICH_CPUSET:
2311 		case CPU_WHICH_JAIL:
2312 			error = cpuset_which(which, id, &p, &ttd, &set);
2313 			if (error == 0) {
2314 				error = cpuset_modify_domain(set, &domain);
2315 				cpuset_rel(set);
2316 			}
2317 			break;
2318 		case CPU_WHICH_IRQ:
2319 		case CPU_WHICH_INTRHANDLER:
2320 		case CPU_WHICH_ITHREAD:
2321 		default:
2322 			error = EINVAL;
2323 			break;
2324 		}
2325 		break;
2326 	default:
2327 		error = EINVAL;
2328 		break;
2329 	}
2330 out:
2331 	free(mask, M_TEMP);
2332 	return (error);
2333 }
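/*
 * Illustrative userland sketch (an editorial example, not part of this
 * file): requesting a preferred-domain policy with the cpuset_setdomain(2)
 * wrapper.  As enforced by the DOMAINSET_COUNT() == 1 check in
 * kern_cpuset_setdomain() above, DOMAINSET_POLICY_PREFER takes exactly
 * one domain in the mask; domain 0 is assumed here for demonstration.
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/domainset.h>
#include <err.h>

int
main(void)
{
	domainset_t mask;

	DOMAINSET_ZERO(&mask);
	DOMAINSET_SET(0, &mask);
	if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0)
		err(1, "cpuset_setdomain");
	return (0);
}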
2334 
2335 #ifdef DDB
2336 
2337 static void
2338 ddb_display_bitset(const struct bitset *set, int size)
2339 {
2340 	int bit, once;
2341 
2342 	for (once = 0, bit = 0; bit < size; bit++) {
2343 		if (CPU_ISSET(bit, set)) {
2344 			if (once == 0) {
2345 				db_printf("%d", bit);
2346 				once = 1;
2347 			} else
2348 				db_printf(",%d", bit);
2349 		}
2350 	}
2351 	if (once == 0)
2352 		db_printf("<none>");
2353 }
2354 
2355 void
2356 ddb_display_cpuset(const cpuset_t *set)
2357 {
2358 	ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE);
2359 }
2360 
2361 static void
2362 ddb_display_domainset(const domainset_t *set)
2363 {
2364 	ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE);
2365 }
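/*
 * The commands below are entered from the kernel debugger prompt as
 * "show cpusets" and "show domainsets".
 */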
2366 
2367 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
2368 {
2369 	struct cpuset *set;
2370 
2371 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
2372 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
2373 		    set, set->cs_id, refcount_load(&set->cs_ref), set->cs_flags,
2374 		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
2375 		db_printf("  cpu mask=");
2376 		ddb_display_cpuset(&set->cs_mask);
2377 		db_printf("\n");
2378 		db_printf("  domain policy %d prefer %d mask=",
2379 		    set->cs_domain->ds_policy, set->cs_domain->ds_prefer);
2380 		ddb_display_domainset(&set->cs_domain->ds_mask);
2381 		db_printf("\n");
2382 		if (db_pager_quit)
2383 			break;
2384 	}
2385 }
2386 
2387 DB_SHOW_COMMAND(domainsets, db_show_domainsets)
2388 {
2389 	struct domainset *set;
2390 
2391 	LIST_FOREACH(set, &cpuset_domains, ds_link) {
2392 		db_printf("set=%p policy %d prefer %d cnt %d\n",
2393 		    set, set->ds_policy, set->ds_prefer, set->ds_cnt);
2394 		db_printf("  mask =");
2395 		ddb_display_domainset(&set->ds_mask);
2396 		db_printf("\n");
2397 	}
2398 }
2399 #endif /* DDB */
2400