xref: /freebsd/sys/kern/kern_cpuset.c (revision 59c3cb81c1769fdb6c840c971df129b52f4a848d)
1 /*-
2  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Copyright (c) 2008 Nokia Corporation
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice unmodified, this list of conditions, and the following
13  *    disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/sysproto.h>
39 #include <sys/jail.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
43 #include <sys/mutex.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/refcount.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/cpuset.h>
51 #include <sys/sx.h>
52 #include <sys/queue.h>
53 #include <sys/libkern.h>
54 #include <sys/limits.h>
55 #include <sys/bus.h>
56 #include <sys/interrupt.h>
57 
58 #include <vm/uma.h>
59 #include <vm/vm.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_param.h>
62 #include <vm/vm_phys.h>
63 
64 #ifdef DDB
65 #include <ddb/ddb.h>
66 #endif /* DDB */
67 
68 /*
69  * cpusets provide a mechanism for creating and manipulating sets of
70  * processors for the purpose of constraining the scheduling of threads to
71  * specific processors.
72  *
73  * Each process belongs to an identified set, by default this is set 1.  Each
74  * thread may further restrict the cpus it may run on to a subset of this
75  * named set.  This creates an anonymous set which other threads and processes
76  * may not join by number.
77  *
78  * The named set is referred to herein as the 'base' set to avoid ambiguity.
79  * This set is usually a child of a 'root' set while the anonymous set may
80  * simply be referred to as a mask.  In the syscall api these are referred to
81  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
82  *
83  * Threads inherit their set from their creator whether it be anonymous or
84  * not.  This means that anonymous sets are immutable because they may be
85  * shared.  To modify an anonymous set a new set is created with the desired
86  * mask and the same parent as the existing anonymous set.  This gives the
87  * illusion of each thread having a private mask.
88  *
89  * Via the syscall apis a user may ask to retrieve or modify the root, base,
90  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
91  * modifies all numbered and anonymous child sets to comply with the new mask.
92  * Modifying a pid or tid's mask applies only to that tid but must still
93  * exist within the assigned parent set.
94  *
95  * A thread may not be assigned to a group separate from other threads in
96  * the process.  This is to remove ambiguity when the setid is queried with
97  * a pid argument.  There is no other technical limitation.
98  *
99  * This somewhat complex arrangement is intended to make it easy for
100  * applications to query available processors and bind their threads to
101  * specific processors while also allowing administrators to dynamically
102  * reprovision by changing sets which apply to groups of processes.
103  *
104  * A simple application should not concern itself with sets at all and
105  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
106  * meaning 'curthread'.  It may query available cpus for that tid with a
107  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
108  */
/* UMA zone from which every struct cpuset in this file is allocated. */
static uma_zone_t cpuset_zone;
/* Spin mutex held around changes to the set tree and the id list. */
static struct mtx cpuset_lock;
/* List of numbered (non-anonymous) sets, linked through cs_link. */
static struct setlist cpuset_ids;
/* Unit-number allocator that hands out ids for numbered sets. */
static struct unrhdr *cpuset_unr;
/* Set 0 (the system root set) and set 1 (the default set). */
static struct cpuset *cpuset_zero, *cpuset_default;

/* Return the size of cpuset_t at the kernel level */
SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
    SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");

/* Mask of the root set; pointed at cpuset_zero's cs_mask by cpuset_thread0(). */
cpuset_t *cpuset_root;
/* Per memory-domain CPU masks, indexed by domain number. */
cpuset_t cpuset_domain[MAXMEMDOM];
121 
122 /*
123  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
124  */
125 struct cpuset *
126 cpuset_ref(struct cpuset *set)
127 {
128 
129 	refcount_acquire(&set->cs_ref);
130 	return (set);
131 }
132 
133 /*
134  * Walks up the tree from 'set' to find the root.  Returns the root
135  * referenced.
136  */
137 static struct cpuset *
138 cpuset_refroot(struct cpuset *set)
139 {
140 
141 	for (; set->cs_parent != NULL; set = set->cs_parent)
142 		if (set->cs_flags & CPU_SET_ROOT)
143 			break;
144 	cpuset_ref(set);
145 
146 	return (set);
147 }
148 
149 /*
150  * Find the first non-anonymous set starting from 'set'.  Returns this set
151  * referenced.  May return the passed in set with an extra ref if it is
152  * not anonymous.
153  */
154 static struct cpuset *
155 cpuset_refbase(struct cpuset *set)
156 {
157 
158 	if (set->cs_id == CPUSET_INVALID)
159 		set = set->cs_parent;
160 	cpuset_ref(set);
161 
162 	return (set);
163 }
164 
165 /*
166  * Release a reference in a context where it is safe to allocate.
167  */
168 void
169 cpuset_rel(struct cpuset *set)
170 {
171 	cpusetid_t id;
172 
173 	if (refcount_release(&set->cs_ref) == 0)
174 		return;
175 	mtx_lock_spin(&cpuset_lock);
176 	LIST_REMOVE(set, cs_siblings);
177 	id = set->cs_id;
178 	if (id != CPUSET_INVALID)
179 		LIST_REMOVE(set, cs_link);
180 	mtx_unlock_spin(&cpuset_lock);
181 	cpuset_rel(set->cs_parent);
182 	uma_zfree(cpuset_zone, set);
183 	if (id != CPUSET_INVALID)
184 		free_unr(cpuset_unr, id);
185 }
186 
187 /*
188  * Deferred release must be used when in a context that is not safe to
189  * allocate/free.  This places any unreferenced sets on the list 'head'.
190  */
191 static void
192 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
193 {
194 
195 	if (refcount_release(&set->cs_ref) == 0)
196 		return;
197 	mtx_lock_spin(&cpuset_lock);
198 	LIST_REMOVE(set, cs_siblings);
199 	if (set->cs_id != CPUSET_INVALID)
200 		LIST_REMOVE(set, cs_link);
201 	LIST_INSERT_HEAD(head, set, cs_link);
202 	mtx_unlock_spin(&cpuset_lock);
203 }
204 
205 /*
206  * Complete a deferred release.  Removes the set from the list provided to
207  * cpuset_rel_defer.
208  */
209 static void
210 cpuset_rel_complete(struct cpuset *set)
211 {
212 	LIST_REMOVE(set, cs_link);
213 	cpuset_rel(set->cs_parent);
214 	uma_zfree(cpuset_zone, set);
215 }
216 
217 /*
218  * Find a set based on an id.  Returns it with a ref.
219  */
220 static struct cpuset *
221 cpuset_lookup(cpusetid_t setid, struct thread *td)
222 {
223 	struct cpuset *set;
224 
225 	if (setid == CPUSET_INVALID)
226 		return (NULL);
227 	mtx_lock_spin(&cpuset_lock);
228 	LIST_FOREACH(set, &cpuset_ids, cs_link)
229 		if (set->cs_id == setid)
230 			break;
231 	if (set)
232 		cpuset_ref(set);
233 	mtx_unlock_spin(&cpuset_lock);
234 
235 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
236 	if (set != NULL && jailed(td->td_ucred)) {
237 		struct cpuset *jset, *tset;
238 
239 		jset = td->td_ucred->cr_prison->pr_cpuset;
240 		for (tset = set; tset != NULL; tset = tset->cs_parent)
241 			if (tset == jset)
242 				break;
243 		if (tset == NULL) {
244 			cpuset_rel(set);
245 			set = NULL;
246 		}
247 	}
248 
249 	return (set);
250 }
251 
252 /*
253  * Create a set in the space provided in 'set' with the provided parameters.
254  * The set is returned with a single ref.  May return EDEADLK if the set
255  * will have no valid cpu based on restrictions from the parent.
256  */
257 static int
258 _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
259     cpusetid_t id)
260 {
261 
262 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
263 		return (EDEADLK);
264 	CPU_COPY(mask, &set->cs_mask);
265 	LIST_INIT(&set->cs_children);
266 	refcount_init(&set->cs_ref, 1);
267 	set->cs_flags = 0;
268 	mtx_lock_spin(&cpuset_lock);
269 	CPU_AND(&set->cs_mask, &parent->cs_mask);
270 	set->cs_id = id;
271 	set->cs_parent = cpuset_ref(parent);
272 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
273 	if (set->cs_id != CPUSET_INVALID)
274 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
275 	mtx_unlock_spin(&cpuset_lock);
276 
277 	return (0);
278 }
279 
280 /*
281  * Create a new non-anonymous set with the requested parent and mask.  May
282  * return failures if the mask is invalid or a new number can not be
283  * allocated.
284  */
285 static int
286 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
287 {
288 	struct cpuset *set;
289 	cpusetid_t id;
290 	int error;
291 
292 	id = alloc_unr(cpuset_unr);
293 	if (id == -1)
294 		return (ENFILE);
295 	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
296 	error = _cpuset_create(set, parent, mask, id);
297 	if (error == 0)
298 		return (0);
299 	free_unr(cpuset_unr, id);
300 	uma_zfree(cpuset_zone, set);
301 
302 	return (error);
303 }
304 
305 /*
306  * Recursively check for errors that would occur from applying mask to
307  * the tree of sets starting at 'set'.  Checks for sets that would become
308  * empty as well as RDONLY flags.
309  */
310 static int
311 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
312 {
313 	struct cpuset *nset;
314 	cpuset_t newmask;
315 	int error;
316 
317 	mtx_assert(&cpuset_lock, MA_OWNED);
318 	if (set->cs_flags & CPU_SET_RDONLY)
319 		return (EPERM);
320 	if (check_mask) {
321 		if (!CPU_OVERLAP(&set->cs_mask, mask))
322 			return (EDEADLK);
323 		CPU_COPY(&set->cs_mask, &newmask);
324 		CPU_AND(&newmask, mask);
325 	} else
326 		CPU_COPY(mask, &newmask);
327 	error = 0;
328 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
329 		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
330 			break;
331 	return (error);
332 }
333 
334 /*
335  * Applies the mask 'mask' without checking for empty sets or permissions.
336  */
337 static void
338 cpuset_update(struct cpuset *set, cpuset_t *mask)
339 {
340 	struct cpuset *nset;
341 
342 	mtx_assert(&cpuset_lock, MA_OWNED);
343 	CPU_AND(&set->cs_mask, mask);
344 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
345 		cpuset_update(nset, &set->cs_mask);
346 
347 	return;
348 }
349 
350 /*
351  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
352  * mask to restrict all children in the tree.  Checks for validity before
353  * applying the changes.
354  */
355 static int
356 cpuset_modify(struct cpuset *set, cpuset_t *mask)
357 {
358 	struct cpuset *root;
359 	int error;
360 
361 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
362 	if (error)
363 		return (error);
364 	/*
365 	 * In case we are called from within the jail
366 	 * we do not allow modifying the dedicated root
367 	 * cpuset of the jail but may still allow to
368 	 * change child sets.
369 	 */
370 	if (jailed(curthread->td_ucred) &&
371 	    set->cs_flags & CPU_SET_ROOT)
372 		return (EPERM);
373 	/*
374 	 * Verify that we have access to this set of
375 	 * cpus.
376 	 */
377 	root = set->cs_parent;
378 	if (root && !CPU_SUBSET(&root->cs_mask, mask))
379 		return (EINVAL);
380 	mtx_lock_spin(&cpuset_lock);
381 	error = cpuset_testupdate(set, mask, 0);
382 	if (error)
383 		goto out;
384 	CPU_COPY(mask, &set->cs_mask);
385 	cpuset_update(set, mask);
386 out:
387 	mtx_unlock_spin(&cpuset_lock);
388 
389 	return (error);
390 }
391 
392 /*
393  * Resolve the 'which' parameter of several cpuset apis.
394  *
395  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
396  * checks for permission via p_cansched().
397  *
398  * For WHICH_SET returns a valid set with a new reference.
399  *
400  * -1 may be supplied for any argument to mean the current proc/thread or
401  * the base set of the current thread.  May fail with ESRCH/EPERM.
402  */
403 int
404 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
405     struct cpuset **setp)
406 {
407 	struct cpuset *set;
408 	struct thread *td;
409 	struct proc *p;
410 	int error;
411 
412 	*pp = p = NULL;
413 	*tdp = td = NULL;
414 	*setp = set = NULL;
415 	switch (which) {
416 	case CPU_WHICH_PID:
417 		if (id == -1) {
418 			PROC_LOCK(curproc);
419 			p = curproc;
420 			break;
421 		}
422 		if ((p = pfind(id)) == NULL)
423 			return (ESRCH);
424 		break;
425 	case CPU_WHICH_TID:
426 		if (id == -1) {
427 			PROC_LOCK(curproc);
428 			p = curproc;
429 			td = curthread;
430 			break;
431 		}
432 		td = tdfind(id, -1);
433 		if (td == NULL)
434 			return (ESRCH);
435 		p = td->td_proc;
436 		break;
437 	case CPU_WHICH_CPUSET:
438 		if (id == -1) {
439 			thread_lock(curthread);
440 			set = cpuset_refbase(curthread->td_cpuset);
441 			thread_unlock(curthread);
442 		} else
443 			set = cpuset_lookup(id, curthread);
444 		if (set) {
445 			*setp = set;
446 			return (0);
447 		}
448 		return (ESRCH);
449 	case CPU_WHICH_JAIL:
450 	{
451 		/* Find `set' for prison with given id. */
452 		struct prison *pr;
453 
454 		sx_slock(&allprison_lock);
455 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
456 		sx_sunlock(&allprison_lock);
457 		if (pr == NULL)
458 			return (ESRCH);
459 		cpuset_ref(pr->pr_cpuset);
460 		*setp = pr->pr_cpuset;
461 		mtx_unlock(&pr->pr_mtx);
462 		return (0);
463 	}
464 	case CPU_WHICH_IRQ:
465 	case CPU_WHICH_DOMAIN:
466 		return (0);
467 	default:
468 		return (EINVAL);
469 	}
470 	error = p_cansched(curthread, p);
471 	if (error) {
472 		PROC_UNLOCK(p);
473 		return (error);
474 	}
475 	if (td == NULL)
476 		td = FIRST_THREAD_IN_PROC(p);
477 	*pp = p;
478 	*tdp = td;
479 	return (0);
480 }
481 
482 /*
483  * Create an anonymous set with the provided mask in the space provided by
484  * 'fset'.  If the passed in set is anonymous we use its parent otherwise
485  * the new set is a child of 'set'.
486  */
487 static int
488 cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
489 {
490 	struct cpuset *parent;
491 
492 	if (set->cs_id == CPUSET_INVALID)
493 		parent = set->cs_parent;
494 	else
495 		parent = set;
496 	if (!CPU_SUBSET(&parent->cs_mask, mask))
497 		return (EDEADLK);
498 	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
499 }
500 
/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 *
 * 'pid' is resolved through cpuset_which(CPU_WHICH_PID, ...), so -1 means
 * the current process.  Returns 0 on success, the error from
 * cpuset_which(), or EDEADLK when the validation pass finds a thread that
 * the new set or mask cannot accommodate.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;	/* pre-allocated, not yet used sets */
	struct setlist droplist;	/* old sets awaiting final release */
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		/* One spare set per thread is enough; keep the proc locked. */
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		/*
		 * Allocate with the proc lock dropped; the thread count is
		 * re-checked on the next iteration because it may have grown.
		 */
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			/* Consume one of the pre-allocated sets. */
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				/* Unused: hand it back so it is freed at 'out'. */
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		/* The old set may hit zero refs; park it instead of freeing. */
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	/* All locks are dropped; finish deferred releases and free spares. */
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}
628 
629 /*
630  * Return a string representing a valid layout for a cpuset_t object.
631  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
632  */
633 char *
634 cpusetobj_strprint(char *buf, const cpuset_t *set)
635 {
636 	char *tbuf;
637 	size_t i, bytesp, bufsiz;
638 
639 	tbuf = buf;
640 	bytesp = 0;
641 	bufsiz = CPUSETBUFSIZ;
642 
643 	for (i = 0; i < (_NCPUWORDS - 1); i++) {
644 		bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
645 		bufsiz -= bytesp;
646 		tbuf += bytesp;
647 	}
648 	snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
649 	return (buf);
650 }
651 
652 /*
653  * Build a valid cpuset_t object from a string representation.
654  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
655  */
656 int
657 cpusetobj_strscan(cpuset_t *set, const char *buf)
658 {
659 	u_int nwords;
660 	int i, ret;
661 
662 	if (strlen(buf) > CPUSETBUFSIZ - 1)
663 		return (-1);
664 
665 	/* Allow to pass a shorter version of the mask when necessary. */
666 	nwords = 1;
667 	for (i = 0; buf[i] != '\0'; i++)
668 		if (buf[i] == ',')
669 			nwords++;
670 	if (nwords > _NCPUWORDS)
671 		return (-1);
672 
673 	CPU_ZERO(set);
674 	for (i = 0; i < (nwords - 1); i++) {
675 		ret = sscanf(buf, "%lx,", &set->__bits[i]);
676 		if (ret == 0 || ret == -1)
677 			return (-1);
678 		buf = strstr(buf, ",");
679 		if (buf == NULL)
680 			return (-1);
681 		buf++;
682 	}
683 	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
684 	if (ret == 0 || ret == -1)
685 		return (-1);
686 	return (0);
687 }
688 
689 /*
690  * Apply an anonymous mask to a single thread.
691  */
692 int
693 cpuset_setthread(lwpid_t id, cpuset_t *mask)
694 {
695 	struct cpuset *nset;
696 	struct cpuset *set;
697 	struct thread *td;
698 	struct proc *p;
699 	int error;
700 
701 	nset = uma_zalloc(cpuset_zone, M_WAITOK);
702 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
703 	if (error)
704 		goto out;
705 	set = NULL;
706 	thread_lock(td);
707 	error = cpuset_shadow(td->td_cpuset, nset, mask);
708 	if (error == 0) {
709 		set = td->td_cpuset;
710 		td->td_cpuset = nset;
711 		sched_affinity(td);
712 		nset = NULL;
713 	}
714 	thread_unlock(td);
715 	PROC_UNLOCK(p);
716 	if (set)
717 		cpuset_rel(set);
718 out:
719 	if (nset)
720 		uma_zfree(cpuset_zone, nset);
721 	return (error);
722 }
723 
724 /*
725  * Apply new cpumask to the ithread.
726  */
727 int
728 cpuset_setithread(lwpid_t id, int cpu)
729 {
730 	struct cpuset *nset, *rset;
731 	struct cpuset *parent, *old_set;
732 	struct thread *td;
733 	struct proc *p;
734 	cpusetid_t cs_id;
735 	cpuset_t mask;
736 	int error;
737 
738 	nset = uma_zalloc(cpuset_zone, M_WAITOK);
739 	rset = uma_zalloc(cpuset_zone, M_WAITOK);
740 	cs_id = CPUSET_INVALID;
741 
742 	CPU_ZERO(&mask);
743 	if (cpu == NOCPU)
744 		CPU_COPY(cpuset_root, &mask);
745 	else
746 		CPU_SET(cpu, &mask);
747 
748 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
749 	if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID))
750 		goto out;
751 
752 	/* cpuset_which() returns with PROC_LOCK held. */
753 	old_set = td->td_cpuset;
754 
755 	if (cpu == NOCPU) {
756 
757 		/*
758 		 * roll back to default set. We're not using cpuset_shadow()
759 		 * here because we can fail CPU_SUBSET() check. This can happen
760 		 * if default set does not contain all CPUs.
761 		 */
762 		error = _cpuset_create(nset, cpuset_default, &mask,
763 		    CPUSET_INVALID);
764 
765 		goto applyset;
766 	}
767 
768 	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
769 	    old_set->cs_parent->cs_id == 1)) {
770 
771 		/*
772 		 * Current set is either default (1) or
773 		 * shadowed version of default set.
774 		 *
775 		 * Allocate new root set to be able to shadow it
776 		 * with any mask.
777 		 */
778 		error = _cpuset_create(rset, cpuset_zero,
779 		    &cpuset_zero->cs_mask, cs_id);
780 		if (error != 0) {
781 			PROC_UNLOCK(p);
782 			goto out;
783 		}
784 		rset->cs_flags |= CPU_SET_ROOT;
785 		parent = rset;
786 		rset = NULL;
787 		cs_id = CPUSET_INVALID;
788 	} else {
789 		/* Assume existing set was already allocated by previous call */
790 		parent = old_set;
791 		old_set = NULL;
792 	}
793 
794 	error = cpuset_shadow(parent, nset, &mask);
795 applyset:
796 	if (error == 0) {
797 		thread_lock(td);
798 		td->td_cpuset = nset;
799 		sched_affinity(td);
800 		thread_unlock(td);
801 		nset = NULL;
802 	} else
803 		old_set = NULL;
804 	PROC_UNLOCK(p);
805 	if (old_set != NULL)
806 		cpuset_rel(old_set);
807 out:
808 	if (nset != NULL)
809 		uma_zfree(cpuset_zone, nset);
810 	if (rset != NULL)
811 		uma_zfree(cpuset_zone, rset);
812 	if (cs_id != CPUSET_INVALID)
813 		free_unr(cpuset_unr, cs_id);
814 	return (error);
815 }
816 
817 
818 /*
819  * Creates system-wide cpusets and the cpuset for thread0 including two
820  * sets:
821  *
822  * 0 - The root set which should represent all valid processors in the
823  *     system.  It is initially created with a mask of all processors
824  *     because we don't know what processors are valid until cpuset_init()
825  *     runs.  This set is immutable.
826  * 1 - The default set which all processes are a member of until changed.
827  *     This allows an administrator to move all threads off of given cpus to
828  *     dedicate them to high priority tasks or save power etc.
829  */
830 struct cpuset *
831 cpuset_thread0(void)
832 {
833 	struct cpuset *set;
834 	int error, i;
835 
836 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
837 	    NULL, NULL, UMA_ALIGN_PTR, 0);
838 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
839 
840 	/*
841 	 * Create the root system set for the whole machine.  Doesn't use
842 	 * cpuset_create() due to NULL parent.
843 	 */
844 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
845 	CPU_FILL(&set->cs_mask);
846 	LIST_INIT(&set->cs_children);
847 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
848 	set->cs_ref = 1;
849 	set->cs_flags = CPU_SET_ROOT;
850 	cpuset_zero = set;
851 	cpuset_root = &set->cs_mask;
852 
853 	/*
854 	 * Now derive a default, modifiable set from that to give out.
855 	 */
856 	set = uma_zalloc(cpuset_zone, M_WAITOK);
857 	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
858 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
859 	cpuset_default = set;
860 
861 	/*
862 	 * Initialize the unit allocator. 0 and 1 are allocated above.
863 	 */
864 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
865 
866 	/*
867 	 * If MD code has not initialized per-domain cpusets, place all
868 	 * CPUs in domain 0.
869 	 */
870 	for (i = 0; i < MAXMEMDOM; i++)
871 		if (!CPU_EMPTY(&cpuset_domain[i]))
872 			goto domains_set;
873 	CPU_COPY(&all_cpus, &cpuset_domain[0]);
874 domains_set:
875 
876 	return (set);
877 }
878 
879 /*
880  * Create a cpuset, which would be cpuset_create() but
881  * mark the new 'set' as root.
882  *
883  * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
884  * for that.
885  *
886  * In case of no error, returns the set in *setp locked with a reference.
887  */
888 int
889 cpuset_create_root(struct prison *pr, struct cpuset **setp)
890 {
891 	struct cpuset *set;
892 	int error;
893 
894 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
895 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
896 
897 	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
898 	if (error)
899 		return (error);
900 
901 	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
902 	    __func__, __LINE__));
903 
904 	/* Mark the set as root. */
905 	set = *setp;
906 	set->cs_flags |= CPU_SET_ROOT;
907 
908 	return (0);
909 }
910 
911 int
912 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
913 {
914 	int error;
915 
916 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
917 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
918 
919 	cpuset_ref(set);
920 	error = cpuset_setproc(p->p_pid, set, NULL);
921 	if (error)
922 		return (error);
923 	cpuset_rel(set);
924 	return (0);
925 }
926 
927 /*
928  * This is called once the final set of system cpus is known.  Modifies
929  * the root set and all children and mark the root read-only.
930  */
931 static void
932 cpuset_init(void *arg)
933 {
934 	cpuset_t mask;
935 
936 	mask = all_cpus;
937 	if (cpuset_modify(cpuset_zero, &mask))
938 		panic("Can't set initial cpuset mask.\n");
939 	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
940 }
941 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
942 
943 #ifndef _SYS_SYSPROTO_H_
944 struct cpuset_args {
945 	cpusetid_t	*setid;
946 };
947 #endif
948 int
949 sys_cpuset(struct thread *td, struct cpuset_args *uap)
950 {
951 	struct cpuset *root;
952 	struct cpuset *set;
953 	int error;
954 
955 	thread_lock(td);
956 	root = cpuset_refroot(td->td_cpuset);
957 	thread_unlock(td);
958 	error = cpuset_create(&set, root, &root->cs_mask);
959 	cpuset_rel(root);
960 	if (error)
961 		return (error);
962 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
963 	if (error == 0)
964 		error = cpuset_setproc(-1, set, NULL);
965 	cpuset_rel(set);
966 	return (error);
967 }
968 
969 #ifndef _SYS_SYSPROTO_H_
970 struct cpuset_setid_args {
971 	cpuwhich_t	which;
972 	id_t		id;
973 	cpusetid_t	setid;
974 };
975 #endif
976 int
977 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
978 {
979 	struct cpuset *set;
980 	int error;
981 
982 	/*
983 	 * Presently we only support per-process sets.
984 	 */
985 	if (uap->which != CPU_WHICH_PID)
986 		return (EINVAL);
987 	set = cpuset_lookup(uap->setid, td);
988 	if (set == NULL)
989 		return (ESRCH);
990 	error = cpuset_setproc(uap->id, set, NULL);
991 	cpuset_rel(set);
992 	return (error);
993 }
994 
995 #ifndef _SYS_SYSPROTO_H_
996 struct cpuset_getid_args {
997 	cpulevel_t	level;
998 	cpuwhich_t	which;
999 	id_t		id;
1000 	cpusetid_t	*setid;
1001 };
1002 #endif
1003 int
1004 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
1005 {
1006 	struct cpuset *nset;
1007 	struct cpuset *set;
1008 	struct thread *ttd;
1009 	struct proc *p;
1010 	cpusetid_t id;
1011 	int error;
1012 
1013 	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
1014 		return (EINVAL);
1015 	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
1016 	if (error)
1017 		return (error);
1018 	switch (uap->which) {
1019 	case CPU_WHICH_TID:
1020 	case CPU_WHICH_PID:
1021 		thread_lock(ttd);
1022 		set = cpuset_refbase(ttd->td_cpuset);
1023 		thread_unlock(ttd);
1024 		PROC_UNLOCK(p);
1025 		break;
1026 	case CPU_WHICH_CPUSET:
1027 	case CPU_WHICH_JAIL:
1028 		break;
1029 	case CPU_WHICH_IRQ:
1030 	case CPU_WHICH_DOMAIN:
1031 		return (EINVAL);
1032 	}
1033 	switch (uap->level) {
1034 	case CPU_LEVEL_ROOT:
1035 		nset = cpuset_refroot(set);
1036 		cpuset_rel(set);
1037 		set = nset;
1038 		break;
1039 	case CPU_LEVEL_CPUSET:
1040 		break;
1041 	case CPU_LEVEL_WHICH:
1042 		break;
1043 	}
1044 	id = set->cs_id;
1045 	cpuset_rel(set);
1046 	if (error == 0)
1047 		error = copyout(&id, uap->setid, sizeof(id));
1048 
1049 	return (error);
1050 }
1051 
1052 #ifndef _SYS_SYSPROTO_H_
1053 struct cpuset_getaffinity_args {
1054 	cpulevel_t	level;
1055 	cpuwhich_t	which;
1056 	id_t		id;
1057 	size_t		cpusetsize;
1058 	cpuset_t	*mask;
1059 };
1060 #endif
1061 int
1062 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
1063 {
1064 	struct thread *ttd;
1065 	struct cpuset *nset;
1066 	struct cpuset *set;
1067 	struct proc *p;
1068 	cpuset_t *mask;
1069 	int error;
1070 	size_t size;
1071 
1072 	if (uap->cpusetsize < sizeof(cpuset_t) ||
1073 	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
1074 		return (ERANGE);
1075 	size = uap->cpusetsize;
1076 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
1077 	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
1078 	if (error)
1079 		goto out;
1080 	switch (uap->level) {
1081 	case CPU_LEVEL_ROOT:
1082 	case CPU_LEVEL_CPUSET:
1083 		switch (uap->which) {
1084 		case CPU_WHICH_TID:
1085 		case CPU_WHICH_PID:
1086 			thread_lock(ttd);
1087 			set = cpuset_ref(ttd->td_cpuset);
1088 			thread_unlock(ttd);
1089 			break;
1090 		case CPU_WHICH_CPUSET:
1091 		case CPU_WHICH_JAIL:
1092 			break;
1093 		case CPU_WHICH_IRQ:
1094 		case CPU_WHICH_DOMAIN:
1095 			error = EINVAL;
1096 			goto out;
1097 		}
1098 		if (uap->level == CPU_LEVEL_ROOT)
1099 			nset = cpuset_refroot(set);
1100 		else
1101 			nset = cpuset_refbase(set);
1102 		CPU_COPY(&nset->cs_mask, mask);
1103 		cpuset_rel(nset);
1104 		break;
1105 	case CPU_LEVEL_WHICH:
1106 		switch (uap->which) {
1107 		case CPU_WHICH_TID:
1108 			thread_lock(ttd);
1109 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
1110 			thread_unlock(ttd);
1111 			break;
1112 		case CPU_WHICH_PID:
1113 			FOREACH_THREAD_IN_PROC(p, ttd) {
1114 				thread_lock(ttd);
1115 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
1116 				thread_unlock(ttd);
1117 			}
1118 			break;
1119 		case CPU_WHICH_CPUSET:
1120 		case CPU_WHICH_JAIL:
1121 			CPU_COPY(&set->cs_mask, mask);
1122 			break;
1123 		case CPU_WHICH_IRQ:
1124 			error = intr_getaffinity(uap->id, mask);
1125 			break;
1126 		case CPU_WHICH_DOMAIN:
1127 			if (uap->id < 0 || uap->id >= MAXMEMDOM)
1128 				error = ESRCH;
1129 			else
1130 				CPU_COPY(&cpuset_domain[uap->id], mask);
1131 			break;
1132 		}
1133 		break;
1134 	default:
1135 		error = EINVAL;
1136 		break;
1137 	}
1138 	if (set)
1139 		cpuset_rel(set);
1140 	if (p)
1141 		PROC_UNLOCK(p);
1142 	if (error == 0)
1143 		error = copyout(mask, uap->mask, size);
1144 out:
1145 	free(mask, M_TEMP);
1146 	return (error);
1147 }
1148 
1149 #ifndef _SYS_SYSPROTO_H_
1150 struct cpuset_setaffinity_args {
1151 	cpulevel_t	level;
1152 	cpuwhich_t	which;
1153 	id_t		id;
1154 	size_t		cpusetsize;
1155 	const cpuset_t	*mask;
1156 };
1157 #endif
1158 int
1159 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
1160 {
1161 	struct cpuset *nset;
1162 	struct cpuset *set;
1163 	struct thread *ttd;
1164 	struct proc *p;
1165 	cpuset_t *mask;
1166 	int error;
1167 
1168 	if (uap->cpusetsize < sizeof(cpuset_t) ||
1169 	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
1170 		return (ERANGE);
1171 	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
1172 	error = copyin(uap->mask, mask, uap->cpusetsize);
1173 	if (error)
1174 		goto out;
1175 	/*
1176 	 * Verify that no high bits are set.
1177 	 */
1178 	if (uap->cpusetsize > sizeof(cpuset_t)) {
1179 		char *end;
1180 		char *cp;
1181 
1182 		end = cp = (char *)&mask->__bits;
1183 		end += uap->cpusetsize;
1184 		cp += sizeof(cpuset_t);
1185 		while (cp != end)
1186 			if (*cp++ != 0) {
1187 				error = EINVAL;
1188 				goto out;
1189 			}
1190 
1191 	}
1192 	switch (uap->level) {
1193 	case CPU_LEVEL_ROOT:
1194 	case CPU_LEVEL_CPUSET:
1195 		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
1196 		if (error)
1197 			break;
1198 		switch (uap->which) {
1199 		case CPU_WHICH_TID:
1200 		case CPU_WHICH_PID:
1201 			thread_lock(ttd);
1202 			set = cpuset_ref(ttd->td_cpuset);
1203 			thread_unlock(ttd);
1204 			PROC_UNLOCK(p);
1205 			break;
1206 		case CPU_WHICH_CPUSET:
1207 		case CPU_WHICH_JAIL:
1208 			break;
1209 		case CPU_WHICH_IRQ:
1210 		case CPU_WHICH_DOMAIN:
1211 			error = EINVAL;
1212 			goto out;
1213 		}
1214 		if (uap->level == CPU_LEVEL_ROOT)
1215 			nset = cpuset_refroot(set);
1216 		else
1217 			nset = cpuset_refbase(set);
1218 		error = cpuset_modify(nset, mask);
1219 		cpuset_rel(nset);
1220 		cpuset_rel(set);
1221 		break;
1222 	case CPU_LEVEL_WHICH:
1223 		switch (uap->which) {
1224 		case CPU_WHICH_TID:
1225 			error = cpuset_setthread(uap->id, mask);
1226 			break;
1227 		case CPU_WHICH_PID:
1228 			error = cpuset_setproc(uap->id, NULL, mask);
1229 			break;
1230 		case CPU_WHICH_CPUSET:
1231 		case CPU_WHICH_JAIL:
1232 			error = cpuset_which(uap->which, uap->id, &p,
1233 			    &ttd, &set);
1234 			if (error == 0) {
1235 				error = cpuset_modify(set, mask);
1236 				cpuset_rel(set);
1237 			}
1238 			break;
1239 		case CPU_WHICH_IRQ:
1240 			error = intr_setaffinity(uap->id, mask);
1241 			break;
1242 		default:
1243 			error = EINVAL;
1244 			break;
1245 		}
1246 		break;
1247 	default:
1248 		error = EINVAL;
1249 		break;
1250 	}
1251 out:
1252 	free(mask, M_TEMP);
1253 	return (error);
1254 }
1255 
1256 #ifdef DDB
1257 void
1258 ddb_display_cpuset(const cpuset_t *set)
1259 {
1260 	int cpu, once;
1261 
1262 	for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
1263 		if (CPU_ISSET(cpu, set)) {
1264 			if (once == 0) {
1265 				db_printf("%d", cpu);
1266 				once = 1;
1267 			} else
1268 				db_printf(",%d", cpu);
1269 		}
1270 	}
1271 	if (once == 0)
1272 		db_printf("<none>");
1273 }
1274 
1275 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
1276 {
1277 	struct cpuset *set;
1278 
1279 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
1280 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
1281 		    set, set->cs_id, set->cs_ref, set->cs_flags,
1282 		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
1283 		db_printf("  mask=");
1284 		ddb_display_cpuset(&set->cs_mask);
1285 		db_printf("\n");
1286 		if (db_pager_quit)
1287 			break;
1288 	}
1289 }
1290 #endif /* DDB */
1291