xref: /freebsd/sys/kern/kern_cpuset.c (revision 830940567b49bb0c08dfaed40418999e76616909)
1 /*-
2  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Copyright (c) 2008 Nokia Corporation
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice unmodified, this list of conditions, and the following
13  *    disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/sysproto.h>
39 #include <sys/jail.h>
40 #include <sys/kernel.h>
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
43 #include <sys/mutex.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/refcount.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/cpuset.h>
51 #include <sys/sx.h>
52 #include <sys/queue.h>
53 #include <sys/limits.h>
54 #include <sys/bus.h>
55 #include <sys/interrupt.h>
56 
57 #include <vm/uma.h>
58 
59 #ifdef DDB
60 #include <ddb/ddb.h>
61 #endif /* DDB */
62 
63 /*
64  * cpusets provide a mechanism for creating and manipulating sets of
65  * processors for the purpose of constraining the scheduling of threads to
66  * specific processors.
67  *
68  * Each process belongs to an identified set; by default this is set 1.  Each
69  * thread may further restrict the cpus it may run on to a subset of this
70  * named set.  This creates an anonymous set which other threads and processes
71  * may not join by number.
72  *
73  * The named set is referred to herein as the 'base' set to avoid ambiguity.
74  * This set is usually a child of a 'root' set while the anonymous set may
75  * simply be referred to as a mask.  In the syscall api these are referred to
76  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
77  *
78  * Threads inherit their set from their creator whether it be anonymous or
79  * not.  This means that anonymous sets are immutable because they may be
80  * shared.  To modify an anonymous set a new set is created with the desired
81  * mask and the same parent as the existing anonymous set.  This gives the
82  * illusion of each thread having a private mask.
83  *
84  * Via the syscall apis a user may ask to retrieve or modify the root, base,
85  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
86  * modifies all numbered and anonymous child sets to comply with the new mask.
87  * Modifying a pid or tid's mask applies only to that tid but must still
88  * exist within the assigned parent set.
89  *
90  * A thread may not be assigned to a group separate from other threads in
91  * the process.  This is to remove ambiguity when the setid is queried with
92  * a pid argument.  There is no other technical limitation.
93  *
94  * This somewhat complex arrangement is intended to make it easy for
95  * applications to query available processors and bind their threads to
96  * specific processors while also allowing administrators to dynamically
97  * reprovision by changing sets which apply to groups of processes.
98  *
99  * A simple application should not concern itself with sets at all, but
100  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
101  * meaning 'curthread'.  It may query available cpus for that tid with a
102  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
103  */
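
/*
 * A minimal userspace sketch (illustrative only, not compiled as part of
 * this file): with <sys/cpuset.h> and <err.h> included, a program could
 * query the cpus available to its base set and then pin the current thread
 * to CPU 0 via the cpuset_getaffinity(2)/cpuset_setaffinity(2) wrappers:
 *
 *	cpuset_t mask;
 *
 *	CPU_ZERO(&mask);
 *	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_getaffinity");
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_setaffinity");
 */
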
104 static uma_zone_t cpuset_zone;
105 static struct mtx cpuset_lock;
106 static struct setlist cpuset_ids;
107 static struct unrhdr *cpuset_unr;
108 static struct cpuset *cpuset_zero;
109 
110 cpuset_t *cpuset_root;
111 
112 /*
113  * Acquire a reference to a cpuset; all pointers must be tracked with refs.
114  */
115 struct cpuset *
116 cpuset_ref(struct cpuset *set)
117 {
118 
119 	refcount_acquire(&set->cs_ref);
120 	return (set);
121 }
122 
123 /*
124  * Walks up the tree from 'set' to find the root.  Returns the root
125  * referenced.
126  */
127 static struct cpuset *
128 cpuset_refroot(struct cpuset *set)
129 {
130 
131 	for (; set->cs_parent != NULL; set = set->cs_parent)
132 		if (set->cs_flags & CPU_SET_ROOT)
133 			break;
134 	cpuset_ref(set);
135 
136 	return (set);
137 }
138 
139 /*
140  * Find the first non-anonymous set starting from 'set'.  Returns this set
141  * referenced.  May return the passed in set with an extra ref if it is
142  * not anonymous.
143  */
144 static struct cpuset *
145 cpuset_refbase(struct cpuset *set)
146 {
147 
148 	if (set->cs_id == CPUSET_INVALID)
149 		set = set->cs_parent;
150 	cpuset_ref(set);
151 
152 	return (set);
153 }
154 
155 /*
156  * Release a reference in a context where it is safe to allocate.
157  */
158 void
159 cpuset_rel(struct cpuset *set)
160 {
161 	cpusetid_t id;
162 
163 	if (refcount_release(&set->cs_ref) == 0)
164 		return;
165 	mtx_lock_spin(&cpuset_lock);
166 	LIST_REMOVE(set, cs_siblings);
167 	id = set->cs_id;
168 	if (id != CPUSET_INVALID)
169 		LIST_REMOVE(set, cs_link);
170 	mtx_unlock_spin(&cpuset_lock);
171 	cpuset_rel(set->cs_parent);
172 	uma_zfree(cpuset_zone, set);
173 	if (id != CPUSET_INVALID)
174 		free_unr(cpuset_unr, id);
175 }
176 
177 /*
178  * Deferred release must be used when in a context that is not safe to
179  * allocate/free.  This places any unreferenced sets on the list 'head'.
180  */
181 static void
182 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
183 {
184 
185 	if (refcount_release(&set->cs_ref) == 0)
186 		return;
187 	mtx_lock_spin(&cpuset_lock);
188 	LIST_REMOVE(set, cs_siblings);
189 	if (set->cs_id != CPUSET_INVALID)
190 		LIST_REMOVE(set, cs_link);
191 	LIST_INSERT_HEAD(head, set, cs_link);
192 	mtx_unlock_spin(&cpuset_lock);
193 }
194 
195 /*
196  * Complete a deferred release.  Removes the set from the list provided to
197  * cpuset_rel_defer.
198  */
199 static void
200 cpuset_rel_complete(struct cpuset *set)
201 {
202 	LIST_REMOVE(set, cs_link);
203 	cpuset_rel(set->cs_parent);
204 	uma_zfree(cpuset_zone, set);
205 }
206 
207 /*
208  * Find a set based on an id.  Returns it with a ref.
209  */
210 static struct cpuset *
211 cpuset_lookup(cpusetid_t setid, struct thread *td)
212 {
213 	struct cpuset *set;
214 
215 	if (setid == CPUSET_INVALID)
216 		return (NULL);
217 	mtx_lock_spin(&cpuset_lock);
218 	LIST_FOREACH(set, &cpuset_ids, cs_link)
219 		if (set->cs_id == setid)
220 			break;
221 	if (set)
222 		cpuset_ref(set);
223 	mtx_unlock_spin(&cpuset_lock);
224 
225 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
226 	if (set != NULL && jailed(td->td_ucred)) {
227 		struct cpuset *jset, *tset;
228 
229 		jset = td->td_ucred->cr_prison->pr_cpuset;
230 		for (tset = set; tset != NULL; tset = tset->cs_parent)
231 			if (tset == jset)
232 				break;
233 		if (tset == NULL) {
234 			cpuset_rel(set);
235 			set = NULL;
236 		}
237 	}
238 
239 	return (set);
240 }
241 
242 /*
243  * Create a set in the space provided in 'set' with the provided parameters.
244  * The set is returned with a single ref.  May return EDEADLK if the set
245  * will have no valid cpu based on restrictions from the parent.
246  */
247 static int
248 _cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
249     cpusetid_t id)
250 {
251 
252 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
253 		return (EDEADLK);
254 	CPU_COPY(mask, &set->cs_mask);
255 	LIST_INIT(&set->cs_children);
256 	refcount_init(&set->cs_ref, 1);
257 	set->cs_flags = 0;
258 	mtx_lock_spin(&cpuset_lock);
259 	CPU_AND(mask, &parent->cs_mask);
260 	set->cs_id = id;
261 	set->cs_parent = cpuset_ref(parent);
262 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
263 	if (set->cs_id != CPUSET_INVALID)
264 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
265 	mtx_unlock_spin(&cpuset_lock);
266 
267 	return (0);
268 }
269 
270 /*
271  * Create a new non-anonymous set with the requested parent and mask.  May
272  * return failures if the mask is invalid or a new number can not be
273  * allocated.
274  */
275 static int
276 cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
277 {
278 	struct cpuset *set;
279 	cpusetid_t id;
280 	int error;
281 
282 	id = alloc_unr(cpuset_unr);
283 	if (id == -1)
284 		return (ENFILE);
285 	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
286 	error = _cpuset_create(set, parent, mask, id);
287 	if (error == 0)
288 		return (0);
289 	free_unr(cpuset_unr, id);
290 	uma_zfree(cpuset_zone, set);
291 
292 	return (error);
293 }
294 
295 /*
296  * Recursively check for errors that would occur from applying mask to
297  * the tree of sets starting at 'set'.  Checks for sets that would become
298  * empty as well as RDONLY flags.
299  */
300 static int
301 cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
302 {
303 	struct cpuset *nset;
304 	cpuset_t newmask;
305 	int error;
306 
307 	mtx_assert(&cpuset_lock, MA_OWNED);
308 	if (set->cs_flags & CPU_SET_RDONLY)
309 		return (EPERM);
310 	if (!CPU_OVERLAP(&set->cs_mask, mask))
311 		return (EDEADLK);
312 	CPU_COPY(&set->cs_mask, &newmask);
313 	CPU_AND(&newmask, mask);
314 	error = 0;
315 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
316 		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
317 			break;
318 	return (error);
319 }
320 
321 /*
322  * Applies the mask 'mask' without checking for empty sets or permissions.
323  */
324 static void
325 cpuset_update(struct cpuset *set, cpuset_t *mask)
326 {
327 	struct cpuset *nset;
328 
329 	mtx_assert(&cpuset_lock, MA_OWNED);
330 	CPU_AND(&set->cs_mask, mask);
331 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
332 		cpuset_update(nset, &set->cs_mask);
333 
334 	return;
335 }
336 
337 /*
338  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
339  * mask to restrict all children in the tree.  Checks for validity before
340  * applying the changes.
341  */
342 static int
343 cpuset_modify(struct cpuset *set, cpuset_t *mask)
344 {
345 	struct cpuset *root;
346 	int error;
347 
348 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
349 	if (error)
350 		return (error);
351 	/*
352 	 * In case we are called from within the jail,
353 	 * we do not allow modifying the dedicated root
354 	 * cpuset of the jail but may still allow
355 	 * changing child sets.
356 	 */
357 	if (jailed(curthread->td_ucred) &&
358 	    set->cs_flags & CPU_SET_ROOT)
359 		return (EPERM);
360 	/*
361 	 * Verify that we have access to this set of
362 	 * cpus.
363 	 */
364 	root = set->cs_parent;
365 	if (root && !CPU_SUBSET(&root->cs_mask, mask))
366 		return (EINVAL);
367 	mtx_lock_spin(&cpuset_lock);
368 	error = cpuset_testupdate(set, mask);
369 	if (error)
370 		goto out;
371 	cpuset_update(set, mask);
372 	CPU_COPY(mask, &set->cs_mask);
373 out:
374 	mtx_unlock_spin(&cpuset_lock);
375 
376 	return (error);
377 }
378 
379 /*
380  * Resolve the 'which' parameter of several cpuset apis.
381  *
382  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
383  * checks for permission via p_cansched().
384  *
385  * For WHICH_SET returns a valid set with a new reference.
386  *
387  * -1 may be supplied for any argument to mean the current proc/thread or
388  * the base set of the current thread.  May fail with ESRCH/EPERM.
389  */
390 static int
391 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
392     struct cpuset **setp)
393 {
394 	struct cpuset *set;
395 	struct thread *td;
396 	struct proc *p;
397 	int error;
398 
399 	*pp = p = NULL;
400 	*tdp = td = NULL;
401 	*setp = set = NULL;
402 	switch (which) {
403 	case CPU_WHICH_PID:
404 		if (id == -1) {
405 			PROC_LOCK(curproc);
406 			p = curproc;
407 			break;
408 		}
409 		if ((p = pfind(id)) == NULL)
410 			return (ESRCH);
411 		break;
412 	case CPU_WHICH_TID:
413 		if (id == -1) {
414 			PROC_LOCK(curproc);
415 			p = curproc;
416 			td = curthread;
417 			break;
418 		}
419 		sx_slock(&allproc_lock);
420 		FOREACH_PROC_IN_SYSTEM(p) {
421 			PROC_LOCK(p);
422 			FOREACH_THREAD_IN_PROC(p, td)
423 				if (td->td_tid == id)
424 					break;
425 			if (td != NULL)
426 				break;
427 			PROC_UNLOCK(p);
428 		}
429 		sx_sunlock(&allproc_lock);
430 		if (td == NULL)
431 			return (ESRCH);
432 		break;
433 	case CPU_WHICH_CPUSET:
434 		if (id == -1) {
435 			thread_lock(curthread);
436 			set = cpuset_refbase(curthread->td_cpuset);
437 			thread_unlock(curthread);
438 		} else
439 			set = cpuset_lookup(id, curthread);
440 		if (set) {
441 			*setp = set;
442 			return (0);
443 		}
444 		return (ESRCH);
445 	case CPU_WHICH_JAIL:
446 	{
447 		/* Find `set' for prison with given id. */
448 		struct prison *pr;
449 
450 		sx_slock(&allprison_lock);
451 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
452 		sx_sunlock(&allprison_lock);
453 		if (pr == NULL)
454 			return (ESRCH);
455 		cpuset_ref(pr->pr_cpuset);
456 		*setp = pr->pr_cpuset;
457 		mtx_unlock(&pr->pr_mtx);
458 		return (0);
459 	}
460 	case CPU_WHICH_IRQ:
461 		return (0);
462 	default:
463 		return (EINVAL);
464 	}
465 	error = p_cansched(curthread, p);
466 	if (error) {
467 		PROC_UNLOCK(p);
468 		return (error);
469 	}
470 	if (td == NULL)
471 		td = FIRST_THREAD_IN_PROC(p);
472 	*pp = p;
473 	*tdp = td;
474 	return (0);
475 }
476 
477 /*
478  * Create an anonymous set with the provided mask in the space provided by
479  * 'fset'.  If the passed in set is anonymous we use its parent; otherwise
480  * the new set is a child of 'set'.
481  */
482 static int
483 cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
484 {
485 	struct cpuset *parent;
486 
487 	if (set->cs_id == CPUSET_INVALID)
488 		parent = set->cs_parent;
489 	else
490 		parent = set;
491 	if (!CPU_SUBSET(&parent->cs_mask, mask))
492 		return (EDEADLK);
493 	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
494 }
495 
496 /*
497  * Handle two cases for replacing the base set or mask of an entire process.
498  *
499  * 1) Set is non-null and mask is null.  This reparents all anonymous sets
500  *    to the provided set and replaces all non-anonymous td_cpusets with the
501  *    provided set.
502  * 2) Mask is non-null and set is null.  This replaces or creates anonymous
503  *    sets for every thread with the existing base as a parent.
504  *
505  * This is overly complicated because we can't allocate while holding a
506  * spinlock and spinlocks must be held while changing and examining thread
507  * state.
508  */
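/*
 * Illustrative call shapes for the two cases above, matching the callers
 * in this file (cpuset(2) and cpuset_setid() use case 1, the CPU_WHICH_PID
 * branch of cpuset_setaffinity() uses case 2):
 *
 *	error = cpuset_setproc(pid, set, NULL);		case 1: new base set
 *	error = cpuset_setproc(pid, NULL, mask);	case 2: new thread masks
 */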
509 static int
510 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
511 {
512 	struct setlist freelist;
513 	struct setlist droplist;
514 	struct cpuset *tdset;
515 	struct cpuset *nset;
516 	struct thread *td;
517 	struct proc *p;
518 	int threads;
519 	int nfree;
520 	int error;
521 	/*
522 	 * The algorithm requires two passes due to locking considerations.
523 	 *
524 	 * 1) Lookup the process and acquire the locks in the required order.
525 	 * 2) If enough cpusets have not been allocated release the locks and
526 	 *    allocate them.  Loop.
527 	 */
528 	LIST_INIT(&freelist);
529 	LIST_INIT(&droplist);
530 	nfree = 0;
531 	for (;;) {
532 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
533 		if (error)
534 			goto out;
535 		if (nfree >= p->p_numthreads)
536 			break;
537 		threads = p->p_numthreads;
538 		PROC_UNLOCK(p);
539 		for (; nfree < threads; nfree++) {
540 			nset = uma_zalloc(cpuset_zone, M_WAITOK);
541 			LIST_INSERT_HEAD(&freelist, nset, cs_link);
542 		}
543 	}
544 	PROC_LOCK_ASSERT(p, MA_OWNED);
545 	/*
546 	 * Now that the appropriate locks are held and we have enough cpusets,
547 	 * make sure the operation will succeed before applying changes.  The
548 	 * proc lock prevents td_cpuset from changing between calls.
549 	 */
550 	error = 0;
551 	FOREACH_THREAD_IN_PROC(p, td) {
552 		thread_lock(td);
553 		tdset = td->td_cpuset;
554 		/*
555 		 * Verify that a new mask doesn't specify cpus outside of
556 		 * the set the thread is a member of.
557 		 */
558 		if (mask) {
559 			if (tdset->cs_id == CPUSET_INVALID)
560 				tdset = tdset->cs_parent;
561 			if (!CPU_SUBSET(&tdset->cs_mask, mask))
562 				error = EDEADLK;
563 		/*
564 		 * Verify that a new set won't leave an existing thread
565 		 * mask without a cpu to run on.  It can, however, restrict
566 		 * the set.
567 		 */
568 		} else if (tdset->cs_id == CPUSET_INVALID) {
569 			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
570 				error = EDEADLK;
571 		}
572 		thread_unlock(td);
573 		if (error)
574 			goto unlock_out;
575 	}
576 	/*
577 	 * Replace each thread's cpuset while using deferred release.  We
578 	 * must do this because the thread lock must be held while operating
579 	 * on the thread and this limits the type of operations allowed.
580 	 */
581 	FOREACH_THREAD_IN_PROC(p, td) {
582 		thread_lock(td);
583 		/*
584 		 * If we presently have an anonymous set or are applying a
585 		 * mask we must create an anonymous shadow set.  That is
586 		 * either parented to our existing base or the supplied set.
587 		 *
588 		 * If we have a base set with no anonymous shadow we simply
589 		 * replace it outright.
590 		 */
591 		tdset = td->td_cpuset;
592 		if (tdset->cs_id == CPUSET_INVALID || mask) {
593 			nset = LIST_FIRST(&freelist);
594 			LIST_REMOVE(nset, cs_link);
595 			if (mask)
596 				error = cpuset_shadow(tdset, nset, mask);
597 			else
598 				error = _cpuset_create(nset, set,
599 				    &tdset->cs_mask, CPUSET_INVALID);
600 			if (error) {
601 				LIST_INSERT_HEAD(&freelist, nset, cs_link);
602 				thread_unlock(td);
603 				break;
604 			}
605 		} else
606 			nset = cpuset_ref(set);
607 		cpuset_rel_defer(&droplist, tdset);
608 		td->td_cpuset = nset;
609 		sched_affinity(td);
610 		thread_unlock(td);
611 	}
612 unlock_out:
613 	PROC_UNLOCK(p);
614 out:
615 	while ((nset = LIST_FIRST(&droplist)) != NULL)
616 		cpuset_rel_complete(nset);
617 	while ((nset = LIST_FIRST(&freelist)) != NULL) {
618 		LIST_REMOVE(nset, cs_link);
619 		uma_zfree(cpuset_zone, nset);
620 	}
621 	return (error);
622 }
623 
624 /*
625  * Apply an anonymous mask to a single thread.
626  */
627 int
628 cpuset_setthread(lwpid_t id, cpuset_t *mask)
629 {
630 	struct cpuset *nset;
631 	struct cpuset *set;
632 	struct thread *td;
633 	struct proc *p;
634 	int error;
635 
636 	nset = uma_zalloc(cpuset_zone, M_WAITOK);
637 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
638 	if (error)
639 		goto out;
640 	set = NULL;
641 	thread_lock(td);
642 	error = cpuset_shadow(td->td_cpuset, nset, mask);
643 	if (error == 0) {
644 		set = td->td_cpuset;
645 		td->td_cpuset = nset;
646 		sched_affinity(td);
647 		nset = NULL;
648 	}
649 	thread_unlock(td);
650 	PROC_UNLOCK(p);
651 	if (set)
652 		cpuset_rel(set);
653 out:
654 	if (nset)
655 		uma_zfree(cpuset_zone, nset);
656 	return (error);
657 }
658 
659 /*
660  * Creates the cpuset for thread0.  We make two sets:
661  *
662  * 0 - The root set which should represent all valid processors in the
663  *     system.  It is initially created with a mask of all processors
664  *     because we don't know what processors are valid until cpuset_init()
665  *     runs.  This set is immutable.
666  * 1 - The default set which all processes are a member of until changed.
667  *     This allows an administrator to move all threads off of given cpus to
668  *     dedicate them to high priority tasks or save power etc.
669  */
670 struct cpuset *
671 cpuset_thread0(void)
672 {
673 	struct cpuset *set;
674 	int error;
675 
676 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
677 	    NULL, NULL, UMA_ALIGN_PTR, 0);
678 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
679 	/*
680 	 * Create the root system set for the whole machine.  Doesn't use
681 	 * cpuset_create() due to NULL parent.
682 	 */
683 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
684 	set->cs_mask.__bits[0] = -1;
685 	LIST_INIT(&set->cs_children);
686 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
687 	set->cs_ref = 1;
688 	set->cs_flags = CPU_SET_ROOT;
689 	cpuset_zero = set;
690 	cpuset_root = &set->cs_mask;
691 	/*
692 	 * Now derive a default, modifiable set from that to give out.
693 	 */
694 	set = uma_zalloc(cpuset_zone, M_WAITOK);
695 	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
696 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
697 	/*
698 	 * Initialize the unit allocator. 0 and 1 are allocated above.
699 	 */
700 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
701 
702 	return (set);
703 }
704 
705 /*
706  * Create a cpuset as cpuset_create() would, but
707  * mark the new 'set' as root.
708  *
709  * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
710  * for that.
711  *
712  * In case of no error, returns the set in *setp locked with a reference.
713  */
714 int
715 cpuset_create_root(struct prison *pr, struct cpuset **setp)
716 {
717 	struct cpuset *set;
718 	int error;
719 
720 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
721 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
722 
723 	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
724 	if (error)
725 		return (error);
726 
727 	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
728 	    __func__, __LINE__));
729 
730 	/* Mark the set as root. */
731 	set = *setp;
732 	set->cs_flags |= CPU_SET_ROOT;
733 
734 	return (0);
735 }
736 
737 int
738 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
739 {
740 	int error;
741 
742 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
743 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
744 
745 	cpuset_ref(set);
746 	error = cpuset_setproc(p->p_pid, set, NULL);
747 	if (error)
748 		return (error);
749 	cpuset_rel(set);
750 	return (0);
751 }
752 
753 /*
754  * This is called once the final set of system cpus is known.  Modifies
755  * the root set and all children and marks the root read-only.
756  */
757 static void
758 cpuset_init(void *arg)
759 {
760 	cpuset_t mask;
761 
762 	CPU_ZERO(&mask);
763 #ifdef SMP
764 	mask.__bits[0] = all_cpus;
765 #else
766 	mask.__bits[0] = 1;
767 #endif
768 	if (cpuset_modify(cpuset_zero, &mask))
769 		panic("Can't set initial cpuset mask.\n");
770 	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
771 }
772 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
773 
774 #ifndef _SYS_SYSPROTO_H_
775 struct cpuset_args {
776 	cpusetid_t	*setid;
777 };
778 #endif
779 int
780 cpuset(struct thread *td, struct cpuset_args *uap)
781 {
782 	struct cpuset *root;
783 	struct cpuset *set;
784 	int error;
785 
786 	thread_lock(td);
787 	root = cpuset_refroot(td->td_cpuset);
788 	thread_unlock(td);
789 	error = cpuset_create(&set, root, &root->cs_mask);
790 	cpuset_rel(root);
791 	if (error)
792 		return (error);
793 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
794 	if (error == 0)
795 		error = cpuset_setproc(-1, set, NULL);
796 	cpuset_rel(set);
797 	return (error);
798 }
799 
800 #ifndef _SYS_SYSPROTO_H_
801 struct cpuset_setid_args {
802 	cpuwhich_t	which;
803 	id_t		id;
804 	cpusetid_t	setid;
805 };
806 #endif
807 int
808 cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
809 {
810 	struct cpuset *set;
811 	int error;
812 
813 	/*
814 	 * Presently we only support per-process sets.
815 	 */
816 	if (uap->which != CPU_WHICH_PID)
817 		return (EINVAL);
818 	set = cpuset_lookup(uap->setid, td);
819 	if (set == NULL)
820 		return (ESRCH);
821 	error = cpuset_setproc(uap->id, set, NULL);
822 	cpuset_rel(set);
823 	return (error);
824 }
825 
826 #ifndef _SYS_SYSPROTO_H_
827 struct cpuset_getid_args {
828 	cpulevel_t	level;
829 	cpuwhich_t	which;
830 	id_t		id;
831 	cpusetid_t	*setid;
};
832 #endif
833 int
834 cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
835 {
836 	struct cpuset *nset;
837 	struct cpuset *set;
838 	struct thread *ttd;
839 	struct proc *p;
840 	cpusetid_t id;
841 	int error;
842 
843 	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
844 		return (EINVAL);
845 	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
846 	if (error)
847 		return (error);
848 	switch (uap->which) {
849 	case CPU_WHICH_TID:
850 	case CPU_WHICH_PID:
851 		thread_lock(ttd);
852 		set = cpuset_refbase(ttd->td_cpuset);
853 		thread_unlock(ttd);
854 		PROC_UNLOCK(p);
855 		break;
856 	case CPU_WHICH_CPUSET:
857 	case CPU_WHICH_JAIL:
858 		break;
859 	case CPU_WHICH_IRQ:
860 		return (EINVAL);
861 	}
862 	switch (uap->level) {
863 	case CPU_LEVEL_ROOT:
864 		nset = cpuset_refroot(set);
865 		cpuset_rel(set);
866 		set = nset;
867 		break;
868 	case CPU_LEVEL_CPUSET:
869 		break;
870 	case CPU_LEVEL_WHICH:
871 		break;
872 	}
873 	id = set->cs_id;
874 	cpuset_rel(set);
875 	if (error == 0)
876 		error = copyout(&id, uap->setid, sizeof(id));
877 
878 	return (error);
879 }
880 
881 #ifndef _SYS_SYSPROTO_H_
882 struct cpuset_getaffinity_args {
883 	cpulevel_t	level;
884 	cpuwhich_t	which;
885 	id_t		id;
886 	size_t		cpusetsize;
887 	cpuset_t	*mask;
888 };
889 #endif
890 int
891 cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
892 {
893 	struct thread *ttd;
894 	struct cpuset *nset;
895 	struct cpuset *set;
896 	struct proc *p;
897 	cpuset_t *mask;
898 	int error;
899 	size_t size;
900 
901 	if (uap->cpusetsize < sizeof(cpuset_t) ||
902 	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
903 		return (ERANGE);
904 	size = uap->cpusetsize;
905 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
906 	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
907 	if (error)
908 		goto out;
909 	switch (uap->level) {
910 	case CPU_LEVEL_ROOT:
911 	case CPU_LEVEL_CPUSET:
912 		switch (uap->which) {
913 		case CPU_WHICH_TID:
914 		case CPU_WHICH_PID:
915 			thread_lock(ttd);
916 			set = cpuset_ref(ttd->td_cpuset);
917 			thread_unlock(ttd);
918 			break;
919 		case CPU_WHICH_CPUSET:
920 		case CPU_WHICH_JAIL:
921 			break;
922 		case CPU_WHICH_IRQ:
923 			error = EINVAL;
924 			goto out;
925 		}
926 		if (uap->level == CPU_LEVEL_ROOT)
927 			nset = cpuset_refroot(set);
928 		else
929 			nset = cpuset_refbase(set);
930 		CPU_COPY(&nset->cs_mask, mask);
931 		cpuset_rel(nset);
932 		break;
933 	case CPU_LEVEL_WHICH:
934 		switch (uap->which) {
935 		case CPU_WHICH_TID:
936 			thread_lock(ttd);
937 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
938 			thread_unlock(ttd);
939 			break;
940 		case CPU_WHICH_PID:
941 			FOREACH_THREAD_IN_PROC(p, ttd) {
942 				thread_lock(ttd);
943 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
944 				thread_unlock(ttd);
945 			}
946 			break;
947 		case CPU_WHICH_CPUSET:
948 		case CPU_WHICH_JAIL:
949 			CPU_COPY(&set->cs_mask, mask);
950 			break;
951 		case CPU_WHICH_IRQ:
952 			error = intr_getaffinity(uap->id, mask);
953 			break;
954 		}
955 		break;
956 	default:
957 		error = EINVAL;
958 		break;
959 	}
960 	if (set)
961 		cpuset_rel(set);
962 	if (p)
963 		PROC_UNLOCK(p);
964 	if (error == 0)
965 		error = copyout(mask, uap->mask, size);
966 out:
967 	free(mask, M_TEMP);
968 	return (error);
969 }
970 
971 #ifndef _SYS_SYSPROTO_H_
972 struct cpuset_setaffinity_args {
973 	cpulevel_t	level;
974 	cpuwhich_t	which;
975 	id_t		id;
976 	size_t		cpusetsize;
977 	const cpuset_t	*mask;
978 };
979 #endif
980 int
981 cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
982 {
983 	struct cpuset *nset;
984 	struct cpuset *set;
985 	struct thread *ttd;
986 	struct proc *p;
987 	cpuset_t *mask;
988 	int error;
989 
990 	if (uap->cpusetsize < sizeof(cpuset_t) ||
991 	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
992 		return (ERANGE);
993 	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
994 	error = copyin(uap->mask, mask, uap->cpusetsize);
995 	if (error)
996 		goto out;
997 	/*
998 	 * Verify that no high bits are set.
999 	 */
1000 	if (uap->cpusetsize > sizeof(cpuset_t)) {
1001 		char *end;
1002 		char *cp;
1003 
1004 		end = cp = (char *)&mask->__bits;
1005 		end += uap->cpusetsize;
1006 		cp += sizeof(cpuset_t);
1007 		while (cp != end)
1008 			if (*cp++ != 0) {
1009 				error = EINVAL;
1010 				goto out;
1011 			}
1012 
1013 	}
1014 	switch (uap->level) {
1015 	case CPU_LEVEL_ROOT:
1016 	case CPU_LEVEL_CPUSET:
1017 		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
1018 		if (error)
1019 			break;
1020 		switch (uap->which) {
1021 		case CPU_WHICH_TID:
1022 		case CPU_WHICH_PID:
1023 			thread_lock(ttd);
1024 			set = cpuset_ref(ttd->td_cpuset);
1025 			thread_unlock(ttd);
1026 			PROC_UNLOCK(p);
1027 			break;
1028 		case CPU_WHICH_CPUSET:
1029 		case CPU_WHICH_JAIL:
1030 			break;
1031 		case CPU_WHICH_IRQ:
1032 			error = EINVAL;
1033 			goto out;
1034 		}
1035 		if (uap->level == CPU_LEVEL_ROOT)
1036 			nset = cpuset_refroot(set);
1037 		else
1038 			nset = cpuset_refbase(set);
1039 		error = cpuset_modify(nset, mask);
1040 		cpuset_rel(nset);
1041 		cpuset_rel(set);
1042 		break;
1043 	case CPU_LEVEL_WHICH:
1044 		switch (uap->which) {
1045 		case CPU_WHICH_TID:
1046 			error = cpuset_setthread(uap->id, mask);
1047 			break;
1048 		case CPU_WHICH_PID:
1049 			error = cpuset_setproc(uap->id, NULL, mask);
1050 			break;
1051 		case CPU_WHICH_CPUSET:
1052 		case CPU_WHICH_JAIL:
1053 			error = cpuset_which(uap->which, uap->id, &p,
1054 			    &ttd, &set);
1055 			if (error == 0) {
1056 				error = cpuset_modify(set, mask);
1057 				cpuset_rel(set);
1058 			}
1059 			break;
1060 		case CPU_WHICH_IRQ:
1061 			error = intr_setaffinity(uap->id, mask);
1062 			break;
1063 		default:
1064 			error = EINVAL;
1065 			break;
1066 		}
1067 		break;
1068 	default:
1069 		error = EINVAL;
1070 		break;
1071 	}
1072 out:
1073 	free(mask, M_TEMP);
1074 	return (error);
1075 }
1076 
1077 #ifdef DDB
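/*
 * DB_SHOW_COMMAND() below registers a "show cpusets" command with ddb(4).
 * From the debugger prompt it walks cpuset_ids and prints each set's id,
 * reference count, flags, parent id and cpu mask, e.g. (values are
 * placeholders):
 *
 *	db> show cpusets
 *	set=0x... id=1      ref=...    flags=0x0000 parent id=0
 *	  mask=0,1
 */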
1078 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
1079 {
1080 	struct cpuset *set;
1081 	int cpu, once;
1082 
1083 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
1084 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
1085 		    set, set->cs_id, set->cs_ref, set->cs_flags,
1086 		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
1087 		db_printf("  mask=");
1088 		for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
1089 			if (CPU_ISSET(cpu, &set->cs_mask)) {
1090 				if (once == 0) {
1091 					db_printf("%d", cpu);
1092 					once = 1;
1093 				} else
1094 					db_printf(",%d", cpu);
1095 			}
1096 		}
1097 		db_printf("\n");
1098 		if (db_pager_quit)
1099 			break;
1100 	}
1101 }
1102 #endif /* DDB */
1103