xref: /freebsd/sys/kern/kern_cpuset.c (revision 02f27f1cfa619cdf9509c65366f55f7c8803de5c)
1 /*-
2  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice unmodified, this list of conditions, and the following
10  *    disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysproto.h>
34 #include <sys/kernel.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/refcount.h>
41 #include <sys/sched.h>
42 #include <sys/smp.h>
43 #include <sys/syscallsubr.h>
44 #include <sys/cpuset.h>
45 #include <sys/sx.h>
46 #include <sys/refcount.h>
47 #include <sys/queue.h>
48 #include <sys/limits.h>
49 
50 #include <vm/uma.h>
51 
52 /*
53  * cpusets provide a mechanism for creating and manipulating sets of
54  * processors for the purpose of constraining the scheduling of threads to
55  * specific processors.
56  *
57  * Each process belongs to an identified set; by default this is set 1.  Each
58  * thread may further restrict the cpus it may run on to a subset of this
59  * named set.  This creates an anonymous set which other threads and processes
60  * may not join by number.
61  *
62  * The named set is referred to herein as the 'base' set to avoid ambiguity.
63  * This set is usually a child of a 'root' set while the anonymous set may
64  * simply be referred to as a mask.  In the syscall api these are referred to
65  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
66  *
67  * Threads inherit their set from their creator whether it be anonymous or
68  * not.  This means that anonymous sets are immutable because they may be
69  * shared.  To modify an anonymous set a new set is created with the desired
70  * mask and the same parent as the existing anonymous set.  This gives the
71  * illusion of each thread having a private mask.
72  *
73  * Via the syscall apis a user may ask to retrieve or modify the root, base,
74  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
75  * modifies all numbered and anonymous child sets to comply with the new mask.
76  * Modifying a pid or tid's mask applies only to that tid, but the new mask
77  * must still fall within the assigned parent set.
78  *
79  * A thread may not be assigned to a group separate from other threads in
80  * the process.  This is to remove ambiguity when the setid is queried with
81  * a pid argument.  There is no other technical limitation.
82  *
83  * This somewhat complex arrangement is intended to make it easy for
84  * applications to query available processors and bind their threads to
85  * specific processors while also allowing administrators to dynamically
86  * reprovision by changing sets which apply to groups of processes.
87  *
88  * A simple application should not concern itself with sets at all and
89  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
90  * meaning 'curthread'.  It may query available cpus for that tid with a
91  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
92  */
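/*
 * A minimal userland sketch of the simple usage described above.  It assumes
 * the cpuset_getaffinity(2)/cpuset_setaffinity(2) wrappers and the CPU_*()
 * macros declared in <sys/cpuset.h>; the pin_to_first_cpu() helper name is
 * purely illustrative:
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *	#include <err.h>
 *
 *	static void
 *	pin_to_first_cpu(void)
 *	{
 *		cpuset_t mask;
 *		int cpu;
 *
 *		// Ask which cpus the current base set allows.
 *		CPU_ZERO(&mask);
 *		if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *		    sizeof(mask), &mask) != 0)
 *			err(1, "cpuset_getaffinity");
 *		// Pick the lowest numbered cpu in that set; at least one bit
 *		// is guaranteed to be set when the call above succeeds.
 *		for (cpu = 0; cpu < (int)(sizeof(mask) * NBBY); cpu++)
 *			if (CPU_ISSET(cpu, &mask))
 *				break;
 *		// Restrict only the calling thread (an anonymous mask) to it.
 *		CPU_ZERO(&mask);
 *		CPU_SET(cpu, &mask);
 *		if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *		    sizeof(mask), &mask) != 0)
 *			err(1, "cpuset_setaffinity");
 *	}
 */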
93 static uma_zone_t cpuset_zone;
94 static struct mtx cpuset_lock;
95 static struct setlist cpuset_ids;
96 struct cpuset *cpuset_zero;
97 static struct unrhdr *cpuset_unr;
98 
99 /*
100  * Acquire a reference to a cpuset; all pointers must be tracked with refs.
101  */
102 struct cpuset *
103 cpuset_ref(struct cpuset *set)
104 {
105 
106 	refcount_acquire(&set->cs_ref);
107 	return (set);
108 }
109 
110 /*
111  * Release a reference in a context where it is safe to allocate.
112  */
113 void
114 cpuset_rel(struct cpuset *set)
115 {
116 	cpusetid_t id;
117 
118 	if (refcount_release(&set->cs_ref) == 0)
119 		return;
120 	mtx_lock_spin(&cpuset_lock);
121 	LIST_REMOVE(set, cs_siblings);
122 	id = set->cs_id;
123 	if (id != CPUSET_INVALID)
124 		LIST_REMOVE(set, cs_link);
125 	mtx_unlock_spin(&cpuset_lock);
126 	cpuset_rel(set->cs_parent);
127 	uma_zfree(cpuset_zone, set);
128 	if (id != CPUSET_INVALID)
129 		free_unr(cpuset_unr, id);
130 }
131 
132 /*
133  * Deferred release must be used when in a context that is not safe to
134  * allocate/free.  This places any unreferenced sets on the list 'head'.
135  */
136 static void
137 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
138 {
139 
140 	if (refcount_release(&set->cs_ref) == 0)
141 		return;
142 	mtx_lock_spin(&cpuset_lock);
143 	LIST_REMOVE(set, cs_siblings);
144 	if (set->cs_id != CPUSET_INVALID)
145 		LIST_REMOVE(set, cs_link);
146 	LIST_INSERT_HEAD(head, set, cs_link);
147 	mtx_unlock_spin(&cpuset_lock);
148 }
149 
150 /*
151  * Complete a deferred release.  Removes the set from the list provided to
152  * cpuset_rel_defer.
153  */
154 static void
155 cpuset_rel_complete(struct cpuset *set)
156 {
157 	LIST_REMOVE(set, cs_link);
158 	cpuset_rel(set->cs_parent);
159 	uma_zfree(cpuset_zone, set);
160 }
161 
162 /*
163  * Find a set based on an id.  Returns it with a ref.
164  */
165 static struct cpuset *
166 cpuset_lookup(cpusetid_t setid)
167 {
168 	struct cpuset *set;
169 
170 	if (setid == CPUSET_INVALID)
171 		return (NULL);
172 	mtx_lock_spin(&cpuset_lock);
173 	LIST_FOREACH(set, &cpuset_ids, cs_link)
174 		if (set->cs_id == setid)
175 			break;
176 	if (set)
177 		cpuset_ref(set);
178 	mtx_unlock_spin(&cpuset_lock);
179 	return (set);
180 }
181 
182 /*
183  * Create a set in the space provided in 'set' with the provided parameters.
184  * The set is returned with a single ref.  May return EDEADLK if the set
185  * will have no valid cpu based on restrictions from the parent.
186  */
187 static int
188 _cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
189     cpusetid_t id)
190 {
191 
192 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
193 		return (EDEADLK);
194 	CPU_COPY(mask, &set->cs_mask);
195 	LIST_INIT(&set->cs_children);
196 	refcount_init(&set->cs_ref, 1);
197 	set->cs_flags = 0;
198 	mtx_lock_spin(&cpuset_lock);
199 	CPU_AND(mask, &parent->cs_mask);
200 	set->cs_id = id;
201 	set->cs_parent = cpuset_ref(parent);
202 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
203 	if (set->cs_id != CPUSET_INVALID)
204 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
205 	mtx_unlock_spin(&cpuset_lock);
206 
207 	return (0);
208 }
209 
210 /*
211  * Create a new non-anonymous set with the requested parent and mask.  May
212  * return failures if the mask is invalid or a new number cannot be
213  * allocated.
214  */
215 static int
216 cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
217 {
218 	struct cpuset *set;
219 	cpusetid_t id;
220 	int error;
221 
222 	id = alloc_unr(cpuset_unr);
223 	if (id == -1)
224 		return (ENFILE);
225 	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
226 	error = _cpuset_create(set, parent, mask, id);
227 	if (error == 0)
228 		return (0);
229 	free_unr(cpuset_unr, id);
230 	uma_zfree(cpuset_zone, set);
231 
232 	return (error);
233 }
234 
235 /*
236  * Recursively check for errors that would occur from applying mask to
237  * the tree of sets starting at 'set'.  Checks for sets that would become
238  * empty as well as RDONLY flags.
239  */
240 static int
241 cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
242 {
243 	struct cpuset *nset;
244 	cpuset_t newmask;
245 	int error;
246 
247 	mtx_assert(&cpuset_lock, MA_OWNED);
248 	if (set->cs_flags & CPU_SET_RDONLY)
249 		return (EPERM);
250 	if (!CPU_OVERLAP(&set->cs_mask, mask))
251 		return (EDEADLK);
252 	CPU_COPY(&set->cs_mask, &newmask);
253 	CPU_AND(&newmask, mask);
254 	error = 0;
255 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
256 		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
257 			break;
258 	return (error);
259 }
260 
261 /*
262  * Applies the mask 'mask' without checking for empty sets or permissions.
263  */
264 static void
265 cpuset_update(struct cpuset *set, cpuset_t *mask)
266 {
267 	struct cpuset *nset;
268 
269 	mtx_assert(&cpuset_lock, MA_OWNED);
270 	CPU_AND(&set->cs_mask, mask);
271 	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
272 		cpuset_update(nset, &set->cs_mask);
273 
274 	return;
275 }
276 
277 /*
278  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
279  * mask to restrict all children in the tree.  Checks for validity before
280  * applying the changes.
281  */
282 static int
283 cpuset_modify(struct cpuset *set, cpuset_t *mask)
284 {
285 	struct cpuset *root;
286 	int error;
287 
288 	error = suser(curthread);
289 	if (error)
290 		return (error);
291 	/*
292 	 * Verify that we have access to this set of
293 	 * cpus.
294 	 */
295 	root = set->cs_parent;
296 	if (root && !CPU_SUBSET(&root->cs_mask, mask))
297 		return (EINVAL);
298 	mtx_lock_spin(&cpuset_lock);
299 	error = cpuset_testupdate(set, mask);
300 	if (error)
301 		goto out;
302 	cpuset_update(set, mask);
303 	CPU_COPY(mask, &set->cs_mask);
304 out:
305 	mtx_unlock_spin(&cpuset_lock);
306 
307 	return (error);
308 }
309 
310 /*
311  * Walks up the tree from 'set' to find the root.  Returns the root
312  * referenced.
313  */
314 static struct cpuset *
315 cpuset_root(struct cpuset *set)
316 {
317 
318 	for (; set->cs_parent != NULL; set = set->cs_parent)
319 		if (set->cs_flags & CPU_SET_ROOT)
320 			break;
321 	cpuset_ref(set);
322 
323 	return (set);
324 }
325 
326 /*
327  * Find the first non-anonymous set starting from 'set'.  Returns this set
328  * referenced.  May return the passed in set with an extra ref if it is
329  * not anonymous.
330  */
331 static struct cpuset *
332 cpuset_base(struct cpuset *set)
333 {
334 
335 	if (set->cs_id == CPUSET_INVALID)
336 		set = set->cs_parent;
337 	cpuset_ref(set);
338 
339 	return (set);
340 }
341 
342 /*
343  * Resolve the 'which' parameter of several cpuset apis.
344  *
345  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
346  * checks for permission via p_cansched().
347  *
348  * For WHICH_SET returns a valid set with a new reference.
349  *
350  * -1 may be supplied for any argument to mean the current proc/thread or
351  * the base set of the current thread.  May fail with ESRCH/EPERM.
352  */
353 static int
354 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
355     struct cpuset **setp)
356 {
357 	struct cpuset *set;
358 	struct thread *td;
359 	struct proc *p;
360 	int error;
361 
362 	*pp = p = NULL;
363 	*tdp = td = NULL;
364 	*setp = set = NULL;
365 	switch (which) {
366 	case CPU_WHICH_PID:
367 		if (id == -1) {
368 			PROC_LOCK(curproc);
369 			p = curproc;
370 			break;
371 		}
372 		if ((p = pfind(id)) == NULL)
373 			return (ESRCH);
374 		break;
375 	case CPU_WHICH_TID:
376 		if (id == -1) {
377 			PROC_LOCK(curproc);
378 			p = curproc;
379 			td = curthread;
380 			break;
381 		}
382 		sx_slock(&allproc_lock);
383 		FOREACH_PROC_IN_SYSTEM(p) {
384 			PROC_LOCK(p);
385 			PROC_SLOCK(p);
386 			FOREACH_THREAD_IN_PROC(p, td)
387 				if (td->td_tid == id)
388 					break;
389 			PROC_SUNLOCK(p);
390 			if (td != NULL)
391 				break;
392 			PROC_UNLOCK(p);
393 		}
394 		sx_sunlock(&allproc_lock);
395 		if (td == NULL)
396 			return (ESRCH);
397 		break;
398 	case CPU_WHICH_CPUSET:
399 		if (id == -1) {
400 			thread_lock(curthread);
401 			set = cpuset_base(curthread->td_cpuset);
402 			thread_unlock(curthread);
403 		} else
404 			set = cpuset_lookup(id);
405 		if (set) {
406 			*setp = set;
407 			return (0);
408 		}
409 		return (ESRCH);
410 	default:
411 		return (EINVAL);
412 	}
413 	error = p_cansched(curthread, p);
414 	if (error) {
415 		PROC_UNLOCK(p);
416 		return (error);
417 	}
418 	if (td == NULL)
419 		td = FIRST_THREAD_IN_PROC(p);
420 	*pp = p;
421 	*tdp = td;
422 	return (0);
423 }
424 
425 /*
426  * Create an anonymous set with the provided mask in the space provided by
427  * 'fset'.  If the passed in set is anonymous we use its parent; otherwise
428  * the new set is a child of 'set'.
429  */
430 static int
431 cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
432 {
433 	struct cpuset *parent;
434 
435 	if (set->cs_id == CPUSET_INVALID)
436 		parent = set->cs_parent;
437 	else
438 		parent = set;
439 	if (!CPU_SUBSET(&parent->cs_mask, mask))
440 		return (EINVAL);
441 	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
442 }
443 
444 /*
445  * Handle two cases for replacing the base set or mask of an entire process.
446  *
447  * 1) Set is non-null and mask is null.  This reparents all anonymous sets
448  *    to the provided set and replaces all non-anonymous td_cpusets with the
449  *    provided set.
450  * 2) Mask is non-null and set is null.  This replaces or creates anonymous
451  *    sets for every thread with the existing base as a parent.
452  *
453  * This is overly complicated because we can't allocate while holding a
454  * spinlock and spinlocks must be held while changing and examining thread
455  * state.
456  */
457 static int
458 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
459 {
460 	struct setlist freelist;
461 	struct setlist droplist;
462 	struct cpuset *tdset;
463 	struct cpuset *nset;
464 	struct thread *td;
465 	struct proc *p;
466 	int threads;
467 	int nfree;
468 	int error;
469 	/*
470 	 * The algorithm requires two passes due to locking considerations.
471 	 *
472 	 * 1) Lookup the process and acquire the locks in the required order.
473 	 * 2) If enough cpusets have not been allocated release the locks and
474 	 *    allocate them.  Loop.
475 	 */
476 	LIST_INIT(&freelist);
477 	LIST_INIT(&droplist);
478 	nfree = 0;
479 	for (;;) {
480 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
481 		if (error)
482 			goto out;
483 		PROC_SLOCK(p);
484 		if (nfree >= p->p_numthreads)
485 			break;
486 		threads = p->p_numthreads;
487 		PROC_SUNLOCK(p);
488 		PROC_UNLOCK(p);
489 		for (; nfree < threads; nfree++) {
490 			nset = uma_zalloc(cpuset_zone, M_WAITOK);
491 			LIST_INSERT_HEAD(&freelist, nset, cs_link);
492 		}
493 	}
494 	PROC_LOCK_ASSERT(p, MA_OWNED);
495 	PROC_SLOCK_ASSERT(p, MA_OWNED);
496 	/*
497 	 * Now that the appropriate locks are held and we have enough cpusets,
498 	 * make sure the operation will succeed before applying changes.  The
499 	 * proc lock prevents td_cpuset from changing between calls.
500 	 */
501 	error = 0;
502 	FOREACH_THREAD_IN_PROC(p, td) {
503 		thread_lock(td);
504 		tdset = td->td_cpuset;
505 		/*
506 		 * Verify that a new mask doesn't specify cpus outside of
507 		 * the set the thread is a member of.
508 		 */
509 		if (mask) {
510 			if (tdset->cs_id == CPUSET_INVALID)
511 				tdset = tdset->cs_parent;
512 			if (!CPU_SUBSET(&tdset->cs_mask, mask))
513 				error = EINVAL;
514 		/*
515 		 * Verify that a new set won't leave an existing thread
516 		 * mask without a cpu to run on.  It can, however, restrict
517 		 * the set.
518 		 */
519 		} else if (tdset->cs_id == CPUSET_INVALID) {
520 			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
521 				error = EINVAL;
522 		}
523 		thread_unlock(td);
524 		if (error)
525 			goto unlock_out;
526 	}
527 	/*
528 	 * Replace each thread's cpuset while using deferred release.  We
529 	 * must do this because the PROC_SLOCK has to be held while traversing
530 	 * the thread list and this limits the type of operations allowed.
531 	 */
532 	FOREACH_THREAD_IN_PROC(p, td) {
533 		thread_lock(td);
534 		/*
535 		 * If we presently have an anonymous set or are applying a
536 		 * mask we must create an anonymous shadow set.  That is
537 		 * either parented to our existing base or the supplied set.
538 		 *
539 		 * If we have a base set with no anonymous shadow we simply
540 		 * replace it outright.
541 		 */
542 		tdset = td->td_cpuset;
543 		if (tdset->cs_id == CPUSET_INVALID || mask) {
544 			nset = LIST_FIRST(&freelist);
545 			LIST_REMOVE(nset, cs_link);
546 			if (mask)
547 				error = cpuset_shadow(tdset, nset, mask);
548 			else
549 				error = _cpuset_create(nset, set,
550 				    &tdset->cs_mask, CPUSET_INVALID);
551 			if (error) {
552 				LIST_INSERT_HEAD(&freelist, nset, cs_link);
553 				thread_unlock(td);
554 				break;
555 			}
556 		} else
557 			nset = cpuset_ref(set);
558 		cpuset_rel_defer(&droplist, tdset);
559 		td->td_cpuset = nset;
560 		sched_affinity(td);
561 		thread_unlock(td);
562 	}
563 unlock_out:
564 	PROC_SUNLOCK(p);
565 	PROC_UNLOCK(p);
566 out:
567 	while ((nset = LIST_FIRST(&droplist)) != NULL)
568 		cpuset_rel_complete(nset);
569 	while ((nset = LIST_FIRST(&freelist)) != NULL) {
570 		LIST_REMOVE(nset, cs_link);
571 		uma_zfree(cpuset_zone, nset);
572 	}
573 	return (error);
574 }
575 
576 /*
577  * Apply an anonymous mask to a single thread.
578  */
579 static int
580 cpuset_setthread(lwpid_t id, cpuset_t *mask)
581 {
582 	struct cpuset *nset;
583 	struct cpuset *set;
584 	struct thread *td;
585 	struct proc *p;
586 	int error;
587 
588 	nset = uma_zalloc(cpuset_zone, M_WAITOK);
589 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
590 	if (error)
591 		goto out;
592 	thread_lock(td);
593 	set = td->td_cpuset;
594 	error = cpuset_shadow(set, nset, mask);
595 	if (error == 0) {
596 		cpuset_rel(td->td_cpuset);
597 		td->td_cpuset = nset;
598 		sched_affinity(td);
599 		nset = NULL;
600 	}
601 	thread_unlock(td);
602 	PROC_UNLOCK(p);
603 out:
604 	if (nset)
605 		uma_zfree(cpuset_zone, nset);
606 	return (error);
607 }
608 
609 /*
610  * Creates the cpuset for thread0.  We make two sets:
611  *
612  * 0 - The root set which should represent all valid processors in the
613  *     system.  It is initially created with a mask of all processors
614  *     because we don't know what processors are valid until cpuset_init()
615  *     runs.  This set is immutable.
616  * 1 - The default set which all processes are a member of until changed.
617  *     This allows an administrator to move all threads off of given cpus to
618  *     dedicate them to high priority tasks or save power etc.
619  */
620 struct cpuset *
621 cpuset_thread0(void)
622 {
623 	struct cpuset *set;
624 	int error;
625 
626 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
627 	    NULL, NULL, UMA_ALIGN_PTR, 0);
628 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
629 	/*
630 	 * Create the root system set for the whole machine.  Doesn't use
631 	 * cpuset_create() due to NULL parent.
632 	 */
633 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
634 	set->cs_mask.__bits[0] = -1;
635 	LIST_INIT(&set->cs_children);
636 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
637 	set->cs_ref = 1;
638 	set->cs_flags = CPU_SET_ROOT;
639 	cpuset_zero = set;
640 	/*
641 	 * Now derive a default, modifiable set from that to give out.
642 	 */
643 	set = uma_zalloc(cpuset_zone, M_WAITOK);
644 	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
645 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
646 	/*
647 	 * Initialize the unit allocator. 0 and 1 are allocated above.
648 	 */
649 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
650 
651 	return (set);
652 }
653 
654 /*
655  * This is called once the final set of system cpus is known.  Modifies
656  * the root set and all children and marks the root read-only.
657  */
658 static void
659 cpuset_init(void *arg)
660 {
661 	cpuset_t mask;
662 
663 	CPU_ZERO(&mask);
664 #ifdef SMP
665 	mask.__bits[0] = all_cpus;
666 #else
667 	mask.__bits[0] = 1;
668 #endif
669 	if (cpuset_modify(cpuset_zero, &mask))
670 		panic("Can't set initial cpuset mask.\n");
671 	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
672 }
673 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
674 
675 #ifndef _SYS_SYSPROTO_H_
676 struct cpuset_args {
677 	cpusetid_t	*setid;
678 };
679 #endif
680 int
681 cpuset(struct thread *td, struct cpuset_args *uap)
682 {
683 	struct cpuset *root;
684 	struct cpuset *set;
685 	int error;
686 
687 	thread_lock(td);
688 	root = cpuset_root(td->td_cpuset);
689 	thread_unlock(td);
690 	error = cpuset_create(&set, root, &root->cs_mask);
691 	cpuset_rel(root);
692 	if (error)
693 		return (error);
694 	error = cpuset_setproc(-1, set, NULL);
695 	if (error == 0)
696 		error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
697 	cpuset_rel(set);
698 	return (error);
699 }
700 
701 #ifndef _SYS_SYSPROTO_H_
702 struct cpuset_setid_args {
703 	cpuwhich_t	which;
704 	id_t		id;
705 	cpusetid_t	setid;
706 };
707 #endif
708 int
709 cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
710 {
711 	struct cpuset *set;
712 	int error;
713 
714 	/*
715 	 * Presently we only support per-process sets.
716 	 */
717 	if (uap->which != CPU_WHICH_PID)
718 		return (EINVAL);
719 	set = cpuset_lookup(uap->setid);
720 	if (set == NULL)
721 		return (ESRCH);
722 	error = cpuset_setproc(uap->id, set, NULL);
723 	cpuset_rel(set);
724 	return (error);
725 }
726 
727 #ifndef _SYS_SYSPROTO_H_
728 struct cpuset_getid_args {
729 	cpulevel_t	level;
730 	cpuwhich_t	which;
731 	id_t		id;
732 	cpusetid_t	*setid;
};
733 #endif
734 int
735 cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
736 {
737 	struct cpuset *nset;
738 	struct cpuset *set;
739 	struct thread *ttd;
740 	struct proc *p;
741 	cpusetid_t id;
742 	int error;
743 
744 	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
745 		return (EINVAL);
746 	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
747 	if (error)
748 		return (error);
749 	switch (uap->which) {
750 	case CPU_WHICH_TID:
751 	case CPU_WHICH_PID:
752 		thread_lock(ttd);
753 		set = cpuset_base(ttd->td_cpuset);
754 		thread_unlock(ttd);
755 		PROC_UNLOCK(p);
756 		break;
757 	case CPU_WHICH_CPUSET:
758 		break;
759 	}
760 	switch (uap->level) {
761 	case CPU_LEVEL_ROOT:
762 		nset = cpuset_root(set);
763 		cpuset_rel(set);
764 		set = nset;
765 		break;
766 	case CPU_LEVEL_CPUSET:
767 		break;
768 	case CPU_LEVEL_WHICH:
769 		break;
770 	}
771 	id = set->cs_id;
772 	cpuset_rel(set);
773 	if (error == 0)
774 		error = copyout(&id, uap->setid, sizeof(id));
775 
776 	return (error);
777 }
778 
779 #ifndef _SYS_SYSPROTO_H_
780 struct cpuset_getaffinity_args {
781 	cpulevel_t	level;
782 	cpuwhich_t	which;
783 	int		id;
784 	int		cpusetsize;
785 	long		*mask;
786 };
787 #endif
788 int
789 cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
790 {
791 	struct thread *ttd;
792 	struct cpuset *nset;
793 	struct cpuset *set;
794 	struct proc *p;
795 	cpuset_t *mask;
796 	int error;
797 	int size;
798 
799 	if (uap->cpusetsize < sizeof(cpuset_t) ||
800 	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
801 		return (ERANGE);
802 	size = uap->cpusetsize;
803 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
804 	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
805 	if (error)
806 		goto out;
807 	error = 0;
808 	switch (uap->level) {
809 	case CPU_LEVEL_ROOT:
810 	case CPU_LEVEL_CPUSET:
811 		switch (uap->which) {
812 		case CPU_WHICH_TID:
813 		case CPU_WHICH_PID:
814 			thread_lock(ttd);
815 			set = cpuset_ref(ttd->td_cpuset);
816 			thread_unlock(ttd);
817 			break;
818 		case CPU_WHICH_CPUSET:
819 			break;
820 		}
821 		if (uap->level == CPU_LEVEL_ROOT)
822 			nset = cpuset_root(set);
823 		else
824 			nset = cpuset_base(set);
825 		CPU_COPY(&nset->cs_mask, mask);
826 		cpuset_rel(nset);
827 		break;
828 	case CPU_LEVEL_WHICH:
829 		switch (uap->which) {
830 		case CPU_WHICH_TID:
831 			thread_lock(ttd);
832 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
833 			thread_unlock(ttd);
834 			break;
835 		case CPU_WHICH_PID:
836 			PROC_SLOCK(p);
837 			FOREACH_THREAD_IN_PROC(p, ttd) {
838 				thread_lock(ttd);
839 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
840 				thread_unlock(ttd);
841 			}
842 			PROC_SUNLOCK(p);
843 			break;
844 		case CPU_WHICH_CPUSET:
845 			CPU_COPY(&set->cs_mask, mask);
846 			break;
847 		}
848 		break;
849 	default:
850 		error = EINVAL;
851 		break;
852 	}
853 	if (set)
854 		cpuset_rel(set);
855 	if (p)
856 		PROC_UNLOCK(p);
857 	if (error == 0)
858 		error = copyout(mask, uap->mask, size);
859 out:
860 	free(mask, M_TEMP);
861 	return (error);
862 }
863 
864 #ifndef _SYS_SYSPROTO_H_
865 struct cpuset_setaffinity_args {
866 	cpulevel_t	level;
867 	cpuwhich_t	which;
868 	int		id;
869 	int		cpusetsize;
870 	long		*mask;
871 };
872 #endif
873 int
874 cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
875 {
876 	struct cpuset *nset;
877 	struct cpuset *set;
878 	struct thread *ttd;
879 	struct proc *p;
880 	cpuset_t *mask;
881 	int error;
882 
883 	if (uap->cpusetsize < sizeof(cpuset_t) ||
884 	    uap->cpusetsize * NBBY > CPU_MAXSIZE)
885 		return (ERANGE);
886 	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
887 	error = copyin(uap->mask, mask, uap->cpusetsize);
888 	if (error)
889 		goto out;
890 	/*
891 	 * Verify that no high bits are set.
892 	 */
893 	if (uap->cpusetsize > sizeof(cpuset_t)) {
894 		char *end;
895 		char *cp;
896 
897 		end = cp = (char *)&mask->__bits;
898 		end += uap->cpusetsize;
899 		cp += sizeof(cpuset_t);
900 		while (cp != end)
901 			if (*cp++ != 0) {
902 				error = EINVAL;
903 				goto out;
904 			}
905 
906 	}
907 	switch (uap->level) {
908 	case CPU_LEVEL_ROOT:
909 	case CPU_LEVEL_CPUSET:
910 		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
911 		if (error)
912 			break;
913 		switch (uap->which) {
914 		case CPU_WHICH_TID:
915 		case CPU_WHICH_PID:
916 			thread_lock(ttd);
917 			set = cpuset_ref(ttd->td_cpuset);
918 			thread_unlock(ttd);
919 			PROC_UNLOCK(p);
920 			break;
921 		case CPU_WHICH_CPUSET:
922 			break;
923 		}
924 		if (uap->level == CPU_LEVEL_ROOT)
925 			nset = cpuset_root(set);
926 		else
927 			nset = cpuset_base(set);
928 		error = cpuset_modify(nset, mask);
929 		cpuset_rel(nset);
930 		cpuset_rel(set);
931 		break;
932 	case CPU_LEVEL_WHICH:
933 		switch (uap->which) {
934 		case CPU_WHICH_TID:
935 			error = cpuset_setthread(uap->id, mask);
936 			break;
937 		case CPU_WHICH_PID:
938 			error = cpuset_setproc(uap->id, NULL, mask);
939 			break;
940 		case CPU_WHICH_CPUSET:
941 			error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p,
942 			    &ttd, &set);
943 			if (error == 0) {
944 				error = cpuset_modify(set, mask);
945 				cpuset_rel(set);
946 			}
947 			break;
948 		default:
949 			error = EINVAL;
950 			break;
951 		}
952 		break;
953 	default:
954 		error = EINVAL;
955 		break;
956 	}
957 out:
958 	free(mask, M_TEMP);
959 	return (error);
960 }
961