/*-
 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_param.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif /* DDB */

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set; by default this is set 1.
 * Each thread may further restrict the cpus it may run on to a subset of
 * this named set.  This creates an anonymous set which other threads and
 * processes may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall API these are referred
 * to as the ROOT, CPUSET, and MASK levels, where CPUSET is called 'base'
 * here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall APIs a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new
 * mask.  Modifying a pid or tid's mask applies only to that tid, but the
 * new mask must still fall within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * should rather apply masks to its own threads via CPU_WHICH_TID and a -1
 * id meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...), as in
 * the sketch below.
 */
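
/*
 * Illustrative userland sketch of the interface described above (not part
 * of this file or the kernel build; error handling via err(3) is kept
 * minimal for brevity): query the cpus in the calling process's base set,
 * then restrict the calling thread to the first of those cpus with an
 * anonymous mask.
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *	#include <err.h>
 *
 *	cpuset_t avail, mask;
 *	int cpu;
 *
 *	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(avail), &avail) != 0)
 *		err(1, "cpuset_getaffinity");
 *	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
 *		if (CPU_ISSET(cpu, &avail))
 *			break;
 *	CPU_ZERO(&mask);
 *	CPU_SET(cpu, &mask);
 *	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_setaffinity");
 */
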
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero, *cpuset_default;

/* Return the size of cpuset_t at the kernel level */
SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
    SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");

cpuset_t *cpuset_root;
cpuset_t cpuset_domain[MAXMEMDOM];

/*
 * Acquire a reference to a cpuset, all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid, struct thread *td)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);

	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
	if (set != NULL && jailed(td->td_ucred)) {
		struct cpuset *jset, *tset;

		jset = td->td_ucred->cr_prison->pr_cpuset;
		for (tset = set; tset != NULL; tset = tset->cs_parent)
			if (tset == jset)
				break;
		if (tset == NULL) {
			cpuset_rel(set);
			set = NULL;
		}
	}

	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(&set->cs_mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (check_mask) {
		if (!CPU_OVERLAP(&set->cs_mask, mask))
			return (EDEADLK);
		CPU_COPY(&set->cs_mask, &newmask);
		CPU_AND(&newmask, mask);
	} else
		CPU_COPY(mask, &newmask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * In case we are called from within the jail,
	 * we do not allow modifying the dedicated root
	 * cpuset of the jail, but we still allow child
	 * sets to be changed.
	 */
	if (jailed(curthread->td_ucred) &&
	    set->cs_flags & CPU_SET_ROOT)
		return (EPERM);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask, 0);
	if (error)
		goto out;
	CPU_COPY(mask, &set->cs_mask);
	cpuset_update(set, mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Resolve the 'which' parameter of several cpuset APIs.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		td = tdfind(id, -1);
		if (td == NULL)
			return (ESRCH);
		p = td->td_proc;
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id, curthread);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	case CPU_WHICH_JAIL:
	{
		/* Find `set' for prison with given id. */
		struct prison *pr;

		sx_slock(&allprison_lock);
		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
		sx_sunlock(&allprison_lock);
		if (pr == NULL)
			return (ESRCH);
		cpuset_ref(pr->pr_cpuset);
		*setp = pr->pr_cpuset;
		mtx_unlock(&pr->pr_mtx);
		return (0);
	}
	case CPU_WHICH_IRQ:
	case CPU_WHICH_INTRHANDLER:
	case CPU_WHICH_ITHREAD:
	case CPU_WHICH_DOMAIN:
		return (0);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent; otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}

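/*
 * For reference (derived from the callers later in this file): sys_cpuset(),
 * kern_cpuset_setid() and cpuset_setproc_update_set() reach cpuset_setproc()
 * with a set and a NULL mask (case 1 in the comment that follows), while
 * kern_cpuset_setaffinity() with CPU_LEVEL_WHICH and CPU_WHICH_PID reaches
 * it with a mask and a NULL set (case 2).
 */
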
/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}

/*
 * Return a string representing a valid layout for a cpuset_t object.
 * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
 */
char *
cpusetobj_strprint(char *buf, const cpuset_t *set)
{
	char *tbuf;
	size_t i, bytesp, bufsiz;

	tbuf = buf;
	bytesp = 0;
	bufsiz = CPUSETBUFSIZ;

	for (i = 0; i < (_NCPUWORDS - 1); i++) {
		bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
		bufsiz -= bytesp;
		tbuf += bytesp;
	}
	snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
	return (buf);
}

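/*
 * For illustration (configuration values assumed, not taken from any
 * particular kernel build): on a kernel where longs are 64 bits and
 * _NCPUWORDS is 4, a set containing only CPUs 0 and 1 is printed by
 * cpusetobj_strprint() as "3,0,0,0".  cpusetobj_strscan() below accepts
 * the same form and also the shortened "3", leaving the unspecified high
 * words zeroed.
 */
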
/*
 * Build a valid cpuset_t object from a string representation.
 * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
 */
int
cpusetobj_strscan(cpuset_t *set, const char *buf)
{
	u_int nwords;
	int i, ret;

	if (strlen(buf) > CPUSETBUFSIZ - 1)
		return (-1);

	/* Allow a shorter version of the mask to be passed when necessary. */
	nwords = 1;
	for (i = 0; buf[i] != '\0'; i++)
		if (buf[i] == ',')
			nwords++;
	if (nwords > _NCPUWORDS)
		return (-1);

	CPU_ZERO(set);
	for (i = 0; i < (nwords - 1); i++) {
		ret = sscanf(buf, "%lx,", &set->__bits[i]);
		if (ret == 0 || ret == -1)
			return (-1);
		buf = strstr(buf, ",");
		if (buf == NULL)
			return (-1);
		buf++;
	}
	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
	if (ret == 0 || ret == -1)
		return (-1);
	return (0);
}

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}

/*
 * Apply new cpumask to the ithread.
 */
int
cpuset_setithread(lwpid_t id, int cpu)
{
	struct cpuset *nset, *rset;
	struct cpuset *parent, *old_set;
	struct thread *td;
	struct proc *p;
	cpusetid_t cs_id;
	cpuset_t mask;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	rset = uma_zalloc(cpuset_zone, M_WAITOK);
	cs_id = CPUSET_INVALID;

	CPU_ZERO(&mask);
	if (cpu == NOCPU)
		CPU_COPY(cpuset_root, &mask);
	else
		CPU_SET(cpu, &mask);

	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
	if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID))
		goto out;

	/* cpuset_which() returns with PROC_LOCK held. */
	old_set = td->td_cpuset;

	if (cpu == NOCPU) {
		/*
		 * Roll back to the default set.  We're not using
		 * cpuset_shadow() here because the CPU_SUBSET() check can
		 * fail if the default set does not contain all CPUs.
		 */
		error = _cpuset_create(nset, cpuset_default, &mask,
		    CPUSET_INVALID);
		goto applyset;
	}

	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
	    old_set->cs_parent->cs_id == 1)) {
		/*
		 * Current set is either default (1) or
		 * shadowed version of default set.
		 *
		 * Allocate new root set to be able to shadow it
		 * with any mask.
		 */
		error = _cpuset_create(rset, cpuset_zero,
		    &cpuset_zero->cs_mask, cs_id);
		if (error != 0) {
			PROC_UNLOCK(p);
			goto out;
		}
		rset->cs_flags |= CPU_SET_ROOT;
		parent = rset;
		rset = NULL;
		cs_id = CPUSET_INVALID;
	} else {
		/* Assume existing set was already allocated by previous call. */
		parent = old_set;
		old_set = NULL;
	}

	error = cpuset_shadow(parent, nset, &mask);
applyset:
	if (error == 0) {
		thread_lock(td);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
		nset = NULL;
	} else
		old_set = NULL;
	PROC_UNLOCK(p);
	if (old_set != NULL)
		cpuset_rel(old_set);
out:
	if (nset != NULL)
		uma_zfree(cpuset_zone, nset);
	if (rset != NULL)
		uma_zfree(cpuset_zone, rset);
	if (cs_id != CPUSET_INVALID)
		free_unr(cpuset_unr, cs_id);
	return (error);
}

/*
 * Creates system-wide cpusets and the cpuset for thread0 including two
 * sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error, i;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);

	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	CPU_FILL(&set->cs_mask);
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;

	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	cpuset_default = set;

	/*
	 * Initialize the unit allocator.  0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	/*
	 * If MD code has not initialized per-domain cpusets, place all
	 * CPUs in domain 0.
	 */
	for (i = 0; i < MAXMEMDOM; i++)
		if (!CPU_EMPTY(&cpuset_domain[i]))
			goto domains_set;
	CPU_COPY(&all_cpus, &cpuset_domain[0]);
domains_set:

	return (set);
}

/*
 * Create a cpuset as in cpuset_create(), but mark the new 'set' as a root
 * set.
 *
 * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
 * for that.
 *
 * In case of no error, returns the set in *setp locked with a reference.
 */
int
cpuset_create_root(struct prison *pr, struct cpuset **setp)
{
	struct cpuset *set;
	int error;

	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));

	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
	if (error)
		return (error);

	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
	    __func__, __LINE__));

	/* Mark the set as root. */
	set = *setp;
	set->cs_flags |= CPU_SET_ROOT;

	return (0);
}

int
cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
{
	int error;

	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));

	cpuset_ref(set);
	error = cpuset_setproc(p->p_pid, set, NULL);
	if (error)
		return (error);
	cpuset_rel(set);
	return (0);
}

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	mask = all_cpus;
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
sys_cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{

	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
}

int
kern_cpuset_setid(struct thread *td, cpuwhich_t which,
    id_t id, cpusetid_t setid)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(setid, td);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{

	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
	    uap->setid));
}

int
kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
    id_t id, cpusetid_t *setid)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t tmpid;
	int error;

	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(which, id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
	case CPU_WHICH_JAIL:
		break;
	case CPU_WHICH_IRQ:
	case CPU_WHICH_INTRHANDLER:
	case CPU_WHICH_ITHREAD:
	case CPU_WHICH_DOMAIN:
		return (EINVAL);
	}
	switch (level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	tmpid = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&tmpid, setid, sizeof(tmpid));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{

	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
	    uap->id, uap->cpusetsize, uap->mask));
}

int
kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
    id_t id, size_t cpusetsize, cpuset_t *maskp)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	size = cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(which, id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
		case CPU_WHICH_DOMAIN:
			error = EINVAL;
			goto out;
		}
		if (level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			CPU_COPY(&set->cs_mask, mask);
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
			error = intr_getaffinity(id, which, mask);
			break;
		case CPU_WHICH_DOMAIN:
			if (id < 0 || id >= MAXMEMDOM)
				error = ESRCH;
			else
				CPU_COPY(&cpuset_domain[id], mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, maskp, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{

	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
	    uap->id, uap->cpusetsize, uap->mask));
}

int
kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
    id_t id, size_t cpusetsize, const cpuset_t *maskp)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(maskp, mask, cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(which, id, &p, &ttd, &set);
		if (error)
			break;
		switch (which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
		case CPU_WHICH_DOMAIN:
			error = EINVAL;
			goto out;
		}
		if (level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			error = cpuset_which(which, id, &p, &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		case CPU_WHICH_IRQ:
		case CPU_WHICH_INTRHANDLER:
		case CPU_WHICH_ITHREAD:
			error = intr_setaffinity(id, which, mask);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}

#ifdef DDB
void
ddb_display_cpuset(const cpuset_t *set)
{
	int cpu, once;

	for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
		if (CPU_ISSET(cpu, set)) {
			if (once == 0) {
				db_printf("%d", cpu);
				once = 1;
			} else
				db_printf(",%d", cpu);
		}
	}
	if (once == 0)
		db_printf("<none>");
}

DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
	struct cpuset *set;

	LIST_FOREACH(set, &cpuset_ids, cs_link) {
		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
		    set, set->cs_id, set->cs_ref, set->cs_flags,
		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
		db_printf("  mask=");
		ddb_display_cpuset(&set->cs_mask);
		db_printf("\n");
		if (db_pager_quit)
			break;
	}
}
#endif /* DDB */