/*-
 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Copyright (c) 2008 Nokia Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>

#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif /* DDB */

/*
 * cpusets provide a mechanism for creating and manipulating sets of
 * processors for the purpose of constraining the scheduling of threads to
 * specific processors.
 *
 * Each process belongs to an identified set, by default this is set 1.  Each
 * thread may further restrict the cpus it may run on to a subset of this
 * named set.  This creates an anonymous set which other threads and processes
 * may not join by number.
 *
 * The named set is referred to herein as the 'base' set to avoid ambiguity.
 * This set is usually a child of a 'root' set while the anonymous set may
 * simply be referred to as a mask.  In the syscall api these are referred to
 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 *
 * Threads inherit their set from their creator whether it be anonymous or
 * not.  This means that anonymous sets are immutable because they may be
 * shared.  To modify an anonymous set a new set is created with the desired
 * mask and the same parent as the existing anonymous set.  This gives the
 * illusion of each thread having a private mask.
 *
 * Via the syscall apis a user may ask to retrieve or modify the root, base,
 * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 * modifies all numbered and anonymous child sets to comply with the new mask.
 * Modifying a pid or tid's mask applies only to that tid but must still
 * exist within the assigned parent set.
 *
 * A thread may not be assigned to a group separate from other threads in
 * the process.  This is to remove ambiguity when the setid is queried with
 * a pid argument.  There is no other technical limitation.
 *
 * This somewhat complex arrangement is intended to make it easy for
 * applications to query available processors and bind their threads to
 * specific processors while also allowing administrators to dynamically
 * reprovision by changing sets which apply to groups of processes.
 *
 * A simple application should not concern itself with sets at all and
 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 * meaning 'curthread'.  It may query available cpus for that tid with a
 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
 */
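
/*
 * Illustrative userland sketch of the simple-application pattern described
 * above; this is not kernel code and only assumes the cpuset_getaffinity(2)
 * and cpuset_setaffinity(2) interfaces from <sys/cpuset.h>.  It queries the
 * cpus available to the process' base set and then narrows only the calling
 * thread:
 *
 *	cpuset_t avail, one;
 *
 *	cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(avail), &avail);
 *	CPU_ZERO(&one);
 *	CPU_SET(0, &one);
 *	if (CPU_ISSET(0, &avail))
 *		cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *		    sizeof(one), &one);
 */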

static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;

cpuset_t *cpuset_root;

/*
 * Acquire a reference to a cpuset, all pointers must be tracked with refs.
 */
struct cpuset *
cpuset_ref(struct cpuset *set)
{

	refcount_acquire(&set->cs_ref);
	return (set);
}

/*
 * Walks up the tree from 'set' to find the root.  Returns the root
 * referenced.
 */
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{

	for (; set->cs_parent != NULL; set = set->cs_parent)
		if (set->cs_flags & CPU_SET_ROOT)
			break;
	cpuset_ref(set);

	return (set);
}

/*
 * Find the first non-anonymous set starting from 'set'.  Returns this set
 * referenced.  May return the passed in set with an extra ref if it is
 * not anonymous.
 */
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{

	if (set->cs_id == CPUSET_INVALID)
		set = set->cs_parent;
	cpuset_ref(set);

	return (set);
}

/*
 * Release a reference in a context where it is safe to allocate.
 */
void
cpuset_rel(struct cpuset *set)
{
	cpusetid_t id;

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	id = set->cs_id;
	if (id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
	if (id != CPUSET_INVALID)
		free_unr(cpuset_unr, id);
}

/*
 * Deferred release must be used when in a context that is not safe to
 * allocate/free.  This places any unreferenced sets on the list 'head'.
 */
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{

	if (refcount_release(&set->cs_ref) == 0)
		return;
	mtx_lock_spin(&cpuset_lock);
	LIST_REMOVE(set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_REMOVE(set, cs_link);
	LIST_INSERT_HEAD(head, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);
}

/*
 * Complete a deferred release.  Removes the set from the list provided to
 * cpuset_rel_defer.
 */
static void
cpuset_rel_complete(struct cpuset *set)
{
	LIST_REMOVE(set, cs_link);
	cpuset_rel(set->cs_parent);
	uma_zfree(cpuset_zone, set);
}
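
/*
 * Typical deferred-release usage, as in cpuset_setproc() below (sketch):
 *
 *	LIST_INIT(&droplist);
 *	thread_lock(td);
 *	cpuset_rel_defer(&droplist, td->td_cpuset);
 *	td->td_cpuset = nset;
 *	thread_unlock(td);
 *	...
 *	while ((set = LIST_FIRST(&droplist)) != NULL)
 *		cpuset_rel_complete(set);
 */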

/*
 * Find a set based on an id.  Returns it with a ref.
 */
static struct cpuset *
cpuset_lookup(cpusetid_t setid, struct thread *td)
{
	struct cpuset *set;

	if (setid == CPUSET_INVALID)
		return (NULL);
	mtx_lock_spin(&cpuset_lock);
	LIST_FOREACH(set, &cpuset_ids, cs_link)
		if (set->cs_id == setid)
			break;
	if (set)
		cpuset_ref(set);
	mtx_unlock_spin(&cpuset_lock);

	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
	if (set != NULL && jailed(td->td_ucred)) {
		struct cpuset *jset, *tset;

		jset = td->td_ucred->cr_prison->pr_cpuset;
		for (tset = set; tset != NULL; tset = tset->cs_parent)
			if (tset == jset)
				break;
		if (tset == NULL) {
			cpuset_rel(set);
			set = NULL;
		}
	}

	return (set);
}

/*
 * Create a set in the space provided in 'set' with the provided parameters.
 * The set is returned with a single ref.  May return EDEADLK if the set
 * will have no valid cpu based on restrictions from the parent.
 */
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
    cpusetid_t id)
{

	if (!CPU_OVERLAP(&parent->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(mask, &set->cs_mask);
	LIST_INIT(&set->cs_children);
	refcount_init(&set->cs_ref, 1);
	set->cs_flags = 0;
	mtx_lock_spin(&cpuset_lock);
	CPU_AND(mask, &parent->cs_mask);
	set->cs_id = id;
	set->cs_parent = cpuset_ref(parent);
	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
	if (set->cs_id != CPUSET_INVALID)
		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	mtx_unlock_spin(&cpuset_lock);

	return (0);
}

/*
 * Create a new non-anonymous set with the requested parent and mask.  May
 * return failures if the mask is invalid or a new number can not be
 * allocated.
 */
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
{
	struct cpuset *set;
	cpusetid_t id;
	int error;

	id = alloc_unr(cpuset_unr);
	if (id == -1)
		return (ENFILE);
	*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, parent, mask, id);
	if (error == 0)
		return (0);
	free_unr(cpuset_unr, id);
	uma_zfree(cpuset_zone, set);

	return (error);
}

/*
 * Recursively check for errors that would occur from applying mask to
 * the tree of sets starting at 'set'.  Checks for sets that would become
 * empty as well as RDONLY flags.
 */
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;
	cpuset_t newmask;
	int error;

	mtx_assert(&cpuset_lock, MA_OWNED);
	if (set->cs_flags & CPU_SET_RDONLY)
		return (EPERM);
	if (!CPU_OVERLAP(&set->cs_mask, mask))
		return (EDEADLK);
	CPU_COPY(&set->cs_mask, &newmask);
	CPU_AND(&newmask, mask);
	error = 0;
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
			break;
	return (error);
}

/*
 * Applies the mask 'mask' without checking for empty sets or permissions.
 */
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *nset;

	mtx_assert(&cpuset_lock, MA_OWNED);
	CPU_AND(&set->cs_mask, mask);
	LIST_FOREACH(nset, &set->cs_children, cs_siblings)
		cpuset_update(nset, &set->cs_mask);

	return;
}
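
/*
 * For example, applying a mask of cpus 2-5 to a set whose mask is cpus 0-3
 * leaves the set and all of its children with cpus 2-3; had the
 * intersection been empty, cpuset_testupdate() would have returned EDEADLK
 * before any update was made.
 */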

/*
 * Modify the set 'set' to use a copy of the mask provided.  Apply this new
 * mask to restrict all children in the tree.  Checks for validity before
 * applying the changes.
 */
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
	struct cpuset *root;
	int error;

	error = priv_check(curthread, PRIV_SCHED_CPUSET);
	if (error)
		return (error);
	/*
	 * In case we are called from within the jail we do not allow
	 * modifying the dedicated root cpuset of the jail but may still
	 * allow changing child sets.
	 */
	if (jailed(curthread->td_ucred) &&
	    set->cs_flags & CPU_SET_ROOT)
		return (EPERM);
	/*
	 * Verify that we have access to this set of
	 * cpus.
	 */
	root = set->cs_parent;
	if (root && !CPU_SUBSET(&root->cs_mask, mask))
		return (EINVAL);
	mtx_lock_spin(&cpuset_lock);
	error = cpuset_testupdate(set, mask);
	if (error)
		goto out;
	cpuset_update(set, mask);
	CPU_COPY(mask, &set->cs_mask);
out:
	mtx_unlock_spin(&cpuset_lock);

	return (error);
}

/*
 * Resolve the 'which' parameter of several cpuset apis.
 *
 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
 * checks for permission via p_cansched().
 *
 * For WHICH_SET returns a valid set with a new reference.
 *
 * -1 may be supplied for any argument to mean the current proc/thread or
 * the base set of the current thread.  May fail with ESRCH/EPERM.
 */
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
    struct cpuset **setp)
{
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	*pp = p = NULL;
	*tdp = td = NULL;
	*setp = set = NULL;
	switch (which) {
	case CPU_WHICH_PID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			break;
		}
		if ((p = pfind(id)) == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_TID:
		if (id == -1) {
			PROC_LOCK(curproc);
			p = curproc;
			td = curthread;
			break;
		}
		sx_slock(&allproc_lock);
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				if (td->td_tid == id)
					break;
			if (td != NULL)
				break;
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		if (td == NULL)
			return (ESRCH);
		break;
	case CPU_WHICH_CPUSET:
		if (id == -1) {
			thread_lock(curthread);
			set = cpuset_refbase(curthread->td_cpuset);
			thread_unlock(curthread);
		} else
			set = cpuset_lookup(id, curthread);
		if (set) {
			*setp = set;
			return (0);
		}
		return (ESRCH);
	case CPU_WHICH_JAIL:
	{
		/* Find `set' for prison with given id. */
		struct prison *pr;

		sx_slock(&allprison_lock);
		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
		sx_sunlock(&allprison_lock);
		if (pr == NULL)
			return (ESRCH);
		cpuset_ref(pr->pr_cpuset);
		*setp = pr->pr_cpuset;
		mtx_unlock(&pr->pr_mtx);
		return (0);
	}
	case CPU_WHICH_IRQ:
		return (0);
	default:
		return (EINVAL);
	}
	error = p_cansched(curthread, p);
	if (error) {
		PROC_UNLOCK(p);
		return (error);
	}
	if (td == NULL)
		td = FIRST_THREAD_IN_PROC(p);
	*pp = p;
	*tdp = td;
	return (0);
}

/*
 * Create an anonymous set with the provided mask in the space provided by
 * 'fset'.  If the passed in set is anonymous we use its parent otherwise
 * the new set is a child of 'set'.
 */
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
{
	struct cpuset *parent;

	if (set->cs_id == CPUSET_INVALID)
		parent = set->cs_parent;
	else
		parent = set;
	if (!CPU_SUBSET(&parent->cs_mask, mask))
		return (EDEADLK);
	return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}
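
/*
 * Because an anonymous source set contributes its parent rather than
 * itself, repeatedly shadowing a thread's mask never builds up a chain of
 * anonymous sets; each shadow hangs directly off the thread's base set.
 */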

/*
 * Handle two cases for replacing the base set or mask of an entire process.
 *
 * 1) Set is non-null and mask is null.  This reparents all anonymous sets
 *    to the provided set and replaces all non-anonymous td_cpusets with the
 *    provided set.
 * 2) Mask is non-null and set is null.  This replaces or creates anonymous
 *    sets for every thread with the existing base as a parent.
 *
 * This is overly complicated because we can't allocate while holding a
 * spinlock and spinlocks must be held while changing and examining thread
 * state.
 */
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
	struct setlist freelist;
	struct setlist droplist;
	struct cpuset *tdset;
	struct cpuset *nset;
	struct thread *td;
	struct proc *p;
	int threads;
	int nfree;
	int error;
	/*
	 * The algorithm requires two passes due to locking considerations.
	 *
	 * 1) Lookup the process and acquire the locks in the required order.
	 * 2) If enough cpusets have not been allocated release the locks and
	 *    allocate them.  Loop.
	 */
	LIST_INIT(&freelist);
	LIST_INIT(&droplist);
	nfree = 0;
	for (;;) {
		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
		if (error)
			goto out;
		if (nfree >= p->p_numthreads)
			break;
		threads = p->p_numthreads;
		PROC_UNLOCK(p);
		for (; nfree < threads; nfree++) {
			nset = uma_zalloc(cpuset_zone, M_WAITOK);
			LIST_INSERT_HEAD(&freelist, nset, cs_link);
		}
	}
	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * Now that the appropriate locks are held and we have enough cpusets,
	 * make sure the operation will succeed before applying changes.  The
	 * proc lock prevents td_cpuset from changing between calls.
	 */
	error = 0;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		tdset = td->td_cpuset;
		/*
		 * Verify that a new mask doesn't specify cpus outside of
		 * the set the thread is a member of.
		 */
		if (mask) {
			if (tdset->cs_id == CPUSET_INVALID)
				tdset = tdset->cs_parent;
			if (!CPU_SUBSET(&tdset->cs_mask, mask))
				error = EDEADLK;
		/*
		 * Verify that a new set won't leave an existing thread
		 * mask without a cpu to run on.  It can, however, restrict
		 * the set.
		 */
		} else if (tdset->cs_id == CPUSET_INVALID) {
			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
				error = EDEADLK;
		}
		thread_unlock(td);
		if (error)
			goto unlock_out;
	}
	/*
	 * Replace each thread's cpuset while using deferred release.  We
	 * must do this because the thread lock must be held while operating
	 * on the thread and this limits the type of operations allowed.
	 */
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		/*
		 * If we presently have an anonymous set or are applying a
		 * mask we must create an anonymous shadow set.  That is
		 * either parented to our existing base or the supplied set.
		 *
		 * If we have a base set with no anonymous shadow we simply
		 * replace it outright.
		 */
		tdset = td->td_cpuset;
		if (tdset->cs_id == CPUSET_INVALID || mask) {
			nset = LIST_FIRST(&freelist);
			LIST_REMOVE(nset, cs_link);
			if (mask)
				error = cpuset_shadow(tdset, nset, mask);
			else
				error = _cpuset_create(nset, set,
				    &tdset->cs_mask, CPUSET_INVALID);
			if (error) {
				LIST_INSERT_HEAD(&freelist, nset, cs_link);
				thread_unlock(td);
				break;
			}
		} else
			nset = cpuset_ref(set);
		cpuset_rel_defer(&droplist, tdset);
		td->td_cpuset = nset;
		sched_affinity(td);
		thread_unlock(td);
	}
unlock_out:
	PROC_UNLOCK(p);
out:
	while ((nset = LIST_FIRST(&droplist)) != NULL)
		cpuset_rel_complete(nset);
	while ((nset = LIST_FIRST(&freelist)) != NULL) {
		LIST_REMOVE(nset, cs_link);
		uma_zfree(cpuset_zone, nset);
	}
	return (error);
}
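
/*
 * cpuset_setthread() below is the single-thread analogue of case 2) above:
 * it shadows one thread's mask with cpuset_shadow() instead of walking the
 * whole process.
 */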

/*
 * Apply an anonymous mask to a single thread.
 */
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *td;
	struct proc *p;
	int error;

	nset = uma_zalloc(cpuset_zone, M_WAITOK);
	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
	if (error)
		goto out;
	set = NULL;
	thread_lock(td);
	error = cpuset_shadow(td->td_cpuset, nset, mask);
	if (error == 0) {
		set = td->td_cpuset;
		td->td_cpuset = nset;
		sched_affinity(td);
		nset = NULL;
	}
	thread_unlock(td);
	PROC_UNLOCK(p);
	if (set)
		cpuset_rel(set);
out:
	if (nset)
		uma_zfree(cpuset_zone, nset);
	return (error);
}

/*
 * Creates the cpuset for thread0.  We make two sets:
 *
 * 0 - The root set which should represent all valid processors in the
 *     system.  It is initially created with a mask of all processors
 *     because we don't know what processors are valid until cpuset_init()
 *     runs.  This set is immutable.
 * 1 - The default set which all processes are a member of until changed.
 *     This allows an administrator to move all threads off of given cpus to
 *     dedicate them to high priority tasks or save power etc.
 */
struct cpuset *
cpuset_thread0(void)
{
	struct cpuset *set;
	int error;

	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
	/*
	 * Create the root system set for the whole machine.  Doesn't use
	 * cpuset_create() due to NULL parent.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
	set->cs_mask.__bits[0] = -1;
	LIST_INIT(&set->cs_children);
	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
	set->cs_ref = 1;
	set->cs_flags = CPU_SET_ROOT;
	cpuset_zero = set;
	cpuset_root = &set->cs_mask;
	/*
	 * Now derive a default, modifiable set from that to give out.
	 */
	set = uma_zalloc(cpuset_zone, M_WAITOK);
	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
	KASSERT(error == 0, ("Error creating default set: %d\n", error));
	/*
	 * Initialize the unit allocator. 0 and 1 are allocated above.
	 */
	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);

	return (set);
}
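
/*
 * The resulting initial hierarchy is roughly (illustrative):
 *
 *	set 0 (root; made read-only by cpuset_init())
 *	 `-- set 1 (default base set of every process)
 *	      `-- anonymous per-thread masks created on demand
 */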

/*
 * Create a cpuset, as with cpuset_create(), but mark the new 'set' as root.
 *
 * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
 * for that.
 *
 * On success, returns the set in *setp with a reference held.
 */
int
cpuset_create_root(struct prison *pr, struct cpuset **setp)
{
	struct cpuset *set;
	int error;

	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));

	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
	if (error)
		return (error);

	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
	    __func__, __LINE__));

	/* Mark the set as root. */
	set = *setp;
	set->cs_flags |= CPU_SET_ROOT;

	return (0);
}

int
cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
{
	int error;

	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));

	cpuset_ref(set);
	error = cpuset_setproc(p->p_pid, set, NULL);
	if (error)
		return (error);
	cpuset_rel(set);
	return (0);
}
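
/*
 * Taken together, cpuset_create_root() and cpuset_setproc_update_set() let
 * the jail code give a new prison its own root set and then move an
 * attaching process into it (descriptive note inferred from the prison
 * argument and the comment above).
 */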

/*
 * This is called once the final set of system cpus is known.  Modifies
 * the root set and all children and marks the root read-only.
 */
static void
cpuset_init(void *arg)
{
	cpuset_t mask;

	CPU_ZERO(&mask);
#ifdef SMP
	mask.__bits[0] = all_cpus;
#else
	mask.__bits[0] = 1;
#endif
	if (cpuset_modify(cpuset_zero, &mask))
		panic("Can't set initial cpuset mask.\n");
	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);

#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
	cpusetid_t	*setid;
};
#endif
int
cpuset(struct thread *td, struct cpuset_args *uap)
{
	struct cpuset *root;
	struct cpuset *set;
	int error;

	thread_lock(td);
	root = cpuset_refroot(td->td_cpuset);
	thread_unlock(td);
	error = cpuset_create(&set, root, &root->cs_mask);
	cpuset_rel(root);
	if (error)
		return (error);
	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
	if (error == 0)
		error = cpuset_setproc(-1, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	setid;
};
#endif
int
cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
	struct cpuset *set;
	int error;

	/*
	 * Presently we only support per-process sets.
	 */
	if (uap->which != CPU_WHICH_PID)
		return (EINVAL);
	set = cpuset_lookup(uap->setid, td);
	if (set == NULL)
		return (ESRCH);
	error = cpuset_setproc(uap->id, set, NULL);
	cpuset_rel(set);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	cpusetid_t	*setid;
};
#endif
int
cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpusetid_t id;
	int error;

	if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
		return (EINVAL);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		return (error);
	switch (uap->which) {
	case CPU_WHICH_TID:
	case CPU_WHICH_PID:
		thread_lock(ttd);
		set = cpuset_refbase(ttd->td_cpuset);
		thread_unlock(ttd);
		PROC_UNLOCK(p);
		break;
	case CPU_WHICH_CPUSET:
	case CPU_WHICH_JAIL:
		break;
	case CPU_WHICH_IRQ:
		return (EINVAL);
	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
		nset = cpuset_refroot(set);
		cpuset_rel(set);
		set = nset;
		break;
	case CPU_LEVEL_CPUSET:
		break;
	case CPU_LEVEL_WHICH:
		break;
	}
	id = set->cs_id;
	cpuset_rel(set);
	if (error == 0)
		error = copyout(&id, uap->setid, sizeof(id));

	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	cpuset_t	*mask;
};
#endif
int
cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
	struct thread *ttd;
	struct cpuset *nset;
	struct cpuset *set;
	struct proc *p;
	cpuset_t *mask;
	int error;
	size_t size;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	size = uap->cpusetsize;
	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
	error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
	if (error)
		goto out;
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		CPU_COPY(&nset->cs_mask, mask);
		cpuset_rel(nset);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			thread_lock(ttd);
			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
			thread_unlock(ttd);
			break;
		case CPU_WHICH_PID:
			FOREACH_THREAD_IN_PROC(p, ttd) {
				thread_lock(ttd);
				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
				thread_unlock(ttd);
			}
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			CPU_COPY(&set->cs_mask, mask);
			break;
		case CPU_WHICH_IRQ:
			error = intr_getaffinity(uap->id, mask);
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	if (set)
		cpuset_rel(set);
	if (p)
		PROC_UNLOCK(p);
	if (error == 0)
		error = copyout(mask, uap->mask, size);
out:
	free(mask, M_TEMP);
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
	cpulevel_t	level;
	cpuwhich_t	which;
	id_t		id;
	size_t		cpusetsize;
	const cpuset_t	*mask;
};
#endif
int
cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
	struct cpuset *nset;
	struct cpuset *set;
	struct thread *ttd;
	struct proc *p;
	cpuset_t *mask;
	int error;

	if (uap->cpusetsize < sizeof(cpuset_t) ||
	    uap->cpusetsize > CPU_MAXSIZE / NBBY)
		return (ERANGE);
	mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
	error = copyin(uap->mask, mask, uap->cpusetsize);
	if (error)
		goto out;
	/*
	 * Verify that no high bits are set.
	 */
	if (uap->cpusetsize > sizeof(cpuset_t)) {
		char *end;
		char *cp;

		end = cp = (char *)&mask->__bits;
		end += uap->cpusetsize;
		cp += sizeof(cpuset_t);
		while (cp != end)
			if (*cp++ != 0) {
				error = EINVAL;
				goto out;
			}

	}
	switch (uap->level) {
	case CPU_LEVEL_ROOT:
	case CPU_LEVEL_CPUSET:
		error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
		if (error)
			break;
		switch (uap->which) {
		case CPU_WHICH_TID:
		case CPU_WHICH_PID:
			thread_lock(ttd);
			set = cpuset_ref(ttd->td_cpuset);
			thread_unlock(ttd);
			PROC_UNLOCK(p);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			break;
		case CPU_WHICH_IRQ:
			error = EINVAL;
			goto out;
		}
		if (uap->level == CPU_LEVEL_ROOT)
			nset = cpuset_refroot(set);
		else
			nset = cpuset_refbase(set);
		error = cpuset_modify(nset, mask);
		cpuset_rel(nset);
		cpuset_rel(set);
		break;
	case CPU_LEVEL_WHICH:
		switch (uap->which) {
		case CPU_WHICH_TID:
			error = cpuset_setthread(uap->id, mask);
			break;
		case CPU_WHICH_PID:
			error = cpuset_setproc(uap->id, NULL, mask);
			break;
		case CPU_WHICH_CPUSET:
		case CPU_WHICH_JAIL:
			error = cpuset_which(uap->which, uap->id, &p,
			    &ttd, &set);
			if (error == 0) {
				error = cpuset_modify(set, mask);
				cpuset_rel(set);
			}
			break;
		case CPU_WHICH_IRQ:
			error = intr_setaffinity(uap->id, mask);
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
out:
	free(mask, M_TEMP);
	return (error);
}

#ifdef DDB
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
	struct cpuset *set;
	int cpu, once;

	LIST_FOREACH(set, &cpuset_ids, cs_link) {
		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
		    set, set->cs_id, set->cs_ref, set->cs_flags,
		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
		db_printf(" mask=");
		for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
			if (CPU_ISSET(cpu, &set->cs_mask)) {
				if (once == 0) {
					db_printf("%d", cpu);
					once = 1;
				} else
					db_printf(",%d", cpu);
			}
		}
		db_printf("\n");
		if (db_pager_quit)
			break;
	}
}
#endif /* DDB */