1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org> 5 * All rights reserved. 6 * 7 * Copyright (c) 2008 Nokia Corporation 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice unmodified, this list of conditions, and the following 15 * disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 */ 32 33 #include <sys/cdefs.h> 34 __FBSDID("$FreeBSD$"); 35 36 #include "opt_ddb.h" 37 #include "opt_ktrace.h" 38 39 #include <sys/param.h> 40 #include <sys/systm.h> 41 #include <sys/sysctl.h> 42 #include <sys/ctype.h> 43 #include <sys/sysproto.h> 44 #include <sys/jail.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/malloc.h> 48 #include <sys/mutex.h> 49 #include <sys/priv.h> 50 #include <sys/proc.h> 51 #include <sys/refcount.h> 52 #include <sys/sched.h> 53 #include <sys/smp.h> 54 #include <sys/syscallsubr.h> 55 #include <sys/sysent.h> 56 #include <sys/capsicum.h> 57 #include <sys/cpuset.h> 58 #include <sys/domainset.h> 59 #include <sys/sx.h> 60 #include <sys/queue.h> 61 #include <sys/libkern.h> 62 #include <sys/limits.h> 63 #include <sys/bus.h> 64 #include <sys/interrupt.h> 65 #include <sys/vmmeter.h> 66 #include <sys/ktrace.h> 67 68 #include <vm/uma.h> 69 #include <vm/vm.h> 70 #include <vm/vm_object.h> 71 #include <vm/vm_page.h> 72 #include <vm/vm_pageout.h> 73 #include <vm/vm_extern.h> 74 #include <vm/vm_param.h> 75 #include <vm/vm_phys.h> 76 #include <vm/vm_pagequeue.h> 77 78 #ifdef DDB 79 #include <ddb/ddb.h> 80 #endif /* DDB */ 81 82 /* 83 * cpusets provide a mechanism for creating and manipulating sets of 84 * processors for the purpose of constraining the scheduling of threads to 85 * specific processors. 86 * 87 * Each process belongs to an identified set, by default this is set 1. Each 88 * thread may further restrict the cpus it may run on to a subset of this 89 * named set. This creates an anonymous set which other threads and processes 90 * may not join by number. 91 * 92 * The named set is referred to herein as the 'base' set to avoid ambiguity. 93 * This set is usually a child of a 'root' set while the anonymous set may 94 * simply be referred to as a mask. In the syscall api these are referred to 95 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. 96 * 97 * Threads inherit their set from their creator whether it be anonymous or 98 * not. This means that anonymous sets are immutable because they may be 99 * shared. To modify an anonymous set a new set is created with the desired 100 * mask and the same parent as the existing anonymous set. This gives the 101 * illusion of each thread having a private mask. 102 * 103 * Via the syscall apis a user may ask to retrieve or modify the root, base, 104 * or mask that is discovered via a pid, tid, or setid. Modifying a set 105 * modifies all numbered and anonymous child sets to comply with the new mask. 106 * Modifying a pid or tid's mask applies only to that tid but must still 107 * exist within the assigned parent set. 108 * 109 * A thread may not be assigned to a group separate from other threads in 110 * the process. This is to remove ambiguity when the setid is queried with 111 * a pid argument. There is no other technical limitation. 112 * 113 * This somewhat complex arrangement is intended to make it easy for 114 * applications to query available processors and bind their threads to 115 * specific processors while also allowing administrators to dynamically 116 * reprovision by changing sets which apply to groups of processes. 117 * 118 * A simple application should not concern itself with sets at all and 119 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id 120 * meaning 'curthread'. It may query available cpus for that tid with a 121 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). 122 */ 123 124 LIST_HEAD(domainlist, domainset); 125 struct domainset __read_mostly domainset_firsttouch; 126 struct domainset __read_mostly domainset_fixed[MAXMEMDOM]; 127 struct domainset __read_mostly domainset_interleave; 128 struct domainset __read_mostly domainset_prefer[MAXMEMDOM]; 129 struct domainset __read_mostly domainset_roundrobin; 130 131 static uma_zone_t cpuset_zone; 132 static uma_zone_t domainset_zone; 133 static struct mtx cpuset_lock; 134 static struct setlist cpuset_ids; 135 static struct domainlist cpuset_domains; 136 static struct unrhdr *cpuset_unr; 137 static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel; 138 static struct domainset *domainset0, *domainset2; 139 u_int cpusetsizemin = 1; 140 141 /* Return the size of cpuset_t at the kernel level */ 142 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, 143 SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); 144 145 /* Return the minimum size of cpuset_t allowed by the kernel */ 146 SYSCTL_UINT(_kern_sched, OID_AUTO, cpusetsizemin, 147 CTLFLAG_RD | CTLFLAG_CAPRD, &cpusetsizemin, 0, 148 "The minimum size of cpuset_t allowed by the kernel"); 149 150 cpuset_t *cpuset_root; 151 cpuset_t cpuset_domain[MAXMEMDOM]; 152 153 static int cpuset_which2(cpuwhich_t *, id_t, struct proc **, struct thread **, 154 struct cpuset **); 155 static int domainset_valid(const struct domainset *, const struct domainset *); 156 157 /* 158 * Find the first non-anonymous set starting from 'set'. 159 */ 160 static struct cpuset * 161 cpuset_getbase(struct cpuset *set) 162 { 163 164 if (set->cs_id == CPUSET_INVALID) 165 set = set->cs_parent; 166 return (set); 167 } 168 169 /* 170 * Walks up the tree from 'set' to find the root. 171 */ 172 static struct cpuset * 173 cpuset_getroot(struct cpuset *set) 174 { 175 176 while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL) 177 set = set->cs_parent; 178 return (set); 179 } 180 181 /* 182 * Acquire a reference to a cpuset, all pointers must be tracked with refs. 183 */ 184 struct cpuset * 185 cpuset_ref(struct cpuset *set) 186 { 187 188 refcount_acquire(&set->cs_ref); 189 return (set); 190 } 191 192 /* 193 * Walks up the tree from 'set' to find the root. Returns the root 194 * referenced. 195 */ 196 static struct cpuset * 197 cpuset_refroot(struct cpuset *set) 198 { 199 200 return (cpuset_ref(cpuset_getroot(set))); 201 } 202 203 /* 204 * Find the first non-anonymous set starting from 'set'. Returns this set 205 * referenced. May return the passed in set with an extra ref if it is 206 * not anonymous. 207 */ 208 static struct cpuset * 209 cpuset_refbase(struct cpuset *set) 210 { 211 212 return (cpuset_ref(cpuset_getbase(set))); 213 } 214 215 /* 216 * Release a reference in a context where it is safe to allocate. 217 */ 218 void 219 cpuset_rel(struct cpuset *set) 220 { 221 cpusetid_t id; 222 223 if (refcount_release_if_not_last(&set->cs_ref)) 224 return; 225 mtx_lock_spin(&cpuset_lock); 226 if (!refcount_release(&set->cs_ref)) { 227 mtx_unlock_spin(&cpuset_lock); 228 return; 229 } 230 LIST_REMOVE(set, cs_siblings); 231 id = set->cs_id; 232 if (id != CPUSET_INVALID) 233 LIST_REMOVE(set, cs_link); 234 mtx_unlock_spin(&cpuset_lock); 235 cpuset_rel(set->cs_parent); 236 uma_zfree(cpuset_zone, set); 237 if (id != CPUSET_INVALID) 238 free_unr(cpuset_unr, id); 239 } 240 241 /* 242 * Deferred release must be used when in a context that is not safe to 243 * allocate/free. This places any unreferenced sets on the list 'head'. 244 */ 245 static void 246 cpuset_rel_defer(struct setlist *head, struct cpuset *set) 247 { 248 249 if (refcount_release_if_not_last(&set->cs_ref)) 250 return; 251 mtx_lock_spin(&cpuset_lock); 252 if (!refcount_release(&set->cs_ref)) { 253 mtx_unlock_spin(&cpuset_lock); 254 return; 255 } 256 LIST_REMOVE(set, cs_siblings); 257 if (set->cs_id != CPUSET_INVALID) 258 LIST_REMOVE(set, cs_link); 259 LIST_INSERT_HEAD(head, set, cs_link); 260 mtx_unlock_spin(&cpuset_lock); 261 } 262 263 /* 264 * Complete a deferred release. Removes the set from the list provided to 265 * cpuset_rel_defer. 266 */ 267 static void 268 cpuset_rel_complete(struct cpuset *set) 269 { 270 cpusetid_t id; 271 272 id = set->cs_id; 273 LIST_REMOVE(set, cs_link); 274 cpuset_rel(set->cs_parent); 275 uma_zfree(cpuset_zone, set); 276 if (id != CPUSET_INVALID) 277 free_unr(cpuset_unr, id); 278 } 279 280 /* 281 * Find a set based on an id. Returns it with a ref. 282 */ 283 static struct cpuset * 284 cpuset_lookup(cpusetid_t setid, struct thread *td) 285 { 286 struct cpuset *set; 287 288 if (setid == CPUSET_INVALID) 289 return (NULL); 290 mtx_lock_spin(&cpuset_lock); 291 LIST_FOREACH(set, &cpuset_ids, cs_link) 292 if (set->cs_id == setid) 293 break; 294 if (set) 295 cpuset_ref(set); 296 mtx_unlock_spin(&cpuset_lock); 297 298 KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); 299 if (set != NULL && jailed(td->td_ucred)) { 300 struct cpuset *jset, *tset; 301 302 jset = td->td_ucred->cr_prison->pr_cpuset; 303 for (tset = set; tset != NULL; tset = tset->cs_parent) 304 if (tset == jset) 305 break; 306 if (tset == NULL) { 307 cpuset_rel(set); 308 set = NULL; 309 } 310 } 311 312 return (set); 313 } 314 315 /* 316 * Initialize a set in the space provided in 'set' with the provided parameters. 317 * The set is returned with a single ref. May return EDEADLK if the set 318 * will have no valid cpu based on restrictions from the parent. 319 */ 320 static int 321 cpuset_init(struct cpuset *set, struct cpuset *parent, 322 const cpuset_t *mask, struct domainset *domain, cpusetid_t id) 323 { 324 325 if (domain == NULL) 326 domain = parent->cs_domain; 327 if (mask == NULL) 328 mask = &parent->cs_mask; 329 if (!CPU_OVERLAP(&parent->cs_mask, mask)) 330 return (EDEADLK); 331 /* The domain must be prepared ahead of time. */ 332 if (!domainset_valid(parent->cs_domain, domain)) 333 return (EDEADLK); 334 CPU_COPY(mask, &set->cs_mask); 335 LIST_INIT(&set->cs_children); 336 refcount_init(&set->cs_ref, 1); 337 set->cs_flags = 0; 338 mtx_lock_spin(&cpuset_lock); 339 set->cs_domain = domain; 340 CPU_AND(&set->cs_mask, &set->cs_mask, &parent->cs_mask); 341 set->cs_id = id; 342 set->cs_parent = cpuset_ref(parent); 343 LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); 344 if (set->cs_id != CPUSET_INVALID) 345 LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); 346 mtx_unlock_spin(&cpuset_lock); 347 348 return (0); 349 } 350 351 /* 352 * Create a new non-anonymous set with the requested parent and mask. May 353 * return failures if the mask is invalid or a new number can not be 354 * allocated. 355 * 356 * If *setp is not NULL, then it will be used as-is. The caller must take 357 * into account that *setp will be inserted at the head of cpuset_ids and 358 * plan any potentially conflicting cs_link usage accordingly. 359 */ 360 static int 361 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) 362 { 363 struct cpuset *set; 364 cpusetid_t id; 365 int error; 366 bool dofree; 367 368 id = alloc_unr(cpuset_unr); 369 if (id == -1) 370 return (ENFILE); 371 dofree = (*setp == NULL); 372 if (*setp != NULL) 373 set = *setp; 374 else 375 *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); 376 error = cpuset_init(set, parent, mask, NULL, id); 377 if (error == 0) 378 return (0); 379 free_unr(cpuset_unr, id); 380 if (dofree) 381 uma_zfree(cpuset_zone, set); 382 383 return (error); 384 } 385 386 static void 387 cpuset_freelist_add(struct setlist *list, int count) 388 { 389 struct cpuset *set; 390 int i; 391 392 for (i = 0; i < count; i++) { 393 set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK); 394 LIST_INSERT_HEAD(list, set, cs_link); 395 } 396 } 397 398 static void 399 cpuset_freelist_init(struct setlist *list, int count) 400 { 401 402 LIST_INIT(list); 403 cpuset_freelist_add(list, count); 404 } 405 406 static void 407 cpuset_freelist_free(struct setlist *list) 408 { 409 struct cpuset *set; 410 411 while ((set = LIST_FIRST(list)) != NULL) { 412 LIST_REMOVE(set, cs_link); 413 uma_zfree(cpuset_zone, set); 414 } 415 } 416 417 static void 418 domainset_freelist_add(struct domainlist *list, int count) 419 { 420 struct domainset *set; 421 int i; 422 423 for (i = 0; i < count; i++) { 424 set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK); 425 LIST_INSERT_HEAD(list, set, ds_link); 426 } 427 } 428 429 static void 430 domainset_freelist_init(struct domainlist *list, int count) 431 { 432 433 LIST_INIT(list); 434 domainset_freelist_add(list, count); 435 } 436 437 static void 438 domainset_freelist_free(struct domainlist *list) 439 { 440 struct domainset *set; 441 442 while ((set = LIST_FIRST(list)) != NULL) { 443 LIST_REMOVE(set, ds_link); 444 uma_zfree(domainset_zone, set); 445 } 446 } 447 448 /* Copy a domainset preserving mask and policy. */ 449 static void 450 domainset_copy(const struct domainset *from, struct domainset *to) 451 { 452 453 DOMAINSET_COPY(&from->ds_mask, &to->ds_mask); 454 to->ds_policy = from->ds_policy; 455 to->ds_prefer = from->ds_prefer; 456 } 457 458 /* Return 1 if mask and policy are equal, otherwise 0. */ 459 static int 460 domainset_equal(const struct domainset *one, const struct domainset *two) 461 { 462 463 return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 && 464 one->ds_policy == two->ds_policy && 465 one->ds_prefer == two->ds_prefer); 466 } 467 468 /* Return 1 if child is a valid subset of parent. */ 469 static int 470 domainset_valid(const struct domainset *parent, const struct domainset *child) 471 { 472 if (child->ds_policy != DOMAINSET_POLICY_PREFER) 473 return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask)); 474 return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); 475 } 476 477 static int 478 domainset_restrict(const struct domainset *parent, 479 const struct domainset *child) 480 { 481 if (child->ds_policy != DOMAINSET_POLICY_PREFER) 482 return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask)); 483 return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); 484 } 485 486 /* 487 * Lookup or create a domainset. The key is provided in ds_mask and 488 * ds_policy. If the domainset does not yet exist the storage in 489 * 'domain' is used to insert. Otherwise this storage is freed to the 490 * domainset_zone and the existing domainset is returned. 491 */ 492 static struct domainset * 493 _domainset_create(struct domainset *domain, struct domainlist *freelist) 494 { 495 struct domainset *ndomain; 496 int i, j; 497 498 KASSERT(domain->ds_cnt <= vm_ndomains, 499 ("invalid domain count in domainset %p", domain)); 500 KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER || 501 domain->ds_prefer < vm_ndomains, 502 ("invalid preferred domain in domains %p", domain)); 503 504 mtx_lock_spin(&cpuset_lock); 505 LIST_FOREACH(ndomain, &cpuset_domains, ds_link) 506 if (domainset_equal(ndomain, domain)) 507 break; 508 /* 509 * If the domain does not yet exist we insert it and initialize 510 * various iteration helpers which are not part of the key. 511 */ 512 if (ndomain == NULL) { 513 LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link); 514 domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); 515 for (i = 0, j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) 516 if (DOMAINSET_ISSET(i, &domain->ds_mask)) 517 domain->ds_order[j++] = i; 518 } 519 mtx_unlock_spin(&cpuset_lock); 520 if (ndomain == NULL) 521 return (domain); 522 if (freelist != NULL) 523 LIST_INSERT_HEAD(freelist, domain, ds_link); 524 else 525 uma_zfree(domainset_zone, domain); 526 return (ndomain); 527 528 } 529 530 /* 531 * Are any of the domains in the mask empty? If so, silently 532 * remove them and update the domainset accordingly. If only empty 533 * domains are present, we must return failure. 534 */ 535 static bool 536 domainset_empty_vm(struct domainset *domain) 537 { 538 domainset_t empty; 539 int i, j; 540 541 DOMAINSET_ZERO(&empty); 542 for (i = 0; i < vm_ndomains; i++) 543 if (VM_DOMAIN_EMPTY(i)) 544 DOMAINSET_SET(i, &empty); 545 if (DOMAINSET_SUBSET(&empty, &domain->ds_mask)) 546 return (true); 547 548 /* Remove empty domains from the set and recompute. */ 549 DOMAINSET_ANDNOT(&domain->ds_mask, &empty); 550 domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); 551 for (i = j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++) 552 if (DOMAINSET_ISSET(i, &domain->ds_mask)) 553 domain->ds_order[j++] = i; 554 555 /* Convert a PREFER policy referencing an empty domain to RR. */ 556 if (domain->ds_policy == DOMAINSET_POLICY_PREFER && 557 DOMAINSET_ISSET(domain->ds_prefer, &empty)) { 558 domain->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; 559 domain->ds_prefer = -1; 560 } 561 562 return (false); 563 } 564 565 /* 566 * Create or lookup a domainset based on the key held in 'domain'. 567 */ 568 struct domainset * 569 domainset_create(const struct domainset *domain) 570 { 571 struct domainset *ndomain; 572 573 /* 574 * Validate the policy. It must specify a useable policy number with 575 * only valid domains. Preferred must include the preferred domain 576 * in the mask. 577 */ 578 if (domain->ds_policy <= DOMAINSET_POLICY_INVALID || 579 domain->ds_policy > DOMAINSET_POLICY_MAX) 580 return (NULL); 581 if (domain->ds_policy == DOMAINSET_POLICY_PREFER && 582 !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask)) 583 return (NULL); 584 if (!DOMAINSET_SUBSET(&domainset0->ds_mask, &domain->ds_mask)) 585 return (NULL); 586 ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO); 587 domainset_copy(domain, ndomain); 588 return _domainset_create(ndomain, NULL); 589 } 590 591 /* 592 * Update thread domainset pointers. 593 */ 594 static void 595 domainset_notify(void) 596 { 597 struct thread *td; 598 struct proc *p; 599 600 sx_slock(&allproc_lock); 601 FOREACH_PROC_IN_SYSTEM(p) { 602 PROC_LOCK(p); 603 if (p->p_state == PRS_NEW) { 604 PROC_UNLOCK(p); 605 continue; 606 } 607 FOREACH_THREAD_IN_PROC(p, td) { 608 thread_lock(td); 609 td->td_domain.dr_policy = td->td_cpuset->cs_domain; 610 thread_unlock(td); 611 } 612 PROC_UNLOCK(p); 613 } 614 sx_sunlock(&allproc_lock); 615 kernel_object->domain.dr_policy = cpuset_kernel->cs_domain; 616 } 617 618 /* 619 * Create a new set that is a subset of a parent. 620 */ 621 static struct domainset * 622 domainset_shadow(const struct domainset *pdomain, 623 const struct domainset *domain, struct domainlist *freelist) 624 { 625 struct domainset *ndomain; 626 627 ndomain = LIST_FIRST(freelist); 628 LIST_REMOVE(ndomain, ds_link); 629 630 /* 631 * Initialize the key from the request. 632 */ 633 domainset_copy(domain, ndomain); 634 635 /* 636 * Restrict the key by the parent. 637 */ 638 DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask); 639 640 return _domainset_create(ndomain, freelist); 641 } 642 643 /* 644 * Recursively check for errors that would occur from applying mask to 645 * the tree of sets starting at 'set'. Checks for sets that would become 646 * empty as well as RDONLY flags. 647 */ 648 static int 649 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int augment_mask) 650 { 651 struct cpuset *nset; 652 cpuset_t newmask; 653 int error; 654 655 mtx_assert(&cpuset_lock, MA_OWNED); 656 if (set->cs_flags & CPU_SET_RDONLY) 657 return (EPERM); 658 if (augment_mask) { 659 CPU_AND(&newmask, &set->cs_mask, mask); 660 } else 661 CPU_COPY(mask, &newmask); 662 663 if (CPU_EMPTY(&newmask)) 664 return (EDEADLK); 665 error = 0; 666 LIST_FOREACH(nset, &set->cs_children, cs_siblings) 667 if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) 668 break; 669 return (error); 670 } 671 672 /* 673 * Applies the mask 'mask' without checking for empty sets or permissions. 674 */ 675 static void 676 cpuset_update(struct cpuset *set, cpuset_t *mask) 677 { 678 struct cpuset *nset; 679 680 mtx_assert(&cpuset_lock, MA_OWNED); 681 CPU_AND(&set->cs_mask, &set->cs_mask, mask); 682 LIST_FOREACH(nset, &set->cs_children, cs_siblings) 683 cpuset_update(nset, &set->cs_mask); 684 685 return; 686 } 687 688 /* 689 * Modify the set 'set' to use a copy of the mask provided. Apply this new 690 * mask to restrict all children in the tree. Checks for validity before 691 * applying the changes. 692 */ 693 static int 694 cpuset_modify(struct cpuset *set, cpuset_t *mask) 695 { 696 struct cpuset *root; 697 int error; 698 699 error = priv_check(curthread, PRIV_SCHED_CPUSET); 700 if (error) 701 return (error); 702 /* 703 * In case we are called from within the jail, 704 * we do not allow modifying the dedicated root 705 * cpuset of the jail but may still allow to 706 * change child sets, including subordinate jails' 707 * roots. 708 */ 709 if ((set->cs_flags & CPU_SET_ROOT) != 0 && 710 jailed(curthread->td_ucred) && 711 set == curthread->td_ucred->cr_prison->pr_cpuset) 712 return (EPERM); 713 /* 714 * Verify that we have access to this set of 715 * cpus. 716 */ 717 if ((set->cs_flags & (CPU_SET_ROOT | CPU_SET_RDONLY)) == CPU_SET_ROOT) { 718 KASSERT(set->cs_parent != NULL, 719 ("jail.cpuset=%d is not a proper child of parent jail's root.", 720 set->cs_id)); 721 722 /* 723 * cpuset_getroot() cannot work here due to how top-level jail 724 * roots are constructed. Top-level jails are parented to 725 * thread0's cpuset (i.e. cpuset 1) rather than the system root. 726 */ 727 root = set->cs_parent; 728 } else { 729 root = cpuset_getroot(set); 730 } 731 mtx_lock_spin(&cpuset_lock); 732 if (root && !CPU_SUBSET(&root->cs_mask, mask)) { 733 error = EINVAL; 734 goto out; 735 } 736 error = cpuset_testupdate(set, mask, 0); 737 if (error) 738 goto out; 739 CPU_COPY(mask, &set->cs_mask); 740 cpuset_update(set, mask); 741 out: 742 mtx_unlock_spin(&cpuset_lock); 743 744 return (error); 745 } 746 747 /* 748 * Recursively check for errors that would occur from applying mask to 749 * the tree of sets starting at 'set'. Checks for sets that would become 750 * empty as well as RDONLY flags. 751 */ 752 static int 753 cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset, 754 struct domainset *orig, int *count, int augment_mask __unused) 755 { 756 struct cpuset *nset; 757 struct domainset *domain; 758 struct domainset newset; 759 int error; 760 761 mtx_assert(&cpuset_lock, MA_OWNED); 762 if (set->cs_flags & CPU_SET_RDONLY) 763 return (EPERM); 764 domain = set->cs_domain; 765 domainset_copy(domain, &newset); 766 if (!domainset_equal(domain, orig)) { 767 if (!domainset_restrict(domain, dset)) 768 return (EDEADLK); 769 DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask); 770 /* Count the number of domains that are changing. */ 771 (*count)++; 772 } 773 error = 0; 774 LIST_FOREACH(nset, &set->cs_children, cs_siblings) 775 if ((error = cpuset_testupdate_domain(nset, &newset, domain, 776 count, 1)) != 0) 777 break; 778 return (error); 779 } 780 781 /* 782 * Applies the mask 'mask' without checking for empty sets or permissions. 783 */ 784 static void 785 cpuset_update_domain(struct cpuset *set, struct domainset *domain, 786 struct domainset *orig, struct domainlist *domains) 787 { 788 struct cpuset *nset; 789 790 mtx_assert(&cpuset_lock, MA_OWNED); 791 /* 792 * If this domainset has changed from the parent we must calculate 793 * a new set. Otherwise it simply inherits from the parent. When 794 * we inherit from the parent we get a new mask and policy. If the 795 * set is modified from the parent we keep the policy and only 796 * update the mask. 797 */ 798 if (set->cs_domain != orig) { 799 orig = set->cs_domain; 800 set->cs_domain = domainset_shadow(domain, orig, domains); 801 } else 802 set->cs_domain = domain; 803 LIST_FOREACH(nset, &set->cs_children, cs_siblings) 804 cpuset_update_domain(nset, set->cs_domain, orig, domains); 805 806 return; 807 } 808 809 /* 810 * Modify the set 'set' to use a copy the domainset provided. Apply this new 811 * mask to restrict all children in the tree. Checks for validity before 812 * applying the changes. 813 */ 814 static int 815 cpuset_modify_domain(struct cpuset *set, struct domainset *domain) 816 { 817 struct domainlist domains; 818 struct domainset temp; 819 struct domainset *dset; 820 struct cpuset *root; 821 int ndomains, needed; 822 int error; 823 824 error = priv_check(curthread, PRIV_SCHED_CPUSET); 825 if (error) 826 return (error); 827 /* 828 * In case we are called from within the jail 829 * we do not allow modifying the dedicated root 830 * cpuset of the jail but may still allow to 831 * change child sets. 832 */ 833 if (jailed(curthread->td_ucred) && 834 set->cs_flags & CPU_SET_ROOT) 835 return (EPERM); 836 domainset_freelist_init(&domains, 0); 837 domain = domainset_create(domain); 838 ndomains = 0; 839 840 mtx_lock_spin(&cpuset_lock); 841 for (;;) { 842 root = cpuset_getroot(set); 843 dset = root->cs_domain; 844 /* 845 * Verify that we have access to this set of domains. 846 */ 847 if (!domainset_valid(dset, domain)) { 848 error = EINVAL; 849 goto out; 850 } 851 /* 852 * If applying prefer we keep the current set as the fallback. 853 */ 854 if (domain->ds_policy == DOMAINSET_POLICY_PREFER) 855 DOMAINSET_COPY(&set->cs_domain->ds_mask, 856 &domain->ds_mask); 857 /* 858 * Determine whether we can apply this set of domains and 859 * how many new domain structures it will require. 860 */ 861 domainset_copy(domain, &temp); 862 needed = 0; 863 error = cpuset_testupdate_domain(set, &temp, set->cs_domain, 864 &needed, 0); 865 if (error) 866 goto out; 867 if (ndomains >= needed) 868 break; 869 870 /* Dropping the lock; we'll need to re-evaluate again. */ 871 mtx_unlock_spin(&cpuset_lock); 872 domainset_freelist_add(&domains, needed - ndomains); 873 ndomains = needed; 874 mtx_lock_spin(&cpuset_lock); 875 } 876 dset = set->cs_domain; 877 cpuset_update_domain(set, domain, dset, &domains); 878 out: 879 mtx_unlock_spin(&cpuset_lock); 880 domainset_freelist_free(&domains); 881 if (error == 0) 882 domainset_notify(); 883 884 return (error); 885 } 886 887 /* 888 * Resolve the 'which' parameter of several cpuset apis. 889 * 890 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also 891 * checks for permission via p_cansched(). 892 * 893 * For WHICH_SET returns a valid set with a new reference. 894 * 895 * -1 may be supplied for any argument to mean the current proc/thread or 896 * the base set of the current thread. May fail with ESRCH/EPERM. 897 */ 898 int 899 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, 900 struct cpuset **setp) 901 { 902 struct cpuset *set; 903 struct thread *td; 904 struct proc *p; 905 int error; 906 907 *pp = p = NULL; 908 *tdp = td = NULL; 909 *setp = set = NULL; 910 switch (which) { 911 case CPU_WHICH_PID: 912 if (id == -1) { 913 PROC_LOCK(curproc); 914 p = curproc; 915 break; 916 } 917 if ((p = pfind(id)) == NULL) 918 return (ESRCH); 919 break; 920 case CPU_WHICH_TID: 921 if (id == -1) { 922 PROC_LOCK(curproc); 923 p = curproc; 924 td = curthread; 925 break; 926 } 927 td = tdfind(id, -1); 928 if (td == NULL) 929 return (ESRCH); 930 p = td->td_proc; 931 break; 932 case CPU_WHICH_TIDPID: 933 if (id == -1) { 934 PROC_LOCK(curproc); 935 td = curthread; 936 p = curproc; 937 } else if (id > PID_MAX) { 938 td = tdfind(id, -1); 939 if (td == NULL) 940 return (ESRCH); 941 p = td->td_proc; 942 } else { 943 p = pfind(id); 944 if (p == NULL) 945 return (ESRCH); 946 } 947 break; 948 case CPU_WHICH_CPUSET: 949 if (id == -1) { 950 thread_lock(curthread); 951 set = cpuset_refbase(curthread->td_cpuset); 952 thread_unlock(curthread); 953 } else 954 set = cpuset_lookup(id, curthread); 955 if (set) { 956 *setp = set; 957 return (0); 958 } 959 return (ESRCH); 960 case CPU_WHICH_JAIL: 961 { 962 /* Find `set' for prison with given id. */ 963 struct prison *pr; 964 965 sx_slock(&allprison_lock); 966 pr = prison_find_child(curthread->td_ucred->cr_prison, id); 967 sx_sunlock(&allprison_lock); 968 if (pr == NULL) 969 return (ESRCH); 970 cpuset_ref(pr->pr_cpuset); 971 *setp = pr->pr_cpuset; 972 mtx_unlock(&pr->pr_mtx); 973 return (0); 974 } 975 case CPU_WHICH_IRQ: 976 case CPU_WHICH_DOMAIN: 977 return (0); 978 default: 979 return (EINVAL); 980 } 981 error = p_cansched(curthread, p); 982 if (error) { 983 PROC_UNLOCK(p); 984 return (error); 985 } 986 if (td == NULL) 987 td = FIRST_THREAD_IN_PROC(p); 988 *pp = p; 989 *tdp = td; 990 return (0); 991 } 992 993 static int 994 cpuset_which2(cpuwhich_t *which, id_t id, struct proc **pp, struct thread **tdp, 995 struct cpuset **setp) 996 { 997 998 if (*which == CPU_WHICH_TIDPID) { 999 if (id == -1 || id > PID_MAX) 1000 *which = CPU_WHICH_TID; 1001 else 1002 *which = CPU_WHICH_PID; 1003 } 1004 return (cpuset_which(*which, id, pp, tdp, setp)); 1005 } 1006 1007 static int 1008 cpuset_testshadow(struct cpuset *set, const cpuset_t *mask, 1009 const struct domainset *domain) 1010 { 1011 struct cpuset *parent; 1012 struct domainset *dset; 1013 1014 parent = cpuset_getbase(set); 1015 /* 1016 * If we are restricting a cpu mask it must be a subset of the 1017 * parent or invalid CPUs have been specified. 1018 */ 1019 if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask)) 1020 return (EINVAL); 1021 1022 /* 1023 * If we are restricting a domain mask it must be a subset of the 1024 * parent or invalid domains have been specified. 1025 */ 1026 dset = parent->cs_domain; 1027 if (domain != NULL && !domainset_valid(dset, domain)) 1028 return (EINVAL); 1029 1030 return (0); 1031 } 1032 1033 /* 1034 * Create an anonymous set with the provided mask in the space provided by 1035 * 'nset'. If the passed in set is anonymous we use its parent otherwise 1036 * the new set is a child of 'set'. 1037 */ 1038 static int 1039 cpuset_shadow(struct cpuset *set, struct cpuset **nsetp, 1040 const cpuset_t *mask, const struct domainset *domain, 1041 struct setlist *cpusets, struct domainlist *domains) 1042 { 1043 struct cpuset *parent; 1044 struct cpuset *nset; 1045 struct domainset *dset; 1046 struct domainset *d; 1047 int error; 1048 1049 error = cpuset_testshadow(set, mask, domain); 1050 if (error) 1051 return (error); 1052 1053 parent = cpuset_getbase(set); 1054 dset = parent->cs_domain; 1055 if (mask == NULL) 1056 mask = &set->cs_mask; 1057 if (domain != NULL) 1058 d = domainset_shadow(dset, domain, domains); 1059 else 1060 d = set->cs_domain; 1061 nset = LIST_FIRST(cpusets); 1062 error = cpuset_init(nset, parent, mask, d, CPUSET_INVALID); 1063 if (error == 0) { 1064 LIST_REMOVE(nset, cs_link); 1065 *nsetp = nset; 1066 } 1067 return (error); 1068 } 1069 1070 static struct cpuset * 1071 cpuset_update_thread(struct thread *td, struct cpuset *nset) 1072 { 1073 struct cpuset *tdset; 1074 1075 tdset = td->td_cpuset; 1076 td->td_cpuset = nset; 1077 td->td_domain.dr_policy = nset->cs_domain; 1078 sched_affinity(td); 1079 1080 return (tdset); 1081 } 1082 1083 static int 1084 cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask, 1085 struct domainset *domain) 1086 { 1087 struct cpuset *parent; 1088 1089 parent = cpuset_getbase(tdset); 1090 if (mask == NULL) 1091 mask = &tdset->cs_mask; 1092 if (domain == NULL) 1093 domain = tdset->cs_domain; 1094 return cpuset_testshadow(parent, mask, domain); 1095 } 1096 1097 static int 1098 cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask, 1099 struct domainset *domain, struct cpuset **nsetp, 1100 struct setlist *freelist, struct domainlist *domainlist) 1101 { 1102 struct cpuset *parent; 1103 1104 parent = cpuset_getbase(tdset); 1105 if (mask == NULL) 1106 mask = &tdset->cs_mask; 1107 if (domain == NULL) 1108 domain = tdset->cs_domain; 1109 return cpuset_shadow(parent, nsetp, mask, domain, freelist, 1110 domainlist); 1111 } 1112 1113 static int 1114 cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set, 1115 cpuset_t *mask, struct domainset *domain) 1116 { 1117 struct cpuset *parent; 1118 1119 parent = cpuset_getbase(tdset); 1120 1121 /* 1122 * If the thread restricted its mask then apply that same 1123 * restriction to the new set, otherwise take it wholesale. 1124 */ 1125 if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) { 1126 CPU_AND(mask, &tdset->cs_mask, &set->cs_mask); 1127 } else 1128 CPU_COPY(&set->cs_mask, mask); 1129 1130 /* 1131 * If the thread restricted the domain then we apply the 1132 * restriction to the new set but retain the policy. 1133 */ 1134 if (tdset->cs_domain != parent->cs_domain) { 1135 domainset_copy(tdset->cs_domain, domain); 1136 DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask); 1137 } else 1138 domainset_copy(set->cs_domain, domain); 1139 1140 if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask)) 1141 return (EDEADLK); 1142 1143 return (0); 1144 } 1145 1146 static int 1147 cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set) 1148 { 1149 struct domainset domain; 1150 cpuset_t mask; 1151 1152 if (tdset->cs_id != CPUSET_INVALID) 1153 return (0); 1154 return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); 1155 } 1156 1157 static int 1158 cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set, 1159 struct cpuset **nsetp, struct setlist *freelist, 1160 struct domainlist *domainlist) 1161 { 1162 struct domainset domain; 1163 cpuset_t mask; 1164 int error; 1165 1166 /* 1167 * If we're replacing on a thread that has not constrained the 1168 * original set we can simply accept the new set. 1169 */ 1170 if (tdset->cs_id != CPUSET_INVALID) { 1171 *nsetp = cpuset_ref(set); 1172 return (0); 1173 } 1174 error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); 1175 if (error) 1176 return (error); 1177 1178 return cpuset_shadow(set, nsetp, &mask, &domain, freelist, 1179 domainlist); 1180 } 1181 1182 static int 1183 cpuset_setproc_newbase(struct thread *td, struct cpuset *set, 1184 struct cpuset *nroot, struct cpuset **nsetp, 1185 struct setlist *cpusets, struct domainlist *domainlist) 1186 { 1187 struct domainset ndomain; 1188 cpuset_t nmask; 1189 struct cpuset *pbase; 1190 int error; 1191 1192 pbase = cpuset_getbase(td->td_cpuset); 1193 1194 /* Copy process mask, then further apply the new root mask. */ 1195 CPU_AND(&nmask, &pbase->cs_mask, &nroot->cs_mask); 1196 1197 domainset_copy(pbase->cs_domain, &ndomain); 1198 DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask); 1199 1200 /* Policy is too restrictive, will not work. */ 1201 if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask)) 1202 return (EDEADLK); 1203 1204 /* 1205 * Remove pbase from the freelist in advance, it'll be pushed to 1206 * cpuset_ids on success. We assume here that cpuset_create() will not 1207 * touch pbase on failure, and we just enqueue it back to the freelist 1208 * to remain in a consistent state. 1209 */ 1210 pbase = LIST_FIRST(cpusets); 1211 LIST_REMOVE(pbase, cs_link); 1212 error = cpuset_create(&pbase, set, &nmask); 1213 if (error != 0) { 1214 LIST_INSERT_HEAD(cpusets, pbase, cs_link); 1215 return (error); 1216 } 1217 1218 /* Duplicates some work from above... oh well. */ 1219 pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain, 1220 domainlist); 1221 *nsetp = pbase; 1222 return (0); 1223 } 1224 1225 /* 1226 * Handle four cases for updating an entire process. 1227 * 1228 * 1) Set is non-null and the process is not rebasing onto a new root. This 1229 * reparents all anonymous sets to the provided set and replaces all 1230 * non-anonymous td_cpusets with the provided set. 1231 * 2) Set is non-null and the process is rebasing onto a new root. This 1232 * creates a new base set if the process previously had its own base set, 1233 * then reparents all anonymous sets either to that set or the provided set 1234 * if one was not created. Non-anonymous sets are similarly replaced. 1235 * 3) Mask is non-null. This replaces or creates anonymous sets for every 1236 * thread with the existing base as a parent. 1237 * 4) domain is non-null. This creates anonymous sets for every thread 1238 * and replaces the domain set. 1239 * 1240 * This is overly complicated because we can't allocate while holding a 1241 * spinlock and spinlocks must be held while changing and examining thread 1242 * state. 1243 */ 1244 static int 1245 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask, 1246 struct domainset *domain, bool rebase) 1247 { 1248 struct setlist freelist; 1249 struct setlist droplist; 1250 struct domainlist domainlist; 1251 struct cpuset *base, *nset, *nroot, *tdroot; 1252 struct thread *td; 1253 struct proc *p; 1254 int needed; 1255 int nfree; 1256 int error; 1257 1258 /* 1259 * The algorithm requires two passes due to locking considerations. 1260 * 1261 * 1) Lookup the process and acquire the locks in the required order. 1262 * 2) If enough cpusets have not been allocated release the locks and 1263 * allocate them. Loop. 1264 */ 1265 cpuset_freelist_init(&freelist, 1); 1266 domainset_freelist_init(&domainlist, 1); 1267 nfree = 1; 1268 LIST_INIT(&droplist); 1269 nfree = 0; 1270 base = set; 1271 nroot = NULL; 1272 if (set != NULL) 1273 nroot = cpuset_getroot(set); 1274 for (;;) { 1275 error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); 1276 if (error) 1277 goto out; 1278 tdroot = cpuset_getroot(td->td_cpuset); 1279 needed = p->p_numthreads; 1280 if (set != NULL && rebase && tdroot != nroot) 1281 needed++; 1282 if (nfree >= needed) 1283 break; 1284 PROC_UNLOCK(p); 1285 if (nfree < needed) { 1286 cpuset_freelist_add(&freelist, needed - nfree); 1287 domainset_freelist_add(&domainlist, needed - nfree); 1288 nfree = needed; 1289 } 1290 } 1291 PROC_LOCK_ASSERT(p, MA_OWNED); 1292 1293 /* 1294 * If we're changing roots and the root set is what has been specified 1295 * as the parent, then we'll check if the process was previously using 1296 * the root set and, if it wasn't, create a new base with the process's 1297 * mask applied to it. 1298 * 1299 * If the new root is incompatible with the existing mask, then we allow 1300 * the process to take on the new root if and only if they have 1301 * privilege to widen their mask anyways. Unprivileged processes get 1302 * rejected with EDEADLK. 1303 */ 1304 if (set != NULL && rebase && nroot != tdroot) { 1305 cpusetid_t base_id, root_id; 1306 1307 root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id; 1308 base_id = cpuset_getbase(td->td_cpuset)->cs_id; 1309 1310 if (base_id != root_id) { 1311 error = cpuset_setproc_newbase(td, set, nroot, &base, 1312 &freelist, &domainlist); 1313 if (error == EDEADLK && 1314 priv_check(td, PRIV_SCHED_CPUSET) == 0) 1315 error = 0; 1316 if (error != 0) 1317 goto unlock_out; 1318 } 1319 } 1320 1321 /* 1322 * Now that the appropriate locks are held and we have enough cpusets, 1323 * make sure the operation will succeed before applying changes. The 1324 * proc lock prevents td_cpuset from changing between calls. 1325 */ 1326 error = 0; 1327 FOREACH_THREAD_IN_PROC(p, td) { 1328 thread_lock(td); 1329 if (set != NULL) 1330 error = cpuset_setproc_test_setthread(td->td_cpuset, 1331 base); 1332 else 1333 error = cpuset_setproc_test_maskthread(td->td_cpuset, 1334 mask, domain); 1335 thread_unlock(td); 1336 if (error) 1337 goto unlock_out; 1338 } 1339 /* 1340 * Replace each thread's cpuset while using deferred release. We 1341 * must do this because the thread lock must be held while operating 1342 * on the thread and this limits the type of operations allowed. 1343 */ 1344 FOREACH_THREAD_IN_PROC(p, td) { 1345 thread_lock(td); 1346 if (set != NULL) 1347 error = cpuset_setproc_setthread(td->td_cpuset, base, 1348 &nset, &freelist, &domainlist); 1349 else 1350 error = cpuset_setproc_maskthread(td->td_cpuset, mask, 1351 domain, &nset, &freelist, &domainlist); 1352 if (error) { 1353 thread_unlock(td); 1354 break; 1355 } 1356 cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset)); 1357 thread_unlock(td); 1358 } 1359 unlock_out: 1360 PROC_UNLOCK(p); 1361 out: 1362 if (base != NULL && base != set) 1363 cpuset_rel(base); 1364 while ((nset = LIST_FIRST(&droplist)) != NULL) 1365 cpuset_rel_complete(nset); 1366 cpuset_freelist_free(&freelist); 1367 domainset_freelist_free(&domainlist); 1368 return (error); 1369 } 1370 1371 static int 1372 bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen) 1373 { 1374 size_t bytes; 1375 int i, once; 1376 char *p; 1377 1378 once = 0; 1379 p = buf; 1380 for (i = 0; i < __bitset_words(setlen); i++) { 1381 if (once != 0) { 1382 if (bufsiz < 1) 1383 return (0); 1384 *p = ','; 1385 p++; 1386 bufsiz--; 1387 } else 1388 once = 1; 1389 if (bufsiz < sizeof(__STRING(ULONG_MAX))) 1390 return (0); 1391 bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]); 1392 p += bytes; 1393 bufsiz -= bytes; 1394 } 1395 return (p - buf); 1396 } 1397 1398 static int 1399 bitset_strscan(struct bitset *set, int setlen, const char *buf) 1400 { 1401 int i, ret; 1402 const char *p; 1403 1404 BIT_ZERO(setlen, set); 1405 p = buf; 1406 for (i = 0; i < __bitset_words(setlen); i++) { 1407 if (*p == ',') { 1408 p++; 1409 continue; 1410 } 1411 ret = sscanf(p, "%lx", &set->__bits[i]); 1412 if (ret == 0 || ret == -1) 1413 break; 1414 while (isxdigit(*p)) 1415 p++; 1416 } 1417 return (p - buf); 1418 } 1419 1420 /* 1421 * Return a string representing a valid layout for a cpuset_t object. 1422 * It expects an incoming buffer at least sized as CPUSETBUFSIZ. 1423 */ 1424 char * 1425 cpusetobj_strprint(char *buf, const cpuset_t *set) 1426 { 1427 1428 bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set, 1429 CPU_SETSIZE); 1430 return (buf); 1431 } 1432 1433 /* 1434 * Build a valid cpuset_t object from a string representation. 1435 * It expects an incoming buffer at least sized as CPUSETBUFSIZ. 1436 */ 1437 int 1438 cpusetobj_strscan(cpuset_t *set, const char *buf) 1439 { 1440 char p; 1441 1442 if (strlen(buf) > CPUSETBUFSIZ - 1) 1443 return (-1); 1444 1445 p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)]; 1446 if (p != '\0') 1447 return (-1); 1448 1449 return (0); 1450 } 1451 1452 /* 1453 * Handle a domainset specifier in the sysctl tree. A poiner to a pointer to 1454 * a domainset is in arg1. If the user specifies a valid domainset the 1455 * pointer is updated. 1456 * 1457 * Format is: 1458 * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred 1459 */ 1460 int 1461 sysctl_handle_domainset(SYSCTL_HANDLER_ARGS) 1462 { 1463 char buf[DOMAINSETBUFSIZ]; 1464 struct domainset *dset; 1465 struct domainset key; 1466 int policy, prefer, error; 1467 char *p; 1468 1469 dset = *(struct domainset **)arg1; 1470 error = 0; 1471 1472 if (dset != NULL) { 1473 p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ, 1474 (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE); 1475 sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer); 1476 } else 1477 sprintf(buf, "<NULL>"); 1478 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 1479 if (error != 0 || req->newptr == NULL) 1480 return (error); 1481 1482 /* 1483 * Read in and validate the string. 1484 */ 1485 memset(&key, 0, sizeof(key)); 1486 p = &buf[bitset_strscan((struct bitset *)&key.ds_mask, 1487 DOMAINSET_SETSIZE, buf)]; 1488 if (p == buf) 1489 return (EINVAL); 1490 if (sscanf(p, ":%d:%d", &policy, &prefer) != 2) 1491 return (EINVAL); 1492 key.ds_policy = policy; 1493 key.ds_prefer = prefer; 1494 1495 /* Domainset_create() validates the policy.*/ 1496 dset = domainset_create(&key); 1497 if (dset == NULL) 1498 return (EINVAL); 1499 *(struct domainset **)arg1 = dset; 1500 1501 return (error); 1502 } 1503 1504 /* 1505 * Apply an anonymous mask or a domain to a single thread. 1506 */ 1507 static int 1508 _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain) 1509 { 1510 struct setlist cpusets; 1511 struct domainlist domainlist; 1512 struct cpuset *nset; 1513 struct cpuset *set; 1514 struct thread *td; 1515 struct proc *p; 1516 int error; 1517 1518 cpuset_freelist_init(&cpusets, 1); 1519 domainset_freelist_init(&domainlist, domain != NULL); 1520 error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); 1521 if (error) 1522 goto out; 1523 set = NULL; 1524 thread_lock(td); 1525 error = cpuset_shadow(td->td_cpuset, &nset, mask, domain, 1526 &cpusets, &domainlist); 1527 if (error == 0) 1528 set = cpuset_update_thread(td, nset); 1529 thread_unlock(td); 1530 PROC_UNLOCK(p); 1531 if (set) 1532 cpuset_rel(set); 1533 out: 1534 cpuset_freelist_free(&cpusets); 1535 domainset_freelist_free(&domainlist); 1536 return (error); 1537 } 1538 1539 /* 1540 * Apply an anonymous mask to a single thread. 1541 */ 1542 int 1543 cpuset_setthread(lwpid_t id, cpuset_t *mask) 1544 { 1545 1546 return _cpuset_setthread(id, mask, NULL); 1547 } 1548 1549 /* 1550 * Apply new cpumask to the ithread. 1551 */ 1552 int 1553 cpuset_setithread(lwpid_t id, int cpu) 1554 { 1555 cpuset_t mask; 1556 1557 CPU_ZERO(&mask); 1558 if (cpu == NOCPU) 1559 CPU_COPY(cpuset_root, &mask); 1560 else 1561 CPU_SET(cpu, &mask); 1562 return _cpuset_setthread(id, &mask, NULL); 1563 } 1564 1565 /* 1566 * Initialize static domainsets after NUMA information is available. This is 1567 * called before memory allocators are initialized. 1568 */ 1569 void 1570 domainset_init(void) 1571 { 1572 struct domainset *dset; 1573 int i; 1574 1575 dset = &domainset_firsttouch; 1576 DOMAINSET_COPY(&all_domains, &dset->ds_mask); 1577 dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH; 1578 dset->ds_prefer = -1; 1579 _domainset_create(dset, NULL); 1580 1581 dset = &domainset_interleave; 1582 DOMAINSET_COPY(&all_domains, &dset->ds_mask); 1583 dset->ds_policy = DOMAINSET_POLICY_INTERLEAVE; 1584 dset->ds_prefer = -1; 1585 _domainset_create(dset, NULL); 1586 1587 dset = &domainset_roundrobin; 1588 DOMAINSET_COPY(&all_domains, &dset->ds_mask); 1589 dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; 1590 dset->ds_prefer = -1; 1591 _domainset_create(dset, NULL); 1592 1593 for (i = 0; i < vm_ndomains; i++) { 1594 dset = &domainset_fixed[i]; 1595 DOMAINSET_ZERO(&dset->ds_mask); 1596 DOMAINSET_SET(i, &dset->ds_mask); 1597 dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; 1598 _domainset_create(dset, NULL); 1599 1600 dset = &domainset_prefer[i]; 1601 DOMAINSET_COPY(&all_domains, &dset->ds_mask); 1602 dset->ds_policy = DOMAINSET_POLICY_PREFER; 1603 dset->ds_prefer = i; 1604 _domainset_create(dset, NULL); 1605 } 1606 } 1607 1608 /* 1609 * Define the domainsets for cpuset 0, 1 and cpuset 2. 1610 */ 1611 void 1612 domainset_zero(void) 1613 { 1614 struct domainset *dset, *tmp; 1615 1616 mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); 1617 1618 domainset0 = &domainset_firsttouch; 1619 curthread->td_domain.dr_policy = domainset0; 1620 1621 domainset2 = &domainset_interleave; 1622 kernel_object->domain.dr_policy = domainset2; 1623 1624 /* Remove empty domains from the global policies. */ 1625 LIST_FOREACH_SAFE(dset, &cpuset_domains, ds_link, tmp) 1626 if (domainset_empty_vm(dset)) 1627 LIST_REMOVE(dset, ds_link); 1628 } 1629 1630 /* 1631 * Creates system-wide cpusets and the cpuset for thread0 including three 1632 * sets: 1633 * 1634 * 0 - The root set which should represent all valid processors in the 1635 * system. This set is immutable. 1636 * 1 - The default set which all processes are a member of until changed. 1637 * This allows an administrator to move all threads off of given cpus to 1638 * dedicate them to high priority tasks or save power etc. 1639 * 2 - The kernel set which allows restriction and policy to be applied only 1640 * to kernel threads and the kernel_object. 1641 */ 1642 struct cpuset * 1643 cpuset_thread0(void) 1644 { 1645 struct cpuset *set; 1646 int i; 1647 int error __unused; 1648 1649 cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, 1650 NULL, NULL, UMA_ALIGN_CACHE, 0); 1651 domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), 1652 NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 1653 1654 /* 1655 * Create the root system set (0) for the whole machine. Doesn't use 1656 * cpuset_create() due to NULL parent. 1657 */ 1658 set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); 1659 CPU_COPY(&all_cpus, &set->cs_mask); 1660 LIST_INIT(&set->cs_children); 1661 LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); 1662 refcount_init(&set->cs_ref, 1); 1663 set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY; 1664 set->cs_domain = domainset0; 1665 cpuset_zero = set; 1666 cpuset_root = &set->cs_mask; 1667 1668 /* 1669 * Now derive a default (1), modifiable set from that to give out. 1670 */ 1671 set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); 1672 error = cpuset_init(set, cpuset_zero, NULL, NULL, 1); 1673 KASSERT(error == 0, ("Error creating default set: %d\n", error)); 1674 cpuset_default = set; 1675 /* 1676 * Create the kernel set (2). 1677 */ 1678 set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); 1679 error = cpuset_init(set, cpuset_zero, NULL, NULL, 2); 1680 KASSERT(error == 0, ("Error creating kernel set: %d\n", error)); 1681 set->cs_domain = domainset2; 1682 cpuset_kernel = set; 1683 1684 /* 1685 * Initialize the unit allocator. 0 and 1 are allocated above. 1686 */ 1687 cpuset_unr = new_unrhdr(3, INT_MAX, NULL); 1688 1689 /* 1690 * If MD code has not initialized per-domain cpusets, place all 1691 * CPUs in domain 0. 1692 */ 1693 for (i = 0; i < MAXMEMDOM; i++) 1694 if (!CPU_EMPTY(&cpuset_domain[i])) 1695 goto domains_set; 1696 CPU_COPY(&all_cpus, &cpuset_domain[0]); 1697 domains_set: 1698 1699 return (cpuset_default); 1700 } 1701 1702 void 1703 cpuset_kernthread(struct thread *td) 1704 { 1705 struct cpuset *set; 1706 1707 thread_lock(td); 1708 set = td->td_cpuset; 1709 td->td_cpuset = cpuset_ref(cpuset_kernel); 1710 thread_unlock(td); 1711 cpuset_rel(set); 1712 } 1713 1714 /* 1715 * Create a cpuset, which would be cpuset_create() but 1716 * mark the new 'set' as root. 1717 * 1718 * We are not going to reparent the td to it. Use cpuset_setproc_update_set() 1719 * for that. 1720 * 1721 * In case of no error, returns the set in *setp locked with a reference. 1722 */ 1723 int 1724 cpuset_create_root(struct prison *pr, struct cpuset **setp) 1725 { 1726 struct cpuset *set; 1727 int error; 1728 1729 KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); 1730 KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); 1731 1732 set = NULL; 1733 error = cpuset_create(&set, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); 1734 if (error) 1735 return (error); 1736 1737 KASSERT(set != NULL, ("[%s:%d] cpuset_create returned invalid data", 1738 __func__, __LINE__)); 1739 1740 /* Mark the set as root. */ 1741 set->cs_flags |= CPU_SET_ROOT; 1742 *setp = set; 1743 1744 return (0); 1745 } 1746 1747 int 1748 cpuset_setproc_update_set(struct proc *p, struct cpuset *set) 1749 { 1750 int error; 1751 1752 KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); 1753 KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); 1754 1755 cpuset_ref(set); 1756 error = cpuset_setproc(p->p_pid, set, NULL, NULL, true); 1757 if (error) 1758 return (error); 1759 cpuset_rel(set); 1760 return (0); 1761 } 1762 1763 /* 1764 * In Capability mode, the only accesses that are permitted are to the current 1765 * thread and process' CPU and domain sets. 1766 */ 1767 static int 1768 cpuset_check_capabilities(struct thread *td, cpulevel_t level, cpuwhich_t which, 1769 id_t id) 1770 { 1771 if (IN_CAPABILITY_MODE(td)) { 1772 if (level != CPU_LEVEL_WHICH) 1773 return (ECAPMODE); 1774 if (which != CPU_WHICH_TID && which != CPU_WHICH_PID && 1775 which != CPU_WHICH_TIDPID) 1776 return (ECAPMODE); 1777 if (id != -1 && which == CPU_WHICH_TIDPID && 1778 id != td->td_tid && id != td->td_proc->p_pid) 1779 return (ECAPMODE); 1780 if (id != -1 && 1781 !(which == CPU_WHICH_TID && id == td->td_tid) && 1782 !(which == CPU_WHICH_PID && id == td->td_proc->p_pid)) 1783 return (ECAPMODE); 1784 } 1785 return (0); 1786 } 1787 1788 #if defined(__powerpc__) 1789 /* 1790 * TODO: At least powerpc64 and powerpc64le kernels panic with 1791 * exception 0x480 (instruction segment exception) when copyin/copyout, 1792 * are set as a function pointer in cpuset_copy_cb struct and called by 1793 * an external module (like pfsync). Tip: copyin/copyout have an ifunc 1794 * resolver function. 1795 * 1796 * Bisect of LLVM shows that the behavior changed on LLVM 10.0 with 1797 * https://reviews.llvm.org/rGdc06b0bc9ad055d06535462d91bfc2a744b2f589 1798 * 1799 * This is a hack/workaround while problem is being discussed with LLVM 1800 * community 1801 */ 1802 static int 1803 cpuset_copyin(const void *uaddr, void *kaddr, size_t len) 1804 { 1805 return(copyin(uaddr, kaddr, len)); 1806 } 1807 1808 static int 1809 cpuset_copyout(const void *kaddr, void *uaddr, size_t len) 1810 { 1811 return(copyout(kaddr, uaddr, len)); 1812 } 1813 1814 static const struct cpuset_copy_cb copy_set = { 1815 .cpuset_copyin = cpuset_copyin, 1816 .cpuset_copyout = cpuset_copyout 1817 }; 1818 #else 1819 static const struct cpuset_copy_cb copy_set = { 1820 .cpuset_copyin = copyin, 1821 .cpuset_copyout = copyout 1822 }; 1823 #endif 1824 1825 #ifndef _SYS_SYSPROTO_H_ 1826 struct cpuset_args { 1827 cpusetid_t *setid; 1828 }; 1829 #endif 1830 int 1831 sys_cpuset(struct thread *td, struct cpuset_args *uap) 1832 { 1833 struct cpuset *root; 1834 struct cpuset *set; 1835 int error; 1836 1837 thread_lock(td); 1838 root = cpuset_refroot(td->td_cpuset); 1839 thread_unlock(td); 1840 set = NULL; 1841 error = cpuset_create(&set, root, &root->cs_mask); 1842 cpuset_rel(root); 1843 if (error) 1844 return (error); 1845 error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); 1846 if (error == 0) 1847 error = cpuset_setproc(-1, set, NULL, NULL, false); 1848 cpuset_rel(set); 1849 return (error); 1850 } 1851 1852 #ifndef _SYS_SYSPROTO_H_ 1853 struct cpuset_setid_args { 1854 cpuwhich_t which; 1855 id_t id; 1856 cpusetid_t setid; 1857 }; 1858 #endif 1859 int 1860 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) 1861 { 1862 1863 return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid)); 1864 } 1865 1866 int 1867 kern_cpuset_setid(struct thread *td, cpuwhich_t which, 1868 id_t id, cpusetid_t setid) 1869 { 1870 struct cpuset *set; 1871 int error; 1872 1873 /* 1874 * Presently we only support per-process sets. 1875 */ 1876 if (which != CPU_WHICH_PID) 1877 return (EINVAL); 1878 set = cpuset_lookup(setid, td); 1879 if (set == NULL) 1880 return (ESRCH); 1881 error = cpuset_setproc(id, set, NULL, NULL, false); 1882 cpuset_rel(set); 1883 return (error); 1884 } 1885 1886 #ifndef _SYS_SYSPROTO_H_ 1887 struct cpuset_getid_args { 1888 cpulevel_t level; 1889 cpuwhich_t which; 1890 id_t id; 1891 cpusetid_t *setid; 1892 }; 1893 #endif 1894 int 1895 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) 1896 { 1897 1898 return (kern_cpuset_getid(td, uap->level, uap->which, uap->id, 1899 uap->setid)); 1900 } 1901 1902 int 1903 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, 1904 id_t id, cpusetid_t *setid) 1905 { 1906 struct cpuset *nset; 1907 struct cpuset *set; 1908 struct thread *ttd; 1909 struct proc *p; 1910 cpusetid_t tmpid; 1911 int error; 1912 1913 if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET) 1914 return (EINVAL); 1915 error = cpuset_which(which, id, &p, &ttd, &set); 1916 if (error) 1917 return (error); 1918 switch (which) { 1919 case CPU_WHICH_TID: 1920 case CPU_WHICH_PID: 1921 case CPU_WHICH_TIDPID: 1922 thread_lock(ttd); 1923 set = cpuset_refbase(ttd->td_cpuset); 1924 thread_unlock(ttd); 1925 PROC_UNLOCK(p); 1926 break; 1927 case CPU_WHICH_CPUSET: 1928 case CPU_WHICH_JAIL: 1929 break; 1930 case CPU_WHICH_IRQ: 1931 case CPU_WHICH_DOMAIN: 1932 return (EINVAL); 1933 } 1934 switch (level) { 1935 case CPU_LEVEL_ROOT: 1936 nset = cpuset_refroot(set); 1937 cpuset_rel(set); 1938 set = nset; 1939 break; 1940 case CPU_LEVEL_CPUSET: 1941 break; 1942 case CPU_LEVEL_WHICH: 1943 break; 1944 } 1945 tmpid = set->cs_id; 1946 cpuset_rel(set); 1947 if (error == 0) 1948 error = copyout(&tmpid, setid, sizeof(tmpid)); 1949 1950 return (error); 1951 } 1952 1953 #ifndef _SYS_SYSPROTO_H_ 1954 struct cpuset_getaffinity_args { 1955 cpulevel_t level; 1956 cpuwhich_t which; 1957 id_t id; 1958 size_t cpusetsize; 1959 cpuset_t *mask; 1960 }; 1961 #endif 1962 int 1963 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) 1964 { 1965 1966 return (user_cpuset_getaffinity(td, uap->level, uap->which, 1967 uap->id, uap->cpusetsize, uap->mask, ©_set)); 1968 } 1969 1970 int 1971 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, 1972 id_t id, size_t cpusetsize, cpuset_t *mask) 1973 { 1974 struct thread *ttd; 1975 struct cpuset *nset; 1976 struct cpuset *set; 1977 struct proc *p; 1978 int error; 1979 1980 error = cpuset_check_capabilities(td, level, which, id); 1981 if (error != 0) 1982 return (error); 1983 error = cpuset_which2(&which, id, &p, &ttd, &set); 1984 if (error != 0) 1985 return (error); 1986 switch (level) { 1987 case CPU_LEVEL_ROOT: 1988 case CPU_LEVEL_CPUSET: 1989 switch (which) { 1990 case CPU_WHICH_TID: 1991 case CPU_WHICH_PID: 1992 thread_lock(ttd); 1993 set = cpuset_ref(ttd->td_cpuset); 1994 thread_unlock(ttd); 1995 break; 1996 case CPU_WHICH_CPUSET: 1997 case CPU_WHICH_JAIL: 1998 break; 1999 case CPU_WHICH_IRQ: 2000 case CPU_WHICH_INTRHANDLER: 2001 case CPU_WHICH_ITHREAD: 2002 case CPU_WHICH_DOMAIN: 2003 return (EINVAL); 2004 } 2005 if (level == CPU_LEVEL_ROOT) 2006 nset = cpuset_refroot(set); 2007 else 2008 nset = cpuset_refbase(set); 2009 CPU_COPY(&nset->cs_mask, mask); 2010 cpuset_rel(nset); 2011 break; 2012 case CPU_LEVEL_WHICH: 2013 switch (which) { 2014 case CPU_WHICH_TID: 2015 thread_lock(ttd); 2016 CPU_COPY(&ttd->td_cpuset->cs_mask, mask); 2017 thread_unlock(ttd); 2018 break; 2019 case CPU_WHICH_PID: 2020 FOREACH_THREAD_IN_PROC(p, ttd) { 2021 thread_lock(ttd); 2022 CPU_OR(mask, mask, &ttd->td_cpuset->cs_mask); 2023 thread_unlock(ttd); 2024 } 2025 break; 2026 case CPU_WHICH_CPUSET: 2027 case CPU_WHICH_JAIL: 2028 CPU_COPY(&set->cs_mask, mask); 2029 break; 2030 case CPU_WHICH_IRQ: 2031 case CPU_WHICH_INTRHANDLER: 2032 case CPU_WHICH_ITHREAD: 2033 error = intr_getaffinity(id, which, mask); 2034 break; 2035 case CPU_WHICH_DOMAIN: 2036 if (id < 0 || id >= MAXMEMDOM) 2037 error = ESRCH; 2038 else 2039 CPU_COPY(&cpuset_domain[id], mask); 2040 break; 2041 } 2042 break; 2043 default: 2044 error = EINVAL; 2045 break; 2046 } 2047 if (set) 2048 cpuset_rel(set); 2049 if (p) 2050 PROC_UNLOCK(p); 2051 if (error == 0) { 2052 if (cpusetsize < howmany(CPU_FLS(mask), NBBY)) 2053 return (ERANGE); 2054 #ifdef KTRACE 2055 if (KTRPOINT(td, KTR_STRUCT)) 2056 ktrcpuset(mask, cpusetsize); 2057 #endif 2058 } 2059 return (error); 2060 } 2061 2062 int 2063 user_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, 2064 id_t id, size_t cpusetsize, cpuset_t *maskp, const struct cpuset_copy_cb *cb) 2065 { 2066 cpuset_t *mask; 2067 size_t size; 2068 int error; 2069 2070 mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); 2071 size = min(cpusetsize, sizeof(cpuset_t)); 2072 error = kern_cpuset_getaffinity(td, level, which, id, size, mask); 2073 if (error == 0) { 2074 error = cb->cpuset_copyout(mask, maskp, size); 2075 if (error != 0) 2076 goto out; 2077 if (cpusetsize > size) { 2078 char *end; 2079 char *cp; 2080 int rv; 2081 2082 end = cp = (char *)&maskp->__bits; 2083 end += cpusetsize; 2084 cp += size; 2085 while (cp != end) { 2086 rv = subyte(cp, 0); 2087 if (rv == -1) { 2088 error = EFAULT; 2089 goto out; 2090 } 2091 cp++; 2092 } 2093 } 2094 } 2095 out: 2096 free(mask, M_TEMP); 2097 return (error); 2098 } 2099 2100 #ifndef _SYS_SYSPROTO_H_ 2101 struct cpuset_setaffinity_args { 2102 cpulevel_t level; 2103 cpuwhich_t which; 2104 id_t id; 2105 size_t cpusetsize; 2106 const cpuset_t *mask; 2107 }; 2108 #endif 2109 int 2110 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) 2111 { 2112 2113 return (user_cpuset_setaffinity(td, uap->level, uap->which, 2114 uap->id, uap->cpusetsize, uap->mask, ©_set)); 2115 } 2116 2117 int 2118 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, 2119 id_t id, cpuset_t *mask) 2120 { 2121 struct cpuset *nset; 2122 struct cpuset *set; 2123 struct thread *ttd; 2124 struct proc *p; 2125 int error; 2126 2127 #ifdef KTRACE 2128 if (KTRPOINT(td, KTR_STRUCT)) 2129 ktrcpuset(mask, sizeof(cpuset_t)); 2130 #endif 2131 error = cpuset_check_capabilities(td, level, which, id); 2132 if (error != 0) 2133 return (error); 2134 if (CPU_EMPTY(mask)) 2135 return (EDEADLK); 2136 switch (level) { 2137 case CPU_LEVEL_ROOT: 2138 case CPU_LEVEL_CPUSET: 2139 error = cpuset_which(which, id, &p, &ttd, &set); 2140 if (error) 2141 break; 2142 switch (which) { 2143 case CPU_WHICH_TID: 2144 case CPU_WHICH_PID: 2145 case CPU_WHICH_TIDPID: 2146 thread_lock(ttd); 2147 set = cpuset_ref(ttd->td_cpuset); 2148 thread_unlock(ttd); 2149 PROC_UNLOCK(p); 2150 break; 2151 case CPU_WHICH_CPUSET: 2152 case CPU_WHICH_JAIL: 2153 break; 2154 case CPU_WHICH_IRQ: 2155 case CPU_WHICH_INTRHANDLER: 2156 case CPU_WHICH_ITHREAD: 2157 case CPU_WHICH_DOMAIN: 2158 return (EINVAL); 2159 } 2160 if (level == CPU_LEVEL_ROOT) 2161 nset = cpuset_refroot(set); 2162 else 2163 nset = cpuset_refbase(set); 2164 error = cpuset_modify(nset, mask); 2165 cpuset_rel(nset); 2166 cpuset_rel(set); 2167 break; 2168 case CPU_LEVEL_WHICH: 2169 switch (which) { 2170 case CPU_WHICH_TID: 2171 error = cpuset_setthread(id, mask); 2172 break; 2173 case CPU_WHICH_PID: 2174 error = cpuset_setproc(id, NULL, mask, NULL, false); 2175 break; 2176 case CPU_WHICH_TIDPID: 2177 if (id > PID_MAX || id == -1) 2178 error = cpuset_setthread(id, mask); 2179 else 2180 error = cpuset_setproc(id, NULL, mask, NULL, 2181 false); 2182 break; 2183 case CPU_WHICH_CPUSET: 2184 case CPU_WHICH_JAIL: 2185 error = cpuset_which(which, id, &p, &ttd, &set); 2186 if (error == 0) { 2187 error = cpuset_modify(set, mask); 2188 cpuset_rel(set); 2189 } 2190 break; 2191 case CPU_WHICH_IRQ: 2192 case CPU_WHICH_INTRHANDLER: 2193 case CPU_WHICH_ITHREAD: 2194 error = intr_setaffinity(id, which, mask); 2195 break; 2196 default: 2197 error = EINVAL; 2198 break; 2199 } 2200 break; 2201 default: 2202 error = EINVAL; 2203 break; 2204 } 2205 return (error); 2206 } 2207 2208 int 2209 user_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, 2210 id_t id, size_t cpusetsize, const cpuset_t *maskp, const struct cpuset_copy_cb *cb) 2211 { 2212 cpuset_t *mask; 2213 int error; 2214 size_t size; 2215 2216 size = min(cpusetsize, sizeof(cpuset_t)); 2217 mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO); 2218 error = cb->cpuset_copyin(maskp, mask, size); 2219 if (error) 2220 goto out; 2221 /* 2222 * Verify that no high bits are set. 2223 */ 2224 if (cpusetsize > sizeof(cpuset_t)) { 2225 const char *end, *cp; 2226 int val; 2227 end = cp = (const char *)&maskp->__bits; 2228 end += cpusetsize; 2229 cp += sizeof(cpuset_t); 2230 2231 while (cp != end) { 2232 val = fubyte(cp); 2233 if (val == -1) { 2234 error = EFAULT; 2235 goto out; 2236 } 2237 if (val != 0) { 2238 error = EINVAL; 2239 goto out; 2240 } 2241 cp++; 2242 } 2243 } 2244 error = kern_cpuset_setaffinity(td, level, which, id, mask); 2245 2246 out: 2247 free(mask, M_TEMP); 2248 return (error); 2249 } 2250 2251 #ifndef _SYS_SYSPROTO_H_ 2252 struct cpuset_getdomain_args { 2253 cpulevel_t level; 2254 cpuwhich_t which; 2255 id_t id; 2256 size_t domainsetsize; 2257 domainset_t *mask; 2258 int *policy; 2259 }; 2260 #endif 2261 int 2262 sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap) 2263 { 2264 2265 return (kern_cpuset_getdomain(td, uap->level, uap->which, 2266 uap->id, uap->domainsetsize, uap->mask, uap->policy, ©_set)); 2267 } 2268 2269 int 2270 kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, 2271 id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp, 2272 const struct cpuset_copy_cb *cb) 2273 { 2274 struct domainset outset; 2275 struct thread *ttd; 2276 struct cpuset *nset; 2277 struct cpuset *set; 2278 struct domainset *dset; 2279 struct proc *p; 2280 domainset_t *mask; 2281 int error; 2282 2283 if (domainsetsize < sizeof(domainset_t) || 2284 domainsetsize > DOMAINSET_MAXSIZE / NBBY) 2285 return (ERANGE); 2286 error = cpuset_check_capabilities(td, level, which, id); 2287 if (error != 0) 2288 return (error); 2289 mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); 2290 bzero(&outset, sizeof(outset)); 2291 error = cpuset_which2(&which, id, &p, &ttd, &set); 2292 if (error) 2293 goto out; 2294 switch (level) { 2295 case CPU_LEVEL_ROOT: 2296 case CPU_LEVEL_CPUSET: 2297 switch (which) { 2298 case CPU_WHICH_TID: 2299 case CPU_WHICH_PID: 2300 thread_lock(ttd); 2301 set = cpuset_ref(ttd->td_cpuset); 2302 thread_unlock(ttd); 2303 break; 2304 case CPU_WHICH_CPUSET: 2305 case CPU_WHICH_JAIL: 2306 break; 2307 case CPU_WHICH_IRQ: 2308 case CPU_WHICH_INTRHANDLER: 2309 case CPU_WHICH_ITHREAD: 2310 case CPU_WHICH_DOMAIN: 2311 error = EINVAL; 2312 goto out; 2313 } 2314 if (level == CPU_LEVEL_ROOT) 2315 nset = cpuset_refroot(set); 2316 else 2317 nset = cpuset_refbase(set); 2318 domainset_copy(nset->cs_domain, &outset); 2319 cpuset_rel(nset); 2320 break; 2321 case CPU_LEVEL_WHICH: 2322 switch (which) { 2323 case CPU_WHICH_TID: 2324 thread_lock(ttd); 2325 domainset_copy(ttd->td_cpuset->cs_domain, &outset); 2326 thread_unlock(ttd); 2327 break; 2328 case CPU_WHICH_PID: 2329 FOREACH_THREAD_IN_PROC(p, ttd) { 2330 thread_lock(ttd); 2331 dset = ttd->td_cpuset->cs_domain; 2332 /* Show all domains in the proc. */ 2333 DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask); 2334 /* Last policy wins. */ 2335 outset.ds_policy = dset->ds_policy; 2336 outset.ds_prefer = dset->ds_prefer; 2337 thread_unlock(ttd); 2338 } 2339 break; 2340 case CPU_WHICH_CPUSET: 2341 case CPU_WHICH_JAIL: 2342 domainset_copy(set->cs_domain, &outset); 2343 break; 2344 case CPU_WHICH_IRQ: 2345 case CPU_WHICH_INTRHANDLER: 2346 case CPU_WHICH_ITHREAD: 2347 case CPU_WHICH_DOMAIN: 2348 error = EINVAL; 2349 break; 2350 } 2351 break; 2352 default: 2353 error = EINVAL; 2354 break; 2355 } 2356 if (set) 2357 cpuset_rel(set); 2358 if (p) 2359 PROC_UNLOCK(p); 2360 /* 2361 * Translate prefer into a set containing only the preferred domain, 2362 * not the entire fallback set. 2363 */ 2364 if (outset.ds_policy == DOMAINSET_POLICY_PREFER) { 2365 DOMAINSET_ZERO(&outset.ds_mask); 2366 DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask); 2367 } 2368 DOMAINSET_COPY(&outset.ds_mask, mask); 2369 if (error == 0) 2370 error = cb->cpuset_copyout(mask, maskp, domainsetsize); 2371 if (error == 0) 2372 if (suword32(policyp, outset.ds_policy) != 0) 2373 error = EFAULT; 2374 out: 2375 free(mask, M_TEMP); 2376 return (error); 2377 } 2378 2379 #ifndef _SYS_SYSPROTO_H_ 2380 struct cpuset_setdomain_args { 2381 cpulevel_t level; 2382 cpuwhich_t which; 2383 id_t id; 2384 size_t domainsetsize; 2385 domainset_t *mask; 2386 int policy; 2387 }; 2388 #endif 2389 int 2390 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap) 2391 { 2392 2393 return (kern_cpuset_setdomain(td, uap->level, uap->which, 2394 uap->id, uap->domainsetsize, uap->mask, uap->policy, ©_set)); 2395 } 2396 2397 int 2398 kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, 2399 id_t id, size_t domainsetsize, const domainset_t *maskp, int policy, 2400 const struct cpuset_copy_cb *cb) 2401 { 2402 struct cpuset *nset; 2403 struct cpuset *set; 2404 struct thread *ttd; 2405 struct proc *p; 2406 struct domainset domain; 2407 domainset_t *mask; 2408 int error; 2409 2410 if (domainsetsize < sizeof(domainset_t) || 2411 domainsetsize > DOMAINSET_MAXSIZE / NBBY) 2412 return (ERANGE); 2413 if (policy <= DOMAINSET_POLICY_INVALID || 2414 policy > DOMAINSET_POLICY_MAX) 2415 return (EINVAL); 2416 error = cpuset_check_capabilities(td, level, which, id); 2417 if (error != 0) 2418 return (error); 2419 memset(&domain, 0, sizeof(domain)); 2420 mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); 2421 error = cb->cpuset_copyin(maskp, mask, domainsetsize); 2422 if (error) 2423 goto out; 2424 /* 2425 * Verify that no high bits are set. 2426 */ 2427 if (domainsetsize > sizeof(domainset_t)) { 2428 char *end; 2429 char *cp; 2430 2431 end = cp = (char *)&mask->__bits; 2432 end += domainsetsize; 2433 cp += sizeof(domainset_t); 2434 while (cp != end) 2435 if (*cp++ != 0) { 2436 error = EINVAL; 2437 goto out; 2438 } 2439 } 2440 if (DOMAINSET_EMPTY(mask)) { 2441 error = EDEADLK; 2442 goto out; 2443 } 2444 DOMAINSET_COPY(mask, &domain.ds_mask); 2445 domain.ds_policy = policy; 2446 2447 /* 2448 * Sanitize the provided mask. 2449 */ 2450 if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) { 2451 error = EINVAL; 2452 goto out; 2453 } 2454 2455 /* Translate preferred policy into a mask and fallback. */ 2456 if (policy == DOMAINSET_POLICY_PREFER) { 2457 /* Only support a single preferred domain. */ 2458 if (DOMAINSET_COUNT(&domain.ds_mask) != 1) { 2459 error = EINVAL; 2460 goto out; 2461 } 2462 domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1; 2463 /* This will be constrained by domainset_shadow(). */ 2464 DOMAINSET_COPY(&all_domains, &domain.ds_mask); 2465 } 2466 2467 /* 2468 * When given an impossible policy, fall back to interleaving 2469 * across all domains. 2470 */ 2471 if (domainset_empty_vm(&domain)) 2472 domainset_copy(domainset2, &domain); 2473 2474 switch (level) { 2475 case CPU_LEVEL_ROOT: 2476 case CPU_LEVEL_CPUSET: 2477 error = cpuset_which(which, id, &p, &ttd, &set); 2478 if (error) 2479 break; 2480 switch (which) { 2481 case CPU_WHICH_TID: 2482 case CPU_WHICH_PID: 2483 case CPU_WHICH_TIDPID: 2484 thread_lock(ttd); 2485 set = cpuset_ref(ttd->td_cpuset); 2486 thread_unlock(ttd); 2487 PROC_UNLOCK(p); 2488 break; 2489 case CPU_WHICH_CPUSET: 2490 case CPU_WHICH_JAIL: 2491 break; 2492 case CPU_WHICH_IRQ: 2493 case CPU_WHICH_INTRHANDLER: 2494 case CPU_WHICH_ITHREAD: 2495 case CPU_WHICH_DOMAIN: 2496 error = EINVAL; 2497 goto out; 2498 } 2499 if (level == CPU_LEVEL_ROOT) 2500 nset = cpuset_refroot(set); 2501 else 2502 nset = cpuset_refbase(set); 2503 error = cpuset_modify_domain(nset, &domain); 2504 cpuset_rel(nset); 2505 cpuset_rel(set); 2506 break; 2507 case CPU_LEVEL_WHICH: 2508 switch (which) { 2509 case CPU_WHICH_TID: 2510 error = _cpuset_setthread(id, NULL, &domain); 2511 break; 2512 case CPU_WHICH_PID: 2513 error = cpuset_setproc(id, NULL, NULL, &domain, false); 2514 break; 2515 case CPU_WHICH_TIDPID: 2516 if (id > PID_MAX || id == -1) 2517 error = _cpuset_setthread(id, NULL, &domain); 2518 else 2519 error = cpuset_setproc(id, NULL, NULL, &domain, 2520 false); 2521 break; 2522 case CPU_WHICH_CPUSET: 2523 case CPU_WHICH_JAIL: 2524 error = cpuset_which(which, id, &p, &ttd, &set); 2525 if (error == 0) { 2526 error = cpuset_modify_domain(set, &domain); 2527 cpuset_rel(set); 2528 } 2529 break; 2530 case CPU_WHICH_IRQ: 2531 case CPU_WHICH_INTRHANDLER: 2532 case CPU_WHICH_ITHREAD: 2533 default: 2534 error = EINVAL; 2535 break; 2536 } 2537 break; 2538 default: 2539 error = EINVAL; 2540 break; 2541 } 2542 out: 2543 free(mask, M_TEMP); 2544 return (error); 2545 } 2546 2547 #ifdef DDB 2548 2549 static void 2550 ddb_display_bitset(const struct bitset *set, int size) 2551 { 2552 int bit, once; 2553 2554 for (once = 0, bit = 0; bit < size; bit++) { 2555 if (CPU_ISSET(bit, set)) { 2556 if (once == 0) { 2557 db_printf("%d", bit); 2558 once = 1; 2559 } else 2560 db_printf(",%d", bit); 2561 } 2562 } 2563 if (once == 0) 2564 db_printf("<none>"); 2565 } 2566 2567 void 2568 ddb_display_cpuset(const cpuset_t *set) 2569 { 2570 ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE); 2571 } 2572 2573 static void 2574 ddb_display_domainset(const domainset_t *set) 2575 { 2576 ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE); 2577 } 2578 2579 DB_SHOW_COMMAND_FLAGS(cpusets, db_show_cpusets, DB_CMD_MEMSAFE) 2580 { 2581 struct cpuset *set; 2582 2583 LIST_FOREACH(set, &cpuset_ids, cs_link) { 2584 db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", 2585 set, set->cs_id, refcount_load(&set->cs_ref), set->cs_flags, 2586 (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); 2587 db_printf(" cpu mask="); 2588 ddb_display_cpuset(&set->cs_mask); 2589 db_printf("\n"); 2590 db_printf(" domain policy %d prefer %d mask=", 2591 set->cs_domain->ds_policy, set->cs_domain->ds_prefer); 2592 ddb_display_domainset(&set->cs_domain->ds_mask); 2593 db_printf("\n"); 2594 if (db_pager_quit) 2595 break; 2596 } 2597 } 2598 2599 DB_SHOW_COMMAND_FLAGS(domainsets, db_show_domainsets, DB_CMD_MEMSAFE) 2600 { 2601 struct domainset *set; 2602 2603 LIST_FOREACH(set, &cpuset_domains, ds_link) { 2604 db_printf("set=%p policy %d prefer %d cnt %d\n", 2605 set, set->ds_policy, set->ds_prefer, set->ds_cnt); 2606 db_printf(" mask ="); 2607 ddb_display_domainset(&set->ds_mask); 2608 db_printf("\n"); 2609 } 2610 } 2611 #endif /* DDB */ 2612