/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2021 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/schedctl.h>
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/class.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/door.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/lwp.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <fs/fs_subr.h>

/*
 * Page handling structures.  This is set up as a list of per-page
 * control structures (sc_page_ctl), with p->p_pagep pointing to
 * the first.  The per-page structures point to the actual pages
 * and contain pointers to the user address for each mapped page.
 *
 * All data is protected by p->p_sc_lock.  Since this lock is
 * held while waiting for memory, schedctl_shared_alloc() should
 * not be called while holding p_lock.
 */
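
/*
 * For illustration only (a sketch of the structures below, not a literal
 * memory diagram):
 *
 *	p->p_pagep --> sc_page_ctl_t --> sc_page_ctl_t --> NULL
 *	                   |
 *	                   +-- spc_base/spc_end: a locked kernel page carved
 *	                   |       into sc_shared_t slots, one per lwp
 *	                   +-- spc_map: one bit per slot, set when allocated
 *	                   +-- spc_uaddr: where the same physical page is
 *	                           mapped in the process's address space
 */
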
typedef struct sc_page_ctl {
        struct sc_page_ctl *spc_next;
        sc_shared_t     *spc_base;      /* base of kernel page */
        sc_shared_t     *spc_end;       /* end of usable space */
        ulong_t         *spc_map;       /* bitmap of allocated space on page */
        size_t          spc_space;      /* amount of space on page */
        caddr_t         spc_uaddr;      /* user-level address of the page */
        struct anon_map *spc_amp;       /* anonymous memory structure */
} sc_page_ctl_t;

static size_t   sc_pagesize;            /* size of usable space on page */
static size_t   sc_bitmap_len;          /* # of bits in allocation bitmap */
static size_t   sc_bitmap_words;        /* # of words in allocation bitmap */

/* Context ops */
static void     schedctl_save(void *);
static void     schedctl_restore(void *);
static void     schedctl_fork(void *, void *);

/* Functions for handling shared pages */
static int      schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
static int      schedctl_map(struct anon_map *, caddr_t *, caddr_t);
static int      schedctl_getpage(struct anon_map **, caddr_t *);
static void     schedctl_freepage(struct anon_map *, caddr_t);

static const struct ctxop_template schedctl_ctxop_tpl = {
        .ct_rev         = CTXOP_TPL_REV,
        .ct_save        = schedctl_save,
        .ct_restore     = schedctl_restore,
        .ct_fork        = schedctl_fork,
};

/*
 * System call interface to scheduler activations.
 * This always operates on the current lwp.
 */
caddr_t
schedctl(void)
{
        kthread_t       *t = curthread;
        sc_shared_t     *ssp;
        uintptr_t       uaddr;
        int             error;

        if (t->t_schedctl == NULL) {
                /*
                 * Allocate and initialize the shared structure.
                 */
                if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
                        return ((caddr_t)(uintptr_t)set_errno(error));
                bzero(ssp, sizeof (*ssp));

                ctxop_install(t, &schedctl_ctxop_tpl, ssp);

                thread_lock(t);  /* protect against ts_tick and ts_update */
                t->t_schedctl = ssp;
                t->t_sc_uaddr = uaddr;
                ssp->sc_cid = t->t_cid;
                ssp->sc_cpri = t->t_cpri;
                ssp->sc_priority = DISP_PRIO(t);
                thread_unlock(t);
        }

        return ((caddr_t)t->t_sc_uaddr);
}
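
/*
 * Illustrative user-level view (a sketch only; the real consumers are the
 * schedctl/preemption-control routines in libc): the address returned by
 * this system call points at the calling lwp's own sc_shared_t within the
 * shared page, so user code can read and update the scheduling hints
 * without trapping into the kernel again, e.g.:
 *
 *	volatile sc_shared_t *sc = (sc_shared_t *)__schedctl();
 *
 *	sc->sc_preemptctl.sc_nopreempt = 1;	// short critical section
 *	...					// (e.g. user-level lock held)
 *	sc->sc_preemptctl.sc_nopreempt = 0;
 *	if (sc->sc_preemptctl.sc_yield)		// dispatcher asked us to yield
 *		yield();
 *
 * "__schedctl" above is only a placeholder for whatever stub libc uses to
 * issue this system call.
 */
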
/*
 * Clean up scheduler activations state associated with an exiting
 * (or execing) lwp.  t is always the current thread.
 */
void
schedctl_lwp_cleanup(kthread_t *t)
{
        sc_shared_t     *ssp = t->t_schedctl;
        proc_t          *p = ttoproc(t);
        sc_page_ctl_t   *pagep;
        index_t         index;

        ASSERT(MUTEX_NOT_HELD(&p->p_lock));

        thread_lock(t);         /* protect against ts_tick and ts_update */
        t->t_schedctl = NULL;
        t->t_sc_uaddr = 0;
        thread_unlock(t);

        /*
         * Remove the context op to avoid the final call to
         * schedctl_save when switching away from this lwp.
         */
        (void) ctxop_remove(t, &schedctl_ctxop_tpl, ssp);

        /*
         * Do not unmap the shared page until the process exits.
         * User-level library code relies on this for adaptive mutex locking.
         */
        mutex_enter(&p->p_sc_lock);
        ssp->sc_state = SC_FREE;
        pagep = schedctl_page_lookup(ssp);
        index = (index_t)(ssp - pagep->spc_base);
        BT_CLEAR(pagep->spc_map, index);
        pagep->spc_space += sizeof (sc_shared_t);
        mutex_exit(&p->p_sc_lock);
}


/*
 * Clean up the list of schedctl shared pages for the process.
 * Called from exec() and exit() system calls.
 */
void
schedctl_proc_cleanup(void)
{
        proc_t          *p = curproc;
        sc_page_ctl_t   *pagep;
        sc_page_ctl_t   *next;

        ASSERT(p->p_lwpcnt == 1);       /* we are single-threaded now */
        ASSERT(curthread->t_schedctl == NULL);

        /*
         * Since we are single-threaded, we don't have to hold p->p_sc_lock.
         */
        pagep = p->p_pagep;
        p->p_pagep = NULL;
        while (pagep != NULL) {
                ASSERT(pagep->spc_space == sc_pagesize);
                next = pagep->spc_next;
                /*
                 * Unmap the user space and free the mapping structure.
                 */
                (void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
                schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
                kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
                kmem_free(pagep, sizeof (sc_page_ctl_t));
                pagep = next;
        }
}


/*
 * Called by resume just before switching away from the current thread.
 * Save new thread state.
 */
static void
schedctl_save(void *arg)
{
        sc_shared_t *ssp = arg;

        ssp->sc_state = curthread->t_state;
}


/*
 * Called by resume after switching to the current thread.
 * Save new thread state and CPU.
 */
static void
schedctl_restore(void *arg)
{
        sc_shared_t *ssp = arg;

        ssp->sc_state = SC_ONPROC;
        ssp->sc_cpu = CPU->cpu_id;
}


/*
 * On fork, remove inherited mappings from the child's address space.
 * The child's threads must call schedctl() to get new shared mappings.
 */
static void
schedctl_fork(void *parent, void *child)
{
        kthread_t *pt = parent, *ct = child;
        proc_t  *pp = ttoproc(pt);
        proc_t  *cp = ttoproc(ct);
        sc_page_ctl_t *pagep;

        ASSERT(ct->t_schedctl == NULL);

        /*
         * Do this only once, whether we are doing fork1() or forkall().
         * Don't do it at all if the child process is a child of vfork()
         * because a child of vfork() borrows the parent's address space.
         */
        if (pt != curthread || (cp->p_flag & SVFORK))
                return;

        mutex_enter(&pp->p_sc_lock);
        for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
                (void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
        mutex_exit(&pp->p_sc_lock);
}
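
/*
 * The routines below are the kernel-side accessors for the preemption
 * control hints.  A user thread sets sc_nopreempt through the shared page
 * around a short critical section; the time-sharing class code honors the
 * hint briefly and sets sc_yield to ask the thread to call yield() as soon
 * as it leaves the critical section.
 */
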
/*
 * Returns non-zero if the specified thread shouldn't be preempted
 * at this time.  Called by ts_preempt(), ts_tick(), and ts_update().
 */
int
schedctl_get_nopreempt(kthread_t *t)
{
        ASSERT(THREAD_LOCK_HELD(t));
        return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
}


/*
 * Sets the value of the nopreempt field for the specified thread.
 * Called by ts_preempt() to clear the field on preemption.
 */
void
schedctl_set_nopreempt(kthread_t *t, short val)
{
        ASSERT(THREAD_LOCK_HELD(t));
        t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
}


/*
 * Sets the value of the yield field for the specified thread.
 * Called by ts_preempt() and ts_tick() to set the field, and
 * ts_yield() to clear it.
 * The kernel never looks at this field so we don't need a
 * schedctl_get_yield() function.
 */
void
schedctl_set_yield(kthread_t *t, short val)
{
        ASSERT(THREAD_LOCK_HELD(t));
        t->t_schedctl->sc_preemptctl.sc_yield = val;
}


/*
 * Sets the values of the cid and priority fields for the specified thread.
 * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
 * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
 */
void
schedctl_set_cidpri(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        if (tdp != NULL) {
                tdp->sc_cid = t->t_cid;
                tdp->sc_cpri = t->t_cpri;
                tdp->sc_priority = DISP_PRIO(t);
        }
}


/*
 * Returns non-zero if the specified thread has requested that all
 * signals be blocked.  Called by signal-related code that tests
 * the signal mask of a thread that may not be the current thread
 * and where the process's p_lock cannot be acquired.
 */
int
schedctl_sigblock(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        if (tdp != NULL)
                return (tdp->sc_sigblock);
        return (0);
}


/*
 * If the sc_sigblock field is set for the specified thread, set its signal
 * mask to block all maskable signals, then clear the sc_sigblock field.  This
 * accomplishes what user-level code requested to be done when it set
 * tdp->sc_shared->sc_sigblock non-zero.
 *
 * This is generally called by signal-related code in the current thread.  In
 * order to call against a thread other than curthread, p_lock for the
 * containing process must be held.  Even then, the caller is not protected
 * from races with the thread in question updating its own fields.  It is the
 * responsibility of the caller to perform additional synchronization.
 */
void
schedctl_finish_sigblock(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        ASSERT(t == curthread || MUTEX_HELD(&ttoproc(t)->p_lock));

        if (tdp != NULL && tdp->sc_sigblock) {
                t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
                t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
                t->t_hold.__sigbits[2] = FILLSET2 & ~CANTMASK2;
                tdp->sc_sigblock = 0;
        }
}
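
/*
 * In other words, setting sc_sigblock lets user-level code request that all
 * maskable signals be blocked with a single store to the shared page rather
 * than a system call; the deferred mask is applied by
 * schedctl_finish_sigblock() the next time the kernel examines this
 * thread's signal state.
 */
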
/*
 * Return non-zero if the current thread has declared that it has
 * a cancellation pending and that cancellation is not disabled.
 * If SIGCANCEL is blocked, we must be going over the wire in an
 * NFS transaction (sigintr() was called); return zero in this case.
 */
int
schedctl_cancel_pending(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL &&
            (tdp->sc_flgs & SC_CANCEL_FLG) &&
            !tdp->sc_sigblock &&
            !sigismember(&curthread->t_hold, SIGCANCEL))
                return (1);
        return (0);
}


/*
 * Inform libc that the kernel returned EINTR from some system call
 * due to there being a cancellation pending (SC_CANCEL_FLG set or
 * we received an SI_LWP SIGCANCEL while in a system call), rather
 * than because of some other signal.  User-level code can try to
 * recover from receiving other signals, but it can't recover from
 * being cancelled.
 */
void
schedctl_cancel_eintr(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs |= SC_EINTR_FLG;
}


/*
 * Return non-zero if the current thread has declared that
 * it is calling into the kernel to park, else return zero.
 */
int
schedctl_is_park(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                return ((tdp->sc_flgs & SC_PARK_FLG) != 0);
        /*
         * If we're here and there is no shared memory (how could
         * that happen?) then just assume we really are here to park.
         */
        return (1);
}


/*
 * Declare thread is parking.
 *
 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
 * in order to declare that the thread is calling into the kernel to park.
 *
 * This interface exists ONLY to support older versions of libthread that
 * are not aware of the SC_PARK_FLG flag.  Such versions need to be modified
 * or emulated to call lwpsys_park(4, ...) instead of lwpsys_park(0, ...);
 * that form invokes schedctl_set_park() before lwp_park() to declare that
 * the thread is parking.
 */
void
schedctl_set_park(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs |= SC_PARK_FLG;
}


/*
 * Clear the parking flag on return from parking in the kernel.
 */
void
schedctl_unpark(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs &= ~SC_PARK_FLG;
}


/*
 * Page handling code.
 */

void
schedctl_init(void)
{
        /*
         * Amount of page that can hold sc_shared_t structures.  If
         * sizeof (sc_shared_t) is a power of 2, this should just be
         * PAGESIZE.
         */
        sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));

        /*
         * Allocation bitmap is one bit per struct on a page.
         */
        sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
        sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
}
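
/*
 * For example (the sizes below are illustrative only; the real values depend
 * on the platform's PAGESIZE and on sizeof (sc_shared_t)): with a 4096-byte
 * page and a 96-byte sc_shared_t, sc_pagesize = 4096 - (4096 % 96) = 4032,
 * sc_bitmap_len = 4032 / 96 = 42 structures per page, and
 * sc_bitmap_words = howmany(42, 64) = 1 on a 64-bit kernel (BT_NBIPUL = 64).
 */
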
static int
schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
{
        proc_t          *p = curproc;
        sc_page_ctl_t   *pagep;
        sc_shared_t     *ssp;
        caddr_t         base;
        index_t         index;
        int             error;

        ASSERT(MUTEX_NOT_HELD(&p->p_lock));
        mutex_enter(&p->p_sc_lock);

        /*
         * Try to find space for the new data in existing pages
         * within the process's list of shared pages.
         */
        for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
                if (pagep->spc_space != 0)
                        break;

        if (pagep != NULL)
                base = pagep->spc_uaddr;
        else {
                struct anon_map *amp;
                caddr_t kaddr;

                /*
                 * No room, need to allocate a new page.  Also set up
                 * a mapping to the kernel address space for the new
                 * page and lock it in memory.
                 */
                if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
                        mutex_exit(&p->p_sc_lock);
                        return (error);
                }
                if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
                        schedctl_freepage(amp, kaddr);
                        mutex_exit(&p->p_sc_lock);
                        return (error);
                }

                /*
                 * Allocate and initialize the page control structure.
                 */
                pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
                pagep->spc_amp = amp;
                pagep->spc_base = (sc_shared_t *)kaddr;
                pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
                pagep->spc_uaddr = base;

                pagep->spc_map = kmem_zalloc(
                    sizeof (ulong_t) * sc_bitmap_words, KM_SLEEP);
                pagep->spc_space = sc_pagesize;

                pagep->spc_next = p->p_pagep;
                p->p_pagep = pagep;
        }

        /*
         * Got a page, now allocate space for the data.  There should
         * be space unless something's wrong.
         */
        ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
        index = bt_availbit(pagep->spc_map, sc_bitmap_len);
        ASSERT(index != -1);

        /*
         * Get location with pointer arithmetic.  spc_base is of type
         * sc_shared_t *.  Mark as allocated.
         */
        ssp = pagep->spc_base + index;
        BT_SET(pagep->spc_map, index);
        pagep->spc_space -= sizeof (sc_shared_t);

        mutex_exit(&p->p_sc_lock);

        /*
         * Return kernel and user addresses.
         */
        *kaddrp = ssp;
        *uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
        return (0);
}
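
/*
 * Note that spc_base (the kernel mapping) and spc_uaddr (the user mapping)
 * refer to the same physical page, so a structure's offset within the page
 * is identical in both address spaces; the computation of *uaddrp above
 * relies on this.
 */
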
/*
 * Find the page control structure corresponding to a kernel address.
 */
static sc_page_ctl_t *
schedctl_page_lookup(sc_shared_t *ssp)
{
        proc_t *p = curproc;
        sc_page_ctl_t *pagep;

        ASSERT(MUTEX_HELD(&p->p_sc_lock));
        for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
                if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
                        return (pagep);
        }
        return (NULL);          /* This "can't happen".  Should we panic? */
}


/*
 * This function is called when a page needs to be mapped into a
 * process's address space.  Allocate the user address space and
 * set up the mapping to the page.  Assumes the page has already
 * been allocated and locked in memory via schedctl_getpage.
 */
static int
schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
{
        caddr_t addr = NULL;
        struct as *as = curproc->p_as;
        struct segvn_crargs vn_a;
        int error;

        as_rangelock(as);
        /* pass address of kernel mapping as offset to avoid VAC conflicts */
        map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
        if (addr == NULL) {
                as_rangeunlock(as);
                return (ENOMEM);
        }

        /*
         * Use segvn to set up the mapping to the page.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.cred = NULL;
        vn_a.type = MAP_SHARED;
        vn_a.prot = vn_a.maxprot = PROT_ALL;
        vn_a.flags = 0;
        vn_a.amp = amp;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;
        error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
        as_rangeunlock(as);

        if (error)
                return (error);

        *uaddrp = addr;
        return (0);
}


/*
 * Allocate a new page from anonymous memory.  Also, create a kernel
 * mapping to the page and lock the page in memory.
 */
static int
schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
{
        struct anon_map *amp;
        caddr_t kaddr;

        /*
         * Set up anonymous memory struct.  No swap reservation is
         * needed since the page will be locked into memory.
         */
        amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);

        /*
         * Allocate the page.
         */
        kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
            KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
        if (kaddr == NULL) {
                amp->refcnt--;
                anonmap_free(amp);
                return (ENOMEM);
        }

        /*
         * The page is left SE_SHARED locked so that it won't be
         * paged out or relocated (KPD_LOCKED above).
         */

        *newamp = amp;
        *newaddr = kaddr;
        return (0);
}


/*
 * Take the necessary steps to allow a page to be released.
 * This is called when the process is doing exit() or exec().
 * There should be no accesses to the page after this.
 * The kernel mapping of the page is released and the page is unlocked.
 */
static void
schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
{
        /*
         * Release the lock on the page and remove the kernel mapping.
         */
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
        segkp_release(segkp, kaddr);

        /*
         * Decrement the refcnt so the anon_map structure will be freed.
         */
        if (--amp->refcnt == 0) {
                /*
                 * The current process no longer has the page mapped, so
                 * we have to free everything rather than letting as_free
                 * do the work.
                 */
                anonmap_purge(amp);
                anon_free(amp->ahp, 0, PAGESIZE);
                ANON_LOCK_EXIT(&amp->a_rwlock);
                anonmap_free(amp);
        } else {
                ANON_LOCK_EXIT(&amp->a_rwlock);
        }
}