/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2021 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/schedctl.h>
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/class.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/door.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/lwp.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <fs/fs_subr.h>

/*
 * Page handling structures.  This is set up as a list of per-page
 * control structures (sc_page_ctl), with p->p_pagep pointing to
 * the first.  The per-page structures point to the actual pages
 * and contain pointers to the user address for each mapped page.
 *
 * All data is protected by p->p_sc_lock.  Since this lock is
 * held while waiting for memory, schedctl_shared_alloc() should
 * not be called while holding p_lock.
 */

typedef struct sc_page_ctl {
        struct sc_page_ctl *spc_next;
        sc_shared_t *spc_base;          /* base of kernel page */
        sc_shared_t *spc_end;           /* end of usable space */
        ulong_t *spc_map;               /* bitmap of allocated space on page */
        size_t spc_space;               /* amount of space on page */
        caddr_t spc_uaddr;              /* user-level address of the page */
        struct anon_map *spc_amp;       /* anonymous memory structure */
} sc_page_ctl_t;

static size_t sc_pagesize;      /* size of usable space on page */
static size_t sc_bitmap_len;    /* # of bits in allocation bitmap */
static size_t sc_bitmap_words;  /* # of words in allocation bitmap */

/* Context ops */
static void schedctl_save(sc_shared_t *);
static void schedctl_restore(sc_shared_t *);
static void schedctl_fork(kthread_t *, kthread_t *);

/* Functions for handling shared pages */
static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
static int schedctl_map(struct anon_map *, caddr_t *, caddr_t);
static int schedctl_getpage(struct anon_map **, caddr_t *);
static void schedctl_freepage(struct anon_map *, caddr_t);

/*
 * System call interface to scheduler activations.
 * This always operates on the current lwp.
 */
caddr_t
schedctl(void)
{
        kthread_t *t = curthread;
        sc_shared_t *ssp;
        uintptr_t uaddr;
        int error;

        if (t->t_schedctl == NULL) {
                /*
                 * Allocate and initialize the shared structure.
                 */
                if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
                        return ((caddr_t)(uintptr_t)set_errno(error));
                bzero(ssp, sizeof (*ssp));

                installctx(t, ssp, schedctl_save, schedctl_restore,
                    schedctl_fork, NULL, NULL, NULL, NULL);

                thread_lock(t); /* protect against ts_tick and ts_update */
                t->t_schedctl = ssp;
                t->t_sc_uaddr = uaddr;
                ssp->sc_cid = t->t_cid;
                ssp->sc_cpri = t->t_cpri;
                ssp->sc_priority = DISP_PRIO(t);
                thread_unlock(t);
        }

        return ((caddr_t)t->t_sc_uaddr);
}


/*
 * Clean up scheduler activations state associated with an exiting
 * (or execing) lwp.  t is always the current thread.
 */
void
schedctl_lwp_cleanup(kthread_t *t)
{
        sc_shared_t *ssp = t->t_schedctl;
        proc_t *p = ttoproc(t);
        sc_page_ctl_t *pagep;
        index_t index;

        ASSERT(MUTEX_NOT_HELD(&p->p_lock));

        thread_lock(t);         /* protect against ts_tick and ts_update */
        t->t_schedctl = NULL;
        t->t_sc_uaddr = 0;
        thread_unlock(t);

        /*
         * Remove the context op to avoid the final call to
         * schedctl_save when switching away from this lwp.
         */
        (void) removectx(t, ssp, schedctl_save, schedctl_restore,
            schedctl_fork, NULL, NULL, NULL);

        /*
         * Do not unmap the shared page until the process exits.
         * User-level library code relies on this for adaptive mutex locking.
         */
        mutex_enter(&p->p_sc_lock);
        ssp->sc_state = SC_FREE;
        pagep = schedctl_page_lookup(ssp);
        index = (index_t)(ssp - pagep->spc_base);
        BT_CLEAR(pagep->spc_map, index);
        pagep->spc_space += sizeof (sc_shared_t);
        mutex_exit(&p->p_sc_lock);
}


/*
 * Clean up the list of schedctl shared pages for the process.
 * Called from exec() and exit() system calls.
 */
void
schedctl_proc_cleanup(void)
{
        proc_t *p = curproc;
        sc_page_ctl_t *pagep;
        sc_page_ctl_t *next;

        ASSERT(p->p_lwpcnt == 1);       /* we are single-threaded now */
        ASSERT(curthread->t_schedctl == NULL);

        /*
         * Since we are single-threaded, we don't have to hold p->p_sc_lock.
         */
        pagep = p->p_pagep;
        p->p_pagep = NULL;
        while (pagep != NULL) {
                ASSERT(pagep->spc_space == sc_pagesize);
                next = pagep->spc_next;
                /*
                 * Unmap the user space and free the mapping structure.
                 */
                (void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
                schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
                kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
                kmem_free(pagep, sizeof (sc_page_ctl_t));
                pagep = next;
        }
}


/*
 * Called by resume just before switching away from the current thread.
 * Save new thread state.
 */
static void
schedctl_save(sc_shared_t *ssp)
{
        ssp->sc_state = curthread->t_state;
}


/*
 * Called by resume after switching to the current thread.
 * Save new thread state and CPU.
 */
static void
schedctl_restore(sc_shared_t *ssp)
{
        ssp->sc_state = SC_ONPROC;
        ssp->sc_cpu = CPU->cpu_id;
}


/*
 * On fork, remove inherited mappings from the child's address space.
 * The child's threads must call schedctl() to get new shared mappings.
 */
static void
schedctl_fork(kthread_t *pt, kthread_t *ct)
{
        proc_t *pp = ttoproc(pt);
        proc_t *cp = ttoproc(ct);
        sc_page_ctl_t *pagep;

        ASSERT(ct->t_schedctl == NULL);

        /*
         * Do this only once, whether we are doing fork1() or forkall().
         * Don't do it at all if the child process is a child of vfork()
         * because a child of vfork() borrows the parent's address space.
         */
        if (pt != curthread || (cp->p_flag & SVFORK))
                return;

        mutex_enter(&pp->p_sc_lock);
        for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
                (void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
        mutex_exit(&pp->p_sc_lock);
}


/*
 * Returns non-zero if the specified thread shouldn't be preempted
 * at this time.  Called by ts_preempt(), ts_tick(), and ts_update().
 */
int
schedctl_get_nopreempt(kthread_t *t)
{
        ASSERT(THREAD_LOCK_HELD(t));
        return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
}


/*
 * Sets the value of the nopreempt field for the specified thread.
 * Called by ts_preempt() to clear the field on preemption.
 */
void
schedctl_set_nopreempt(kthread_t *t, short val)
{
        ASSERT(THREAD_LOCK_HELD(t));
        t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
}


/*
 * Sets the value of the yield field for the specified thread.
 * Called by ts_preempt() and ts_tick() to set the field, and
 * ts_yield() to clear it.
 * The kernel never looks at this field so we don't need a
 * schedctl_get_yield() function.
 */
void
schedctl_set_yield(kthread_t *t, short val)
{
        ASSERT(THREAD_LOCK_HELD(t));
        t->t_schedctl->sc_preemptctl.sc_yield = val;
}


/*
 * Sets the values of the cid and priority fields for the specified thread.
 * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
 * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
 */
void
schedctl_set_cidpri(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        if (tdp != NULL) {
                tdp->sc_cid = t->t_cid;
                tdp->sc_cpri = t->t_cpri;
                tdp->sc_priority = DISP_PRIO(t);
        }
}


/*
 * Returns non-zero if the specified thread has requested that all
 * signals be blocked.  Called by signal-related code that tests
 * the signal mask of a thread that may not be the current thread
 * and where the process's p_lock cannot be acquired.
 */
int
schedctl_sigblock(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        if (tdp != NULL)
                return (tdp->sc_sigblock);
        return (0);
}


/*
 * If the sc_sigblock field is set for the specified thread, set its signal
 * mask to block all maskable signals, then clear the sc_sigblock field.  This
 * accomplishes what user-level code requested to be done when it set
 * tdp->sc_shared->sc_sigblock non-zero.
 *
 * This is generally called by signal-related code in the current thread.  In
 * order to call against a thread other than curthread, p_lock for the
 * containing process must be held.  Even then, the caller is not protected
 * from races with the thread in question updating its own fields.  It is the
 * responsibility of the caller to perform additional synchronization.
 */
void
schedctl_finish_sigblock(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        ASSERT(t == curthread || MUTEX_HELD(&ttoproc(t)->p_lock));

        if (tdp != NULL && tdp->sc_sigblock) {
                t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
                t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
                t->t_hold.__sigbits[2] = FILLSET2 & ~CANTMASK2;
                tdp->sc_sigblock = 0;
        }
}


/*
 * Return non-zero if the current thread has declared that it has
 * a cancellation pending and that cancellation is not disabled.
 * If SIGCANCEL is blocked, we must be going over the wire in an
 * NFS transaction (sigintr() was called); return zero in this case.
 */
int
schedctl_cancel_pending(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL &&
            (tdp->sc_flgs & SC_CANCEL_FLG) &&
            !tdp->sc_sigblock &&
            !sigismember(&curthread->t_hold, SIGCANCEL))
                return (1);
        return (0);
}


/*
 * Inform libc that the kernel returned EINTR from some system call
 * due to there being a cancellation pending (SC_CANCEL_FLG set or
 * we received an SI_LWP SIGCANCEL while in a system call), rather
 * than because of some other signal.  User-level code can try to
 * recover from receiving other signals, but it can't recover from
 * being cancelled.
 */
void
schedctl_cancel_eintr(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs |= SC_EINTR_FLG;
}


/*
 * Return non-zero if the current thread has declared that
 * it is calling into the kernel to park, else return zero.
 */
int
schedctl_is_park(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                return ((tdp->sc_flgs & SC_PARK_FLG) != 0);
        /*
         * If we're here and there is no shared memory (how could
         * that happen?) then just assume we really are here to park.
         */
        return (1);
}


/*
 * Declare thread is parking.
 *
 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
 * in order to declare that the thread is calling into the kernel to park.
 *
 * This interface exists ONLY to support older versions of libthread which
 * are not aware of the SC_PARK_FLG flag.
 *
 * Older versions of libthread which are not aware of the SC_PARK_FLG flag
 * need to be modified or emulated to call lwpsys_park(4, ...) instead of
 * lwpsys_park(0, ...).  This will invoke schedctl_set_park() before
 * lwp_park() to declare that the thread is parking.
 */
void
schedctl_set_park(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs |= SC_PARK_FLG;
}


/*
 * Clear the parking flag on return from parking in the kernel.
 */
void
schedctl_unpark(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs &= ~SC_PARK_FLG;
}


/*
 * Page handling code.
 */

void
schedctl_init(void)
{
        /*
         * Amount of page that can hold sc_shared_t structures.  If
         * sizeof (sc_shared_t) is a power of 2, this should just be
         * PAGESIZE.
         */
        sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));

        /*
         * Allocation bitmap is one bit per struct on a page.
         */
        sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
        sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
}
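
/*
 * Worked example, for illustration only; the real numbers depend on the
 * platform's PAGESIZE and on sizeof (sc_shared_t), which is defined in
 * <sys/schedctl.h>.  Assuming a hypothetical 96-byte sc_shared_t and a
 * 4096-byte page, the calculations in schedctl_init() give:
 *
 *      sc_pagesize     = 4096 - (4096 % 96)    = 4032 bytes of usable space
 *      sc_bitmap_len   = 4032 / 96             = 42 sc_shared_t slots per page
 *      sc_bitmap_words = howmany(42, BT_NBIPUL) = 1 word (64-bit kernel)
 *
 * so each shared page would hold 42 sc_shared_t slots, tracked by a
 * one-word allocation bitmap in the page's sc_page_ctl_t.
 */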


static int
schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
{
        proc_t *p = curproc;
        sc_page_ctl_t *pagep;
        sc_shared_t *ssp;
        caddr_t base;
        index_t index;
        int error;

        ASSERT(MUTEX_NOT_HELD(&p->p_lock));
        mutex_enter(&p->p_sc_lock);

        /*
         * Try to find space for the new data in existing pages
         * within the process's list of shared pages.
         */
        for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
                if (pagep->spc_space != 0)
                        break;

        if (pagep != NULL)
                base = pagep->spc_uaddr;
        else {
                struct anon_map *amp;
                caddr_t kaddr;

                /*
                 * No room, need to allocate a new page.  Also set up
                 * a mapping to the kernel address space for the new
                 * page and lock it in memory.
                 */
                if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
                        mutex_exit(&p->p_sc_lock);
                        return (error);
                }
                if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
                        schedctl_freepage(amp, kaddr);
                        mutex_exit(&p->p_sc_lock);
                        return (error);
                }

                /*
                 * Allocate and initialize the page control structure.
                 */
                pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
                pagep->spc_amp = amp;
                pagep->spc_base = (sc_shared_t *)kaddr;
                pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
                pagep->spc_uaddr = base;

                pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
                    KM_SLEEP);
                pagep->spc_space = sc_pagesize;

                pagep->spc_next = p->p_pagep;
                p->p_pagep = pagep;
        }

        /*
         * Got a page, now allocate space for the data.  There should
         * be space unless something's wrong.
         */
        ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
        index = bt_availbit(pagep->spc_map, sc_bitmap_len);
        ASSERT(index != -1);

        /*
         * Get location with pointer arithmetic.  spc_base is of type
         * sc_shared_t *.  Mark as allocated.
         */
        ssp = pagep->spc_base + index;
        BT_SET(pagep->spc_map, index);
        pagep->spc_space -= sizeof (sc_shared_t);

        mutex_exit(&p->p_sc_lock);

        /*
         * Return kernel and user addresses.
         */
        *kaddrp = ssp;
        *uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
        return (0);
}


/*
 * Find the page control structure corresponding to a kernel address.
 */
static sc_page_ctl_t *
schedctl_page_lookup(sc_shared_t *ssp)
{
        proc_t *p = curproc;
        sc_page_ctl_t *pagep;

        ASSERT(MUTEX_HELD(&p->p_sc_lock));
        for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
                if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
                        return (pagep);
        }
        return (NULL);  /* This "can't happen".  Should we panic? */
}


/*
 * This function is called when a page needs to be mapped into a
 * process's address space.  Allocate the user address space and
 * set up the mapping to the page.  Assumes the page has already
 * been allocated and locked in memory via schedctl_getpage.
 */
static int
schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
{
        caddr_t addr = NULL;
        struct as *as = curproc->p_as;
        struct segvn_crargs vn_a;
        int error;

        as_rangelock(as);
        /* pass address of kernel mapping as offset to avoid VAC conflicts */
        map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
        if (addr == NULL) {
                as_rangeunlock(as);
                return (ENOMEM);
        }

        /*
         * Use segvn to set up the mapping to the page.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.cred = NULL;
        vn_a.type = MAP_SHARED;
        vn_a.prot = vn_a.maxprot = PROT_ALL;
        vn_a.flags = 0;
        vn_a.amp = amp;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;
        error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
        as_rangeunlock(as);

        if (error)
                return (error);

        *uaddrp = addr;
        return (0);
}


/*
 * Allocate a new page from anonymous memory.  Also, create a kernel
 * mapping to the page and lock the page in memory.
 */
static int
schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
{
        struct anon_map *amp;
        caddr_t kaddr;

        /*
         * Set up anonymous memory struct.  No swap reservation is
         * needed since the page will be locked into memory.
         */
        amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);

        /*
         * Allocate the page.
         */
        kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
            KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
        if (kaddr == NULL) {
                amp->refcnt--;
                anonmap_free(amp);
                return (ENOMEM);
        }

        /*
         * The page is left SE_SHARED locked so that it won't be
         * paged out or relocated (KPD_LOCKED above).
         */

        *newamp = amp;
        *newaddr = kaddr;
        return (0);
}


/*
 * Take the necessary steps to allow a page to be released.
 * This is called when the process is doing exit() or exec().
 * There should be no accesses to the page after this.
 * The kernel mapping of the page is released and the page is unlocked.
 */
static void
schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
{
        /*
         * Release the lock on the page and remove the kernel mapping.
         */
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
        segkp_release(segkp, kaddr);

        /*
         * Decrement the refcnt so the anon_map structure will be freed.
         */
        if (--amp->refcnt == 0) {
                /*
                 * The current process no longer has the page mapped, so
                 * we have to free everything rather than letting as_free
                 * do the work.
                 */
                anonmap_purge(amp);
                anon_free(amp->ahp, 0, PAGESIZE);
                ANON_LOCK_EXIT(&amp->a_rwlock);
                anonmap_free(amp);
        } else {
                ANON_LOCK_EXIT(&amp->a_rwlock);
        }
}
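
/*
 * Illustrative sketch only, not part of the kernel implementation above:
 * a user-level consumer of the shared page returned by the schedctl()
 * system call, using the libc preemption-control wrappers (see the
 * schedctl_init() man page).  schedctl_start() sets the nopreempt hint in
 * the shared data and schedctl_stop() clears it; schedctl_get_nopreempt()
 * above is what the dispatcher code reads on the kernel side.
 *
 *      schedctl_t *scp = schedctl_init();      // map the shared data
 *      schedctl_start(scp);                    // hint: defer preemption
 *      ... short critical section, e.g. while holding a spin lock ...
 *      schedctl_stop(scp);                     // allow preemption again
 */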