1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/systm.h> 31 #include <sys/schedctl.h> 32 #include <sys/proc.h> 33 #include <sys/thread.h> 34 #include <sys/class.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/cmn_err.h> 38 #include <sys/stack.h> 39 #include <sys/debug.h> 40 #include <sys/cpuvar.h> 41 #include <sys/sobject.h> 42 #include <sys/door.h> 43 #include <sys/modctl.h> 44 #include <sys/syscall.h> 45 #include <sys/sysmacros.h> 46 #include <sys/vmsystm.h> 47 #include <sys/mman.h> 48 #include <sys/vnode.h> 49 #include <sys/swap.h> 50 #include <sys/lwp.h> 51 #include <sys/bitmap.h> 52 #include <sys/atomic.h> 53 #include <sys/fcntl.h> 54 #include <vm/seg_kp.h> 55 #include <vm/seg_vn.h> 56 #include <vm/as.h> 57 #include <fs/fs_subr.h> 58 59 /* 60 * Page handling structures. This is set up as a list of per-page 61 * control structures (sc_page_ctl), with p->p_pagep pointing to 62 * the first. The per-page structures point to the actual pages 63 * and contain pointers to the user address for each mapped page. 64 * 65 * All data is protected by p->p_sc_lock. Since this lock is 66 * held while waiting for memory, schedctl_shared_alloc() should 67 * not be called while holding p_lock. 68 */ 69 70 typedef struct sc_page_ctl { 71 struct sc_page_ctl *spc_next; 72 sc_shared_t *spc_base; /* base of kernel page */ 73 sc_shared_t *spc_end; /* end of usable space */ 74 ulong_t *spc_map; /* bitmap of allocated space on page */ 75 size_t spc_space; /* amount of space on page */ 76 caddr_t spc_uaddr; /* user-level address of the page */ 77 struct anon_map *spc_amp; /* anonymous memory structure */ 78 } sc_page_ctl_t; 79 80 static size_t sc_pagesize; /* size of usable space on page */ 81 static size_t sc_bitmap_len; /* # of bits in allocation bitmap */ 82 static size_t sc_bitmap_words; /* # of words in allocation bitmap */ 83 84 /* Context ops */ 85 static void schedctl_save(sc_shared_t *); 86 static void schedctl_restore(sc_shared_t *); 87 static void schedctl_fork(kthread_t *, kthread_t *); 88 89 /* Functions for handling shared pages */ 90 static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *); 91 static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *); 92 static int schedctl_map(struct anon_map *, caddr_t *, caddr_t); 93 static int schedctl_getpage(struct anon_map **, caddr_t *); 94 static void schedctl_freepage(struct anon_map *, caddr_t); 95 96 /* 97 * System call interface to scheduler activations. 98 * This always operates on the current lwp. 99 */ 100 caddr_t 101 schedctl(void) 102 { 103 kthread_t *t = curthread; 104 sc_shared_t *ssp; 105 uintptr_t uaddr; 106 int error; 107 108 if (t->t_schedctl == NULL) { 109 /* 110 * Allocate and initialize the shared structure. 111 */ 112 if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0) 113 return ((caddr_t)(uintptr_t)set_errno(error)); 114 bzero(ssp, sizeof (*ssp)); 115 116 installctx(t, ssp, schedctl_save, schedctl_restore, 117 schedctl_fork, NULL, NULL, NULL); 118 119 thread_lock(t); /* protect against ts_tick and ts_update */ 120 t->t_schedctl = ssp; 121 t->t_sc_uaddr = uaddr; 122 thread_unlock(t); 123 } 124 125 return ((caddr_t)t->t_sc_uaddr); 126 } 127 128 129 /* 130 * Clean up scheduler activations state associated with an exiting 131 * (or execing) lwp. t is always the current thread. 132 */ 133 void 134 schedctl_lwp_cleanup(kthread_t *t) 135 { 136 sc_shared_t *ssp = t->t_schedctl; 137 proc_t *p = ttoproc(t); 138 sc_page_ctl_t *pagep; 139 index_t index; 140 141 ASSERT(MUTEX_NOT_HELD(&p->p_lock)); 142 143 thread_lock(t); /* protect against ts_tick and ts_update */ 144 t->t_schedctl = NULL; 145 t->t_sc_uaddr = 0; 146 thread_unlock(t); 147 148 /* 149 * Remove the context op to avoid the final call to 150 * schedctl_save when switching away from this lwp. 151 */ 152 (void) removectx(t, ssp, schedctl_save, schedctl_restore, 153 schedctl_fork, NULL, NULL, NULL); 154 155 /* 156 * Do not unmap the shared page until the process exits. 157 * User-level library code relies on this for adaptive mutex locking. 158 */ 159 mutex_enter(&p->p_sc_lock); 160 ssp->sc_state = SC_FREE; 161 pagep = schedctl_page_lookup(ssp); 162 index = (index_t)(ssp - pagep->spc_base); 163 BT_CLEAR(pagep->spc_map, index); 164 pagep->spc_space += sizeof (sc_shared_t); 165 mutex_exit(&p->p_sc_lock); 166 } 167 168 169 /* 170 * Cleanup the list of schedctl shared pages for the process. 171 * Called from exec() and exit() system calls. 172 */ 173 void 174 schedctl_proc_cleanup(void) 175 { 176 proc_t *p = curproc; 177 sc_page_ctl_t *pagep; 178 sc_page_ctl_t *next; 179 180 ASSERT(p->p_lwpcnt == 1); /* we are single-threaded now */ 181 ASSERT(curthread->t_schedctl == NULL); 182 183 /* 184 * Since we are single-threaded, we don't have to hold p->p_sc_lock. 185 */ 186 pagep = p->p_pagep; 187 p->p_pagep = NULL; 188 while (pagep != NULL) { 189 ASSERT(pagep->spc_space == sc_pagesize); 190 next = pagep->spc_next; 191 /* 192 * Unmap the user space and free the mapping structure. 193 */ 194 (void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE); 195 schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base)); 196 kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words); 197 kmem_free(pagep, sizeof (sc_page_ctl_t)); 198 pagep = next; 199 } 200 } 201 202 203 /* 204 * Called by resume just before switching away from the current thread. 205 * Save new thread state. 206 */ 207 void 208 schedctl_save(sc_shared_t *ssp) 209 { 210 ssp->sc_state = curthread->t_state; 211 } 212 213 214 /* 215 * Called by resume after switching to the current thread. 216 * Save new thread state and CPU. 217 */ 218 void 219 schedctl_restore(sc_shared_t *ssp) 220 { 221 ssp->sc_state = SC_ONPROC; 222 ssp->sc_cpu = CPU->cpu_id; 223 } 224 225 226 /* 227 * On fork, remove inherited mappings from the child's address space. 228 * The child's threads must call schedctl() to get new shared mappings. 229 */ 230 void 231 schedctl_fork(kthread_t *pt, kthread_t *ct) 232 { 233 proc_t *pp = ttoproc(pt); 234 proc_t *cp = ttoproc(ct); 235 sc_page_ctl_t *pagep; 236 237 ASSERT(ct->t_schedctl == NULL); 238 239 /* 240 * Do this only once, whether we are doing fork1() or forkall(). 241 * Don't do it at all if the child process is a child of vfork() 242 * because a child of vfork() borrows the parent's address space. 243 */ 244 if (pt != curthread || (cp->p_flag & SVFORK)) 245 return; 246 247 mutex_enter(&pp->p_sc_lock); 248 for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next) 249 (void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE); 250 mutex_exit(&pp->p_sc_lock); 251 } 252 253 254 /* 255 * Returns non-zero if the specified thread shouldn't be preempted at this time. 256 * Called by ts_preempt, ts_tick, and ts_update. 257 */ 258 int 259 schedctl_get_nopreempt(kthread_t *t) 260 { 261 ASSERT(THREAD_LOCK_HELD(t)); 262 return (t->t_schedctl->sc_preemptctl.sc_nopreempt); 263 } 264 265 266 /* 267 * Sets the value of the nopreempt field for the specified thread. 268 * Called by ts_preempt to clear the field on preemption. 269 */ 270 void 271 schedctl_set_nopreempt(kthread_t *t, short val) 272 { 273 ASSERT(THREAD_LOCK_HELD(t)); 274 t->t_schedctl->sc_preemptctl.sc_nopreempt = val; 275 } 276 277 278 /* 279 * Sets the value of the yield field for the specified thread. Called by 280 * ts_preempt and ts_tick to set the field, and ts_yield to clear it. 281 * The kernel never looks at this field so we don't need a schedctl_get_yield 282 * function. 283 */ 284 void 285 schedctl_set_yield(kthread_t *t, short val) 286 { 287 ASSERT(THREAD_LOCK_HELD(t)); 288 t->t_schedctl->sc_preemptctl.sc_yield = val; 289 } 290 291 292 /* 293 * Returns non-zero if the specified thread has requested that all 294 * signals be blocked. Called by signal-related code that tests 295 * the signal mask of a thread that may not be the current thread 296 * and where the process's p_lock cannot be acquired. 297 */ 298 int 299 schedctl_sigblock(kthread_t *t) 300 { 301 sc_shared_t *tdp = t->t_schedctl; 302 303 if (tdp != NULL) 304 return (tdp->sc_sigblock); 305 return (0); 306 } 307 308 309 /* 310 * If the sc_sigblock field is set for the specified thread, set 311 * its signal mask to block all maskable signals, then clear the 312 * sc_sigblock field. This finishes what user-level code requested 313 * to be done when it set tdp->sc_shared->sc_sigblock non-zero. 314 * Called by signal-related code that holds the process's p_lock. 315 */ 316 void 317 schedctl_finish_sigblock(kthread_t *t) 318 { 319 sc_shared_t *tdp = t->t_schedctl; 320 321 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 322 323 if (tdp != NULL && tdp->sc_sigblock) { 324 t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0; 325 t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1; 326 tdp->sc_sigblock = 0; 327 } 328 } 329 330 331 /* 332 * Return non-zero if the current thread has declared that it has 333 * a cancellation pending and that cancellation is not disabled. 334 * If SIGCANCEL is blocked, we must be going over the wire in an 335 * NFS transaction (sigintr() was called); return zero in this case. 336 */ 337 int 338 schedctl_cancel_pending(void) 339 { 340 sc_shared_t *tdp = curthread->t_schedctl; 341 342 if (tdp != NULL && 343 (tdp->sc_flgs & SC_CANCEL_FLG) && 344 !tdp->sc_sigblock && 345 !sigismember(&curthread->t_hold, SIGCANCEL)) 346 return (1); 347 return (0); 348 } 349 350 351 /* 352 * Inform libc that the kernel returned EINTR from some system call 353 * due to there being a cancellation pending (SC_CANCEL_FLG set or 354 * we received an SI_LWP SIGCANCEL while in a system call), rather 355 * than because of some other signal. User-level code can try to 356 * recover from receiving other signals, but it can't recover from 357 * being cancelled. 358 */ 359 void 360 schedctl_cancel_eintr(void) 361 { 362 sc_shared_t *tdp = curthread->t_schedctl; 363 364 if (tdp != NULL) 365 tdp->sc_flgs |= SC_EINTR_FLG; 366 } 367 368 369 /* 370 * Return non-zero if the current thread has declared that 371 * it is calling into the kernel to park, else return zero. 372 */ 373 int 374 schedctl_is_park(void) 375 { 376 sc_shared_t *tdp = curthread->t_schedctl; 377 378 if (tdp != NULL) 379 return ((tdp->sc_flgs & SC_PARK_FLG) != 0); 380 /* 381 * If we're here and there is no shared memory (how could 382 * that happen?) then just assume we really are here to park. 383 */ 384 return (1); 385 } 386 387 388 /* 389 * Declare thread is parking. 390 * 391 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid) 392 * in order to declare that the thread is calling into the kernel to park. 393 * 394 * This interface exists ONLY to support older versions of libthread which 395 * are not aware of the SC_PARK_FLG flag. 396 * 397 * Older versions of libthread which are not aware of the SC_PARK_FLG flag 398 * need to be modified or emulated to call lwpsys_park(4, ...) instead of 399 * lwpsys_park(0, ...). This will invoke schedctl_set_park() before 400 * lwp_park() to declare that the thread is parking. 401 */ 402 void 403 schedctl_set_park(void) 404 { 405 sc_shared_t *tdp = curthread->t_schedctl; 406 if (tdp != NULL) 407 tdp->sc_flgs |= SC_PARK_FLG; 408 } 409 410 411 /* 412 * Clear the parking flag on return from parking in the kernel. 413 */ 414 void 415 schedctl_unpark(void) 416 { 417 sc_shared_t *tdp = curthread->t_schedctl; 418 419 if (tdp != NULL) 420 tdp->sc_flgs &= ~SC_PARK_FLG; 421 } 422 423 424 /* 425 * Page handling code. 426 */ 427 428 void 429 schedctl_init(void) 430 { 431 /* 432 * Amount of page that can hold sc_shared_t structures. If 433 * sizeof (sc_shared_t) is a power of 2, this should just be 434 * PAGESIZE. 435 */ 436 sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t)); 437 438 /* 439 * Allocation bitmap is one bit per struct on a page. 440 */ 441 sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t); 442 sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL); 443 } 444 445 446 int 447 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp) 448 { 449 proc_t *p = curproc; 450 sc_page_ctl_t *pagep; 451 sc_shared_t *ssp; 452 caddr_t base; 453 index_t index; 454 int error; 455 456 ASSERT(MUTEX_NOT_HELD(&p->p_lock)); 457 mutex_enter(&p->p_sc_lock); 458 459 /* 460 * Try to find space for the new data in existing pages 461 * within the process's list of shared pages. 462 */ 463 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) 464 if (pagep->spc_space != 0) 465 break; 466 467 if (pagep != NULL) 468 base = pagep->spc_uaddr; 469 else { 470 struct anon_map *amp; 471 caddr_t kaddr; 472 473 /* 474 * No room, need to allocate a new page. Also set up 475 * a mapping to the kernel address space for the new 476 * page and lock it in memory. 477 */ 478 if ((error = schedctl_getpage(&, &kaddr)) != 0) { 479 mutex_exit(&p->p_sc_lock); 480 return (error); 481 } 482 if ((error = schedctl_map(amp, &base, kaddr)) != 0) { 483 schedctl_freepage(amp, kaddr); 484 mutex_exit(&p->p_sc_lock); 485 return (error); 486 } 487 488 /* 489 * Allocate and initialize the page control structure. 490 */ 491 pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP); 492 pagep->spc_amp = amp; 493 pagep->spc_base = (sc_shared_t *)kaddr; 494 pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize); 495 pagep->spc_uaddr = base; 496 497 pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words, 498 KM_SLEEP); 499 pagep->spc_space = sc_pagesize; 500 501 pagep->spc_next = p->p_pagep; 502 p->p_pagep = pagep; 503 } 504 505 /* 506 * Got a page, now allocate space for the data. There should 507 * be space unless something's wrong. 508 */ 509 ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t)); 510 index = bt_availbit(pagep->spc_map, sc_bitmap_len); 511 ASSERT(index != -1); 512 513 /* 514 * Get location with pointer arithmetic. spc_base is of type 515 * sc_shared_t *. Mark as allocated. 516 */ 517 ssp = pagep->spc_base + index; 518 BT_SET(pagep->spc_map, index); 519 pagep->spc_space -= sizeof (sc_shared_t); 520 521 mutex_exit(&p->p_sc_lock); 522 523 /* 524 * Return kernel and user addresses. 525 */ 526 *kaddrp = ssp; 527 *uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET); 528 return (0); 529 } 530 531 532 /* 533 * Find the page control structure corresponding to a kernel address. 534 */ 535 static sc_page_ctl_t * 536 schedctl_page_lookup(sc_shared_t *ssp) 537 { 538 proc_t *p = curproc; 539 sc_page_ctl_t *pagep; 540 541 ASSERT(MUTEX_HELD(&p->p_sc_lock)); 542 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) { 543 if (ssp >= pagep->spc_base && ssp < pagep->spc_end) 544 return (pagep); 545 } 546 return (NULL); /* This "can't happen". Should we panic? */ 547 } 548 549 550 /* 551 * This function is called when a page needs to be mapped into a 552 * process's address space. Allocate the user address space and 553 * set up the mapping to the page. Assumes the page has already 554 * been allocated and locked in memory via schedctl_getpage. 555 */ 556 static int 557 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr) 558 { 559 caddr_t addr; 560 struct as *as = curproc->p_as; 561 struct segvn_crargs vn_a; 562 int error; 563 564 as_rangelock(as); 565 /* pass address of kernel mapping as offset to avoid VAC conflicts */ 566 map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0); 567 if (addr == NULL) { 568 as_rangeunlock(as); 569 return (ENOMEM); 570 } 571 572 /* 573 * Use segvn to set up the mapping to the page. 574 */ 575 vn_a.vp = NULL; 576 vn_a.offset = 0; 577 vn_a.cred = NULL; 578 vn_a.type = MAP_SHARED; 579 vn_a.prot = vn_a.maxprot = PROT_ALL; 580 vn_a.flags = 0; 581 vn_a.amp = amp; 582 vn_a.szc = 0; 583 vn_a.lgrp_mem_policy_flags = 0; 584 error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a); 585 as_rangeunlock(as); 586 587 if (error) 588 return (error); 589 590 *uaddrp = addr; 591 return (0); 592 } 593 594 595 /* 596 * Allocate a new page from anonymous memory. Also, create a kernel 597 * mapping to the page and lock the page in memory. 598 */ 599 static int 600 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) 601 { 602 struct anon_map *amp; 603 caddr_t kaddr; 604 605 /* 606 * Set up anonymous memory struct. No swap reservation is 607 * needed since the page will be locked into memory. 608 */ 609 amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP); 610 611 /* 612 * Allocate the page. 613 */ 614 kaddr = segkp_get_withanonmap(segkp, PAGESIZE, 615 KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); 616 if (kaddr == NULL) { 617 amp->refcnt--; 618 anonmap_free(amp); 619 return (ENOMEM); 620 } 621 622 /* 623 * The page is left SE_SHARED locked so that it won't be 624 * paged out or relocated (KPD_LOCKED above). 625 */ 626 627 *newamp = amp; 628 *newaddr = kaddr; 629 return (0); 630 } 631 632 633 /* 634 * Take the necessary steps to allow a page to be released. 635 * This is called when the process is doing exit() or exec(). 636 * There should be no accesses to the page after this. 637 * The kernel mapping of the page is released and the page is unlocked. 638 */ 639 static void 640 schedctl_freepage(struct anon_map *amp, caddr_t kaddr) 641 { 642 /* 643 * Release the lock on the page and remove the kernel mapping. 644 */ 645 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 646 segkp_release(segkp, kaddr); 647 648 /* 649 * Decrement the refcnt so the anon_map structure will be freed. 650 */ 651 if (--amp->refcnt == 0) { 652 /* 653 * The current process no longer has the page mapped, so 654 * we have to free everything rather than letting as_free 655 * do the work. 656 */ 657 anon_free(amp->ahp, 0, PAGESIZE); 658 ANON_LOCK_EXIT(&->a_rwlock); 659 anonmap_free(amp); 660 } else { 661 ANON_LOCK_EXIT(&->a_rwlock); 662 } 663 } 664