/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/schedctl.h>
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/class.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/door.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/lwp.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <fs/fs_subr.h>

/*
 * Page handling structures. This is set up as a list of per-page
 * control structures (sc_page_ctl), with p->p_pagep pointing to
 * the first.
The per-page structures point to the actual pages 63 * and contain pointers to the user address for each mapped page. 64 * 65 * All data is protected by p->p_sc_lock. Since this lock is 66 * held while waiting for memory, schedctl_shared_alloc() should 67 * not be called while holding p_lock. 68 */ 69 70 typedef struct sc_page_ctl { 71 struct sc_page_ctl *spc_next; 72 sc_shared_t *spc_base; /* base of kernel page */ 73 sc_shared_t *spc_end; /* end of usable space */ 74 ulong_t *spc_map; /* bitmap of allocated space on page */ 75 size_t spc_space; /* amount of space on page */ 76 caddr_t spc_uaddr; /* user-level address of the page */ 77 struct anon_map *spc_amp; /* anonymous memory structure */ 78 } sc_page_ctl_t; 79 80 static size_t sc_pagesize; /* size of usable space on page */ 81 static size_t sc_bitmap_len; /* # of bits in allocation bitmap */ 82 static size_t sc_bitmap_words; /* # of words in allocation bitmap */ 83 84 /* Context ops */ 85 static void schedctl_save(sc_shared_t *); 86 static void schedctl_restore(sc_shared_t *); 87 static void schedctl_fork(kthread_t *, kthread_t *); 88 89 /* Functions for handling shared pages */ 90 static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *); 91 static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *); 92 static int schedctl_map(struct anon_map *, caddr_t *, caddr_t); 93 static int schedctl_getpage(struct anon_map **, caddr_t *); 94 static void schedctl_freepage(struct anon_map *, caddr_t); 95 96 /* 97 * System call interface to scheduler activations. 98 * This always operates on the current lwp. 99 */ 100 caddr_t 101 schedctl(void) 102 { 103 kthread_t *t = curthread; 104 sc_shared_t *ssp; 105 uintptr_t uaddr; 106 int error; 107 108 if (t->t_schedctl == NULL) { 109 /* 110 * Allocate and initialize the shared structure. 
111 */ 112 if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0) 113 return ((caddr_t)(uintptr_t)set_errno(error)); 114 bzero(ssp, sizeof (*ssp)); 115 116 installctx(t, ssp, schedctl_save, schedctl_restore, 117 schedctl_fork, NULL, NULL, NULL); 118 119 thread_lock(t); /* protect against ts_tick and ts_update */ 120 t->t_schedctl = ssp; 121 t->t_sc_uaddr = uaddr; 122 ssp->sc_cid = t->t_cid; 123 ssp->sc_cpri = t->t_cpri; 124 ssp->sc_priority = DISP_PRIO(t); 125 thread_unlock(t); 126 } 127 128 return ((caddr_t)t->t_sc_uaddr); 129 } 130 131 132 /* 133 * Clean up scheduler activations state associated with an exiting 134 * (or execing) lwp. t is always the current thread. 135 */ 136 void 137 schedctl_lwp_cleanup(kthread_t *t) 138 { 139 sc_shared_t *ssp = t->t_schedctl; 140 proc_t *p = ttoproc(t); 141 sc_page_ctl_t *pagep; 142 index_t index; 143 144 ASSERT(MUTEX_NOT_HELD(&p->p_lock)); 145 146 thread_lock(t); /* protect against ts_tick and ts_update */ 147 t->t_schedctl = NULL; 148 t->t_sc_uaddr = 0; 149 thread_unlock(t); 150 151 /* 152 * Remove the context op to avoid the final call to 153 * schedctl_save when switching away from this lwp. 154 */ 155 (void) removectx(t, ssp, schedctl_save, schedctl_restore, 156 schedctl_fork, NULL, NULL, NULL); 157 158 /* 159 * Do not unmap the shared page until the process exits. 160 * User-level library code relies on this for adaptive mutex locking. 161 */ 162 mutex_enter(&p->p_sc_lock); 163 ssp->sc_state = SC_FREE; 164 pagep = schedctl_page_lookup(ssp); 165 index = (index_t)(ssp - pagep->spc_base); 166 BT_CLEAR(pagep->spc_map, index); 167 pagep->spc_space += sizeof (sc_shared_t); 168 mutex_exit(&p->p_sc_lock); 169 } 170 171 172 /* 173 * Cleanup the list of schedctl shared pages for the process. 174 * Called from exec() and exit() system calls. 
175 */ 176 void 177 schedctl_proc_cleanup(void) 178 { 179 proc_t *p = curproc; 180 sc_page_ctl_t *pagep; 181 sc_page_ctl_t *next; 182 183 ASSERT(p->p_lwpcnt == 1); /* we are single-threaded now */ 184 ASSERT(curthread->t_schedctl == NULL); 185 186 /* 187 * Since we are single-threaded, we don't have to hold p->p_sc_lock. 188 */ 189 pagep = p->p_pagep; 190 p->p_pagep = NULL; 191 while (pagep != NULL) { 192 ASSERT(pagep->spc_space == sc_pagesize); 193 next = pagep->spc_next; 194 /* 195 * Unmap the user space and free the mapping structure. 196 */ 197 (void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE); 198 schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base)); 199 kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words); 200 kmem_free(pagep, sizeof (sc_page_ctl_t)); 201 pagep = next; 202 } 203 } 204 205 206 /* 207 * Called by resume just before switching away from the current thread. 208 * Save new thread state. 209 */ 210 static void 211 schedctl_save(sc_shared_t *ssp) 212 { 213 ssp->sc_state = curthread->t_state; 214 } 215 216 217 /* 218 * Called by resume after switching to the current thread. 219 * Save new thread state and CPU. 220 */ 221 static void 222 schedctl_restore(sc_shared_t *ssp) 223 { 224 ssp->sc_state = SC_ONPROC; 225 ssp->sc_cpu = CPU->cpu_id; 226 } 227 228 229 /* 230 * On fork, remove inherited mappings from the child's address space. 231 * The child's threads must call schedctl() to get new shared mappings. 232 */ 233 static void 234 schedctl_fork(kthread_t *pt, kthread_t *ct) 235 { 236 proc_t *pp = ttoproc(pt); 237 proc_t *cp = ttoproc(ct); 238 sc_page_ctl_t *pagep; 239 240 ASSERT(ct->t_schedctl == NULL); 241 242 /* 243 * Do this only once, whether we are doing fork1() or forkall(). 244 * Don't do it at all if the child process is a child of vfork() 245 * because a child of vfork() borrows the parent's address space. 
246 */ 247 if (pt != curthread || (cp->p_flag & SVFORK)) 248 return; 249 250 mutex_enter(&pp->p_sc_lock); 251 for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next) 252 (void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE); 253 mutex_exit(&pp->p_sc_lock); 254 } 255 256 257 /* 258 * Returns non-zero if the specified thread shouldn't be preempted at this time. 259 * Called by ts_preempt(), ts_tick(), and ts_update(). 260 */ 261 int 262 schedctl_get_nopreempt(kthread_t *t) 263 { 264 ASSERT(THREAD_LOCK_HELD(t)); 265 return (t->t_schedctl->sc_preemptctl.sc_nopreempt); 266 } 267 268 269 /* 270 * Sets the value of the nopreempt field for the specified thread. 271 * Called by ts_preempt() to clear the field on preemption. 272 */ 273 void 274 schedctl_set_nopreempt(kthread_t *t, short val) 275 { 276 ASSERT(THREAD_LOCK_HELD(t)); 277 t->t_schedctl->sc_preemptctl.sc_nopreempt = val; 278 } 279 280 281 /* 282 * Sets the value of the yield field for the specified thread. 283 * Called by ts_preempt() and ts_tick() to set the field, and 284 * ts_yield() to clear it. 285 * The kernel never looks at this field so we don't need a 286 * schedctl_get_yield() function. 287 */ 288 void 289 schedctl_set_yield(kthread_t *t, short val) 290 { 291 ASSERT(THREAD_LOCK_HELD(t)); 292 t->t_schedctl->sc_preemptctl.sc_yield = val; 293 } 294 295 296 /* 297 * Sets the values of the cid and priority fields for the specified thread. 298 * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI(). 299 * Called following calls to CL_FORKRET() and CL_ENTERCLASS(). 300 */ 301 void 302 schedctl_set_cidpri(kthread_t *t) 303 { 304 sc_shared_t *tdp = t->t_schedctl; 305 306 if (tdp != NULL) { 307 tdp->sc_cid = t->t_cid; 308 tdp->sc_cpri = t->t_cpri; 309 tdp->sc_priority = DISP_PRIO(t); 310 } 311 } 312 313 314 /* 315 * Returns non-zero if the specified thread has requested that all 316 * signals be blocked. 
Called by signal-related code that tests 317 * the signal mask of a thread that may not be the current thread 318 * and where the process's p_lock cannot be acquired. 319 */ 320 int 321 schedctl_sigblock(kthread_t *t) 322 { 323 sc_shared_t *tdp = t->t_schedctl; 324 325 if (tdp != NULL) 326 return (tdp->sc_sigblock); 327 return (0); 328 } 329 330 331 /* 332 * If the sc_sigblock field is set for the specified thread, set 333 * its signal mask to block all maskable signals, then clear the 334 * sc_sigblock field. This finishes what user-level code requested 335 * to be done when it set tdp->sc_shared->sc_sigblock non-zero. 336 * Called by signal-related code that holds the process's p_lock. 337 */ 338 void 339 schedctl_finish_sigblock(kthread_t *t) 340 { 341 sc_shared_t *tdp = t->t_schedctl; 342 343 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 344 345 if (tdp != NULL && tdp->sc_sigblock) { 346 t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0; 347 t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1; 348 tdp->sc_sigblock = 0; 349 } 350 } 351 352 353 /* 354 * Return non-zero if the current thread has declared that it has 355 * a cancellation pending and that cancellation is not disabled. 356 * If SIGCANCEL is blocked, we must be going over the wire in an 357 * NFS transaction (sigintr() was called); return zero in this case. 358 */ 359 int 360 schedctl_cancel_pending(void) 361 { 362 sc_shared_t *tdp = curthread->t_schedctl; 363 364 if (tdp != NULL && 365 (tdp->sc_flgs & SC_CANCEL_FLG) && 366 !tdp->sc_sigblock && 367 !sigismember(&curthread->t_hold, SIGCANCEL)) 368 return (1); 369 return (0); 370 } 371 372 373 /* 374 * Inform libc that the kernel returned EINTR from some system call 375 * due to there being a cancellation pending (SC_CANCEL_FLG set or 376 * we received an SI_LWP SIGCANCEL while in a system call), rather 377 * than because of some other signal. User-level code can try to 378 * recover from receiving other signals, but it can't recover from 379 * being cancelled. 
380 */ 381 void 382 schedctl_cancel_eintr(void) 383 { 384 sc_shared_t *tdp = curthread->t_schedctl; 385 386 if (tdp != NULL) 387 tdp->sc_flgs |= SC_EINTR_FLG; 388 } 389 390 391 /* 392 * Return non-zero if the current thread has declared that 393 * it is calling into the kernel to park, else return zero. 394 */ 395 int 396 schedctl_is_park(void) 397 { 398 sc_shared_t *tdp = curthread->t_schedctl; 399 400 if (tdp != NULL) 401 return ((tdp->sc_flgs & SC_PARK_FLG) != 0); 402 /* 403 * If we're here and there is no shared memory (how could 404 * that happen?) then just assume we really are here to park. 405 */ 406 return (1); 407 } 408 409 410 /* 411 * Declare thread is parking. 412 * 413 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid) 414 * in order to declare that the thread is calling into the kernel to park. 415 * 416 * This interface exists ONLY to support older versions of libthread which 417 * are not aware of the SC_PARK_FLG flag. 418 * 419 * Older versions of libthread which are not aware of the SC_PARK_FLG flag 420 * need to be modified or emulated to call lwpsys_park(4, ...) instead of 421 * lwpsys_park(0, ...). This will invoke schedctl_set_park() before 422 * lwp_park() to declare that the thread is parking. 423 */ 424 void 425 schedctl_set_park(void) 426 { 427 sc_shared_t *tdp = curthread->t_schedctl; 428 if (tdp != NULL) 429 tdp->sc_flgs |= SC_PARK_FLG; 430 } 431 432 433 /* 434 * Clear the parking flag on return from parking in the kernel. 435 */ 436 void 437 schedctl_unpark(void) 438 { 439 sc_shared_t *tdp = curthread->t_schedctl; 440 441 if (tdp != NULL) 442 tdp->sc_flgs &= ~SC_PARK_FLG; 443 } 444 445 446 /* 447 * Page handling code. 448 */ 449 450 void 451 schedctl_init(void) 452 { 453 /* 454 * Amount of page that can hold sc_shared_t structures. If 455 * sizeof (sc_shared_t) is a power of 2, this should just be 456 * PAGESIZE. 
457 */ 458 sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t)); 459 460 /* 461 * Allocation bitmap is one bit per struct on a page. 462 */ 463 sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t); 464 sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL); 465 } 466 467 468 static int 469 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp) 470 { 471 proc_t *p = curproc; 472 sc_page_ctl_t *pagep; 473 sc_shared_t *ssp; 474 caddr_t base; 475 index_t index; 476 int error; 477 478 ASSERT(MUTEX_NOT_HELD(&p->p_lock)); 479 mutex_enter(&p->p_sc_lock); 480 481 /* 482 * Try to find space for the new data in existing pages 483 * within the process's list of shared pages. 484 */ 485 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) 486 if (pagep->spc_space != 0) 487 break; 488 489 if (pagep != NULL) 490 base = pagep->spc_uaddr; 491 else { 492 struct anon_map *amp; 493 caddr_t kaddr; 494 495 /* 496 * No room, need to allocate a new page. Also set up 497 * a mapping to the kernel address space for the new 498 * page and lock it in memory. 499 */ 500 if ((error = schedctl_getpage(&, &kaddr)) != 0) { 501 mutex_exit(&p->p_sc_lock); 502 return (error); 503 } 504 if ((error = schedctl_map(amp, &base, kaddr)) != 0) { 505 schedctl_freepage(amp, kaddr); 506 mutex_exit(&p->p_sc_lock); 507 return (error); 508 } 509 510 /* 511 * Allocate and initialize the page control structure. 512 */ 513 pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP); 514 pagep->spc_amp = amp; 515 pagep->spc_base = (sc_shared_t *)kaddr; 516 pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize); 517 pagep->spc_uaddr = base; 518 519 pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words, 520 KM_SLEEP); 521 pagep->spc_space = sc_pagesize; 522 523 pagep->spc_next = p->p_pagep; 524 p->p_pagep = pagep; 525 } 526 527 /* 528 * Got a page, now allocate space for the data. There should 529 * be space unless something's wrong. 
530 */ 531 ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t)); 532 index = bt_availbit(pagep->spc_map, sc_bitmap_len); 533 ASSERT(index != -1); 534 535 /* 536 * Get location with pointer arithmetic. spc_base is of type 537 * sc_shared_t *. Mark as allocated. 538 */ 539 ssp = pagep->spc_base + index; 540 BT_SET(pagep->spc_map, index); 541 pagep->spc_space -= sizeof (sc_shared_t); 542 543 mutex_exit(&p->p_sc_lock); 544 545 /* 546 * Return kernel and user addresses. 547 */ 548 *kaddrp = ssp; 549 *uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET); 550 return (0); 551 } 552 553 554 /* 555 * Find the page control structure corresponding to a kernel address. 556 */ 557 static sc_page_ctl_t * 558 schedctl_page_lookup(sc_shared_t *ssp) 559 { 560 proc_t *p = curproc; 561 sc_page_ctl_t *pagep; 562 563 ASSERT(MUTEX_HELD(&p->p_sc_lock)); 564 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) { 565 if (ssp >= pagep->spc_base && ssp < pagep->spc_end) 566 return (pagep); 567 } 568 return (NULL); /* This "can't happen". Should we panic? */ 569 } 570 571 572 /* 573 * This function is called when a page needs to be mapped into a 574 * process's address space. Allocate the user address space and 575 * set up the mapping to the page. Assumes the page has already 576 * been allocated and locked in memory via schedctl_getpage. 577 */ 578 static int 579 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr) 580 { 581 caddr_t addr = NULL; 582 struct as *as = curproc->p_as; 583 struct segvn_crargs vn_a; 584 int error; 585 586 as_rangelock(as); 587 /* pass address of kernel mapping as offset to avoid VAC conflicts */ 588 map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0); 589 if (addr == NULL) { 590 as_rangeunlock(as); 591 return (ENOMEM); 592 } 593 594 /* 595 * Use segvn to set up the mapping to the page. 
596 */ 597 vn_a.vp = NULL; 598 vn_a.offset = 0; 599 vn_a.cred = NULL; 600 vn_a.type = MAP_SHARED; 601 vn_a.prot = vn_a.maxprot = PROT_ALL; 602 vn_a.flags = 0; 603 vn_a.amp = amp; 604 vn_a.szc = 0; 605 vn_a.lgrp_mem_policy_flags = 0; 606 error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a); 607 as_rangeunlock(as); 608 609 if (error) 610 return (error); 611 612 *uaddrp = addr; 613 return (0); 614 } 615 616 617 /* 618 * Allocate a new page from anonymous memory. Also, create a kernel 619 * mapping to the page and lock the page in memory. 620 */ 621 static int 622 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) 623 { 624 struct anon_map *amp; 625 caddr_t kaddr; 626 627 /* 628 * Set up anonymous memory struct. No swap reservation is 629 * needed since the page will be locked into memory. 630 */ 631 amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP); 632 633 /* 634 * Allocate the page. 635 */ 636 kaddr = segkp_get_withanonmap(segkp, PAGESIZE, 637 KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); 638 if (kaddr == NULL) { 639 amp->refcnt--; 640 anonmap_free(amp); 641 return (ENOMEM); 642 } 643 644 /* 645 * The page is left SE_SHARED locked so that it won't be 646 * paged out or relocated (KPD_LOCKED above). 647 */ 648 649 *newamp = amp; 650 *newaddr = kaddr; 651 return (0); 652 } 653 654 655 /* 656 * Take the necessary steps to allow a page to be released. 657 * This is called when the process is doing exit() or exec(). 658 * There should be no accesses to the page after this. 659 * The kernel mapping of the page is released and the page is unlocked. 660 */ 661 static void 662 schedctl_freepage(struct anon_map *amp, caddr_t kaddr) 663 { 664 /* 665 * Release the lock on the page and remove the kernel mapping. 666 */ 667 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 668 segkp_release(segkp, kaddr); 669 670 /* 671 * Decrement the refcnt so the anon_map structure will be freed. 
672 */ 673 if (--amp->refcnt == 0) { 674 /* 675 * The current process no longer has the page mapped, so 676 * we have to free everything rather than letting as_free 677 * do the work. 678 */ 679 anon_free(amp->ahp, 0, PAGESIZE); 680 ANON_LOCK_EXIT(&->a_rwlock); 681 anonmap_free(amp); 682 } else { 683 ANON_LOCK_EXIT(&->a_rwlock); 684 } 685 } 686