/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/schedctl.h>
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/class.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/door.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/lwp.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <fs/fs_subr.h>

/*
 * Page handling structures.  This is set up as a list of per-page
 * control structures (sc_page_ctl), with p->p_pagep pointing to
 * the first.  The per-page structures point to the actual pages
 * and contain pointers to the user address for each mapped page.
 *
 * All data is protected by p->p_sc_lock.  Since this lock is
 * held while waiting for memory, schedctl_shared_alloc() should
 * not be called while holding p_lock.
 */

typedef struct sc_page_ctl {
        struct sc_page_ctl *spc_next;
        sc_shared_t     *spc_base;      /* base of kernel page */
        sc_shared_t     *spc_end;       /* end of usable space */
        ulong_t         *spc_map;       /* bitmap of allocated space on page */
        size_t          spc_space;      /* amount of space on page */
        caddr_t         spc_uaddr;      /* user-level address of the page */
        struct anon_map *spc_amp;       /* anonymous memory structure */
} sc_page_ctl_t;

static size_t   sc_pagesize;            /* size of usable space on page */
static size_t   sc_bitmap_len;          /* # of bits in allocation bitmap */
static size_t   sc_bitmap_words;        /* # of words in allocation bitmap */

/* Context ops */
static void     schedctl_save(sc_shared_t *);
static void     schedctl_restore(sc_shared_t *);
static void     schedctl_fork(kthread_t *, kthread_t *);

/* Functions for handling shared pages */
static int      schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
static int      schedctl_map(struct anon_map *, caddr_t *, caddr_t);
static int      schedctl_getpage(struct anon_map **, caddr_t *);
static void     schedctl_freepage(struct anon_map *, caddr_t);
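/*
 * Illustrative sketch (for orientation only, not a definitive interface
 * description): user-level code reaches the sc_shared_t data handed out
 * by schedctl() below through the documented libc wrappers
 * schedctl_init(3C), schedctl_start(3C) and schedctl_stop(3C), roughly:
 *
 *      schedctl_t *scp = schedctl_init();      (map the shared data)
 *      schedctl_start(scp);                    (hint: don't preempt me)
 *      ...short critical section, e.g. a user-level spin lock is held...
 *      schedctl_stop(scp);                     (hint cleared; yield if asked)
 */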
/*
 * System call interface to scheduler activations.
 * This always operates on the current lwp.
 */
caddr_t
schedctl(void)
{
        kthread_t       *t = curthread;
        sc_shared_t     *ssp;
        uintptr_t       uaddr;
        int             error;

        if (t->t_schedctl == NULL) {
                /*
                 * Allocate and initialize the shared structure.
                 */
                if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
                        return ((caddr_t)(uintptr_t)set_errno(error));
                bzero(ssp, sizeof (*ssp));

                installctx(t, ssp, schedctl_save, schedctl_restore,
                    schedctl_fork, NULL, NULL, NULL);

                thread_lock(t); /* protect against ts_tick and ts_update */
                t->t_schedctl = ssp;
                t->t_sc_uaddr = uaddr;
                ssp->sc_cid = t->t_cid;
                ssp->sc_cpri = t->t_cpri;
                ssp->sc_priority = DISP_PRIO(t);
                thread_unlock(t);
        }

        return ((caddr_t)t->t_sc_uaddr);
}


/*
 * Clean up scheduler activations state associated with an exiting
 * (or execing) lwp.  t is always the current thread.
 */
void
schedctl_lwp_cleanup(kthread_t *t)
{
        sc_shared_t     *ssp = t->t_schedctl;
        proc_t          *p = ttoproc(t);
        sc_page_ctl_t   *pagep;
        index_t         index;

        ASSERT(MUTEX_NOT_HELD(&p->p_lock));

        thread_lock(t);         /* protect against ts_tick and ts_update */
        t->t_schedctl = NULL;
        t->t_sc_uaddr = 0;
        thread_unlock(t);

        /*
         * Remove the context op to avoid the final call to
         * schedctl_save when switching away from this lwp.
         */
        (void) removectx(t, ssp, schedctl_save, schedctl_restore,
            schedctl_fork, NULL, NULL, NULL);

        /*
         * Do not unmap the shared page until the process exits.
         * User-level library code relies on this for adaptive mutex locking.
         */
        mutex_enter(&p->p_sc_lock);
        ssp->sc_state = SC_FREE;
        pagep = schedctl_page_lookup(ssp);
        index = (index_t)(ssp - pagep->spc_base);
        BT_CLEAR(pagep->spc_map, index);
        pagep->spc_space += sizeof (sc_shared_t);
        mutex_exit(&p->p_sc_lock);
}


/*
 * Cleanup the list of schedctl shared pages for the process.
 * Called from exec() and exit() system calls.
 */
void
schedctl_proc_cleanup(void)
{
        proc_t          *p = curproc;
        sc_page_ctl_t   *pagep;
        sc_page_ctl_t   *next;

        ASSERT(p->p_lwpcnt == 1);       /* we are single-threaded now */
        ASSERT(curthread->t_schedctl == NULL);

        /*
         * Since we are single-threaded, we don't have to hold p->p_sc_lock.
         */
        pagep = p->p_pagep;
        p->p_pagep = NULL;
        while (pagep != NULL) {
                ASSERT(pagep->spc_space == sc_pagesize);
                next = pagep->spc_next;
                /*
                 * Unmap the user space and free the mapping structure.
                 */
                (void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
                schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
                kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
                kmem_free(pagep, sizeof (sc_page_ctl_t));
                pagep = next;
        }
}


/*
 * Called by resume just before switching away from the current thread.
 * Save new thread state.
 */
static void
schedctl_save(sc_shared_t *ssp)
{
        ssp->sc_state = curthread->t_state;
}


/*
 * Called by resume after switching to the current thread.
 * Save new thread state and CPU.
 */
static void
schedctl_restore(sc_shared_t *ssp)
{
        ssp->sc_state = SC_ONPROC;
        ssp->sc_cpu = CPU->cpu_id;
}
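/*
 * Sketch of why sc_state and sc_cpu are kept current by the context ops
 * above (an assumption about the user-level library, for orientation
 * only): adaptive mutex code in libc spins on a held lock only while the
 * owner appears to be running, roughly:
 *
 *      while (lock is held and owner_ssp->sc_state == SC_ONPROC)
 *              keep spinning;          (owner is on a CPU, may release soon)
 *      otherwise block in the kernel;  (owner is not running)
 *
 * where owner_ssp is the owning thread's shared mapping.  This is also
 * why schedctl_lwp_cleanup() above leaves the page mapped until the
 * process exits.
 */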
/*
 * On fork, remove inherited mappings from the child's address space.
 * The child's threads must call schedctl() to get new shared mappings.
 */
static void
schedctl_fork(kthread_t *pt, kthread_t *ct)
{
        proc_t  *pp = ttoproc(pt);
        proc_t  *cp = ttoproc(ct);
        sc_page_ctl_t *pagep;

        ASSERT(ct->t_schedctl == NULL);

        /*
         * Do this only once, whether we are doing fork1() or forkall().
         * Don't do it at all if the child process is a child of vfork()
         * because a child of vfork() borrows the parent's address space.
         */
        if (pt != curthread || (cp->p_flag & SVFORK))
                return;

        mutex_enter(&pp->p_sc_lock);
        for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
                (void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
        mutex_exit(&pp->p_sc_lock);
}


/*
 * Returns non-zero if the specified thread shouldn't be preempted at
 * this time.  Called by ts_preempt(), ts_tick(), and ts_update().
 */
int
schedctl_get_nopreempt(kthread_t *t)
{
        ASSERT(THREAD_LOCK_HELD(t));
        return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
}


/*
 * Sets the value of the nopreempt field for the specified thread.
 * Called by ts_preempt() to clear the field on preemption.
 */
void
schedctl_set_nopreempt(kthread_t *t, short val)
{
        ASSERT(THREAD_LOCK_HELD(t));
        t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
}


/*
 * Sets the value of the yield field for the specified thread.
 * Called by ts_preempt() and ts_tick() to set the field, and
 * ts_yield() to clear it.
 * The kernel never looks at this field so we don't need a
 * schedctl_get_yield() function.
 */
void
schedctl_set_yield(kthread_t *t, short val)
{
        ASSERT(THREAD_LOCK_HELD(t));
        t->t_schedctl->sc_preemptctl.sc_yield = val;
}


/*
 * Sets the values of the cid and priority fields for the specified thread.
 * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
 * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
 */
void
schedctl_set_cidpri(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        if (tdp != NULL) {
                tdp->sc_cid = t->t_cid;
                tdp->sc_cpri = t->t_cpri;
                tdp->sc_priority = DISP_PRIO(t);
        }
}


/*
 * Returns non-zero if the specified thread has requested that all
 * signals be blocked.  Called by signal-related code that tests
 * the signal mask of a thread that may not be the current thread
 * and where the process's p_lock cannot be acquired.
 */
int
schedctl_sigblock(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        if (tdp != NULL)
                return (tdp->sc_sigblock);
        return (0);
}
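/*
 * Sketch of the user-level side of sc_sigblock (an assumption about the
 * library protocol, for orientation only): instead of paying for a
 * sigprocmask() system call, user-level code stores non-zero in its
 * sc_sigblock field to request that all maskable signals be treated as
 * blocked:
 *
 *      tdp->sc_sigblock = 1;   (block all signals, no system call)
 *      ...code that must not be interrupted by a signal...
 *
 * The kernel consults the flag via schedctl_sigblock() above and turns
 * the request into a real signal mask in schedctl_finish_sigblock()
 * below, which also clears the flag.
 */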
/*
 * If the sc_sigblock field is set for the specified thread, set
 * its signal mask to block all maskable signals, then clear the
 * sc_sigblock field.  This finishes what user-level code requested
 * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
 * Called from signal-related code either by the current thread for
 * itself or by a thread that holds the process's p_lock (/proc code).
 */
void
schedctl_finish_sigblock(kthread_t *t)
{
        sc_shared_t *tdp = t->t_schedctl;

        ASSERT(t == curthread || MUTEX_HELD(&ttoproc(t)->p_lock));

        if (tdp != NULL && tdp->sc_sigblock) {
                t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
                t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
                tdp->sc_sigblock = 0;
        }
}


/*
 * Return non-zero if the current thread has declared that it has
 * a cancellation pending and that cancellation is not disabled.
 * If SIGCANCEL is blocked, we must be going over the wire in an
 * NFS transaction (sigintr() was called); return zero in this case.
 */
int
schedctl_cancel_pending(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL &&
            (tdp->sc_flgs & SC_CANCEL_FLG) &&
            !tdp->sc_sigblock &&
            !sigismember(&curthread->t_hold, SIGCANCEL))
                return (1);
        return (0);
}


/*
 * Inform libc that the kernel returned EINTR from some system call
 * due to there being a cancellation pending (SC_CANCEL_FLG set or
 * we received an SI_LWP SIGCANCEL while in a system call), rather
 * than because of some other signal.  User-level code can try to
 * recover from receiving other signals, but it can't recover from
 * being cancelled.
 */
void
schedctl_cancel_eintr(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs |= SC_EINTR_FLG;
}


/*
 * Return non-zero if the current thread has declared that
 * it is calling into the kernel to park, else return zero.
 */
int
schedctl_is_park(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                return ((tdp->sc_flgs & SC_PARK_FLG) != 0);
        /*
         * If we're here and there is no shared memory (how could
         * that happen?) then just assume we really are here to park.
         */
        return (1);
}


/*
 * Declare thread is parking.
 *
 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
 * in order to declare that the thread is calling into the kernel to park.
 *
 * This interface exists ONLY to support older versions of libthread that
 * are not aware of the SC_PARK_FLG flag.  Such versions need to be
 * modified or emulated to call lwpsys_park(4, ...) instead of
 * lwpsys_park(0, ...); that form invokes schedctl_set_park() before
 * lwp_park() to declare that the thread is parking.
 */
void
schedctl_set_park(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs |= SC_PARK_FLG;
}


/*
 * Clear the parking flag on return from parking in the kernel.
 */
void
schedctl_unpark(void)
{
        sc_shared_t *tdp = curthread->t_schedctl;

        if (tdp != NULL)
                tdp->sc_flgs &= ~SC_PARK_FLG;
}


/*
 * Page handling code.
 */

void
schedctl_init(void)
{
        /*
         * Amount of page that can hold sc_shared_t structures.  If
         * sizeof (sc_shared_t) is a power of 2, this should just be
         * PAGESIZE.
         */
        sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));

        /*
         * Allocation bitmap is one bit per struct on a page.
         */
        sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
        sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
}
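/*
 * Worked example of the sizing above, using a hypothetical structure
 * size (the real sizeof (sc_shared_t) is platform-dependent):
 *
 *      PAGESIZE = 8192, sizeof (sc_shared_t) = 96
 *      sc_pagesize     = 8192 - (8192 % 96) = 8160
 *      sc_bitmap_len   = 8160 / 96          = 85   (structs per page)
 *      sc_bitmap_words = howmany(85, 64)    = 2    (BT_NBIPUL is 64 on
 *                                                   a 64-bit kernel)
 */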
static int
schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
{
        proc_t          *p = curproc;
        sc_page_ctl_t   *pagep;
        sc_shared_t     *ssp;
        caddr_t         base;
        index_t         index;
        int             error;

        ASSERT(MUTEX_NOT_HELD(&p->p_lock));
        mutex_enter(&p->p_sc_lock);

        /*
         * Try to find space for the new data in existing pages
         * within the process's list of shared pages.
         */
        for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
                if (pagep->spc_space != 0)
                        break;

        if (pagep != NULL)
                base = pagep->spc_uaddr;
        else {
                struct anon_map *amp;
                caddr_t kaddr;

                /*
                 * No room, need to allocate a new page.  Also set up
                 * a mapping to the kernel address space for the new
                 * page and lock it in memory.
                 */
                if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
                        mutex_exit(&p->p_sc_lock);
                        return (error);
                }
                if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
                        schedctl_freepage(amp, kaddr);
                        mutex_exit(&p->p_sc_lock);
                        return (error);
                }

                /*
                 * Allocate and initialize the page control structure.
                 */
                pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
                pagep->spc_amp = amp;
                pagep->spc_base = (sc_shared_t *)kaddr;
                pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
                pagep->spc_uaddr = base;

                pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
                    KM_SLEEP);
                pagep->spc_space = sc_pagesize;

                pagep->spc_next = p->p_pagep;
                p->p_pagep = pagep;
        }

        /*
         * Got a page, now allocate space for the data.  There should
         * be space unless something's wrong.
         */
        ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
        index = bt_availbit(pagep->spc_map, sc_bitmap_len);
        ASSERT(index != -1);

        /*
         * Get location with pointer arithmetic.  spc_base is of type
         * sc_shared_t *.  Mark as allocated.
         */
        ssp = pagep->spc_base + index;
        BT_SET(pagep->spc_map, index);
        pagep->spc_space -= sizeof (sc_shared_t);

        mutex_exit(&p->p_sc_lock);

        /*
         * Return kernel and user addresses.
         */
        *kaddrp = ssp;
        *uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
        return (0);
}


/*
 * Find the page control structure corresponding to a kernel address.
 */
static sc_page_ctl_t *
schedctl_page_lookup(sc_shared_t *ssp)
{
        proc_t *p = curproc;
        sc_page_ctl_t *pagep;

        ASSERT(MUTEX_HELD(&p->p_sc_lock));
        for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
                if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
                        return (pagep);
        }
        return (NULL);          /* This "can't happen".  Should we panic? */
}
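/*
 * Note on the address arithmetic above (for orientation): each shared
 * page has two mappings of the same anon page, a kernel mapping
 * (spc_base, from segkp) used by this file and a per-process user
 * mapping (spc_uaddr, from segvn).  A structure handed out at kernel
 * address ssp is therefore visible to the process at
 *
 *      spc_uaddr + ((uintptr_t)ssp & PAGEOFFSET)
 *
 * which is how schedctl_shared_alloc() computes the *uaddrp it returns.
 */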
/*
 * This function is called when a page needs to be mapped into a
 * process's address space.  Allocate the user address space and
 * set up the mapping to the page.  Assumes the page has already
 * been allocated and locked in memory via schedctl_getpage.
 */
static int
schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
{
        caddr_t addr = NULL;
        struct as *as = curproc->p_as;
        struct segvn_crargs vn_a;
        int error;

        as_rangelock(as);
        /* pass address of kernel mapping as offset to avoid VAC conflicts */
        map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
        if (addr == NULL) {
                as_rangeunlock(as);
                return (ENOMEM);
        }

        /*
         * Use segvn to set up the mapping to the page.
         */
        vn_a.vp = NULL;
        vn_a.offset = 0;
        vn_a.cred = NULL;
        vn_a.type = MAP_SHARED;
        vn_a.prot = vn_a.maxprot = PROT_ALL;
        vn_a.flags = 0;
        vn_a.amp = amp;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;
        error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
        as_rangeunlock(as);

        if (error)
                return (error);

        *uaddrp = addr;
        return (0);
}


/*
 * Allocate a new page from anonymous memory.  Also, create a kernel
 * mapping to the page and lock the page in memory.
 */
static int
schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
{
        struct anon_map *amp;
        caddr_t kaddr;

        /*
         * Set up anonymous memory struct.  No swap reservation is
         * needed since the page will be locked into memory.
         */
        amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);

        /*
         * Allocate the page.
         */
        kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
            KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
        if (kaddr == NULL) {
                amp->refcnt--;
                anonmap_free(amp);
                return (ENOMEM);
        }

        /*
         * The page is left SE_SHARED locked so that it won't be
         * paged out or relocated (KPD_LOCKED above).
         */

        *newamp = amp;
        *newaddr = kaddr;
        return (0);
}


/*
 * Take the necessary steps to allow a page to be released.
 * This is called when the process is doing exit() or exec().
 * There should be no accesses to the page after this.
 * The kernel mapping of the page is released and the page is unlocked.
 */
static void
schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
{
        /*
         * Release the lock on the page and remove the kernel mapping.
         */
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
        segkp_release(segkp, kaddr);

        /*
         * Decrement the refcnt so the anon_map structure will be freed.
         */
        if (--amp->refcnt == 0) {
                /*
                 * The current process no longer has the page mapped, so
                 * we have to free everything rather than letting as_free
                 * do the work.
                 */
                anonmap_purge(amp);
                anon_free(amp->ahp, 0, PAGESIZE);
                ANON_LOCK_EXIT(&amp->a_rwlock);
                anonmap_free(amp);
        } else {
                ANON_LOCK_EXIT(&amp->a_rwlock);
        }
}