1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/systm.h> 30 #include <sys/schedctl.h> 31 #include <sys/proc.h> 32 #include <sys/thread.h> 33 #include <sys/class.h> 34 #include <sys/cred.h> 35 #include <sys/kmem.h> 36 #include <sys/cmn_err.h> 37 #include <sys/stack.h> 38 #include <sys/debug.h> 39 #include <sys/cpuvar.h> 40 #include <sys/sobject.h> 41 #include <sys/door.h> 42 #include <sys/modctl.h> 43 #include <sys/syscall.h> 44 #include <sys/sysmacros.h> 45 #include <sys/vmsystm.h> 46 #include <sys/mman.h> 47 #include <sys/vnode.h> 48 #include <sys/swap.h> 49 #include <sys/lwp.h> 50 #include <sys/bitmap.h> 51 #include <sys/atomic.h> 52 #include <sys/fcntl.h> 53 #include <vm/seg_kp.h> 54 #include <vm/seg_vn.h> 55 #include <vm/as.h> 56 #include <fs/fs_subr.h> 57 58 59 /* 60 * Page handling structures. This is set up as a list of per-page 61 * control structures (sc_page_ctl), with p->p_pagep pointing to 62 * the first. The per-page structures point to the actual pages 63 * and contain pointers to the user address for each mapped page. 64 * 65 * All data is protected by p->p_sc_lock. Since this lock is 66 * held while waiting for memory, schedctl_shared_alloc() should 67 * not be called while holding p_lock. 68 */ 69 70 typedef struct sc_page_ctl { 71 struct sc_page_ctl *spc_next; 72 sc_shared_t *spc_base; /* base of kernel page */ 73 sc_shared_t *spc_end; /* end of usable space */ 74 ulong_t *spc_map; /* bitmap of allocated space on page */ 75 size_t spc_space; /* amount of space on page */ 76 caddr_t spc_uaddr; /* user-level address of the page */ 77 struct anon_map *spc_amp; /* anonymous memory structure */ 78 } sc_page_ctl_t; 79 80 static size_t sc_pagesize; /* size of usable space on page */ 81 static size_t sc_bitmap_len; /* # of bits in allocation bitmap */ 82 static size_t sc_bitmap_words; /* # of words in allocation bitmap */ 83 84 /* Context ops */ 85 static void schedctl_save(sc_shared_t *); 86 static void schedctl_restore(sc_shared_t *); 87 static void schedctl_fork(kthread_t *, kthread_t *); 88 89 /* Functions for handling shared pages */ 90 static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *); 91 static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *); 92 static int schedctl_map(struct anon_map *, caddr_t *, caddr_t); 93 static int schedctl_getpage(struct anon_map **, caddr_t *); 94 static void schedctl_freepage(struct anon_map *, caddr_t); 95 96 /* 97 * System call interface to scheduler activations. 98 * This always operates on the current lwp. 99 */ 100 caddr_t 101 schedctl(void) 102 { 103 kthread_t *t = curthread; 104 sc_shared_t *ssp; 105 uintptr_t uaddr; 106 int error; 107 108 if (t->t_schedctl == NULL) { 109 /* 110 * Allocate and initialize the shared structure. 111 */ 112 if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0) 113 return ((caddr_t)(uintptr_t)set_errno(error)); 114 bzero(ssp, sizeof (*ssp)); 115 116 installctx(t, ssp, schedctl_save, schedctl_restore, 117 schedctl_fork, NULL, NULL, NULL); 118 119 thread_lock(t); /* protect against ts_tick and ts_update */ 120 t->t_schedctl = ssp; 121 t->t_sc_uaddr = uaddr; 122 thread_unlock(t); 123 } 124 125 return ((caddr_t)t->t_sc_uaddr); 126 } 127 128 129 /* 130 * Clean up scheduler activations state associated with an exiting 131 * (or execing) lwp. t is always the current thread. 132 */ 133 void 134 schedctl_lwp_cleanup(kthread_t *t) 135 { 136 sc_shared_t *ssp = t->t_schedctl; 137 proc_t *p = ttoproc(t); 138 sc_page_ctl_t *pagep; 139 index_t index; 140 141 ASSERT(MUTEX_NOT_HELD(&p->p_lock)); 142 143 thread_lock(t); /* protect against ts_tick and ts_update */ 144 t->t_schedctl = NULL; 145 t->t_sc_uaddr = 0; 146 thread_unlock(t); 147 148 /* 149 * Remove the context op to avoid the final call to 150 * schedctl_save when switching away from this lwp. 151 */ 152 (void) removectx(t, ssp, schedctl_save, schedctl_restore, 153 schedctl_fork, NULL, NULL, NULL); 154 155 /* 156 * Do not unmap the shared page until the process exits. 157 * User-level library code relies on this for adaptive mutex locking. 158 */ 159 mutex_enter(&p->p_sc_lock); 160 ssp->sc_state = SC_FREE; 161 pagep = schedctl_page_lookup(ssp); 162 index = (index_t)(ssp - pagep->spc_base); 163 BT_CLEAR(pagep->spc_map, index); 164 pagep->spc_space += sizeof (sc_shared_t); 165 mutex_exit(&p->p_sc_lock); 166 } 167 168 /* 169 * Cleanup the list of schedctl shared pages for the process. 170 * Called from exec() and exit() system calls. 171 */ 172 void 173 schedctl_proc_cleanup() 174 { 175 proc_t *p = curproc; 176 sc_page_ctl_t *pagep; 177 sc_page_ctl_t *next; 178 179 ASSERT(p->p_lwpcnt == 1); /* we are single-threaded now */ 180 ASSERT(curthread->t_schedctl == NULL); 181 182 /* 183 * Since we are single-threaded, we don't have to hold p->p_sc_lock. 184 */ 185 pagep = p->p_pagep; 186 p->p_pagep = NULL; 187 while (pagep != NULL) { 188 ASSERT(pagep->spc_space == sc_pagesize); 189 next = pagep->spc_next; 190 /* 191 * Unmap the user space and free the mapping structure. 192 */ 193 (void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE); 194 schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base)); 195 kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words); 196 kmem_free(pagep, sizeof (sc_page_ctl_t)); 197 pagep = next; 198 } 199 } 200 201 /* 202 * Called by resume just before switching away from the current thread. 203 * Save new thread state. 204 */ 205 void 206 schedctl_save(sc_shared_t *ssp) 207 { 208 ssp->sc_state = curthread->t_state; 209 } 210 211 212 /* 213 * Called by resume after switching to the current thread. 214 * Save new thread state and CPU. 215 */ 216 void 217 schedctl_restore(sc_shared_t *ssp) 218 { 219 ssp->sc_state = SC_ONPROC; 220 ssp->sc_cpu = CPU->cpu_id; 221 } 222 223 224 /* 225 * On fork, remove inherited mappings from the child's address space. 226 * The child's threads must call schedctl() to get new shared mappings. 227 */ 228 void 229 schedctl_fork(kthread_t *pt, kthread_t *ct) 230 { 231 proc_t *pp = ttoproc(pt); 232 proc_t *cp = ttoproc(ct); 233 sc_page_ctl_t *pagep; 234 235 ASSERT(ct->t_schedctl == NULL); 236 237 /* 238 * Do this only once, whether we are doing fork1() or forkall(). 239 * Don't do it at all if the child process is a child of vfork() 240 * because a child of vfork() borrows the parent's address space. 241 */ 242 if (pt != curthread || (cp->p_flag & SVFORK)) 243 return; 244 245 mutex_enter(&pp->p_sc_lock); 246 for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next) 247 (void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE); 248 mutex_exit(&pp->p_sc_lock); 249 } 250 251 /* 252 * Returns non-zero if the specified thread shouldn't be preempted at this time. 253 * Called by ts_preempt, ts_tick, and ts_update. 254 */ 255 int 256 schedctl_get_nopreempt(kthread_t *t) 257 { 258 ASSERT(THREAD_LOCK_HELD(t)); 259 return (t->t_schedctl->sc_preemptctl.sc_nopreempt); 260 } 261 262 263 /* 264 * Sets the value of the nopreempt field for the specified thread. 265 * Called by ts_preempt to clear the field on preemption. 266 */ 267 void 268 schedctl_set_nopreempt(kthread_t *t, short val) 269 { 270 ASSERT(THREAD_LOCK_HELD(t)); 271 t->t_schedctl->sc_preemptctl.sc_nopreempt = val; 272 } 273 274 275 /* 276 * Sets the value of the yield field for the specified thread. Called by 277 * ts_preempt and ts_tick to set the field, and ts_yield to clear it. 278 * The kernel never looks at this field so we don't need a schedctl_get_yield 279 * function. 280 */ 281 void 282 schedctl_set_yield(kthread_t *t, short val) 283 { 284 ASSERT(THREAD_LOCK_HELD(t)); 285 t->t_schedctl->sc_preemptctl.sc_yield = val; 286 } 287 288 289 /* 290 * Returns non-zero if the specified thread has requested that all 291 * signals be blocked. Called by signal-related code that tests 292 * the signal mask of a thread that may not be the current thread 293 * and where the process's p_lock cannot be acquired. 294 */ 295 int 296 schedctl_sigblock(kthread_t *t) 297 { 298 sc_shared_t *tdp = t->t_schedctl; 299 300 if (tdp) 301 return (tdp->sc_sigblock); 302 return (0); 303 } 304 305 306 /* 307 * If the sc_sigblock field is set for the specified thread, set 308 * its signal mask to block all maskable signals, then clear the 309 * sc_sigblock field. This finishes what user-level code requested 310 * to be done when it set tdp->sc_shared->sc_sigblock non-zero. 311 * Called by signal-related code that holds the process's p_lock. 312 */ 313 void 314 schedctl_finish_sigblock(kthread_t *t) 315 { 316 sc_shared_t *tdp = t->t_schedctl; 317 318 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)); 319 320 if (tdp && tdp->sc_sigblock) { 321 t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0; 322 t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1; 323 tdp->sc_sigblock = 0; 324 } 325 } 326 327 328 /* 329 * Return non-zero if the current thread has declared that 330 * it is calling into the kernel to park, else return zero. 331 */ 332 int 333 schedctl_is_park() 334 { 335 sc_shared_t *tdp = curthread->t_schedctl; 336 337 if (tdp) 338 return (tdp->sc_park); 339 /* 340 * If we're here and there is no shared memory (how could 341 * that happen?) then just assume we really are here to park. 342 */ 343 return (1); 344 } 345 346 347 /* 348 * Clear the shared sc_park flag on return from parking in the kernel. 349 */ 350 void 351 schedctl_unpark() 352 { 353 sc_shared_t *tdp = curthread->t_schedctl; 354 355 if (tdp) 356 tdp->sc_park = 0; 357 } 358 359 360 /* 361 * Page handling code. 362 */ 363 364 void 365 schedctl_init() 366 { 367 /* 368 * Amount of page that can hold sc_shared_t structures. If 369 * sizeof (sc_shared_t) is a power of 2, this should just be 370 * PAGESIZE. 371 */ 372 sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t)); 373 374 /* 375 * Allocation bitmap is one bit per struct on a page. 376 */ 377 sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t); 378 sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL); 379 } 380 381 int 382 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp) 383 { 384 proc_t *p = curproc; 385 sc_page_ctl_t *pagep; 386 sc_shared_t *ssp; 387 caddr_t base; 388 index_t index; 389 int error; 390 391 ASSERT(MUTEX_NOT_HELD(&p->p_lock)); 392 mutex_enter(&p->p_sc_lock); 393 394 /* 395 * Try to find space for the new data in existing pages 396 * within the process's list of shared pages. 397 */ 398 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) 399 if (pagep->spc_space != 0) 400 break; 401 402 if (pagep != NULL) 403 base = pagep->spc_uaddr; 404 else { 405 struct anon_map *amp; 406 caddr_t kaddr; 407 408 /* 409 * No room, need to allocate a new page. Also set up 410 * a mapping to the kernel address space for the new 411 * page and lock it in memory. 412 */ 413 if ((error = schedctl_getpage(&, &kaddr)) != 0) { 414 mutex_exit(&p->p_sc_lock); 415 return (error); 416 } 417 if ((error = schedctl_map(amp, &base, kaddr)) != 0) { 418 schedctl_freepage(amp, kaddr); 419 mutex_exit(&p->p_sc_lock); 420 return (error); 421 } 422 423 /* 424 * Allocate and initialize the page control structure. 425 */ 426 pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP); 427 pagep->spc_amp = amp; 428 pagep->spc_base = (sc_shared_t *)kaddr; 429 pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize); 430 pagep->spc_uaddr = base; 431 432 pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words, 433 KM_SLEEP); 434 pagep->spc_space = sc_pagesize; 435 436 pagep->spc_next = p->p_pagep; 437 p->p_pagep = pagep; 438 } 439 440 /* 441 * Got a page, now allocate space for the data. There should 442 * be space unless something's wrong. 443 */ 444 ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t)); 445 index = bt_availbit(pagep->spc_map, sc_bitmap_len); 446 ASSERT(index != -1); 447 448 /* 449 * Get location with pointer arithmetic. spc_base is of type 450 * sc_shared_t *. Mark as allocated. 451 */ 452 ssp = pagep->spc_base + index; 453 BT_SET(pagep->spc_map, index); 454 pagep->spc_space -= sizeof (sc_shared_t); 455 456 mutex_exit(&p->p_sc_lock); 457 458 /* 459 * Return kernel and user addresses. 460 */ 461 *kaddrp = ssp; 462 *uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET); 463 return (0); 464 } 465 466 467 /* 468 * Find the page control structure corresponding to a kernel address. 469 */ 470 static sc_page_ctl_t * 471 schedctl_page_lookup(sc_shared_t *ssp) 472 { 473 proc_t *p = curproc; 474 sc_page_ctl_t *pagep; 475 476 ASSERT(MUTEX_HELD(&p->p_sc_lock)); 477 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) { 478 if (ssp >= pagep->spc_base && ssp < pagep->spc_end) 479 return (pagep); 480 } 481 return (NULL); /* This "can't happen". Should we panic? */ 482 } 483 484 485 /* 486 * This function is called when a page needs to be mapped into a 487 * process's address space. Allocate the user address space and 488 * set up the mapping to the page. Assumes the page has already 489 * been allocated and locked in memory via schedctl_getpage. 490 */ 491 static int 492 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr) 493 { 494 caddr_t addr; 495 struct as *as = curproc->p_as; 496 struct segvn_crargs vn_a; 497 int error; 498 499 as_rangelock(as); 500 /* pass address of kernel mapping as offset to avoid VAC conflicts */ 501 map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0); 502 if (addr == NULL) { 503 as_rangeunlock(as); 504 return (ENOMEM); 505 } 506 507 /* 508 * Use segvn to set up the mapping to the page. 509 */ 510 vn_a.vp = NULL; 511 vn_a.offset = 0; 512 vn_a.cred = NULL; 513 vn_a.type = MAP_SHARED; 514 vn_a.prot = vn_a.maxprot = PROT_ALL; 515 vn_a.flags = 0; 516 vn_a.amp = amp; 517 vn_a.szc = 0; 518 vn_a.lgrp_mem_policy_flags = 0; 519 error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a); 520 as_rangeunlock(as); 521 522 if (error) 523 return (error); 524 525 *uaddrp = addr; 526 return (0); 527 } 528 529 530 /* 531 * Allocate a new page from anonymous memory. Also, create a kernel 532 * mapping to the page and lock the page in memory. 533 */ 534 static int 535 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr) 536 { 537 struct anon_map *amp; 538 caddr_t kaddr; 539 540 /* 541 * Set up anonymous memory struct. No swap reservation is 542 * needed since the page will be locked into memory. 543 */ 544 amp = anonmap_alloc(PAGESIZE, 0); 545 546 /* 547 * Allocate the page. 548 */ 549 kaddr = segkp_get_withanonmap(segkp, PAGESIZE, 550 KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp); 551 if (kaddr == NULL) { 552 amp->refcnt--; 553 anonmap_free(amp); 554 return (ENOMEM); 555 } 556 557 /* 558 * The page is left SE_SHARED locked so that it won't be 559 * paged out or relocated (KPD_LOCKED above). 560 */ 561 562 *newamp = amp; 563 *newaddr = kaddr; 564 return (0); 565 } 566 567 568 /* 569 * Take the necessary steps to allow a page to be released. 570 * This is called when the process is doing exit() or exec(). 571 * There should be no accesses to the page after this. 572 * The kernel mapping of the page is released and the page is unlocked. 573 */ 574 static void 575 schedctl_freepage(struct anon_map *amp, caddr_t kaddr) 576 { 577 /* 578 * Release the lock on the page and remove the kernel mapping. 579 */ 580 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 581 segkp_release(segkp, kaddr); 582 583 /* 584 * Decrement the refcnt so the anon_map structure will be freed. 585 */ 586 if (--amp->refcnt == 0) { 587 /* 588 * The current process no longer has the page mapped, so 589 * we have to free everything rather than letting as_free 590 * do the work. 591 */ 592 anon_free(amp->ahp, 0, PAGESIZE); 593 ANON_LOCK_EXIT(&->a_rwlock); 594 anonmap_free(amp); 595 } else { 596 ANON_LOCK_EXIT(&->a_rwlock); 597 } 598 } 599