/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/schedctl.h>
#include <sys/proc.h>
#include <sys/thread.h>
#include <sys/class.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/stack.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/sobject.h>
#include <sys/door.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/lwp.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/fcntl.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <fs/fs_subr.h>


/*
 * Page handling structures.  This is set up as a list of per-page
 * control structures (sc_page_ctl), with p->p_pagep pointing to
 * the first.  The per-page structures point to the actual pages
 * and contain pointers to the user address for each mapped page.
 *
 * All data is protected by p->p_sc_lock.  Since this lock is
 * held while waiting for memory, schedctl_shared_alloc() should
 * not be called while holding p_lock.
 */

typedef struct sc_page_ctl {
	struct sc_page_ctl *spc_next;
	sc_shared_t	*spc_base;	/* base of kernel page */
	sc_shared_t	*spc_end;	/* end of usable space */
	ulong_t		*spc_map;	/* bitmap of allocated space on page */
	size_t		spc_space;	/* amount of space on page */
	caddr_t		spc_uaddr;	/* user-level address of the page */
	struct anon_map	*spc_amp;	/* anonymous memory structure */
} sc_page_ctl_t;

static size_t	sc_pagesize;		/* size of usable space on page */
static size_t	sc_bitmap_len;		/* # of bits in allocation bitmap */
static size_t	sc_bitmap_words;	/* # of words in allocation bitmap */

/* Context ops */
static void	schedctl_save(sc_shared_t *);
static void	schedctl_restore(sc_shared_t *);
static void	schedctl_fork(kthread_t *, kthread_t *);

/* Functions for handling shared pages */
static int	schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
static int	schedctl_map(struct anon_map *, caddr_t *, caddr_t);
static int	schedctl_getpage(struct anon_map **, caddr_t *);
static void	schedctl_freepage(struct anon_map *, caddr_t);
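
/*
 * Layout sketch:  each page on the p_pagep list is carved into
 * sc_shared_t slots, one per lwp that has called schedctl().  For
 * example, on a system where PAGESIZE were 8192 bytes and
 * sizeof (sc_shared_t) were 64 bytes, schedctl_init() below would
 * compute sc_pagesize = 8192 and sc_bitmap_len = 128, so one bit in
 * spc_map would track each of the 128 possible slots on the page.
 * The actual values depend on PAGESIZE and sizeof (sc_shared_t)
 * for the machine and release being built.
 */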

/*
 * System call interface to scheduler activations.
 * This always operates on the current lwp.
 */
caddr_t
schedctl(void)
{
	kthread_t	*t = curthread;
	sc_shared_t	*ssp;
	uintptr_t	uaddr;
	int		error;

	if (t->t_schedctl == NULL) {
		/*
		 * Allocate and initialize the shared structure.
		 */
		if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
			return ((caddr_t)(uintptr_t)set_errno(error));
		bzero(ssp, sizeof (*ssp));

		installctx(t, ssp, schedctl_save, schedctl_restore,
		    schedctl_fork, NULL, NULL, NULL);

		thread_lock(t);	/* protect against ts_tick and ts_update */
		t->t_schedctl = ssp;
		t->t_sc_uaddr = uaddr;
		thread_unlock(t);
	}

	return ((caddr_t)t->t_sc_uaddr);
}


/*
 * Clean up scheduler activations state associated with an exiting
 * (or execing) lwp.  t is always the current thread.
 */
void
schedctl_lwp_cleanup(kthread_t *t)
{
	sc_shared_t	*ssp = t->t_schedctl;
	proc_t		*p = ttoproc(t);
	sc_page_ctl_t	*pagep;
	index_t		index;

	ASSERT(MUTEX_NOT_HELD(&p->p_lock));

	thread_lock(t);		/* protect against ts_tick and ts_update */
	t->t_schedctl = NULL;
	t->t_sc_uaddr = 0;
	thread_unlock(t);

	/*
	 * Remove the context op to avoid the final call to
	 * schedctl_save when switching away from this lwp.
	 */
	(void) removectx(t, ssp, schedctl_save, schedctl_restore,
	    schedctl_fork, NULL, NULL, NULL);

	/*
	 * Do not unmap the shared page until the process exits.
	 * User-level library code relies on this for adaptive mutex locking.
	 */
	mutex_enter(&p->p_sc_lock);
	ssp->sc_state = SC_FREE;
	pagep = schedctl_page_lookup(ssp);
	index = (index_t)(ssp - pagep->spc_base);
	BT_CLEAR(pagep->spc_map, index);
	pagep->spc_space += sizeof (sc_shared_t);
	mutex_exit(&p->p_sc_lock);
}


/*
 * Clean up the list of schedctl shared pages for the process.
 * Called from exec() and exit() system calls.
 */
void
schedctl_proc_cleanup()
{
	proc_t *p = curproc;
	sc_page_ctl_t *pagep;
	sc_page_ctl_t *next;

	ASSERT(p->p_lwpcnt == 1);	/* we are single-threaded now */
	ASSERT(curthread->t_schedctl == NULL);

	/*
	 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
	 */
	pagep = p->p_pagep;
	p->p_pagep = NULL;
	while (pagep != NULL) {
		ASSERT(pagep->spc_space == sc_pagesize);
		next = pagep->spc_next;
		/*
		 * Unmap the user space and free the mapping structure.
		 */
		(void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
		schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
		kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
		kmem_free(pagep, sizeof (sc_page_ctl_t));
		pagep = next;
	}
}


/*
 * Called by resume just before switching away from the current thread.
 * Save new thread state.
 */
void
schedctl_save(sc_shared_t *ssp)
{
	ssp->sc_state = curthread->t_state;
}


/*
 * Called by resume after switching to the current thread.
 * Save new thread state and CPU.
 */
void
schedctl_restore(sc_shared_t *ssp)
{
	ssp->sc_state = SC_ONPROC;
	ssp->sc_cpu = CPU->cpu_id;
}
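
/*
 * The save/restore context ops above keep sc_state and sc_cpu current
 * so that user-level code can inspect them without entering the kernel.
 * For example, an adaptive mutex implementation might spin only while
 * the lock owner's shared word reads SC_ONPROC, along the lines of:
 *
 *	while (owner_ssp->sc_state == SC_ONPROC && !try_lock(mp))
 *		continue;
 *
 * where owner_ssp and try_lock are illustrative names only, not
 * interfaces defined in this file.
 */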

/*
 * On fork, remove inherited mappings from the child's address space.
 * The child's threads must call schedctl() to get new shared mappings.
 */
void
schedctl_fork(kthread_t *pt, kthread_t *ct)
{
	proc_t	*pp = ttoproc(pt);
	proc_t	*cp = ttoproc(ct);
	sc_page_ctl_t *pagep;

	ASSERT(ct->t_schedctl == NULL);

	/*
	 * Do this only once, whether we are doing fork1() or forkall().
	 * Don't do it at all if the child process is a child of vfork()
	 * because a child of vfork() borrows the parent's address space.
	 */
	if (pt != curthread || (cp->p_flag & SVFORK))
		return;

	mutex_enter(&pp->p_sc_lock);
	for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
		(void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
	mutex_exit(&pp->p_sc_lock);
}


/*
 * Returns non-zero if the specified thread shouldn't be preempted at
 * this time.  Called by ts_preempt, ts_tick, and ts_update.
 */
int
schedctl_get_nopreempt(kthread_t *t)
{
	ASSERT(THREAD_LOCK_HELD(t));
	return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
}


/*
 * Sets the value of the nopreempt field for the specified thread.
 * Called by ts_preempt to clear the field on preemption.
 */
void
schedctl_set_nopreempt(kthread_t *t, short val)
{
	ASSERT(THREAD_LOCK_HELD(t));
	t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
}


/*
 * Sets the value of the yield field for the specified thread.  Called by
 * ts_preempt and ts_tick to set the field, and ts_yield to clear it.
 * The kernel never looks at this field so we don't need a
 * schedctl_get_yield function.
 */
void
schedctl_set_yield(kthread_t *t, short val)
{
	ASSERT(THREAD_LOCK_HELD(t));
	t->t_schedctl->sc_preemptctl.sc_yield = val;
}


/*
 * Returns non-zero if the specified thread has requested that all
 * signals be blocked.  Called by signal-related code that tests
 * the signal mask of a thread that may not be the current thread
 * and where the process's p_lock cannot be acquired.
 */
int
schedctl_sigblock(kthread_t *t)
{
	sc_shared_t *tdp = t->t_schedctl;

	if (tdp)
		return (tdp->sc_sigblock);
	return (0);
}


/*
 * If the sc_sigblock field is set for the specified thread, set
 * its signal mask to block all maskable signals, then clear the
 * sc_sigblock field.  This finishes what user-level code requested
 * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
 * Called by signal-related code that holds the process's p_lock.
 */
void
schedctl_finish_sigblock(kthread_t *t)
{
	sc_shared_t *tdp = t->t_schedctl;

	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	if (tdp && tdp->sc_sigblock) {
		t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
		t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
		tdp->sc_sigblock = 0;
	}
}


/*
 * Return non-zero if the current thread has declared that
 * it is calling into the kernel to park, else return zero.
 */
int
schedctl_is_park()
{
	sc_shared_t *tdp = curthread->t_schedctl;

	if (tdp)
		return (tdp->sc_park);
	/*
	 * If we're here and there is no shared memory (how could
	 * that happen?) then just assume we really are here to park.
	 */
	return (1);
}


/*
 * Clear the shared sc_park flag on return from parking in the kernel.
 */
void
schedctl_unpark()
{
	sc_shared_t *tdp = curthread->t_schedctl;

	if (tdp)
		tdp->sc_park = 0;
}
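
/*
 * Park handling sketch:  user-level code declares its intention by
 * setting the shared sc_park flag before trapping into the kernel to
 * sleep, the kernel consults schedctl_is_park() to distinguish a
 * deliberate park from other reasons for sleeping, and the flag is
 * cleared again with schedctl_unpark() on the way back to user level.
 * A rough user-side sequence, with illustrative names only:
 *
 *	self->ul_schedctl->sc_park = 1;
 *	(void) park_in_kernel(timeout);
 */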

/*
 * Page handling code.
 */

void
schedctl_init()
{
	/*
	 * Amount of page that can hold sc_shared_t structures.  If
	 * sizeof (sc_shared_t) is a power of 2, this should just be
	 * PAGESIZE.
	 */
	sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));

	/*
	 * Allocation bitmap is one bit per struct on a page.
	 */
	sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
	sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
}


int
schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
{
	proc_t		*p = curproc;
	sc_page_ctl_t	*pagep;
	sc_shared_t	*ssp;
	caddr_t		base;
	index_t		index;
	int		error;

	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
	mutex_enter(&p->p_sc_lock);

	/*
	 * Try to find space for the new data in existing pages
	 * within the process's list of shared pages.
	 */
	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
		if (pagep->spc_space != 0)
			break;

	if (pagep != NULL)
		base = pagep->spc_uaddr;
	else {
		struct anon_map *amp;
		caddr_t kaddr;

		/*
		 * No room, need to allocate a new page.  Also set up
		 * a mapping to the kernel address space for the new
		 * page and lock it in memory.
		 */
		if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
			mutex_exit(&p->p_sc_lock);
			return (error);
		}
		if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
			schedctl_freepage(amp, kaddr);
			mutex_exit(&p->p_sc_lock);
			return (error);
		}

		/*
		 * Allocate and initialize the page control structure.
		 */
		pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
		pagep->spc_amp = amp;
		pagep->spc_base = (sc_shared_t *)kaddr;
		pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
		pagep->spc_uaddr = base;

		pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
		    KM_SLEEP);
		pagep->spc_space = sc_pagesize;

		pagep->spc_next = p->p_pagep;
		p->p_pagep = pagep;
	}

	/*
	 * Got a page, now allocate space for the data.  There should
	 * be space unless something's wrong.
	 */
	ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
	index = bt_availbit(pagep->spc_map, sc_bitmap_len);
	ASSERT(index != -1);

	/*
	 * Get location with pointer arithmetic.  spc_base is of type
	 * sc_shared_t *.  Mark as allocated.
	 */
	ssp = pagep->spc_base + index;
	BT_SET(pagep->spc_map, index);
	pagep->spc_space -= sizeof (sc_shared_t);

	mutex_exit(&p->p_sc_lock);

	/*
	 * Return kernel and user addresses.
	 */
	*kaddrp = ssp;
	*uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
	return (0);
}


/*
 * Find the page control structure corresponding to a kernel address.
 */
static sc_page_ctl_t *
schedctl_page_lookup(sc_shared_t *ssp)
{
	proc_t *p = curproc;
	sc_page_ctl_t *pagep;

	ASSERT(MUTEX_HELD(&p->p_sc_lock));
	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
		if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
			return (pagep);
	}
	return (NULL);		/* This "can't happen".  Should we panic? */
}
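
/*
 * Note on the address arithmetic in schedctl_shared_alloc():  the kernel
 * mapping (spc_base) and the user mapping (spc_uaddr) cover the same
 * physical page, so a slot's user address is simply the user base plus
 * the slot's offset within the page.  For example, if a hypothetical
 * slot sat 0x140 bytes into the page, the lwp would be handed back
 * spc_uaddr + 0x140 as the address it uses from user level.
 */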

/*
 * This function is called when a page needs to be mapped into a
 * process's address space.  Allocate the user address space and
 * set up the mapping to the page.  Assumes the page has already
 * been allocated and locked in memory via schedctl_getpage.
 */
static int
schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
{
	caddr_t addr;
	struct as *as = curproc->p_as;
	struct segvn_crargs vn_a;
	int error;

	as_rangelock(as);
	/* pass address of kernel mapping as offset to avoid VAC conflicts */
	map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
	if (addr == NULL) {
		as_rangeunlock(as);
		return (ENOMEM);
	}

	/*
	 * Use segvn to set up the mapping to the page.
	 */
	vn_a.vp = NULL;
	vn_a.offset = 0;
	vn_a.cred = NULL;
	vn_a.type = MAP_SHARED;
	vn_a.prot = vn_a.maxprot = PROT_ALL;
	vn_a.flags = 0;
	vn_a.amp = amp;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;
	error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
	as_rangeunlock(as);

	if (error)
		return (error);

	*uaddrp = addr;
	return (0);
}


/*
 * Allocate a new page from anonymous memory.  Also, create a kernel
 * mapping to the page and lock the page in memory.
 */
static int
schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
{
	struct anon_map *amp;
	caddr_t kaddr;

	/*
	 * Set up anonymous memory struct.  No swap reservation is
	 * needed since the page will be locked into memory.
	 */
	amp = anonmap_alloc(PAGESIZE, PAGESIZE);

	/*
	 * Allocate the page.
	 */
	kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO,
	    amp);
	if (kaddr == NULL) {
		amp->refcnt--;
		anonmap_free(amp);
		return (ENOMEM);
	}

	/*
	 * The page is left SE_SHARED locked so that it won't be
	 * paged out or relocated (KPD_LOCKED above).
	 */

	*newamp = amp;
	*newaddr = kaddr;
	return (0);
}


/*
 * Take the necessary steps to allow a page to be released.
 * This is called when the process is doing exit() or exec().
 * There should be no accesses to the page after this.
 * The kernel mapping of the page is released and the page is unlocked.
 */
static void
schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
{
	/*
	 * Release the lock on the page and remove the kernel mapping.
	 */
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	segkp_release(segkp, kaddr);

	/*
	 * Decrement the refcnt so the anon_map structure will be freed.
	 */
	if (--amp->refcnt == 0) {
		/*
		 * The current process no longer has the page mapped, so
		 * we have to free everything rather than letting as_free
		 * do the work.
		 */
		anon_free(amp->ahp, 0, PAGESIZE);
		ANON_LOCK_EXIT(&amp->a_rwlock);
		anonmap_free(amp);
	} else {
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}
}
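
/*
 * schedctl_freepage() is reached from two places in this file:  from
 * schedctl_proc_cleanup(), once per page, after the user mapping has
 * been torn down with as_unmap() at exec() or exit() time, and from
 * schedctl_shared_alloc() to undo a freshly allocated page when
 * schedctl_map() fails.  In both cases the shared page is no longer
 * visible to user level by the time it is released here.
 */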