1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <sys/types.h>
28 #include <sys/systm.h>
29 #include <sys/schedctl.h>
30 #include <sys/proc.h>
31 #include <sys/thread.h>
32 #include <sys/class.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/cmn_err.h>
36 #include <sys/stack.h>
37 #include <sys/debug.h>
38 #include <sys/cpuvar.h>
39 #include <sys/sobject.h>
40 #include <sys/door.h>
41 #include <sys/modctl.h>
42 #include <sys/syscall.h>
43 #include <sys/sysmacros.h>
44 #include <sys/vmsystm.h>
45 #include <sys/mman.h>
46 #include <sys/vnode.h>
47 #include <sys/swap.h>
48 #include <sys/lwp.h>
49 #include <sys/bitmap.h>
50 #include <sys/atomic.h>
51 #include <sys/fcntl.h>
52 #include <vm/seg_kp.h>
53 #include <vm/seg_vn.h>
54 #include <vm/as.h>
55 #include <fs/fs_subr.h>
56
57 /*
58 * Page handling structures. This is set up as a list of per-page
59 * control structures (sc_page_ctl), with p->p_pagep pointing to
60 * the first. The per-page structures point to the actual pages
61 * and contain pointers to the user address for each mapped page.
62 *
63 * All data is protected by p->p_sc_lock. Since this lock is
64 * held while waiting for memory, schedctl_shared_alloc() should
65 * not be called while holding p_lock.
66 */
67
68 typedef struct sc_page_ctl {
69 struct sc_page_ctl *spc_next;
70 sc_shared_t *spc_base; /* base of kernel page */
71 sc_shared_t *spc_end; /* end of usable space */
72 ulong_t *spc_map; /* bitmap of allocated space on page */
73 size_t spc_space; /* amount of space on page */
74 caddr_t spc_uaddr; /* user-level address of the page */
75 struct anon_map *spc_amp; /* anonymous memory structure */
76 } sc_page_ctl_t;
77
/*
 * Page geometry, computed by schedctl_init().
 */
/* size of usable space on page */
static size_t sc_pagesize;
/* # of bits in allocation bitmap (one per sc_shared_t on a page) */
static size_t sc_bitmap_len;
/* # of words in allocation bitmap */
static size_t sc_bitmap_words;
81
82 /* Context ops */
83 static void schedctl_save(sc_shared_t *);
84 static void schedctl_restore(sc_shared_t *);
85 static void schedctl_fork(kthread_t *, kthread_t *);
86
87 /* Functions for handling shared pages */
88 static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
89 static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
90 static int schedctl_map(struct anon_map *, caddr_t *, caddr_t);
91 static int schedctl_getpage(struct anon_map **, caddr_t *);
92 static void schedctl_freepage(struct anon_map *, caddr_t);
93
94 /*
95 * System call interface to scheduler activations.
96 * This always operates on the current lwp.
97 */
98 caddr_t
schedctl(void)99 schedctl(void)
100 {
101 kthread_t *t = curthread;
102 sc_shared_t *ssp;
103 uintptr_t uaddr;
104 int error;
105
106 if (t->t_schedctl == NULL) {
107 /*
108 * Allocate and initialize the shared structure.
109 */
110 if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
111 return ((caddr_t)(uintptr_t)set_errno(error));
112 bzero(ssp, sizeof (*ssp));
113
114 installctx(t, ssp, schedctl_save, schedctl_restore,
115 schedctl_fork, NULL, NULL, NULL);
116
117 thread_lock(t); /* protect against ts_tick and ts_update */
118 t->t_schedctl = ssp;
119 t->t_sc_uaddr = uaddr;
120 ssp->sc_cid = t->t_cid;
121 ssp->sc_cpri = t->t_cpri;
122 ssp->sc_priority = DISP_PRIO(t);
123 thread_unlock(t);
124 }
125
126 return ((caddr_t)t->t_sc_uaddr);
127 }
128
129
130 /*
131 * Clean up scheduler activations state associated with an exiting
132 * (or execing) lwp. t is always the current thread.
133 */
134 void
schedctl_lwp_cleanup(kthread_t * t)135 schedctl_lwp_cleanup(kthread_t *t)
136 {
137 sc_shared_t *ssp = t->t_schedctl;
138 proc_t *p = ttoproc(t);
139 sc_page_ctl_t *pagep;
140 index_t index;
141
142 ASSERT(MUTEX_NOT_HELD(&p->p_lock));
143
144 thread_lock(t); /* protect against ts_tick and ts_update */
145 t->t_schedctl = NULL;
146 t->t_sc_uaddr = 0;
147 thread_unlock(t);
148
149 /*
150 * Remove the context op to avoid the final call to
151 * schedctl_save when switching away from this lwp.
152 */
153 (void) removectx(t, ssp, schedctl_save, schedctl_restore,
154 schedctl_fork, NULL, NULL, NULL);
155
156 /*
157 * Do not unmap the shared page until the process exits.
158 * User-level library code relies on this for adaptive mutex locking.
159 */
160 mutex_enter(&p->p_sc_lock);
161 ssp->sc_state = SC_FREE;
162 pagep = schedctl_page_lookup(ssp);
163 index = (index_t)(ssp - pagep->spc_base);
164 BT_CLEAR(pagep->spc_map, index);
165 pagep->spc_space += sizeof (sc_shared_t);
166 mutex_exit(&p->p_sc_lock);
167 }
168
169
170 /*
171 * Cleanup the list of schedctl shared pages for the process.
172 * Called from exec() and exit() system calls.
173 */
174 void
schedctl_proc_cleanup(void)175 schedctl_proc_cleanup(void)
176 {
177 proc_t *p = curproc;
178 sc_page_ctl_t *pagep;
179 sc_page_ctl_t *next;
180
181 ASSERT(p->p_lwpcnt == 1); /* we are single-threaded now */
182 ASSERT(curthread->t_schedctl == NULL);
183
184 /*
185 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
186 */
187 pagep = p->p_pagep;
188 p->p_pagep = NULL;
189 while (pagep != NULL) {
190 ASSERT(pagep->spc_space == sc_pagesize);
191 next = pagep->spc_next;
192 /*
193 * Unmap the user space and free the mapping structure.
194 */
195 (void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
196 schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
197 kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
198 kmem_free(pagep, sizeof (sc_page_ctl_t));
199 pagep = next;
200 }
201 }
202
203
204 /*
205 * Called by resume just before switching away from the current thread.
206 * Save new thread state.
207 */
208 static void
schedctl_save(sc_shared_t * ssp)209 schedctl_save(sc_shared_t *ssp)
210 {
211 ssp->sc_state = curthread->t_state;
212 }
213
214
215 /*
216 * Called by resume after switching to the current thread.
217 * Save new thread state and CPU.
218 */
219 static void
schedctl_restore(sc_shared_t * ssp)220 schedctl_restore(sc_shared_t *ssp)
221 {
222 ssp->sc_state = SC_ONPROC;
223 ssp->sc_cpu = CPU->cpu_id;
224 }
225
226
227 /*
228 * On fork, remove inherited mappings from the child's address space.
229 * The child's threads must call schedctl() to get new shared mappings.
230 */
231 static void
schedctl_fork(kthread_t * pt,kthread_t * ct)232 schedctl_fork(kthread_t *pt, kthread_t *ct)
233 {
234 proc_t *pp = ttoproc(pt);
235 proc_t *cp = ttoproc(ct);
236 sc_page_ctl_t *pagep;
237
238 ASSERT(ct->t_schedctl == NULL);
239
240 /*
241 * Do this only once, whether we are doing fork1() or forkall().
242 * Don't do it at all if the child process is a child of vfork()
243 * because a child of vfork() borrows the parent's address space.
244 */
245 if (pt != curthread || (cp->p_flag & SVFORK))
246 return;
247
248 mutex_enter(&pp->p_sc_lock);
249 for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
250 (void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
251 mutex_exit(&pp->p_sc_lock);
252 }
253
254
255 /*
256 * Returns non-zero if the specified thread shouldn't be preempted at this time.
257 * Called by ts_preempt(), ts_tick(), and ts_update().
258 */
259 int
schedctl_get_nopreempt(kthread_t * t)260 schedctl_get_nopreempt(kthread_t *t)
261 {
262 ASSERT(THREAD_LOCK_HELD(t));
263 return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
264 }
265
266
267 /*
268 * Sets the value of the nopreempt field for the specified thread.
269 * Called by ts_preempt() to clear the field on preemption.
270 */
271 void
schedctl_set_nopreempt(kthread_t * t,short val)272 schedctl_set_nopreempt(kthread_t *t, short val)
273 {
274 ASSERT(THREAD_LOCK_HELD(t));
275 t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
276 }
277
278
279 /*
280 * Sets the value of the yield field for the specified thread.
281 * Called by ts_preempt() and ts_tick() to set the field, and
282 * ts_yield() to clear it.
283 * The kernel never looks at this field so we don't need a
284 * schedctl_get_yield() function.
285 */
286 void
schedctl_set_yield(kthread_t * t,short val)287 schedctl_set_yield(kthread_t *t, short val)
288 {
289 ASSERT(THREAD_LOCK_HELD(t));
290 t->t_schedctl->sc_preemptctl.sc_yield = val;
291 }
292
293
294 /*
295 * Sets the values of the cid and priority fields for the specified thread.
296 * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
297 * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
298 */
299 void
schedctl_set_cidpri(kthread_t * t)300 schedctl_set_cidpri(kthread_t *t)
301 {
302 sc_shared_t *tdp = t->t_schedctl;
303
304 if (tdp != NULL) {
305 tdp->sc_cid = t->t_cid;
306 tdp->sc_cpri = t->t_cpri;
307 tdp->sc_priority = DISP_PRIO(t);
308 }
309 }
310
311
312 /*
313 * Returns non-zero if the specified thread has requested that all
314 * signals be blocked. Called by signal-related code that tests
315 * the signal mask of a thread that may not be the current thread
316 * and where the process's p_lock cannot be acquired.
317 */
318 int
schedctl_sigblock(kthread_t * t)319 schedctl_sigblock(kthread_t *t)
320 {
321 sc_shared_t *tdp = t->t_schedctl;
322
323 if (tdp != NULL)
324 return (tdp->sc_sigblock);
325 return (0);
326 }
327
328
329 /*
330 * If the sc_sigblock field is set for the specified thread, set
331 * its signal mask to block all maskable signals, then clear the
332 * sc_sigblock field. This finishes what user-level code requested
333 * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
334 * Called from signal-related code either by the current thread for
335 * itself or by a thread that holds the process's p_lock (/proc code).
336 */
337 void
schedctl_finish_sigblock(kthread_t * t)338 schedctl_finish_sigblock(kthread_t *t)
339 {
340 sc_shared_t *tdp = t->t_schedctl;
341
342 ASSERT(t == curthread || MUTEX_HELD(&ttoproc(t)->p_lock));
343
344 if (tdp != NULL && tdp->sc_sigblock) {
345 t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
346 t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
347 t->t_hold.__sigbits[2] = FILLSET2 & ~CANTMASK2;
348 tdp->sc_sigblock = 0;
349 }
350 }
351
352
353 /*
354 * Return non-zero if the current thread has declared that it has
355 * a cancellation pending and that cancellation is not disabled.
356 * If SIGCANCEL is blocked, we must be going over the wire in an
357 * NFS transaction (sigintr() was called); return zero in this case.
358 */
359 int
schedctl_cancel_pending(void)360 schedctl_cancel_pending(void)
361 {
362 sc_shared_t *tdp = curthread->t_schedctl;
363
364 if (tdp != NULL &&
365 (tdp->sc_flgs & SC_CANCEL_FLG) &&
366 !tdp->sc_sigblock &&
367 !sigismember(&curthread->t_hold, SIGCANCEL))
368 return (1);
369 return (0);
370 }
371
372
373 /*
374 * Inform libc that the kernel returned EINTR from some system call
375 * due to there being a cancellation pending (SC_CANCEL_FLG set or
376 * we received an SI_LWP SIGCANCEL while in a system call), rather
377 * than because of some other signal. User-level code can try to
378 * recover from receiving other signals, but it can't recover from
379 * being cancelled.
380 */
381 void
schedctl_cancel_eintr(void)382 schedctl_cancel_eintr(void)
383 {
384 sc_shared_t *tdp = curthread->t_schedctl;
385
386 if (tdp != NULL)
387 tdp->sc_flgs |= SC_EINTR_FLG;
388 }
389
390
391 /*
392 * Return non-zero if the current thread has declared that
393 * it is calling into the kernel to park, else return zero.
394 */
395 int
schedctl_is_park(void)396 schedctl_is_park(void)
397 {
398 sc_shared_t *tdp = curthread->t_schedctl;
399
400 if (tdp != NULL)
401 return ((tdp->sc_flgs & SC_PARK_FLG) != 0);
402 /*
403 * If we're here and there is no shared memory (how could
404 * that happen?) then just assume we really are here to park.
405 */
406 return (1);
407 }
408
409
410 /*
411 * Declare thread is parking.
412 *
413 * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
414 * in order to declare that the thread is calling into the kernel to park.
415 *
416 * This interface exists ONLY to support older versions of libthread which
417 * are not aware of the SC_PARK_FLG flag.
418 *
419 * Older versions of libthread which are not aware of the SC_PARK_FLG flag
420 * need to be modified or emulated to call lwpsys_park(4, ...) instead of
421 * lwpsys_park(0, ...). This will invoke schedctl_set_park() before
422 * lwp_park() to declare that the thread is parking.
423 */
424 void
schedctl_set_park(void)425 schedctl_set_park(void)
426 {
427 sc_shared_t *tdp = curthread->t_schedctl;
428 if (tdp != NULL)
429 tdp->sc_flgs |= SC_PARK_FLG;
430 }
431
432
433 /*
434 * Clear the parking flag on return from parking in the kernel.
435 */
436 void
schedctl_unpark(void)437 schedctl_unpark(void)
438 {
439 sc_shared_t *tdp = curthread->t_schedctl;
440
441 if (tdp != NULL)
442 tdp->sc_flgs &= ~SC_PARK_FLG;
443 }
444
445
446 /*
447 * Page handling code.
448 */
449
450 void
schedctl_init(void)451 schedctl_init(void)
452 {
453 /*
454 * Amount of page that can hold sc_shared_t structures. If
455 * sizeof (sc_shared_t) is a power of 2, this should just be
456 * PAGESIZE.
457 */
458 sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));
459
460 /*
461 * Allocation bitmap is one bit per struct on a page.
462 */
463 sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
464 sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
465 }
466
467
468 static int
schedctl_shared_alloc(sc_shared_t ** kaddrp,uintptr_t * uaddrp)469 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
470 {
471 proc_t *p = curproc;
472 sc_page_ctl_t *pagep;
473 sc_shared_t *ssp;
474 caddr_t base;
475 index_t index;
476 int error;
477
478 ASSERT(MUTEX_NOT_HELD(&p->p_lock));
479 mutex_enter(&p->p_sc_lock);
480
481 /*
482 * Try to find space for the new data in existing pages
483 * within the process's list of shared pages.
484 */
485 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
486 if (pagep->spc_space != 0)
487 break;
488
489 if (pagep != NULL)
490 base = pagep->spc_uaddr;
491 else {
492 struct anon_map *amp;
493 caddr_t kaddr;
494
495 /*
496 * No room, need to allocate a new page. Also set up
497 * a mapping to the kernel address space for the new
498 * page and lock it in memory.
499 */
500 if ((error = schedctl_getpage(&, &kaddr)) != 0) {
501 mutex_exit(&p->p_sc_lock);
502 return (error);
503 }
504 if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
505 schedctl_freepage(amp, kaddr);
506 mutex_exit(&p->p_sc_lock);
507 return (error);
508 }
509
510 /*
511 * Allocate and initialize the page control structure.
512 */
513 pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
514 pagep->spc_amp = amp;
515 pagep->spc_base = (sc_shared_t *)kaddr;
516 pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
517 pagep->spc_uaddr = base;
518
519 pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
520 KM_SLEEP);
521 pagep->spc_space = sc_pagesize;
522
523 pagep->spc_next = p->p_pagep;
524 p->p_pagep = pagep;
525 }
526
527 /*
528 * Got a page, now allocate space for the data. There should
529 * be space unless something's wrong.
530 */
531 ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
532 index = bt_availbit(pagep->spc_map, sc_bitmap_len);
533 ASSERT(index != -1);
534
535 /*
536 * Get location with pointer arithmetic. spc_base is of type
537 * sc_shared_t *. Mark as allocated.
538 */
539 ssp = pagep->spc_base + index;
540 BT_SET(pagep->spc_map, index);
541 pagep->spc_space -= sizeof (sc_shared_t);
542
543 mutex_exit(&p->p_sc_lock);
544
545 /*
546 * Return kernel and user addresses.
547 */
548 *kaddrp = ssp;
549 *uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
550 return (0);
551 }
552
553
554 /*
555 * Find the page control structure corresponding to a kernel address.
556 */
557 static sc_page_ctl_t *
schedctl_page_lookup(sc_shared_t * ssp)558 schedctl_page_lookup(sc_shared_t *ssp)
559 {
560 proc_t *p = curproc;
561 sc_page_ctl_t *pagep;
562
563 ASSERT(MUTEX_HELD(&p->p_sc_lock));
564 for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
565 if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
566 return (pagep);
567 }
568 return (NULL); /* This "can't happen". Should we panic? */
569 }
570
571
572 /*
573 * This function is called when a page needs to be mapped into a
574 * process's address space. Allocate the user address space and
575 * set up the mapping to the page. Assumes the page has already
576 * been allocated and locked in memory via schedctl_getpage.
577 */
578 static int
schedctl_map(struct anon_map * amp,caddr_t * uaddrp,caddr_t kaddr)579 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
580 {
581 caddr_t addr = NULL;
582 struct as *as = curproc->p_as;
583 struct segvn_crargs vn_a;
584 int error;
585
586 as_rangelock(as);
587 /* pass address of kernel mapping as offset to avoid VAC conflicts */
588 map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
589 if (addr == NULL) {
590 as_rangeunlock(as);
591 return (ENOMEM);
592 }
593
594 /*
595 * Use segvn to set up the mapping to the page.
596 */
597 vn_a.vp = NULL;
598 vn_a.offset = 0;
599 vn_a.cred = NULL;
600 vn_a.type = MAP_SHARED;
601 vn_a.prot = vn_a.maxprot = PROT_ALL;
602 vn_a.flags = 0;
603 vn_a.amp = amp;
604 vn_a.szc = 0;
605 vn_a.lgrp_mem_policy_flags = 0;
606 error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
607 as_rangeunlock(as);
608
609 if (error)
610 return (error);
611
612 *uaddrp = addr;
613 return (0);
614 }
615
616
617 /*
618 * Allocate a new page from anonymous memory. Also, create a kernel
619 * mapping to the page and lock the page in memory.
620 */
621 static int
schedctl_getpage(struct anon_map ** newamp,caddr_t * newaddr)622 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
623 {
624 struct anon_map *amp;
625 caddr_t kaddr;
626
627 /*
628 * Set up anonymous memory struct. No swap reservation is
629 * needed since the page will be locked into memory.
630 */
631 amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);
632
633 /*
634 * Allocate the page.
635 */
636 kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
637 KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
638 if (kaddr == NULL) {
639 amp->refcnt--;
640 anonmap_free(amp);
641 return (ENOMEM);
642 }
643
644 /*
645 * The page is left SE_SHARED locked so that it won't be
646 * paged out or relocated (KPD_LOCKED above).
647 */
648
649 *newamp = amp;
650 *newaddr = kaddr;
651 return (0);
652 }
653
654
655 /*
656 * Take the necessary steps to allow a page to be released.
657 * This is called when the process is doing exit() or exec().
658 * There should be no accesses to the page after this.
659 * The kernel mapping of the page is released and the page is unlocked.
660 */
661 static void
schedctl_freepage(struct anon_map * amp,caddr_t kaddr)662 schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
663 {
664 /*
665 * Release the lock on the page and remove the kernel mapping.
666 */
667 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
668 segkp_release(segkp, kaddr);
669
670 /*
671 * Decrement the refcnt so the anon_map structure will be freed.
672 */
673 if (--amp->refcnt == 0) {
674 /*
675 * The current process no longer has the page mapped, so
676 * we have to free everything rather than letting as_free
677 * do the work.
678 */
679 anonmap_purge(amp);
680 anon_free(amp->ahp, 0, PAGESIZE);
681 ANON_LOCK_EXIT(&->a_rwlock);
682 anonmap_free(amp);
683 } else {
684 ANON_LOCK_EXIT(&->a_rwlock);
685 }
686 }
687