xref: /illumos-gate/usr/src/uts/common/os/schedctl.c (revision 7ff836697c120cb94bd30d5c2204eb9b74718e4c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/schedctl.h>
32 #include <sys/proc.h>
33 #include <sys/thread.h>
34 #include <sys/class.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/cmn_err.h>
38 #include <sys/stack.h>
39 #include <sys/debug.h>
40 #include <sys/cpuvar.h>
41 #include <sys/sobject.h>
42 #include <sys/door.h>
43 #include <sys/modctl.h>
44 #include <sys/syscall.h>
45 #include <sys/sysmacros.h>
46 #include <sys/vmsystm.h>
47 #include <sys/mman.h>
48 #include <sys/vnode.h>
49 #include <sys/swap.h>
50 #include <sys/lwp.h>
51 #include <sys/bitmap.h>
52 #include <sys/atomic.h>
53 #include <sys/fcntl.h>
54 #include <vm/seg_kp.h>
55 #include <vm/seg_vn.h>
56 #include <vm/as.h>
57 #include <fs/fs_subr.h>
58 
59 /*
60  * Page handling structures.  This is set up as a list of per-page
61  * control structures (sc_page_ctl), with p->p_pagep pointing to
62  * the first.  The per-page structures point to the actual pages
63  * and contain pointers to the user address for each mapped page.
64  *
65  * All data is protected by p->p_sc_lock.  Since this lock is
66  * held while waiting for memory, schedctl_shared_alloc() should
67  * not be called while holding p_lock.
68  */
69 
/*
 * Per-page control structure.  One of these exists for every shared
 * page a process owns; they are chained through spc_next with the
 * head of the chain stored in p->p_pagep.
 */
typedef struct sc_page_ctl {
	struct sc_page_ctl *spc_next;	/* next page control struct in list */
	sc_shared_t	*spc_base;	/* base of kernel page */
	sc_shared_t	*spc_end;	/* end of usable space */
	ulong_t		*spc_map;	/* bitmap of allocated space on page */
	size_t		spc_space;	/* amount of space on page */
	caddr_t		spc_uaddr;	/* user-level address of the page */
	struct anon_map	*spc_amp;	/* anonymous memory structure */
} sc_page_ctl_t;
79 
/* Page geometry; all three values are computed by schedctl_init(). */
static size_t	sc_pagesize;		/* size of usable space on page */
static size_t	sc_bitmap_len;		/* # of bits in allocation bitmap */
static size_t	sc_bitmap_words;	/* # of words in allocation bitmap */
83 
/* Context ops (registered with installctx() by schedctl()) */
static void	schedctl_save(sc_shared_t *);
static void	schedctl_restore(sc_shared_t *);
static void	schedctl_fork(kthread_t *, kthread_t *);

/* Functions for handling shared pages (all defined later in this file) */
static int	schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
static int	schedctl_map(struct anon_map *, caddr_t *, caddr_t);
static int	schedctl_getpage(struct anon_map **, caddr_t *);
static void	schedctl_freepage(struct anon_map *, caddr_t);
95 
96 /*
97  * System call interface to scheduler activations.
98  * This always operates on the current lwp.
99  */
100 caddr_t
101 schedctl(void)
102 {
103 	kthread_t	*t = curthread;
104 	sc_shared_t	*ssp;
105 	uintptr_t	uaddr;
106 	int		error;
107 
108 	if (t->t_schedctl == NULL) {
109 		/*
110 		 * Allocate and initialize the shared structure.
111 		 */
112 		if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
113 			return ((caddr_t)(uintptr_t)set_errno(error));
114 		bzero(ssp, sizeof (*ssp));
115 
116 		installctx(t, ssp, schedctl_save, schedctl_restore,
117 		    schedctl_fork, NULL, NULL, NULL);
118 
119 		thread_lock(t);	/* protect against ts_tick and ts_update */
120 		t->t_schedctl = ssp;
121 		t->t_sc_uaddr = uaddr;
122 		ssp->sc_cid = t->t_cid;
123 		ssp->sc_cpri = t->t_cpri;
124 		ssp->sc_priority = DISP_PRIO(t);
125 		thread_unlock(t);
126 	}
127 
128 	return ((caddr_t)t->t_sc_uaddr);
129 }
130 
131 
132 /*
133  * Clean up scheduler activations state associated with an exiting
134  * (or execing) lwp.  t is always the current thread.
135  */
136 void
137 schedctl_lwp_cleanup(kthread_t *t)
138 {
139 	sc_shared_t	*ssp = t->t_schedctl;
140 	proc_t		*p = ttoproc(t);
141 	sc_page_ctl_t	*pagep;
142 	index_t		index;
143 
144 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
145 
146 	thread_lock(t);		/* protect against ts_tick and ts_update */
147 	t->t_schedctl = NULL;
148 	t->t_sc_uaddr = 0;
149 	thread_unlock(t);
150 
151 	/*
152 	 * Remove the context op to avoid the final call to
153 	 * schedctl_save when switching away from this lwp.
154 	 */
155 	(void) removectx(t, ssp, schedctl_save, schedctl_restore,
156 	    schedctl_fork, NULL, NULL, NULL);
157 
158 	/*
159 	 * Do not unmap the shared page until the process exits.
160 	 * User-level library code relies on this for adaptive mutex locking.
161 	 */
162 	mutex_enter(&p->p_sc_lock);
163 	ssp->sc_state = SC_FREE;
164 	pagep = schedctl_page_lookup(ssp);
165 	index = (index_t)(ssp - pagep->spc_base);
166 	BT_CLEAR(pagep->spc_map, index);
167 	pagep->spc_space += sizeof (sc_shared_t);
168 	mutex_exit(&p->p_sc_lock);
169 }
170 
171 
172 /*
173  * Cleanup the list of schedctl shared pages for the process.
174  * Called from exec() and exit() system calls.
175  */
176 void
177 schedctl_proc_cleanup(void)
178 {
179 	proc_t *p = curproc;
180 	sc_page_ctl_t *pagep;
181 	sc_page_ctl_t *next;
182 
183 	ASSERT(p->p_lwpcnt == 1);	/* we are single-threaded now */
184 	ASSERT(curthread->t_schedctl == NULL);
185 
186 	/*
187 	 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
188 	 */
189 	pagep = p->p_pagep;
190 	p->p_pagep = NULL;
191 	while (pagep != NULL) {
192 		ASSERT(pagep->spc_space == sc_pagesize);
193 		next = pagep->spc_next;
194 		/*
195 		 * Unmap the user space and free the mapping structure.
196 		 */
197 		(void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
198 		schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
199 		kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
200 		kmem_free(pagep, sizeof (sc_page_ctl_t));
201 		pagep = next;
202 	}
203 }
204 
205 
206 /*
207  * Called by resume just before switching away from the current thread.
208  * Save new thread state.
209  */
210 static void
211 schedctl_save(sc_shared_t *ssp)
212 {
213 	ssp->sc_state = curthread->t_state;
214 }
215 
216 
217 /*
218  * Called by resume after switching to the current thread.
219  * Save new thread state and CPU.
220  */
221 static void
222 schedctl_restore(sc_shared_t *ssp)
223 {
224 	ssp->sc_state = SC_ONPROC;
225 	ssp->sc_cpu = CPU->cpu_id;
226 }
227 
228 
229 /*
230  * On fork, remove inherited mappings from the child's address space.
231  * The child's threads must call schedctl() to get new shared mappings.
232  */
233 static void
234 schedctl_fork(kthread_t *pt, kthread_t *ct)
235 {
236 	proc_t *pp = ttoproc(pt);
237 	proc_t *cp = ttoproc(ct);
238 	sc_page_ctl_t *pagep;
239 
240 	ASSERT(ct->t_schedctl == NULL);
241 
242 	/*
243 	 * Do this only once, whether we are doing fork1() or forkall().
244 	 * Don't do it at all if the child process is a child of vfork()
245 	 * because a child of vfork() borrows the parent's address space.
246 	 */
247 	if (pt != curthread || (cp->p_flag & SVFORK))
248 		return;
249 
250 	mutex_enter(&pp->p_sc_lock);
251 	for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
252 		(void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
253 	mutex_exit(&pp->p_sc_lock);
254 }
255 
256 
257 /*
258  * Returns non-zero if the specified thread shouldn't be preempted at this time.
259  * Called by ts_preempt(), ts_tick(), and ts_update().
260  */
261 int
262 schedctl_get_nopreempt(kthread_t *t)
263 {
264 	ASSERT(THREAD_LOCK_HELD(t));
265 	return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
266 }
267 
268 
269 /*
270  * Sets the value of the nopreempt field for the specified thread.
271  * Called by ts_preempt() to clear the field on preemption.
272  */
273 void
274 schedctl_set_nopreempt(kthread_t *t, short val)
275 {
276 	ASSERT(THREAD_LOCK_HELD(t));
277 	t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
278 }
279 
280 
281 /*
282  * Sets the value of the yield field for the specified thread.
283  * Called by ts_preempt() and ts_tick() to set the field, and
284  * ts_yield() to clear it.
285  * The kernel never looks at this field so we don't need a
286  * schedctl_get_yield() function.
287  */
288 void
289 schedctl_set_yield(kthread_t *t, short val)
290 {
291 	ASSERT(THREAD_LOCK_HELD(t));
292 	t->t_schedctl->sc_preemptctl.sc_yield = val;
293 }
294 
295 
296 /*
297  * Sets the values of the cid and priority fields for the specified thread.
298  * Called from thread_change_pri(), thread_change_epri(), THREAD_CHANGE_PRI().
299  * Called following calls to CL_FORKRET() and CL_ENTERCLASS().
300  */
301 void
302 schedctl_set_cidpri(kthread_t *t)
303 {
304 	sc_shared_t *tdp = t->t_schedctl;
305 
306 	if (tdp != NULL) {
307 		tdp->sc_cid = t->t_cid;
308 		tdp->sc_cpri = t->t_cpri;
309 		tdp->sc_priority = DISP_PRIO(t);
310 	}
311 }
312 
313 
314 /*
315  * Returns non-zero if the specified thread has requested that all
316  * signals be blocked.  Called by signal-related code that tests
317  * the signal mask of a thread that may not be the current thread
318  * and where the process's p_lock cannot be acquired.
319  */
320 int
321 schedctl_sigblock(kthread_t *t)
322 {
323 	sc_shared_t *tdp = t->t_schedctl;
324 
325 	if (tdp != NULL)
326 		return (tdp->sc_sigblock);
327 	return (0);
328 }
329 
330 
331 /*
332  * If the sc_sigblock field is set for the specified thread, set
333  * its signal mask to block all maskable signals, then clear the
334  * sc_sigblock field.  This finishes what user-level code requested
335  * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
336  * Called by signal-related code that holds the process's p_lock.
337  */
338 void
339 schedctl_finish_sigblock(kthread_t *t)
340 {
341 	sc_shared_t *tdp = t->t_schedctl;
342 
343 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
344 
345 	if (tdp != NULL && tdp->sc_sigblock) {
346 		t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
347 		t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
348 		tdp->sc_sigblock = 0;
349 	}
350 }
351 
352 
353 /*
354  * Return non-zero if the current thread has declared that it has
355  * a cancellation pending and that cancellation is not disabled.
356  * If SIGCANCEL is blocked, we must be going over the wire in an
357  * NFS transaction (sigintr() was called); return zero in this case.
358  */
359 int
360 schedctl_cancel_pending(void)
361 {
362 	sc_shared_t *tdp = curthread->t_schedctl;
363 
364 	if (tdp != NULL &&
365 	    (tdp->sc_flgs & SC_CANCEL_FLG) &&
366 	    !tdp->sc_sigblock &&
367 	    !sigismember(&curthread->t_hold, SIGCANCEL))
368 		return (1);
369 	return (0);
370 }
371 
372 
373 /*
374  * Inform libc that the kernel returned EINTR from some system call
375  * due to there being a cancellation pending (SC_CANCEL_FLG set or
376  * we received an SI_LWP SIGCANCEL while in a system call), rather
377  * than because of some other signal.  User-level code can try to
378  * recover from receiving other signals, but it can't recover from
379  * being cancelled.
380  */
381 void
382 schedctl_cancel_eintr(void)
383 {
384 	sc_shared_t *tdp = curthread->t_schedctl;
385 
386 	if (tdp != NULL)
387 		tdp->sc_flgs |= SC_EINTR_FLG;
388 }
389 
390 
391 /*
392  * Return non-zero if the current thread has declared that
393  * it is calling into the kernel to park, else return zero.
394  */
395 int
396 schedctl_is_park(void)
397 {
398 	sc_shared_t *tdp = curthread->t_schedctl;
399 
400 	if (tdp != NULL)
401 		return ((tdp->sc_flgs & SC_PARK_FLG) != 0);
402 	/*
403 	 * If we're here and there is no shared memory (how could
404 	 * that happen?) then just assume we really are here to park.
405 	 */
406 	return (1);
407 }
408 
409 
410 /*
411  * Declare thread is parking.
412  *
413  * libc will set "sc_flgs |= SC_PARK_FLG" before calling lwpsys_park(0, tid)
414  * in order to declare that the thread is calling into the kernel to park.
415  *
416  * This interface exists ONLY to support older versions of libthread which
417  * are not aware of the SC_PARK_FLG flag.
418  *
419  * Older versions of libthread which are not aware of the SC_PARK_FLG flag
420  * need to be modified or emulated to call lwpsys_park(4, ...) instead of
421  * lwpsys_park(0, ...).  This will invoke schedctl_set_park() before
422  * lwp_park() to declare that the thread is parking.
423  */
424 void
425 schedctl_set_park(void)
426 {
427 	sc_shared_t *tdp = curthread->t_schedctl;
428 	if (tdp != NULL)
429 		tdp->sc_flgs |= SC_PARK_FLG;
430 }
431 
432 
433 /*
434  * Clear the parking flag on return from parking in the kernel.
435  */
436 void
437 schedctl_unpark(void)
438 {
439 	sc_shared_t *tdp = curthread->t_schedctl;
440 
441 	if (tdp != NULL)
442 		tdp->sc_flgs &= ~SC_PARK_FLG;
443 }
444 
445 
446 /*
447  * Page handling code.
448  */
449 
450 void
451 schedctl_init(void)
452 {
453 	/*
454 	 * Amount of page that can hold sc_shared_t structures.  If
455 	 * sizeof (sc_shared_t) is a power of 2, this should just be
456 	 * PAGESIZE.
457 	 */
458 	sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));
459 
460 	/*
461 	 * Allocation bitmap is one bit per struct on a page.
462 	 */
463 	sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
464 	sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
465 }
466 
467 
468 static int
469 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
470 {
471 	proc_t		*p = curproc;
472 	sc_page_ctl_t	*pagep;
473 	sc_shared_t	*ssp;
474 	caddr_t		base;
475 	index_t		index;
476 	int		error;
477 
478 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
479 	mutex_enter(&p->p_sc_lock);
480 
481 	/*
482 	 * Try to find space for the new data in existing pages
483 	 * within the process's list of shared pages.
484 	 */
485 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
486 		if (pagep->spc_space != 0)
487 			break;
488 
489 	if (pagep != NULL)
490 		base = pagep->spc_uaddr;
491 	else {
492 		struct anon_map *amp;
493 		caddr_t kaddr;
494 
495 		/*
496 		 * No room, need to allocate a new page.  Also set up
497 		 * a mapping to the kernel address space for the new
498 		 * page and lock it in memory.
499 		 */
500 		if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
501 			mutex_exit(&p->p_sc_lock);
502 			return (error);
503 		}
504 		if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
505 			schedctl_freepage(amp, kaddr);
506 			mutex_exit(&p->p_sc_lock);
507 			return (error);
508 		}
509 
510 		/*
511 		 * Allocate and initialize the page control structure.
512 		 */
513 		pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
514 		pagep->spc_amp = amp;
515 		pagep->spc_base = (sc_shared_t *)kaddr;
516 		pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
517 		pagep->spc_uaddr = base;
518 
519 		pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
520 		    KM_SLEEP);
521 		pagep->spc_space = sc_pagesize;
522 
523 		pagep->spc_next = p->p_pagep;
524 		p->p_pagep = pagep;
525 	}
526 
527 	/*
528 	 * Got a page, now allocate space for the data.  There should
529 	 * be space unless something's wrong.
530 	 */
531 	ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
532 	index = bt_availbit(pagep->spc_map, sc_bitmap_len);
533 	ASSERT(index != -1);
534 
535 	/*
536 	 * Get location with pointer arithmetic.  spc_base is of type
537 	 * sc_shared_t *.  Mark as allocated.
538 	 */
539 	ssp = pagep->spc_base + index;
540 	BT_SET(pagep->spc_map, index);
541 	pagep->spc_space -= sizeof (sc_shared_t);
542 
543 	mutex_exit(&p->p_sc_lock);
544 
545 	/*
546 	 * Return kernel and user addresses.
547 	 */
548 	*kaddrp = ssp;
549 	*uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
550 	return (0);
551 }
552 
553 
554 /*
555  * Find the page control structure corresponding to a kernel address.
556  */
557 static sc_page_ctl_t *
558 schedctl_page_lookup(sc_shared_t *ssp)
559 {
560 	proc_t *p = curproc;
561 	sc_page_ctl_t *pagep;
562 
563 	ASSERT(MUTEX_HELD(&p->p_sc_lock));
564 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
565 		if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
566 			return (pagep);
567 	}
568 	return (NULL);		/* This "can't happen".  Should we panic? */
569 }
570 
571 
572 /*
573  * This function is called when a page needs to be mapped into a
574  * process's address space.  Allocate the user address space and
575  * set up the mapping to the page.  Assumes the page has already
576  * been allocated and locked in memory via schedctl_getpage.
577  */
578 static int
579 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
580 {
581 	caddr_t addr = NULL;
582 	struct as *as = curproc->p_as;
583 	struct segvn_crargs vn_a;
584 	int error;
585 
586 	as_rangelock(as);
587 	/* pass address of kernel mapping as offset to avoid VAC conflicts */
588 	map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
589 	if (addr == NULL) {
590 		as_rangeunlock(as);
591 		return (ENOMEM);
592 	}
593 
594 	/*
595 	 * Use segvn to set up the mapping to the page.
596 	 */
597 	vn_a.vp = NULL;
598 	vn_a.offset = 0;
599 	vn_a.cred = NULL;
600 	vn_a.type = MAP_SHARED;
601 	vn_a.prot = vn_a.maxprot = PROT_ALL;
602 	vn_a.flags = 0;
603 	vn_a.amp = amp;
604 	vn_a.szc = 0;
605 	vn_a.lgrp_mem_policy_flags = 0;
606 	error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
607 	as_rangeunlock(as);
608 
609 	if (error)
610 		return (error);
611 
612 	*uaddrp = addr;
613 	return (0);
614 }
615 
616 
617 /*
618  * Allocate a new page from anonymous memory.  Also, create a kernel
619  * mapping to the page and lock the page in memory.
620  */
621 static int
622 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
623 {
624 	struct anon_map *amp;
625 	caddr_t kaddr;
626 
627 	/*
628 	 * Set up anonymous memory struct.  No swap reservation is
629 	 * needed since the page will be locked into memory.
630 	 */
631 	amp = anonmap_alloc(PAGESIZE, 0, ANON_SLEEP);
632 
633 	/*
634 	 * Allocate the page.
635 	 */
636 	kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
637 	    KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
638 	if (kaddr == NULL) {
639 		amp->refcnt--;
640 		anonmap_free(amp);
641 		return (ENOMEM);
642 	}
643 
644 	/*
645 	 * The page is left SE_SHARED locked so that it won't be
646 	 * paged out or relocated (KPD_LOCKED above).
647 	 */
648 
649 	*newamp = amp;
650 	*newaddr = kaddr;
651 	return (0);
652 }
653 
654 
655 /*
656  * Take the necessary steps to allow a page to be released.
657  * This is called when the process is doing exit() or exec().
658  * There should be no accesses to the page after this.
659  * The kernel mapping of the page is released and the page is unlocked.
660  */
661 static void
662 schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
663 {
664 	/*
665 	 * Release the lock on the page and remove the kernel mapping.
666 	 */
667 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
668 	segkp_release(segkp, kaddr);
669 
670 	/*
671 	 * Decrement the refcnt so the anon_map structure will be freed.
672 	 */
673 	if (--amp->refcnt == 0) {
674 		/*
675 		 * The current process no longer has the page mapped, so
676 		 * we have to free everything rather than letting as_free
677 		 * do the work.
678 		 */
679 		anonmap_purge(amp);
680 		anon_free(amp->ahp, 0, PAGESIZE);
681 		ANON_LOCK_EXIT(&amp->a_rwlock);
682 		anonmap_free(amp);
683 	} else {
684 		ANON_LOCK_EXIT(&amp->a_rwlock);
685 	}
686 }
687