xref: /illumos-gate/usr/src/uts/common/os/schedctl.c (revision f3af49816e370d667d566ab703e94b81305a536e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/systm.h>
30 #include <sys/schedctl.h>
31 #include <sys/proc.h>
32 #include <sys/thread.h>
33 #include <sys/class.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/cmn_err.h>
37 #include <sys/stack.h>
38 #include <sys/debug.h>
39 #include <sys/cpuvar.h>
40 #include <sys/sobject.h>
41 #include <sys/door.h>
42 #include <sys/modctl.h>
43 #include <sys/syscall.h>
44 #include <sys/sysmacros.h>
45 #include <sys/vmsystm.h>
46 #include <sys/mman.h>
47 #include <sys/vnode.h>
48 #include <sys/swap.h>
49 #include <sys/lwp.h>
50 #include <sys/bitmap.h>
51 #include <sys/atomic.h>
52 #include <sys/fcntl.h>
53 #include <vm/seg_kp.h>
54 #include <vm/seg_vn.h>
55 #include <vm/as.h>
56 #include <fs/fs_subr.h>
57 
58 
59 /*
60  * Page handling structures.  This is set up as a list of per-page
61  * control structures (sc_page_ctl), with p->p_pagep pointing to
62  * the first.  The per-page structures point to the actual pages
63  * and contain pointers to the user address for each mapped page.
64  *
65  * All data is protected by p->p_sc_lock.  Since this lock is
66  * held while waiting for memory, schedctl_shared_alloc() should
67  * not be called while holding p_lock.
68  */
69 
70 typedef struct sc_page_ctl {
71 	struct sc_page_ctl *spc_next;
72 	sc_shared_t	*spc_base;	/* base of kernel page */
73 	sc_shared_t	*spc_end;	/* end of usable space */
74 	ulong_t		*spc_map;	/* bitmap of allocated space on page */
75 	size_t		spc_space;	/* amount of space on page */
76 	caddr_t		spc_uaddr;	/* user-level address of the page */
77 	struct anon_map	*spc_amp;	/* anonymous memory structure */
78 } sc_page_ctl_t;
79 
80 static size_t	sc_pagesize;		/* size of usable space on page */
81 static size_t	sc_bitmap_len;		/* # of bits in allocation bitmap */
82 static size_t	sc_bitmap_words;	/* # of words in allocation bitmap */
83 
84 /* Context ops */
85 static void	schedctl_save(sc_shared_t *);
86 static void	schedctl_restore(sc_shared_t *);
87 static void	schedctl_fork(kthread_t *, kthread_t *);
88 
89 /* Functions for handling shared pages */
90 static int	schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
91 static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
92 static int	schedctl_map(struct anon_map *, caddr_t *, caddr_t);
93 static int	schedctl_getpage(struct anon_map **, caddr_t *);
94 static void	schedctl_freepage(struct anon_map *, caddr_t);
95 
96 /*
97  * System call interface to scheduler activations.
98  * This always operates on the current lwp.
99  */
100 caddr_t
101 schedctl(void)
102 {
103 	kthread_t	*t = curthread;
104 	sc_shared_t	*ssp;
105 	uintptr_t	uaddr;
106 	int		error;
107 
108 	if (t->t_schedctl == NULL) {
109 		/*
110 		 * Allocate and initialize the shared structure.
111 		 */
112 		if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
113 			return ((caddr_t)(uintptr_t)set_errno(error));
114 		bzero(ssp, sizeof (*ssp));
115 
116 		installctx(t, ssp, schedctl_save, schedctl_restore,
117 		    schedctl_fork, NULL, NULL, NULL);
118 
119 		thread_lock(t);	/* protect against ts_tick and ts_update */
120 		t->t_schedctl = ssp;
121 		t->t_sc_uaddr = uaddr;
122 		thread_unlock(t);
123 	}
124 
125 	return ((caddr_t)t->t_sc_uaddr);
126 }
127 
128 
129 /*
130  * Clean up scheduler activations state associated with an exiting
131  * (or execing) lwp.  t is always the current thread.
132  */
133 void
134 schedctl_lwp_cleanup(kthread_t *t)
135 {
136 	sc_shared_t	*ssp = t->t_schedctl;
137 	proc_t		*p = ttoproc(t);
138 	sc_page_ctl_t	*pagep;
139 	index_t		index;
140 
141 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
142 
143 	thread_lock(t);		/* protect against ts_tick and ts_update */
144 	t->t_schedctl = NULL;
145 	t->t_sc_uaddr = 0;
146 	thread_unlock(t);
147 
148 	/*
149 	 * Remove the context op to avoid the final call to
150 	 * schedctl_save when switching away from this lwp.
151 	 */
152 	(void) removectx(t, ssp, schedctl_save, schedctl_restore,
153 	    schedctl_fork, NULL, NULL, NULL);
154 
155 	/*
156 	 * Do not unmap the shared page until the process exits.
157 	 * User-level library code relies on this for adaptive mutex locking.
158 	 */
159 	mutex_enter(&p->p_sc_lock);
160 	ssp->sc_state = SC_FREE;
161 	pagep = schedctl_page_lookup(ssp);
162 	index = (index_t)(ssp - pagep->spc_base);
163 	BT_CLEAR(pagep->spc_map, index);
164 	pagep->spc_space += sizeof (sc_shared_t);
165 	mutex_exit(&p->p_sc_lock);
166 }
167 
168 /*
169  * Cleanup the list of schedctl shared pages for the process.
170  * Called from exec() and exit() system calls.
171  */
172 void
173 schedctl_proc_cleanup()
174 {
175 	proc_t *p = curproc;
176 	sc_page_ctl_t *pagep;
177 	sc_page_ctl_t *next;
178 
179 	ASSERT(p->p_lwpcnt == 1);	/* we are single-threaded now */
180 	ASSERT(curthread->t_schedctl == NULL);
181 
182 	/*
183 	 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
184 	 */
185 	pagep = p->p_pagep;
186 	p->p_pagep = NULL;
187 	while (pagep != NULL) {
188 		ASSERT(pagep->spc_space == sc_pagesize);
189 		next = pagep->spc_next;
190 		/*
191 		 * Unmap the user space and free the mapping structure.
192 		 */
193 		(void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
194 		schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
195 		kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
196 		kmem_free(pagep, sizeof (sc_page_ctl_t));
197 		pagep = next;
198 	}
199 }
200 
201 /*
202  * Called by resume just before switching away from the current thread.
203  * Save new thread state.
204  */
205 void
206 schedctl_save(sc_shared_t *ssp)
207 {
208 	ssp->sc_state = curthread->t_state;
209 }
210 
211 
212 /*
213  * Called by resume after switching to the current thread.
214  * Save new thread state and CPU.
215  */
216 void
217 schedctl_restore(sc_shared_t *ssp)
218 {
219 	ssp->sc_state = SC_ONPROC;
220 	ssp->sc_cpu = CPU->cpu_id;
221 }
222 
223 
224 /*
225  * On fork, remove inherited mappings from the child's address space.
226  * The child's threads must call schedctl() to get new shared mappings.
227  */
228 void
229 schedctl_fork(kthread_t *pt, kthread_t *ct)
230 {
231 	proc_t *pp = ttoproc(pt);
232 	proc_t *cp = ttoproc(ct);
233 	sc_page_ctl_t *pagep;
234 
235 	ASSERT(ct->t_schedctl == NULL);
236 
237 	/*
238 	 * Do this only once, whether we are doing fork1() or forkall().
239 	 * Don't do it at all if the child process is a child of vfork()
240 	 * because a child of vfork() borrows the parent's address space.
241 	 */
242 	if (pt != curthread || (cp->p_flag & SVFORK))
243 		return;
244 
245 	mutex_enter(&pp->p_sc_lock);
246 	for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
247 		(void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
248 	mutex_exit(&pp->p_sc_lock);
249 }
250 
251 /*
252  * Returns non-zero if the specified thread shouldn't be preempted at this time.
253  * Called by ts_preempt, ts_tick, and ts_update.
254  */
255 int
256 schedctl_get_nopreempt(kthread_t *t)
257 {
258 	ASSERT(THREAD_LOCK_HELD(t));
259 	return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
260 }
261 
262 
263 /*
264  * Sets the value of the nopreempt field for the specified thread.
265  * Called by ts_preempt to clear the field on preemption.
266  */
267 void
268 schedctl_set_nopreempt(kthread_t *t, short val)
269 {
270 	ASSERT(THREAD_LOCK_HELD(t));
271 	t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
272 }
273 
274 
275 /*
276  * Sets the value of the yield field for the specified thread.  Called by
277  * ts_preempt and ts_tick to set the field, and ts_yield to clear it.
278  * The kernel never looks at this field so we don't need a schedctl_get_yield
279  * function.
280  */
281 void
282 schedctl_set_yield(kthread_t *t, short val)
283 {
284 	ASSERT(THREAD_LOCK_HELD(t));
285 	t->t_schedctl->sc_preemptctl.sc_yield = val;
286 }
287 
288 
289 /*
290  * Returns non-zero if the specified thread has requested that all
291  * signals be blocked.  Called by signal-related code that tests
292  * the signal mask of a thread that may not be the current thread
293  * and where the process's p_lock cannot be acquired.
294  */
295 int
296 schedctl_sigblock(kthread_t *t)
297 {
298 	sc_shared_t *tdp = t->t_schedctl;
299 
300 	if (tdp != NULL)
301 		return (tdp->sc_sigblock);
302 	return (0);
303 }
304 
305 
306 /*
307  * If the sc_sigblock field is set for the specified thread, set
308  * its signal mask to block all maskable signals, then clear the
309  * sc_sigblock field.  This finishes what user-level code requested
310  * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
311  * Called by signal-related code that holds the process's p_lock.
312  */
313 void
314 schedctl_finish_sigblock(kthread_t *t)
315 {
316 	sc_shared_t *tdp = t->t_schedctl;
317 
318 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
319 
320 	if (tdp != NULL && tdp->sc_sigblock) {
321 		t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
322 		t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
323 		tdp->sc_sigblock = 0;
324 	}
325 }
326 
327 
328 /*
329  * Return non-zero if the current thread has declared that
330  * it is calling into the kernel to park, else return zero.
331  */
332 int
333 schedctl_is_park()
334 {
335 	sc_shared_t *tdp = curthread->t_schedctl;
336 
337 	if (tdp != NULL)
338 		return (tdp->sc_park);
339 	/*
340 	 * If we're here and there is no shared memory (how could
341 	 * that happen?) then just assume we really are here to park.
342 	 */
343 	return (1);
344 }
345 
346 /*
347  * Declare thread is parking.
348  *
349  * libc will set "sc_park = 1" before calling lwpsys_park(0, tid) in order
350  * to declare that the thread is calling into the kernel to park.
351  *
352  * This interface exists ONLY to support older versions of libthread which
353  * are not aware of the sc_park flag.
354  *
355  * Older versions of libthread which are not aware of the sc_park flag need to
356  * be modified or emulated to call lwpsys_park(4, ...) instead of
357  * lwpsys_park(0, ...).  This will invoke schedctl_set_park() before
358  * lwp_park() to declare that the thread is parking.
359  */
360 void
361 schedctl_set_park()
362 {
363 	sc_shared_t *tdp = curthread->t_schedctl;
364 
365 	if (tdp != NULL)
366 		tdp->sc_park = 1;
367 }
368 
369 /*
370  * Clear the shared sc_park flag on return from parking in the kernel.
371  */
372 void
373 schedctl_unpark()
374 {
375 	sc_shared_t *tdp = curthread->t_schedctl;
376 
377 	if (tdp != NULL)
378 		tdp->sc_park = 0;
379 }
380 
381 
382 /*
383  * Page handling code.
384  */
385 
386 void
387 schedctl_init()
388 {
389 	/*
390 	 * Amount of page that can hold sc_shared_t structures.  If
391 	 * sizeof (sc_shared_t) is a power of 2, this should just be
392 	 * PAGESIZE.
393 	 */
394 	sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));
395 
396 	/*
397 	 * Allocation bitmap is one bit per struct on a page.
398 	 */
399 	sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
400 	sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
401 }
402 
403 int
404 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
405 {
406 	proc_t		*p = curproc;
407 	sc_page_ctl_t	*pagep;
408 	sc_shared_t	*ssp;
409 	caddr_t		base;
410 	index_t		index;
411 	int		error;
412 
413 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
414 	mutex_enter(&p->p_sc_lock);
415 
416 	/*
417 	 * Try to find space for the new data in existing pages
418 	 * within the process's list of shared pages.
419 	 */
420 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
421 		if (pagep->spc_space != 0)
422 			break;
423 
424 	if (pagep != NULL)
425 		base = pagep->spc_uaddr;
426 	else {
427 		struct anon_map *amp;
428 		caddr_t kaddr;
429 
430 		/*
431 		 * No room, need to allocate a new page.  Also set up
432 		 * a mapping to the kernel address space for the new
433 		 * page and lock it in memory.
434 		 */
435 		if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
436 			mutex_exit(&p->p_sc_lock);
437 			return (error);
438 		}
439 		if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
440 			schedctl_freepage(amp, kaddr);
441 			mutex_exit(&p->p_sc_lock);
442 			return (error);
443 		}
444 
445 		/*
446 		 * Allocate and initialize the page control structure.
447 		 */
448 		pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
449 		pagep->spc_amp = amp;
450 		pagep->spc_base = (sc_shared_t *)kaddr;
451 		pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
452 		pagep->spc_uaddr = base;
453 
454 		pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
455 		    KM_SLEEP);
456 		pagep->spc_space = sc_pagesize;
457 
458 		pagep->spc_next = p->p_pagep;
459 		p->p_pagep = pagep;
460 	}
461 
462 	/*
463 	 * Got a page, now allocate space for the data.  There should
464 	 * be space unless something's wrong.
465 	 */
466 	ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
467 	index = bt_availbit(pagep->spc_map, sc_bitmap_len);
468 	ASSERT(index != -1);
469 
470 	/*
471 	 * Get location with pointer arithmetic.  spc_base is of type
472 	 * sc_shared_t *.  Mark as allocated.
473 	 */
474 	ssp = pagep->spc_base + index;
475 	BT_SET(pagep->spc_map, index);
476 	pagep->spc_space -= sizeof (sc_shared_t);
477 
478 	mutex_exit(&p->p_sc_lock);
479 
480 	/*
481 	 * Return kernel and user addresses.
482 	 */
483 	*kaddrp = ssp;
484 	*uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
485 	return (0);
486 }
487 
488 
489 /*
490  * Find the page control structure corresponding to a kernel address.
491  */
492 static sc_page_ctl_t *
493 schedctl_page_lookup(sc_shared_t *ssp)
494 {
495 	proc_t *p = curproc;
496 	sc_page_ctl_t *pagep;
497 
498 	ASSERT(MUTEX_HELD(&p->p_sc_lock));
499 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
500 		if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
501 			return (pagep);
502 	}
503 	return (NULL);		/* This "can't happen".  Should we panic? */
504 }
505 
506 
507 /*
508  * This function is called when a page needs to be mapped into a
509  * process's address space.  Allocate the user address space and
510  * set up the mapping to the page.  Assumes the page has already
511  * been allocated and locked in memory via schedctl_getpage.
512  */
513 static int
514 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
515 {
516 	caddr_t addr;
517 	struct as *as = curproc->p_as;
518 	struct segvn_crargs vn_a;
519 	int error;
520 
521 	as_rangelock(as);
522 	/* pass address of kernel mapping as offset to avoid VAC conflicts */
523 	map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
524 	if (addr == NULL) {
525 		as_rangeunlock(as);
526 		return (ENOMEM);
527 	}
528 
529 	/*
530 	 * Use segvn to set up the mapping to the page.
531 	 */
532 	vn_a.vp = NULL;
533 	vn_a.offset = 0;
534 	vn_a.cred = NULL;
535 	vn_a.type = MAP_SHARED;
536 	vn_a.prot = vn_a.maxprot = PROT_ALL;
537 	vn_a.flags = 0;
538 	vn_a.amp = amp;
539 	vn_a.szc = 0;
540 	vn_a.lgrp_mem_policy_flags = 0;
541 	error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
542 	as_rangeunlock(as);
543 
544 	if (error)
545 		return (error);
546 
547 	*uaddrp = addr;
548 	return (0);
549 }
550 
551 
552 /*
553  * Allocate a new page from anonymous memory.  Also, create a kernel
554  * mapping to the page and lock the page in memory.
555  */
556 static int
557 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
558 {
559 	struct anon_map *amp;
560 	caddr_t kaddr;
561 
562 	/*
563 	 * Set up anonymous memory struct.  No swap reservation is
564 	 * needed since the page will be locked into memory.
565 	 */
566 	amp = anonmap_alloc(PAGESIZE, 0);
567 
568 	/*
569 	 * Allocate the page.
570 	 */
571 	kaddr = segkp_get_withanonmap(segkp, PAGESIZE,
572 	    KPD_NO_ANON | KPD_LOCKED | KPD_ZERO, amp);
573 	if (kaddr == NULL) {
574 		amp->refcnt--;
575 		anonmap_free(amp);
576 		return (ENOMEM);
577 	}
578 
579 	/*
580 	 * The page is left SE_SHARED locked so that it won't be
581 	 * paged out or relocated (KPD_LOCKED above).
582 	 */
583 
584 	*newamp = amp;
585 	*newaddr = kaddr;
586 	return (0);
587 }
588 
589 
590 /*
591  * Take the necessary steps to allow a page to be released.
592  * This is called when the process is doing exit() or exec().
593  * There should be no accesses to the page after this.
594  * The kernel mapping of the page is released and the page is unlocked.
595  */
596 static void
597 schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
598 {
599 	/*
600 	 * Release the lock on the page and remove the kernel mapping.
601 	 */
602 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
603 	segkp_release(segkp, kaddr);
604 
605 	/*
606 	 * Decrement the refcnt so the anon_map structure will be freed.
607 	 */
608 	if (--amp->refcnt == 0) {
609 		/*
610 		 * The current process no longer has the page mapped, so
611 		 * we have to free everything rather than letting as_free
612 		 * do the work.
613 		 */
614 		anon_free(amp->ahp, 0, PAGESIZE);
615 		ANON_LOCK_EXIT(&amp->a_rwlock);
616 		anonmap_free(amp);
617 	} else {
618 		ANON_LOCK_EXIT(&amp->a_rwlock);
619 	}
620 }
621