xref: /illumos-gate/usr/src/uts/common/os/schedctl.c (revision 355b4669e025ff377602b6fc7caaf30dbc218371)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/schedctl.h>
32 #include <sys/proc.h>
33 #include <sys/thread.h>
34 #include <sys/class.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/cmn_err.h>
38 #include <sys/stack.h>
39 #include <sys/debug.h>
40 #include <sys/cpuvar.h>
41 #include <sys/sobject.h>
42 #include <sys/door.h>
43 #include <sys/modctl.h>
44 #include <sys/syscall.h>
45 #include <sys/sysmacros.h>
46 #include <sys/vmsystm.h>
47 #include <sys/mman.h>
48 #include <sys/vnode.h>
49 #include <sys/swap.h>
50 #include <sys/lwp.h>
51 #include <sys/bitmap.h>
52 #include <sys/atomic.h>
53 #include <sys/fcntl.h>
54 #include <vm/seg_kp.h>
55 #include <vm/seg_vn.h>
56 #include <vm/as.h>
57 #include <fs/fs_subr.h>
58 
59 
60 /*
61  * Page handling structures.  This is set up as a list of per-page
62  * control structures (sc_page_ctl), with p->p_pagep pointing to
63  * the first.  The per-page structures point to the actual pages
64  * and contain pointers to the user address for each mapped page.
65  *
66  * All data is protected by p->p_sc_lock.  Since this lock is
67  * held while waiting for memory, schedctl_shared_alloc() should
68  * not be called while holding p_lock.
69  */
70 
71 typedef struct sc_page_ctl {
72 	struct sc_page_ctl *spc_next;
73 	sc_shared_t	*spc_base;	/* base of kernel page */
74 	sc_shared_t	*spc_end;	/* end of usable space */
75 	ulong_t		*spc_map;	/* bitmap of allocated space on page */
76 	size_t		spc_space;	/* amount of space on page */
77 	caddr_t		spc_uaddr;	/* user-level address of the page */
78 	struct anon_map	*spc_amp;	/* anonymous memory structure */
79 } sc_page_ctl_t;
80 
/*
 * Allocation geometry; all three values are computed by schedctl_init().
 */

/* Bytes of each page usable for sc_shared_t structures. */
static size_t	sc_pagesize;

/* Number of bits in a page's allocation bitmap (one per structure). */
static size_t	sc_bitmap_len;

/* Number of ulong_t words making up a page's allocation bitmap. */
static size_t	sc_bitmap_words;
84 
85 /* Context ops */
86 static void	schedctl_save(sc_shared_t *);
87 static void	schedctl_restore(sc_shared_t *);
88 static void	schedctl_fork(kthread_t *, kthread_t *);
89 
90 /* Functions for handling shared pages */
91 static int	schedctl_shared_alloc(sc_shared_t **, uintptr_t *);
92 static sc_page_ctl_t *schedctl_page_lookup(sc_shared_t *);
93 static int	schedctl_map(struct anon_map *, caddr_t *, caddr_t);
94 static int	schedctl_getpage(struct anon_map **, caddr_t *);
95 static void	schedctl_freepage(struct anon_map *, caddr_t);
96 
97 /*
98  * System call interface to scheduler activations.
99  * This always operates on the current lwp.
100  */
101 caddr_t
102 schedctl(void)
103 {
104 	kthread_t	*t = curthread;
105 	sc_shared_t	*ssp;
106 	uintptr_t	uaddr;
107 	int		error;
108 
109 	if (t->t_schedctl == NULL) {
110 		/*
111 		 * Allocate and initialize the shared structure.
112 		 */
113 		if ((error = schedctl_shared_alloc(&ssp, &uaddr)) != 0)
114 			return ((caddr_t)(uintptr_t)set_errno(error));
115 		bzero(ssp, sizeof (*ssp));
116 
117 		installctx(t, ssp, schedctl_save, schedctl_restore,
118 		    schedctl_fork, NULL, NULL, NULL);
119 
120 		thread_lock(t);	/* protect against ts_tick and ts_update */
121 		t->t_schedctl = ssp;
122 		t->t_sc_uaddr = uaddr;
123 		thread_unlock(t);
124 	}
125 
126 	return ((caddr_t)t->t_sc_uaddr);
127 }
128 
129 
130 /*
131  * Clean up scheduler activations state associated with an exiting
132  * (or execing) lwp.  t is always the current thread.
133  */
134 void
135 schedctl_lwp_cleanup(kthread_t *t)
136 {
137 	sc_shared_t	*ssp = t->t_schedctl;
138 	proc_t		*p = ttoproc(t);
139 	sc_page_ctl_t	*pagep;
140 	index_t		index;
141 
142 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
143 
144 	thread_lock(t);		/* protect against ts_tick and ts_update */
145 	t->t_schedctl = NULL;
146 	t->t_sc_uaddr = 0;
147 	thread_unlock(t);
148 
149 	/*
150 	 * Remove the context op to avoid the final call to
151 	 * schedctl_save when switching away from this lwp.
152 	 */
153 	(void) removectx(t, ssp, schedctl_save, schedctl_restore,
154 	    schedctl_fork, NULL, NULL, NULL);
155 
156 	/*
157 	 * Do not unmap the shared page until the process exits.
158 	 * User-level library code relies on this for adaptive mutex locking.
159 	 */
160 	mutex_enter(&p->p_sc_lock);
161 	ssp->sc_state = SC_FREE;
162 	pagep = schedctl_page_lookup(ssp);
163 	index = (index_t)(ssp - pagep->spc_base);
164 	BT_CLEAR(pagep->spc_map, index);
165 	pagep->spc_space += sizeof (sc_shared_t);
166 	mutex_exit(&p->p_sc_lock);
167 }
168 
169 /*
170  * Cleanup the list of schedctl shared pages for the process.
171  * Called from exec() and exit() system calls.
172  */
173 void
174 schedctl_proc_cleanup()
175 {
176 	proc_t *p = curproc;
177 	sc_page_ctl_t *pagep;
178 	sc_page_ctl_t *next;
179 
180 	ASSERT(p->p_lwpcnt == 1);	/* we are single-threaded now */
181 	ASSERT(curthread->t_schedctl == NULL);
182 
183 	/*
184 	 * Since we are single-threaded, we don't have to hold p->p_sc_lock.
185 	 */
186 	pagep = p->p_pagep;
187 	p->p_pagep = NULL;
188 	while (pagep != NULL) {
189 		ASSERT(pagep->spc_space == sc_pagesize);
190 		next = pagep->spc_next;
191 		/*
192 		 * Unmap the user space and free the mapping structure.
193 		 */
194 		(void) as_unmap(p->p_as, pagep->spc_uaddr, PAGESIZE);
195 		schedctl_freepage(pagep->spc_amp, (caddr_t)(pagep->spc_base));
196 		kmem_free(pagep->spc_map, sizeof (ulong_t) * sc_bitmap_words);
197 		kmem_free(pagep, sizeof (sc_page_ctl_t));
198 		pagep = next;
199 	}
200 }
201 
202 /*
203  * Called by resume just before switching away from the current thread.
204  * Save new thread state.
205  */
206 void
207 schedctl_save(sc_shared_t *ssp)
208 {
209 	ssp->sc_state = curthread->t_state;
210 }
211 
212 
213 /*
214  * Called by resume after switching to the current thread.
215  * Save new thread state and CPU.
216  */
217 void
218 schedctl_restore(sc_shared_t *ssp)
219 {
220 	ssp->sc_state = SC_ONPROC;
221 	ssp->sc_cpu = CPU->cpu_id;
222 }
223 
224 
225 /*
226  * On fork, remove inherited mappings from the child's address space.
227  * The child's threads must call schedctl() to get new shared mappings.
228  */
229 void
230 schedctl_fork(kthread_t *pt, kthread_t *ct)
231 {
232 	proc_t *pp = ttoproc(pt);
233 	proc_t *cp = ttoproc(ct);
234 	sc_page_ctl_t *pagep;
235 
236 	ASSERT(ct->t_schedctl == NULL);
237 
238 	/*
239 	 * Do this only once, whether we are doing fork1() or forkall().
240 	 * Don't do it at all if the child process is a child of vfork()
241 	 * because a child of vfork() borrows the parent's address space.
242 	 */
243 	if (pt != curthread || (cp->p_flag & SVFORK))
244 		return;
245 
246 	mutex_enter(&pp->p_sc_lock);
247 	for (pagep = pp->p_pagep; pagep != NULL; pagep = pagep->spc_next)
248 		(void) as_unmap(cp->p_as, pagep->spc_uaddr, PAGESIZE);
249 	mutex_exit(&pp->p_sc_lock);
250 }
251 
252 /*
253  * Returns non-zero if the specified thread shouldn't be preempted at this time.
254  * Called by ts_preempt, ts_tick, and ts_update.
255  */
256 int
257 schedctl_get_nopreempt(kthread_t *t)
258 {
259 	ASSERT(THREAD_LOCK_HELD(t));
260 	return (t->t_schedctl->sc_preemptctl.sc_nopreempt);
261 }
262 
263 
264 /*
265  * Sets the value of the nopreempt field for the specified thread.
266  * Called by ts_preempt to clear the field on preemption.
267  */
268 void
269 schedctl_set_nopreempt(kthread_t *t, short val)
270 {
271 	ASSERT(THREAD_LOCK_HELD(t));
272 	t->t_schedctl->sc_preemptctl.sc_nopreempt = val;
273 }
274 
275 
276 /*
277  * Sets the value of the yield field for the specified thread.  Called by
278  * ts_preempt and ts_tick to set the field, and ts_yield to clear it.
279  * The kernel never looks at this field so we don't need a schedctl_get_yield
280  * function.
281  */
282 void
283 schedctl_set_yield(kthread_t *t, short val)
284 {
285 	ASSERT(THREAD_LOCK_HELD(t));
286 	t->t_schedctl->sc_preemptctl.sc_yield = val;
287 }
288 
289 
290 /*
291  * Returns non-zero if the specified thread has requested that all
292  * signals be blocked.  Called by signal-related code that tests
293  * the signal mask of a thread that may not be the current thread
294  * and where the process's p_lock cannot be acquired.
295  */
296 int
297 schedctl_sigblock(kthread_t *t)
298 {
299 	sc_shared_t *tdp = t->t_schedctl;
300 
301 	if (tdp)
302 		return (tdp->sc_sigblock);
303 	return (0);
304 }
305 
306 
307 /*
308  * If the sc_sigblock field is set for the specified thread, set
309  * its signal mask to block all maskable signals, then clear the
310  * sc_sigblock field.  This finishes what user-level code requested
311  * to be done when it set tdp->sc_shared->sc_sigblock non-zero.
312  * Called by signal-related code that holds the process's p_lock.
313  */
314 void
315 schedctl_finish_sigblock(kthread_t *t)
316 {
317 	sc_shared_t *tdp = t->t_schedctl;
318 
319 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
320 
321 	if (tdp && tdp->sc_sigblock) {
322 		t->t_hold.__sigbits[0] = FILLSET0 & ~CANTMASK0;
323 		t->t_hold.__sigbits[1] = FILLSET1 & ~CANTMASK1;
324 		tdp->sc_sigblock = 0;
325 	}
326 }
327 
328 
329 /*
330  * Return non-zero if the current thread has declared that
331  * it is calling into the kernel to park, else return zero.
332  */
333 int
334 schedctl_is_park()
335 {
336 	sc_shared_t *tdp = curthread->t_schedctl;
337 
338 	if (tdp)
339 		return (tdp->sc_park);
340 	/*
341 	 * If we're here and there is no shared memory (how could
342 	 * that happen?) then just assume we really are here to park.
343 	 */
344 	return (1);
345 }
346 
347 
348 /*
349  * Clear the shared sc_park flag on return from parking in the kernel.
350  */
351 void
352 schedctl_unpark()
353 {
354 	sc_shared_t *tdp = curthread->t_schedctl;
355 
356 	if (tdp)
357 		tdp->sc_park = 0;
358 }
359 
360 
361 /*
362  * Page handling code.
363  */
364 
365 void
366 schedctl_init()
367 {
368 	/*
369 	 * Amount of page that can hold sc_shared_t structures.  If
370 	 * sizeof (sc_shared_t) is a power of 2, this should just be
371 	 * PAGESIZE.
372 	 */
373 	sc_pagesize = PAGESIZE - (PAGESIZE % sizeof (sc_shared_t));
374 
375 	/*
376 	 * Allocation bitmap is one bit per struct on a page.
377 	 */
378 	sc_bitmap_len = sc_pagesize / sizeof (sc_shared_t);
379 	sc_bitmap_words = howmany(sc_bitmap_len, BT_NBIPUL);
380 }
381 
382 int
383 schedctl_shared_alloc(sc_shared_t **kaddrp, uintptr_t *uaddrp)
384 {
385 	proc_t		*p = curproc;
386 	sc_page_ctl_t	*pagep;
387 	sc_shared_t	*ssp;
388 	caddr_t		base;
389 	index_t		index;
390 	int		error;
391 
392 	ASSERT(MUTEX_NOT_HELD(&p->p_lock));
393 	mutex_enter(&p->p_sc_lock);
394 
395 	/*
396 	 * Try to find space for the new data in existing pages
397 	 * within the process's list of shared pages.
398 	 */
399 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next)
400 		if (pagep->spc_space != 0)
401 			break;
402 
403 	if (pagep != NULL)
404 		base = pagep->spc_uaddr;
405 	else {
406 		struct anon_map *amp;
407 		caddr_t kaddr;
408 
409 		/*
410 		 * No room, need to allocate a new page.  Also set up
411 		 * a mapping to the kernel address space for the new
412 		 * page and lock it in memory.
413 		 */
414 		if ((error = schedctl_getpage(&amp, &kaddr)) != 0) {
415 			mutex_exit(&p->p_sc_lock);
416 			return (error);
417 		}
418 		if ((error = schedctl_map(amp, &base, kaddr)) != 0) {
419 			schedctl_freepage(amp, kaddr);
420 			mutex_exit(&p->p_sc_lock);
421 			return (error);
422 		}
423 
424 		/*
425 		 * Allocate and initialize the page control structure.
426 		 */
427 		pagep = kmem_alloc(sizeof (sc_page_ctl_t), KM_SLEEP);
428 		pagep->spc_amp = amp;
429 		pagep->spc_base = (sc_shared_t *)kaddr;
430 		pagep->spc_end = (sc_shared_t *)(kaddr + sc_pagesize);
431 		pagep->spc_uaddr = base;
432 
433 		pagep->spc_map = kmem_zalloc(sizeof (ulong_t) * sc_bitmap_words,
434 		    KM_SLEEP);
435 		pagep->spc_space = sc_pagesize;
436 
437 		pagep->spc_next = p->p_pagep;
438 		p->p_pagep = pagep;
439 	}
440 
441 	/*
442 	 * Got a page, now allocate space for the data.  There should
443 	 * be space unless something's wrong.
444 	 */
445 	ASSERT(pagep != NULL && pagep->spc_space >= sizeof (sc_shared_t));
446 	index = bt_availbit(pagep->spc_map, sc_bitmap_len);
447 	ASSERT(index != -1);
448 
449 	/*
450 	 * Get location with pointer arithmetic.  spc_base is of type
451 	 * sc_shared_t *.  Mark as allocated.
452 	 */
453 	ssp = pagep->spc_base + index;
454 	BT_SET(pagep->spc_map, index);
455 	pagep->spc_space -= sizeof (sc_shared_t);
456 
457 	mutex_exit(&p->p_sc_lock);
458 
459 	/*
460 	 * Return kernel and user addresses.
461 	 */
462 	*kaddrp = ssp;
463 	*uaddrp = (uintptr_t)base + ((uintptr_t)ssp & PAGEOFFSET);
464 	return (0);
465 }
466 
467 
468 /*
469  * Find the page control structure corresponding to a kernel address.
470  */
471 static sc_page_ctl_t *
472 schedctl_page_lookup(sc_shared_t *ssp)
473 {
474 	proc_t *p = curproc;
475 	sc_page_ctl_t *pagep;
476 
477 	ASSERT(MUTEX_HELD(&p->p_sc_lock));
478 	for (pagep = p->p_pagep; pagep != NULL; pagep = pagep->spc_next) {
479 		if (ssp >= pagep->spc_base && ssp < pagep->spc_end)
480 			return (pagep);
481 	}
482 	return (NULL);		/* This "can't happen".  Should we panic? */
483 }
484 
485 
486 /*
487  * This function is called when a page needs to be mapped into a
488  * process's address space.  Allocate the user address space and
489  * set up the mapping to the page.  Assumes the page has already
490  * been allocated and locked in memory via schedctl_getpage.
491  */
492 static int
493 schedctl_map(struct anon_map *amp, caddr_t *uaddrp, caddr_t kaddr)
494 {
495 	caddr_t addr;
496 	struct as *as = curproc->p_as;
497 	struct segvn_crargs vn_a;
498 	int error;
499 
500 	as_rangelock(as);
501 	/* pass address of kernel mapping as offset to avoid VAC conflicts */
502 	map_addr(&addr, PAGESIZE, (offset_t)(uintptr_t)kaddr, 1, 0);
503 	if (addr == NULL) {
504 		as_rangeunlock(as);
505 		return (ENOMEM);
506 	}
507 
508 	/*
509 	 * Use segvn to set up the mapping to the page.
510 	 */
511 	vn_a.vp = NULL;
512 	vn_a.offset = 0;
513 	vn_a.cred = NULL;
514 	vn_a.type = MAP_SHARED;
515 	vn_a.prot = vn_a.maxprot = PROT_ALL;
516 	vn_a.flags = 0;
517 	vn_a.amp = amp;
518 	vn_a.szc = 0;
519 	vn_a.lgrp_mem_policy_flags = 0;
520 	error = as_map(as, addr, PAGESIZE, segvn_create, &vn_a);
521 	as_rangeunlock(as);
522 
523 	if (error)
524 		return (error);
525 
526 	*uaddrp = addr;
527 	return (0);
528 }
529 
530 
531 /*
532  * Allocate a new page from anonymous memory.  Also, create a kernel
533  * mapping to the page and lock the page in memory.
534  */
535 static int
536 schedctl_getpage(struct anon_map **newamp, caddr_t *newaddr)
537 {
538 	struct anon_map *amp;
539 	caddr_t kaddr;
540 
541 	/*
542 	 * Set up anonymous memory struct.  No swap reservation is
543 	 * needed since the page will be locked into memory.
544 	 */
545 	amp = anonmap_alloc(PAGESIZE, PAGESIZE);
546 
547 	/*
548 	 * Allocate the page.
549 	 */
550 	kaddr = segkp_get_withanonmap(segkp, PAGESIZE, KPD_LOCKED | KPD_ZERO,
551 	    amp);
552 	if (kaddr == NULL) {
553 		amp->refcnt--;
554 		anonmap_free(amp);
555 		return (ENOMEM);
556 	}
557 
558 	/*
559 	 * The page is left SE_SHARED locked so that it won't be
560 	 * paged out or relocated (KPD_LOCKED above).
561 	 */
562 
563 	*newamp = amp;
564 	*newaddr = kaddr;
565 	return (0);
566 }
567 
568 
569 /*
570  * Take the necessary steps to allow a page to be released.
571  * This is called when the process is doing exit() or exec().
572  * There should be no accesses to the page after this.
573  * The kernel mapping of the page is released and the page is unlocked.
574  */
575 static void
576 schedctl_freepage(struct anon_map *amp, caddr_t kaddr)
577 {
578 	/*
579 	 * Release the lock on the page and remove the kernel mapping.
580 	 */
581 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
582 	segkp_release(segkp, kaddr);
583 
584 	/*
585 	 * Decrement the refcnt so the anon_map structure will be freed.
586 	 */
587 	if (--amp->refcnt == 0) {
588 		/*
589 		 * The current process no longer has the page mapped, so
590 		 * we have to free everything rather than letting as_free
591 		 * do the work.
592 		 */
593 		anon_free(amp->ahp, 0, PAGESIZE);
594 		ANON_LOCK_EXIT(&amp->a_rwlock);
595 		anonmap_free(amp);
596 	} else {
597 		ANON_LOCK_EXIT(&amp->a_rwlock);
598 	}
599 }
600