xref: /freebsd/sys/vm/vm_glue.c (revision be9968363742a7b369819f598be0d2f08f1c92ab)
160727d8bSWarner Losh /*-
2df8bae1dSRodney W. Grimes  * Copyright (c) 1991, 1993
3df8bae1dSRodney W. Grimes  *	The Regents of the University of California.  All rights reserved.
4df8bae1dSRodney W. Grimes  *
5df8bae1dSRodney W. Grimes  * This code is derived from software contributed to Berkeley by
6df8bae1dSRodney W. Grimes  * The Mach Operating System project at Carnegie-Mellon University.
7df8bae1dSRodney W. Grimes  *
8df8bae1dSRodney W. Grimes  * Redistribution and use in source and binary forms, with or without
9df8bae1dSRodney W. Grimes  * modification, are permitted provided that the following conditions
10df8bae1dSRodney W. Grimes  * are met:
11df8bae1dSRodney W. Grimes  * 1. Redistributions of source code must retain the above copyright
12df8bae1dSRodney W. Grimes  *    notice, this list of conditions and the following disclaimer.
13df8bae1dSRodney W. Grimes  * 2. Redistributions in binary form must reproduce the above copyright
14df8bae1dSRodney W. Grimes  *    notice, this list of conditions and the following disclaimer in the
15df8bae1dSRodney W. Grimes  *    documentation and/or other materials provided with the distribution.
16df8bae1dSRodney W. Grimes  * 4. Neither the name of the University nor the names of its contributors
17df8bae1dSRodney W. Grimes  *    may be used to endorse or promote products derived from this software
18df8bae1dSRodney W. Grimes  *    without specific prior written permission.
19df8bae1dSRodney W. Grimes  *
20df8bae1dSRodney W. Grimes  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21df8bae1dSRodney W. Grimes  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22df8bae1dSRodney W. Grimes  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23df8bae1dSRodney W. Grimes  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24df8bae1dSRodney W. Grimes  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25df8bae1dSRodney W. Grimes  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26df8bae1dSRodney W. Grimes  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27df8bae1dSRodney W. Grimes  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28df8bae1dSRodney W. Grimes  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29df8bae1dSRodney W. Grimes  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30df8bae1dSRodney W. Grimes  * SUCH DAMAGE.
31df8bae1dSRodney W. Grimes  *
323c4dd356SDavid Greenman  *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
33df8bae1dSRodney W. Grimes  *
34df8bae1dSRodney W. Grimes  *
35df8bae1dSRodney W. Grimes  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36df8bae1dSRodney W. Grimes  * All rights reserved.
37df8bae1dSRodney W. Grimes  *
38df8bae1dSRodney W. Grimes  * Permission to use, copy, modify and distribute this software and
39df8bae1dSRodney W. Grimes  * its documentation is hereby granted, provided that both the copyright
40df8bae1dSRodney W. Grimes  * notice and this permission notice appear in all copies of the
41df8bae1dSRodney W. Grimes  * software, derivative works or modified versions, and any portions
42df8bae1dSRodney W. Grimes  * thereof, and that both notices appear in supporting documentation.
43df8bae1dSRodney W. Grimes  *
44df8bae1dSRodney W. Grimes  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
45df8bae1dSRodney W. Grimes  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
46df8bae1dSRodney W. Grimes  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
47df8bae1dSRodney W. Grimes  *
48df8bae1dSRodney W. Grimes  * Carnegie Mellon requests users of this software to return to
49df8bae1dSRodney W. Grimes  *
50df8bae1dSRodney W. Grimes  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
51df8bae1dSRodney W. Grimes  *  School of Computer Science
52df8bae1dSRodney W. Grimes  *  Carnegie Mellon University
53df8bae1dSRodney W. Grimes  *  Pittsburgh PA 15213-3890
54df8bae1dSRodney W. Grimes  *
55df8bae1dSRodney W. Grimes  * any improvements or extensions that they make and grant Carnegie the
56df8bae1dSRodney W. Grimes  * rights to redistribute these changes.
57df8bae1dSRodney W. Grimes  */
58df8bae1dSRodney W. Grimes 
59874651b1SDavid E. O'Brien #include <sys/cdefs.h>
60874651b1SDavid E. O'Brien __FBSDID("$FreeBSD$");
61874651b1SDavid E. O'Brien 
62faa5f8d8SAndrzej Bialecki #include "opt_vm.h"
6315a7ad60SPeter Wemm #include "opt_kstack_pages.h"
6415a7ad60SPeter Wemm #include "opt_kstack_max_pages.h"
65e9822d92SJoerg Wunsch 
66df8bae1dSRodney W. Grimes #include <sys/param.h>
67df8bae1dSRodney W. Grimes #include <sys/systm.h>
68104a9b7eSAlexander Kabaev #include <sys/limits.h>
69fb919e4dSMark Murray #include <sys/lock.h>
70fb919e4dSMark Murray #include <sys/mutex.h>
71df8bae1dSRodney W. Grimes #include <sys/proc.h>
721ba5ad42SEdward Tomasz Napierala #include <sys/racct.h>
73df8bae1dSRodney W. Grimes #include <sys/resourcevar.h>
7489f6b863SAttilio Rao #include <sys/rwlock.h>
75da61b9a6SAlan Cox #include <sys/sched.h>
76da61b9a6SAlan Cox #include <sys/sf_buf.h>
773aa12267SBruce Evans #include <sys/shm.h>
78efeaf95aSDavid Greenman #include <sys/vmmeter.h>
791005a129SJohn Baldwin #include <sys/sx.h>
80ceb0cf87SJohn Dyson #include <sys/sysctl.h>
81e878d997SKonstantin Belousov #include <sys/_kstack_cache.h>
828a945d10SKonstantin Belousov #include <sys/eventhandler.h>
8326f9a767SRodney W. Grimes #include <sys/kernel.h>
840384fff8SJason Evans #include <sys/ktr.h>
85a2a1c95cSPeter Wemm #include <sys/unistd.h>
8626f9a767SRodney W. Grimes 
87df8bae1dSRodney W. Grimes #include <vm/vm.h>
88efeaf95aSDavid Greenman #include <vm/vm_param.h>
89efeaf95aSDavid Greenman #include <vm/pmap.h>
90efeaf95aSDavid Greenman #include <vm/vm_map.h>
91df8bae1dSRodney W. Grimes #include <vm/vm_page.h>
9226f9a767SRodney W. Grimes #include <vm/vm_pageout.h>
93a136efe9SPeter Wemm #include <vm/vm_object.h>
94df8bae1dSRodney W. Grimes #include <vm/vm_kern.h>
95efeaf95aSDavid Greenman #include <vm/vm_extern.h>
96a136efe9SPeter Wemm #include <vm/vm_pager.h>
9792da00bbSMatthew Dillon #include <vm/swap_pager.h>
98efeaf95aSDavid Greenman 
99e50f5c2eSBruce Evans #ifndef NO_SWAPPING
100b61ce5b0SJeff Roberson static int swapout(struct proc *);
101b61ce5b0SJeff Roberson static void swapclear(struct proc *);
102ac45ee97SAlan Cox static void vm_thread_swapin(struct thread *td);
103ac45ee97SAlan Cox static void vm_thread_swapout(struct thread *td);
104e50f5c2eSBruce Evans #endif
105f708ef1bSPoul-Henning Kamp 
10643a90f3aSAlan Cox /*
10743a90f3aSAlan Cox  * MPSAFE
1082d5c7e45SMatthew Dillon  *
1092d5c7e45SMatthew Dillon  * WARNING!  This code calls vm_map_check_protection() which only checks
1102d5c7e45SMatthew Dillon  * the associated vm_map_entry range.  It does not determine whether the
1112d5c7e45SMatthew Dillon  * contents of the memory is actually readable or writable.  In most cases
1122d5c7e45SMatthew Dillon  * just checking the vm_map_entry is sufficient within the kernel's address
1132d5c7e45SMatthew Dillon  * space.
11443a90f3aSAlan Cox  */
115df8bae1dSRodney W. Grimes int
116df8bae1dSRodney W. Grimes kernacc(addr, len, rw)
117c3dfdfd1SAlfred Perlstein 	void *addr;
118df8bae1dSRodney W. Grimes 	int len, rw;
119df8bae1dSRodney W. Grimes {
120df8bae1dSRodney W. Grimes 	boolean_t rv;
121df8bae1dSRodney W. Grimes 	vm_offset_t saddr, eaddr;
12202c58685SPoul-Henning Kamp 	vm_prot_t prot;
123df8bae1dSRodney W. Grimes 
124e50f5c2eSBruce Evans 	KASSERT((rw & ~VM_PROT_ALL) == 0,
12502c58685SPoul-Henning Kamp 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
12675337a56SAlan Cox 
12775337a56SAlan Cox 	if ((vm_offset_t)addr + len > kernel_map->max_offset ||
12875337a56SAlan Cox 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
12975337a56SAlan Cox 		return (FALSE);
13075337a56SAlan Cox 
13102c58685SPoul-Henning Kamp 	prot = rw;
1326cde7a16SDavid Greenman 	saddr = trunc_page((vm_offset_t)addr);
1336cde7a16SDavid Greenman 	eaddr = round_page((vm_offset_t)addr + len);
134d8834602SAlan Cox 	vm_map_lock_read(kernel_map);
135df8bae1dSRodney W. Grimes 	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
136d8834602SAlan Cox 	vm_map_unlock_read(kernel_map);
137df8bae1dSRodney W. Grimes 	return (rv == TRUE);
138df8bae1dSRodney W. Grimes }
139df8bae1dSRodney W. Grimes 
14043a90f3aSAlan Cox /*
14143a90f3aSAlan Cox  * MPSAFE
1422d5c7e45SMatthew Dillon  *
1432d5c7e45SMatthew Dillon  * WARNING!  This code calls vm_map_check_protection() which only checks
1442d5c7e45SMatthew Dillon  * the associated vm_map_entry range.  It does not determine whether the
1452d5c7e45SMatthew Dillon  * contents of the memory is actually readable or writable.  vmapbuf(),
1462d5c7e45SMatthew Dillon  * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
1472d5c7e45SMatthew Dillon  * used in conjuction with this call.
14843a90f3aSAlan Cox  */
149df8bae1dSRodney W. Grimes int
150df8bae1dSRodney W. Grimes useracc(addr, len, rw)
151c3dfdfd1SAlfred Perlstein 	void *addr;
152df8bae1dSRodney W. Grimes 	int len, rw;
153df8bae1dSRodney W. Grimes {
154df8bae1dSRodney W. Grimes 	boolean_t rv;
15502c58685SPoul-Henning Kamp 	vm_prot_t prot;
15605ba50f5SJake Burkholder 	vm_map_t map;
157df8bae1dSRodney W. Grimes 
158e50f5c2eSBruce Evans 	KASSERT((rw & ~VM_PROT_ALL) == 0,
15902c58685SPoul-Henning Kamp 	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
16002c58685SPoul-Henning Kamp 	prot = rw;
16105ba50f5SJake Burkholder 	map = &curproc->p_vmspace->vm_map;
16205ba50f5SJake Burkholder 	if ((vm_offset_t)addr + len > vm_map_max(map) ||
16305ba50f5SJake Burkholder 	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
16426f9a767SRodney W. Grimes 		return (FALSE);
16526f9a767SRodney W. Grimes 	}
166d8834602SAlan Cox 	vm_map_lock_read(map);
16705ba50f5SJake Burkholder 	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
16805ba50f5SJake Burkholder 	    round_page((vm_offset_t)addr + len), prot);
169d8834602SAlan Cox 	vm_map_unlock_read(map);
170df8bae1dSRodney W. Grimes 	return (rv == TRUE);
171df8bae1dSRodney W. Grimes }
172df8bae1dSRodney W. Grimes 
17316929939SDon Lewis int
174f0ea4612SDon Lewis vslock(void *addr, size_t len)
17516929939SDon Lewis {
176bb734798SDon Lewis 	vm_offset_t end, last, start;
177bb734798SDon Lewis 	vm_size_t npages;
178bb734798SDon Lewis 	int error;
17916929939SDon Lewis 
180bb734798SDon Lewis 	last = (vm_offset_t)addr + len;
181ce8660e3SDon Lewis 	start = trunc_page((vm_offset_t)addr);
182bb734798SDon Lewis 	end = round_page(last);
183bb734798SDon Lewis 	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
18416929939SDon Lewis 		return (EINVAL);
18516929939SDon Lewis 	npages = atop(end - start);
18616929939SDon Lewis 	if (npages > vm_page_max_wired)
18716929939SDon Lewis 		return (ENOMEM);
18816929939SDon Lewis #if 0
18916929939SDon Lewis 	/*
19016929939SDon Lewis 	 * XXX - not yet
19116929939SDon Lewis 	 *
19216929939SDon Lewis 	 * The limit for transient usage of wired pages should be
19316929939SDon Lewis 	 * larger than for "permanent" wired pages (mlock()).
19416929939SDon Lewis 	 *
19516929939SDon Lewis 	 * Also, the sysctl code, which is the only present user
19616929939SDon Lewis 	 * of vslock(), does a hard loop on EAGAIN.
19716929939SDon Lewis 	 */
1982feb50bfSAttilio Rao 	if (npages + cnt.v_wire_count > vm_page_max_wired)
19916929939SDon Lewis 		return (EAGAIN);
20016929939SDon Lewis #endif
201ce8660e3SDon Lewis 	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
202d9b2500eSBrian Feldman 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
203ce8660e3SDon Lewis 	/*
204ce8660e3SDon Lewis 	 * Return EFAULT on error to match copy{in,out}() behaviour
205ce8660e3SDon Lewis 	 * rather than returning ENOMEM like mlock() would.
206ce8660e3SDon Lewis 	 */
207ce8660e3SDon Lewis 	return (error == KERN_SUCCESS ? 0 : EFAULT);
20816929939SDon Lewis }
20916929939SDon Lewis 
210ce8660e3SDon Lewis void
211f0ea4612SDon Lewis vsunlock(void *addr, size_t len)
21216929939SDon Lewis {
21316929939SDon Lewis 
214ce8660e3SDon Lewis 	/* Rely on the parameter sanity checks performed by vslock(). */
215ce8660e3SDon Lewis 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
216ce8660e3SDon Lewis 	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
21716929939SDon Lewis 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
21816929939SDon Lewis }
21916929939SDon Lewis 
220da61b9a6SAlan Cox /*
221da61b9a6SAlan Cox  * Pin the page contained within the given object at the given offset.  If the
222da61b9a6SAlan Cox  * page is not resident, allocate and load it using the given object's pager.
223da61b9a6SAlan Cox  * Return the pinned page if successful; otherwise, return NULL.
224da61b9a6SAlan Cox  */
225da61b9a6SAlan Cox static vm_page_t
226*be996836SAttilio Rao vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
227da61b9a6SAlan Cox {
228da61b9a6SAlan Cox 	vm_page_t m, ma[1];
229da61b9a6SAlan Cox 	vm_pindex_t pindex;
230da61b9a6SAlan Cox 	int rv;
231da61b9a6SAlan Cox 
23289f6b863SAttilio Rao 	VM_OBJECT_WLOCK(object);
233da61b9a6SAlan Cox 	pindex = OFF_TO_IDX(offset);
234002f377aSAttilio Rao 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
235002f377aSAttilio Rao 	    VM_ALLOC_NOBUSY);
2360a2e596aSAlan Cox 	if (m->valid != VM_PAGE_BITS_ALL) {
237002f377aSAttilio Rao 		vm_page_busy(m);
238da61b9a6SAlan Cox 		ma[0] = m;
239da61b9a6SAlan Cox 		rv = vm_pager_get_pages(object, ma, 1, 0);
240da61b9a6SAlan Cox 		m = vm_page_lookup(object, pindex);
241da61b9a6SAlan Cox 		if (m == NULL)
242da61b9a6SAlan Cox 			goto out;
243d1a6e42dSAlan Cox 		if (rv != VM_PAGER_OK) {
2442965a453SKip Macy 			vm_page_lock(m);
245da61b9a6SAlan Cox 			vm_page_free(m);
2462965a453SKip Macy 			vm_page_unlock(m);
247da61b9a6SAlan Cox 			m = NULL;
248da61b9a6SAlan Cox 			goto out;
249da61b9a6SAlan Cox 		}
250002f377aSAttilio Rao 		vm_page_wakeup(m);
251da61b9a6SAlan Cox 	}
252*be996836SAttilio Rao 	vm_page_lock(m);
253*be996836SAttilio Rao 	vm_page_hold(m);
254*be996836SAttilio Rao 	vm_page_unlock(m);
255da61b9a6SAlan Cox out:
25689f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(object);
257da61b9a6SAlan Cox 	return (m);
258da61b9a6SAlan Cox }
259da61b9a6SAlan Cox 
260da61b9a6SAlan Cox /*
261da61b9a6SAlan Cox  * Return a CPU private mapping to the page at the given offset within the
262da61b9a6SAlan Cox  * given object.  The page is pinned before it is mapped.
263da61b9a6SAlan Cox  */
264da61b9a6SAlan Cox struct sf_buf *
265da61b9a6SAlan Cox vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
266da61b9a6SAlan Cox {
267da61b9a6SAlan Cox 	vm_page_t m;
268da61b9a6SAlan Cox 
269*be996836SAttilio Rao 	m = vm_imgact_hold_page(object, offset);
270da61b9a6SAlan Cox 	if (m == NULL)
271da61b9a6SAlan Cox 		return (NULL);
272da61b9a6SAlan Cox 	sched_pin();
273da61b9a6SAlan Cox 	return (sf_buf_alloc(m, SFB_CPUPRIVATE));
274da61b9a6SAlan Cox }
275da61b9a6SAlan Cox 
276da61b9a6SAlan Cox /*
277da61b9a6SAlan Cox  * Destroy the given CPU private mapping and unpin the page that it mapped.
278da61b9a6SAlan Cox  */
279da61b9a6SAlan Cox void
280*be996836SAttilio Rao vm_imgact_unmap_page(struct sf_buf *sf)
281da61b9a6SAlan Cox {
282da61b9a6SAlan Cox 	vm_page_t m;
283da61b9a6SAlan Cox 
284da61b9a6SAlan Cox 	m = sf_buf_page(sf);
285da61b9a6SAlan Cox 	sf_buf_free(sf);
286da61b9a6SAlan Cox 	sched_unpin();
287*be996836SAttilio Rao 	vm_page_lock(m);
288*be996836SAttilio Rao 	vm_page_unhold(m);
289*be996836SAttilio Rao 	vm_page_unlock(m);
290da61b9a6SAlan Cox }
291da61b9a6SAlan Cox 
2921a4fcaebSMarcel Moolenaar void
2931a4fcaebSMarcel Moolenaar vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
2941a4fcaebSMarcel Moolenaar {
2951a4fcaebSMarcel Moolenaar 
2961a4fcaebSMarcel Moolenaar 	pmap_sync_icache(map->pmap, va, sz);
2971a4fcaebSMarcel Moolenaar }
2981a4fcaebSMarcel Moolenaar 
299e878d997SKonstantin Belousov struct kstack_cache_entry *kstack_cache;
3008a945d10SKonstantin Belousov static int kstack_cache_size = 128;
3018a945d10SKonstantin Belousov static int kstacks;
3028a945d10SKonstantin Belousov static struct mtx kstack_cache_mtx;
30325c1e164SAndre Oppermann MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
30425c1e164SAndre Oppermann 
3058a945d10SKonstantin Belousov SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0,
3068a945d10SKonstantin Belousov     "");
3078a945d10SKonstantin Belousov SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
3088a945d10SKonstantin Belousov     "");
3098a945d10SKonstantin Belousov 
31049a2507bSAlan Cox #ifndef KSTACK_MAX_PAGES
31149a2507bSAlan Cox #define KSTACK_MAX_PAGES 32
31249a2507bSAlan Cox #endif
31349a2507bSAlan Cox 
31449a2507bSAlan Cox /*
31549a2507bSAlan Cox  * Create the kernel stack (including pcb for i386) for a new thread.
31649a2507bSAlan Cox  * This routine directly affects the fork perf for a process and
31749a2507bSAlan Cox  * create performance for a thread.
31849a2507bSAlan Cox  */
31989b57fcfSKonstantin Belousov int
32049a2507bSAlan Cox vm_thread_new(struct thread *td, int pages)
32149a2507bSAlan Cox {
32249a2507bSAlan Cox 	vm_object_t ksobj;
32349a2507bSAlan Cox 	vm_offset_t ks;
32449a2507bSAlan Cox 	vm_page_t m, ma[KSTACK_MAX_PAGES];
3258a945d10SKonstantin Belousov 	struct kstack_cache_entry *ks_ce;
32649a2507bSAlan Cox 	int i;
32749a2507bSAlan Cox 
32849a2507bSAlan Cox 	/* Bounds check */
32949a2507bSAlan Cox 	if (pages <= 1)
33049a2507bSAlan Cox 		pages = KSTACK_PAGES;
33149a2507bSAlan Cox 	else if (pages > KSTACK_MAX_PAGES)
33249a2507bSAlan Cox 		pages = KSTACK_MAX_PAGES;
3338a945d10SKonstantin Belousov 
3348a945d10SKonstantin Belousov 	if (pages == KSTACK_PAGES) {
3358a945d10SKonstantin Belousov 		mtx_lock(&kstack_cache_mtx);
3368a945d10SKonstantin Belousov 		if (kstack_cache != NULL) {
3378a945d10SKonstantin Belousov 			ks_ce = kstack_cache;
3388a945d10SKonstantin Belousov 			kstack_cache = ks_ce->next_ks_entry;
3398a945d10SKonstantin Belousov 			mtx_unlock(&kstack_cache_mtx);
3408a945d10SKonstantin Belousov 
3418a945d10SKonstantin Belousov 			td->td_kstack_obj = ks_ce->ksobj;
3428a945d10SKonstantin Belousov 			td->td_kstack = (vm_offset_t)ks_ce;
3438a945d10SKonstantin Belousov 			td->td_kstack_pages = KSTACK_PAGES;
3448a945d10SKonstantin Belousov 			return (1);
3458a945d10SKonstantin Belousov 		}
3468a945d10SKonstantin Belousov 		mtx_unlock(&kstack_cache_mtx);
3478a945d10SKonstantin Belousov 	}
3488a945d10SKonstantin Belousov 
34949a2507bSAlan Cox 	/*
35049a2507bSAlan Cox 	 * Allocate an object for the kstack.
35149a2507bSAlan Cox 	 */
35249a2507bSAlan Cox 	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
353374ae2a3SJeff Roberson 
35449a2507bSAlan Cox 	/*
35549a2507bSAlan Cox 	 * Get a kernel virtual address for this thread's kstack.
35649a2507bSAlan Cox 	 */
357ca596a25SJuli Mallett #if defined(__mips__)
358ca596a25SJuli Mallett 	/*
359ca596a25SJuli Mallett 	 * We need to align the kstack's mapped address to fit within
360ca596a25SJuli Mallett 	 * a single TLB entry.
361ca596a25SJuli Mallett 	 */
362ca596a25SJuli Mallett 	ks = kmem_alloc_nofault_space(kernel_map,
363ca596a25SJuli Mallett 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, VMFS_TLB_ALIGNED_SPACE);
364ca596a25SJuli Mallett #else
36549a2507bSAlan Cox 	ks = kmem_alloc_nofault(kernel_map,
36649a2507bSAlan Cox 	   (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
367ca596a25SJuli Mallett #endif
36889b57fcfSKonstantin Belousov 	if (ks == 0) {
36989b57fcfSKonstantin Belousov 		printf("vm_thread_new: kstack allocation failed\n");
37089b57fcfSKonstantin Belousov 		vm_object_deallocate(ksobj);
37189b57fcfSKonstantin Belousov 		return (0);
37289b57fcfSKonstantin Belousov 	}
37389b57fcfSKonstantin Belousov 
3748a945d10SKonstantin Belousov 	atomic_add_int(&kstacks, 1);
37549a2507bSAlan Cox 	if (KSTACK_GUARD_PAGES != 0) {
37649a2507bSAlan Cox 		pmap_qremove(ks, KSTACK_GUARD_PAGES);
37749a2507bSAlan Cox 		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
37849a2507bSAlan Cox 	}
37989b57fcfSKonstantin Belousov 	td->td_kstack_obj = ksobj;
38049a2507bSAlan Cox 	td->td_kstack = ks;
38149a2507bSAlan Cox 	/*
38249a2507bSAlan Cox 	 * Knowing the number of pages allocated is useful when you
38349a2507bSAlan Cox 	 * want to deallocate them.
38449a2507bSAlan Cox 	 */
38549a2507bSAlan Cox 	td->td_kstack_pages = pages;
38649a2507bSAlan Cox 	/*
38749a2507bSAlan Cox 	 * For the length of the stack, link in a real page of ram for each
38849a2507bSAlan Cox 	 * page of stack.
38949a2507bSAlan Cox 	 */
39089f6b863SAttilio Rao 	VM_OBJECT_WLOCK(ksobj);
39149a2507bSAlan Cox 	for (i = 0; i < pages; i++) {
39249a2507bSAlan Cox 		/*
39349a2507bSAlan Cox 		 * Get a kernel stack page.
39449a2507bSAlan Cox 		 */
395ddf4bb37SAlan Cox 		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
39649a2507bSAlan Cox 		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
39749a2507bSAlan Cox 		ma[i] = m;
39849a2507bSAlan Cox 		m->valid = VM_PAGE_BITS_ALL;
39949a2507bSAlan Cox 	}
40089f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(ksobj);
40149a2507bSAlan Cox 	pmap_qenter(ks, ma, pages);
40289b57fcfSKonstantin Belousov 	return (1);
40349a2507bSAlan Cox }
40449a2507bSAlan Cox 
4058a945d10SKonstantin Belousov static void
4068a945d10SKonstantin Belousov vm_thread_stack_dispose(vm_object_t ksobj, vm_offset_t ks, int pages)
40749a2507bSAlan Cox {
40849a2507bSAlan Cox 	vm_page_t m;
4098a945d10SKonstantin Belousov 	int i;
41049a2507bSAlan Cox 
4118a945d10SKonstantin Belousov 	atomic_add_int(&kstacks, -1);
41249a2507bSAlan Cox 	pmap_qremove(ks, pages);
41389f6b863SAttilio Rao 	VM_OBJECT_WLOCK(ksobj);
41449a2507bSAlan Cox 	for (i = 0; i < pages; i++) {
41549a2507bSAlan Cox 		m = vm_page_lookup(ksobj, i);
41649a2507bSAlan Cox 		if (m == NULL)
41749a2507bSAlan Cox 			panic("vm_thread_dispose: kstack already missing?");
4182965a453SKip Macy 		vm_page_lock(m);
41949a2507bSAlan Cox 		vm_page_unwire(m, 0);
42049a2507bSAlan Cox 		vm_page_free(m);
4212965a453SKip Macy 		vm_page_unlock(m);
42249a2507bSAlan Cox 	}
42389f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(ksobj);
42449a2507bSAlan Cox 	vm_object_deallocate(ksobj);
42549a2507bSAlan Cox 	kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
42649a2507bSAlan Cox 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
427c3cf0b47SKonstantin Belousov }
428c3cf0b47SKonstantin Belousov 
429c3cf0b47SKonstantin Belousov /*
4308a945d10SKonstantin Belousov  * Dispose of a thread's kernel stack.
4318a945d10SKonstantin Belousov  */
4328a945d10SKonstantin Belousov void
4338a945d10SKonstantin Belousov vm_thread_dispose(struct thread *td)
4348a945d10SKonstantin Belousov {
4358a945d10SKonstantin Belousov 	vm_object_t ksobj;
4368a945d10SKonstantin Belousov 	vm_offset_t ks;
4378a945d10SKonstantin Belousov 	struct kstack_cache_entry *ks_ce;
4388a945d10SKonstantin Belousov 	int pages;
4398a945d10SKonstantin Belousov 
4408a945d10SKonstantin Belousov 	pages = td->td_kstack_pages;
4418a945d10SKonstantin Belousov 	ksobj = td->td_kstack_obj;
4428a945d10SKonstantin Belousov 	ks = td->td_kstack;
4438a945d10SKonstantin Belousov 	td->td_kstack = 0;
4448a945d10SKonstantin Belousov 	td->td_kstack_pages = 0;
4458a945d10SKonstantin Belousov 	if (pages == KSTACK_PAGES && kstacks <= kstack_cache_size) {
4468a945d10SKonstantin Belousov 		ks_ce = (struct kstack_cache_entry *)ks;
4478a945d10SKonstantin Belousov 		ks_ce->ksobj = ksobj;
4488a945d10SKonstantin Belousov 		mtx_lock(&kstack_cache_mtx);
4498a945d10SKonstantin Belousov 		ks_ce->next_ks_entry = kstack_cache;
4508a945d10SKonstantin Belousov 		kstack_cache = ks_ce;
4518a945d10SKonstantin Belousov 		mtx_unlock(&kstack_cache_mtx);
4528a945d10SKonstantin Belousov 		return;
4538a945d10SKonstantin Belousov 	}
4548a945d10SKonstantin Belousov 	vm_thread_stack_dispose(ksobj, ks, pages);
4558a945d10SKonstantin Belousov }
4568a945d10SKonstantin Belousov 
4578a945d10SKonstantin Belousov static void
4588a945d10SKonstantin Belousov vm_thread_stack_lowmem(void *nulll)
4598a945d10SKonstantin Belousov {
4608a945d10SKonstantin Belousov 	struct kstack_cache_entry *ks_ce, *ks_ce1;
4618a945d10SKonstantin Belousov 
4628a945d10SKonstantin Belousov 	mtx_lock(&kstack_cache_mtx);
4638a945d10SKonstantin Belousov 	ks_ce = kstack_cache;
4648a945d10SKonstantin Belousov 	kstack_cache = NULL;
4658a945d10SKonstantin Belousov 	mtx_unlock(&kstack_cache_mtx);
4668a945d10SKonstantin Belousov 
4678a945d10SKonstantin Belousov 	while (ks_ce != NULL) {
4688a945d10SKonstantin Belousov 		ks_ce1 = ks_ce;
4698a945d10SKonstantin Belousov 		ks_ce = ks_ce->next_ks_entry;
4708a945d10SKonstantin Belousov 
4718a945d10SKonstantin Belousov 		vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1,
4728a945d10SKonstantin Belousov 		    KSTACK_PAGES);
4738a945d10SKonstantin Belousov 	}
4748a945d10SKonstantin Belousov }
4758a945d10SKonstantin Belousov 
4768a945d10SKonstantin Belousov static void
4778a945d10SKonstantin Belousov kstack_cache_init(void *nulll)
4788a945d10SKonstantin Belousov {
4798a945d10SKonstantin Belousov 
4808a945d10SKonstantin Belousov 	EVENTHANDLER_REGISTER(vm_lowmem, vm_thread_stack_lowmem, NULL,
4818a945d10SKonstantin Belousov 	    EVENTHANDLER_PRI_ANY);
4828a945d10SKonstantin Belousov }
4838a945d10SKonstantin Belousov 
4848a945d10SKonstantin Belousov SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL);
4858a945d10SKonstantin Belousov 
486ac45ee97SAlan Cox #ifndef NO_SWAPPING
4878a945d10SKonstantin Belousov /*
48849a2507bSAlan Cox  * Allow a thread's kernel stack to be paged out.
48949a2507bSAlan Cox  */
490ac45ee97SAlan Cox static void
49149a2507bSAlan Cox vm_thread_swapout(struct thread *td)
49249a2507bSAlan Cox {
49349a2507bSAlan Cox 	vm_object_t ksobj;
49449a2507bSAlan Cox 	vm_page_t m;
49549a2507bSAlan Cox 	int i, pages;
49649a2507bSAlan Cox 
497710338e9SMarcel Moolenaar 	cpu_thread_swapout(td);
49849a2507bSAlan Cox 	pages = td->td_kstack_pages;
49949a2507bSAlan Cox 	ksobj = td->td_kstack_obj;
50049a2507bSAlan Cox 	pmap_qremove(td->td_kstack, pages);
50189f6b863SAttilio Rao 	VM_OBJECT_WLOCK(ksobj);
50249a2507bSAlan Cox 	for (i = 0; i < pages; i++) {
50349a2507bSAlan Cox 		m = vm_page_lookup(ksobj, i);
50449a2507bSAlan Cox 		if (m == NULL)
50549a2507bSAlan Cox 			panic("vm_thread_swapout: kstack already missing?");
50649a2507bSAlan Cox 		vm_page_dirty(m);
5072965a453SKip Macy 		vm_page_lock(m);
50849a2507bSAlan Cox 		vm_page_unwire(m, 0);
5092965a453SKip Macy 		vm_page_unlock(m);
51049a2507bSAlan Cox 	}
51189f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(ksobj);
51249a2507bSAlan Cox }
51349a2507bSAlan Cox 
51449a2507bSAlan Cox /*
51549a2507bSAlan Cox  * Bring the kernel stack for a specified thread back in.
51649a2507bSAlan Cox  */
517ac45ee97SAlan Cox static void
51849a2507bSAlan Cox vm_thread_swapin(struct thread *td)
51949a2507bSAlan Cox {
52049a2507bSAlan Cox 	vm_object_t ksobj;
5216fb8c0c1SKonstantin Belousov 	vm_page_t ma[KSTACK_MAX_PAGES];
5226fb8c0c1SKonstantin Belousov 	int i, j, k, pages, rv;
52349a2507bSAlan Cox 
52449a2507bSAlan Cox 	pages = td->td_kstack_pages;
52549a2507bSAlan Cox 	ksobj = td->td_kstack_obj;
52689f6b863SAttilio Rao 	VM_OBJECT_WLOCK(ksobj);
5276fb8c0c1SKonstantin Belousov 	for (i = 0; i < pages; i++)
5286fb8c0c1SKonstantin Belousov 		ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
5296c68c971SAlan Cox 		    VM_ALLOC_WIRED);
5306fb8c0c1SKonstantin Belousov 	for (i = 0; i < pages; i++) {
5316fb8c0c1SKonstantin Belousov 		if (ma[i]->valid != VM_PAGE_BITS_ALL) {
5326fb8c0c1SKonstantin Belousov 			KASSERT(ma[i]->oflags & VPO_BUSY,
5336fb8c0c1SKonstantin Belousov 			    ("lost busy 1"));
5346fb8c0c1SKonstantin Belousov 			vm_object_pip_add(ksobj, 1);
5356fb8c0c1SKonstantin Belousov 			for (j = i + 1; j < pages; j++) {
5366fb8c0c1SKonstantin Belousov 				KASSERT(ma[j]->valid == VM_PAGE_BITS_ALL ||
5376fb8c0c1SKonstantin Belousov 				    (ma[j]->oflags & VPO_BUSY),
5386fb8c0c1SKonstantin Belousov 				    ("lost busy 2"));
5396fb8c0c1SKonstantin Belousov 				if (ma[j]->valid == VM_PAGE_BITS_ALL)
5406fb8c0c1SKonstantin Belousov 					break;
54149a2507bSAlan Cox 			}
5426fb8c0c1SKonstantin Belousov 			rv = vm_pager_get_pages(ksobj, ma + i, j - i, 0);
5436fb8c0c1SKonstantin Belousov 			if (rv != VM_PAGER_OK)
5446fb8c0c1SKonstantin Belousov 	panic("vm_thread_swapin: cannot get kstack for proc: %d",
5456fb8c0c1SKonstantin Belousov 				    td->td_proc->p_pid);
5466fb8c0c1SKonstantin Belousov 			vm_object_pip_wakeup(ksobj);
5476fb8c0c1SKonstantin Belousov 			for (k = i; k < j; k++)
5486fb8c0c1SKonstantin Belousov 				ma[k] = vm_page_lookup(ksobj, k);
5496fb8c0c1SKonstantin Belousov 			vm_page_wakeup(ma[i]);
5506fb8c0c1SKonstantin Belousov 		} else if (ma[i]->oflags & VPO_BUSY)
5516fb8c0c1SKonstantin Belousov 			vm_page_wakeup(ma[i]);
55249a2507bSAlan Cox 	}
55389f6b863SAttilio Rao 	VM_OBJECT_WUNLOCK(ksobj);
55449a2507bSAlan Cox 	pmap_qenter(td->td_kstack, ma, pages);
555710338e9SMarcel Moolenaar 	cpu_thread_swapin(td);
55649a2507bSAlan Cox }
557ac45ee97SAlan Cox #endif /* !NO_SWAPPING */
55849a2507bSAlan Cox 
559a136efe9SPeter Wemm /*
560df8bae1dSRodney W. Grimes  * Implement fork's actions on an address space.
561df8bae1dSRodney W. Grimes  * Here we arrange for the address space to be copied or referenced,
562df8bae1dSRodney W. Grimes  * allocate a user struct (pcb and kernel stack), then call the
563df8bae1dSRodney W. Grimes  * machine-dependent layer to fill those in and make the new process
564a2a1c95cSPeter Wemm  * ready to run.  The new process is set up so that it returns directly
565a2a1c95cSPeter Wemm  * to user mode to avoid stack copying and relocation problems.
566df8bae1dSRodney W. Grimes  */
56789b57fcfSKonstantin Belousov int
56889b57fcfSKonstantin Belousov vm_forkproc(td, p2, td2, vm2, flags)
569b40ce416SJulian Elischer 	struct thread *td;
570b40ce416SJulian Elischer 	struct proc *p2;
571079b7badSJulian Elischer 	struct thread *td2;
57289b57fcfSKonstantin Belousov 	struct vmspace *vm2;
573a2a1c95cSPeter Wemm 	int flags;
574df8bae1dSRodney W. Grimes {
575b40ce416SJulian Elischer 	struct proc *p1 = td->td_proc;
57689b57fcfSKonstantin Belousov 	int error;
577df8bae1dSRodney W. Grimes 
57891c28bfdSLuoqi Chen 	if ((flags & RFPROC) == 0) {
57991c28bfdSLuoqi Chen 		/*
58091c28bfdSLuoqi Chen 		 * Divorce the memory, if it is shared, essentially
58191c28bfdSLuoqi Chen 		 * this changes shared memory amongst threads, into
58291c28bfdSLuoqi Chen 		 * COW locally.
58391c28bfdSLuoqi Chen 		 */
58491c28bfdSLuoqi Chen 		if ((flags & RFMEM) == 0) {
58591c28bfdSLuoqi Chen 			if (p1->p_vmspace->vm_refcnt > 1) {
58689b57fcfSKonstantin Belousov 				error = vmspace_unshare(p1);
58789b57fcfSKonstantin Belousov 				if (error)
58889b57fcfSKonstantin Belousov 					return (error);
58991c28bfdSLuoqi Chen 			}
59091c28bfdSLuoqi Chen 		}
591079b7badSJulian Elischer 		cpu_fork(td, p2, td2, flags);
59289b57fcfSKonstantin Belousov 		return (0);
59391c28bfdSLuoqi Chen 	}
59491c28bfdSLuoqi Chen 
5955856e12eSJohn Dyson 	if (flags & RFMEM) {
5965856e12eSJohn Dyson 		p2->p_vmspace = p1->p_vmspace;
5971a276a3fSAlan Cox 		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
5985856e12eSJohn Dyson 	}
5995856e12eSJohn Dyson 
60090ecac61SMatthew Dillon 	while (vm_page_count_severe()) {
60126f9a767SRodney W. Grimes 		VM_WAIT;
6020d94caffSDavid Greenman 	}
60326f9a767SRodney W. Grimes 
6045856e12eSJohn Dyson 	if ((flags & RFMEM) == 0) {
60589b57fcfSKonstantin Belousov 		p2->p_vmspace = vm2;
606df8bae1dSRodney W. Grimes 		if (p1->p_vmspace->vm_shm)
607dabee6feSPeter Wemm 			shmfork(p1, p2);
608a2a1c95cSPeter Wemm 	}
609df8bae1dSRodney W. Grimes 
61039fb8e6bSJulian Elischer 	/*
611a2a1c95cSPeter Wemm 	 * cpu_fork will copy and update the pcb, set up the kernel stack,
612a2a1c95cSPeter Wemm 	 * and make the child ready to run.
613df8bae1dSRodney W. Grimes 	 */
614079b7badSJulian Elischer 	cpu_fork(td, p2, td2, flags);
61589b57fcfSKonstantin Belousov 	return (0);
616df8bae1dSRodney W. Grimes }
617df8bae1dSRodney W. Grimes 
618df8bae1dSRodney W. Grimes /*
619eb30c1c0SPeter Wemm  * Called after process has been wait(2)'ed apon and is being reaped.
620eb30c1c0SPeter Wemm  * The idea is to reclaim resources that we could not reclaim while
621eb30c1c0SPeter Wemm  * the process was still executing.
622eb30c1c0SPeter Wemm  */
623eb30c1c0SPeter Wemm void
624eb30c1c0SPeter Wemm vm_waitproc(p)
625eb30c1c0SPeter Wemm 	struct proc *p;
626eb30c1c0SPeter Wemm {
627eb30c1c0SPeter Wemm 
628582ec34cSAlfred Perlstein 	vmspace_exitfree(p);		/* and clean-out the vmspace */
629eb30c1c0SPeter Wemm }
630eb30c1c0SPeter Wemm 
63126f9a767SRodney W. Grimes void
63226f9a767SRodney W. Grimes faultin(p)
63326f9a767SRodney W. Grimes 	struct proc *p;
63426f9a767SRodney W. Grimes {
63511edc1e0SJohn Baldwin #ifdef NO_SWAPPING
63611edc1e0SJohn Baldwin 
63711edc1e0SJohn Baldwin 	PROC_LOCK_ASSERT(p, MA_OWNED);
638b61ce5b0SJeff Roberson 	if ((p->p_flag & P_INMEM) == 0)
63911edc1e0SJohn Baldwin 		panic("faultin: proc swapped out with NO_SWAPPING!");
64011edc1e0SJohn Baldwin #else /* !NO_SWAPPING */
641664f718bSJohn Baldwin 	struct thread *td;
64226f9a767SRodney W. Grimes 
643c96d52a9SJohn Baldwin 	PROC_LOCK_ASSERT(p, MA_OWNED);
6441d7b9ed2SJulian Elischer 	/*
6451d7b9ed2SJulian Elischer 	 * If another process is swapping in this process,
6461d7b9ed2SJulian Elischer 	 * just wait until it finishes.
6471d7b9ed2SJulian Elischer 	 */
648b61ce5b0SJeff Roberson 	if (p->p_flag & P_SWAPPINGIN) {
649b61ce5b0SJeff Roberson 		while (p->p_flag & P_SWAPPINGIN)
650b61ce5b0SJeff Roberson 			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
651b61ce5b0SJeff Roberson 		return;
652b61ce5b0SJeff Roberson 	}
653b61ce5b0SJeff Roberson 	if ((p->p_flag & P_INMEM) == 0) {
654664f718bSJohn Baldwin 		/*
655664f718bSJohn Baldwin 		 * Don't let another thread swap process p out while we are
656664f718bSJohn Baldwin 		 * busy swapping it in.
657664f718bSJohn Baldwin 		 */
658664f718bSJohn Baldwin 		++p->p_lock;
659b61ce5b0SJeff Roberson 		p->p_flag |= P_SWAPPINGIN;
66045ece682SJohn Baldwin 		PROC_UNLOCK(p);
66126f9a767SRodney W. Grimes 
662b61ce5b0SJeff Roberson 		/*
663b61ce5b0SJeff Roberson 		 * We hold no lock here because the list of threads
664b61ce5b0SJeff Roberson 		 * can not change while all threads in the process are
665b61ce5b0SJeff Roberson 		 * swapped out.
666b61ce5b0SJeff Roberson 		 */
667664f718bSJohn Baldwin 		FOREACH_THREAD_IN_PROC(p, td)
66849a2507bSAlan Cox 			vm_thread_swapin(td);
66945ece682SJohn Baldwin 		PROC_LOCK(p);
670b61ce5b0SJeff Roberson 		swapclear(p);
671258853abSJeff Roberson 		p->p_swtick = ticks;
67226f9a767SRodney W. Grimes 
673b61ce5b0SJeff Roberson 		wakeup(&p->p_flag);
67426f9a767SRodney W. Grimes 
675664f718bSJohn Baldwin 		/* Allow other threads to swap p out now. */
67626f9a767SRodney W. Grimes 		--p->p_lock;
67726f9a767SRodney W. Grimes 	}
67811edc1e0SJohn Baldwin #endif /* NO_SWAPPING */
67926f9a767SRodney W. Grimes }
68026f9a767SRodney W. Grimes 
681df8bae1dSRodney W. Grimes /*
68226f9a767SRodney W. Grimes  * This swapin algorithm attempts to swap-in processes only if there
68326f9a767SRodney W. Grimes  * is enough space for them.  Of course, if a process waits for a long
68426f9a767SRodney W. Grimes  * time, it will be swapped in anyway.
6850384fff8SJason Evans  *
68610c447faSAlan Cox  * Giant is held on entry.
687df8bae1dSRodney W. Grimes  */
688785797c3SAndriy Gapon void
689785797c3SAndriy Gapon swapper(void)
690df8bae1dSRodney W. Grimes {
69154d92145SMatthew Dillon 	struct proc *p;
692e602ba25SJulian Elischer 	struct thread *td;
693df8bae1dSRodney W. Grimes 	struct proc *pp;
694258853abSJeff Roberson 	int slptime;
695258853abSJeff Roberson 	int swtime;
696df8bae1dSRodney W. Grimes 	int ppri;
697258853abSJeff Roberson 	int pri;
698df8bae1dSRodney W. Grimes 
699df8bae1dSRodney W. Grimes loop:
70090ecac61SMatthew Dillon 	if (vm_page_count_min()) {
7010d94caffSDavid Greenman 		VM_WAIT;
70290ecac61SMatthew Dillon 		goto loop;
7030d94caffSDavid Greenman 	}
70426f9a767SRodney W. Grimes 
705df8bae1dSRodney W. Grimes 	pp = NULL;
706df8bae1dSRodney W. Grimes 	ppri = INT_MIN;
7071005a129SJohn Baldwin 	sx_slock(&allproc_lock);
708b40ce416SJulian Elischer 	FOREACH_PROC_IN_SYSTEM(p) {
709b61ce5b0SJeff Roberson 		PROC_LOCK(p);
710e806d352SJohn Baldwin 		if (p->p_state == PRS_NEW ||
711e806d352SJohn Baldwin 		    p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
712b61ce5b0SJeff Roberson 			PROC_UNLOCK(p);
713e602ba25SJulian Elischer 			continue;
714e602ba25SJulian Elischer 		}
715258853abSJeff Roberson 		swtime = (ticks - p->p_swtick) / hz;
716e602ba25SJulian Elischer 		FOREACH_THREAD_IN_PROC(p, td) {
7171d7b9ed2SJulian Elischer 			/*
71871fad9fdSJulian Elischer 			 * An otherwise runnable thread of a process
71971fad9fdSJulian Elischer 			 * swapped out has only the TDI_SWAPPED bit set.
72071fad9fdSJulian Elischer 			 *
7211d7b9ed2SJulian Elischer 			 */
722982d11f8SJeff Roberson 			thread_lock(td);
72371fad9fdSJulian Elischer 			if (td->td_inhibitors == TDI_SWAPPED) {
724258853abSJeff Roberson 				slptime = (ticks - td->td_slptick) / hz;
725258853abSJeff Roberson 				pri = swtime + slptime;
726b61ce5b0SJeff Roberson 				if ((td->td_flags & TDF_SWAPINREQ) == 0)
727fa885116SJulian Elischer 					pri -= p->p_nice * 8;
72826f9a767SRodney W. Grimes 				/*
729ad1e7d28SJulian Elischer 				 * if this thread is higher priority
730b40ce416SJulian Elischer 				 * and there is enough space, then select
731b40ce416SJulian Elischer 				 * this process instead of the previous
732b40ce416SJulian Elischer 				 * selection.
73326f9a767SRodney W. Grimes 				 */
7340d94caffSDavid Greenman 				if (pri > ppri) {
735df8bae1dSRodney W. Grimes 					pp = p;
736df8bae1dSRodney W. Grimes 					ppri = pri;
737df8bae1dSRodney W. Grimes 				}
738df8bae1dSRodney W. Grimes 			}
739982d11f8SJeff Roberson 			thread_unlock(td);
740b40ce416SJulian Elischer 		}
741b61ce5b0SJeff Roberson 		PROC_UNLOCK(p);
742df8bae1dSRodney W. Grimes 	}
7431005a129SJohn Baldwin 	sx_sunlock(&allproc_lock);
74426f9a767SRodney W. Grimes 
745df8bae1dSRodney W. Grimes 	/*
746a669a6e9SJohn Dyson 	 * Nothing to do, back to sleep.
747df8bae1dSRodney W. Grimes 	 */
748df8bae1dSRodney W. Grimes 	if ((p = pp) == NULL) {
749785797c3SAndriy Gapon 		tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
750df8bae1dSRodney W. Grimes 		goto loop;
751df8bae1dSRodney W. Grimes 	}
7521d7b9ed2SJulian Elischer 	PROC_LOCK(p);
7531d7b9ed2SJulian Elischer 
7541d7b9ed2SJulian Elischer 	/*
7551d7b9ed2SJulian Elischer 	 * Another process may be bringing or may have already
7561d7b9ed2SJulian Elischer 	 * brought this process in while we traverse all threads.
7571d7b9ed2SJulian Elischer 	 * Or, this process may even be being swapped out again.
7581d7b9ed2SJulian Elischer 	 */
759b61ce5b0SJeff Roberson 	if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
7601d7b9ed2SJulian Elischer 		PROC_UNLOCK(p);
7611d7b9ed2SJulian Elischer 		goto loop;
7621d7b9ed2SJulian Elischer 	}
7631d7b9ed2SJulian Elischer 
764df8bae1dSRodney W. Grimes 	/*
76526f9a767SRodney W. Grimes 	 * We would like to bring someone in. (only if there is space).
766e602ba25SJulian Elischer 	 * [What checks the space? ]
767df8bae1dSRodney W. Grimes 	 */
76826f9a767SRodney W. Grimes 	faultin(p);
76945ece682SJohn Baldwin 	PROC_UNLOCK(p);
770df8bae1dSRodney W. Grimes 	goto loop;
771df8bae1dSRodney W. Grimes }
772df8bae1dSRodney W. Grimes 
773da7bbd2cSJohn Baldwin void
774da7bbd2cSJohn Baldwin kick_proc0(void)
775d13ec713SStephan Uphoff {
776d13ec713SStephan Uphoff 
777da7bbd2cSJohn Baldwin 	wakeup(&proc0);
778d13ec713SStephan Uphoff }
779d13ec713SStephan Uphoff 
7805afce282SDavid Greenman #ifndef NO_SWAPPING
7815afce282SDavid Greenman 
782ceb0cf87SJohn Dyson /*
783ceb0cf87SJohn Dyson  * Swap_idle_threshold1 is the guaranteed swapped in time for a process
784ceb0cf87SJohn Dyson  */
785303b270bSEivind Eklund static int swap_idle_threshold1 = 2;
7862a3eeaa2STom Rhodes SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
7879faaf3b3STom Rhodes     &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
788ceb0cf87SJohn Dyson 
789ceb0cf87SJohn Dyson /*
790ceb0cf87SJohn Dyson  * Swap_idle_threshold2 is the time that a process can be idle before
791ceb0cf87SJohn Dyson  * it will be swapped out, if idle swapping is enabled.
792ceb0cf87SJohn Dyson  */
793303b270bSEivind Eklund static int swap_idle_threshold2 = 10;
7942a3eeaa2STom Rhodes SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
7959faaf3b3STom Rhodes     &swap_idle_threshold2, 0, "Time before a process will be swapped out");
796ceb0cf87SJohn Dyson 
797df8bae1dSRodney W. Grimes /*
79850a57dfbSKonstantin Belousov  * First, if any processes have been sleeping or stopped for at least
79950a57dfbSKonstantin Belousov  * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
80050a57dfbSKonstantin Belousov  * no such processes exist, then the longest-sleeping or stopped
80150a57dfbSKonstantin Belousov  * process is swapped out.  Finally, and only as a last resort, if
80250a57dfbSKonstantin Belousov  * there are no sleeping or stopped processes, the longest-resident
80350a57dfbSKonstantin Belousov  * process is swapped out.
804df8bae1dSRodney W. Grimes  */
805df8bae1dSRodney W. Grimes void
8063a2dc656SJohn Dyson swapout_procs(action)
8073a2dc656SJohn Dyson int action;
808df8bae1dSRodney W. Grimes {
80954d92145SMatthew Dillon 	struct proc *p;
810e602ba25SJulian Elischer 	struct thread *td;
811df8bae1dSRodney W. Grimes 	int didswap = 0;
812df8bae1dSRodney W. Grimes 
8130d94caffSDavid Greenman retry:
8143a2189d4SJohn Baldwin 	sx_slock(&allproc_lock);
815e602ba25SJulian Elischer 	FOREACH_PROC_IN_SYSTEM(p) {
816b18bfc3dSJohn Dyson 		struct vmspace *vm;
817b40ce416SJulian Elischer 		int minslptime = 100000;
818258853abSJeff Roberson 		int slptime;
819b18bfc3dSJohn Dyson 
8209eb881f8SSeigo Tanimura 		/*
821b1f99ebeSSeigo Tanimura 		 * Watch out for a process in
822b1f99ebeSSeigo Tanimura 		 * creation.  It may have no
8231c865ac7SJohn Baldwin 		 * address space or lock yet.
8241c865ac7SJohn Baldwin 		 */
825b61ce5b0SJeff Roberson 		if (p->p_state == PRS_NEW)
8261c865ac7SJohn Baldwin 			continue;
8271c865ac7SJohn Baldwin 		/*
828b1f99ebeSSeigo Tanimura 		 * An aio daemon switches its
829b1f99ebeSSeigo Tanimura 		 * address space while running.
830b1f99ebeSSeigo Tanimura 		 * Perform a quick check whether
831b1f99ebeSSeigo Tanimura 		 * a process has P_SYSTEM.
8329eb881f8SSeigo Tanimura 		 */
8338f887403SJohn Baldwin 		if ((p->p_flag & P_SYSTEM) != 0)
834b1f99ebeSSeigo Tanimura 			continue;
8351c865ac7SJohn Baldwin 		/*
8361c865ac7SJohn Baldwin 		 * Do not swapout a process that
8371c865ac7SJohn Baldwin 		 * is waiting for VM data
8381c865ac7SJohn Baldwin 		 * structures as there is a possible
8391c865ac7SJohn Baldwin 		 * deadlock.  Test this first as
8401c865ac7SJohn Baldwin 		 * this may block.
8411c865ac7SJohn Baldwin 		 *
8421c865ac7SJohn Baldwin 		 * Lock the map until swapout
8431c865ac7SJohn Baldwin 		 * finishes, or a thread of this
8441c865ac7SJohn Baldwin 		 * process may attempt to alter
8451c865ac7SJohn Baldwin 		 * the map.
8461c865ac7SJohn Baldwin 		 */
84757051fdcSTor Egge 		vm = vmspace_acquire_ref(p);
84857051fdcSTor Egge 		if (vm == NULL)
84957051fdcSTor Egge 			continue;
8509eb881f8SSeigo Tanimura 		if (!vm_map_trylock(&vm->vm_map))
8519eb881f8SSeigo Tanimura 			goto nextproc1;
8529eb881f8SSeigo Tanimura 
8535074aecdSJohn Baldwin 		PROC_LOCK(p);
85469b40456SJohn Baldwin 		if (p->p_lock != 0 ||
8551279572aSDavid Xu 		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
8561279572aSDavid Xu 		    ) != 0) {
857374ae2a3SJeff Roberson 			goto nextproc;
8585074aecdSJohn Baldwin 		}
85923955314SAlfred Perlstein 		/*
86023955314SAlfred Perlstein 		 * only aiod changes vmspace, however it will be
86123955314SAlfred Perlstein 		 * skipped because of the if statement above checking
86223955314SAlfred Perlstein 		 * for P_SYSTEM
86323955314SAlfred Perlstein 		 */
864b61ce5b0SJeff Roberson 		if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
865374ae2a3SJeff Roberson 			goto nextproc;
86669b40456SJohn Baldwin 
867e602ba25SJulian Elischer 		switch (p->p_state) {
8680d94caffSDavid Greenman 		default:
869e602ba25SJulian Elischer 			/* Don't swap out processes in any sort
870e602ba25SJulian Elischer 			 * of 'special' state. */
8718f887403SJohn Baldwin 			break;
872df8bae1dSRodney W. Grimes 
873e602ba25SJulian Elischer 		case PRS_NORMAL:
87426f9a767SRodney W. Grimes 			/*
875bfbfac11SDavid Greenman 			 * do not swapout a realtime process
876b40ce416SJulian Elischer 			 * Check all the thread groups..
877bfbfac11SDavid Greenman 			 */
8788460a577SJohn Birrell 			FOREACH_THREAD_IN_PROC(p, td) {
879b61ce5b0SJeff Roberson 				thread_lock(td);
880b61ce5b0SJeff Roberson 				if (PRI_IS_REALTIME(td->td_pri_class)) {
881b61ce5b0SJeff Roberson 					thread_unlock(td);
882b40ce416SJulian Elischer 					goto nextproc;
883b61ce5b0SJeff Roberson 				}
884258853abSJeff Roberson 				slptime = (ticks - td->td_slptick) / hz;
885bfbfac11SDavid Greenman 				/*
8869eb881f8SSeigo Tanimura 				 * Guarantee swap_idle_threshold1
887ceb0cf87SJohn Dyson 				 * time in memory.
8880d94caffSDavid Greenman 				 */
889258853abSJeff Roberson 				if (slptime < swap_idle_threshold1) {
890b61ce5b0SJeff Roberson 					thread_unlock(td);
891b40ce416SJulian Elischer 					goto nextproc;
892b61ce5b0SJeff Roberson 				}
8939eb881f8SSeigo Tanimura 
8941d7b9ed2SJulian Elischer 				/*
8959eb881f8SSeigo Tanimura 				 * Do not swapout a process if it is
8969eb881f8SSeigo Tanimura 				 * waiting on a critical event of some
8979eb881f8SSeigo Tanimura 				 * kind or there is a thread whose
8989eb881f8SSeigo Tanimura 				 * pageable memory may be accessed.
8991d7b9ed2SJulian Elischer 				 *
9001d7b9ed2SJulian Elischer 				 * This could be refined to support
9011d7b9ed2SJulian Elischer 				 * swapping out a thread.
9021d7b9ed2SJulian Elischer 				 */
903c5aa6b58SJeff Roberson 				if (!thread_safetoswapout(td)) {
904b61ce5b0SJeff Roberson 					thread_unlock(td);
905e602ba25SJulian Elischer 					goto nextproc;
906b61ce5b0SJeff Roberson 				}
907ceb0cf87SJohn Dyson 				/*
908b40ce416SJulian Elischer 				 * If the system is under memory stress,
909b40ce416SJulian Elischer 				 * or if we are swapping
910b40ce416SJulian Elischer 				 * idle processes >= swap_idle_threshold2,
911b40ce416SJulian Elischer 				 * then swap the process out.
912ceb0cf87SJohn Dyson 				 */
913ceb0cf87SJohn Dyson 				if (((action & VM_SWAP_NORMAL) == 0) &&
914ceb0cf87SJohn Dyson 				    (((action & VM_SWAP_IDLE) == 0) ||
915258853abSJeff Roberson 				    (slptime < swap_idle_threshold2))) {
916b61ce5b0SJeff Roberson 					thread_unlock(td);
917b40ce416SJulian Elischer 					goto nextproc;
918b61ce5b0SJeff Roberson 				}
9199eb881f8SSeigo Tanimura 
920258853abSJeff Roberson 				if (minslptime > slptime)
921258853abSJeff Roberson 					minslptime = slptime;
922b61ce5b0SJeff Roberson 				thread_unlock(td);
923b40ce416SJulian Elischer 			}
9240d94caffSDavid Greenman 
92511b224dcSDavid Greenman 			/*
92617d9d0d0SDavid Schultz 			 * If the pageout daemon didn't free enough pages,
92717d9d0d0SDavid Schultz 			 * or if this process is idle and the system is
92817d9d0d0SDavid Schultz 			 * configured to swap proactively, swap it out.
92911b224dcSDavid Greenman 			 */
930ceb0cf87SJohn Dyson 			if ((action & VM_SWAP_NORMAL) ||
931ceb0cf87SJohn Dyson 				((action & VM_SWAP_IDLE) &&
932b40ce416SJulian Elischer 				 (minslptime > swap_idle_threshold2))) {
933b61ce5b0SJeff Roberson 				if (swapout(p) == 0)
934df8bae1dSRodney W. Grimes 					didswap++;
935664f718bSJohn Baldwin 				PROC_UNLOCK(p);
9369eb881f8SSeigo Tanimura 				vm_map_unlock(&vm->vm_map);
9379eb881f8SSeigo Tanimura 				vmspace_free(vm);
9389eb881f8SSeigo Tanimura 				sx_sunlock(&allproc_lock);
9390d94caffSDavid Greenman 				goto retry;
940c96d52a9SJohn Baldwin 			}
9418f887403SJohn Baldwin 		}
942374ae2a3SJeff Roberson nextproc:
9439eb881f8SSeigo Tanimura 		PROC_UNLOCK(p);
9449eb881f8SSeigo Tanimura 		vm_map_unlock(&vm->vm_map);
9459eb881f8SSeigo Tanimura nextproc1:
9469eb881f8SSeigo Tanimura 		vmspace_free(vm);
94730171114SPeter Wemm 		continue;
948ceb0cf87SJohn Dyson 	}
9491005a129SJohn Baldwin 	sx_sunlock(&allproc_lock);
95026f9a767SRodney W. Grimes 	/*
95126f9a767SRodney W. Grimes 	 * If we swapped something out, and another process needed memory,
95226f9a767SRodney W. Grimes 	 * then wakeup the sched process.
95326f9a767SRodney W. Grimes 	 */
9540d94caffSDavid Greenman 	if (didswap)
95524a1cce3SDavid Greenman 		wakeup(&proc0);
956df8bae1dSRodney W. Grimes }
957df8bae1dSRodney W. Grimes 
958f708ef1bSPoul-Henning Kamp static void
959b61ce5b0SJeff Roberson swapclear(p)
960b61ce5b0SJeff Roberson 	struct proc *p;
961b61ce5b0SJeff Roberson {
962b61ce5b0SJeff Roberson 	struct thread *td;
963b61ce5b0SJeff Roberson 
964b61ce5b0SJeff Roberson 	PROC_LOCK_ASSERT(p, MA_OWNED);
965b61ce5b0SJeff Roberson 
966b61ce5b0SJeff Roberson 	FOREACH_THREAD_IN_PROC(p, td) {
967b61ce5b0SJeff Roberson 		thread_lock(td);
968b61ce5b0SJeff Roberson 		td->td_flags |= TDF_INMEM;
969b61ce5b0SJeff Roberson 		td->td_flags &= ~TDF_SWAPINREQ;
970b61ce5b0SJeff Roberson 		TD_CLR_SWAPPED(td);
971b61ce5b0SJeff Roberson 		if (TD_CAN_RUN(td))
972da7bbd2cSJohn Baldwin 			if (setrunnable(td)) {
973da7bbd2cSJohn Baldwin #ifdef INVARIANTS
974da7bbd2cSJohn Baldwin 				/*
975da7bbd2cSJohn Baldwin 				 * XXX: We just cleared TDI_SWAPPED
976da7bbd2cSJohn Baldwin 				 * above and set TDF_INMEM, so this
977da7bbd2cSJohn Baldwin 				 * should never happen.
978da7bbd2cSJohn Baldwin 				 */
979da7bbd2cSJohn Baldwin 				panic("not waking up swapper");
980da7bbd2cSJohn Baldwin #endif
981da7bbd2cSJohn Baldwin 			}
982b61ce5b0SJeff Roberson 		thread_unlock(td);
983b61ce5b0SJeff Roberson 	}
984b61ce5b0SJeff Roberson 	p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
985b61ce5b0SJeff Roberson 	p->p_flag |= P_INMEM;
986b61ce5b0SJeff Roberson }
987b61ce5b0SJeff Roberson 
988b61ce5b0SJeff Roberson static int
989df8bae1dSRodney W. Grimes swapout(p)
99054d92145SMatthew Dillon 	struct proc *p;
991df8bae1dSRodney W. Grimes {
992b40ce416SJulian Elischer 	struct thread *td;
993df8bae1dSRodney W. Grimes 
994ea754954SJohn Baldwin 	PROC_LOCK_ASSERT(p, MA_OWNED);
995d3a34985SJohn Dyson #if defined(SWAP_DEBUG)
996d3a34985SJohn Dyson 	printf("swapping out %d\n", p->p_pid);
997d3a34985SJohn Dyson #endif
9981d7b9ed2SJulian Elischer 
9991d7b9ed2SJulian Elischer 	/*
10009eb881f8SSeigo Tanimura 	 * The states of this process and its threads may have changed
10019eb881f8SSeigo Tanimura 	 * by now.  Assuming that there is only one pageout daemon thread,
10029eb881f8SSeigo Tanimura 	 * this process should still be in memory.
10039eb881f8SSeigo Tanimura 	 */
1004b61ce5b0SJeff Roberson 	KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
10059eb881f8SSeigo Tanimura 		("swapout: lost a swapout race?"));
10069eb881f8SSeigo Tanimura 
1007df8bae1dSRodney W. Grimes 	/*
100826f9a767SRodney W. Grimes 	 * remember the process resident count
1009df8bae1dSRodney W. Grimes 	 */
1010b1028ad1SLuoqi Chen 	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
1011b61ce5b0SJeff Roberson 	/*
1012b61ce5b0SJeff Roberson 	 * Check and mark all threads before we proceed.
1013b61ce5b0SJeff Roberson 	 */
1014b61ce5b0SJeff Roberson 	p->p_flag &= ~P_INMEM;
1015b61ce5b0SJeff Roberson 	p->p_flag |= P_SWAPPINGOUT;
1016982d11f8SJeff Roberson 	FOREACH_THREAD_IN_PROC(p, td) {
1017982d11f8SJeff Roberson 		thread_lock(td);
1018b61ce5b0SJeff Roberson 		if (!thread_safetoswapout(td)) {
1019b61ce5b0SJeff Roberson 			thread_unlock(td);
1020b61ce5b0SJeff Roberson 			swapclear(p);
1021b61ce5b0SJeff Roberson 			return (EBUSY);
1022b61ce5b0SJeff Roberson 		}
1023b61ce5b0SJeff Roberson 		td->td_flags &= ~TDF_INMEM;
1024664f718bSJohn Baldwin 		TD_SET_SWAPPED(td);
1025982d11f8SJeff Roberson 		thread_unlock(td);
1026982d11f8SJeff Roberson 	}
1027b61ce5b0SJeff Roberson 	td = FIRST_THREAD_IN_PROC(p);
1028b61ce5b0SJeff Roberson 	++td->td_ru.ru_nswap;
1029b61ce5b0SJeff Roberson 	PROC_UNLOCK(p);
103026f9a767SRodney W. Grimes 
1031b61ce5b0SJeff Roberson 	/*
1032b61ce5b0SJeff Roberson 	 * This list is stable because all threads are now prevented from
1033b61ce5b0SJeff Roberson 	 * running.  The list is only modified in the context of a running
1034b61ce5b0SJeff Roberson 	 * thread in this process.
1035b61ce5b0SJeff Roberson 	 */
1036664f718bSJohn Baldwin 	FOREACH_THREAD_IN_PROC(p, td)
103749a2507bSAlan Cox 		vm_thread_swapout(td);
1038664f718bSJohn Baldwin 
1039664f718bSJohn Baldwin 	PROC_LOCK(p);
1040b61ce5b0SJeff Roberson 	p->p_flag &= ~P_SWAPPINGOUT;
1041258853abSJeff Roberson 	p->p_swtick = ticks;
1042b61ce5b0SJeff Roberson 	return (0);
1043df8bae1dSRodney W. Grimes }
10445afce282SDavid Greenman #endif /* !NO_SWAPPING */
1045