xref: /freebsd/sys/vm/vm_mmap.c (revision 68e7a217f8019b955f87547f218e95ab237597af)
1 /*
2  * Copyright (c) 1988 University of Utah.
3  * Copyright (c) 1991, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the Systems Programming Group of the University of Utah Computer
8  * Science Department.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
39  *
40  *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
41  * $FreeBSD$
42  */
43 
44 /*
45  * Mapped file (mmap) interface to VM
46  */
47 
48 #include "opt_compat.h"
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/kernel.h>
53 #include <sys/lock.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/filedesc.h>
57 #include <sys/proc.h>
58 #include <sys/vnode.h>
59 #include <sys/fcntl.h>
60 #include <sys/file.h>
61 #include <sys/mman.h>
62 #include <sys/conf.h>
63 #include <sys/stat.h>
64 #include <sys/vmmeter.h>
65 #include <sys/sysctl.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_param.h>
69 #include <vm/pmap.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 #include <vm/vm_page.h>
73 #include <vm/vm_pager.h>
74 #include <vm/vm_pageout.h>
75 #include <vm/vm_extern.h>
77 #include <vm/vm_kern.h>
78 
79 #ifndef _SYS_SYSPROTO_H_
80 struct sbrk_args {
81 	int incr;
82 };
83 #endif
84 
85 static int max_proc_mmap;
86 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
87 
88 /*
89  * Set the maximum number of vm_map_entry structures per process.  Roughly
90  * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
91  * of our KVM malloc space still results in generous limits.  We want a
92  * default that is good enough to prevent the kernel from running out of
93  * resources if attacked from a compromised user account but generous
94  * enough that multi-threaded processes are not unduly inconvenienced.
95  */
96 static void vmmapentry_rsrc_init(void *);
97 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)
98 
99 static void
100 vmmapentry_rsrc_init(dummy)
101 	void *dummy;
102 {
103 	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
104 	max_proc_mmap /= 100;
105 }
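/*
 * For example (figures are illustrative only): with a vm_kmem_size of 64MB
 * and a vm_map_entry of roughly 64 bytes, the computation above yields a
 * default max_proc_mmap of about 10,000 map entries per process.
 */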
106 
107 /*
108  * MPSAFE
109  */
110 /* ARGSUSED */
111 int
112 sbrk(td, uap)
113 	struct thread *td;
114 	struct sbrk_args *uap;
115 {
116 	/* Not yet implemented */
117 	/* mtx_lock(&Giant); */
118 	/* mtx_unlock(&Giant); */
119 	return (EOPNOTSUPP);
120 }
121 
122 #ifndef _SYS_SYSPROTO_H_
123 struct sstk_args {
124 	int incr;
125 };
126 #endif
127 
128 /*
129  * MPSAFE
130  */
131 /* ARGSUSED */
132 int
133 sstk(td, uap)
134 	struct thread *td;
135 	struct sstk_args *uap;
136 {
137 	/* Not yet implemented */
138 	/* mtx_lock(&Giant); */
139 	/* mtx_unlock(&Giant); */
140 	return (EOPNOTSUPP);
141 }
142 
143 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
144 #ifndef _SYS_SYSPROTO_H_
145 struct getpagesize_args {
146 	int dummy;
147 };
148 #endif
149 
150 /* ARGSUSED */
151 int
152 ogetpagesize(td, uap)
153 	struct thread *td;
154 	struct getpagesize_args *uap;
155 {
156 	/* MP SAFE */
157 	td->td_retval[0] = PAGE_SIZE;
158 	return (0);
159 }
160 #endif				/* COMPAT_43 || COMPAT_SUNOS */
161 
162 
163 /*
164  * Memory Map (mmap) system call.  Note that the file offset
165  * and address are allowed to be NOT page aligned, though if
166  * the MAP_FIXED flag is set, both must have the same remainder
167  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
168  * page-aligned, the actual mapping starts at trunc_page(addr)
169  * and the return value is adjusted up by the page offset.
170  *
171  * Generally speaking, only character devices which are themselves
172  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
173  * there would be no cache coherency between a descriptor and a VM mapping
174  * both to the same character device.
175  *
176  * Block devices can be mmap'd no matter what they represent.  Cache coherency
177  * is maintained as long as you do not write directly to the underlying
178  * character device.
179  */
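/*
 * Illustrative example: a userland call such as
 *
 *	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
 *
 * with a non-page-aligned 'off' establishes the mapping at trunc_page(off)
 * within the file, and the returned pointer carries the page offset, i.e.
 * ((uintptr_t)p & PAGE_MASK) == (off & PAGE_MASK).
 */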
180 #ifndef _SYS_SYSPROTO_H_
181 struct mmap_args {
182 	void *addr;
183 	size_t len;
184 	int prot;
185 	int flags;
186 	int fd;
187 	long pad;
188 	off_t pos;
189 };
190 #endif
191 
192 /*
193  * MPSAFE
194  */
195 int
196 mmap(td, uap)
197 	struct thread *td;
198 	struct mmap_args *uap;
199 {
200 	struct file *fp = NULL;
201 	struct vnode *vp;
202 	vm_offset_t addr;
203 	vm_size_t size, pageoff;
204 	vm_prot_t prot, maxprot;
205 	void *handle;
206 	int flags, error;
207 	int disablexworkaround;
208 	off_t pos;
209 	struct vmspace *vms = td->td_proc->p_vmspace;
210 	vm_object_t obj;
211 
212 	addr = (vm_offset_t) uap->addr;
213 	size = uap->len;
214 	prot = uap->prot & VM_PROT_ALL;
215 	flags = uap->flags;
216 	pos = uap->pos;
217 
218 	fp = NULL;
219 	/* make sure mapping fits into numeric range etc */
220 	if ((ssize_t) uap->len < 0 ||
221 	    ((flags & MAP_ANON) && uap->fd != -1))
222 		return (EINVAL);
223 
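	/*
	 * MAP_STACK mappings are always anonymous and must request both
	 * read and write access; the descriptor must be -1 and any file
	 * offset is ignored.
	 */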
224 	if (flags & MAP_STACK) {
225 		if ((uap->fd != -1) ||
226 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
227 			return (EINVAL);
228 		flags |= MAP_ANON;
229 		pos = 0;
230 	}
231 
232 	/*
233 	 * Align the file position to a page boundary,
234 	 * and save its page offset component.
235 	 */
236 	pageoff = (pos & PAGE_MASK);
237 	pos -= pageoff;
238 
239 	/* Adjust size for rounding (on both ends). */
240 	size += pageoff;			/* low end... */
241 	size = (vm_size_t) round_page(size);	/* hi end */
242 
243 	/*
244 	 * Check for illegal addresses.  Watch out for address wrap... Note
245 	 * that VM_*_ADDRESS are not constants due to casts (argh).
246 	 */
247 	if (flags & MAP_FIXED) {
248 		/*
249 		 * The specified address must have the same remainder
250 		 * as the file offset taken modulo PAGE_SIZE, so it
251 		 * should be aligned after adjustment by pageoff.
252 		 */
253 		addr -= pageoff;
254 		if (addr & PAGE_MASK)
255 			return (EINVAL);
256 		/* Address range must be all in user VM space. */
257 		if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
258 			return (EINVAL);
259 #ifndef i386
260 		if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
261 			return (EINVAL);
262 #endif
263 		if (addr + size < addr)
264 			return (EINVAL);
265 	}
266 	/*
267 	 * XXX for non-fixed mappings where no hint is provided or
268 	 * the hint would fall in the potential heap space,
269 	 * place it after the end of the largest possible heap.
270 	 *
271 	 * There should really be a pmap call to determine a reasonable
272 	 * location.
273 	 */
274 	else if (addr == 0 ||
275 	    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
276 	     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
277 		addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
278 
279 	mtx_lock(&Giant);	/* syscall marked mp-safe but isn't */
280 	if (flags & MAP_ANON) {
281 		/*
282 		 * Mapping blank space is trivial.
283 		 */
284 		handle = NULL;
285 		maxprot = VM_PROT_ALL;
286 		pos = 0;
287 	} else {
288 		/*
289 		 * Mapping a file: get fp for validation and obtain the vnode to
290 		 * make sure it is of an appropriate type.  Don't let the
291 		 * descriptor disappear on us if we block.
292 		 */
293 		if ((error = fget(td, uap->fd, &fp)) != 0)
294 			goto done;
295 		if (fp->f_type != DTYPE_VNODE) {
296 			error = EINVAL;
297 			goto done;
298 		}
299 
300 		/*
301 		 * POSIX shared-memory objects are defined to have
302 		 * kernel persistence, and are not defined to support
303 		 * read(2)/write(2) -- or even open(2).  Thus, we can
304 		 * use MAP_ASYNC to trade on-disk coherence for speed.
305 		 * The shm_open(3) library routine turns on the FPOSIXSHM
306 		 * flag to request this behavior.
307 		 */
308 		if (fp->f_flag & FPOSIXSHM)
309 			flags |= MAP_NOSYNC;
310 		vp = (struct vnode *) fp->f_data;
311 		if (vp->v_type != VREG && vp->v_type != VCHR) {
312 			error = EINVAL;
313 			goto done;
314 		}
315 		if (vp->v_type == VREG) {
316 			/*
317 			 * Get the proper underlying object
318 			 */
319 			if (VOP_GETVOBJECT(vp, &obj) != 0) {
320 				error = EINVAL;
321 				goto done;
322 			}
323 			vp = (struct vnode*)obj->handle;
324 		}
325 		/*
326 		 * XXX hack to handle use of /dev/zero to map anon memory (ala
327 		 * SunOS).
328 		 */
329 		if ((vp->v_type == VCHR) &&
330 		    (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
331 			handle = NULL;
332 			maxprot = VM_PROT_ALL;
333 			flags |= MAP_ANON;
334 			pos = 0;
335 		} else {
336 			/*
337 			 * cdevs do not provide private mappings of any kind.
338 			 */
339 			/*
340 			 * However, for the XIG X server to continue to work,
341 			 * we should allow the superuser to do it anyway.
342 			 * We only allow it at securelevel < 1.
343 			 * (Because the XIG X server writes directly to video
344 			 * memory via /dev/mem, it should never work at any
345 			 * other securelevel.)
346 			 * XXX this will have to go
347 			 */
348 			if (securelevel_ge(td->td_ucred, 1))
349 				disablexworkaround = 1;
350 			else
351 				disablexworkaround = suser(td);
352 			if (vp->v_type == VCHR && disablexworkaround &&
353 			    (flags & (MAP_PRIVATE|MAP_COPY))) {
354 				error = EINVAL;
355 				goto done;
356 			}
357 			/*
358 			 * Ensure that file and memory protections are
359 			 * compatible.  Note that we only worry about
360 			 * writability if mapping is shared; in this case,
361 			 * current and max prot are dictated by the open file.
362 			 * XXX use the vnode instead?  Problem is: what
363 			 * credentials do we use for determination? What if
364 			 * proc does a setuid?
365 			 */
366 			maxprot = VM_PROT_EXECUTE;	/* ??? */
367 			if (fp->f_flag & FREAD) {
368 				maxprot |= VM_PROT_READ;
369 			} else if (prot & PROT_READ) {
370 				error = EACCES;
371 				goto done;
372 			}
373 			/*
374 			 * If we are sharing potential changes (either via
375 			 * MAP_SHARED or via the implicit sharing of character
376 			 * device mappings), and we are trying to get write
377 			 * permission although we opened it without asking
378 			 * for it, bail out.  Check for superuser, only if
379 			 * we're at securelevel < 1, to allow the XIG X server
380 			 * to continue to work.
381 			 */
382 			if ((flags & MAP_SHARED) != 0 ||
383 			    (vp->v_type == VCHR && disablexworkaround)) {
384 				if ((fp->f_flag & FWRITE) != 0) {
385 					struct vattr va;
386 					if ((error =
387 					    VOP_GETATTR(vp, &va,
388 						        td->td_ucred, td))) {
389 						goto done;
390 					}
391 					if ((va.va_flags &
392 					   (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
393 						maxprot |= VM_PROT_WRITE;
394 					} else if (prot & PROT_WRITE) {
395 						error = EPERM;
396 						goto done;
397 					}
398 				} else if ((prot & PROT_WRITE) != 0) {
399 					error = EACCES;
400 					goto done;
401 				}
402 			} else {
403 				maxprot |= VM_PROT_WRITE;
404 			}
405 
406 			handle = (void *)vp;
407 		}
408 	}
409 
410 	/*
411 	 * Do not allow more than a certain number of vm_map_entry structures
412 	 * per process.  Scale with the number of rforks sharing the map
413 	 * to make the limit reasonable for threads.
414 	 */
415 	if (max_proc_mmap &&
416 	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
417 		error = ENOMEM;
418 		goto done;
419 	}
420 
421 	mtx_unlock(&Giant);
422 	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
423 	    flags, handle, pos);
424 	if (error == 0)
425 		td->td_retval[0] = (register_t) (addr + pageoff);
426 	mtx_lock(&Giant);
427 done:
428 	if (fp)
429 		fdrop(fp, td);
430 	mtx_unlock(&Giant);
431 	return (error);
432 }
433 
434 #ifdef COMPAT_43
435 #ifndef _SYS_SYSPROTO_H_
436 struct ommap_args {
437 	caddr_t addr;
438 	int len;
439 	int prot;
440 	int flags;
441 	int fd;
442 	long pos;
443 };
444 #endif
445 int
446 ommap(td, uap)
447 	struct thread *td;
448 	struct ommap_args *uap;
449 {
450 	struct mmap_args nargs;
451 	static const char cvtbsdprot[8] = {
452 		0,
453 		PROT_EXEC,
454 		PROT_WRITE,
455 		PROT_EXEC | PROT_WRITE,
456 		PROT_READ,
457 		PROT_EXEC | PROT_READ,
458 		PROT_WRITE | PROT_READ,
459 		PROT_EXEC | PROT_WRITE | PROT_READ,
460 	};
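	/*
	 * The old 4.3BSD mmap() encoded protections with 0x1 = execute,
	 * 0x2 = write and 0x4 = read; cvtbsdprot[] maps that three-bit
	 * value onto the modern PROT_* flags.
	 */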
461 
462 #define	OMAP_ANON	0x0002
463 #define	OMAP_COPY	0x0020
464 #define	OMAP_SHARED	0x0010
465 #define	OMAP_FIXED	0x0100
466 
467 	nargs.addr = uap->addr;
468 	nargs.len = uap->len;
469 	nargs.prot = cvtbsdprot[uap->prot & 0x7];
470 	nargs.flags = 0;
471 	if (uap->flags & OMAP_ANON)
472 		nargs.flags |= MAP_ANON;
473 	if (uap->flags & OMAP_COPY)
474 		nargs.flags |= MAP_COPY;
475 	if (uap->flags & OMAP_SHARED)
476 		nargs.flags |= MAP_SHARED;
477 	else
478 		nargs.flags |= MAP_PRIVATE;
479 	if (uap->flags & OMAP_FIXED)
480 		nargs.flags |= MAP_FIXED;
481 	nargs.fd = uap->fd;
482 	nargs.pos = uap->pos;
483 	return (mmap(td, &nargs));
484 }
485 #endif				/* COMPAT_43 */
486 
487 
488 #ifndef _SYS_SYSPROTO_H_
489 struct msync_args {
490 	void *addr;
491 	int len;
492 	int flags;
493 };
494 #endif
495 /*
496  * MPSAFE
497  */
498 int
499 msync(td, uap)
500 	struct thread *td;
501 	struct msync_args *uap;
502 {
503 	vm_offset_t addr;
504 	vm_size_t size, pageoff;
505 	int flags;
506 	vm_map_t map;
507 	int rv;
508 
509 	addr = (vm_offset_t) uap->addr;
510 	size = uap->len;
511 	flags = uap->flags;
512 
513 	pageoff = (addr & PAGE_MASK);
514 	addr -= pageoff;
515 	size += pageoff;
516 	size = (vm_size_t) round_page(size);
517 	if (addr + size < addr)
518 		return (EINVAL);
519 
520 	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
521 		return (EINVAL);
522 
523 	mtx_lock(&Giant);
524 
525 	map = &td->td_proc->p_vmspace->vm_map;
526 
527 	/*
528 	 * XXX Gak!  If size is zero we are supposed to sync "all modified
529 	 * pages within the region containing addr".  Unfortunately, we don't
530 	 * really keep track of individual mmaps so we approximate by flushing
531 	 * the range of the map entry containing addr. This can be incorrect
532 	 * if the region splits or is coalesced with a neighbor.
533 	 */
534 	if (size == 0) {
535 		vm_map_entry_t entry;
536 
537 		vm_map_lock_read(map);
538 		rv = vm_map_lookup_entry(map, addr, &entry);
539 		vm_map_unlock_read(map);
540 		if (rv == FALSE) {
541 			rv = -1;
542 			goto done2;
543 		}
544 		addr = entry->start;
545 		size = entry->end - entry->start;
546 	}
547 
548 	/*
549 	 * Clean the pages and interpret the return value.
550 	 */
551 	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
552 	    (flags & MS_INVALIDATE) != 0);
553 
554 done2:
555 	mtx_unlock(&Giant);
556 
557 	switch (rv) {
558 	case KERN_SUCCESS:
559 		return (0);
560 	case KERN_INVALID_ADDRESS:
561 		return (EINVAL);	/* Sun returns ENOMEM? */
562 	case KERN_FAILURE:
563 		return (EIO);
564 	default:
565 		return (EINVAL);
566 	}
567 }
568 
569 #ifndef _SYS_SYSPROTO_H_
570 struct munmap_args {
571 	void *addr;
572 	size_t len;
573 };
574 #endif
575 /*
576  * MPSAFE
577  */
578 int
579 munmap(td, uap)
580 	struct thread *td;
581 	struct munmap_args *uap;
582 {
583 	vm_offset_t addr;
584 	vm_size_t size, pageoff;
585 	vm_map_t map;
586 
587 	addr = (vm_offset_t) uap->addr;
588 	size = uap->len;
589 
590 	pageoff = (addr & PAGE_MASK);
591 	addr -= pageoff;
592 	size += pageoff;
593 	size = (vm_size_t) round_page(size);
594 	if (addr + size < addr)
595 		return (EINVAL);
596 
597 	if (size == 0)
598 		return (0);
599 
600 	/*
601 	 * Check for illegal addresses.  Watch out for address wrap... Note
602 	 * that VM_*_ADDRESS are not constants due to casts (argh).
603 	 */
604 	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
605 		return (EINVAL);
606 #ifndef i386
607 	if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS)
608 		return (EINVAL);
609 #endif
610 	map = &td->td_proc->p_vmspace->vm_map;
611 	/*
612 	 * Make sure entire range is allocated.
613 	 */
614 	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
615 		return (EINVAL);
616 
617 	/* returns nothing but KERN_SUCCESS anyway */
618 	mtx_lock(&Giant);
619 	(void) vm_map_remove(map, addr, addr + size);
620 	mtx_unlock(&Giant);
621 	return (0);
622 }
623 
624 #if 0
625 void
626 munmapfd(td, fd)
627 	struct thread *td;
628 	int fd;
629 {
630 	/*
631 	 * XXX should unmap any regions mapped to this file
632 	 */
633 	FILEDESC_LOCK(td->td_proc->p_fd);
634 	td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
635 	FILEDESC_UNLOCK(td->td_proc->p_fd);
636 }
637 #endif
638 
639 #ifndef _SYS_SYSPROTO_H_
640 struct mprotect_args {
641 	const void *addr;
642 	size_t len;
643 	int prot;
644 };
645 #endif
646 /*
647  * MPSAFE
648  */
649 int
650 mprotect(td, uap)
651 	struct thread *td;
652 	struct mprotect_args *uap;
653 {
654 	vm_offset_t addr;
655 	vm_size_t size, pageoff;
656 	vm_prot_t prot;
657 
658 	addr = (vm_offset_t) uap->addr;
659 	size = uap->len;
660 	prot = uap->prot & VM_PROT_ALL;
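	/*
	 * On architectures where a readable page is implicitly executable
	 * (VM_PROT_READ_IS_EXEC), fold execute permission into any request
	 * for read so the map entry reflects what the hardware grants.
	 */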
661 #if defined(VM_PROT_READ_IS_EXEC)
662 	if (prot & VM_PROT_READ)
663 		prot |= VM_PROT_EXECUTE;
664 #endif
665 
666 	pageoff = (addr & PAGE_MASK);
667 	addr -= pageoff;
668 	size += pageoff;
669 	size = (vm_size_t) round_page(size);
670 	if (addr + size < addr)
671 		return (EINVAL);
672 
673 	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
674 	    addr + size, prot, FALSE)) {
675 	case KERN_SUCCESS:
676 		return (0);
677 	case KERN_PROTECTION_FAILURE:
678 		return (EACCES);
679 	}
680 	return (EINVAL);
681 }
682 
683 #ifndef _SYS_SYSPROTO_H_
684 struct minherit_args {
685 	void *addr;
686 	size_t len;
687 	int inherit;
688 };
689 #endif
690 /*
691  * MPSAFE
692  */
693 int
694 minherit(td, uap)
695 	struct thread *td;
696 	struct minherit_args *uap;
697 {
698 	vm_offset_t addr;
699 	vm_size_t size, pageoff;
700 	vm_inherit_t inherit;
701 
702 	addr = (vm_offset_t)uap->addr;
703 	size = uap->len;
704 	inherit = uap->inherit;
705 
706 	pageoff = (addr & PAGE_MASK);
707 	addr -= pageoff;
708 	size += pageoff;
709 	size = (vm_size_t) round_page(size);
710 	if (addr + size < addr)
711 		return (EINVAL);
712 
713 	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
714 	    addr + size, inherit)) {
715 	case KERN_SUCCESS:
716 		return (0);
717 	case KERN_PROTECTION_FAILURE:
718 		return (EACCES);
719 	}
720 	return (EINVAL);
721 }
722 
723 #ifndef _SYS_SYSPROTO_H_
724 struct madvise_args {
725 	void *addr;
726 	size_t len;
727 	int behav;
728 };
729 #endif
730 
731 /*
732  * MPSAFE
733  */
734 /* ARGSUSED */
735 int
736 madvise(td, uap)
737 	struct thread *td;
738 	struct madvise_args *uap;
739 {
740 	vm_offset_t start, end;
741 
742 	/*
743 	 * Check for illegal behavior
744 	 */
745 	if (uap->behav < 0 || uap->behav > MADV_CORE)
746 		return (EINVAL);
747 	/*
748 	 * Check for illegal addresses.  Watch out for address wrap... Note
749 	 * that VM_*_ADDRESS are not constants due to casts (argh).
750 	 */
751 	if (VM_MAXUSER_ADDRESS > 0 &&
752 		((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS)
753 		return (EINVAL);
754 #ifndef i386
755 	if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS)
756 		return (EINVAL);
757 #endif
758 	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
759 		return (EINVAL);
760 
761 	/*
762 	 * Since this routine is only advisory, we default to conservative
763 	 * behavior.
764 	 */
765 	start = trunc_page((vm_offset_t) uap->addr);
766 	end = round_page((vm_offset_t) uap->addr + uap->len);
767 
768 	if (vm_map_madvise(&td->td_proc->p_vmspace->vm_map, start, end,
769 	    uap->behav))
770 		return (EINVAL);
771 	return (0);
772 }
773 
774 #ifndef _SYS_SYSPROTO_H_
775 struct mincore_args {
776 	const void *addr;
777 	size_t len;
778 	char *vec;
779 };
780 #endif
781 
782 /*
783  * MPSAFE
784  */
785 /* ARGSUSED */
786 int
787 mincore(td, uap)
788 	struct thread *td;
789 	struct mincore_args *uap;
790 {
791 	vm_offset_t addr, first_addr;
792 	vm_offset_t end, cend;
793 	pmap_t pmap;
794 	vm_map_t map;
795 	char *vec;
796 	int error = 0;
797 	int vecindex, lastvecindex;
798 	vm_map_entry_t current;
799 	vm_map_entry_t entry;
800 	int mincoreinfo;
801 	unsigned int timestamp;
802 
803 	/*
804 	 * Make sure that the addresses presented are valid for user
805 	 * mode.
806 	 */
807 	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
808 	end = addr + (vm_size_t)round_page(uap->len);
809 	if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS)
810 		return (EINVAL);
811 	if (end < addr)
812 		return (EINVAL);
813 
814 	/*
815 	 * Address of byte vector
816 	 */
817 	vec = uap->vec;
818 
819 	mtx_lock(&Giant);
820 	map = &td->td_proc->p_vmspace->vm_map;
821 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
822 
823 	vm_map_lock_read(map);
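	/*
	 * The scan restarts from the top whenever the map's timestamp is
	 * observed to change after the read lock was dropped for subyte(),
	 * so the byte vector reflects a consistent view of the map.
	 */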
824 RestartScan:
825 	timestamp = map->timestamp;
826 
827 	if (!vm_map_lookup_entry(map, addr, &entry))
828 		entry = entry->next;
829 
830 	/*
831 	 * Do this on a map entry basis so that if the pages are not
832 	 * in the current process's address space, we can easily look
833 	 * up the pages elsewhere.
834 	 */
835 	lastvecindex = -1;
836 	for (current = entry;
837 	    (current != &map->header) && (current->start < end);
838 	    current = current->next) {
839 
840 		/*
841 		 * ignore submaps (for now) or null objects
842 		 */
843 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
844 			current->object.vm_object == NULL)
845 			continue;
846 
847 		/*
848 		 * limit this scan to the current map entry and the
849 		 * limits for the mincore call
850 		 */
851 		if (addr < current->start)
852 			addr = current->start;
853 		cend = current->end;
854 		if (cend > end)
855 			cend = end;
856 
857 		/*
858 		 * scan this entry one page at a time
859 		 */
860 		while (addr < cend) {
861 			/*
862 			 * Check the pmap first; it is likely faster and can
863 			 * also tell us whether we are the one referencing or
864 			 * modifying the page.
865 			 */
866 			mincoreinfo = pmap_mincore(pmap, addr);
867 			if (!mincoreinfo) {
868 				vm_pindex_t pindex;
869 				vm_ooffset_t offset;
870 				vm_page_t m;
871 				/*
872 				 * calculate the page index into the object
873 				 */
874 				offset = current->offset + (addr - current->start);
875 				pindex = OFF_TO_IDX(offset);
876 				m = vm_page_lookup(current->object.vm_object,
877 					pindex);
878 				/*
879 				 * if the page is resident, then gather information about
880 				 * it.
881 				 */
882 				if (m) {
883 					mincoreinfo = MINCORE_INCORE;
884 					if (m->dirty ||
885 						pmap_is_modified(m))
886 						mincoreinfo |= MINCORE_MODIFIED_OTHER;
887 					if ((m->flags & PG_REFERENCED) ||
888 						pmap_ts_referenced(m)) {
889 						vm_page_flag_set(m, PG_REFERENCED);
890 						mincoreinfo |= MINCORE_REFERENCED_OTHER;
891 					}
892 				}
893 			}
894 
895 			/*
896 			 * subyte may page fault.  In case it needs to modify
897 			 * the map, we release the lock.
898 			 */
899 			vm_map_unlock_read(map);
900 
901 			/*
902 			 * calculate index into user supplied byte vector
903 			 */
904 			vecindex = OFF_TO_IDX(addr - first_addr);
905 
906 			/*
907 			 * If we have skipped map entries, we need to make sure that
908 			 * the byte vector is zeroed for those skipped entries.
909 			 */
910 			while ((lastvecindex + 1) < vecindex) {
911 				++lastvecindex;
912 				error = subyte(vec + lastvecindex, 0);
913 				if (error) {
914 					error = EFAULT;
915 					goto done2;
916 				}
917 			}
918 
919 			/*
920 			 * Pass the page information to the user
921 			 */
922 			error = subyte(vec + vecindex, mincoreinfo);
923 			if (error) {
924 				error = EFAULT;
925 				goto done2;
926 			}
927 
928 			/*
929 			 * If the map has changed, due to the subyte, the previous
930 			 * output may be invalid.
931 			 */
932 			vm_map_lock_read(map);
933 			if (timestamp != map->timestamp)
934 				goto RestartScan;
935 
936 			lastvecindex = vecindex;
937 			addr += PAGE_SIZE;
938 		}
939 	}
940 
941 	/*
942 	 * subyte may page fault.  In case it needs to modify
943 	 * the map, we release the lock.
944 	 */
945 	vm_map_unlock_read(map);
946 
947 	/*
948 	 * Zero the last entries in the byte vector.
949 	 */
950 	vecindex = OFF_TO_IDX(end - first_addr);
951 	while ((lastvecindex + 1) < vecindex) {
952 		++lastvecindex;
953 		error = subyte(vec + lastvecindex, 0);
954 		if (error) {
955 			error = EFAULT;
956 			goto done2;
957 		}
958 	}
959 
960 	/*
961 	 * If the map has changed, due to the subyte, the previous
962 	 * output may be invalid.
963 	 */
964 	vm_map_lock_read(map);
965 	if (timestamp != map->timestamp)
966 		goto RestartScan;
967 	vm_map_unlock_read(map);
968 done2:
969 	mtx_unlock(&Giant);
970 	return (error);
971 }
972 
973 #ifndef _SYS_SYSPROTO_H_
974 struct mlock_args {
975 	const void *addr;
976 	size_t len;
977 };
978 #endif
979 /*
980  * MPSAFE
981  */
982 int
983 mlock(td, uap)
984 	struct thread *td;
985 	struct mlock_args *uap;
986 {
987 	vm_offset_t addr;
988 	vm_size_t size, pageoff;
989 	int error;
990 
991 	addr = (vm_offset_t) uap->addr;
992 	size = uap->len;
993 
994 	pageoff = (addr & PAGE_MASK);
995 	addr -= pageoff;
996 	size += pageoff;
997 	size = (vm_size_t) round_page(size);
998 
999 	/* disable wrap around */
1000 	if (addr + size < addr)
1001 		return (EINVAL);
1002 
1003 	if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
1004 		return (EAGAIN);
1005 
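	/*
	 * Where the pmap can report its wired page count, enforce the
	 * per-process RLIMIT_MEMLOCK resource limit; otherwise fall back
	 * to requiring superuser privilege.
	 */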
1006 #ifdef pmap_wired_count
1007 	if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
1008 	    td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
1009 		return (ENOMEM);
1010 #else
1011 	error = suser(td);
1012 	if (error)
1013 		return (error);
1014 #endif
1015 
1016 	mtx_lock(&Giant);
1017 	error = vm_map_user_pageable(&td->td_proc->p_vmspace->vm_map, addr,
1018 		     addr + size, FALSE);
1019 	mtx_unlock(&Giant);
1020 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1021 }
1022 
1023 #ifndef _SYS_SYSPROTO_H_
1024 struct mlockall_args {
1025 	int	how;
1026 };
1027 #endif
1028 
1029 /*
1030  * MPSAFE
1031  */
1032 int
1033 mlockall(td, uap)
1034 	struct thread *td;
1035 	struct mlockall_args *uap;
1036 {
1037 	/* mtx_lock(&Giant); */
1038 	/* mtx_unlock(&Giant); */
1039 	return (0);
1040 }
1041 
1042 #ifndef _SYS_SYSPROTO_H_
1043 struct munlockall_args {
1044 	register_t dummy;
1045 };
1046 #endif
1047 
1048 /*
1049  * MPSAFE
1050  */
1051 int
1052 munlockall(td, uap)
1053 	struct thread *td;
1054 	struct munlockall_args *uap;
1055 {
1056 	/* mtx_lock(&Giant); */
1057 	/* mtx_unlock(&Giant); */
1058 	return (0);
1059 }
1060 
1061 #ifndef _SYS_SYSPROTO_H_
1062 struct munlock_args {
1063 	const void *addr;
1064 	size_t len;
1065 };
1066 #endif
1067 /*
1068  * MPSAFE
1069  */
1070 int
1071 munlock(td, uap)
1072 	struct thread *td;
1073 	struct munlock_args *uap;
1074 {
1075 	vm_offset_t addr;
1076 	vm_size_t size, pageoff;
1077 	int error;
1078 
1079 	addr = (vm_offset_t) uap->addr;
1080 	size = uap->len;
1081 
1082 	pageoff = (addr & PAGE_MASK);
1083 	addr -= pageoff;
1084 	size += pageoff;
1085 	size = (vm_size_t) round_page(size);
1086 
1087 	/* disable wrap around */
1088 	if (addr + size < addr)
1089 		return (EINVAL);
1090 
1091 #ifndef pmap_wired_count
1092 	error = suser(td);
1093 	if (error)
1094 		return (error);
1095 #endif
1096 
1097 	mtx_lock(&Giant);
1098 	error = vm_map_user_pageable(&td->td_proc->p_vmspace->vm_map, addr,
1099 		     addr + size, TRUE);
1100 	mtx_unlock(&Giant);
1101 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1102 }
1103 
1104 /*
1105  * vm_mmap()
1106  *
1107  * MPSAFE
1108  *
1109  * Internal version of mmap.  Currently used by mmap, exec, and sys5
1110  * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
1111  */
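/*
 * Illustrative sketch of an internal call (the names used are hypothetical):
 *
 *	vm_mmap(&vmspace->vm_map, &addr, size, prot, VM_PROT_ALL,
 *	    MAP_PRIVATE | MAP_FIXED, (void *)vp, file_offset);
 *
 * maps 'size' bytes of the vnode 'vp' starting at the page-aligned
 * 'file_offset' into the given map at 'addr'.
 */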
1112 int
1113 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1114 	vm_prot_t maxprot, int flags,
1115 	void *handle,
1116 	vm_ooffset_t foff)
1117 {
1118 	boolean_t fitit;
1119 	vm_object_t object;
1120 	struct vnode *vp = NULL;
1121 	objtype_t type;
1122 	int rv = KERN_SUCCESS;
1123 	vm_ooffset_t objsize;
1124 	int docow;
1125 	struct thread *td = curthread;
1126 
1127 	if (size == 0)
1128 		return (0);
1129 
1130 	objsize = size = round_page(size);
1131 
1132 	/*
1133 	 * We currently can only deal with page aligned file offsets.
1134 	 * The check is here rather than in the syscall because the
1135 	 * kernel calls this function internally for other mmapping
1136 	 * operations (such as in exec) and non-aligned offsets will
1137 	 * cause pmap inconsistencies...so we want to be sure to
1138 	 * disallow this in all cases.
1139 	 */
1140 	if (foff & PAGE_MASK)
1141 		return (EINVAL);
1142 
1143 	if ((flags & MAP_FIXED) == 0) {
1144 		fitit = TRUE;
1145 		*addr = round_page(*addr);
1146 		mtx_lock(&Giant);
1147 	} else {
1148 		if (*addr != trunc_page(*addr))
1149 			return (EINVAL);
1150 		fitit = FALSE;
1151 		mtx_lock(&Giant);
1152 		(void) vm_map_remove(map, *addr, *addr + size);
1153 	}
1154 
1155 	/*
1156 	 * Lookup/allocate object.
1157 	 */
1158 	if (flags & MAP_ANON) {
1159 		type = OBJT_DEFAULT;
1160 		/*
1161 		 * Unnamed anonymous regions always start at 0.
1162 		 */
1163 		if (handle == 0)
1164 			foff = 0;
1165 	} else {
1166 		vp = (struct vnode *) handle;
1167 		if (vp->v_type == VCHR) {
1168 			type = OBJT_DEVICE;
1169 			handle = (void *)(intptr_t)vp->v_rdev;
1170 		} else {
1171 			struct vattr vat;
1172 			int error;
1173 
1174 			error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
1175 			if (error) {
1176 				mtx_unlock(&Giant);
1177 				return (error);
1178 			}
1179 			objsize = round_page(vat.va_size);
1180 			type = OBJT_VNODE;
1181 			/*
1182 			 * if it is a regular file without any references
1183 			 * we do not need to sync it.
1184 			 */
1185 			if (vp->v_type == VREG && vat.va_nlink == 0) {
1186 				flags |= MAP_NOSYNC;
1187 			}
1188 		}
1189 	}
1190 
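	/*
	 * An anonymous mapping starts with no backing object (one is
	 * allocated on first fault); otherwise ask the pager for the
	 * object backing the vnode or device.
	 */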
1191 	if (handle == NULL) {
1192 		object = NULL;
1193 		docow = 0;
1194 	} else {
1195 		object = vm_pager_allocate(type,
1196 			handle, objsize, prot, foff);
1197 		if (object == NULL) {
1198 			mtx_unlock(&Giant);
1199 			return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
1200 		}
1201 		docow = MAP_PREFAULT_PARTIAL;
1202 	}
1203 
1204 	/*
1205 	 * Force device mappings to be shared.
1206 	 */
1207 	if (type == OBJT_DEVICE || type == OBJT_PHYS) {
1208 		flags &= ~(MAP_PRIVATE|MAP_COPY);
1209 		flags |= MAP_SHARED;
1210 	}
1211 
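	/*
	 * Translate the remaining mmap() flags into the copy-on-write and
	 * syncer/coredump control bits (docow) understood by vm_map_find()
	 * and vm_map_stack().
	 */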
1212 	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1213 		docow |= MAP_COPY_ON_WRITE;
1214 	if (flags & MAP_NOSYNC)
1215 		docow |= MAP_DISABLE_SYNCER;
1216 	if (flags & MAP_NOCORE)
1217 		docow |= MAP_DISABLE_COREDUMP;
1218 
1219 #if defined(VM_PROT_READ_IS_EXEC)
1220 	if (prot & VM_PROT_READ)
1221 		prot |= VM_PROT_EXECUTE;
1222 
1223 	if (maxprot & VM_PROT_READ)
1224 		maxprot |= VM_PROT_EXECUTE;
1225 #endif
1226 
1227 	if (fitit)
1228 		*addr = pmap_addr_hint(object, *addr, size);
1229 
1230 	if (flags & MAP_STACK)
1231 		rv = vm_map_stack(map, *addr, size, prot,
1232 				   maxprot, docow);
1233 	else
1234 		rv = vm_map_find(map, object, foff, addr, size, fitit,
1235 				 prot, maxprot, docow);
1236 
1237 	if (rv != KERN_SUCCESS) {
1238 		/*
1239 		 * Lose the object reference. Will destroy the
1240 		 * object if it's an unnamed anonymous mapping
1241 		 * or named anonymous without other references.
1242 		 */
1243 		vm_object_deallocate(object);
1244 	} else if (flags & MAP_SHARED) {
1245 		/*
1246 		 * Shared memory is also shared with children.
1247 		 */
1248 		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
1249 		if (rv != KERN_SUCCESS)
1250 			(void) vm_map_remove(map, *addr, *addr + size);
1251 	}
1252 	mtx_unlock(&Giant);
1253 	switch (rv) {
1254 	case KERN_SUCCESS:
1255 		return (0);
1256 	case KERN_INVALID_ADDRESS:
1257 	case KERN_NO_SPACE:
1258 		return (ENOMEM);
1259 	case KERN_PROTECTION_FAILURE:
1260 		return (EACCES);
1261 	default:
1262 		return (EINVAL);
1263 	}
1264 }
1265