xref: /freebsd/sys/vm/vm_mmap.c (revision 06064893b3c62c648518be78604fac29fc0d9d61)
1 /*
2  * Copyright (c) 1988 University of Utah.
3  * Copyright (c) 1991, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the Systems Programming Group of the University of Utah Computer
8  * Science Department.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
35  *
36  *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
37  */
38 
39 /*
40  * Mapped file (mmap) interface to VM
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include "opt_compat.h"
47 #include "opt_mac.h"
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/lock.h>
53 #include <sys/mutex.h>
54 #include <sys/sysproto.h>
55 #include <sys/filedesc.h>
56 #include <sys/proc.h>
57 #include <sys/resource.h>
58 #include <sys/resourcevar.h>
59 #include <sys/vnode.h>
60 #include <sys/fcntl.h>
61 #include <sys/file.h>
62 #include <sys/mac.h>
63 #include <sys/mman.h>
64 #include <sys/mount.h>
65 #include <sys/conf.h>
66 #include <sys/stat.h>
67 #include <sys/vmmeter.h>
68 #include <sys/sysctl.h>
69 
70 #include <vm/vm.h>
71 #include <vm/vm_param.h>
72 #include <vm/pmap.h>
73 #include <vm/vm_map.h>
74 #include <vm/vm_object.h>
75 #include <vm/vm_page.h>
76 #include <vm/vm_pager.h>
77 #include <vm/vm_pageout.h>
78 #include <vm/vm_extern.h>
79 #include <vm/vm_page.h>
80 #include <vm/vm_kern.h>
81 
82 #ifndef _SYS_SYSPROTO_H_
83 struct sbrk_args {
84 	int incr;
85 };
86 #endif
87 
88 static int max_proc_mmap;
89 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
90 
91 /*
92  * Set the maximum number of vm_map_entry structures per process.  Roughly
93  * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
94  * of our KVM malloc space still results in generous limits.  We want a
95  * default that is good enough to prevent the kernel from running out of
96  * resources if attacked from a compromised user account, but generous enough
97  * that multi-threaded processes are not unduly inconvenienced.
98  */
99 static void vmmapentry_rsrc_init(void *);
100 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)
101 
102 static void
103 vmmapentry_rsrc_init(dummy)
104 	void *dummy;
105 {
106 	max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
107 	max_proc_mmap /= 100;
108 }
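/*
 * Worked example with illustrative figures (the real values depend on the
 * platform): with vm_kmem_size of 200 MB and a vm_map_entry of roughly
 * 100 bytes, the limit computed above is about 200 MB / 100 / 100, i.e.
 * roughly 21,000 vm_map_entry structures per process.
 */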
109 
110 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
111     int *, struct vnode *, vm_ooffset_t, vm_object_t *);
112 
113 /*
114  * MPSAFE
115  */
116 /* ARGSUSED */
117 int
118 sbrk(td, uap)
119 	struct thread *td;
120 	struct sbrk_args *uap;
121 {
122 	/* Not yet implemented */
123 	return (EOPNOTSUPP);
124 }
125 
126 #ifndef _SYS_SYSPROTO_H_
127 struct sstk_args {
128 	int incr;
129 };
130 #endif
131 
132 /*
133  * MPSAFE
134  */
135 /* ARGSUSED */
136 int
137 sstk(td, uap)
138 	struct thread *td;
139 	struct sstk_args *uap;
140 {
141 	/* Not yet implemented */
142 	return (EOPNOTSUPP);
143 }
144 
145 #if defined(COMPAT_43)
146 #ifndef _SYS_SYSPROTO_H_
147 struct getpagesize_args {
148 	int dummy;
149 };
150 #endif
151 
152 /* ARGSUSED */
153 int
154 ogetpagesize(td, uap)
155 	struct thread *td;
156 	struct getpagesize_args *uap;
157 {
158 	/* MP SAFE */
159 	td->td_retval[0] = PAGE_SIZE;
160 	return (0);
161 }
162 #endif				/* COMPAT_43 */
163 
164 
165 /*
166  * Memory Map (mmap) system call.  Note that the file offset
167  * and address are allowed to be NOT page aligned, though if
168  * the MAP_FIXED flag is set, both must have the same remainder
169  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
170  * page-aligned, the actual mapping starts at trunc_page(addr)
171  * and the return value is adjusted up by the page offset.
172  *
173  * Generally speaking, only character devices which are themselves
174  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
175  * there would be no cache coherency between a descriptor and a VM mapping
176  * both to the same character device.
177  *
178  * Block devices can be mmap'd no matter what they represent.  Cache coherency
179  * is maintained as long as you do not write directly to the underlying
180  * character device.
181  */
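/*
 * Illustrative userland sketch of the offset-adjustment behavior described
 * above (the path and offset below are assumptions for the example).  With a
 * non-page-aligned offset and no MAP_FIXED, the kernel maps from the
 * page-aligned file position and returns a pointer advanced by the page
 * offset, so *p is the byte at offset 100 in the file.
 */
#if 0
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int
example(void)
{
	int fd = open("/tmp/example", O_RDONLY);	/* assumed test file */
	off_t off = 100;				/* not page aligned */
	size_t len = 1024;
	char *p;

	p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
	if (p == MAP_FAILED)
		return (-1);
	/* munmap() applies the same trunc_page()/round_page() adjustment. */
	munmap(p, len);
	close(fd);
	return (0);
}
#endif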
182 #ifndef _SYS_SYSPROTO_H_
183 struct mmap_args {
184 	void *addr;
185 	size_t len;
186 	int prot;
187 	int flags;
188 	int fd;
189 	long pad;
190 	off_t pos;
191 };
192 #endif
193 
194 /*
195  * MPSAFE
196  */
197 int
198 mmap(td, uap)
199 	struct thread *td;
200 	struct mmap_args *uap;
201 {
202 	struct file *fp;
203 	struct vnode *vp;
204 	vm_offset_t addr;
205 	vm_size_t size, pageoff;
206 	vm_prot_t prot, maxprot;
207 	void *handle;
208 	int flags, error;
209 	off_t pos;
210 	struct vmspace *vms = td->td_proc->p_vmspace;
211 
212 	addr = (vm_offset_t) uap->addr;
213 	size = uap->len;
214 	prot = uap->prot & VM_PROT_ALL;
215 	flags = uap->flags;
216 	pos = uap->pos;
217 
218 	fp = NULL;
219 	/* make sure mapping fits into numeric range etc */
220 	if ((ssize_t) uap->len < 0 ||
221 	    ((flags & MAP_ANON) && uap->fd != -1))
222 		return (EINVAL);
223 
224 	if (flags & MAP_STACK) {
225 		if ((uap->fd != -1) ||
226 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
227 			return (EINVAL);
228 		flags |= MAP_ANON;
229 		pos = 0;
230 	}
231 
232 	/*
233 	 * Align the file position to a page boundary,
234 	 * and save its page offset component.
235 	 */
236 	pageoff = (pos & PAGE_MASK);
237 	pos -= pageoff;
238 
239 	/* Adjust size for rounding (on both ends). */
240 	size += pageoff;			/* low end... */
241 	size = (vm_size_t) round_page(size);	/* hi end */
242 
243 	/*
244 	 * Check for illegal addresses.  Watch out for address wrap... Note
245 	 * that VM_*_ADDRESS are not constants due to casts (argh).
246 	 */
247 	if (flags & MAP_FIXED) {
248 		/*
249 		 * The specified address must have the same remainder
250 		 * as the file offset taken modulo PAGE_SIZE, so it
251 		 * should be aligned after adjustment by pageoff.
252 		 */
253 		addr -= pageoff;
254 		if (addr & PAGE_MASK)
255 			return (EINVAL);
256 		/* Address range must be all in user VM space. */
257 		if (addr < vm_map_min(&vms->vm_map) ||
258 		    addr + size > vm_map_max(&vms->vm_map))
259 			return (EINVAL);
260 		if (addr + size < addr)
261 			return (EINVAL);
262 	} else {
263 		/*
264 		 * XXX for non-fixed mappings where no hint is provided or
265 		 * the hint would fall in the potential heap space,
266 		 * place it after the end of the largest possible heap.
267 		 *
268 		 * There should really be a pmap call to determine a reasonable
269 		 * location.
270 		 */
271 		PROC_LOCK(td->td_proc);
272 		if (addr == 0 ||
273 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
274 		    addr < round_page((vm_offset_t)vms->vm_daddr +
275 		    lim_max(td->td_proc, RLIMIT_DATA))))
276 			addr = round_page((vm_offset_t)vms->vm_daddr +
277 			    lim_max(td->td_proc, RLIMIT_DATA));
278 		PROC_UNLOCK(td->td_proc);
279 	}
280 	if (flags & MAP_ANON) {
281 		/*
282 		 * Mapping blank space is trivial.
283 		 */
284 		handle = NULL;
285 		maxprot = VM_PROT_ALL;
286 		pos = 0;
287 	} else {
288 		/*
289 		 * Mapping file, get fp for validation.  Obtain vnode and make
290 		 * sure it is of appropriate type.  Don't let the descriptor
291 		 * disappear on us if we block.
292 		 */
293 		if ((error = fget(td, uap->fd, &fp)) != 0)
294 			goto done;
295 		if (fp->f_type != DTYPE_VNODE) {
296 			error = EINVAL;
297 			goto done;
298 		}
299 		/*
300 		 * POSIX shared-memory objects are defined to have
301 		 * kernel persistence, and are not defined to support
302 		 * read(2)/write(2) -- or even open(2).  Thus, we can
303 		 * use MAP_NOSYNC to trade on-disk coherence for speed.
304 		 * The shm_open(3) library routine turns on the FPOSIXSHM
305 		 * flag to request this behavior.
306 		 */
307 		if (fp->f_flag & FPOSIXSHM)
308 			flags |= MAP_NOSYNC;
309 		vp = fp->f_vnode;
310 		/*
311 		 * Ensure that file and memory protections are
312 		 * compatible.  Note that we only worry about
313 		 * writability if mapping is shared; in this case,
314 		 * current and max prot are dictated by the open file.
315 		 * XXX use the vnode instead?  Problem is: what
316 		 * credentials do we use for determination? What if
317 		 * proc does a setuid?
318 		 */
319 		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
320 			maxprot = VM_PROT_NONE;
321 		else
322 			maxprot = VM_PROT_EXECUTE;
323 		if (fp->f_flag & FREAD) {
324 			maxprot |= VM_PROT_READ;
325 		} else if (prot & PROT_READ) {
326 			error = EACCES;
327 			goto done;
328 		}
329 		/*
330 		 * If we are sharing potential changes (either via
331 		 * MAP_SHARED or via the implicit sharing of character
332 		 * device mappings), and we are trying to get write
333 		 * permission although we opened it without asking
334 		 * for it, bail out.
335 		 */
336 		if ((flags & MAP_SHARED) != 0) {
337 			if ((fp->f_flag & FWRITE) != 0) {
338 				maxprot |= VM_PROT_WRITE;
339 			} else if ((prot & PROT_WRITE) != 0) {
340 				error = EACCES;
341 				goto done;
342 			}
343 		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
344 			maxprot |= VM_PROT_WRITE;
345 		}
346 		handle = (void *)vp;
347 	}
348 
349 	/*
350 	 * Do not allow more than a certain number of vm_map_entry structures
351 	 * per process.  Scale with the number of rforks sharing the map
352 	 * to make the limit reasonable for threads.
353 	 */
354 	if (max_proc_mmap &&
355 	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
356 		error = ENOMEM;
357 		goto done;
358 	}
359 
360 	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
361 	    flags, handle, pos);
362 	if (error == 0)
363 		td->td_retval[0] = (register_t) (addr + pageoff);
364 done:
365 	if (fp)
366 		fdrop(fp, td);
367 
368 	return (error);
369 }
370 
371 #ifdef COMPAT_43
372 #ifndef _SYS_SYSPROTO_H_
373 struct ommap_args {
374 	caddr_t addr;
375 	int len;
376 	int prot;
377 	int flags;
378 	int fd;
379 	long pos;
380 };
381 #endif
382 int
383 ommap(td, uap)
384 	struct thread *td;
385 	struct ommap_args *uap;
386 {
387 	struct mmap_args nargs;
388 	static const char cvtbsdprot[8] = {
389 		0,
390 		PROT_EXEC,
391 		PROT_WRITE,
392 		PROT_EXEC | PROT_WRITE,
393 		PROT_READ,
394 		PROT_EXEC | PROT_READ,
395 		PROT_WRITE | PROT_READ,
396 		PROT_EXEC | PROT_WRITE | PROT_READ,
397 	};
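	/*
	 * The old 4.3BSD protection argument packs execute, write and read
	 * permission into bits 0x1, 0x2 and 0x4 respectively; the table above
	 * maps each of the eight combinations to the modern PROT_* flags.
	 */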
398 
399 #define	OMAP_ANON	0x0002
400 #define	OMAP_COPY	0x0020
401 #define	OMAP_SHARED	0x0010
402 #define	OMAP_FIXED	0x0100
403 
404 	nargs.addr = uap->addr;
405 	nargs.len = uap->len;
406 	nargs.prot = cvtbsdprot[uap->prot & 0x7];
407 	nargs.flags = 0;
408 	if (uap->flags & OMAP_ANON)
409 		nargs.flags |= MAP_ANON;
410 	if (uap->flags & OMAP_COPY)
411 		nargs.flags |= MAP_COPY;
412 	if (uap->flags & OMAP_SHARED)
413 		nargs.flags |= MAP_SHARED;
414 	else
415 		nargs.flags |= MAP_PRIVATE;
416 	if (uap->flags & OMAP_FIXED)
417 		nargs.flags |= MAP_FIXED;
418 	nargs.fd = uap->fd;
419 	nargs.pos = uap->pos;
420 	return (mmap(td, &nargs));
421 }
422 #endif				/* COMPAT_43 */
423 
424 
425 #ifndef _SYS_SYSPROTO_H_
426 struct msync_args {
427 	void *addr;
428 	int len;
429 	int flags;
430 };
431 #endif
432 /*
433  * MPSAFE
434  */
435 int
436 msync(td, uap)
437 	struct thread *td;
438 	struct msync_args *uap;
439 {
440 	vm_offset_t addr;
441 	vm_size_t size, pageoff;
442 	int flags;
443 	vm_map_t map;
444 	int rv;
445 
446 	addr = (vm_offset_t) uap->addr;
447 	size = uap->len;
448 	flags = uap->flags;
449 
450 	pageoff = (addr & PAGE_MASK);
451 	addr -= pageoff;
452 	size += pageoff;
453 	size = (vm_size_t) round_page(size);
454 	if (addr + size < addr)
455 		return (EINVAL);
456 
457 	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
458 		return (EINVAL);
459 
460 	map = &td->td_proc->p_vmspace->vm_map;
461 
462 	/*
463 	 * Clean the pages and interpret the return value.
464 	 */
465 	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
466 	    (flags & MS_INVALIDATE) != 0);
467 	switch (rv) {
468 	case KERN_SUCCESS:
469 		return (0);
470 	case KERN_INVALID_ADDRESS:
471 		return (EINVAL);	/* Sun returns ENOMEM? */
472 	case KERN_INVALID_ARGUMENT:
473 		return (EBUSY);
474 	default:
475 		return (EINVAL);
476 	}
477 }
478 
479 #ifndef _SYS_SYSPROTO_H_
480 struct munmap_args {
481 	void *addr;
482 	size_t len;
483 };
484 #endif
485 /*
486  * MPSAFE
487  */
488 int
489 munmap(td, uap)
490 	struct thread *td;
491 	struct munmap_args *uap;
492 {
493 	vm_offset_t addr;
494 	vm_size_t size, pageoff;
495 	vm_map_t map;
496 
497 	addr = (vm_offset_t) uap->addr;
498 	size = uap->len;
499 	if (size == 0)
500 		return (EINVAL);
501 
502 	pageoff = (addr & PAGE_MASK);
503 	addr -= pageoff;
504 	size += pageoff;
505 	size = (vm_size_t) round_page(size);
506 	if (addr + size < addr)
507 		return (EINVAL);
508 
509 	/*
510 	 * Check for illegal addresses.  Watch out for address wrap...
511 	 */
512 	map = &td->td_proc->p_vmspace->vm_map;
513 	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
514 		return (EINVAL);
515 	vm_map_lock(map);
516 	/*
517 	 * Make sure entire range is allocated.
518 	 */
519 	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
520 		vm_map_unlock(map);
521 		return (EINVAL);
522 	}
523 	/* returns nothing but KERN_SUCCESS anyway */
524 	vm_map_delete(map, addr, addr + size);
525 	vm_map_unlock(map);
526 	return (0);
527 }
528 
529 #ifndef _SYS_SYSPROTO_H_
530 struct mprotect_args {
531 	const void *addr;
532 	size_t len;
533 	int prot;
534 };
535 #endif
536 /*
537  * MPSAFE
538  */
539 int
540 mprotect(td, uap)
541 	struct thread *td;
542 	struct mprotect_args *uap;
543 {
544 	vm_offset_t addr;
545 	vm_size_t size, pageoff;
546 	vm_prot_t prot;
547 
548 	addr = (vm_offset_t) uap->addr;
549 	size = uap->len;
550 	prot = uap->prot & VM_PROT_ALL;
551 #if defined(VM_PROT_READ_IS_EXEC)
552 	if (prot & VM_PROT_READ)
553 		prot |= VM_PROT_EXECUTE;
554 #endif
555 
556 	pageoff = (addr & PAGE_MASK);
557 	addr -= pageoff;
558 	size += pageoff;
559 	size = (vm_size_t) round_page(size);
560 	if (addr + size < addr)
561 		return (EINVAL);
562 
563 	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
564 	    addr + size, prot, FALSE)) {
565 	case KERN_SUCCESS:
566 		return (0);
567 	case KERN_PROTECTION_FAILURE:
568 		return (EACCES);
569 	}
570 	return (EINVAL);
571 }
572 
573 #ifndef _SYS_SYSPROTO_H_
574 struct minherit_args {
575 	void *addr;
576 	size_t len;
577 	int inherit;
578 };
579 #endif
580 /*
581  * MPSAFE
582  */
583 int
584 minherit(td, uap)
585 	struct thread *td;
586 	struct minherit_args *uap;
587 {
588 	vm_offset_t addr;
589 	vm_size_t size, pageoff;
590 	vm_inherit_t inherit;
591 
592 	addr = (vm_offset_t)uap->addr;
593 	size = uap->len;
594 	inherit = uap->inherit;
595 
596 	pageoff = (addr & PAGE_MASK);
597 	addr -= pageoff;
598 	size += pageoff;
599 	size = (vm_size_t) round_page(size);
600 	if (addr + size < addr)
601 		return (EINVAL);
602 
603 	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
604 	    addr + size, inherit)) {
605 	case KERN_SUCCESS:
606 		return (0);
607 	case KERN_PROTECTION_FAILURE:
608 		return (EACCES);
609 	}
610 	return (EINVAL);
611 }
612 
613 #ifndef _SYS_SYSPROTO_H_
614 struct madvise_args {
615 	void *addr;
616 	size_t len;
617 	int behav;
618 };
619 #endif
620 
621 /*
622  * MPSAFE
623  */
624 /* ARGSUSED */
625 int
626 madvise(td, uap)
627 	struct thread *td;
628 	struct madvise_args *uap;
629 {
630 	vm_offset_t start, end;
631 	vm_map_t map;
632 	struct proc *p;
633 	int error;
634 
635 	/*
636 	 * Check for our special case, advising the swap pager we are
637 	 * "immortal."
638 	 */
639 	if (uap->behav == MADV_PROTECT) {
640 		error = suser(td);
641 		if (error == 0) {
642 			p = td->td_proc;
643 			PROC_LOCK(p);
644 			p->p_flag |= P_PROTECTED;
645 			PROC_UNLOCK(p);
646 		}
647 		return (error);
648 	}
649 	/*
650 	 * Check for illegal behavior
651 	 */
652 	if (uap->behav < 0 || uap->behav > MADV_CORE)
653 		return (EINVAL);
654 	/*
655 	 * Check for illegal addresses.  Watch out for address wrap... Note
656 	 * that VM_*_ADDRESS are not constants due to casts (argh).
657 	 */
658 	map = &td->td_proc->p_vmspace->vm_map;
659 	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
660 	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
661 		return (EINVAL);
662 	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
663 		return (EINVAL);
664 
665 	/*
666 	 * Since this routine is only advisory, we default to conservative
667 	 * behavior.
668 	 */
669 	start = trunc_page((vm_offset_t) uap->addr);
670 	end = round_page((vm_offset_t) uap->addr + uap->len);
671 
672 	if (vm_map_madvise(map, start, end, uap->behav))
673 		return (EINVAL);
674 	return (0);
675 }
676 
677 #ifndef _SYS_SYSPROTO_H_
678 struct mincore_args {
679 	const void *addr;
680 	size_t len;
681 	char *vec;
682 };
683 #endif
684 
685 /*
686  * MPSAFE
687  */
688 /* ARGSUSED */
689 int
690 mincore(td, uap)
691 	struct thread *td;
692 	struct mincore_args *uap;
693 {
694 	vm_offset_t addr, first_addr;
695 	vm_offset_t end, cend;
696 	pmap_t pmap;
697 	vm_map_t map;
698 	char *vec;
699 	int error = 0;
700 	int vecindex, lastvecindex;
701 	vm_map_entry_t current;
702 	vm_map_entry_t entry;
703 	int mincoreinfo;
704 	unsigned int timestamp;
705 
706 	/*
707 	 * Make sure that the addresses presented are valid for user
708 	 * mode.
709 	 */
710 	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
711 	end = addr + (vm_size_t)round_page(uap->len);
712 	map = &td->td_proc->p_vmspace->vm_map;
713 	if (end > vm_map_max(map) || end < addr)
714 		return (EINVAL);
715 
716 	/*
717 	 * Address of byte vector
718 	 */
719 	vec = uap->vec;
720 
721 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
722 
723 	vm_map_lock_read(map);
724 RestartScan:
725 	timestamp = map->timestamp;
726 
727 	if (!vm_map_lookup_entry(map, addr, &entry))
728 		entry = entry->next;
729 
730 	/*
731 	 * Do this on a map entry basis so that if the pages are not
732 	 * in the current process's address space, we can easily look
733 	 * up the pages elsewhere.
734 	 */
735 	lastvecindex = -1;
736 	for (current = entry;
737 	    (current != &map->header) && (current->start < end);
738 	    current = current->next) {
739 
740 		/*
741 		 * ignore submaps (for now) or null objects
742 		 */
743 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
744 			current->object.vm_object == NULL)
745 			continue;
746 
747 		/*
748 		 * limit this scan to the current map entry and the
749 		 * limits for the mincore call
750 		 */
751 		if (addr < current->start)
752 			addr = current->start;
753 		cend = current->end;
754 		if (cend > end)
755 			cend = end;
756 
757 		/*
758 		 * scan this entry one page at a time
759 		 */
760 		while (addr < cend) {
761 			/*
762 			 * Check pmap first, it is likely faster, also
763 			 * it can provide info as to whether we are the
764 			 * one referencing or modifying the page.
765 			 */
766 			mincoreinfo = pmap_mincore(pmap, addr);
767 			if (!mincoreinfo) {
768 				vm_pindex_t pindex;
769 				vm_ooffset_t offset;
770 				vm_page_t m;
771 				/*
772 				 * calculate the page index into the object
773 				 */
774 				offset = current->offset + (addr - current->start);
775 				pindex = OFF_TO_IDX(offset);
776 				VM_OBJECT_LOCK(current->object.vm_object);
777 				m = vm_page_lookup(current->object.vm_object,
778 					pindex);
779 				/*
780 				 * if the page is resident, then gather information about
781 				 * it.
782 				 */
783 				if (m != NULL && m->valid != 0) {
784 					mincoreinfo = MINCORE_INCORE;
785 					vm_page_lock_queues();
786 					if (m->dirty ||
787 						pmap_is_modified(m))
788 						mincoreinfo |= MINCORE_MODIFIED_OTHER;
789 					if ((m->flags & PG_REFERENCED) ||
790 						pmap_ts_referenced(m)) {
791 						vm_page_flag_set(m, PG_REFERENCED);
792 						mincoreinfo |= MINCORE_REFERENCED_OTHER;
793 					}
794 					vm_page_unlock_queues();
795 				}
796 				VM_OBJECT_UNLOCK(current->object.vm_object);
797 			}
798 
799 			/*
800 			 * subyte may page fault.  In case it needs to modify
801 			 * the map, we release the lock.
802 			 */
803 			vm_map_unlock_read(map);
804 
805 			/*
806 			 * calculate index into user supplied byte vector
807 			 */
808 			vecindex = OFF_TO_IDX(addr - first_addr);
809 
810 			/*
811 			 * If we have skipped map entries, we need to make sure that
812 			 * the byte vector is zeroed for those skipped entries.
813 			 */
814 			while ((lastvecindex + 1) < vecindex) {
815 				error = subyte(vec + lastvecindex, 0);
816 				if (error) {
817 					error = EFAULT;
818 					goto done2;
819 				}
820 				++lastvecindex;
821 			}
822 
823 			/*
824 			 * Pass the page information to the user
825 			 */
826 			error = subyte(vec + vecindex, mincoreinfo);
827 			if (error) {
828 				error = EFAULT;
829 				goto done2;
830 			}
831 
832 			/*
833 			 * If the map has changed, due to the subyte, the previous
834 			 * output may be invalid.
835 			 */
836 			vm_map_lock_read(map);
837 			if (timestamp != map->timestamp)
838 				goto RestartScan;
839 
840 			lastvecindex = vecindex;
841 			addr += PAGE_SIZE;
842 		}
843 	}
844 
845 	/*
846 	 * subyte may page fault.  In case it needs to modify
847 	 * the map, we release the lock.
848 	 */
849 	vm_map_unlock_read(map);
850 
851 	/*
852 	 * Zero the last entries in the byte vector.
853 	 */
854 	vecindex = OFF_TO_IDX(end - first_addr);
855 	while ((lastvecindex + 1) < vecindex) {
856 		error = subyte(vec + lastvecindex, 0);
857 		if (error) {
858 			error = EFAULT;
859 			goto done2;
860 		}
861 		++lastvecindex;
862 	}
863 
864 	/*
865 	 * If the map has changed, due to the subyte, the previous
866 	 * output may be invalid.
867 	 */
868 	vm_map_lock_read(map);
869 	if (timestamp != map->timestamp)
870 		goto RestartScan;
871 	vm_map_unlock_read(map);
872 done2:
873 	return (error);
874 }
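/*
 * Illustrative userland sketch (the mapping size is an assumption for the
 * example): mincore() fills one status byte per page of the queried range,
 * in page order, so vec[i] below describes the page at addr + i * page size.
 */
#if 0
#include <sys/mman.h>
#include <unistd.h>

int
count_resident(void)
{
	long ps = getpagesize();
	size_t len = 16 * ps;
	char *addr, vec[16];
	int i, resident = 0;

	addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (addr == MAP_FAILED)
		return (-1);
	addr[0] = 1;				/* touch only the first page */
	if (mincore(addr, len, vec) == 0)
		for (i = 0; i < 16; i++)
			if (vec[i] & MINCORE_INCORE)
				resident++;
	munmap(addr, len);
	return (resident);			/* typically 1 here */
}
#endif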
875 
876 #ifndef _SYS_SYSPROTO_H_
877 struct mlock_args {
878 	const void *addr;
879 	size_t len;
880 };
881 #endif
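/*
 * The address range is page-rounded outward: for example (a PAGE_SIZE of
 * 0x1000 is assumed), mlock(addr = 0x1234, len = 0x2000) wires from
 * trunc_page(0x1234) = 0x1000 to round_page(0x3234) = 0x4000, i.e. three
 * full pages.
 */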
882 /*
883  * MPSAFE
884  */
885 int
886 mlock(td, uap)
887 	struct thread *td;
888 	struct mlock_args *uap;
889 {
890 	struct proc *proc;
891 	vm_offset_t addr, end, last, start;
892 	vm_size_t npages, size;
893 	int error;
894 
895 	error = suser(td);
896 	if (error)
897 		return (error);
898 	addr = (vm_offset_t)uap->addr;
899 	size = uap->len;
900 	last = addr + size;
901 	start = trunc_page(addr);
902 	end = round_page(last);
903 	if (last < addr || end < addr)
904 		return (EINVAL);
905 	npages = atop(end - start);
906 	if (npages > vm_page_max_wired)
907 		return (ENOMEM);
908 	proc = td->td_proc;
909 	PROC_LOCK(proc);
910 	if (ptoa(npages +
911 	    pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
912 	    lim_cur(proc, RLIMIT_MEMLOCK)) {
913 		PROC_UNLOCK(proc);
914 		return (ENOMEM);
915 	}
916 	PROC_UNLOCK(proc);
917 	if (npages + cnt.v_wire_count > vm_page_max_wired)
918 		return (EAGAIN);
919 	error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
920 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
921 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
922 }
923 
924 #ifndef _SYS_SYSPROTO_H_
925 struct mlockall_args {
926 	int	how;
927 };
928 #endif
929 
930 /*
931  * MPSAFE
932  */
933 int
934 mlockall(td, uap)
935 	struct thread *td;
936 	struct mlockall_args *uap;
937 {
938 	vm_map_t map;
939 	int error;
940 
941 	map = &td->td_proc->p_vmspace->vm_map;
942 	error = 0;
943 
944 	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
945 		return (EINVAL);
946 
947 #if 0
948 	/*
949 	 * If wiring all pages in the process would cause it to exceed
950 	 * a hard resource limit, return ENOMEM.
951 	 */
952 	PROC_LOCK(td->td_proc);
953 	if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
954 	    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
955 		PROC_UNLOCK(td->td_proc);
956 		return (ENOMEM);
957 	}
958 	PROC_UNLOCK(td->td_proc);
959 #else
960 	error = suser(td);
961 	if (error)
962 		return (error);
963 #endif
964 
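	/*
	 * MAP_WIREFUTURE makes vm_mmap() wire every mapping subsequently
	 * created in this address space (see the MAP_WIREFUTURE check near
	 * the end of vm_mmap() below).
	 */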
965 	if (uap->how & MCL_FUTURE) {
966 		vm_map_lock(map);
967 		vm_map_modflags(map, MAP_WIREFUTURE, 0);
968 		vm_map_unlock(map);
969 		error = 0;
970 	}
971 
972 	if (uap->how & MCL_CURRENT) {
973 		/*
974 		 * P1003.1-2001 mandates that all currently mapped pages
975 		 * will be memory resident and locked (wired) upon return
976 		 * from mlockall(). vm_map_wire() will wire pages, by
977 		 * calling vm_fault_wire() for each page in the region.
978 		 */
979 		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
980 		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
981 		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
982 	}
983 
984 	return (error);
985 }
986 
987 #ifndef _SYS_SYSPROTO_H_
988 struct munlockall_args {
989 	register_t dummy;
990 };
991 #endif
992 
993 /*
994  * MPSAFE
995  */
996 int
997 munlockall(td, uap)
998 	struct thread *td;
999 	struct munlockall_args *uap;
1000 {
1001 	vm_map_t map;
1002 	int error;
1003 
1004 	map = &td->td_proc->p_vmspace->vm_map;
1005 	error = suser(td);
1006 	if (error)
1007 		return (error);
1008 
1009 	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
1010 	vm_map_lock(map);
1011 	vm_map_modflags(map, 0, MAP_WIREFUTURE);
1012 	vm_map_unlock(map);
1013 
1014 	/* Forcibly unwire all pages. */
1015 	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1016 	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1017 
1018 	return (error);
1019 }
1020 
1021 #ifndef _SYS_SYSPROTO_H_
1022 struct munlock_args {
1023 	const void *addr;
1024 	size_t len;
1025 };
1026 #endif
1027 /*
1028  * MPSAFE
1029  */
1030 int
1031 munlock(td, uap)
1032 	struct thread *td;
1033 	struct munlock_args *uap;
1034 {
1035 	vm_offset_t addr, end, last, start;
1036 	vm_size_t size;
1037 	int error;
1038 
1039 	error = suser(td);
1040 	if (error)
1041 		return (error);
1042 	addr = (vm_offset_t)uap->addr;
1043 	size = uap->len;
1044 	last = addr + size;
1045 	start = trunc_page(addr);
1046 	end = round_page(last);
1047 	if (last < addr || end < addr)
1048 		return (EINVAL);
1049 	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1050 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1051 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1052 }
1053 
1054 /*
1055  * vm_mmap_vnode()
1056  *
1057  * MPSAFE
1058  *
1059  * Helper function for vm_mmap.  Perform the sanity checks specific to mmap
1060  * operations on vnodes.
1061  */
1062 int
1063 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1064     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1065     struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp)
1066 {
1067 	struct vattr va;
1068 	void *handle;
1069 	vm_object_t obj;
1070 	int error, flags, type;
1071 
1072 	mtx_lock(&Giant);
1073 	if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
1074 		mtx_unlock(&Giant);
1075 		return (error);
1076 	}
1077 	flags = *flagsp;
1078 	if (vp->v_type == VREG) {
1079 		/*
1080 		 * Get the proper underlying object
1081 		 */
1082 		if (VOP_GETVOBJECT(vp, &obj) != 0) {
1083 			error = EINVAL;
1084 			goto done;
1085 		}
1086 		if (obj->handle != vp) {
1087 			vput(vp);
1088 			vp = (struct vnode*)obj->handle;
1089 			vget(vp, LK_EXCLUSIVE, td);
1090 		}
1091 		type = OBJT_VNODE;
1092 		handle = vp;
1093 	} else if (vp->v_type == VCHR) {
1094 		type = OBJT_DEVICE;
1095 		handle = vp->v_rdev;
1096 
1097 		/* XXX: lacks a thread reference on the device */
1098 		if ((vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) != 0) {
1099 			*maxprotp = VM_PROT_ALL;
1100 			*flagsp |= MAP_ANON;
1101 			error = 0;
1102 			goto done;
1103 		}
1104 		/*
1105 		 * cdevs do not provide private mappings of any kind.
1106 		 */
1107 		if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1108 		    (prot & PROT_WRITE) != 0) {
1109 			error = EACCES;
1110 			goto done;
1111 		}
1112 		if (flags & (MAP_PRIVATE|MAP_COPY)) {
1113 			error = EINVAL;
1114 			goto done;
1115 		}
1116 		/*
1117 		 * Force device mappings to be shared.
1118 		 */
1119 		flags |= MAP_SHARED;
1120 	} else {
1121 		error = EINVAL;
1122 		goto done;
1123 	}
1124 	if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
1125 		goto done;
1126 	}
1127 	if ((flags & MAP_SHARED) != 0) {
1128 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1129 			if (prot & PROT_WRITE) {
1130 				error = EPERM;
1131 				goto done;
1132 			}
1133 			*maxprotp &= ~VM_PROT_WRITE;
1134 		}
1135 #ifdef MAC
1136 		error = mac_check_vnode_mmap(td->td_ucred, vp, prot);
1137 		if (error != 0)
1138 			goto done;
1139 #endif
1140 	}
1141 	/*
1142 	 * If it is a regular file with no remaining links (it has been
1143 	 * unlinked), we do not need to sync it.
1144 	 * Adjust the object size to be the size of the actual file.
1145 	 */
1146 	if (vp->v_type == VREG) {
1147 		objsize = round_page(va.va_size);
1148 		if (va.va_nlink == 0)
1149 			flags |= MAP_NOSYNC;
1150 	}
1151 	obj = vm_pager_allocate(type, handle, objsize, prot, foff);
1152 	if (obj == NULL) {
1153 		error = (type == OBJT_DEVICE ? EINVAL : ENOMEM);
1154 		goto done;
1155 	}
1156 	*objp = obj;
1157 	*flagsp = flags;
1158 done:
1159 	vput(vp);
1160 	mtx_unlock(&Giant);
1161 	return (error);
1162 }
1163 
1164 /*
1165  * vm_mmap()
1166  *
1167  * MPSAFE
1168  *
1169  * Internal version of mmap.  Currently used by mmap, exec, and System V
1170  * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
1171  */
1172 int
1173 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1174 	vm_prot_t maxprot, int flags,
1175 	void *handle,
1176 	vm_ooffset_t foff)
1177 {
1178 	boolean_t fitit;
1179 	vm_object_t object;
1180 	int rv = KERN_SUCCESS;
1181 	vm_ooffset_t objsize;
1182 	int docow, error;
1183 	struct thread *td = curthread;
1184 
1185 	if (size == 0)
1186 		return (0);
1187 
1188 	objsize = size = round_page(size);
1189 
1190 	PROC_LOCK(td->td_proc);
1191 	if (td->td_proc->p_vmspace->vm_map.size + size >
1192 	    lim_cur(td->td_proc, RLIMIT_VMEM)) {
1193 		PROC_UNLOCK(td->td_proc);
1194 		return(ENOMEM);
1195 	}
1196 	PROC_UNLOCK(td->td_proc);
1197 
1198 	/*
1199 	 * We currently can only deal with page aligned file offsets.
1200 	 * The check is here rather than in the syscall because the
1201 	 * kernel calls this function internally for other mmapping
1202 	 * operations (such as in exec) and non-aligned offsets will
1203 	 * cause pmap inconsistencies...so we want to be sure to
1204 	 * disallow this in all cases.
1205 	 */
1206 	if (foff & PAGE_MASK)
1207 		return (EINVAL);
1208 
1209 	if ((flags & MAP_FIXED) == 0) {
1210 		fitit = TRUE;
1211 		*addr = round_page(*addr);
1212 	} else {
1213 		if (*addr != trunc_page(*addr))
1214 			return (EINVAL);
1215 		fitit = FALSE;
1216 		(void) vm_map_remove(map, *addr, *addr + size);
1217 	}
1218 	/*
1219 	 * Lookup/allocate object.
1220 	 */
1221 	if (handle != NULL) {
1222 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1223 		    handle, foff, &object);
1224 		if (error) {
1225 			return (error);
1226 		}
1227 	}
1228 	if (flags & MAP_ANON) {
1229 		object = NULL;
1230 		docow = 0;
1231 		/*
1232 		 * Unnamed anonymous regions always start at 0.
1233 		 */
1234 		if (handle == 0)
1235 			foff = 0;
1236 	} else {
1237 		docow = MAP_PREFAULT_PARTIAL;
1238 	}
1239 
1240 	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1241 		docow |= MAP_COPY_ON_WRITE;
1242 	if (flags & MAP_NOSYNC)
1243 		docow |= MAP_DISABLE_SYNCER;
1244 	if (flags & MAP_NOCORE)
1245 		docow |= MAP_DISABLE_COREDUMP;
1246 
1247 #if defined(VM_PROT_READ_IS_EXEC)
1248 	if (prot & VM_PROT_READ)
1249 		prot |= VM_PROT_EXECUTE;
1250 
1251 	if (maxprot & VM_PROT_READ)
1252 		maxprot |= VM_PROT_EXECUTE;
1253 #endif
1254 
1255 	if (fitit)
1256 		*addr = pmap_addr_hint(object, *addr, size);
1257 
1258 	if (flags & MAP_STACK)
1259 		rv = vm_map_stack(map, *addr, size, prot, maxprot,
1260 		    docow | MAP_STACK_GROWS_DOWN);
1261 	else
1262 		rv = vm_map_find(map, object, foff, addr, size, fitit,
1263 				 prot, maxprot, docow);
1264 
1265 	if (rv != KERN_SUCCESS) {
1266 		/*
1267 		 * Lose the object reference. Will destroy the
1268 		 * object if it's an unnamed anonymous mapping
1269 		 * or named anonymous without other references.
1270 		 */
1271 		vm_object_deallocate(object);
1272 	} else if (flags & MAP_SHARED) {
1273 		/*
1274 		 * Shared memory is also shared with children.
1275 		 */
1276 		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
1277 		if (rv != KERN_SUCCESS)
1278 			(void) vm_map_remove(map, *addr, *addr + size);
1279 	}
1280 
1281 	/*
1282 	 * If the process has requested that all future mappings
1283 	 * be wired, then heed this.
1284 	 */
1285 	if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
1286 		vm_map_wire(map, *addr, *addr + size,
1287 		    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
1288 
1289 	switch (rv) {
1290 	case KERN_SUCCESS:
1291 		return (0);
1292 	case KERN_INVALID_ADDRESS:
1293 	case KERN_NO_SPACE:
1294 		return (ENOMEM);
1295 	case KERN_PROTECTION_FAILURE:
1296 		return (EACCES);
1297 	default:
1298 		return (EINVAL);
1299 	}
1300 }
1301