xref: /freebsd/sys/vm/vm_mmap.c (revision 0b3105a37d7adcadcb720112fed4dc4e8040be99)
1 /*-
2  * Copyright (c) 1988 University of Utah.
3  * Copyright (c) 1991, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the Systems Programming Group of the University of Utah Computer
8  * Science Department.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
35  *
36  *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
37  */
38 
39 /*
40  * Mapped file (mmap) interface to VM
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include "opt_compat.h"
47 #include "opt_hwpmc_hooks.h"
48 #include "opt_vm.h"
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/capsicum.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/mutex.h>
56 #include <sys/sysproto.h>
57 #include <sys/filedesc.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/procctl.h>
61 #include <sys/racct.h>
62 #include <sys/resource.h>
63 #include <sys/resourcevar.h>
64 #include <sys/rwlock.h>
65 #include <sys/sysctl.h>
66 #include <sys/vnode.h>
67 #include <sys/fcntl.h>
68 #include <sys/file.h>
69 #include <sys/mman.h>
70 #include <sys/mount.h>
71 #include <sys/conf.h>
72 #include <sys/stat.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysent.h>
75 #include <sys/vmmeter.h>
76 
77 #include <security/mac/mac_framework.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_extern.h>
89 #include <vm/vnode_pager.h>
90 
91 #ifdef HWPMC_HOOKS
92 #include <sys/pmckern.h>
93 #endif
94 
95 int old_mlock = 0;
96 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
97     "Do not apply RLIMIT_MEMLOCK on mlockall");
98 
99 #ifdef MAP_32BIT
100 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
101 #endif
102 
103 #ifndef _SYS_SYSPROTO_H_
104 struct sbrk_args {
105 	int incr;
106 };
107 #endif
108 
109 /*
110  * MPSAFE
111  */
112 /* ARGSUSED */
113 int
114 sys_sbrk(td, uap)
115 	struct thread *td;
116 	struct sbrk_args *uap;
117 {
118 	/* Not yet implemented */
119 	return (EOPNOTSUPP);
120 }
121 
122 #ifndef _SYS_SYSPROTO_H_
123 struct sstk_args {
124 	int incr;
125 };
126 #endif
127 
128 /*
129  * MPSAFE
130  */
131 /* ARGSUSED */
132 int
133 sys_sstk(td, uap)
134 	struct thread *td;
135 	struct sstk_args *uap;
136 {
137 	/* Not yet implemented */
138 	return (EOPNOTSUPP);
139 }
140 
141 #if defined(COMPAT_43)
142 #ifndef _SYS_SYSPROTO_H_
143 struct getpagesize_args {
144 	int dummy;
145 };
146 #endif
147 
148 int
149 ogetpagesize(td, uap)
150 	struct thread *td;
151 	struct getpagesize_args *uap;
152 {
153 	/* MP SAFE */
154 	td->td_retval[0] = PAGE_SIZE;
155 	return (0);
156 }
157 #endif				/* COMPAT_43 */
158 
159 
160 /*
161  * Memory Map (mmap) system call.  Note that the file offset
162  * and address are allowed to be NOT page aligned, though if
163  * the MAP_FIXED flag is set, both must have the same remainder
164  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
165  * page-aligned, the actual mapping starts at trunc_page(addr)
166  * and the return value is adjusted up by the page offset.
167  *
168  * Generally speaking, only character devices which are themselves
169  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
170  * there would be no cache coherency between a descriptor and a VM mapping
171  * both to the same character device.
172  */
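
/*
 * Example (illustrative userland usage, not part of the kernel API): with
 * 4K pages, a request such as
 *
 *	p = mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 4096 + 123);
 *
 * maps the file starting at trunc_page(4219) == 4096 and returns the
 * mapped address plus the 123-byte page offset, so that p refers to file
 * offset 4219 as requested.
 */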
173 #ifndef _SYS_SYSPROTO_H_
174 struct mmap_args {
175 	void *addr;
176 	size_t len;
177 	int prot;
178 	int flags;
179 	int fd;
180 	long pad;
181 	off_t pos;
182 };
183 #endif
184 
185 /*
186  * MPSAFE
187  */
188 int
189 sys_mmap(td, uap)
190 	struct thread *td;
191 	struct mmap_args *uap;
192 {
193 	struct file *fp;
194 	vm_offset_t addr;
195 	vm_size_t size, pageoff;
196 	vm_prot_t cap_maxprot;
197 	int align, error, flags, prot;
198 	off_t pos;
199 	struct vmspace *vms = td->td_proc->p_vmspace;
200 	cap_rights_t rights;
201 
202 	addr = (vm_offset_t) uap->addr;
203 	size = uap->len;
204 	prot = uap->prot;
205 	flags = uap->flags;
206 	pos = uap->pos;
207 
208 	fp = NULL;
209 
210 	/*
211 	 * Ignore old flags that used to be defined but did not do anything.
212 	 */
213 	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
214 
215 	/*
216 	 * Enforce the constraints.
217 	 * Mapping of length 0 is only allowed for old binaries.
218 	 * Anonymous mapping shall specify -1 as the file descriptor and
219 	 * zero position for new code. Be nice to ancient a.out
220 	 * binaries and correct pos for anonymous mapping, since old
221 	 * ld.so sometimes issues anonymous map requests with non-zero
222 	 * pos.
223 	 */
224 	if (!SV_CURPROC_FLAG(SV_AOUT)) {
225 		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
226 		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
227 			return (EINVAL);
228 	} else {
229 		if ((flags & MAP_ANON) != 0)
230 			pos = 0;
231 	}
232 
233 	if (flags & MAP_STACK) {
234 		if ((uap->fd != -1) ||
235 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
236 			return (EINVAL);
237 		flags |= MAP_ANON;
238 		pos = 0;
239 	}
240 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
241 	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
242 	    MAP_PREFAULT_READ |
243 #ifdef MAP_32BIT
244 	    MAP_32BIT |
245 #endif
246 	    MAP_ALIGNMENT_MASK)) != 0)
247 		return (EINVAL);
248 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
249 		return (EINVAL);
250 	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
251 		return (EINVAL);
252 	if (prot != PROT_NONE &&
253 	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
254 		return (EINVAL);
255 
256 	/*
257 	 * Align the file position to a page boundary,
258 	 * and save its page offset component.
259 	 */
260 	pageoff = (pos & PAGE_MASK);
261 	pos -= pageoff;
262 
263 	/* Adjust size for rounding (on both ends). */
264 	size += pageoff;			/* low end... */
265 	size = (vm_size_t) round_page(size);	/* hi end */
266 
267 	/* Ensure alignment is at least a page and fits in a pointer. */
268 	align = flags & MAP_ALIGNMENT_MASK;
269 	if (align != 0 && align != MAP_ALIGNED_SUPER &&
270 	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
271 	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
272 		return (EINVAL);
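	/*
	 * Illustrative example (assumes <sys/mman.h>'s MAP_ALIGNED(n)
	 * encoding): a caller can request a 2MB-aligned anonymous mapping
	 * with
	 *
	 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
	 *	    MAP_ANON | MAP_ALIGNED(21), -1, 0);
	 *
	 * The check above accepts MAP_ALIGNED(n) only for PAGE_SHIFT <= n <
	 * sizeof(void *) * NBBY, plus the special MAP_ALIGNED_SUPER request.
	 */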
273 
274 	/*
275 	 * Check for illegal addresses.  Watch out for address wrap... Note
276 	 * that VM_*_ADDRESS are not constants due to casts (argh).
277 	 */
278 	if (flags & MAP_FIXED) {
279 		/*
280 		 * The specified address must have the same remainder
281 		 * as the file offset taken modulo PAGE_SIZE, so it
282 		 * should be aligned after adjustment by pageoff.
283 		 */
284 		addr -= pageoff;
285 		if (addr & PAGE_MASK)
286 			return (EINVAL);
287 
288 		/* Address range must be all in user VM space. */
289 		if (addr < vm_map_min(&vms->vm_map) ||
290 		    addr + size > vm_map_max(&vms->vm_map))
291 			return (EINVAL);
292 		if (addr + size < addr)
293 			return (EINVAL);
294 #ifdef MAP_32BIT
295 		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
296 			return (EINVAL);
297 	} else if (flags & MAP_32BIT) {
298 		/*
299 		 * For MAP_32BIT, override the hint if it is too high and
300 		 * do not bother moving the mapping past the heap (since
301 		 * the heap is usually above 2GB).
302 		 */
303 		if (addr + size > MAP_32BIT_MAX_ADDR)
304 			addr = 0;
305 #endif
306 	} else {
307 		/*
308 		 * XXX for non-fixed mappings where no hint is provided or
309 		 * the hint would fall in the potential heap space,
310 		 * place it after the end of the largest possible heap.
311 		 *
312 		 * There should really be a pmap call to determine a reasonable
313 		 * location.
314 		 */
315 		if (addr == 0 ||
316 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
317 		    addr < round_page((vm_offset_t)vms->vm_daddr +
318 		    lim_max(td, RLIMIT_DATA))))
319 			addr = round_page((vm_offset_t)vms->vm_daddr +
320 			    lim_max(td, RLIMIT_DATA));
321 	}
322 	if (size == 0) {
323 		/*
324 		 * Return success without mapping anything for old
325 		 * binaries that request a page-aligned mapping of
326 		 * length 0.  For modern binaries, this function
327 		 * returns an error earlier.
328 		 */
329 		error = 0;
330 	} else if (flags & MAP_ANON) {
331 		/*
332 		 * Mapping blank space is trivial.
333 		 *
334 		 * This relies on VM_PROT_* matching PROT_*.
335 		 */
336 		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
337 		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
338 	} else {
339 		/*
340 		 * Mapping file, get fp for validation and don't let the
341 		 * descriptor disappear on us if we block. Check capability
342 		 * rights, but also return the maximum rights to be combined
343 		 * with maxprot later.
344 		 */
345 		cap_rights_init(&rights, CAP_MMAP);
346 		if (prot & PROT_READ)
347 			cap_rights_set(&rights, CAP_MMAP_R);
348 		if ((flags & MAP_SHARED) != 0) {
349 			if (prot & PROT_WRITE)
350 				cap_rights_set(&rights, CAP_MMAP_W);
351 		}
352 		if (prot & PROT_EXEC)
353 			cap_rights_set(&rights, CAP_MMAP_X);
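		/*
		 * Illustrative example: under Capsicum, a PROT_READ |
		 * PROT_WRITE, MAP_PRIVATE mapping of a capability only
		 * needs CAP_MMAP and CAP_MMAP_R; CAP_MMAP_W is required
		 * only for MAP_SHARED mappings, since private writes never
		 * reach the underlying file.
		 */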
354 		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
355 		if (error != 0)
356 			goto done;
357 		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
358 		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
359 			error = EINVAL;
360 			goto done;
361 		}
362 
363 		/* This relies on VM_PROT_* matching PROT_*. */
364 		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
365 		    cap_maxprot, flags, pos, td);
366 	}
367 
368 	if (error == 0)
369 		td->td_retval[0] = (register_t) (addr + pageoff);
370 done:
371 	if (fp)
372 		fdrop(fp, td);
373 
374 	return (error);
375 }
376 
377 #if defined(COMPAT_FREEBSD6)
378 int
379 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
380 {
381 	struct mmap_args oargs;
382 
383 	oargs.addr = uap->addr;
384 	oargs.len = uap->len;
385 	oargs.prot = uap->prot;
386 	oargs.flags = uap->flags;
387 	oargs.fd = uap->fd;
388 	oargs.pos = uap->pos;
389 	return (sys_mmap(td, &oargs));
390 }
391 #endif
392 
393 #ifdef COMPAT_43
394 #ifndef _SYS_SYSPROTO_H_
395 struct ommap_args {
396 	caddr_t addr;
397 	int len;
398 	int prot;
399 	int flags;
400 	int fd;
401 	long pos;
402 };
403 #endif
404 int
405 ommap(td, uap)
406 	struct thread *td;
407 	struct ommap_args *uap;
408 {
409 	struct mmap_args nargs;
410 	static const char cvtbsdprot[8] = {
411 		0,
412 		PROT_EXEC,
413 		PROT_WRITE,
414 		PROT_EXEC | PROT_WRITE,
415 		PROT_READ,
416 		PROT_EXEC | PROT_READ,
417 		PROT_WRITE | PROT_READ,
418 		PROT_EXEC | PROT_WRITE | PROT_READ,
419 	};
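	/*
	 * Illustrative note: cvtbsdprot[] above decodes the historical
	 * 4.3BSD protection request, in which 0x1 asked for execute, 0x2
	 * for write and 0x4 for read access, into the modern PROT_* values
	 * passed on to sys_mmap().
	 */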
420 
421 #define	OMAP_ANON	0x0002
422 #define	OMAP_COPY	0x0020
423 #define	OMAP_SHARED	0x0010
424 #define	OMAP_FIXED	0x0100
425 
426 	nargs.addr = uap->addr;
427 	nargs.len = uap->len;
428 	nargs.prot = cvtbsdprot[uap->prot & 0x7];
429 #ifdef COMPAT_FREEBSD32
430 #if defined(__amd64__)
431 	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
432 	    nargs.prot != 0)
433 		nargs.prot |= PROT_EXEC;
434 #endif
435 #endif
436 	nargs.flags = 0;
437 	if (uap->flags & OMAP_ANON)
438 		nargs.flags |= MAP_ANON;
439 	if (uap->flags & OMAP_COPY)
440 		nargs.flags |= MAP_COPY;
441 	if (uap->flags & OMAP_SHARED)
442 		nargs.flags |= MAP_SHARED;
443 	else
444 		nargs.flags |= MAP_PRIVATE;
445 	if (uap->flags & OMAP_FIXED)
446 		nargs.flags |= MAP_FIXED;
447 	nargs.fd = uap->fd;
448 	nargs.pos = uap->pos;
449 	return (sys_mmap(td, &nargs));
450 }
451 #endif				/* COMPAT_43 */
452 
453 
454 #ifndef _SYS_SYSPROTO_H_
455 struct msync_args {
456 	void *addr;
457 	size_t len;
458 	int flags;
459 };
460 #endif
461 /*
462  * MPSAFE
463  */
464 int
465 sys_msync(td, uap)
466 	struct thread *td;
467 	struct msync_args *uap;
468 {
469 	vm_offset_t addr;
470 	vm_size_t size, pageoff;
471 	int flags;
472 	vm_map_t map;
473 	int rv;
474 
475 	addr = (vm_offset_t) uap->addr;
476 	size = uap->len;
477 	flags = uap->flags;
478 
479 	pageoff = (addr & PAGE_MASK);
480 	addr -= pageoff;
481 	size += pageoff;
482 	size = (vm_size_t) round_page(size);
483 	if (addr + size < addr)
484 		return (EINVAL);
485 
486 	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
487 		return (EINVAL);
488 
489 	map = &td->td_proc->p_vmspace->vm_map;
490 
491 	/*
492 	 * Clean the pages and interpret the return value.
493 	 */
494 	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
495 	    (flags & MS_INVALIDATE) != 0);
496 	switch (rv) {
497 	case KERN_SUCCESS:
498 		return (0);
499 	case KERN_INVALID_ADDRESS:
500 		return (ENOMEM);
501 	case KERN_INVALID_ARGUMENT:
502 		return (EBUSY);
503 	case KERN_FAILURE:
504 		return (EIO);
505 	default:
506 		return (EINVAL);
507 	}
508 }
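
/*
 * Example (illustrative userland usage): after dirtying a MAP_SHARED file
 * mapping, a process can force the pages back to the file with
 *
 *	msync(p, len, MS_SYNC);
 *
 * Note that sys_msync() rejects MS_ASYNC combined with MS_INVALIDATE and
 * reports unmapped regions within the range as ENOMEM.
 */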
509 
510 #ifndef _SYS_SYSPROTO_H_
511 struct munmap_args {
512 	void *addr;
513 	size_t len;
514 };
515 #endif
516 /*
517  * MPSAFE
518  */
519 int
520 sys_munmap(td, uap)
521 	struct thread *td;
522 	struct munmap_args *uap;
523 {
524 #ifdef HWPMC_HOOKS
525 	struct pmckern_map_out pkm;
526 	vm_map_entry_t entry;
527 #endif
528 	vm_offset_t addr;
529 	vm_size_t size, pageoff;
530 	vm_map_t map;
531 
532 	addr = (vm_offset_t) uap->addr;
533 	size = uap->len;
534 	if (size == 0)
535 		return (EINVAL);
536 
537 	pageoff = (addr & PAGE_MASK);
538 	addr -= pageoff;
539 	size += pageoff;
540 	size = (vm_size_t) round_page(size);
541 	if (addr + size < addr)
542 		return (EINVAL);
543 
544 	/*
545 	 * Check for illegal addresses.  Watch out for address wrap...
546 	 */
547 	map = &td->td_proc->p_vmspace->vm_map;
548 	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
549 		return (EINVAL);
550 	vm_map_lock(map);
551 #ifdef HWPMC_HOOKS
552 	/*
553 	 * Inform hwpmc if the address range being unmapped contains
554 	 * an executable region.
555 	 */
556 	pkm.pm_address = (uintptr_t) NULL;
557 	if (vm_map_lookup_entry(map, addr, &entry)) {
558 		for (;
559 		     entry != &map->header && entry->start < addr + size;
560 		     entry = entry->next) {
561 			if (vm_map_check_protection(map, entry->start,
562 				entry->end, VM_PROT_EXECUTE) == TRUE) {
563 				pkm.pm_address = (uintptr_t) addr;
564 				pkm.pm_size = (size_t) size;
565 				break;
566 			}
567 		}
568 	}
569 #endif
570 	vm_map_delete(map, addr, addr + size);
571 
572 #ifdef HWPMC_HOOKS
573 	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
574 	vm_map_lock_downgrade(map);
575 	if (pkm.pm_address != (uintptr_t) NULL)
576 		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
577 	vm_map_unlock_read(map);
578 #else
579 	vm_map_unlock(map);
580 #endif
581 	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
582 	return (0);
583 }
584 
585 #ifndef _SYS_SYSPROTO_H_
586 struct mprotect_args {
587 	const void *addr;
588 	size_t len;
589 	int prot;
590 };
591 #endif
592 /*
593  * MPSAFE
594  */
595 int
596 sys_mprotect(td, uap)
597 	struct thread *td;
598 	struct mprotect_args *uap;
599 {
600 	vm_offset_t addr;
601 	vm_size_t size, pageoff;
602 	vm_prot_t prot;
603 
604 	addr = (vm_offset_t) uap->addr;
605 	size = uap->len;
606 	prot = uap->prot & VM_PROT_ALL;
607 
608 	pageoff = (addr & PAGE_MASK);
609 	addr -= pageoff;
610 	size += pageoff;
611 	size = (vm_size_t) round_page(size);
612 	if (addr + size < addr)
613 		return (EINVAL);
614 
615 	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
616 	    addr + size, prot, FALSE)) {
617 	case KERN_SUCCESS:
618 		return (0);
619 	case KERN_PROTECTION_FAILURE:
620 		return (EACCES);
621 	case KERN_RESOURCE_SHORTAGE:
622 		return (ENOMEM);
623 	}
624 	return (EINVAL);
625 }
626 
627 #ifndef _SYS_SYSPROTO_H_
628 struct minherit_args {
629 	void *addr;
630 	size_t len;
631 	int inherit;
632 };
633 #endif
634 /*
635  * MPSAFE
636  */
637 int
638 sys_minherit(td, uap)
639 	struct thread *td;
640 	struct minherit_args *uap;
641 {
642 	vm_offset_t addr;
643 	vm_size_t size, pageoff;
644 	vm_inherit_t inherit;
645 
646 	addr = (vm_offset_t)uap->addr;
647 	size = uap->len;
648 	inherit = uap->inherit;
649 
650 	pageoff = (addr & PAGE_MASK);
651 	addr -= pageoff;
652 	size += pageoff;
653 	size = (vm_size_t) round_page(size);
654 	if (addr + size < addr)
655 		return (EINVAL);
656 
657 	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
658 	    addr + size, inherit)) {
659 	case KERN_SUCCESS:
660 		return (0);
661 	case KERN_PROTECTION_FAILURE:
662 		return (EACCES);
663 	}
664 	return (EINVAL);
665 }
666 
667 #ifndef _SYS_SYSPROTO_H_
668 struct madvise_args {
669 	void *addr;
670 	size_t len;
671 	int behav;
672 };
673 #endif
674 
675 /*
676  * MPSAFE
677  */
678 int
679 sys_madvise(td, uap)
680 	struct thread *td;
681 	struct madvise_args *uap;
682 {
683 	vm_offset_t start, end;
684 	vm_map_t map;
685 	int flags;
686 
687 	/*
688 	 * Check for our special case, advising the swap pager we are
689 	 * "immortal."
690 	 */
691 	if (uap->behav == MADV_PROTECT) {
692 		flags = PPROT_SET;
693 		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
694 		    PROC_SPROTECT, &flags));
695 	}
696 
697 	/*
698 	 * Check for illegal behavior
699 	 */
700 	if (uap->behav < 0 || uap->behav > MADV_CORE)
701 		return (EINVAL);
702 	/*
703 	 * Check for illegal addresses.  Watch out for address wrap... Note
704 	 * that VM_*_ADDRESS are not constants due to casts (argh).
705 	 */
706 	map = &td->td_proc->p_vmspace->vm_map;
707 	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
708 	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
709 		return (EINVAL);
710 	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
711 		return (EINVAL);
712 
713 	/*
714 	 * Since this routine is only advisory, we default to conservative
715 	 * behavior.
716 	 */
717 	start = trunc_page((vm_offset_t) uap->addr);
718 	end = round_page((vm_offset_t) uap->addr + uap->len);
719 
720 	if (vm_map_madvise(map, start, end, uap->behav))
721 		return (EINVAL);
722 	return (0);
723 }
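
/*
 * Example (illustrative userland usage): a process can hint that a region
 * is no longer needed with
 *
 *	madvise(p, len, MADV_DONTNEED);
 *
 * Behaviors outside the 0..MADV_CORE range are rejected, and MADV_PROTECT
 * is special-cased above into PROC_SPROTECT, marking the process as exempt
 * from being killed when swap space is exhausted.
 */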
724 
725 #ifndef _SYS_SYSPROTO_H_
726 struct mincore_args {
727 	const void *addr;
728 	size_t len;
729 	char *vec;
730 };
731 #endif
732 
733 /*
734  * MPSAFE
735  */
736 int
737 sys_mincore(td, uap)
738 	struct thread *td;
739 	struct mincore_args *uap;
740 {
741 	vm_offset_t addr, first_addr;
742 	vm_offset_t end, cend;
743 	pmap_t pmap;
744 	vm_map_t map;
745 	char *vec;
746 	int error = 0;
747 	int vecindex, lastvecindex;
748 	vm_map_entry_t current;
749 	vm_map_entry_t entry;
750 	vm_object_t object;
751 	vm_paddr_t locked_pa;
752 	vm_page_t m;
753 	vm_pindex_t pindex;
754 	int mincoreinfo;
755 	unsigned int timestamp;
756 	boolean_t locked;
757 
758 	/*
759 	 * Make sure that the addresses presented are valid for user
760 	 * mode.
761 	 */
762 	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
763 	end = addr + (vm_size_t)round_page(uap->len);
764 	map = &td->td_proc->p_vmspace->vm_map;
765 	if (end > vm_map_max(map) || end < addr)
766 		return (ENOMEM);
767 
768 	/*
769 	 * Address of byte vector
770 	 */
771 	vec = uap->vec;
772 
773 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
774 
775 	vm_map_lock_read(map);
776 RestartScan:
777 	timestamp = map->timestamp;
778 
779 	if (!vm_map_lookup_entry(map, addr, &entry)) {
780 		vm_map_unlock_read(map);
781 		return (ENOMEM);
782 	}
783 
784 	/*
785 	 * Do this on a map entry basis so that if the pages are not
786 	 * in the current process's address space, we can easily look
787 	 * up the pages elsewhere.
788 	 */
789 	lastvecindex = -1;
790 	for (current = entry;
791 	    (current != &map->header) && (current->start < end);
792 	    current = current->next) {
793 
794 		/*
795 		 * check for contiguity
796 		 */
797 		if (current->end < end &&
798 		    (entry->next == &map->header ||
799 		     current->next->start > current->end)) {
800 			vm_map_unlock_read(map);
801 			return (ENOMEM);
802 		}
803 
804 		/*
805 		 * ignore submaps (for now) or null objects
806 		 */
807 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
808 			current->object.vm_object == NULL)
809 			continue;
810 
811 		/*
812 		 * limit this scan to the current map entry and the
813 		 * limits for the mincore call
814 		 */
815 		if (addr < current->start)
816 			addr = current->start;
817 		cend = current->end;
818 		if (cend > end)
819 			cend = end;
820 
821 		/*
822 		 * scan this entry one page at a time
823 		 */
824 		while (addr < cend) {
825 			/*
826 			 * Check pmap first, it is likely faster, also
827 			 * it can provide info as to whether we are the
828 			 * one referencing or modifying the page.
829 			 */
830 			object = NULL;
831 			locked_pa = 0;
832 		retry:
833 			m = NULL;
834 			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
835 			if (locked_pa != 0) {
836 				/*
837 				 * The page is mapped by this process but not
838 				 * both accessed and modified.  It is also
839 				 * managed.  Acquire the object lock so that
840 				 * other mappings might be examined.
841 				 */
842 				m = PHYS_TO_VM_PAGE(locked_pa);
843 				if (m->object != object) {
844 					if (object != NULL)
845 						VM_OBJECT_WUNLOCK(object);
846 					object = m->object;
847 					locked = VM_OBJECT_TRYWLOCK(object);
848 					vm_page_unlock(m);
849 					if (!locked) {
850 						VM_OBJECT_WLOCK(object);
851 						vm_page_lock(m);
852 						goto retry;
853 					}
854 				} else
855 					vm_page_unlock(m);
856 				KASSERT(m->valid == VM_PAGE_BITS_ALL,
857 				    ("mincore: page %p is mapped but invalid",
858 				    m));
859 			} else if (mincoreinfo == 0) {
860 				/*
861 				 * The page is not mapped by this process.  If
862 				 * the object implements managed pages, then
863 				 * determine if the page is resident so that
864 				 * the mappings might be examined.
865 				 */
866 				if (current->object.vm_object != object) {
867 					if (object != NULL)
868 						VM_OBJECT_WUNLOCK(object);
869 					object = current->object.vm_object;
870 					VM_OBJECT_WLOCK(object);
871 				}
872 				if (object->type == OBJT_DEFAULT ||
873 				    object->type == OBJT_SWAP ||
874 				    object->type == OBJT_VNODE) {
875 					pindex = OFF_TO_IDX(current->offset +
876 					    (addr - current->start));
877 					m = vm_page_lookup(object, pindex);
878 					if (m == NULL &&
879 					    vm_page_is_cached(object, pindex))
880 						mincoreinfo = MINCORE_INCORE;
881 					if (m != NULL && m->valid == 0)
882 						m = NULL;
883 					if (m != NULL)
884 						mincoreinfo = MINCORE_INCORE;
885 				}
886 			}
887 			if (m != NULL) {
888 				/* Examine other mappings to the page. */
889 				if (m->dirty == 0 && pmap_is_modified(m))
890 					vm_page_dirty(m);
891 				if (m->dirty != 0)
892 					mincoreinfo |= MINCORE_MODIFIED_OTHER;
893 				/*
894 				 * The first test for PGA_REFERENCED is an
895 				 * optimization.  The second test is
896 				 * required because a concurrent pmap
897 				 * operation could clear the last reference
898 				 * and set PGA_REFERENCED before the call to
899 				 * pmap_is_referenced().
900 				 */
901 				if ((m->aflags & PGA_REFERENCED) != 0 ||
902 				    pmap_is_referenced(m) ||
903 				    (m->aflags & PGA_REFERENCED) != 0)
904 					mincoreinfo |= MINCORE_REFERENCED_OTHER;
905 			}
906 			if (object != NULL)
907 				VM_OBJECT_WUNLOCK(object);
908 
909 			/*
910 			 * subyte may page fault.  In case it needs to modify
911 			 * the map, we release the lock.
912 			 */
913 			vm_map_unlock_read(map);
914 
915 			/*
916 			 * calculate index into user supplied byte vector
917 			 */
918 			vecindex = OFF_TO_IDX(addr - first_addr);
919 
920 			/*
921 			 * If we have skipped map entries, we need to make sure that
922 			 * the byte vector is zeroed for those skipped entries.
923 			 */
924 			while ((lastvecindex + 1) < vecindex) {
925 				++lastvecindex;
926 				error = subyte(vec + lastvecindex, 0);
927 				if (error) {
928 					error = EFAULT;
929 					goto done2;
930 				}
931 			}
932 
933 			/*
934 			 * Pass the page information to the user
935 			 */
936 			error = subyte(vec + vecindex, mincoreinfo);
937 			if (error) {
938 				error = EFAULT;
939 				goto done2;
940 			}
941 
942 			/*
943 			 * If the map has changed, due to the subyte, the previous
944 			 * output may be invalid.
945 			 */
946 			vm_map_lock_read(map);
947 			if (timestamp != map->timestamp)
948 				goto RestartScan;
949 
950 			lastvecindex = vecindex;
951 			addr += PAGE_SIZE;
952 		}
953 	}
954 
955 	/*
956 	 * subyte may page fault.  In case it needs to modify
957 	 * the map, we release the lock.
958 	 */
959 	vm_map_unlock_read(map);
960 
961 	/*
962 	 * Zero the last entries in the byte vector.
963 	 */
964 	vecindex = OFF_TO_IDX(end - first_addr);
965 	while ((lastvecindex + 1) < vecindex) {
966 		++lastvecindex;
967 		error = subyte(vec + lastvecindex, 0);
968 		if (error) {
969 			error = EFAULT;
970 			goto done2;
971 		}
972 	}
973 
974 	/*
975 	 * If the map has changed, due to the subyte, the previous
976 	 * output may be invalid.
977 	 */
978 	vm_map_lock_read(map);
979 	if (timestamp != map->timestamp)
980 		goto RestartScan;
981 	vm_map_unlock_read(map);
982 done2:
983 	return (error);
984 }
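
/*
 * Example (illustrative userland usage): mincore() produces one status byte
 * per page of the request, so a caller typically allocates
 *
 *	char vec[howmany(len, getpagesize())];
 *
 * calls mincore(p, len, vec), and then tests bits such as MINCORE_INCORE or
 * MINCORE_MODIFIED_OTHER in each byte.
 */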
985 
986 #ifndef _SYS_SYSPROTO_H_
987 struct mlock_args {
988 	const void *addr;
989 	size_t len;
990 };
991 #endif
992 /*
993  * MPSAFE
994  */
995 int
996 sys_mlock(td, uap)
997 	struct thread *td;
998 	struct mlock_args *uap;
999 {
1000 
1001 	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
1002 }
1003 
1004 int
1005 vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
1006 {
1007 	vm_offset_t addr, end, last, start;
1008 	vm_size_t npages, size;
1009 	vm_map_t map;
1010 	unsigned long nsize;
1011 	int error;
1012 
1013 	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
1014 	if (error)
1015 		return (error);
1016 	addr = (vm_offset_t)addr0;
1017 	size = len;
1018 	last = addr + size;
1019 	start = trunc_page(addr);
1020 	end = round_page(last);
1021 	if (last < addr || end < addr)
1022 		return (EINVAL);
1023 	npages = atop(end - start);
1024 	if (npages > vm_page_max_wired)
1025 		return (ENOMEM);
1026 	map = &proc->p_vmspace->vm_map;
1027 	PROC_LOCK(proc);
1028 	nsize = ptoa(npages + pmap_wired_count(map->pmap));
1029 	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
1030 		PROC_UNLOCK(proc);
1031 		return (ENOMEM);
1032 	}
1033 	PROC_UNLOCK(proc);
1034 	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
1035 		return (EAGAIN);
1036 #ifdef RACCT
1037 	if (racct_enable) {
1038 		PROC_LOCK(proc);
1039 		error = racct_set(proc, RACCT_MEMLOCK, nsize);
1040 		PROC_UNLOCK(proc);
1041 		if (error != 0)
1042 			return (ENOMEM);
1043 	}
1044 #endif
1045 	error = vm_map_wire(map, start, end,
1046 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1047 #ifdef RACCT
1048 	if (racct_enable && error != KERN_SUCCESS) {
1049 		PROC_LOCK(proc);
1050 		racct_set(proc, RACCT_MEMLOCK,
1051 		    ptoa(pmap_wired_count(map->pmap)));
1052 		PROC_UNLOCK(proc);
1053 	}
1054 #endif
1055 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1056 }
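
/*
 * Illustrative example of the rounding above: mlock()ing 100 bytes that
 * straddle a page boundary wires both containing pages, so with 4K pages
 * npages = atop(round_page(addr + 100) - trunc_page(addr)) == 2, and the
 * prospective wired total ptoa(npages + pmap_wired_count(map->pmap)) is
 * what gets compared against RLIMIT_MEMLOCK.
 */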
1057 
1058 #ifndef _SYS_SYSPROTO_H_
1059 struct mlockall_args {
1060 	int	how;
1061 };
1062 #endif
1063 
1064 /*
1065  * MPSAFE
1066  */
1067 int
1068 sys_mlockall(td, uap)
1069 	struct thread *td;
1070 	struct mlockall_args *uap;
1071 {
1072 	vm_map_t map;
1073 	int error;
1074 
1075 	map = &td->td_proc->p_vmspace->vm_map;
1076 	error = priv_check(td, PRIV_VM_MLOCK);
1077 	if (error)
1078 		return (error);
1079 
1080 	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1081 		return (EINVAL);
1082 
1083 	/*
1084 	 * If wiring all pages in the process would cause it to exceed
1085 	 * a hard resource limit, return ENOMEM.
1086 	 */
1087 	if (!old_mlock && uap->how & MCL_CURRENT) {
1088 		PROC_LOCK(td->td_proc);
1089 		if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
1090 			PROC_UNLOCK(td->td_proc);
1091 			return (ENOMEM);
1092 		}
1093 		PROC_UNLOCK(td->td_proc);
1094 	}
1095 #ifdef RACCT
1096 	if (racct_enable) {
1097 		PROC_LOCK(td->td_proc);
1098 		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1099 		PROC_UNLOCK(td->td_proc);
1100 		if (error != 0)
1101 			return (ENOMEM);
1102 	}
1103 #endif
1104 
1105 	if (uap->how & MCL_FUTURE) {
1106 		vm_map_lock(map);
1107 		vm_map_modflags(map, MAP_WIREFUTURE, 0);
1108 		vm_map_unlock(map);
1109 		error = 0;
1110 	}
1111 
1112 	if (uap->how & MCL_CURRENT) {
1113 		/*
1114 		 * P1003.1-2001 mandates that all currently mapped pages
1115 		 * will be memory resident and locked (wired) upon return
1116 		 * from mlockall(). vm_map_wire() will wire the pages by
1117 		 * calling vm_fault_wire() for each page in the region.
1118 		 */
1119 		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1120 		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1121 		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
1122 	}
1123 #ifdef RACCT
1124 	if (racct_enable && error != KERN_SUCCESS) {
1125 		PROC_LOCK(td->td_proc);
1126 		racct_set(td->td_proc, RACCT_MEMLOCK,
1127 		    ptoa(pmap_wired_count(map->pmap)));
1128 		PROC_UNLOCK(td->td_proc);
1129 	}
1130 #endif
1131 
1132 	return (error);
1133 }
1134 
1135 #ifndef _SYS_SYSPROTO_H_
1136 struct munlockall_args {
1137 	register_t dummy;
1138 };
1139 #endif
1140 
1141 /*
1142  * MPSAFE
1143  */
1144 int
1145 sys_munlockall(td, uap)
1146 	struct thread *td;
1147 	struct munlockall_args *uap;
1148 {
1149 	vm_map_t map;
1150 	int error;
1151 
1152 	map = &td->td_proc->p_vmspace->vm_map;
1153 	error = priv_check(td, PRIV_VM_MUNLOCK);
1154 	if (error)
1155 		return (error);
1156 
1157 	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
1158 	vm_map_lock(map);
1159 	vm_map_modflags(map, 0, MAP_WIREFUTURE);
1160 	vm_map_unlock(map);
1161 
1162 	/* Forcibly unwire all pages. */
1163 	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1164 	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1165 #ifdef RACCT
1166 	if (racct_enable && error == KERN_SUCCESS) {
1167 		PROC_LOCK(td->td_proc);
1168 		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1169 		PROC_UNLOCK(td->td_proc);
1170 	}
1171 #endif
1172 
1173 	return (error);
1174 }
1175 
1176 #ifndef _SYS_SYSPROTO_H_
1177 struct munlock_args {
1178 	const void *addr;
1179 	size_t len;
1180 };
1181 #endif
1182 /*
1183  * MPSAFE
1184  */
1185 int
1186 sys_munlock(td, uap)
1187 	struct thread *td;
1188 	struct munlock_args *uap;
1189 {
1190 	vm_offset_t addr, end, last, start;
1191 	vm_size_t size;
1192 #ifdef RACCT
1193 	vm_map_t map;
1194 #endif
1195 	int error;
1196 
1197 	error = priv_check(td, PRIV_VM_MUNLOCK);
1198 	if (error)
1199 		return (error);
1200 	addr = (vm_offset_t)uap->addr;
1201 	size = uap->len;
1202 	last = addr + size;
1203 	start = trunc_page(addr);
1204 	end = round_page(last);
1205 	if (last < addr || end < addr)
1206 		return (EINVAL);
1207 	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1208 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1209 #ifdef RACCT
1210 	if (racct_enable && error == KERN_SUCCESS) {
1211 		PROC_LOCK(td->td_proc);
1212 		map = &td->td_proc->p_vmspace->vm_map;
1213 		racct_set(td->td_proc, RACCT_MEMLOCK,
1214 		    ptoa(pmap_wired_count(map->pmap)));
1215 		PROC_UNLOCK(td->td_proc);
1216 	}
1217 #endif
1218 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1219 }
1220 
1221 /*
1222  * vm_mmap_vnode()
1223  *
1224  * Helper function for vm_mmap.  Perform sanity check specific for mmap
1225  * operations on vnodes.
1226  */
1227 int
1228 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1229     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1230     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1231     boolean_t *writecounted)
1232 {
1233 	struct vattr va;
1234 	vm_object_t obj;
1235 	vm_offset_t foff;
1236 	struct ucred *cred;
1237 	int error, flags, locktype;
1238 
1239 	cred = td->td_ucred;
1240 	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
1241 		locktype = LK_EXCLUSIVE;
1242 	else
1243 		locktype = LK_SHARED;
1244 	if ((error = vget(vp, locktype, td)) != 0)
1245 		return (error);
1246 	foff = *foffp;
1247 	flags = *flagsp;
1248 	obj = vp->v_object;
1249 	if (vp->v_type == VREG) {
1250 		/*
1251 		 * Get the proper underlying object
1252 		 */
1253 		if (obj == NULL) {
1254 			error = EINVAL;
1255 			goto done;
1256 		}
1257 		if (obj->type == OBJT_VNODE && obj->handle != vp) {
1258 			vput(vp);
1259 			vp = (struct vnode *)obj->handle;
1260 			/*
1261 			 * Bypass filesystems obey the mpsafety of the
1262 			 * underlying fs.  Tmpfs never bypasses.
1263 			 */
1264 			error = vget(vp, locktype, td);
1265 			if (error != 0)
1266 				return (error);
1267 		}
1268 		if (locktype == LK_EXCLUSIVE) {
1269 			*writecounted = TRUE;
1270 			vnode_pager_update_writecount(obj, 0, objsize);
1271 		}
1272 	} else {
1273 		error = EINVAL;
1274 		goto done;
1275 	}
1276 	if ((error = VOP_GETATTR(vp, &va, cred)))
1277 		goto done;
1278 #ifdef MAC
1279 	/* This relies on VM_PROT_* matching PROT_*. */
1280 	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
1281 	if (error != 0)
1282 		goto done;
1283 #endif
1284 	if ((flags & MAP_SHARED) != 0) {
1285 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1286 			if (prot & VM_PROT_WRITE) {
1287 				error = EPERM;
1288 				goto done;
1289 			}
1290 			*maxprotp &= ~VM_PROT_WRITE;
1291 		}
1292 	}
1293 	/*
1294 	 * If it is a regular file without any references
1295 	 * we do not need to sync it.
1296 	 * Adjust the object size to be the size of the actual file.
1297 	 */
1298 	objsize = round_page(va.va_size);
1299 	if (va.va_nlink == 0)
1300 		flags |= MAP_NOSYNC;
1301 	if (obj->type == OBJT_VNODE) {
1302 		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1303 		    cred);
1304 		if (obj == NULL) {
1305 			error = ENOMEM;
1306 			goto done;
1307 		}
1308 	} else {
1309 		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
1310 		    ("wrong object type"));
1311 		VM_OBJECT_WLOCK(obj);
1312 		vm_object_reference_locked(obj);
1313 #if VM_NRESERVLEVEL > 0
1314 		vm_object_color(obj, 0);
1315 #endif
1316 		VM_OBJECT_WUNLOCK(obj);
1317 	}
1318 	*objp = obj;
1319 	*flagsp = flags;
1320 
1321 	vfs_mark_atime(vp, cred);
1322 
1323 done:
1324 	if (error != 0 && *writecounted) {
1325 		*writecounted = FALSE;
1326 		vnode_pager_update_writecount(obj, objsize, 0);
1327 	}
1328 	vput(vp);
1329 	return (error);
1330 }
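
/*
 * Illustrative note: as a consequence of the checks above, asking for a
 * MAP_SHARED, PROT_WRITE mapping of an immutable, append-only or snapshot
 * file fails with EPERM, while a read-only shared mapping of the same file
 * merely loses VM_PROT_WRITE from its maximum protection.
 */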
1331 
1332 /*
1333  * vm_mmap_cdev()
1334  *
1335  * MPSAFE
1336  *
1337  * Helper function for vm_mmap.  Perform sanity check specific for mmap
1338  * operations on cdevs.
1339  */
1340 int
1341 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1342     vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
1343     vm_ooffset_t *foff, vm_object_t *objp)
1344 {
1345 	vm_object_t obj;
1346 	int error, flags;
1347 
1348 	flags = *flagsp;
1349 
1350 	if (dsw->d_flags & D_MMAP_ANON) {
1351 		*objp = NULL;
1352 		*foff = 0;
1353 		*maxprotp = VM_PROT_ALL;
1354 		*flagsp |= MAP_ANON;
1355 		return (0);
1356 	}
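	/*
	 * Illustrative example: a driver that sets D_MMAP_ANON in its
	 * cdevsw (FreeBSD's /dev/zero, for instance) is handled entirely by
	 * the branch above: the request is converted into an anonymous
	 * mapping and the device pager is never involved.
	 */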
1357 	/*
1358 	 * cdevs do not provide private mappings of any kind.
1359 	 */
1360 	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1361 	    (prot & VM_PROT_WRITE) != 0)
1362 		return (EACCES);
1363 	if (flags & (MAP_PRIVATE|MAP_COPY))
1364 		return (EINVAL);
1365 	/*
1366 	 * Force device mappings to be shared.
1367 	 */
1368 	flags |= MAP_SHARED;
1369 #ifdef MAC_XXX
1370 	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
1371 	if (error != 0)
1372 		return (error);
1373 #endif
1374 	/*
1375 	 * First, try d_mmap_single().  If that is not implemented
1376 	 * (returns ENODEV), fall back to using the device pager.
1377 	 * Note that d_mmap_single() must return a reference to the
1378 	 * object (it needs to bump the reference count of the object
1379 	 * it returns somehow).
1380 	 *
1381 	 * XXX assumes VM_PROT_* == PROT_*
1382 	 */
1383 	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1384 	if (error != ENODEV)
1385 		return (error);
1386 	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1387 	    td->td_ucred);
1388 	if (obj == NULL)
1389 		return (EINVAL);
1390 	*objp = obj;
1391 	*flagsp = flags;
1392 	return (0);
1393 }
1394 
1395 /*
1396  * vm_mmap()
1397  *
1398  * Internal version of mmap used by exec, sys5 shared memory, and
1399  * various device drivers.  Handle is either a vnode pointer, a
1400  * character device, or NULL for MAP_ANON.
1401  */
1402 int
1403 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1404 	vm_prot_t maxprot, int flags,
1405 	objtype_t handle_type, void *handle,
1406 	vm_ooffset_t foff)
1407 {
1408 	vm_object_t object;
1409 	struct thread *td = curthread;
1410 	int error;
1411 	boolean_t writecounted;
1412 
1413 	if (size == 0)
1414 		return (EINVAL);
1415 
1416 	size = round_page(size);
1417 	object = NULL;
1418 	writecounted = FALSE;
1419 
1420 	/*
1421 	 * Lookup/allocate object.
1422 	 */
1423 	switch (handle_type) {
1424 	case OBJT_DEVICE: {
1425 		struct cdevsw *dsw;
1426 		struct cdev *cdev;
1427 		int ref;
1428 
1429 		cdev = handle;
1430 		dsw = dev_refthread(cdev, &ref);
1431 		if (dsw == NULL)
1432 			return (ENXIO);
1433 		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
1434 		    dsw, &foff, &object);
1435 		dev_relthread(cdev, ref);
1436 		break;
1437 	}
1438 	case OBJT_VNODE:
1439 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1440 		    handle, &foff, &object, &writecounted);
1441 		break;
1442 	case OBJT_DEFAULT:
1443 		if (handle == NULL) {
1444 			error = 0;
1445 			break;
1446 		}
1447 		/* FALLTHROUGH */
1448 	default:
1449 		error = EINVAL;
1450 		break;
1451 	}
1452 	if (error)
1453 		return (error);
1454 
1455 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1456 	    foff, writecounted, td);
1457 	if (error != 0 && object != NULL) {
1458 		/*
1459 		 * If this mapping was accounted for in the vnode's
1460 		 * writecount, then undo that now.
1461 		 */
1462 		if (writecounted)
1463 			vnode_pager_release_writecount(object, 0, size);
1464 		vm_object_deallocate(object);
1465 	}
1466 	return (error);
1467 }
1468 
1469 /*
1470  * Internal version of mmap that maps a specific VM object into a
1471  * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
1472  */
1473 int
1474 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1475     vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
1476     boolean_t writecounted, struct thread *td)
1477 {
1478 	boolean_t fitit;
1479 	int docow, error, findspace, rv;
1480 
1481 	if (map == &td->td_proc->p_vmspace->vm_map) {
1482 		PROC_LOCK(td->td_proc);
1483 		if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
1484 			PROC_UNLOCK(td->td_proc);
1485 			return (ENOMEM);
1486 		}
1487 		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1488 			PROC_UNLOCK(td->td_proc);
1489 			return (ENOMEM);
1490 		}
1491 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1492 			if (ptoa(pmap_wired_count(map->pmap)) + size >
1493 			    lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
1494 				racct_set_force(td->td_proc, RACCT_VMEM,
1495 				    map->size);
1496 				PROC_UNLOCK(td->td_proc);
1497 				return (ENOMEM);
1498 			}
1499 			error = racct_set(td->td_proc, RACCT_MEMLOCK,
1500 			    ptoa(pmap_wired_count(map->pmap)) + size);
1501 			if (error != 0) {
1502 				racct_set_force(td->td_proc, RACCT_VMEM,
1503 				    map->size);
1504 				PROC_UNLOCK(td->td_proc);
1505 				return (error);
1506 			}
1507 		}
1508 		PROC_UNLOCK(td->td_proc);
1509 	}
1510 
1511 	/*
1512 	 * We currently can only deal with page aligned file offsets.
1513 	 * The mmap() system call already enforces this by subtracting
1514 	 * the page offset from the file offset, but checking here
1515 	 * catches errors in device drivers (e.g. d_mmap_single()
1516 	 * callbacks) and other internal mapping requests (such as in
1517 	 * exec).
1518 	 */
1519 	if (foff & PAGE_MASK)
1520 		return (EINVAL);
1521 
1522 	if ((flags & MAP_FIXED) == 0) {
1523 		fitit = TRUE;
1524 		*addr = round_page(*addr);
1525 	} else {
1526 		if (*addr != trunc_page(*addr))
1527 			return (EINVAL);
1528 		fitit = FALSE;
1529 	}
1530 
1531 	if (flags & MAP_ANON) {
1532 		if (object != NULL || foff != 0)
1533 			return (EINVAL);
1534 		docow = 0;
1535 	} else if (flags & MAP_PREFAULT_READ)
1536 		docow = MAP_PREFAULT;
1537 	else
1538 		docow = MAP_PREFAULT_PARTIAL;
1539 
1540 	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1541 		docow |= MAP_COPY_ON_WRITE;
1542 	if (flags & MAP_NOSYNC)
1543 		docow |= MAP_DISABLE_SYNCER;
1544 	if (flags & MAP_NOCORE)
1545 		docow |= MAP_DISABLE_COREDUMP;
1546 	/* Shared memory is also shared with children. */
1547 	if (flags & MAP_SHARED)
1548 		docow |= MAP_INHERIT_SHARE;
1549 	if (writecounted)
1550 		docow |= MAP_VN_WRITECOUNT;
1551 	if (flags & MAP_STACK) {
1552 		if (object != NULL)
1553 			return (EINVAL);
1554 		docow |= MAP_STACK_GROWS_DOWN;
1555 	}
1556 	if ((flags & MAP_EXCL) != 0)
1557 		docow |= MAP_CHECK_EXCL;
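	/*
	 * Illustrative example: a plain MAP_PRIVATE file mapping with
	 * MAP_NOCORE ends up with docow == MAP_PREFAULT_PARTIAL |
	 * MAP_COPY_ON_WRITE | MAP_DISABLE_COREDUMP, whereas a MAP_SHARED
	 * mapping instead picks up MAP_INHERIT_SHARE and no copy-on-write.
	 */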
1558 
1559 	if (fitit) {
1560 		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
1561 			findspace = VMFS_SUPER_SPACE;
1562 		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
1563 			findspace = VMFS_ALIGNED_SPACE(flags >>
1564 			    MAP_ALIGNMENT_SHIFT);
1565 		else
1566 			findspace = VMFS_OPTIMAL_SPACE;
1567 		rv = vm_map_find(map, object, foff, addr, size,
1568 #ifdef MAP_32BIT
1569 		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
1570 #endif
1571 		    0, findspace, prot, maxprot, docow);
1572 	} else {
1573 		rv = vm_map_fixed(map, object, foff, *addr, size,
1574 		    prot, maxprot, docow);
1575 	}
1576 
1577 	if (rv == KERN_SUCCESS) {
1578 		/*
1579 		 * If the process has requested that all future mappings
1580 		 * be wired, then heed this.
1581 		 */
1582 		if (map->flags & MAP_WIREFUTURE) {
1583 			vm_map_wire(map, *addr, *addr + size,
1584 			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
1585 			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
1586 		}
1587 	}
1588 	return (vm_mmap_to_errno(rv));
1589 }
1590 
1591 /*
1592  * Translate a Mach VM return code to zero on success or the appropriate errno
1593  * on failure.
1594  */
1595 int
1596 vm_mmap_to_errno(int rv)
1597 {
1598 
1599 	switch (rv) {
1600 	case KERN_SUCCESS:
1601 		return (0);
1602 	case KERN_INVALID_ADDRESS:
1603 	case KERN_NO_SPACE:
1604 		return (ENOMEM);
1605 	case KERN_PROTECTION_FAILURE:
1606 		return (EACCES);
1607 	default:
1608 		return (EINVAL);
1609 	}
1610 }
1611