1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1988 University of Utah.
5 * Copyright (c) 1991, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the Systems Programming Group of the University of Utah Computer
10 * Science Department.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
37 */
38
39 /*
40 * Mapped file (mmap) interface to VM
41 */
42
43 #include "opt_hwpmc_hooks.h"
44 #include "opt_hwt_hooks.h"
45 #include "opt_vm.h"
46
47 #define EXTERR_CATEGORY EXTERR_CAT_MMAP
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/capsicum.h>
51 #include <sys/exterrvar.h>
52 #include <sys/kernel.h>
53 #include <sys/lock.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/elf.h>
57 #include <sys/filedesc.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/procctl.h>
61 #include <sys/racct.h>
62 #include <sys/resource.h>
63 #include <sys/resourcevar.h>
64 #include <sys/rwlock.h>
65 #include <sys/sysctl.h>
66 #include <sys/vnode.h>
67 #include <sys/fcntl.h>
68 #include <sys/file.h>
69 #include <sys/mman.h>
70 #include <sys/mount.h>
71 #include <sys/conf.h>
72 #include <sys/stat.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysent.h>
75 #include <sys/vmmeter.h>
76 #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
77 #include <machine/md_var.h>
78 #endif
79
80 #include <security/audit/audit.h>
81 #include <security/mac/mac_framework.h>
82
83 #include <vm/vm.h>
84 #include <vm/vm_param.h>
85 #include <vm/pmap.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_pager.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_extern.h>
92 #include <vm/vm_page.h>
93 #include <vm/vnode_pager.h>
94
95 #ifdef HWPMC_HOOKS
96 #include <sys/pmckern.h>
97 #endif
98
99 #ifdef HWT_HOOKS
100 #include <dev/hwt/hwt_hook.h>
101 #endif
102
103 int old_mlock = 0;
104 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
105 "Do not apply RLIMIT_MEMLOCK on mlockall");
106 static int mincore_mapped = 1;
107 SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
108 "mincore reports mappings, not residency");
109 static int imply_prot_max = 0;
110 SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
111 "Imply maximum page protections in mmap() when none are specified");
112
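/*
 * mincore() encodes the page size index of a superpage mapping in the
 * two-bit MINCORE_SUPER field of each vector byte, so it can describe
 * at most four page sizes; the assertion below guards that assumption.
 */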
113 _Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");
114
115 #if defined(COMPAT_43)
116 int
117 ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
118 {
119
120 td->td_retval[0] = PAGE_SIZE;
121 return (0);
122 }
123 #endif /* COMPAT_43 */
124
125 /*
126 * Memory Map (mmap) system call. Note that the file offset
127 * and address are allowed to be NOT page aligned, though if
128 * the MAP_FIXED flag is set, both must have the same remainder
129 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
130 * page-aligned, the actual mapping starts at trunc_page(addr)
131 * and the return value is adjusted up by the page offset.
132 *
133 * Generally speaking, only character devices which are themselves
134 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise
135 * there would be no cache coherency between a descriptor and a VM mapping
136 * both to the same character device.
137 */
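/*
 * A userland sketch of the alignment rule above (fd, offsets and sizes
 * are illustrative only):
 *
 *	char *p = mmap(NULL, 0x100, PROT_READ, MAP_PRIVATE, fd, 0x1234);
 *	// The kernel maps from trunc_page(0x1234) and returns an address
 *	// offset by 0x1234 & PAGE_MASK, so *p is the byte at file offset
 *	// 0x1234.  With MAP_FIXED, the hint must share that page offset
 *	// or the call fails with EINVAL.
 */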
138 #ifndef _SYS_SYSPROTO_H_
139 struct mmap_args {
140 void *addr;
141 size_t len;
142 int prot;
143 int flags;
144 int fd;
145 long pad;
146 off_t pos;
147 };
148 #endif
149
150 int
151 sys_mmap(struct thread *td, struct mmap_args *uap)
152 {
153
154 return (kern_mmap(td, &(struct mmap_req){
155 .mr_hint = (uintptr_t)uap->addr,
156 .mr_len = uap->len,
157 .mr_prot = uap->prot,
158 .mr_flags = uap->flags,
159 .mr_fd = uap->fd,
160 .mr_pos = uap->pos,
161 }));
162 }
163
164 int
165 kern_mmap_maxprot(struct proc *p, int prot)
166 {
167
168 if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
169 (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
170 return (_PROT_ALL);
171 if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
172 prot != PROT_NONE)
173 return (prot);
174 return (_PROT_ALL);
175 }
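/*
 * Sketch of how PROT_MAX interacts with the default above (names are
 * illustrative):
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	mprotect(p, len, PROT_READ | PROT_WRITE);	// within PROT_MAX, ok
 *	mprotect(p, len, PROT_READ | PROT_EXEC);	// exceeds it, EACCES
 *
 * When no PROT_MAX bits are supplied, kern_mmap_maxprot() decides
 * whether the maximum defaults to _PROT_ALL or is implied from prot.
 */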
176
177 int
178 kern_mmap(struct thread *td, const struct mmap_req *mrp)
179 {
180 struct vmspace *vms;
181 struct file *fp;
182 struct proc *p;
183 off_t pos;
184 vm_offset_t addr, orig_addr;
185 vm_size_t len, pageoff, size;
186 vm_prot_t cap_maxprot;
187 int align, error, fd, flags, max_prot, prot;
188 cap_rights_t rights;
189 mmap_check_fp_fn check_fp_fn;
190
191 orig_addr = addr = mrp->mr_hint;
192 len = mrp->mr_len;
193 prot = mrp->mr_prot;
194 flags = mrp->mr_flags;
195 fd = mrp->mr_fd;
196 pos = mrp->mr_pos;
197 check_fp_fn = mrp->mr_check_fp_fn;
198
199 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) {
200 return (EXTERROR(EINVAL, "unknown PROT bits"));
201 }
202 max_prot = PROT_MAX_EXTRACT(prot);
203 prot = PROT_EXTRACT(prot);
204 if (max_prot != 0 && (max_prot & prot) != prot) {
205 return (EXTERROR(ENOTSUP, "prot is not subset of max_prot"));
206 }
207
208 p = td->td_proc;
209
210 /*
211 * Always honor PROT_MAX if set. If not, default to all
212 * permissions unless we're implying maximum permissions.
213 */
214 if (max_prot == 0)
215 max_prot = kern_mmap_maxprot(p, prot);
216
217 vms = p->p_vmspace;
218 fp = NULL;
219 AUDIT_ARG_FD(fd);
220
221 /*
222 * Ignore old flags that used to be defined but did not do anything.
223 */
224 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
225
226 /*
227 * Enforce the constraints.
228 * Mapping of length 0 is only allowed for old binaries.
229 * Anonymous mapping shall specify -1 as file descriptor and
230 * zero position for new code. Be nice to ancient a.out
231 * binaries and correct pos for anonymous mapping, since old
232 * ld.so sometimes issues anonymous map requests with non-zero
233 * pos.
234 */
235 if (!SV_CURPROC_FLAG(SV_AOUT)) {
236 if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
237 ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) {
238 return (EXTERROR(EINVAL,
239 "offset not zero/fd not -1 for MAP_ANON",
240 fd, pos));
241 }
242 } else {
243 if ((flags & MAP_ANON) != 0)
244 pos = 0;
245 }
246
247 if (flags & MAP_STACK) {
248 if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) !=
249 (PROT_READ | PROT_WRITE))) {
250 return (EXTERROR(EINVAL, "MAP_STACK with prot < rw",
251 prot));
252 }
253 flags |= MAP_ANON;
254 pos = 0;
255 }
256 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
257 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
258 MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT |
259 MAP_ALIGNMENT_MASK)) != 0) {
260 return (EXTERROR(EINVAL, "reserved flag set"));
261 }
262 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) {
263 return (EXTERROR(EINVAL, "EXCL without FIXED"));
264 }
265 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED |
266 MAP_PRIVATE)) {
267 return (EXTERROR(EINVAL, "both SHARED and PRIVATE set"));
268 }
269 if (prot != PROT_NONE &&
270 (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) {
271 return (EXTERROR(EINVAL, "invalid prot", prot));
272 }
273 if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
274 pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
275 MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)) {
276 return (EXTERROR(EINVAL, "GUARD with wrong parameters"));
277 }
278
279 /*
280 * Align the file position to a page boundary,
281 * and save its page offset component.
282 */
283 pageoff = (pos & PAGE_MASK);
284 pos -= pageoff;
285
286 /* Compute size from len by rounding (on both ends). */
287 size = len + pageoff; /* low end... */
288 size = round_page(size); /* hi end */
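/*
 * For example, assuming PAGE_SIZE == 4096: pos == 0x1234 and
 * len == 0x100 yield pageoff == 0x234, pos == 0x1000 and
 * size == round_page(0x334) == 0x1000; the address returned to the
 * caller is adjusted up by pageoff at the end.
 */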
289 /* Check for rounding up to zero. */
290 if (len > size)
291 return (ENOMEM);
292
293 /* Ensure alignment is at least a page and fits in a pointer. */
294 align = flags & MAP_ALIGNMENT_MASK;
295 if (align != 0 && align != MAP_ALIGNED_SUPER &&
296 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
297 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) {
298 return (EXTERROR(EINVAL, "bad alignment", align));
299 }
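/*
 * For example, MAP_ALIGNED(21) requests a 2 MB aligned placement and
 * passes this check, while a shift below PAGE_SHIFT or at least the
 * width of a pointer in bits is rejected; MAP_ALIGNED_SUPER is handled
 * later when the mapping is placed.
 */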
300
301 /*
302 * Check for illegal addresses. Watch out for address wrap... Note
303 * that VM_*_ADDRESS are not constants due to casts (argh).
304 */
305 if (flags & MAP_FIXED) {
306 /*
307 * The specified address must have the same remainder
308 * as the file offset taken modulo PAGE_SIZE, so it
309 * should be aligned after adjustment by pageoff.
310 */
311 addr -= pageoff;
312 if ((addr & PAGE_MASK) != 0) {
313 return (EXTERROR(EINVAL, "fixed mapping not aligned",
314 addr));
315 }
316
317 /* Address range must be all in user VM space. */
318 if (!vm_map_range_valid(&vms->vm_map, addr, addr + size)) {
319 EXTERROR(EINVAL, "mapping outside vm_map");
320 return (EINVAL);
321 }
322 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) {
323 return (EXTERROR(EINVAL,
324 "fixed 32bit mapping does not fit into 4G"));
325 }
326 } else if (flags & MAP_32BIT) {
327 /*
328 * For MAP_32BIT, override the hint if it is too high and
329 * do not bother moving the mapping past the heap (since
330 * the heap is usually above 2GB).
331 */
332 if (addr + size > MAP_32BIT_MAX_ADDR)
333 addr = 0;
334 } else {
335 /*
336 * XXX for non-fixed mappings where no hint is provided or
337 * the hint would fall in the potential heap space,
338 * place it after the end of the largest possible heap.
339 *
340 * For anonymous mappings within the address space of the
341 * calling process, the absence of a hint is handled at a
342 * lower level in order to implement different clustering
343 * strategies for ASLR.
344 */
345 if (((flags & MAP_ANON) == 0 && addr == 0) ||
346 (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
347 addr < round_page((vm_offset_t)vms->vm_daddr +
348 lim_max(td, RLIMIT_DATA))))
349 addr = round_page((vm_offset_t)vms->vm_daddr +
350 lim_max(td, RLIMIT_DATA));
351 }
352 if (len == 0) {
353 /*
354 * Return success without mapping anything for old
355 * binaries that request a page-aligned mapping of
356 * length 0. For modern binaries, this function
357 * returns an error earlier.
358 */
359 error = 0;
360 } else if ((flags & MAP_GUARD) != 0) {
361 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
362 VM_PROT_NONE, flags, NULL, pos, FALSE, td);
363 } else if ((flags & MAP_ANON) != 0) {
364 /*
365 * Mapping blank space is trivial.
366 *
367 * This relies on VM_PROT_* matching PROT_*.
368 */
369 error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
370 max_prot, flags, NULL, pos, FALSE, td);
371 } else {
372 /*
373 * Mapping file, get fp for validation and don't let the
374 * descriptor disappear on us if we block. Check capability
375 * rights, but also return the maximum rights to be combined
376 * with maxprot later.
377 */
378 cap_rights_init_one(&rights, CAP_MMAP);
379 if (prot & PROT_READ)
380 cap_rights_set_one(&rights, CAP_MMAP_R);
381 if ((flags & MAP_SHARED) != 0) {
382 if (prot & PROT_WRITE)
383 cap_rights_set_one(&rights, CAP_MMAP_W);
384 }
385 if (prot & PROT_EXEC)
386 cap_rights_set_one(&rights, CAP_MMAP_X);
387 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
388 if (error != 0)
389 goto done;
390 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
391 p->p_osrel >= P_OSREL_MAP_FSTRICT) {
392 EXTERROR(EINVAL, "neither SHARED nor PRIVATE req");
393 error = EINVAL;
394 goto done;
395 }
396 if (check_fp_fn != NULL) {
397 error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
398 flags);
399 if (error != 0)
400 goto done;
401 }
402 if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
403 addr = orig_addr;
404 /* This relies on VM_PROT_* matching PROT_*. */
405 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
406 max_prot & cap_maxprot, flags, pos, td);
407 }
408
409 if (error == 0)
410 td->td_retval[0] = addr + pageoff;
411 done:
412 if (fp)
413 fdrop(fp, td);
414
415 return (error);
416 }
417
418 #if defined(COMPAT_FREEBSD6)
419 int
420 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
421 {
422 return (kern_mmap(td, &(struct mmap_req){
423 .mr_hint = (uintptr_t)uap->addr,
424 .mr_len = uap->len,
425 .mr_prot = uap->prot,
426 .mr_flags = uap->flags,
427 .mr_fd = uap->fd,
428 .mr_pos = uap->pos,
429 }));
430 }
431 #endif
432
433 #ifdef COMPAT_43
434 #ifndef _SYS_SYSPROTO_H_
435 struct ommap_args {
436 caddr_t addr;
437 int len;
438 int prot;
439 int flags;
440 int fd;
441 long pos;
442 };
443 #endif
444 int
445 ommap(struct thread *td, struct ommap_args *uap)
446 {
447 return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
448 uap->flags, uap->fd, uap->pos));
449 }
450
451 int
452 kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
453 int oflags, int fd, long pos)
454 {
455 static const char cvtbsdprot[8] = {
456 0,
457 PROT_EXEC,
458 PROT_WRITE,
459 PROT_EXEC | PROT_WRITE,
460 PROT_READ,
461 PROT_EXEC | PROT_READ,
462 PROT_WRITE | PROT_READ,
463 PROT_EXEC | PROT_WRITE | PROT_READ,
464 };
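/*
 * The historic 4.3BSD protection bits are 0x1 = execute, 0x2 = write
 * and 0x4 = read; the table above translates all eight combinations,
 * e.g. an old prot of 0x6 becomes PROT_WRITE | PROT_READ.
 */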
465 int flags, prot;
466
467 if (len < 0)
468 return (EINVAL);
469
470 #define OMAP_ANON 0x0002
471 #define OMAP_COPY 0x0020
472 #define OMAP_SHARED 0x0010
473 #define OMAP_FIXED 0x0100
474
475 prot = cvtbsdprot[oprot & 0x7];
476 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
477 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
478 prot != 0)
479 prot |= PROT_EXEC;
480 #endif
481 flags = 0;
482 if (oflags & OMAP_ANON)
483 flags |= MAP_ANON;
484 if (oflags & OMAP_COPY)
485 flags |= MAP_COPY;
486 if (oflags & OMAP_SHARED)
487 flags |= MAP_SHARED;
488 else
489 flags |= MAP_PRIVATE;
490 if (oflags & OMAP_FIXED)
491 flags |= MAP_FIXED;
492 return (kern_mmap(td, &(struct mmap_req){
493 .mr_hint = hint,
494 .mr_len = len,
495 .mr_prot = prot,
496 .mr_flags = flags,
497 .mr_fd = fd,
498 .mr_pos = pos,
499 }));
500 }
501 #endif /* COMPAT_43 */
502
503 #ifndef _SYS_SYSPROTO_H_
504 struct msync_args {
505 void *addr;
506 size_t len;
507 int flags;
508 };
509 #endif
510 int
511 sys_msync(struct thread *td, struct msync_args *uap)
512 {
513
514 return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
515 }
516
517 int
518 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
519 {
520 vm_offset_t addr;
521 vm_size_t pageoff;
522 vm_map_t map;
523 int rv;
524
525 addr = addr0;
526 pageoff = (addr & PAGE_MASK);
527 addr -= pageoff;
528 size += pageoff;
529 size = (vm_size_t) round_page(size);
530 if (addr + size < addr)
531 return (EINVAL);
532
533 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
534 return (EINVAL);
535
536 map = &td->td_proc->p_vmspace->vm_map;
537
538 /*
539 * Clean the pages and interpret the return value.
540 */
541 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
542 (flags & MS_INVALIDATE) != 0);
543 switch (rv) {
544 case KERN_SUCCESS:
545 return (0);
546 case KERN_INVALID_ADDRESS:
547 return (ENOMEM);
548 case KERN_INVALID_ARGUMENT:
549 return (EBUSY);
550 case KERN_FAILURE:
551 return (EIO);
552 default:
553 return (EINVAL);
554 }
555 }
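/*
 * Illustrative userland calls into the checks above (pointer and
 * length are example names):
 *
 *	msync(p, len, MS_SYNC);			// write back and wait
 *	msync(p, len, MS_ASYNC);		// start write-back only
 *	msync(p, len, MS_ASYNC | MS_INVALIDATE);// rejected with EINVAL
 *
 * Holes in the range surface as ENOMEM via KERN_INVALID_ADDRESS.
 */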
556
557 #ifndef _SYS_SYSPROTO_H_
558 struct munmap_args {
559 void *addr;
560 size_t len;
561 };
562 #endif
563 int
564 sys_munmap(struct thread *td, struct munmap_args *uap)
565 {
566
567 return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
568 }
569
570 int
571 kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
572 {
573 #ifdef HWPMC_HOOKS
574 struct pmckern_map_out pkm;
575 vm_map_entry_t entry;
576 bool pmc_handled;
577 #endif
578 vm_offset_t addr, end;
579 vm_size_t pageoff;
580 vm_map_t map;
581 int rv;
582
583 if (size == 0)
584 return (EINVAL);
585
586 addr = addr0;
587 pageoff = (addr & PAGE_MASK);
588 addr -= pageoff;
589 size += pageoff;
590 size = (vm_size_t) round_page(size);
591 end = addr + size;
592 map = &td->td_proc->p_vmspace->vm_map;
593 if (!vm_map_range_valid(map, addr, end))
594 return (EINVAL);
595
596 vm_map_lock(map);
597 #ifdef HWPMC_HOOKS
598 pmc_handled = false;
599 if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
600 pmc_handled = true;
601 /*
602 * Inform hwpmc if the address range being unmapped contains
603 * an executable region.
604 */
605 pkm.pm_address = (uintptr_t) NULL;
606 if (vm_map_lookup_entry(map, addr, &entry)) {
607 for (; entry->start < end;
608 entry = vm_map_entry_succ(entry)) {
609 if (vm_map_check_protection(map, entry->start,
610 entry->end, VM_PROT_EXECUTE) == TRUE) {
611 pkm.pm_address = (uintptr_t) addr;
612 pkm.pm_size = (size_t) size;
613 break;
614 }
615 }
616 }
617 }
618 #endif
619 rv = vm_map_delete(map, addr, end);
620
621 #ifdef HWT_HOOKS
622 if (HWT_HOOK_INSTALLED && rv == KERN_SUCCESS) {
623 struct hwt_record_entry ent;
624
625 ent.addr = (uintptr_t) addr;
626 ent.fullpath = NULL;
627 ent.record_type = HWT_RECORD_MUNMAP;
628 HWT_CALL_HOOK(td, HWT_RECORD, &ent);
629 }
630 #endif
631
632 #ifdef HWPMC_HOOKS
633 if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
634 /* downgrade the lock to prevent a LOR with the pmc-sx lock */
635 vm_map_lock_downgrade(map);
636 if (pkm.pm_address != (uintptr_t) NULL)
637 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
638 vm_map_unlock_read(map);
639 } else
640 #endif
641 vm_map_unlock(map);
642
643 return (vm_mmap_to_errno(rv));
644 }
645
646 #ifndef _SYS_SYSPROTO_H_
647 struct mprotect_args {
648 const void *addr;
649 size_t len;
650 int prot;
651 };
652 #endif
653 int
654 sys_mprotect(struct thread *td, struct mprotect_args *uap)
655 {
656
657 return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
658 uap->prot, 0));
659 }
660
661 int
662 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
663 int flags)
664 {
665 vm_offset_t addr;
666 vm_size_t pageoff;
667 int vm_error, max_prot;
668
669 addr = addr0;
670 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
671 return (EINVAL);
672 max_prot = PROT_MAX_EXTRACT(prot);
673 prot = PROT_EXTRACT(prot);
674 pageoff = (addr & PAGE_MASK);
675 addr -= pageoff;
676 size += pageoff;
677 size = (vm_size_t) round_page(size);
678 #ifdef COMPAT_FREEBSD32
679 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
680 if (((addr + size) & 0xffffffff) < addr)
681 return (EINVAL);
682 } else
683 #endif
684 if (addr + size < addr)
685 return (EINVAL);
686
687 flags |= VM_MAP_PROTECT_SET_PROT;
688 if (max_prot != 0)
689 flags |= VM_MAP_PROTECT_SET_MAXPROT;
690 vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
691 addr, addr + size, prot, max_prot, flags);
692
693 switch (vm_error) {
694 case KERN_SUCCESS:
695 return (0);
696 case KERN_PROTECTION_FAILURE:
697 return (EACCES);
698 case KERN_RESOURCE_SHORTAGE:
699 return (ENOMEM);
700 case KERN_OUT_OF_BOUNDS:
701 return (ENOTSUP);
702 }
703 return (EINVAL);
704 }
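/*
 * A sketch of the PROT_MAX handling above as seen from userland (names
 * are illustrative):
 *
 *	// Drop write access and lower the maximum protection so that it
 *	// cannot be regained later.
 *	mprotect(p, len, PROT_READ | PROT_MAX(PROT_READ));
 *	mprotect(p, len, PROT_READ | PROT_WRITE);	// now fails, EACCES
 */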
705
706 #ifndef _SYS_SYSPROTO_H_
707 struct minherit_args {
708 void *addr;
709 size_t len;
710 int inherit;
711 };
712 #endif
713 int
714 sys_minherit(struct thread *td, struct minherit_args *uap)
715 {
716
717 return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
718 uap->inherit));
719 }
720
721 int
722 kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
723 {
724 vm_offset_t addr;
725 vm_size_t size, pageoff;
726 vm_inherit_t inherit;
727
728 addr = (vm_offset_t)addr0;
729 size = len;
730 inherit = inherit0;
731
732 pageoff = (addr & PAGE_MASK);
733 addr -= pageoff;
734 size += pageoff;
735 size = (vm_size_t) round_page(size);
736 if (addr + size < addr)
737 return (EINVAL);
738
739 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
740 addr + size, inherit)) {
741 case KERN_SUCCESS:
742 return (0);
743 case KERN_PROTECTION_FAILURE:
744 return (EACCES);
745 }
746 return (EINVAL);
747 }
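/*
 * Illustrative userland usage (example names; see minherit(2)):
 *
 *	minherit(p, len, INHERIT_NONE);	 // child gets the range unmapped
 *	minherit(p, len, INHERIT_SHARE); // child shares the same pages
 */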
748
749 #ifndef _SYS_SYSPROTO_H_
750 struct madvise_args {
751 void *addr;
752 size_t len;
753 int behav;
754 };
755 #endif
756
757 int
758 sys_madvise(struct thread *td, struct madvise_args *uap)
759 {
760
761 return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
762 }
763
764 int
765 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
766 {
767 vm_map_t map;
768 vm_offset_t addr, end, start;
769 int flags;
770
771 /*
772 * Check for our special case, advising the swap pager we are
773 * "immortal."
774 */
775 if (behav == MADV_PROTECT) {
776 flags = PPROT_SET;
777 return (kern_procctl(td, P_PID, td->td_proc->p_pid,
778 PROC_SPROTECT, &flags));
779 }
780
781 /*
782 * Check for illegal addresses. Watch out for address wrap... Note
783 * that VM_*_ADDRESS are not constants due to casts (argh).
784 */
785 map = &td->td_proc->p_vmspace->vm_map;
786 addr = addr0;
787 if (!vm_map_range_valid(map, addr, addr + len))
788 return (EINVAL);
789
790 /*
791 * Since this routine is only advisory, we default to conservative
792 * behavior.
793 */
794 start = trunc_page(addr);
795 end = round_page(addr + len);
796
797 /*
798 * vm_map_madvise() checks for illegal values of behav.
799 */
800 return (vm_map_madvise(map, start, end, behav));
801 }
802
803 #ifndef _SYS_SYSPROTO_H_
804 struct mincore_args {
805 const void *addr;
806 size_t len;
807 char *vec;
808 };
809 #endif
810
811 int
812 sys_mincore(struct thread *td, struct mincore_args *uap)
813 {
814
815 return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
816 }
817
818 int
819 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
820 {
821 pmap_t pmap;
822 vm_map_t map;
823 vm_map_entry_t current, entry;
824 vm_object_t object;
825 vm_offset_t addr, cend, end, first_addr;
826 vm_paddr_t pa;
827 vm_page_t m;
828 vm_pindex_t pindex;
829 int error, lastvecindex, mincoreinfo, vecindex;
830 unsigned int timestamp;
831
832 /*
833 * Make sure that the addresses presented are valid for user
834 * mode.
835 */
836 first_addr = addr = trunc_page(addr0);
837 end = round_page(addr0 + len);
838 map = &td->td_proc->p_vmspace->vm_map;
839 if (end > vm_map_max(map) || end < addr)
840 return (ENOMEM);
841
842 pmap = vmspace_pmap(td->td_proc->p_vmspace);
843
844 vm_map_lock_read(map);
845 RestartScan:
846 timestamp = map->timestamp;
847
848 if (!vm_map_lookup_entry(map, addr, &entry)) {
849 vm_map_unlock_read(map);
850 return (ENOMEM);
851 }
852
853 /*
854 * Do this on a map entry basis so that if the pages are not
855 * in the current process's address space, we can easily look
856 * up the pages elsewhere.
857 */
858 lastvecindex = -1;
859 while (entry->start < end) {
860 /*
861 * check for contiguity
862 */
863 current = entry;
864 entry = vm_map_entry_succ(current);
865 if (current->end < end &&
866 entry->start > current->end) {
867 vm_map_unlock_read(map);
868 return (ENOMEM);
869 }
870
871 /*
872 * ignore submaps (for now) or null objects
873 */
874 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
875 current->object.vm_object == NULL)
876 continue;
877
878 /*
879 * limit this scan to the current map entry and the
880 * limits for the mincore call
881 */
882 if (addr < current->start)
883 addr = current->start;
884 cend = current->end;
885 if (cend > end)
886 cend = end;
887
888 for (; addr < cend; addr += PAGE_SIZE) {
889 /*
890 * Check pmap first, it is likely faster, also
891 * it can provide info as to whether we are the
892 * one referencing or modifying the page.
893 */
894 m = NULL;
895 object = NULL;
896 retry:
897 pa = 0;
898 mincoreinfo = pmap_mincore(pmap, addr, &pa);
899 if (mincore_mapped) {
900 /*
901 * We only care about this pmap's
902 * mapping of the page, if any.
903 */
904 ;
905 } else if (pa != 0) {
906 /*
907 * The page is mapped by this process but not
908 * both accessed and modified. It is also
909 * managed. Acquire the object lock so that
910 * other mappings might be examined. The page's
911 * identity may change at any point before its
912 * object lock is acquired, so re-validate if
913 * necessary.
914 */
915 m = PHYS_TO_VM_PAGE(pa);
916 while (object == NULL || m->object != object) {
917 if (object != NULL)
918 VM_OBJECT_WUNLOCK(object);
919 object = atomic_load_ptr(&m->object);
920 if (object == NULL)
921 goto retry;
922 VM_OBJECT_WLOCK(object);
923 }
924 if (pa != pmap_extract(pmap, addr))
925 goto retry;
926 KASSERT(vm_page_all_valid(m),
927 ("mincore: page %p is mapped but invalid",
928 m));
929 } else if (mincoreinfo == 0) {
930 /*
931 * The page is not mapped by this process. If
932 * the object implements managed pages, then
933 * determine if the page is resident so that
934 * the mappings might be examined.
935 */
936 if (current->object.vm_object != object) {
937 if (object != NULL)
938 VM_OBJECT_WUNLOCK(object);
939 object = current->object.vm_object;
940 VM_OBJECT_WLOCK(object);
941 }
942 if ((object->flags & OBJ_SWAP) != 0 ||
943 object->type == OBJT_VNODE) {
944 pindex = OFF_TO_IDX(current->offset +
945 (addr - current->start));
946 m = vm_page_lookup(object, pindex);
947 if (m != NULL && vm_page_none_valid(m))
948 m = NULL;
949 if (m != NULL)
950 mincoreinfo = MINCORE_INCORE;
951 }
952 }
953 if (m != NULL) {
954 VM_OBJECT_ASSERT_WLOCKED(m->object);
955
956 /* Examine other mappings of the page. */
957 if (m->dirty == 0 && pmap_is_modified(m))
958 vm_page_dirty(m);
959 if (m->dirty != 0)
960 mincoreinfo |= MINCORE_MODIFIED_OTHER;
961
962 /*
963 * The first test for PGA_REFERENCED is an
964 * optimization. The second test is
965 * required because a concurrent pmap
966 * operation could clear the last reference
967 * and set PGA_REFERENCED before the call to
968 * pmap_is_referenced().
969 */
970 if ((m->a.flags & PGA_REFERENCED) != 0 ||
971 pmap_is_referenced(m) ||
972 (m->a.flags & PGA_REFERENCED) != 0)
973 mincoreinfo |= MINCORE_REFERENCED_OTHER;
974 }
975 if (object != NULL)
976 VM_OBJECT_WUNLOCK(object);
977
978 /*
979 * subyte may page fault. In case it needs to modify
980 * the map, we release the lock.
981 */
982 vm_map_unlock_read(map);
983
984 /*
985 * calculate index into user supplied byte vector
986 */
987 vecindex = atop(addr - first_addr);
988
989 /*
990 * If we have skipped map entries, we need to make sure that
991 * the byte vector is zeroed for those skipped entries.
992 */
993 while ((lastvecindex + 1) < vecindex) {
994 ++lastvecindex;
995 error = subyte(vec + lastvecindex, 0);
996 if (error) {
997 error = EFAULT;
998 goto done2;
999 }
1000 }
1001
1002 /*
1003 * Pass the page information to the user
1004 */
1005 error = subyte(vec + vecindex, mincoreinfo);
1006 if (error) {
1007 error = EFAULT;
1008 goto done2;
1009 }
1010
1011 /*
1012 * If the map has changed, due to the subyte, the previous
1013 * output may be invalid.
1014 */
1015 vm_map_lock_read(map);
1016 if (timestamp != map->timestamp)
1017 goto RestartScan;
1018
1019 lastvecindex = vecindex;
1020 }
1021 }
1022
1023 /*
1024 * subyte may page fault. In case it needs to modify
1025 * the map, we release the lock.
1026 */
1027 vm_map_unlock_read(map);
1028
1029 /*
1030 * Zero the last entries in the byte vector.
1031 */
1032 vecindex = atop(end - first_addr);
1033 while ((lastvecindex + 1) < vecindex) {
1034 ++lastvecindex;
1035 error = subyte(vec + lastvecindex, 0);
1036 if (error) {
1037 error = EFAULT;
1038 goto done2;
1039 }
1040 }
1041
1042 /*
1043 * If the map has changed, due to the subyte, the previous
1044 * output may be invalid.
1045 */
1046 vm_map_lock_read(map);
1047 if (timestamp != map->timestamp)
1048 goto RestartScan;
1049 vm_map_unlock_read(map);
1050 done2:
1051 return (error);
1052 }
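/*
 * A sketch of the userland view of the vector filled in above (buffer
 * sizing is illustrative):
 *
 *	char vec[howmany(len, PAGE_SIZE)];
 *
 *	mincore(p, len, vec);
 *
 * With the default vm.mincore_mapped=1, each byte reflects only this
 * process's pmap as reported by pmap_mincore(); when the sysctl is
 * clear, the backing object is consulted as well and the
 * MINCORE_MODIFIED_OTHER and MINCORE_REFERENCED_OTHER bits may be set
 * for other mappings of the page.
 */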
1053
1054 #ifndef _SYS_SYSPROTO_H_
1055 struct mlock_args {
1056 const void *addr;
1057 size_t len;
1058 };
1059 #endif
1060 int
1061 sys_mlock(struct thread *td, struct mlock_args *uap)
1062 {
1063
1064 return (kern_mlock(td->td_proc, td->td_ucred,
1065 __DECONST(uintptr_t, uap->addr), uap->len));
1066 }
1067
1068 int
1069 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
1070 {
1071 vm_offset_t addr, end, last, start;
1072 vm_size_t npages, size;
1073 vm_map_t map;
1074 unsigned long nsize;
1075 int error;
1076
1077 error = priv_check_cred(cred, PRIV_VM_MLOCK);
1078 if (error)
1079 return (error);
1080 addr = addr0;
1081 size = len;
1082 last = addr + size;
1083 start = trunc_page(addr);
1084 end = round_page(last);
1085 if (last < addr || end < addr)
1086 return (EINVAL);
1087 npages = atop(end - start);
1088 if (npages > vm_page_max_user_wired)
1089 return (ENOMEM);
1090 map = &proc->p_vmspace->vm_map;
1091 PROC_LOCK(proc);
1092 nsize = ptoa(npages + pmap_wired_count(map->pmap));
1093 if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
1094 PROC_UNLOCK(proc);
1095 return (ENOMEM);
1096 }
1097 PROC_UNLOCK(proc);
1098 #ifdef RACCT
1099 if (racct_enable) {
1100 PROC_LOCK(proc);
1101 error = racct_set(proc, RACCT_MEMLOCK, nsize);
1102 PROC_UNLOCK(proc);
1103 if (error != 0)
1104 return (ENOMEM);
1105 }
1106 #endif
1107 error = vm_map_wire(map, start, end,
1108 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1109 #ifdef RACCT
1110 if (racct_enable && error != KERN_SUCCESS) {
1111 PROC_LOCK(proc);
1112 racct_set(proc, RACCT_MEMLOCK,
1113 ptoa(pmap_wired_count(map->pmap)));
1114 PROC_UNLOCK(proc);
1115 }
1116 #endif
1117 switch (error) {
1118 case KERN_SUCCESS:
1119 return (0);
1120 case KERN_INVALID_ARGUMENT:
1121 return (EINVAL);
1122 default:
1123 return (ENOMEM);
1124 }
1125 }
1126
1127 #ifndef _SYS_SYSPROTO_H_
1128 struct mlockall_args {
1129 int how;
1130 };
1131 #endif
1132
1133 int
1134 sys_mlockall(struct thread *td, struct mlockall_args *uap)
1135 {
1136 vm_map_t map;
1137 int error;
1138
1139 map = &td->td_proc->p_vmspace->vm_map;
1140 error = priv_check(td, PRIV_VM_MLOCK);
1141 if (error)
1142 return (error);
1143
1144 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1145 return (EINVAL);
1146
1147 /*
1148 * If wiring all pages in the process would cause it to exceed
1149 * a hard resource limit, return ENOMEM.
1150 */
1151 if (!old_mlock && uap->how & MCL_CURRENT) {
1152 if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
1153 return (ENOMEM);
1154 }
1155 #ifdef RACCT
1156 if (racct_enable) {
1157 PROC_LOCK(td->td_proc);
1158 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1159 PROC_UNLOCK(td->td_proc);
1160 if (error != 0)
1161 return (ENOMEM);
1162 }
1163 #endif
1164
1165 if (uap->how & MCL_FUTURE) {
1166 vm_map_lock(map);
1167 vm_map_modflags(map, MAP_WIREFUTURE, 0);
1168 vm_map_unlock(map);
1169 error = 0;
1170 }
1171
1172 if (uap->how & MCL_CURRENT) {
1173 /*
1174 * P1003.1-2001 mandates that all currently mapped pages
1175 * will be memory resident and locked (wired) upon return
1176 * from mlockall(). vm_map_wire() will wire pages, by
1177 * calling vm_fault_wire() for each page in the region.
1178 */
1179 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1180 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1181 if (error == KERN_SUCCESS)
1182 error = 0;
1183 else if (error == KERN_RESOURCE_SHORTAGE)
1184 error = ENOMEM;
1185 else
1186 error = EAGAIN;
1187 }
1188 #ifdef RACCT
1189 if (racct_enable && error != KERN_SUCCESS) {
1190 PROC_LOCK(td->td_proc);
1191 racct_set(td->td_proc, RACCT_MEMLOCK,
1192 ptoa(pmap_wired_count(map->pmap)));
1193 PROC_UNLOCK(td->td_proc);
1194 }
1195 #endif
1196
1197 return (error);
1198 }
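/*
 * Typical userland usage of the above (see mlockall(2)):
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *
 * MCL_FUTURE only sets MAP_WIREFUTURE on the map; later mappings are
 * wired as they are created (see vm_mmap_object() below).  Unless the
 * vm.old_mlock sysctl is set, an MCL_CURRENT request is first checked
 * against RLIMIT_MEMLOCK.
 */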
1199
1200 #ifndef _SYS_SYSPROTO_H_
1201 struct munlockall_args {
1202 register_t dummy;
1203 };
1204 #endif
1205
1206 int
1207 sys_munlockall(struct thread *td, struct munlockall_args *uap)
1208 {
1209 vm_map_t map;
1210 int error;
1211
1212 map = &td->td_proc->p_vmspace->vm_map;
1213 error = priv_check(td, PRIV_VM_MUNLOCK);
1214 if (error)
1215 return (error);
1216
1217 /* Clear the MAP_WIREFUTURE flag from this vm_map. */
1218 vm_map_lock(map);
1219 vm_map_modflags(map, 0, MAP_WIREFUTURE);
1220 vm_map_unlock(map);
1221
1222 /* Forcibly unwire all pages. */
1223 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1224 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1225 #ifdef RACCT
1226 if (racct_enable && error == KERN_SUCCESS) {
1227 PROC_LOCK(td->td_proc);
1228 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1229 PROC_UNLOCK(td->td_proc);
1230 }
1231 #endif
1232
1233 return (error);
1234 }
1235
1236 #ifndef _SYS_SYSPROTO_H_
1237 struct munlock_args {
1238 const void *addr;
1239 size_t len;
1240 };
1241 #endif
1242 int
1243 sys_munlock(struct thread *td, struct munlock_args *uap)
1244 {
1245
1246 return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
1247 }
1248
1249 int
1250 kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
1251 {
1252 vm_offset_t addr, end, last, start;
1253 #ifdef RACCT
1254 vm_map_t map;
1255 #endif
1256 int error;
1257
1258 error = priv_check(td, PRIV_VM_MUNLOCK);
1259 if (error)
1260 return (error);
1261 addr = addr0;
1262 last = addr + size;
1263 start = trunc_page(addr);
1264 end = round_page(last);
1265 if (last < addr || end < addr)
1266 return (EINVAL);
1267 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1268 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1269 #ifdef RACCT
1270 if (racct_enable && error == KERN_SUCCESS) {
1271 PROC_LOCK(td->td_proc);
1272 map = &td->td_proc->p_vmspace->vm_map;
1273 racct_set(td->td_proc, RACCT_MEMLOCK,
1274 ptoa(pmap_wired_count(map->pmap)));
1275 PROC_UNLOCK(td->td_proc);
1276 }
1277 #endif
1278 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1279 }
1280
1281 /*
1282 * vm_mmap_vnode()
1283 *
1284 * Helper function for vm_mmap. Perform sanity check specific for mmap
1285 * operations on vnodes.
1286 */
1287 int
1288 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1289 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1290 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1291 boolean_t *writecounted)
1292 {
1293 struct vattr va;
1294 vm_object_t obj;
1295 vm_ooffset_t foff;
1296 struct ucred *cred;
1297 int error, flags;
1298 bool writex;
1299
1300 cred = td->td_ucred;
1301 writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
1302 (*flagsp & MAP_SHARED) != 0;
1303 if ((error = vget(vp, LK_SHARED)) != 0)
1304 return (error);
1305 AUDIT_ARG_VNODE1(vp);
1306 foff = *foffp;
1307 flags = *flagsp;
1308 obj = vp->v_object;
1309 if (vp->v_type == VREG) {
1310 /*
1311 * Get the proper underlying object
1312 */
1313 if (obj == NULL) {
1314 error = EINVAL;
1315 goto done;
1316 }
1317 if (obj->type == OBJT_VNODE && obj->handle != vp) {
1318 vput(vp);
1319 vp = (struct vnode *)obj->handle;
1320 /*
1321 * Bypass filesystems obey the mpsafety of the
1322 * underlying fs. Tmpfs never bypasses.
1323 */
1324 error = vget(vp, LK_SHARED);
1325 if (error != 0)
1326 return (error);
1327 }
1328 if (writex) {
1329 *writecounted = TRUE;
1330 vm_pager_update_writecount(obj, 0, objsize);
1331 }
1332 } else {
1333 error = EXTERROR(EINVAL, "non-reg file");
1334 goto done;
1335 }
1336 if ((error = VOP_GETATTR(vp, &va, cred)))
1337 goto done;
1338 #ifdef MAC
1339 /* This relies on VM_PROT_* matching PROT_*. */
1340 error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
1341 if (error != 0)
1342 goto done;
1343 #endif
1344 if ((flags & MAP_SHARED) != 0) {
1345 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1346 if (prot & VM_PROT_WRITE) {
1347 error = EPERM;
1348 goto done;
1349 }
1350 *maxprotp &= ~VM_PROT_WRITE;
1351 }
1352 }
1353 /*
1354 * If it is a regular file without any references
1355 * we do not need to sync it.
1356 * Adjust object size to be the size of actual file.
1357 */
1358 objsize = round_page(va.va_size);
1359 if (va.va_nlink == 0)
1360 flags |= MAP_NOSYNC;
1361 if (obj->type == OBJT_VNODE) {
1362 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1363 cred);
1364 if (obj == NULL) {
1365 error = ENOMEM;
1366 goto done;
1367 }
1368 } else {
1369 KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
1370 vm_object_reference(obj);
1371 #if VM_NRESERVLEVEL > 0
1372 if ((obj->flags & OBJ_COLORED) == 0) {
1373 VM_OBJECT_WLOCK(obj);
1374 vm_object_color(obj, 0);
1375 VM_OBJECT_WUNLOCK(obj);
1376 }
1377 #endif
1378 }
1379 *objp = obj;
1380 *flagsp = flags;
1381
1382 VOP_MMAPPED(vp);
1383
1384 done:
1385 if (error != 0 && *writecounted) {
1386 *writecounted = FALSE;
1387 vm_pager_update_writecount(obj, objsize, 0);
1388 }
1389 vput(vp);
1390 return (error);
1391 }
1392
1393 /*
1394 * vm_mmap_cdev()
1395 *
1396 * Helper function for vm_mmap. Perform sanity check specific for mmap
1397 * operations on cdevs.
1398 */
1399 int
1400 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1401 vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
1402 vm_ooffset_t *foff, vm_object_t *objp)
1403 {
1404 vm_object_t obj;
1405 int error, flags;
1406
1407 flags = *flagsp;
1408
1409 if (dsw->d_flags & D_MMAP_ANON) {
1410 *objp = NULL;
1411 *foff = 0;
1412 *maxprotp = VM_PROT_ALL;
1413 *flagsp |= MAP_ANON;
1414 return (0);
1415 }
1416
1417 /*
1418 * cdevs do not provide private mappings of any kind.
1419 */
1420 if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1421 (prot & VM_PROT_WRITE) != 0)
1422 return (EACCES);
1423 if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) {
1424 return (EXTERROR(EINVAL, "cdev mapping must be shared"));
1425 }
1426
1427 /*
1428 * Force device mappings to be shared.
1429 */
1430 flags |= MAP_SHARED;
1431 #ifdef MAC_XXX
1432 error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
1433 if (error != 0)
1434 return (error);
1435 #endif
1436 /*
1437 * First, try d_mmap_single(). If that is not implemented
1438 * (returns ENODEV), fall back to using the device pager.
1439 * Note that d_mmap_single() must return a reference to the
1440 * object (it needs to bump the reference count of the object
1441 * it returns somehow).
1442 *
1443 * XXX assumes VM_PROT_* == PROT_*
1444 */
1445 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1446 if (error != ENODEV)
1447 return (error);
1448 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1449 td->td_ucred);
1450 if (obj == NULL) {
1451 return (EXTERROR(EINVAL,
1452 "cdev driver does not support mmap"));
1453 }
1454 *objp = obj;
1455 *flagsp = flags;
1456 return (0);
1457 }
1458
1459 int
1460 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1461 vm_prot_t maxprot, int flags,
1462 objtype_t handle_type, void *handle,
1463 vm_ooffset_t foff)
1464 {
1465 vm_object_t object;
1466 struct thread *td = curthread;
1467 int error;
1468 boolean_t writecounted;
1469
1470 if (size == 0) {
1471 return (EXTERROR(EINVAL, "zero-sized req"));
1472 }
1473
1474 size = round_page(size);
1475 object = NULL;
1476 writecounted = FALSE;
1477
1478 switch (handle_type) {
1479 case OBJT_DEVICE: {
1480 struct cdevsw *dsw;
1481 struct cdev *cdev;
1482 int ref;
1483
1484 cdev = handle;
1485 dsw = dev_refthread(cdev, &ref);
1486 if (dsw == NULL)
1487 return (ENXIO);
1488 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
1489 dsw, &foff, &object);
1490 dev_relthread(cdev, ref);
1491 break;
1492 }
1493 case OBJT_VNODE:
1494 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1495 handle, &foff, &object, &writecounted);
1496 break;
1497 default:
1498 error = EXTERROR(EINVAL, "unsupported backing obj type",
1499 handle_type);
1500 break;
1501 }
1502 if (error)
1503 return (error);
1504
1505 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1506 foff, writecounted, td);
1507 if (error != 0 && object != NULL) {
1508 /*
1509 * If this mapping was accounted for in the vnode's
1510 * writecount, then undo that now.
1511 */
1512 if (writecounted)
1513 vm_pager_release_writecount(object, 0, size);
1514 vm_object_deallocate(object);
1515 }
1516 return (error);
1517 }
1518
1519 int
1520 kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
1521 {
1522 int error;
1523
1524 RACCT_PROC_LOCK(td->td_proc);
1525 if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
1526 RACCT_PROC_UNLOCK(td->td_proc);
1527 return (ENOMEM);
1528 }
1529 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1530 RACCT_PROC_UNLOCK(td->td_proc);
1531 return (ENOMEM);
1532 }
1533 if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1534 if (ptoa(pmap_wired_count(map->pmap)) + size >
1535 lim_cur(td, RLIMIT_MEMLOCK)) {
1536 racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1537 RACCT_PROC_UNLOCK(td->td_proc);
1538 return (ENOMEM);
1539 }
1540 error = racct_set(td->td_proc, RACCT_MEMLOCK,
1541 ptoa(pmap_wired_count(map->pmap)) + size);
1542 if (error != 0) {
1543 racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1544 RACCT_PROC_UNLOCK(td->td_proc);
1545 return (error);
1546 }
1547 }
1548 RACCT_PROC_UNLOCK(td->td_proc);
1549 return (0);
1550 }
1551
1552 /*
1553 * Internal version of mmap that maps a specific VM object into a
1554 * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
1555 */
1556 int
1557 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1558 vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
1559 boolean_t writecounted, struct thread *td)
1560 {
1561 vm_offset_t default_addr, max_addr;
1562 int docow, error, findspace, rv;
1563 bool curmap, fitit;
1564
1565 curmap = map == &td->td_proc->p_vmspace->vm_map;
1566 if (curmap) {
1567 error = kern_mmap_racct_check(td, map, size);
1568 if (error != 0)
1569 return (error);
1570 }
1571
1572 /*
1573 * We currently can only deal with page aligned file offsets.
1574 * The mmap() system call already enforces this by subtracting
1575 * the page offset from the file offset, but checking here
1576 * catches errors in device drivers (e.g. d_single_mmap()
1577 * callbacks) and other internal mapping requests (such as in
1578 * exec).
1579 */
1580 if ((foff & PAGE_MASK) != 0) {
1581 return (EXTERROR(EINVAL, "offset not page-aligned", foff));
1582 }
1583
1584 if ((flags & MAP_FIXED) == 0) {
1585 fitit = true;
1586 *addr = round_page(*addr);
1587 } else {
1588 if (*addr != trunc_page(*addr)) {
1589 return (EXTERROR(EINVAL,
1590 "fixed mapping address not aligned", *addr));
1591 }
1592 fitit = false;
1593 }
1594
1595 if (flags & MAP_ANON) {
1596 if (object != NULL) {
1597 return (EXTERROR(EINVAL,
1598 "anon mapping backed by an object"));
1599 }
1600 if (foff != 0) {
1601 return (EXTERROR(EINVAL,
1602 "anon mapping with non-zero offset"));
1603 }
1604 docow = 0;
1605 } else if (flags & MAP_PREFAULT_READ)
1606 docow = MAP_PREFAULT;
1607 else
1608 docow = MAP_PREFAULT_PARTIAL;
1609
1610 if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1611 docow |= MAP_COPY_ON_WRITE;
1612 if (flags & MAP_NOSYNC)
1613 docow |= MAP_DISABLE_SYNCER;
1614 if (flags & MAP_NOCORE)
1615 docow |= MAP_DISABLE_COREDUMP;
1616 /* Shared memory is also shared with children. */
1617 if (flags & MAP_SHARED)
1618 docow |= MAP_INHERIT_SHARE;
1619 if (writecounted)
1620 docow |= MAP_WRITECOUNT;
1621 if (flags & MAP_STACK) {
1622 if (object != NULL) {
1623 return (EXTERROR(EINVAL,
1624 "stack mapping backed by an object"));
1625 }
1626 docow |= MAP_STACK_AREA;
1627 }
1628 if ((flags & MAP_EXCL) != 0)
1629 docow |= MAP_CHECK_EXCL;
1630 if ((flags & MAP_GUARD) != 0)
1631 docow |= MAP_CREATE_GUARD;
1632
1633 if (fitit) {
1634 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
1635 findspace = VMFS_SUPER_SPACE;
1636 else if ((flags & MAP_ALIGNMENT_MASK) != 0)
1637 findspace = VMFS_ALIGNED_SPACE(flags >>
1638 MAP_ALIGNMENT_SHIFT);
1639 else
1640 findspace = VMFS_OPTIMAL_SPACE;
1641 max_addr = 0;
1642 if ((flags & MAP_32BIT) != 0)
1643 max_addr = MAP_32BIT_MAX_ADDR;
1644 if (curmap) {
1645 default_addr =
1646 round_page((vm_offset_t)td->td_proc->p_vmspace->
1647 vm_daddr + lim_max(td, RLIMIT_DATA));
1648 if ((flags & MAP_32BIT) != 0)
1649 default_addr = 0;
1650 rv = vm_map_find_min(map, object, foff, addr, size,
1651 default_addr, max_addr, findspace, prot, maxprot,
1652 docow);
1653 } else {
1654 rv = vm_map_find(map, object, foff, addr, size,
1655 max_addr, findspace, prot, maxprot, docow);
1656 }
1657 } else {
1658 rv = vm_map_fixed(map, object, foff, *addr, size,
1659 prot, maxprot, docow);
1660 }
1661
1662 if (rv == KERN_SUCCESS) {
1663 /*
1664 * If the process has requested that all future mappings
1665 * be wired, then heed this.
1666 */
1667 if ((map->flags & MAP_WIREFUTURE) != 0) {
1668 vm_map_lock(map);
1669 if ((map->flags & MAP_WIREFUTURE) != 0)
1670 (void)vm_map_wire_locked(map, *addr,
1671 *addr + size, VM_MAP_WIRE_USER |
1672 ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
1673 VM_MAP_WIRE_NOHOLES));
1674 vm_map_unlock(map);
1675 }
1676 }
1677 return (vm_mmap_to_errno(rv));
1678 }
1679
1680 /*
1681 * Translate a Mach VM return code to zero on success or the appropriate errno
1682 * on failure.
1683 */
1684 int
1685 vm_mmap_to_errno(int rv)
1686 {
1687 int error;
1688
1689 switch (rv) {
1690 case KERN_SUCCESS:
1691 return (0);
1692 case KERN_INVALID_ADDRESS:
1693 case KERN_NO_SPACE:
1694 error = ENOMEM;
1695 break;
1696 case KERN_PROTECTION_FAILURE:
1697 error = EACCES;
1698 break;
1699 default:
1700 error = EINVAL;
1701 break;
1702 }
1703 if ((curthread->td_pflags2 & (TDP2_UEXTERR | TDP2_EXTERR)) ==
1704 TDP2_UEXTERR)
1705 EXTERROR(error, "mach error", rv);
1706 return (error);
1707 }
1708