1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1988 University of Utah.
5 * Copyright (c) 1991, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the Systems Programming Group of the University of Utah Computer
10 * Science Department.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
37 */
38
39 /*
40 * Mapped file (mmap) interface to VM
41 */
42
43 #include "opt_hwpmc_hooks.h"
44 #include "opt_hwt_hooks.h"
45 #include "opt_vm.h"
46
47 #define EXTERR_CATEGORY EXTERR_CAT_MMAP
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/capsicum.h>
51 #include <sys/exterrvar.h>
52 #include <sys/kernel.h>
53 #include <sys/lock.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/elf.h>
57 #include <sys/filedesc.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/procctl.h>
61 #include <sys/racct.h>
62 #include <sys/resource.h>
63 #include <sys/resourcevar.h>
64 #include <sys/rwlock.h>
65 #include <sys/sysctl.h>
66 #include <sys/vnode.h>
67 #include <sys/fcntl.h>
68 #include <sys/file.h>
69 #include <sys/mman.h>
70 #include <sys/mount.h>
71 #include <sys/conf.h>
72 #include <sys/stat.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysent.h>
75 #include <sys/vmmeter.h>
76 #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
77 #include <machine/md_var.h>
78 #endif
79
80 #include <security/audit/audit.h>
81 #include <security/mac/mac_framework.h>
82
83 #include <vm/vm.h>
84 #include <vm/vm_param.h>
85 #include <vm/pmap.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_pager.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_extern.h>
92 #include <vm/vm_page.h>
93 #include <vm/vnode_pager.h>
94
95 #ifdef HWPMC_HOOKS
96 #include <sys/pmckern.h>
97 #endif
98
99 #ifdef HWT_HOOKS
100 #include <dev/hwt/hwt_hook.h>
101 #endif
102
/* If non-zero, mlockall(2) skips the RLIMIT_MEMLOCK check (legacy behavior). */
int old_mlock = 0;
SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
    "Do not apply RLIMIT_MEMLOCK on mlockall");
/* If non-zero, mincore(2) reports this pmap's mappings instead of residency. */
static int mincore_mapped = 1;
SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
    "mincore reports mappings, not residency");
/* If non-zero, mmap(2) without PROT_MAX() bits implies max = requested prot. */
static int imply_prot_max = 0;
SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
    "Imply maximum page protections in mmap() when none are specified");

/* The MINCORE_SUPER bit field can only encode up to four page size indices. */
_Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");
114
#if defined(COMPAT_43)
/*
 * Historic getpagesize(2): report PAGE_SIZE through the syscall return
 * value.  The argument structure is unused.
 */
int
ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */
124
125 /*
126 * Memory Map (mmap) system call. Note that the file offset
127 * and address are allowed to be NOT page aligned, though if
128 * the MAP_FIXED flag it set, both must have the same remainder
129 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
130 * page-aligned, the actual mapping starts at trunc_page(addr)
131 * and the return value is adjusted up by the page offset.
132 *
133 * Generally speaking, only character devices which are themselves
134 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise
135 * there would be no cache coherency between a descriptor and a VM mapping
136 * both to the same character device.
137 */
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of mmap(2); normally generated into sysproto.h. */
struct mmap_args {
	void *addr;
	size_t len;
	int prot;
	int flags;
	int fd;
	long pad;	/* not read by sys_mmap() */
	off_t pos;
};
#endif
149
150 int
sys_mmap(struct thread * td,struct mmap_args * uap)151 sys_mmap(struct thread *td, struct mmap_args *uap)
152 {
153
154 return (kern_mmap(td, &(struct mmap_req){
155 .mr_hint = (uintptr_t)uap->addr,
156 .mr_len = uap->len,
157 .mr_prot = uap->prot,
158 .mr_flags = uap->flags,
159 .mr_fd = uap->fd,
160 .mr_pos = uap->pos,
161 }));
162 }
163
164 int
kern_mmap_maxprot(struct proc * p,int prot)165 kern_mmap_maxprot(struct proc *p, int prot)
166 {
167
168 if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
169 (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
170 return (_PROT_ALL);
171 if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
172 prot != PROT_NONE)
173 return (prot);
174 return (_PROT_ALL);
175 }
176
177 int
kern_mmap(struct thread * td,const struct mmap_req * mrp)178 kern_mmap(struct thread *td, const struct mmap_req *mrp)
179 {
180 struct vmspace *vms;
181 struct file *fp;
182 struct proc *p;
183 off_t pos;
184 vm_offset_t addr, orig_addr;
185 vm_size_t len, pageoff, size;
186 vm_prot_t cap_maxprot;
187 int align, error, fd, flags, max_prot, prot;
188 cap_rights_t rights;
189 mmap_check_fp_fn check_fp_fn;
190
191 orig_addr = addr = mrp->mr_hint;
192 len = mrp->mr_len;
193 prot = mrp->mr_prot;
194 flags = mrp->mr_flags;
195 fd = mrp->mr_fd;
196 pos = mrp->mr_pos;
197 check_fp_fn = mrp->mr_check_fp_fn;
198
199 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) {
200 return (EXTERROR(EINVAL, "unknown PROT bits %#jx", prot));
201 }
202 max_prot = PROT_MAX_EXTRACT(prot);
203 prot = PROT_EXTRACT(prot);
204 if (max_prot != 0 && (max_prot & prot) != prot) {
205 return (EXTERROR(ENOTSUP,
206 "prot %#jx is not subset of max_prot %#jx",
207 prot, max_prot));
208 }
209
210 p = td->td_proc;
211
212 /*
213 * Always honor PROT_MAX if set. If not, default to all
214 * permissions unless we're implying maximum permissions.
215 */
216 if (max_prot == 0)
217 max_prot = kern_mmap_maxprot(p, prot);
218
219 vms = p->p_vmspace;
220 fp = NULL;
221 AUDIT_ARG_FD(fd);
222
223 /*
224 * Ignore old flags that used to be defined but did not do anything.
225 */
226 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
227
228 /*
229 * Enforce the constraints.
230 * Mapping of length 0 is only allowed for old binaries.
231 * Anonymous mapping shall specify -1 as filedescriptor and
232 * zero position for new code. Be nice to ancient a.out
233 * binaries and correct pos for anonymous mapping, since old
234 * ld.so sometimes issues anonymous map requests with non-zero
235 * pos.
236 */
237 if (!SV_CURPROC_FLAG(SV_AOUT)) {
238 if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
239 ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) {
240 return (EXTERROR(EINVAL,
241 "offset %#jd not zero/fd %#jd not -1 for MAP_ANON",
242 fd, pos));
243 }
244 } else {
245 if ((flags & MAP_ANON) != 0)
246 pos = 0;
247 }
248
249 if (flags & MAP_STACK) {
250 if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) !=
251 (PROT_READ | PROT_WRITE))) {
252 return (EXTERROR(EINVAL,
253 "MAP_STACK with prot %#jx < rw", prot));
254 }
255 flags |= MAP_ANON;
256 pos = 0;
257 }
258 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
259 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
260 MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT |
261 MAP_ALIGNMENT_MASK)) != 0) {
262 return (EXTERROR(EINVAL, "reserved flag set (flags %#jx)",
263 flags));
264 }
265 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) {
266 return (EXTERROR(EINVAL, "EXCL without FIXED (flags %#jx)",
267 flags));
268 }
269 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED |
270 MAP_PRIVATE)) {
271 return (EXTERROR(EINVAL,
272 "both SHARED and PRIVATE set (flags %#jx)", flags));
273 }
274 if (prot != PROT_NONE &&
275 (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) {
276 return (EXTERROR(EINVAL, "invalid prot %#jx", prot));
277 }
278 if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
279 pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
280 MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)) {
281 return (EXTERROR(EINVAL, "GUARD with wrong parameters"));
282 }
283
284 /*
285 * Align the file position to a page boundary,
286 * and save its page offset component.
287 */
288 pageoff = (pos & PAGE_MASK);
289 pos -= pageoff;
290
291 /* Compute size from len by rounding (on both ends). */
292 size = len + pageoff; /* low end... */
293 size = round_page(size); /* hi end */
294 /* Check for rounding up to zero. */
295 if (len > size)
296 return (ENOMEM);
297
298 /* Ensure alignment is at least a page and fits in a pointer. */
299 align = flags & MAP_ALIGNMENT_MASK;
300 if (align != 0 && align != MAP_ALIGNED_SUPER &&
301 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
302 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) {
303 return (EXTERROR(EINVAL, "bad alignment %#jx", align));
304 }
305
306 /*
307 * Check for illegal addresses. Watch out for address wrap... Note
308 * that VM_*_ADDRESS are not constants due to casts (argh).
309 */
310 if (flags & MAP_FIXED) {
311 /*
312 * The specified address must have the same remainder
313 * as the file offset taken modulo PAGE_SIZE, so it
314 * should be aligned after adjustment by pageoff.
315 */
316 addr -= pageoff;
317 if ((addr & PAGE_MASK) != 0) {
318 return (EXTERROR(EINVAL,
319 "fixed mapping at %#jx not aligned", addr));
320 }
321
322 /* Address range must be all in user VM space. */
323 if (!vm_map_range_valid(&vms->vm_map, addr, addr + size)) {
324 EXTERROR(EINVAL, "mapping outside vm_map");
325 return (EINVAL);
326 }
327 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) {
328 return (EXTERROR(EINVAL,
329 "fixed 32bit mapping of [%#jx %#jx] does not fit into 4G",
330 addr, addr + size));
331 }
332 } else if (flags & MAP_32BIT) {
333 /*
334 * For MAP_32BIT, override the hint if it is too high and
335 * do not bother moving the mapping past the heap (since
336 * the heap is usually above 2GB).
337 */
338 if (addr + size > MAP_32BIT_MAX_ADDR)
339 addr = 0;
340 } else {
341 /*
342 * XXX for non-fixed mappings where no hint is provided or
343 * the hint would fall in the potential heap space,
344 * place it after the end of the largest possible heap.
345 *
346 * For anonymous mappings within the address space of the
347 * calling process, the absence of a hint is handled at a
348 * lower level in order to implement different clustering
349 * strategies for ASLR.
350 */
351 if (((flags & MAP_ANON) == 0 && addr == 0) ||
352 (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
353 addr < round_page((vm_offset_t)vms->vm_daddr +
354 lim_max(td, RLIMIT_DATA))))
355 addr = round_page((vm_offset_t)vms->vm_daddr +
356 lim_max(td, RLIMIT_DATA));
357 }
358 if (len == 0) {
359 /*
360 * Return success without mapping anything for old
361 * binaries that request a page-aligned mapping of
362 * length 0. For modern binaries, this function
363 * returns an error earlier.
364 */
365 error = 0;
366 } else if ((flags & MAP_GUARD) != 0) {
367 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
368 VM_PROT_NONE, flags, NULL, pos, FALSE, td);
369 } else if ((flags & MAP_ANON) != 0) {
370 /*
371 * Mapping blank space is trivial.
372 *
373 * This relies on VM_PROT_* matching PROT_*.
374 */
375 error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
376 max_prot, flags, NULL, pos, FALSE, td);
377 } else {
378 /*
379 * Mapping file, get fp for validation and don't let the
380 * descriptor disappear on us if we block. Check capability
381 * rights, but also return the maximum rights to be combined
382 * with maxprot later.
383 */
384 cap_rights_init_one(&rights, CAP_MMAP);
385 if (prot & PROT_READ)
386 cap_rights_set_one(&rights, CAP_MMAP_R);
387 if ((flags & MAP_SHARED) != 0) {
388 if (prot & PROT_WRITE)
389 cap_rights_set_one(&rights, CAP_MMAP_W);
390 }
391 if (prot & PROT_EXEC)
392 cap_rights_set_one(&rights, CAP_MMAP_X);
393 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
394 if (error != 0)
395 goto done;
396 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
397 p->p_osrel >= P_OSREL_MAP_FSTRICT) {
398 EXTERROR(EINVAL, "neither SHARED nor PRIVATE req");
399 error = EINVAL;
400 goto done;
401 }
402 if (check_fp_fn != NULL) {
403 error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
404 flags);
405 if (error != 0)
406 goto done;
407 }
408 if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
409 addr = orig_addr;
410 /* This relies on VM_PROT_* matching PROT_*. */
411 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
412 max_prot & cap_maxprot, flags, pos, td);
413 }
414
415 if (error == 0)
416 td->td_retval[0] = addr + pageoff;
417 done:
418 if (fp)
419 fdrop(fp, td);
420
421 return (error);
422 }
423
424 #if defined(COMPAT_FREEBSD6)
425 int
freebsd6_mmap(struct thread * td,struct freebsd6_mmap_args * uap)426 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
427 {
428 return (kern_mmap(td, &(struct mmap_req){
429 .mr_hint = (uintptr_t)uap->addr,
430 .mr_len = uap->len,
431 .mr_prot = uap->prot,
432 .mr_flags = uap->flags,
433 .mr_fd = uap->fd,
434 .mr_pos = uap->pos,
435 }));
436 }
437 #endif
438
439 #ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of the historic BSD mmap(2). */
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
/* Historic mmap(2) entry point: forward the raw arguments to kern_ommap(). */
int
ommap(struct thread *td, struct ommap_args *uap)
{
	return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
	    uap->flags, uap->fd, uap->pos));
}
456
457 int
kern_ommap(struct thread * td,uintptr_t hint,int len,int oprot,int oflags,int fd,long pos)458 kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
459 int oflags, int fd, long pos)
460 {
461 static const char cvtbsdprot[8] = {
462 0,
463 PROT_EXEC,
464 PROT_WRITE,
465 PROT_EXEC | PROT_WRITE,
466 PROT_READ,
467 PROT_EXEC | PROT_READ,
468 PROT_WRITE | PROT_READ,
469 PROT_EXEC | PROT_WRITE | PROT_READ,
470 };
471 int flags, prot;
472
473 if (len < 0)
474 return (EINVAL);
475
476 #define OMAP_ANON 0x0002
477 #define OMAP_COPY 0x0020
478 #define OMAP_SHARED 0x0010
479 #define OMAP_FIXED 0x0100
480
481 prot = cvtbsdprot[oprot & 0x7];
482 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
483 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
484 prot != 0)
485 prot |= PROT_EXEC;
486 #endif
487 flags = 0;
488 if (oflags & OMAP_ANON)
489 flags |= MAP_ANON;
490 if (oflags & OMAP_COPY)
491 flags |= MAP_COPY;
492 if (oflags & OMAP_SHARED)
493 flags |= MAP_SHARED;
494 else
495 flags |= MAP_PRIVATE;
496 if (oflags & OMAP_FIXED)
497 flags |= MAP_FIXED;
498 return (kern_mmap(td, &(struct mmap_req){
499 .mr_hint = hint,
500 .mr_len = len,
501 .mr_prot = prot,
502 .mr_flags = flags,
503 .mr_fd = fd,
504 .mr_pos = pos,
505 }));
506 }
507 #endif /* COMPAT_43 */
508
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of msync(2). */
struct msync_args {
	void *addr;
	size_t len;
	int flags;
};
#endif
/* msync(2) entry point: forward the user arguments to kern_msync(). */
int
sys_msync(struct thread *td, struct msync_args *uap)
{

	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
}
522
523 int
kern_msync(struct thread * td,uintptr_t addr0,size_t size,int flags)524 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
525 {
526 vm_offset_t addr;
527 vm_size_t pageoff;
528 vm_map_t map;
529 int rv;
530
531 addr = addr0;
532 pageoff = (addr & PAGE_MASK);
533 addr -= pageoff;
534 size += pageoff;
535 size = (vm_size_t) round_page(size);
536 if (addr + size < addr)
537 return (EINVAL);
538
539 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
540 return (EINVAL);
541
542 map = &td->td_proc->p_vmspace->vm_map;
543
544 /*
545 * Clean the pages and interpret the return value.
546 */
547 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
548 (flags & MS_INVALIDATE) != 0);
549 switch (rv) {
550 case KERN_SUCCESS:
551 return (0);
552 case KERN_INVALID_ADDRESS:
553 return (ENOMEM);
554 case KERN_INVALID_ARGUMENT:
555 return (EBUSY);
556 case KERN_FAILURE:
557 return (EIO);
558 default:
559 return (EINVAL);
560 }
561 }
562
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of munmap(2). */
struct munmap_args {
	void *addr;
	size_t len;
};
#endif
/* munmap(2) entry point: forward the user arguments to kern_munmap(). */
int
sys_munmap(struct thread *td, struct munmap_args *uap)
{

	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
}
575
/*
 * Backend for munmap(2): remove the mappings covering the page-aligned
 * range containing [addr0, addr0 + size).
 *
 * Returns EINVAL for a zero length or an invalid user address range;
 * otherwise the vm_map_delete() status converted via vm_mmap_to_errno().
 */
int
kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_out pkm;
	vm_map_entry_t entry;
	bool pmc_handled;
#endif
	vm_offset_t addr, end;
	vm_size_t pageoff;
	vm_map_t map;
	int rv;

	if (size == 0)
		return (EINVAL);

	/* Page-align the range so that all requested bytes are covered. */
	addr = addr0;
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	end = addr + size;
	map = &td->td_proc->p_vmspace->vm_map;
	if (!vm_map_range_valid(map, addr, end))
		return (EINVAL);

	vm_map_lock(map);
#ifdef HWPMC_HOOKS
	pmc_handled = false;
	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
		pmc_handled = true;
		/*
		 * Inform hwpmc if the address range being unmapped contains
		 * an executable region.
		 */
		pkm.pm_address = (uintptr_t) NULL;
		if (vm_map_lookup_entry(map, addr, &entry)) {
			for (; entry->start < end;
			    entry = vm_map_entry_succ(entry)) {
				if (vm_map_check_protection(map, entry->start,
				    entry->end, VM_PROT_EXECUTE) == TRUE) {
					pkm.pm_address = (uintptr_t) addr;
					pkm.pm_size = (size_t) size;
					break;
				}
			}
		}
	}
#endif
	rv = vm_map_delete(map, addr, end);

#ifdef HWT_HOOKS
	/* Report the successful unmap to the hardware trace framework. */
	if (HWT_HOOK_INSTALLED && rv == KERN_SUCCESS) {
		struct hwt_record_entry ent;

		ent.addr = (uintptr_t) addr;
		ent.fullpath = NULL;
		ent.record_type = HWT_RECORD_MUNMAP;
		HWT_CALL_HOOK(td, HWT_RECORD, &ent);
	}
#endif

#ifdef HWPMC_HOOKS
	if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
		vm_map_lock_downgrade(map);
		if (pkm.pm_address != (uintptr_t) NULL)
			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
		vm_map_unlock_read(map);
	} else
#endif
	vm_map_unlock(map);

	return (vm_mmap_to_errno(rv));
}
651
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of mprotect(2). */
struct mprotect_args {
	const void *addr;
	size_t len;
	int prot;
};
#endif
/* mprotect(2) entry point: forward to kern_mprotect() with no extra flags. */
int
sys_mprotect(struct thread *td, struct mprotect_args *uap)
{

	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
	    uap->prot, 0));
}
666
/*
 * Backend for mprotect(2).  'prot' may carry PROT_MAX() bits: the base
 * protection is always updated, and the maximum protection only when
 * PROT_MAX bits were supplied.  'flags' is merged into the flags passed
 * to vm_map_protect().
 */
int
kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
    int flags)
{
	vm_offset_t addr;
	vm_size_t pageoff;
	int vm_error, max_prot;

	addr = addr0;
	/* Reject protection bits outside the base and PROT_MAX() sets. */
	if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
		return (EINVAL);
	max_prot = PROT_MAX_EXTRACT(prot);
	prot = PROT_EXTRACT(prot);
	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		/* 32-bit process: detect wrap within the 32-bit space. */
		if (((addr + size) & 0xffffffff) < addr)
			return (EINVAL);
	} else
#endif
	if (addr + size < addr)
		return (EINVAL);

	flags |= VM_MAP_PROTECT_SET_PROT;
	if (max_prot != 0)
		flags |= VM_MAP_PROTECT_SET_MAXPROT;
	vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
	    addr, addr + size, prot, max_prot, flags);

	/* Translate the vm_map_protect() status into an errno value. */
	switch (vm_error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	case KERN_RESOURCE_SHORTAGE:
		return (ENOMEM);
	case KERN_OUT_OF_BOUNDS:
		return (ENOTSUP);
	}
	return (EINVAL);
}
711
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of minherit(2). */
struct minherit_args {
	void *addr;
	size_t len;
	int inherit;
};
#endif
/* minherit(2) entry point: forward the user arguments to kern_minherit(). */
int
sys_minherit(struct thread *td, struct minherit_args *uap)
{

	return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
	    uap->inherit));
}
726
727 int
kern_minherit(struct thread * td,uintptr_t addr0,size_t len,int inherit0)728 kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
729 {
730 vm_offset_t addr;
731 vm_size_t size, pageoff;
732 vm_inherit_t inherit;
733
734 addr = (vm_offset_t)addr0;
735 size = len;
736 inherit = inherit0;
737
738 pageoff = (addr & PAGE_MASK);
739 addr -= pageoff;
740 size += pageoff;
741 size = (vm_size_t) round_page(size);
742 if (addr + size < addr)
743 return (EINVAL);
744
745 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
746 addr + size, inherit)) {
747 case KERN_SUCCESS:
748 return (0);
749 case KERN_PROTECTION_FAILURE:
750 return (EACCES);
751 }
752 return (EINVAL);
753 }
754
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of madvise(2). */
struct madvise_args {
	void *addr;
	size_t len;
	int behav;
};
#endif

/* madvise(2) entry point: forward the user arguments to kern_madvise(). */
int
sys_madvise(struct thread *td, struct madvise_args *uap)
{

	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
}
769
770 int
kern_madvise(struct thread * td,uintptr_t addr0,size_t len,int behav)771 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
772 {
773 vm_map_t map;
774 vm_offset_t addr, end, start;
775 int flags;
776
777 /*
778 * Check for our special case, advising the swap pager we are
779 * "immortal."
780 */
781 if (behav == MADV_PROTECT) {
782 flags = PPROT_SET;
783 return (kern_procctl(td, P_PID, td->td_proc->p_pid,
784 PROC_SPROTECT, &flags));
785 }
786
787 /*
788 * Check for illegal addresses. Watch out for address wrap... Note
789 * that VM_*_ADDRESS are not constants due to casts (argh).
790 */
791 map = &td->td_proc->p_vmspace->vm_map;
792 addr = addr0;
793 if (!vm_map_range_valid(map, addr, addr + len))
794 return (EINVAL);
795
796 /*
797 * Since this routine is only advisory, we default to conservative
798 * behavior.
799 */
800 start = trunc_page(addr);
801 end = round_page(addr + len);
802
803 /*
804 * vm_map_madvise() checks for illegal values of behav.
805 */
806 return (vm_map_madvise(map, start, end, behav));
807 }
808
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of mincore(2). */
struct mincore_args {
	const void *addr;
	size_t len;
	char *vec;
};
#endif

/* mincore(2) entry point: forward the user arguments to kern_mincore(). */
int
sys_mincore(struct thread *td, struct mincore_args *uap)
{

	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
}
823
824 int
kern_mincore(struct thread * td,uintptr_t addr0,size_t len,char * vec)825 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
826 {
827 pmap_t pmap;
828 vm_map_t map;
829 vm_map_entry_t current, entry;
830 vm_object_t object;
831 vm_offset_t addr, cend, end, first_addr;
832 vm_paddr_t pa;
833 vm_page_t m;
834 vm_pindex_t pindex;
835 int error, lastvecindex, mincoreinfo, vecindex;
836 unsigned int timestamp;
837
838 /*
839 * Make sure that the addresses presented are valid for user
840 * mode.
841 */
842 first_addr = addr = trunc_page(addr0);
843 end = round_page(addr0 + len);
844 map = &td->td_proc->p_vmspace->vm_map;
845 if (end > vm_map_max(map) || end < addr)
846 return (ENOMEM);
847
848 pmap = vmspace_pmap(td->td_proc->p_vmspace);
849
850 vm_map_lock_read(map);
851 RestartScan:
852 timestamp = map->timestamp;
853
854 if (!vm_map_lookup_entry(map, addr, &entry)) {
855 vm_map_unlock_read(map);
856 return (ENOMEM);
857 }
858
859 /*
860 * Do this on a map entry basis so that if the pages are not
861 * in the current processes address space, we can easily look
862 * up the pages elsewhere.
863 */
864 lastvecindex = -1;
865 while (entry->start < end) {
866 /*
867 * check for contiguity
868 */
869 current = entry;
870 entry = vm_map_entry_succ(current);
871 if (current->end < end &&
872 entry->start > current->end) {
873 vm_map_unlock_read(map);
874 return (ENOMEM);
875 }
876
877 /*
878 * ignore submaps (for now) or null objects
879 */
880 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
881 current->object.vm_object == NULL)
882 continue;
883
884 /*
885 * limit this scan to the current map entry and the
886 * limits for the mincore call
887 */
888 if (addr < current->start)
889 addr = current->start;
890 cend = current->end;
891 if (cend > end)
892 cend = end;
893
894 for (; addr < cend; addr += PAGE_SIZE) {
895 /*
896 * Check pmap first, it is likely faster, also
897 * it can provide info as to whether we are the
898 * one referencing or modifying the page.
899 */
900 m = NULL;
901 object = NULL;
902 retry:
903 pa = 0;
904 mincoreinfo = pmap_mincore(pmap, addr, &pa);
905 if (mincore_mapped) {
906 /*
907 * We only care about this pmap's
908 * mapping of the page, if any.
909 */
910 ;
911 } else if (pa != 0) {
912 /*
913 * The page is mapped by this process but not
914 * both accessed and modified. It is also
915 * managed. Acquire the object lock so that
916 * other mappings might be examined. The page's
917 * identity may change at any point before its
918 * object lock is acquired, so re-validate if
919 * necessary.
920 */
921 m = PHYS_TO_VM_PAGE(pa);
922 while (object == NULL || m->object != object) {
923 if (object != NULL)
924 VM_OBJECT_WUNLOCK(object);
925 object = atomic_load_ptr(&m->object);
926 if (object == NULL)
927 goto retry;
928 VM_OBJECT_WLOCK(object);
929 }
930 if (pa != pmap_extract(pmap, addr))
931 goto retry;
932 KASSERT(vm_page_all_valid(m),
933 ("mincore: page %p is mapped but invalid",
934 m));
935 } else if (mincoreinfo == 0) {
936 /*
937 * The page is not mapped by this process. If
938 * the object implements managed pages, then
939 * determine if the page is resident so that
940 * the mappings might be examined.
941 */
942 if (current->object.vm_object != object) {
943 if (object != NULL)
944 VM_OBJECT_WUNLOCK(object);
945 object = current->object.vm_object;
946 VM_OBJECT_WLOCK(object);
947 }
948 if ((object->flags & OBJ_SWAP) != 0 ||
949 object->type == OBJT_VNODE) {
950 pindex = OFF_TO_IDX(current->offset +
951 (addr - current->start));
952 m = vm_page_lookup(object, pindex);
953 if (m != NULL && vm_page_none_valid(m))
954 m = NULL;
955 if (m != NULL)
956 mincoreinfo = MINCORE_INCORE;
957 }
958 }
959 if (m != NULL) {
960 VM_OBJECT_ASSERT_WLOCKED(m->object);
961
962 /* Examine other mappings of the page. */
963 if (m->dirty == 0 && pmap_is_modified(m))
964 vm_page_dirty(m);
965 if (m->dirty != 0)
966 mincoreinfo |= MINCORE_MODIFIED_OTHER;
967
968 /*
969 * The first test for PGA_REFERENCED is an
970 * optimization. The second test is
971 * required because a concurrent pmap
972 * operation could clear the last reference
973 * and set PGA_REFERENCED before the call to
974 * pmap_is_referenced().
975 */
976 if ((m->a.flags & PGA_REFERENCED) != 0 ||
977 pmap_is_referenced(m) ||
978 (m->a.flags & PGA_REFERENCED) != 0)
979 mincoreinfo |= MINCORE_REFERENCED_OTHER;
980 }
981 if (object != NULL)
982 VM_OBJECT_WUNLOCK(object);
983
984 /*
985 * subyte may page fault. In case it needs to modify
986 * the map, we release the lock.
987 */
988 vm_map_unlock_read(map);
989
990 /*
991 * calculate index into user supplied byte vector
992 */
993 vecindex = atop(addr - first_addr);
994
995 /*
996 * If we have skipped map entries, we need to make sure that
997 * the byte vector is zeroed for those skipped entries.
998 */
999 while ((lastvecindex + 1) < vecindex) {
1000 ++lastvecindex;
1001 error = subyte(vec + lastvecindex, 0);
1002 if (error) {
1003 error = EFAULT;
1004 goto done2;
1005 }
1006 }
1007
1008 /*
1009 * Pass the page information to the user
1010 */
1011 error = subyte(vec + vecindex, mincoreinfo);
1012 if (error) {
1013 error = EFAULT;
1014 goto done2;
1015 }
1016
1017 /*
1018 * If the map has changed, due to the subyte, the previous
1019 * output may be invalid.
1020 */
1021 vm_map_lock_read(map);
1022 if (timestamp != map->timestamp)
1023 goto RestartScan;
1024
1025 lastvecindex = vecindex;
1026 }
1027 }
1028
1029 /*
1030 * subyte may page fault. In case it needs to modify
1031 * the map, we release the lock.
1032 */
1033 vm_map_unlock_read(map);
1034
1035 /*
1036 * Zero the last entries in the byte vector.
1037 */
1038 vecindex = atop(end - first_addr);
1039 while ((lastvecindex + 1) < vecindex) {
1040 ++lastvecindex;
1041 error = subyte(vec + lastvecindex, 0);
1042 if (error) {
1043 error = EFAULT;
1044 goto done2;
1045 }
1046 }
1047
1048 /*
1049 * If the map has changed, due to the subyte, the previous
1050 * output may be invalid.
1051 */
1052 vm_map_lock_read(map);
1053 if (timestamp != map->timestamp)
1054 goto RestartScan;
1055 vm_map_unlock_read(map);
1056 done2:
1057 return (error);
1058 }
1059
#ifndef _SYS_SYSPROTO_H_
/* Argument layout of mlock(2). */
struct mlock_args {
	const void *addr;
	size_t len;
};
#endif
/* mlock(2) entry point: forward to kern_mlock() with the caller's creds. */
int
sys_mlock(struct thread *td, struct mlock_args *uap)
{

	return (kern_mlock(td->td_proc, td->td_ucred,
	    __DECONST(uintptr_t, uap->addr), uap->len));
}
1073
/*
 * Backend for mlock(2): wire the pages spanning [addr0, addr0 + len).
 *
 * Requires PRIV_VM_MLOCK.  The request is checked against the global
 * vm_page_max_user_wired limit, RLIMIT_MEMLOCK, and, when RACCT is
 * enabled, the RACCT_MEMLOCK accounting, before the pages are wired.
 */
int
kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
{
	vm_offset_t addr, end, last, start;
	vm_size_t npages, size;
	vm_map_t map;
	unsigned long nsize;
	int error;

	error = priv_check_cred(cred, PRIV_VM_MLOCK);
	if (error)
		return (error);
	addr = addr0;
	size = len;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	/* Reject address wrap in either the raw or the rounded range. */
	if (last < addr || end < addr)
		return (EINVAL);
	npages = atop(end - start);
	if (npages > vm_page_max_user_wired)
		return (ENOMEM);
	map = &proc->p_vmspace->vm_map;
	PROC_LOCK(proc);
	/* Prospective total of wired bytes, including already-wired pages. */
	nsize = ptoa(npages + pmap_wired_count(map->pmap));
	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
		PROC_UNLOCK(proc);
		return (ENOMEM);
	}
	PROC_UNLOCK(proc);
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(proc);
		error = racct_set(proc, RACCT_MEMLOCK, nsize);
		PROC_UNLOCK(proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif
	error = vm_map_wire(map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	/* On failure, roll the accounting back to the actual wired count. */
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(proc);
		racct_set(proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(proc);
	}
#endif
	/* Translate the vm_map_wire() status into an errno value. */
	switch (error) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ARGUMENT:
		return (EINVAL);
	default:
		return (ENOMEM);
	}
}
1132
#ifndef _SYS_SYSPROTO_H_
/* Argument layout for the mlockall(2) system call. */
struct mlockall_args {
	int how;	/* MCL_CURRENT and/or MCL_FUTURE */
};
#endif
1138
/*
 * mlockall(2) entry point: wire all current and/or future mappings of
 * the calling process, as selected by the MCL_CURRENT and MCL_FUTURE
 * bits in uap->how.
 */
int
sys_mlockall(struct thread *td, struct mlockall_args *uap)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	error = priv_check(td, PRIV_VM_MLOCK);
	if (error)
		return (error);

	/* At least one of MCL_CURRENT/MCL_FUTURE must be set; no other bits. */
	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
		return (EINVAL);

	/*
	 * If wiring all pages in the process would cause it to exceed
	 * a hard resource limit, return ENOMEM.
	 */
	if (!old_mlock && uap->how & MCL_CURRENT) {
		if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
			return (ENOMEM);
	}
#ifdef RACCT
	if (racct_enable) {
		PROC_LOCK(td->td_proc);
		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
		PROC_UNLOCK(td->td_proc);
		if (error != 0)
			return (ENOMEM);
	}
#endif

	if (uap->how & MCL_FUTURE) {
		/* Request that all future mappings be wired automatically. */
		vm_map_lock(map);
		vm_map_modflags(map, MAP_WIREFUTURE, 0);
		vm_map_unlock(map);
		error = 0;
	}

	if (uap->how & MCL_CURRENT) {
		/*
		 * P1003.1-2001 mandates that all currently mapped pages
		 * will be memory resident and locked (wired) upon return
		 * from mlockall(). vm_map_wire() will wire pages, by
		 * calling vm_fault_wire() for each page in the region.
		 */
		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
		if (error == KERN_SUCCESS)
			error = 0;
		else if (error == KERN_RESOURCE_SHORTAGE)
			error = ENOMEM;
		else
			error = EAGAIN;
	}
#ifdef RACCT
	/*
	 * By this point "error" holds an errno value, so the comparison
	 * against KERN_SUCCESS works only because KERN_SUCCESS == 0.
	 * On failure, resync the RACCT_MEMLOCK charge with the count of
	 * pages that actually got wired.
	 */
	if (racct_enable && error != KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif

	return (error);
}
1205
#ifndef _SYS_SYSPROTO_H_
/* Argument layout for munlockall(2); the syscall takes no arguments. */
struct munlockall_args {
	register_t dummy;
};
#endif
1211
1212 int
sys_munlockall(struct thread * td,struct munlockall_args * uap)1213 sys_munlockall(struct thread *td, struct munlockall_args *uap)
1214 {
1215 vm_map_t map;
1216 int error;
1217
1218 map = &td->td_proc->p_vmspace->vm_map;
1219 error = priv_check(td, PRIV_VM_MUNLOCK);
1220 if (error)
1221 return (error);
1222
1223 /* Clear the MAP_WIREFUTURE flag from this vm_map. */
1224 vm_map_lock(map);
1225 vm_map_modflags(map, 0, MAP_WIREFUTURE);
1226 vm_map_unlock(map);
1227
1228 /* Forcibly unwire all pages. */
1229 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1230 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1231 #ifdef RACCT
1232 if (racct_enable && error == KERN_SUCCESS) {
1233 PROC_LOCK(td->td_proc);
1234 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1235 PROC_UNLOCK(td->td_proc);
1236 }
1237 #endif
1238
1239 return (error);
1240 }
1241
#ifndef _SYS_SYSPROTO_H_
/* Argument layout for the munlock(2) system call. */
struct munlock_args {
	const void *addr;	/* start of the range to unwire */
	size_t len;		/* length of the range in bytes */
};
#endif
1248 int
sys_munlock(struct thread * td,struct munlock_args * uap)1249 sys_munlock(struct thread *td, struct munlock_args *uap)
1250 {
1251
1252 return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
1253 }
1254
/*
 * kern_munlock:
 *
 *	Unwire the pages covering [addr0, addr0 + size) in the calling
 *	process's address space, after a PRIV_VM_MUNLOCK privilege check.
 *
 *	Returns 0 on success, EINVAL on address wrap-around, and ENOMEM
 *	if the unwiring fails (e.g. part of the range is not mapped).
 */
int
kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
{
	vm_offset_t addr, end, last, start;
#ifdef RACCT
	vm_map_t map;
#endif
	int error;

	error = priv_check(td, PRIV_VM_MUNLOCK);
	if (error)
		return (error);
	addr = addr0;
	last = addr + size;
	start = trunc_page(addr);
	end = round_page(last);
	/* Reject ranges that wrap, before or after page rounding. */
	if (last < addr || end < addr)
		return (EINVAL);
	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
	/* Resync the RACCT_MEMLOCK charge with the remaining wired count. */
	if (racct_enable && error == KERN_SUCCESS) {
		PROC_LOCK(td->td_proc);
		map = &td->td_proc->p_vmspace->vm_map;
		racct_set(td->td_proc, RACCT_MEMLOCK,
		    ptoa(pmap_wired_count(map->pmap)));
		PROC_UNLOCK(td->td_proc);
	}
#endif
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
1286
/*
 * vm_mmap_vnode()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on vnodes.  On success, *objp receives a referenced VM
 * object backing the file, *flagsp may gain MAP_NOSYNC, and *maxprotp
 * may lose VM_PROT_WRITE for immutable/append-only files.  If the
 * mapping is writeable and shared, *writecounted is set to TRUE and the
 * vnode's writecount is charged for the mapping.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
    boolean_t *writecounted)
{
	struct vattr va;
	vm_object_t obj;
	vm_ooffset_t foff;
	struct ucred *cred;
	int error, flags;
	bool writex;

	cred = td->td_ucred;
	/*
	 * A writeable MAP_SHARED mapping must be recorded in the vnode's
	 * writecount so the filesystem knows that writers exist.
	 */
	writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
	    (*flagsp & MAP_SHARED) != 0;
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	AUDIT_ARG_VNODE1(vp);
	foff = *foffp;
	flags = *flagsp;
	obj = vp->v_object;
	if (vp->v_type == VREG) {
		/*
		 * Get the proper underlying object
		 */
		if (obj == NULL) {
			error = EINVAL;
			goto done;
		}
		if (obj->type == OBJT_VNODE && obj->handle != vp) {
			/*
			 * The object belongs to a different vnode (e.g. a
			 * nullfs-style bypass); switch our lock and
			 * reference over to it.
			 */
			vput(vp);
			vp = (struct vnode *)obj->handle;
			/*
			 * Bypass filesystems obey the mpsafety of the
			 * underlying fs.  Tmpfs never bypasses.
			 */
			error = vget(vp, LK_SHARED);
			if (error != 0)
				return (error);
		}
		if (writex) {
			*writecounted = TRUE;
			vm_pager_update_writecount(obj, 0, objsize);
		}
	} else {
		error = EXTERROR(EINVAL, "non-reg file");
		goto done;
	}
	if ((error = VOP_GETATTR(vp, &va, cred)))
		goto done;
#ifdef MAC
	/* This relies on VM_PROT_* matching PROT_*. */
	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
	if (error != 0)
		goto done;
#endif
	/*
	 * Shared writable mappings of snapshot/immutable/append-only
	 * files are refused; read-only shared mappings merely lose the
	 * VM_PROT_WRITE maxprot bit.
	 */
	if ((flags & MAP_SHARED) != 0) {
		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
			if (prot & VM_PROT_WRITE) {
				error = EPERM;
				goto done;
			}
			*maxprotp &= ~VM_PROT_WRITE;
		}
	}
	/*
	 * If it is a regular file without any references
	 * we do not need to sync it.
	 * Adjust object size to be the size of actual file.
	 */
	objsize = round_page(va.va_size);
	if (va.va_nlink == 0)
		flags |= MAP_NOSYNC;
	if (obj->type == OBJT_VNODE) {
		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
		    cred);
		if (obj == NULL) {
			error = ENOMEM;
			goto done;
		}
	} else {
		KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
		vm_object_reference(obj);
#if VM_NRESERVLEVEL > 0
		if ((obj->flags & OBJ_COLORED) == 0) {
			VM_OBJECT_WLOCK(obj);
			vm_object_color(obj, 0);
			VM_OBJECT_WUNLOCK(obj);
		}
#endif
	}
	*objp = obj;
	*flagsp = flags;

	VOP_MMAPPED(vp);

done:
	/*
	 * Undo the writecount charge taken above on failure.
	 * NOTE(review): by this point objsize may have been rounded to
	 * the file size, while the charge used the caller-supplied
	 * size — confirm the two always agree on this path.
	 */
	if (error != 0 && *writecounted) {
		*writecounted = FALSE;
		vm_pager_update_writecount(obj, objsize, 0);
	}
	vput(vp);
	return (error);
}
1398
/*
 * vm_mmap_cdev()
 *
 * Helper function for vm_mmap.  Perform sanity check specific for mmap
 * operations on cdevs.  On success, *objp receives either NULL (for
 * D_MMAP_ANON devices, which are mapped as anonymous memory) or a
 * referenced VM object backing the device.
 */
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
    vm_ooffset_t *foff, vm_object_t *objp)
{
	vm_object_t obj;
	int error, flags;

	flags = *flagsp;

	/* D_MMAP_ANON devices are mapped as plain anonymous memory. */
	if (dsw->d_flags & D_MMAP_ANON) {
		*objp = NULL;
		*foff = 0;
		*maxprotp = VM_PROT_ALL;
		*flagsp |= MAP_ANON;
		return (0);
	}

	/*
	 * cdevs do not provide private mappings of any kind.
	 */
	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) {
		return (EXTERROR(EINVAL, "cdev mapping must be shared"));
	}

	/*
	 * Force device mappings to be shared.
	 */
	flags |= MAP_SHARED;
#ifdef MAC_XXX
	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
	if (error != 0)
		return (error);
#endif
	/*
	 * First, try d_mmap_single().  If that is not implemented
	 * (returns ENODEV), fall back to using the device pager.
	 * Note that d_mmap_single() must return a reference to the
	 * object (it needs to bump the reference count of the object
	 * it returns somehow).
	 *
	 * XXX assumes VM_PROT_* == PROT_*
	 */
	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
	if (error != ENODEV)
		return (error);
	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
	    td->td_ucred);
	if (obj == NULL) {
		return (EXTERROR(EINVAL,
		    "cdev driver does not support mmap"));
	}
	*objp = obj;
	*flagsp = flags;
	return (0);
}
1464
/*
 * vm_mmap()
 *
 * Internal mapping entry point for kernel callers (e.g. exec and device
 * drivers): resolves the (handle_type, handle) pair into a VM object
 * via the appropriate helper and maps it with vm_mmap_object().  Only
 * OBJT_DEVICE and OBJT_VNODE handles are supported.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags,
    objtype_t handle_type, void *handle,
    vm_ooffset_t foff)
{
	vm_object_t object;
	struct thread *td = curthread;
	int error;
	boolean_t writecounted;

	if (size == 0) {
		return (EXTERROR(EINVAL, "zero-sized req"));
	}

	size = round_page(size);
	object = NULL;
	writecounted = FALSE;

	switch (handle_type) {
	case OBJT_DEVICE: {
		struct cdevsw *dsw;
		struct cdev *cdev;
		int ref;

		cdev = handle;
		/* Hold the device alive while the helper examines it. */
		dsw = dev_refthread(cdev, &ref);
		if (dsw == NULL)
			return (ENXIO);
		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
		    dsw, &foff, &object);
		dev_relthread(cdev, ref);
		break;
	}
	case OBJT_VNODE:
		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
		    handle, &foff, &object, &writecounted);
		break;
	default:
		error = EXTERROR(EINVAL, "unsupported backing obj type %jd",
		    handle_type);
		break;
	}
	if (error)
		return (error);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0 && object != NULL) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vm_pager_release_writecount(object, 0, size);
		/* Drop the reference taken by the helper above. */
		vm_object_deallocate(object);
	}
	return (error);
}
1524
1525 int
kern_mmap_racct_check(struct thread * td,vm_map_t map,vm_size_t size)1526 kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
1527 {
1528 int error;
1529
1530 RACCT_PROC_LOCK(td->td_proc);
1531 if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
1532 RACCT_PROC_UNLOCK(td->td_proc);
1533 return (ENOMEM);
1534 }
1535 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1536 RACCT_PROC_UNLOCK(td->td_proc);
1537 return (ENOMEM);
1538 }
1539 if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1540 if (ptoa(pmap_wired_count(map->pmap)) + size >
1541 lim_cur(td, RLIMIT_MEMLOCK)) {
1542 racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1543 RACCT_PROC_UNLOCK(td->td_proc);
1544 return (ENOMEM);
1545 }
1546 error = racct_set(td->td_proc, RACCT_MEMLOCK,
1547 ptoa(pmap_wired_count(map->pmap)) + size);
1548 if (error != 0) {
1549 racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1550 RACCT_PROC_UNLOCK(td->td_proc);
1551 return (error);
1552 }
1553 }
1554 RACCT_PROC_UNLOCK(td->td_proc);
1555 return (0);
1556 }
1557
/*
 * Internal version of mmap that maps a specific VM object into an
 * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 *
 * NOTE(review): callers in this file release their object reference
 * themselves on failure, so on success the reference appears to be
 * consumed by the new map entry — confirm against vm_map_find()/
 * vm_map_fixed().
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	vm_offset_t default_addr, max_addr;
	int docow, error, findspace, rv;
	bool curmap, fitit;

	/* Resource limits apply only when mapping into the current process. */
	curmap = map == &td->td_proc->p_vmspace->vm_map;
	if (curmap) {
		error = kern_mmap_racct_check(td, map, size);
		if (error != 0)
			return (error);
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_single_mmap()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if ((foff & PAGE_MASK) != 0) {
		return (EXTERROR(EINVAL, "offset %#jx not page-aligned", foff));
	}

	if ((flags & MAP_FIXED) == 0) {
		/* No fixed address: let the VM pick one at/above the hint. */
		fitit = true;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr)) {
			return (EXTERROR(EINVAL,
			    "non-fixed mapping address %#jx not aligned",
			    *addr));
		}
		fitit = false;
	}

	if (flags & MAP_ANON) {
		/* Anonymous mappings carry neither an object nor an offset. */
		if (object != NULL) {
			return (EXTERROR(EINVAL,
			    "anon mapping backed by an object"));
		}
		if (foff != 0) {
			return (EXTERROR(EINVAL,
			    "anon mapping with non-zero offset %#jx", foff));
		}
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	/* Translate user-visible MAP_* flags into vm_map "docow" flags. */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_WRITECOUNT;
	if (flags & MAP_STACK) {
		if (object != NULL) {
			return (EXTERROR(EINVAL,
			    "stack mapping backed by an object"));
		}
		docow |= MAP_STACK_AREA;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;
	if ((flags & MAP_GUARD) != 0)
		docow |= MAP_CREATE_GUARD;

	if (fitit) {
		/*
		 * Select the free-space search policy from the alignment
		 * request encoded in the flags.
		 */
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		max_addr = 0;
		if ((flags & MAP_32BIT) != 0)
			max_addr = MAP_32BIT_MAX_ADDR;
		if (curmap) {
			/*
			 * For the current process, default the search to
			 * start above the data segment's maximum extent.
			 */
			default_addr =
			    round_page((vm_offset_t)td->td_proc->p_vmspace->
			    vm_daddr + lim_max(td, RLIMIT_DATA));
			if ((flags & MAP_32BIT) != 0)
				default_addr = 0;
			rv = vm_map_find_min(map, object, foff, addr, size,
			    default_addr, max_addr, findspace, prot, maxprot,
			    docow);
		} else {
			rv = vm_map_find(map, object, foff, addr, size,
			    max_addr, findspace, prot, maxprot, docow);
		}
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.
		 */
		if ((map->flags & MAP_WIREFUTURE) != 0) {
			vm_map_lock(map);
			/* Re-check the flag under the map lock. */
			if ((map->flags & MAP_WIREFUTURE) != 0)
				(void)vm_map_wire_locked(map, *addr,
				    *addr + size, VM_MAP_WIRE_USER |
				    ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
				    VM_MAP_WIRE_NOHOLES));
			vm_map_unlock(map);
		}
	}
	return (vm_mmap_to_errno(rv));
}
1686
1687 /*
1688 * Translate a Mach VM return code to zero on success or the appropriate errno
1689 * on failure.
1690 */
1691 int
vm_mmap_to_errno(int rv)1692 vm_mmap_to_errno(int rv)
1693 {
1694 int error;
1695
1696 switch (rv) {
1697 case KERN_SUCCESS:
1698 return (0);
1699 case KERN_INVALID_ADDRESS:
1700 case KERN_NO_SPACE:
1701 error = ENOMEM;
1702 break;
1703 case KERN_PROTECTION_FAILURE:
1704 error = EACCES;
1705 break;
1706 default:
1707 error = EINVAL;
1708 break;
1709 }
1710 if ((curthread->td_pflags2 & (TDP2_UEXTERR | TDP2_EXTERR)) ==
1711 TDP2_UEXTERR)
1712 EXTERROR(error, "mach error %jd", rv);
1713 return (error);
1714 }
1715