1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1988 University of Utah.
5 * Copyright (c) 1991, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the Systems Programming Group of the University of Utah Computer
10 * Science Department.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
37 */
38
39 /*
40 * Mapped file (mmap) interface to VM
41 */
42
43 #include <sys/cdefs.h>
44 #include "opt_hwpmc_hooks.h"
45 #include "opt_vm.h"
46
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/capsicum.h>
50 #include <sys/kernel.h>
51 #include <sys/lock.h>
52 #include <sys/mutex.h>
53 #include <sys/sysproto.h>
54 #include <sys/elf.h>
55 #include <sys/filedesc.h>
56 #include <sys/priv.h>
57 #include <sys/proc.h>
58 #include <sys/procctl.h>
59 #include <sys/racct.h>
60 #include <sys/resource.h>
61 #include <sys/resourcevar.h>
62 #include <sys/rwlock.h>
63 #include <sys/sysctl.h>
64 #include <sys/vnode.h>
65 #include <sys/fcntl.h>
66 #include <sys/file.h>
67 #include <sys/mman.h>
68 #include <sys/mount.h>
69 #include <sys/conf.h>
70 #include <sys/stat.h>
71 #include <sys/syscallsubr.h>
72 #include <sys/sysent.h>
73 #include <sys/vmmeter.h>
74 #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
75 #include <machine/md_var.h>
76 #endif
77
78 #include <security/audit/audit.h>
79 #include <security/mac/mac_framework.h>
80
81 #include <vm/vm.h>
82 #include <vm/vm_param.h>
83 #include <vm/pmap.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_page.h>
87 #include <vm/vm_pager.h>
88 #include <vm/vm_pageout.h>
89 #include <vm/vm_extern.h>
91 #include <vm/vnode_pager.h>
92
93 #ifdef HWPMC_HOOKS
94 #include <sys/pmckern.h>
95 #endif
96
97 int old_mlock = 0;
98 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
99 "Do not apply RLIMIT_MEMLOCK on mlockall");
100 static int mincore_mapped = 1;
101 SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
102 "mincore reports mappings, not residency");
103 static int imply_prot_max = 0;
104 SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
105 "Imply maximum page protections in mmap() when none are specified");
106
107 _Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");
108
109 #if defined(COMPAT_43)
110 int
111 ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
112 {
113
114 td->td_retval[0] = PAGE_SIZE;
115 return (0);
116 }
117 #endif /* COMPAT_43 */
118
119 /*
120 * Memory Map (mmap) system call. Note that the file offset
121 * and address are allowed to be NOT page aligned, though if
122 * the MAP_FIXED flag is set, both must have the same remainder
123 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
124 * page-aligned, the actual mapping starts at trunc_page(addr)
125 * and the return value is adjusted up by the page offset.
126 *
127 * Generally speaking, only character devices which are themselves
128 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise
129 * there would be no cache coherency between a descriptor and a VM mapping
130 * both to the same character device.
131 */
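/*
 * Illustrative userland sketch of the rule above (hypothetical file name,
 * error handling omitted): without MAP_FIXED an unaligned offset is honored
 * by mapping from trunc_page(offset) and adding the page offset back into
 * the returned address.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/tmp/example.dat", O_RDONLY);
 *	char *p = mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 0x123);
 *	(on success, p points 0x123 bytes past a page boundary)
 */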
132 #ifndef _SYS_SYSPROTO_H_
133 struct mmap_args {
134 void *addr;
135 size_t len;
136 int prot;
137 int flags;
138 int fd;
139 long pad;
140 off_t pos;
141 };
142 #endif
143
144 int
145 sys_mmap(struct thread *td, struct mmap_args *uap)
146 {
147
148 return (kern_mmap(td, &(struct mmap_req){
149 .mr_hint = (uintptr_t)uap->addr,
150 .mr_len = uap->len,
151 .mr_prot = uap->prot,
152 .mr_flags = uap->flags,
153 .mr_fd = uap->fd,
154 .mr_pos = uap->pos,
155 }));
156 }
157
158 int
159 kern_mmap_maxprot(struct proc *p, int prot)
160 {
161
162 if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
163 (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
164 return (_PROT_ALL);
165 if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
166 prot != PROT_NONE)
167 return (prot);
168 return (_PROT_ALL);
169 }
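/*
 * A hedged userland sketch of the PROT_MAX() encoding consumed by
 * kern_mmap_maxprot() (values illustrative, error handling omitted): the
 * maximum protection caps what mprotect() may later grant.
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE |
 *	    PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	(a later mprotect(p, 4096, PROT_EXEC) is expected to fail)
 */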
170
171 int
172 kern_mmap(struct thread *td, const struct mmap_req *mrp)
173 {
174 struct vmspace *vms;
175 struct file *fp;
176 struct proc *p;
177 off_t pos;
178 vm_offset_t addr, orig_addr;
179 vm_size_t len, pageoff, size;
180 vm_prot_t cap_maxprot;
181 int align, error, fd, flags, max_prot, prot;
182 cap_rights_t rights;
183 mmap_check_fp_fn check_fp_fn;
184
185 orig_addr = addr = mrp->mr_hint;
186 len = mrp->mr_len;
187 prot = mrp->mr_prot;
188 flags = mrp->mr_flags;
189 fd = mrp->mr_fd;
190 pos = mrp->mr_pos;
191 check_fp_fn = mrp->mr_check_fp_fn;
192
193 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
194 return (EINVAL);
195 max_prot = PROT_MAX_EXTRACT(prot);
196 prot = PROT_EXTRACT(prot);
197 if (max_prot != 0 && (max_prot & prot) != prot)
198 return (ENOTSUP);
199
200 p = td->td_proc;
201
202 /*
203 * Always honor PROT_MAX if set. If not, default to all
204 * permissions unless we're implying maximum permissions.
205 */
206 if (max_prot == 0)
207 max_prot = kern_mmap_maxprot(p, prot);
208
209 vms = p->p_vmspace;
210 fp = NULL;
211 AUDIT_ARG_FD(fd);
212
213 /*
214 * Ignore old flags that used to be defined but did not do anything.
215 */
216 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
217
218 /*
219 * Enforce the constraints.
220 * Mapping of length 0 is only allowed for old binaries.
221 * Anonymous mapping shall specify -1 as file descriptor and
222 * zero position for new code. Be nice to ancient a.out
223 * binaries and correct pos for anonymous mapping, since old
224 * ld.so sometimes issues anonymous map requests with non-zero
225 * pos.
226 */
227 if (!SV_CURPROC_FLAG(SV_AOUT)) {
228 if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
229 ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
230 return (EINVAL);
231 } else {
232 if ((flags & MAP_ANON) != 0)
233 pos = 0;
234 }
235
236 if (flags & MAP_STACK) {
237 if ((fd != -1) ||
238 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
239 return (EINVAL);
240 flags |= MAP_ANON;
241 pos = 0;
242 }
243 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
244 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
245 MAP_PREFAULT_READ | MAP_GUARD | MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0)
246 return (EINVAL);
247 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
248 return (EINVAL);
249 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
250 return (EINVAL);
251 if (prot != PROT_NONE &&
252 (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
253 return (EINVAL);
254 if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
255 pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
256 MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0))
257 return (EINVAL);
258
259 /*
260 * Align the file position to a page boundary,
261 * and save its page offset component.
262 */
263 pageoff = (pos & PAGE_MASK);
264 pos -= pageoff;
265
266 /* Compute size from len by rounding (on both ends). */
267 size = len + pageoff; /* low end... */
268 size = round_page(size); /* hi end */
269 /* Check for rounding up to zero. */
270 if (len > size)
271 return (ENOMEM);
272
273 /* Ensure alignment is at least a page and fits in a pointer. */
274 align = flags & MAP_ALIGNMENT_MASK;
275 if (align != 0 && align != MAP_ALIGNED_SUPER &&
276 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
277 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
278 return (EINVAL);
279
280 /*
281 * Check for illegal addresses. Watch out for address wrap... Note
282 * that VM_*_ADDRESS are not constants due to casts (argh).
283 */
284 if (flags & MAP_FIXED) {
285 /*
286 * The specified address must have the same remainder
287 * as the file offset taken modulo PAGE_SIZE, so it
288 * should be aligned after adjustment by pageoff.
289 */
290 addr -= pageoff;
291 if (addr & PAGE_MASK)
292 return (EINVAL);
293
294 /* Address range must be all in user VM space. */
295 if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
296 return (EINVAL);
297 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
298 return (EINVAL);
299 } else if (flags & MAP_32BIT) {
300 /*
301 * For MAP_32BIT, override the hint if it is too high and
302 * do not bother moving the mapping past the heap (since
303 * the heap is usually above 2GB).
304 */
305 if (addr + size > MAP_32BIT_MAX_ADDR)
306 addr = 0;
307 } else {
308 /*
309 * XXX for non-fixed mappings where no hint is provided or
310 * the hint would fall in the potential heap space,
311 * place it after the end of the largest possible heap.
312 *
313 * For anonymous mappings within the address space of the
314 * calling process, the absence of a hint is handled at a
315 * lower level in order to implement different clustering
316 * strategies for ASLR.
317 */
318 if (((flags & MAP_ANON) == 0 && addr == 0) ||
319 (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
320 addr < round_page((vm_offset_t)vms->vm_daddr +
321 lim_max(td, RLIMIT_DATA))))
322 addr = round_page((vm_offset_t)vms->vm_daddr +
323 lim_max(td, RLIMIT_DATA));
324 }
325 if (len == 0) {
326 /*
327 * Return success without mapping anything for old
328 * binaries that request a page-aligned mapping of
329 * length 0. For modern binaries, this function
330 * returns an error earlier.
331 */
332 error = 0;
333 } else if ((flags & MAP_GUARD) != 0) {
334 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
335 VM_PROT_NONE, flags, NULL, pos, FALSE, td);
336 } else if ((flags & MAP_ANON) != 0) {
337 /*
338 * Mapping blank space is trivial.
339 *
340 * This relies on VM_PROT_* matching PROT_*.
341 */
342 error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
343 max_prot, flags, NULL, pos, FALSE, td);
344 } else {
345 /*
346 * Mapping file, get fp for validation and don't let the
347 * descriptor disappear on us if we block. Check capability
348 * rights, but also return the maximum rights to be combined
349 * with maxprot later.
350 */
351 cap_rights_init_one(&rights, CAP_MMAP);
352 if (prot & PROT_READ)
353 cap_rights_set_one(&rights, CAP_MMAP_R);
354 if ((flags & MAP_SHARED) != 0) {
355 if (prot & PROT_WRITE)
356 cap_rights_set_one(&rights, CAP_MMAP_W);
357 }
358 if (prot & PROT_EXEC)
359 cap_rights_set_one(&rights, CAP_MMAP_X);
360 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
361 if (error != 0)
362 goto done;
363 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
364 p->p_osrel >= P_OSREL_MAP_FSTRICT) {
365 error = EINVAL;
366 goto done;
367 }
368 if (check_fp_fn != NULL) {
369 error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
370 flags);
371 if (error != 0)
372 goto done;
373 }
374 if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
375 addr = orig_addr;
376 /* This relies on VM_PROT_* matching PROT_*. */
377 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
378 max_prot & cap_maxprot, flags, pos, td);
379 }
380
381 if (error == 0)
382 td->td_retval[0] = addr + pageoff;
383 done:
384 if (fp)
385 fdrop(fp, td);
386
387 return (error);
388 }
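/*
 * Illustrative userland sketch of the MAP_GUARD case validated above
 * (error handling omitted): a guard region takes no file descriptor,
 * no offset and no protection, and any access to it faults.
 *
 *	#include <sys/mman.h>
 *
 *	void *guard = mmap(NULL, 4096, PROT_NONE, MAP_GUARD, -1, 0);
 */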
389
390 #if defined(COMPAT_FREEBSD6)
391 int
392 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
393 {
394 return (kern_mmap(td, &(struct mmap_req){
395 .mr_hint = (uintptr_t)uap->addr,
396 .mr_len = uap->len,
397 .mr_prot = uap->prot,
398 .mr_flags = uap->flags,
399 .mr_fd = uap->fd,
400 .mr_pos = uap->pos,
401 }));
402 }
403 #endif
404
405 #ifdef COMPAT_43
406 #ifndef _SYS_SYSPROTO_H_
407 struct ommap_args {
408 caddr_t addr;
409 int len;
410 int prot;
411 int flags;
412 int fd;
413 long pos;
414 };
415 #endif
416 int
417 ommap(struct thread *td, struct ommap_args *uap)
418 {
419 return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
420 uap->flags, uap->fd, uap->pos));
421 }
422
423 int
424 kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
425 int oflags, int fd, long pos)
426 {
427 static const char cvtbsdprot[8] = {
428 0,
429 PROT_EXEC,
430 PROT_WRITE,
431 PROT_EXEC | PROT_WRITE,
432 PROT_READ,
433 PROT_EXEC | PROT_READ,
434 PROT_WRITE | PROT_READ,
435 PROT_EXEC | PROT_WRITE | PROT_READ,
436 };
437 int flags, prot;
438
439 if (len < 0)
440 return (EINVAL);
441
442 #define OMAP_ANON 0x0002
443 #define OMAP_COPY 0x0020
444 #define OMAP_SHARED 0x0010
445 #define OMAP_FIXED 0x0100
446
447 prot = cvtbsdprot[oprot & 0x7];
448 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
449 if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
450 prot != 0)
451 prot |= PROT_EXEC;
452 #endif
453 flags = 0;
454 if (oflags & OMAP_ANON)
455 flags |= MAP_ANON;
456 if (oflags & OMAP_COPY)
457 flags |= MAP_COPY;
458 if (oflags & OMAP_SHARED)
459 flags |= MAP_SHARED;
460 else
461 flags |= MAP_PRIVATE;
462 if (oflags & OMAP_FIXED)
463 flags |= MAP_FIXED;
464 return (kern_mmap(td, &(struct mmap_req){
465 .mr_hint = hint,
466 .mr_len = len,
467 .mr_prot = prot,
468 .mr_flags = flags,
469 .mr_fd = fd,
470 .mr_pos = pos,
471 }));
472 }
473 #endif /* COMPAT_43 */
474
475 #ifndef _SYS_SYSPROTO_H_
476 struct msync_args {
477 void *addr;
478 size_t len;
479 int flags;
480 };
481 #endif
482 int
483 sys_msync(struct thread *td, struct msync_args *uap)
484 {
485
486 return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
487 }
488
489 int
490 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
491 {
492 vm_offset_t addr;
493 vm_size_t pageoff;
494 vm_map_t map;
495 int rv;
496
497 addr = addr0;
498 pageoff = (addr & PAGE_MASK);
499 addr -= pageoff;
500 size += pageoff;
501 size = (vm_size_t) round_page(size);
502 if (addr + size < addr)
503 return (EINVAL);
504
505 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
506 return (EINVAL);
507
508 map = &td->td_proc->p_vmspace->vm_map;
509
510 /*
511 * Clean the pages and interpret the return value.
512 */
513 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
514 (flags & MS_INVALIDATE) != 0);
515 switch (rv) {
516 case KERN_SUCCESS:
517 return (0);
518 case KERN_INVALID_ADDRESS:
519 return (ENOMEM);
520 case KERN_INVALID_ARGUMENT:
521 return (EBUSY);
522 case KERN_FAILURE:
523 return (EIO);
524 default:
525 return (EINVAL);
526 }
527 }
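/*
 * Userland sketch of the flag handling above (illustrative; addr and len
 * are assumed to describe an existing mapping): MS_SYNC, the default when
 * MS_ASYNC is absent, requests a synchronous vm_map_sync(), while MS_ASYNC
 * combined with MS_INVALIDATE is rejected with EINVAL.
 *
 *	#include <sys/mman.h>
 *
 *	msync(addr, len, MS_SYNC);
 *	msync(addr, len, MS_ASYNC | MS_INVALIDATE);	(EINVAL)
 */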
528
529 #ifndef _SYS_SYSPROTO_H_
530 struct munmap_args {
531 void *addr;
532 size_t len;
533 };
534 #endif
535 int
536 sys_munmap(struct thread *td, struct munmap_args *uap)
537 {
538
539 return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
540 }
541
542 int
543 kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
544 {
545 #ifdef HWPMC_HOOKS
546 struct pmckern_map_out pkm;
547 vm_map_entry_t entry;
548 bool pmc_handled;
549 #endif
550 vm_offset_t addr, end;
551 vm_size_t pageoff;
552 vm_map_t map;
553 int rv;
554
555 if (size == 0)
556 return (EINVAL);
557
558 addr = addr0;
559 pageoff = (addr & PAGE_MASK);
560 addr -= pageoff;
561 size += pageoff;
562 size = (vm_size_t) round_page(size);
563 end = addr + size;
564 map = &td->td_proc->p_vmspace->vm_map;
565 if (!vm_map_range_valid(map, addr, end))
566 return (EINVAL);
567
568 vm_map_lock(map);
569 #ifdef HWPMC_HOOKS
570 pmc_handled = false;
571 if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
572 pmc_handled = true;
573 /*
574 * Inform hwpmc if the address range being unmapped contains
575 * an executable region.
576 */
577 pkm.pm_address = (uintptr_t) NULL;
578 if (vm_map_lookup_entry(map, addr, &entry)) {
579 for (; entry->start < end;
580 entry = vm_map_entry_succ(entry)) {
581 if (vm_map_check_protection(map, entry->start,
582 entry->end, VM_PROT_EXECUTE) == TRUE) {
583 pkm.pm_address = (uintptr_t) addr;
584 pkm.pm_size = (size_t) size;
585 break;
586 }
587 }
588 }
589 }
590 #endif
591 rv = vm_map_delete(map, addr, end);
592
593 #ifdef HWPMC_HOOKS
594 if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
595 /* downgrade the lock to prevent a LOR with the pmc-sx lock */
596 vm_map_lock_downgrade(map);
597 if (pkm.pm_address != (uintptr_t) NULL)
598 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
599 vm_map_unlock_read(map);
600 } else
601 #endif
602 vm_map_unlock(map);
603
604 return (vm_mmap_to_errno(rv));
605 }
606
607 #ifndef _SYS_SYSPROTO_H_
608 struct mprotect_args {
609 const void *addr;
610 size_t len;
611 int prot;
612 };
613 #endif
614 int
615 sys_mprotect(struct thread *td, struct mprotect_args *uap)
616 {
617
618 return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len,
619 uap->prot, 0));
620 }
621
622 int
623 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot,
624 int flags)
625 {
626 vm_offset_t addr;
627 vm_size_t pageoff;
628 int vm_error, max_prot;
629
630 addr = addr0;
631 if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
632 return (EINVAL);
633 max_prot = PROT_MAX_EXTRACT(prot);
634 prot = PROT_EXTRACT(prot);
635 pageoff = (addr & PAGE_MASK);
636 addr -= pageoff;
637 size += pageoff;
638 size = (vm_size_t) round_page(size);
639 #ifdef COMPAT_FREEBSD32
640 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
641 if (((addr + size) & 0xffffffff) < addr)
642 return (EINVAL);
643 } else
644 #endif
645 if (addr + size < addr)
646 return (EINVAL);
647
648 flags |= VM_MAP_PROTECT_SET_PROT;
649 if (max_prot != 0)
650 flags |= VM_MAP_PROTECT_SET_MAXPROT;
651 vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
652 addr, addr + size, prot, max_prot, flags);
653
654 switch (vm_error) {
655 case KERN_SUCCESS:
656 return (0);
657 case KERN_PROTECTION_FAILURE:
658 return (EACCES);
659 case KERN_RESOURCE_SHORTAGE:
660 return (ENOMEM);
661 case KERN_OUT_OF_BOUNDS:
662 return (ENOTSUP);
663 }
664 return (EINVAL);
665 }
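/*
 * A hedged userland sketch of the error mapping above (p is assumed to be
 * a mapping whose maximum protection excludes PROT_WRITE): raising the
 * protection past the maximum surfaces as EACCES, while a range outside
 * the map surfaces as ENOMEM.
 *
 *	#include <sys/mman.h>
 *	#include <errno.h>
 *
 *	int rv = mprotect(p, 4096, PROT_READ | PROT_WRITE);
 *	(rv == -1 with errno EACCES when the request exceeds PROT_MAX)
 */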
666
667 #ifndef _SYS_SYSPROTO_H_
668 struct minherit_args {
669 void *addr;
670 size_t len;
671 int inherit;
672 };
673 #endif
674 int
675 sys_minherit(struct thread *td, struct minherit_args *uap)
676 {
677
678 return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
679 uap->inherit));
680 }
681
682 int
683 kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
684 {
685 vm_offset_t addr;
686 vm_size_t size, pageoff;
687 vm_inherit_t inherit;
688
689 addr = (vm_offset_t)addr0;
690 size = len;
691 inherit = inherit0;
692
693 pageoff = (addr & PAGE_MASK);
694 addr -= pageoff;
695 size += pageoff;
696 size = (vm_size_t) round_page(size);
697 if (addr + size < addr)
698 return (EINVAL);
699
700 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
701 addr + size, inherit)) {
702 case KERN_SUCCESS:
703 return (0);
704 case KERN_PROTECTION_FAILURE:
705 return (EACCES);
706 }
707 return (EINVAL);
708 }
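/*
 * Illustrative userland sketch of the inheritance values accepted above
 * (addr and len are assumed, error handling omitted): INHERIT_NONE leaves
 * the range unmapped in a forked child, and INHERIT_ZERO gives the child
 * zero-filled pages.
 *
 *	#include <sys/mman.h>
 *
 *	minherit(addr, len, INHERIT_NONE);
 *	minherit(addr, len, INHERIT_ZERO);
 */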
709
710 #ifndef _SYS_SYSPROTO_H_
711 struct madvise_args {
712 void *addr;
713 size_t len;
714 int behav;
715 };
716 #endif
717
718 int
719 sys_madvise(struct thread *td, struct madvise_args *uap)
720 {
721
722 return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
723 }
724
725 int
726 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
727 {
728 vm_map_t map;
729 vm_offset_t addr, end, start;
730 int flags;
731
732 /*
733 * Check for our special case, advising the swap pager we are
734 * "immortal."
735 */
736 if (behav == MADV_PROTECT) {
737 flags = PPROT_SET;
738 return (kern_procctl(td, P_PID, td->td_proc->p_pid,
739 PROC_SPROTECT, &flags));
740 }
741
742 /*
743 * Check for illegal addresses. Watch out for address wrap... Note
744 * that VM_*_ADDRESS are not constants due to casts (argh).
745 */
746 map = &td->td_proc->p_vmspace->vm_map;
747 addr = addr0;
748 if (!vm_map_range_valid(map, addr, addr + len))
749 return (EINVAL);
750
751 /*
752 * Since this routine is only advisory, we default to conservative
753 * behavior.
754 */
755 start = trunc_page(addr);
756 end = round_page(addr + len);
757
758 /*
759 * vm_map_madvise() checks for illegal values of behav.
760 */
761 return (vm_map_madvise(map, start, end, behav));
762 }
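/*
 * Userland sketch (illustrative only; addr and len are assumed): apart
 * from the privileged MADV_PROTECT case handled above, the advice is
 * passed straight to vm_map_madvise(), which validates behav itself.
 *
 *	#include <sys/mman.h>
 *
 *	madvise(addr, len, MADV_WILLNEED);
 *	madvise(addr, len, MADV_FREE);
 */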
763
764 #ifndef _SYS_SYSPROTO_H_
765 struct mincore_args {
766 const void *addr;
767 size_t len;
768 char *vec;
769 };
770 #endif
771
772 int
773 sys_mincore(struct thread *td, struct mincore_args *uap)
774 {
775
776 return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
777 }
778
779 int
780 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
781 {
782 pmap_t pmap;
783 vm_map_t map;
784 vm_map_entry_t current, entry;
785 vm_object_t object;
786 vm_offset_t addr, cend, end, first_addr;
787 vm_paddr_t pa;
788 vm_page_t m;
789 vm_pindex_t pindex;
790 int error, lastvecindex, mincoreinfo, vecindex;
791 unsigned int timestamp;
792
793 /*
794 * Make sure that the addresses presented are valid for user
795 * mode.
796 */
797 first_addr = addr = trunc_page(addr0);
798 end = round_page(addr0 + len);
799 map = &td->td_proc->p_vmspace->vm_map;
800 if (end > vm_map_max(map) || end < addr)
801 return (ENOMEM);
802
803 pmap = vmspace_pmap(td->td_proc->p_vmspace);
804
805 vm_map_lock_read(map);
806 RestartScan:
807 timestamp = map->timestamp;
808
809 if (!vm_map_lookup_entry(map, addr, &entry)) {
810 vm_map_unlock_read(map);
811 return (ENOMEM);
812 }
813
814 /*
815 * Do this on a map entry basis so that if the pages are not
816 * in the current process's address space, we can easily look
817 * up the pages elsewhere.
818 */
819 lastvecindex = -1;
820 while (entry->start < end) {
821 /*
822 * check for contiguity
823 */
824 current = entry;
825 entry = vm_map_entry_succ(current);
826 if (current->end < end &&
827 entry->start > current->end) {
828 vm_map_unlock_read(map);
829 return (ENOMEM);
830 }
831
832 /*
833 * ignore submaps (for now) or null objects
834 */
835 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
836 current->object.vm_object == NULL)
837 continue;
838
839 /*
840 * limit this scan to the current map entry and the
841 * limits for the mincore call
842 */
843 if (addr < current->start)
844 addr = current->start;
845 cend = current->end;
846 if (cend > end)
847 cend = end;
848
849 for (; addr < cend; addr += PAGE_SIZE) {
850 /*
851 * Check the pmap first; it is likely faster and can
852 * also tell whether we are the one referencing or
853 * modifying the page.
854 */
855 m = NULL;
856 object = NULL;
857 retry:
858 pa = 0;
859 mincoreinfo = pmap_mincore(pmap, addr, &pa);
860 if (mincore_mapped) {
861 /*
862 * We only care about this pmap's
863 * mapping of the page, if any.
864 */
865 ;
866 } else if (pa != 0) {
867 /*
868 * The page is mapped by this process but not
869 * both accessed and modified. It is also
870 * managed. Acquire the object lock so that
871 * other mappings might be examined. The page's
872 * identity may change at any point before its
873 * object lock is acquired, so re-validate if
874 * necessary.
875 */
876 m = PHYS_TO_VM_PAGE(pa);
877 while (object == NULL || m->object != object) {
878 if (object != NULL)
879 VM_OBJECT_WUNLOCK(object);
880 object = atomic_load_ptr(&m->object);
881 if (object == NULL)
882 goto retry;
883 VM_OBJECT_WLOCK(object);
884 }
885 if (pa != pmap_extract(pmap, addr))
886 goto retry;
887 KASSERT(vm_page_all_valid(m),
888 ("mincore: page %p is mapped but invalid",
889 m));
890 } else if (mincoreinfo == 0) {
891 /*
892 * The page is not mapped by this process. If
893 * the object implements managed pages, then
894 * determine if the page is resident so that
895 * the mappings might be examined.
896 */
897 if (current->object.vm_object != object) {
898 if (object != NULL)
899 VM_OBJECT_WUNLOCK(object);
900 object = current->object.vm_object;
901 VM_OBJECT_WLOCK(object);
902 }
903 if ((object->flags & OBJ_SWAP) != 0 ||
904 object->type == OBJT_VNODE) {
905 pindex = OFF_TO_IDX(current->offset +
906 (addr - current->start));
907 m = vm_page_lookup(object, pindex);
908 if (m != NULL && vm_page_none_valid(m))
909 m = NULL;
910 if (m != NULL)
911 mincoreinfo = MINCORE_INCORE;
912 }
913 }
914 if (m != NULL) {
915 VM_OBJECT_ASSERT_WLOCKED(m->object);
916
917 /* Examine other mappings of the page. */
918 if (m->dirty == 0 && pmap_is_modified(m))
919 vm_page_dirty(m);
920 if (m->dirty != 0)
921 mincoreinfo |= MINCORE_MODIFIED_OTHER;
922
923 /*
924 * The first test for PGA_REFERENCED is an
925 * optimization. The second test is
926 * required because a concurrent pmap
927 * operation could clear the last reference
928 * and set PGA_REFERENCED before the call to
929 * pmap_is_referenced().
930 */
931 if ((m->a.flags & PGA_REFERENCED) != 0 ||
932 pmap_is_referenced(m) ||
933 (m->a.flags & PGA_REFERENCED) != 0)
934 mincoreinfo |= MINCORE_REFERENCED_OTHER;
935 }
936 if (object != NULL)
937 VM_OBJECT_WUNLOCK(object);
938
939 /*
940 * subyte may page fault. In case it needs to modify
941 * the map, we release the lock.
942 */
943 vm_map_unlock_read(map);
944
945 /*
946 * calculate index into user supplied byte vector
947 */
948 vecindex = atop(addr - first_addr);
949
950 /*
951 * If we have skipped map entries, we need to make sure that
952 * the byte vector is zeroed for those skipped entries.
953 */
954 while ((lastvecindex + 1) < vecindex) {
955 ++lastvecindex;
956 error = subyte(vec + lastvecindex, 0);
957 if (error) {
958 error = EFAULT;
959 goto done2;
960 }
961 }
962
963 /*
964 * Pass the page information to the user
965 */
966 error = subyte(vec + vecindex, mincoreinfo);
967 if (error) {
968 error = EFAULT;
969 goto done2;
970 }
971
972 /*
973 * If the map has changed, due to the subyte, the previous
974 * output may be invalid.
975 */
976 vm_map_lock_read(map);
977 if (timestamp != map->timestamp)
978 goto RestartScan;
979
980 lastvecindex = vecindex;
981 }
982 }
983
984 /*
985 * subyte may page fault. In case it needs to modify
986 * the map, we release the lock.
987 */
988 vm_map_unlock_read(map);
989
990 /*
991 * Zero the last entries in the byte vector.
992 */
993 vecindex = atop(end - first_addr);
994 while ((lastvecindex + 1) < vecindex) {
995 ++lastvecindex;
996 error = subyte(vec + lastvecindex, 0);
997 if (error) {
998 error = EFAULT;
999 goto done2;
1000 }
1001 }
1002
1003 /*
1004 * If the map has changed, due to the subyte, the previous
1005 * output may be invalid.
1006 */
1007 vm_map_lock_read(map);
1008 if (timestamp != map->timestamp)
1009 goto RestartScan;
1010 vm_map_unlock_read(map);
1011 done2:
1012 return (error);
1013 }
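/*
 * Illustrative userland sketch of the per-page byte vector filled in above
 * (addr and npages are assumed, error handling omitted): one status byte
 * per page, tested against the MINCORE_* bits.
 *
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	char vec[npages];
 *	int resident = 0;
 *	if (mincore(addr, npages * getpagesize(), vec) == 0)
 *		resident = (vec[0] & MINCORE_INCORE) != 0;
 */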
1014
1015 #ifndef _SYS_SYSPROTO_H_
1016 struct mlock_args {
1017 const void *addr;
1018 size_t len;
1019 };
1020 #endif
1021 int
1022 sys_mlock(struct thread *td, struct mlock_args *uap)
1023 {
1024
1025 return (kern_mlock(td->td_proc, td->td_ucred,
1026 __DECONST(uintptr_t, uap->addr), uap->len));
1027 }
1028
1029 int
1030 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
1031 {
1032 vm_offset_t addr, end, last, start;
1033 vm_size_t npages, size;
1034 vm_map_t map;
1035 unsigned long nsize;
1036 int error;
1037
1038 error = priv_check_cred(cred, PRIV_VM_MLOCK);
1039 if (error)
1040 return (error);
1041 addr = addr0;
1042 size = len;
1043 last = addr + size;
1044 start = trunc_page(addr);
1045 end = round_page(last);
1046 if (last < addr || end < addr)
1047 return (EINVAL);
1048 npages = atop(end - start);
1049 if (npages > vm_page_max_user_wired)
1050 return (ENOMEM);
1051 map = &proc->p_vmspace->vm_map;
1052 PROC_LOCK(proc);
1053 nsize = ptoa(npages + pmap_wired_count(map->pmap));
1054 if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
1055 PROC_UNLOCK(proc);
1056 return (ENOMEM);
1057 }
1058 PROC_UNLOCK(proc);
1059 #ifdef RACCT
1060 if (racct_enable) {
1061 PROC_LOCK(proc);
1062 error = racct_set(proc, RACCT_MEMLOCK, nsize);
1063 PROC_UNLOCK(proc);
1064 if (error != 0)
1065 return (ENOMEM);
1066 }
1067 #endif
1068 error = vm_map_wire(map, start, end,
1069 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1070 #ifdef RACCT
1071 if (racct_enable && error != KERN_SUCCESS) {
1072 PROC_LOCK(proc);
1073 racct_set(proc, RACCT_MEMLOCK,
1074 ptoa(pmap_wired_count(map->pmap)));
1075 PROC_UNLOCK(proc);
1076 }
1077 #endif
1078 switch (error) {
1079 case KERN_SUCCESS:
1080 return (0);
1081 case KERN_INVALID_ARGUMENT:
1082 return (EINVAL);
1083 default:
1084 return (ENOMEM);
1085 }
1086 }
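/*
 * A hedged userland sketch of the accounting above (addr and len are
 * assumed): wiring is bounded by vm_page_max_user_wired and by
 * RLIMIT_MEMLOCK, so callers should be prepared for ENOMEM.
 *
 *	#include <sys/mman.h>
 *	#include <errno.h>
 *
 *	int rv = mlock(addr, len);
 *	(rv == -1 with errno ENOMEM once the wired-page limits are hit)
 */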
1087
1088 #ifndef _SYS_SYSPROTO_H_
1089 struct mlockall_args {
1090 int how;
1091 };
1092 #endif
1093
1094 int
1095 sys_mlockall(struct thread *td, struct mlockall_args *uap)
1096 {
1097 vm_map_t map;
1098 int error;
1099
1100 map = &td->td_proc->p_vmspace->vm_map;
1101 error = priv_check(td, PRIV_VM_MLOCK);
1102 if (error)
1103 return (error);
1104
1105 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1106 return (EINVAL);
1107
1108 /*
1109 * If wiring all pages in the process would cause it to exceed
1110 * a hard resource limit, return ENOMEM.
1111 */
1112 if (!old_mlock && uap->how & MCL_CURRENT) {
1113 if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
1114 return (ENOMEM);
1115 }
1116 #ifdef RACCT
1117 if (racct_enable) {
1118 PROC_LOCK(td->td_proc);
1119 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1120 PROC_UNLOCK(td->td_proc);
1121 if (error != 0)
1122 return (ENOMEM);
1123 }
1124 #endif
1125
1126 if (uap->how & MCL_FUTURE) {
1127 vm_map_lock(map);
1128 vm_map_modflags(map, MAP_WIREFUTURE, 0);
1129 vm_map_unlock(map);
1130 error = 0;
1131 }
1132
1133 if (uap->how & MCL_CURRENT) {
1134 /*
1135 * P1003.1-2001 mandates that all currently mapped pages
1136 * will be memory resident and locked (wired) upon return
1137 * from mlockall(). vm_map_wire() will wire pages, by
1138 * calling vm_fault_wire() for each page in the region.
1139 */
1140 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1141 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1142 if (error == KERN_SUCCESS)
1143 error = 0;
1144 else if (error == KERN_RESOURCE_SHORTAGE)
1145 error = ENOMEM;
1146 else
1147 error = EAGAIN;
1148 }
1149 #ifdef RACCT
1150 if (racct_enable && error != KERN_SUCCESS) {
1151 PROC_LOCK(td->td_proc);
1152 racct_set(td->td_proc, RACCT_MEMLOCK,
1153 ptoa(pmap_wired_count(map->pmap)));
1154 PROC_UNLOCK(td->td_proc);
1155 }
1156 #endif
1157
1158 return (error);
1159 }
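/*
 * Illustrative userland sketch of the two flags handled above (error
 * handling omitted): MCL_CURRENT wires everything already mapped, and
 * MCL_FUTURE sets MAP_WIREFUTURE so later mappings are wired as they
 * are created.
 *
 *	#include <sys/mman.h>
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 */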
1160
1161 #ifndef _SYS_SYSPROTO_H_
1162 struct munlockall_args {
1163 register_t dummy;
1164 };
1165 #endif
1166
1167 int
1168 sys_munlockall(struct thread *td, struct munlockall_args *uap)
1169 {
1170 vm_map_t map;
1171 int error;
1172
1173 map = &td->td_proc->p_vmspace->vm_map;
1174 error = priv_check(td, PRIV_VM_MUNLOCK);
1175 if (error)
1176 return (error);
1177
1178 /* Clear the MAP_WIREFUTURE flag from this vm_map. */
1179 vm_map_lock(map);
1180 vm_map_modflags(map, 0, MAP_WIREFUTURE);
1181 vm_map_unlock(map);
1182
1183 /* Forcibly unwire all pages. */
1184 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1185 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1186 #ifdef RACCT
1187 if (racct_enable && error == KERN_SUCCESS) {
1188 PROC_LOCK(td->td_proc);
1189 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1190 PROC_UNLOCK(td->td_proc);
1191 }
1192 #endif
1193
1194 return (error);
1195 }
1196
1197 #ifndef _SYS_SYSPROTO_H_
1198 struct munlock_args {
1199 const void *addr;
1200 size_t len;
1201 };
1202 #endif
1203 int
1204 sys_munlock(struct thread *td, struct munlock_args *uap)
1205 {
1206
1207 return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
1208 }
1209
1210 int
1211 kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
1212 {
1213 vm_offset_t addr, end, last, start;
1214 #ifdef RACCT
1215 vm_map_t map;
1216 #endif
1217 int error;
1218
1219 error = priv_check(td, PRIV_VM_MUNLOCK);
1220 if (error)
1221 return (error);
1222 addr = addr0;
1223 last = addr + size;
1224 start = trunc_page(addr);
1225 end = round_page(last);
1226 if (last < addr || end < addr)
1227 return (EINVAL);
1228 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1229 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1230 #ifdef RACCT
1231 if (racct_enable && error == KERN_SUCCESS) {
1232 PROC_LOCK(td->td_proc);
1233 map = &td->td_proc->p_vmspace->vm_map;
1234 racct_set(td->td_proc, RACCT_MEMLOCK,
1235 ptoa(pmap_wired_count(map->pmap)));
1236 PROC_UNLOCK(td->td_proc);
1237 }
1238 #endif
1239 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1240 }
1241
1242 /*
1243 * vm_mmap_vnode()
1244 *
1245 * Helper function for vm_mmap. Performs sanity checks specific to mmap
1246 * operations on vnodes.
1247 */
1248 int
1249 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1250 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1251 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1252 boolean_t *writecounted)
1253 {
1254 struct vattr va;
1255 vm_object_t obj;
1256 vm_ooffset_t foff;
1257 struct ucred *cred;
1258 int error, flags;
1259 bool writex;
1260
1261 cred = td->td_ucred;
1262 writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
1263 (*flagsp & MAP_SHARED) != 0;
1264 if ((error = vget(vp, LK_SHARED)) != 0)
1265 return (error);
1266 AUDIT_ARG_VNODE1(vp);
1267 foff = *foffp;
1268 flags = *flagsp;
1269 obj = vp->v_object;
1270 if (vp->v_type == VREG) {
1271 /*
1272 * Get the proper underlying object
1273 */
1274 if (obj == NULL) {
1275 error = EINVAL;
1276 goto done;
1277 }
1278 if (obj->type == OBJT_VNODE && obj->handle != vp) {
1279 vput(vp);
1280 vp = (struct vnode *)obj->handle;
1281 /*
1282 * Bypass filesystems obey the mpsafety of the
1283 * underlying fs. Tmpfs never bypasses.
1284 */
1285 error = vget(vp, LK_SHARED);
1286 if (error != 0)
1287 return (error);
1288 }
1289 if (writex) {
1290 *writecounted = TRUE;
1291 vm_pager_update_writecount(obj, 0, objsize);
1292 }
1293 } else {
1294 error = EINVAL;
1295 goto done;
1296 }
1297 if ((error = VOP_GETATTR(vp, &va, cred)))
1298 goto done;
1299 #ifdef MAC
1300 /* This relies on VM_PROT_* matching PROT_*. */
1301 error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
1302 if (error != 0)
1303 goto done;
1304 #endif
1305 if ((flags & MAP_SHARED) != 0) {
1306 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1307 if (prot & VM_PROT_WRITE) {
1308 error = EPERM;
1309 goto done;
1310 }
1311 *maxprotp &= ~VM_PROT_WRITE;
1312 }
1313 }
1314 /*
1315 * If it is a regular file without any references,
1316 * we do not need to sync it.
1317 * Adjust the object size to be the size of the actual file.
1318 */
1319 objsize = round_page(va.va_size);
1320 if (va.va_nlink == 0)
1321 flags |= MAP_NOSYNC;
1322 if (obj->type == OBJT_VNODE) {
1323 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1324 cred);
1325 if (obj == NULL) {
1326 error = ENOMEM;
1327 goto done;
1328 }
1329 } else {
1330 KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
1331 vm_object_reference(obj);
1332 #if VM_NRESERVLEVEL > 0
1333 if ((obj->flags & OBJ_COLORED) == 0) {
1334 VM_OBJECT_WLOCK(obj);
1335 vm_object_color(obj, 0);
1336 VM_OBJECT_WUNLOCK(obj);
1337 }
1338 #endif
1339 }
1340 *objp = obj;
1341 *flagsp = flags;
1342
1343 VOP_MMAPPED(vp);
1344
1345 done:
1346 if (error != 0 && *writecounted) {
1347 *writecounted = FALSE;
1348 vm_pager_update_writecount(obj, objsize, 0);
1349 }
1350 vput(vp);
1351 return (error);
1352 }
1353
1354 /*
1355 * vm_mmap_cdev()
1356 *
1357 * Helper function for vm_mmap. Performs sanity checks specific to mmap
1358 * operations on cdevs.
1359 */
1360 int
1361 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1362 vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
1363 vm_ooffset_t *foff, vm_object_t *objp)
1364 {
1365 vm_object_t obj;
1366 int error, flags;
1367
1368 flags = *flagsp;
1369
1370 if (dsw->d_flags & D_MMAP_ANON) {
1371 *objp = NULL;
1372 *foff = 0;
1373 *maxprotp = VM_PROT_ALL;
1374 *flagsp |= MAP_ANON;
1375 return (0);
1376 }
1377 /*
1378 * cdevs do not provide private mappings of any kind.
1379 */
1380 if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1381 (prot & VM_PROT_WRITE) != 0)
1382 return (EACCES);
1383 if (flags & (MAP_PRIVATE|MAP_COPY))
1384 return (EINVAL);
1385 /*
1386 * Force device mappings to be shared.
1387 */
1388 flags |= MAP_SHARED;
1389 #ifdef MAC_XXX
1390 error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
1391 if (error != 0)
1392 return (error);
1393 #endif
1394 /*
1395 * First, try d_mmap_single(). If that is not implemented
1396 * (returns ENODEV), fall back to using the device pager.
1397 * Note that d_mmap_single() must return a reference to the
1398 * object (it needs to bump the reference count of the object
1399 * it returns somehow).
1400 *
1401 * XXX assumes VM_PROT_* == PROT_*
1402 */
1403 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1404 if (error != ENODEV)
1405 return (error);
1406 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1407 td->td_ucred);
1408 if (obj == NULL)
1409 return (EINVAL);
1410 VM_OBJECT_WLOCK(obj);
1411 vm_object_set_flag(obj, OBJ_CDEVH);
1412 VM_OBJECT_WUNLOCK(obj);
1413 *objp = obj;
1414 *flagsp = flags;
1415 return (0);
1416 }
1417
1418 int
1419 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1420 vm_prot_t maxprot, int flags,
1421 objtype_t handle_type, void *handle,
1422 vm_ooffset_t foff)
1423 {
1424 vm_object_t object;
1425 struct thread *td = curthread;
1426 int error;
1427 boolean_t writecounted;
1428
1429 if (size == 0)
1430 return (EINVAL);
1431
1432 size = round_page(size);
1433 object = NULL;
1434 writecounted = FALSE;
1435
1436 switch (handle_type) {
1437 case OBJT_DEVICE: {
1438 struct cdevsw *dsw;
1439 struct cdev *cdev;
1440 int ref;
1441
1442 cdev = handle;
1443 dsw = dev_refthread(cdev, &ref);
1444 if (dsw == NULL)
1445 return (ENXIO);
1446 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
1447 dsw, &foff, &object);
1448 dev_relthread(cdev, ref);
1449 break;
1450 }
1451 case OBJT_VNODE:
1452 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1453 handle, &foff, &object, &writecounted);
1454 break;
1455 default:
1456 error = EINVAL;
1457 break;
1458 }
1459 if (error)
1460 return (error);
1461
1462 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1463 foff, writecounted, td);
1464 if (error != 0 && object != NULL) {
1465 /*
1466 * If this mapping was accounted for in the vnode's
1467 * writecount, then undo that now.
1468 */
1469 if (writecounted)
1470 vm_pager_release_writecount(object, 0, size);
1471 vm_object_deallocate(object);
1472 }
1473 return (error);
1474 }
1475
1476 int
1477 kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
1478 {
1479 int error;
1480
1481 RACCT_PROC_LOCK(td->td_proc);
1482 if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
1483 RACCT_PROC_UNLOCK(td->td_proc);
1484 return (ENOMEM);
1485 }
1486 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1487 RACCT_PROC_UNLOCK(td->td_proc);
1488 return (ENOMEM);
1489 }
1490 if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1491 if (ptoa(pmap_wired_count(map->pmap)) + size >
1492 lim_cur(td, RLIMIT_MEMLOCK)) {
1493 racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1494 RACCT_PROC_UNLOCK(td->td_proc);
1495 return (ENOMEM);
1496 }
1497 error = racct_set(td->td_proc, RACCT_MEMLOCK,
1498 ptoa(pmap_wired_count(map->pmap)) + size);
1499 if (error != 0) {
1500 racct_set_force(td->td_proc, RACCT_VMEM, map->size);
1501 RACCT_PROC_UNLOCK(td->td_proc);
1502 return (error);
1503 }
1504 }
1505 RACCT_PROC_UNLOCK(td->td_proc);
1506 return (0);
1507 }
1508
1509 /*
1510 * Internal version of mmap that maps a specific VM object into a
1511 * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
1512 */
1513 int
1514 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1515 vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
1516 boolean_t writecounted, struct thread *td)
1517 {
1518 vm_offset_t default_addr, max_addr;
1519 int docow, error, findspace, rv;
1520 bool curmap, fitit;
1521
1522 curmap = map == &td->td_proc->p_vmspace->vm_map;
1523 if (curmap) {
1524 error = kern_mmap_racct_check(td, map, size);
1525 if (error != 0)
1526 return (error);
1527 }
1528
1529 /*
1530 * We currently can only deal with page aligned file offsets.
1531 * The mmap() system call already enforces this by subtracting
1532 * the page offset from the file offset, but checking here
1533 * catches errors in device drivers (e.g. d_mmap_single()
1534 * callbacks) and other internal mapping requests (such as in
1535 * exec).
1536 */
1537 if (foff & PAGE_MASK)
1538 return (EINVAL);
1539
1540 if ((flags & MAP_FIXED) == 0) {
1541 fitit = true;
1542 *addr = round_page(*addr);
1543 } else {
1544 if (*addr != trunc_page(*addr))
1545 return (EINVAL);
1546 fitit = false;
1547 }
1548
1549 if (flags & MAP_ANON) {
1550 if (object != NULL || foff != 0)
1551 return (EINVAL);
1552 docow = 0;
1553 } else if (flags & MAP_PREFAULT_READ)
1554 docow = MAP_PREFAULT;
1555 else
1556 docow = MAP_PREFAULT_PARTIAL;
1557
1558 if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1559 docow |= MAP_COPY_ON_WRITE;
1560 if (flags & MAP_NOSYNC)
1561 docow |= MAP_DISABLE_SYNCER;
1562 if (flags & MAP_NOCORE)
1563 docow |= MAP_DISABLE_COREDUMP;
1564 /* Shared memory is also shared with children. */
1565 if (flags & MAP_SHARED)
1566 docow |= MAP_INHERIT_SHARE;
1567 if (writecounted)
1568 docow |= MAP_WRITECOUNT;
1569 if (flags & MAP_STACK) {
1570 if (object != NULL)
1571 return (EINVAL);
1572 docow |= MAP_STACK_GROWS_DOWN;
1573 }
1574 if ((flags & MAP_EXCL) != 0)
1575 docow |= MAP_CHECK_EXCL;
1576 if ((flags & MAP_GUARD) != 0)
1577 docow |= MAP_CREATE_GUARD;
1578
1579 if (fitit) {
1580 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
1581 findspace = VMFS_SUPER_SPACE;
1582 else if ((flags & MAP_ALIGNMENT_MASK) != 0)
1583 findspace = VMFS_ALIGNED_SPACE(flags >>
1584 MAP_ALIGNMENT_SHIFT);
1585 else
1586 findspace = VMFS_OPTIMAL_SPACE;
1587 max_addr = 0;
1588 if ((flags & MAP_32BIT) != 0)
1589 max_addr = MAP_32BIT_MAX_ADDR;
1590 if (curmap) {
1591 default_addr =
1592 round_page((vm_offset_t)td->td_proc->p_vmspace->
1593 vm_daddr + lim_max(td, RLIMIT_DATA));
1594 if ((flags & MAP_32BIT) != 0)
1595 default_addr = 0;
1596 rv = vm_map_find_min(map, object, foff, addr, size,
1597 default_addr, max_addr, findspace, prot, maxprot,
1598 docow);
1599 } else {
1600 rv = vm_map_find(map, object, foff, addr, size,
1601 max_addr, findspace, prot, maxprot, docow);
1602 }
1603 } else {
1604 rv = vm_map_fixed(map, object, foff, *addr, size,
1605 prot, maxprot, docow);
1606 }
1607
1608 if (rv == KERN_SUCCESS) {
1609 /*
1610 * If the process has requested that all future mappings
1611 * be wired, then heed this.
1612 */
1613 if ((map->flags & MAP_WIREFUTURE) != 0) {
1614 vm_map_lock(map);
1615 if ((map->flags & MAP_WIREFUTURE) != 0)
1616 (void)vm_map_wire_locked(map, *addr,
1617 *addr + size, VM_MAP_WIRE_USER |
1618 ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
1619 VM_MAP_WIRE_NOHOLES));
1620 vm_map_unlock(map);
1621 }
1622 }
1623 return (vm_mmap_to_errno(rv));
1624 }
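/*
 * Illustrative userland sketch of the alignment requests decoded above into
 * VMFS_SUPER_SPACE/VMFS_ALIGNED_SPACE() (error handling omitted):
 * MAP_ALIGNED(n) asks for a 2^n-byte aligned address and MAP_ALIGNED_SUPER
 * asks for superpage alignment.
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, 2 * 1024 * 1024, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED_SUPER, -1, 0);
 */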
1625
1626 /*
1627 * Translate a Mach VM return code to zero on success or the appropriate errno
1628 * on failure.
1629 */
1630 int
1631 vm_mmap_to_errno(int rv)
1632 {
1633
1634 switch (rv) {
1635 case KERN_SUCCESS:
1636 return (0);
1637 case KERN_INVALID_ADDRESS:
1638 case KERN_NO_SPACE:
1639 return (ENOMEM);
1640 case KERN_PROTECTION_FAILURE:
1641 return (EACCES);
1642 default:
1643 return (EINVAL);
1644 }
1645 }
1646