xref: /illumos-gate/usr/src/uts/common/os/grow.c (revision 13b136d3061155363c62c9f6568d25b8b27da8f6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
24  * Copyright 2017 Joyent, Inc.
25  */
26 
27 /*
28  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
32 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
33 /*	  All Rights Reserved  	*/
34 
35 #include <sys/types.h>
36 #include <sys/inttypes.h>
37 #include <sys/param.h>
38 #include <sys/sysmacros.h>
39 #include <sys/systm.h>
40 #include <sys/signal.h>
41 #include <sys/user.h>
42 #include <sys/errno.h>
43 #include <sys/var.h>
44 #include <sys/proc.h>
45 #include <sys/tuneable.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/cred.h>
49 #include <sys/vnode.h>
50 #include <sys/vfs.h>
51 #include <sys/vm.h>
52 #include <sys/file.h>
53 #include <sys/mman.h>
54 #include <sys/vmparam.h>
55 #include <sys/fcntl.h>
56 #include <sys/lwpchan_impl.h>
57 #include <sys/nbmlock.h>
58 
59 #include <vm/hat.h>
60 #include <vm/as.h>
61 #include <vm/seg.h>
62 #include <vm/seg_dev.h>
63 #include <vm/seg_vn.h>
64 
/*
 * Switches for automatic large-page selection.  brk() takes the brk_lpg()
 * path only when use_brk_lpg is non-zero, and grow() takes the grow_lpg()
 * path only when use_stk_lpg is non-zero; in both cases the process must
 * also have SAUTOLPG set in p_flag.
 */
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/*
 * If set, we will not randomize mappings where the 'addr' argument is
 * non-NULL and not an alignment (that is, MAP_ALIGN was not requested).
 * Consulted by RANDOMIZABLE_MAPPING().
 */
int aslr_respect_mmap_hint = 1;

/* Large-page variants of heap and stack growth, defined later in this file. */
static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);
76 
77 intptr_t
78 brk(caddr_t nva)
79 {
80 	int error;
81 	proc_t *p = curproc;
82 
83 	/*
84 	 * Serialize brk operations on an address space.
85 	 * This also serves as the lock protecting p_brksize
86 	 * and p_brkpageszc.
87 	 */
88 	as_rangelock(p->p_as);
89 
90 	/*
91 	 * As a special case to aid the implementation of sbrk(3C), if given a
92 	 * new brk of 0, return the current brk.  We'll hide this in brk(3C).
93 	 */
94 	if (nva == 0) {
95 		intptr_t base = (intptr_t)(p->p_brkbase + p->p_brksize);
96 		as_rangeunlock(p->p_as);
97 		return (base);
98 	}
99 
100 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
101 		error = brk_lpg(nva);
102 	} else {
103 		error = brk_internal(nva, p->p_brkpageszc);
104 	}
105 	as_rangeunlock(p->p_as);
106 	return ((error != 0 ? set_errno(error) : 0));
107 }
108 
109 /*
110  * Algorithm: call arch-specific map_pgsz to get best page size to use,
111  * then call brk_internal().
112  * Returns 0 on success.
113  */
114 static int
115 brk_lpg(caddr_t nva)
116 {
117 	struct proc *p = curproc;
118 	size_t pgsz, len;
119 	caddr_t addr, brkend;
120 	caddr_t bssbase = p->p_bssbase;
121 	caddr_t brkbase = p->p_brkbase;
122 	int oszc, szc;
123 	int err;
124 
125 	oszc = p->p_brkpageszc;
126 
127 	/*
128 	 * If p_brkbase has not yet been set, the first call
129 	 * to brk_internal() will initialize it.
130 	 */
131 	if (brkbase == 0) {
132 		return (brk_internal(nva, oszc));
133 	}
134 
135 	len = nva - bssbase;
136 
137 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
138 	szc = page_szc(pgsz);
139 
140 	/*
141 	 * Covers two cases:
142 	 * 1. page_szc() returns -1 for invalid page size, so we want to
143 	 * ignore it in that case.
144 	 * 2. By design we never decrease page size, as it is more stable.
145 	 */
146 	if (szc <= oszc) {
147 		err = brk_internal(nva, oszc);
148 		/* If failed, back off to base page size. */
149 		if (err != 0 && oszc != 0) {
150 			err = brk_internal(nva, 0);
151 		}
152 		return (err);
153 	}
154 
155 	err = brk_internal(nva, szc);
156 	/* If using szc failed, map with base page size and return. */
157 	if (err != 0) {
158 		if (szc != 0) {
159 			err = brk_internal(nva, 0);
160 		}
161 		return (err);
162 	}
163 
164 	/*
165 	 * Round up brk base to a large page boundary and remap
166 	 * anything in the segment already faulted in beyond that
167 	 * point.
168 	 */
169 	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
170 	brkend = brkbase + p->p_brksize;
171 	len = brkend - addr;
172 	/* Check that len is not negative. Update page size code for heap. */
173 	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
174 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
175 		p->p_brkpageszc = szc;
176 	}
177 
178 	ASSERT(err == 0);
179 	return (err);		/* should always be 0 */
180 }
181 
/*
 * Move the heap break of the current process to nva, adding or releasing
 * mappings as required.  brkszc is the page size code whose page size the
 * new heap end is rounded up to (0 selects PAGESIZE).
 *
 * brk() and brk_lpg() in this file call this with the address space range
 * lock held; brk() documents that lock as the one protecting p_brksize.
 *
 * Returns 0 on success.  Returns ENOMEM when nva falls below p_brkbase or
 * when the heap would grow beyond the enforced RLIMIT_DATA value, and
 * passes through the as_map() error if the extension cannot be mapped.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
	caddr_t ova;			/* current break address */
	size_t size;
	int	error;
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t pgsz;
	uint_t szc;
	rctl_qty_t as_rctl;

	/*
	 * extend heap to brkszc alignment but use current p->p_brkpageszc
	 * for the newly created segment. This allows the new extension
	 * segment to be concatenated successfully with the existing brk
	 * segment.
	 */
	if ((szc = brkszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
	} else {
		pgsz = PAGESIZE;
	}

	/* Fetch the enforced RLIMIT_DATA value while holding p_lock. */
	mutex_enter(&p->p_lock);
	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk() will initialize it.
	 */
	if (p->p_brkbase == 0)
		p->p_brkbase = nva;

	/*
	 * Before multiple page size support existed p_brksize was the value
	 * not rounded to the pagesize (i.e. it stored the exact user request
	 * for heap size). If pgsz is greater than PAGESIZE calculate the
	 * heap size as the real new heap size by rounding it up to pgsz.
	 * This is useful since we may want to know where the heap ends
	 * without knowing heap pagesize (e.g. some old code) and also if
	 * heap pagesize changes we can update p_brkpageszc but delay adding
	 * new mapping yet still know from p_brksize where the heap really
	 * ends. The user requested heap end is stored in libc variable.
	 */
	if (pgsz > PAGESIZE) {
		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
		size = tnva - p->p_brkbase;
		/*
		 * If the rounded-up end lands below the heap base, or the
		 * rounded size both grows the heap and exceeds the data
		 * limit, drop back to the base page size and the exact
		 * requested size.
		 */
		if (tnva < p->p_brkbase || (size > p->p_brksize &&
		    size > (size_t)as_rctl)) {
			szc = 0;
			pgsz = PAGESIZE;
			size = nva - p->p_brkbase;
		}
	} else {
		size = nva - p->p_brkbase;
	}

	/*
	 * use PAGESIZE to roundup ova because we want to know the real value
	 * of the current heap end in case p_brkpageszc changes since the last
	 * p_brksize was computed.
	 */
	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
	    PAGESIZE);

	/*
	 * Reject a break below the heap base, and reject growth past the
	 * data limit (shrinking is allowed even when over the limit).  The
	 * resource control action is invoked before failing.
	 */
	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
	    size > as_rctl)) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
		    RCA_SAFE);
		mutex_exit(&p->p_lock);
		return (ENOMEM);
	}

	if (nva > ova) {
		struct segvn_crargs crargs =
		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

		/* Heap is executable only if p_datprot includes PROT_EXEC. */
		if (!(p->p_datprot & PROT_EXEC)) {
			crargs.prot &= ~PROT_EXEC;
		}

		/*
		 * Add new zfod mapping to extend UNIX data segment
		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
		 * page sizes if ova is not aligned to szc's pgsz.
		 */
		if (szc > 0) {
			caddr_t rbss;

			/* rbss: bss base rounded up to the large page size. */
			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
			    pgsz);
			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
				    AS_MAP_NO_LPOOB;
			} else if (ova == rbss) {
				crargs.szc = szc;
			} else {
				crargs.szc = AS_MAP_HEAP;
			}
		} else {
			crargs.szc = AS_MAP_NO_LPOOB;
		}
		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
		    &crargs);
		if (error) {
			/* p_brksize is left untouched on failure. */
			return (error);
		}

	} else if (nva < ova) {
		/*
		 * Release mapping to shrink UNIX data segment.
		 * The as_unmap() result is deliberately ignored.
		 */
		(void) as_unmap(as, nva, (size_t)(ova - nva));
	}
	/* Record the new heap size (rounded to pgsz when large pages apply). */
	p->p_brksize = size;
	return (0);
}
310 
311 /*
312  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
313  * This routine assumes that the stack grows downward.
314  */
315 int
316 grow(caddr_t sp)
317 {
318 	struct proc *p = curproc;
319 	struct as *as = p->p_as;
320 	size_t oldsize = p->p_stksize;
321 	size_t newsize;
322 	int err;
323 
324 	/*
325 	 * Serialize grow operations on an address space.
326 	 * This also serves as the lock protecting p_stksize
327 	 * and p_stkpageszc.
328 	 */
329 	as_rangelock(as);
330 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
331 		err = grow_lpg(sp);
332 	} else {
333 		err = grow_internal(sp, p->p_stkpageszc);
334 	}
335 	newsize = p->p_stksize;
336 	as_rangeunlock(as);
337 
338 	if (err == 0 && newsize > oldsize) {
339 		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
340 		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
341 		/*
342 		 * Set up translations so the process doesn't have to fault in
343 		 * the stack pages we just gave it.
344 		 */
345 		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
346 		    newsize - oldsize, F_INVAL, S_WRITE);
347 	}
348 	return ((err == 0 ? 1 : 0));
349 }
350 
351 /*
352  * Algorithm: call arch-specific map_pgsz to get best page size to use,
353  * then call grow_internal().
354  * Returns 0 on success.
355  */
356 static int
357 grow_lpg(caddr_t sp)
358 {
359 	struct proc *p = curproc;
360 	size_t pgsz;
361 	size_t len, newsize;
362 	caddr_t addr, saddr;
363 	caddr_t growend;
364 	int oszc, szc;
365 	int err;
366 
367 	newsize = p->p_usrstack - sp;
368 
369 	oszc = p->p_stkpageszc;
370 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
371 	szc = page_szc(pgsz);
372 
373 	/*
374 	 * Covers two cases:
375 	 * 1. page_szc() returns -1 for invalid page size, so we want to
376 	 * ignore it in that case.
377 	 * 2. By design we never decrease page size, as it is more stable.
378 	 * This shouldn't happen as the stack never shrinks.
379 	 */
380 	if (szc <= oszc) {
381 		err = grow_internal(sp, oszc);
382 		/* failed, fall back to base page size */
383 		if (err != 0 && oszc != 0) {
384 			err = grow_internal(sp, 0);
385 		}
386 		return (err);
387 	}
388 
389 	/*
390 	 * We've grown sufficiently to switch to a new page size.
391 	 * So we are going to remap the whole segment with the new page size.
392 	 */
393 	err = grow_internal(sp, szc);
394 	/* The grow with szc failed, so fall back to base page size. */
395 	if (err != 0) {
396 		if (szc != 0) {
397 			err = grow_internal(sp, 0);
398 		}
399 		return (err);
400 	}
401 
402 	/*
403 	 * Round up stack pointer to a large page boundary and remap
404 	 * any pgsz pages in the segment already faulted in beyond that
405 	 * point.
406 	 */
407 	saddr = p->p_usrstack - p->p_stksize;
408 	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
409 	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
410 	len = growend - addr;
411 	/* Check that len is not negative. Update page size code for stack. */
412 	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
413 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
414 		p->p_stkpageszc = szc;
415 	}
416 
417 	ASSERT(err == 0);
418 	return (err);		/* should always be 0 */
419 }
420 
/*
 * Grow the stack of the current process so that it includes sp.  growszc
 * is the page size code whose page size the new stack bottom is aligned
 * down to (0 selects PAGESIZE).  A request that would not enlarge the
 * stack succeeds without doing anything.
 *
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure: ENOMEM when the new size would
 * exceed p_stk_ctl or would leave the stack guard smaller than
 * stack_guard_min_sz, otherwise the error from as_unmap() or as_map().
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	uintptr_t new_start;
	int    error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
	/* Work from the start of the page that contains sp. */
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * grow to growszc alignment but use current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		/*
		 * If aligning down to the large page would exceed the stack
		 * limit, retry the computation with the base page size.
		 */
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	/* Over the stack limit: fire the resource control action and fail. */
	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	/* Stack is executable only if p_stkprot includes PROT_EXEC. */
	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * extend stack with the proposed new growszc, which is different
	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		/* oldsp: current stack bottom; austk: p_usrstack aligned down. */
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	/*
	 * The stack is about to grow into its guard.  This can be acceptable
	 * if the size restriction on the stack has been expanded since its
	 * initialization during exec().  In such cases, the guard segment will
	 * be shrunk, provided the new size is reasonable.
	 */
	new_start = (uintptr_t)p->p_usrstack - newsize;
	if (p->p_stkg_start != 0 && new_start > p->p_stkg_start &&
	    new_start < p->p_stkg_end) {
		/* Portion of the guard to give up, and what would be left. */
		const size_t unmap_sz = p->p_stkg_end - new_start;
		const size_t remain_sz = new_start - p->p_stkg_start;
		extern size_t stack_guard_min_sz;

		/* Do not allow the guard to shrink below minimum size */
		if (remain_sz < stack_guard_min_sz) {
			return (ENOMEM);
		}

		error = as_unmap(p->p_as, (caddr_t)new_start, unmap_sz);
		if (error != 0) {
			return (error);
		}
		p->p_stkg_end -= unmap_sz;
	}

	/* Map only the newly added range below the existing stack. */
	if ((error = as_map(p->p_as, (caddr_t)new_start, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}
538 
539 /*
540  * Find address for user to map.  If MAP_FIXED is not specified, we can pick
541  * any address we want, but we will first try the value in *addrp if it is
542  * non-NULL and _MAP_RANDOMIZE is not set.  Thus this is implementing a way to
543  * try and get a preferred address.
544  */
545 int
546 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
547     int vacalign, uint_t flags)
548 {
549 	caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
550 	size_t lenp = len;
551 
552 	ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
553 	if (flags & MAP_FIXED) {
554 		(void) as_unmap(as, *addrp, len);
555 		return (0);
556 	} else if (basep != NULL &&
557 	    ((flags & (MAP_ALIGN | _MAP_RANDOMIZE)) == 0) &&
558 	    !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
559 		/* User supplied address was available */
560 		*addrp = basep;
561 	} else {
562 		/*
563 		 * No user supplied address or the address supplied was not
564 		 * available.
565 		 */
566 		map_addr(addrp, len, off, vacalign, flags);
567 	}
568 	if (*addrp == NULL)
569 		return (ENOMEM);
570 	return (0);
571 }
572 
573 
574 /*
575  * Used for MAP_ANON - fast way to get anonymous pages
576  */
577 static int
578 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
579     offset_t pos)
580 {
581 	struct segvn_crargs vn_a;
582 	int error;
583 
584 	if (((PROT_ALL & uprot) != uprot))
585 		return (EACCES);
586 
587 	if ((flags & MAP_FIXED) != 0) {
588 		caddr_t userlimit;
589 
590 		/*
591 		 * Use the user address.  First verify that
592 		 * the address to be used is page aligned.
593 		 * Then make some simple bounds checks.
594 		 */
595 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
596 			return (EINVAL);
597 
598 		userlimit = flags & _MAP_LOW32 ?
599 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
600 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
601 		case RANGE_OKAY:
602 			break;
603 		case RANGE_BADPROT:
604 			return (ENOTSUP);
605 		case RANGE_BADADDR:
606 		default:
607 			return (ENOMEM);
608 		}
609 	}
610 	/*
611 	 * No need to worry about vac alignment for anonymous
612 	 * pages since this is a "clone" object that doesn't
613 	 * yet exist.
614 	 */
615 	error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
616 	if (error != 0) {
617 		return (error);
618 	}
619 
620 	/*
621 	 * Use the seg_vn segment driver; passing in the NULL amp
622 	 * gives the desired "cloning" effect.
623 	 */
624 	vn_a.vp = NULL;
625 	vn_a.offset = 0;
626 	vn_a.type = flags & MAP_TYPE;
627 	vn_a.prot = uprot;
628 	vn_a.maxprot = PROT_ALL;
629 	vn_a.flags = flags & ~MAP_TYPE;
630 	vn_a.cred = CRED();
631 	vn_a.amp = NULL;
632 	vn_a.szc = 0;
633 	vn_a.lgrp_mem_policy_flags = 0;
634 
635 	return (as_map(as, *addrp, len, segvn_create, &vn_a));
636 }
637 
/*
 * Evaluates to non-zero when a mapping request may have its address
 * randomized: the request must not be MAP_FIXED, and it must not carry a
 * plain (non-MAP_ALIGN) non-zero address hint while aslr_respect_mmap_hint
 * is set.
 *
 * Both parameters are parenthesized in the expansion so that expression
 * arguments such as "a | b" are not re-associated by operator precedence.
 */
#define	RANDOMIZABLE_MAPPING(addr, flags)			\
	((((flags) & MAP_FIXED) == 0) &&			\
	!((((flags) & MAP_ALIGN) == 0) && ((addr) != 0) &&	\
	aslr_respect_mmap_hint))
640 
/*
 * Common implementation behind the mmap entry points in this file
 * (smmap64(), smmap32() and smmaplf32()).
 *
 * addrp  in/out: on entry the caller's address argument (a hint, the
 *        required address with MAP_FIXED, or the alignment with
 *        MAP_ALIGN); the entry points hand its value back to the caller
 *        as the mapped address on success.
 * len    length of the mapping; 0 is rejected with EINVAL.
 * prot   requested PROT_* protections.
 * flags  MAP_* flags; anything outside the accepted set yields EINVAL.
 * fp     file to map, or NULL for an anonymous (MAP_ANON) mapping.
 * pos    file offset; must be page aligned.
 *
 * Returns 0 on success or an errno value; calling set_errno() is left to
 * the entry points.
 */
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
	struct vnode *vp;
	struct as *as = curproc->p_as;
	uint_t uprot, maxprot, type;
	int error;
	int in_crit = 0;	/* set once nbl_start_crit() has been called */

	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
	    MAP_TEXT | MAP_INITDATA)) != 0) {
		/* | MAP_RENAME */	/* not implemented, let user know */
		return (EINVAL);
	}

	/* MAP_TEXT only makes sense for an executable mapping. */
	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
		return (EINVAL);
	}

	/* MAP_TEXT and MAP_INITDATA are mutually exclusive. */
	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
		return (EINVAL);
	}

	/* A fixed address cannot also be randomized. */
	if ((flags & (MAP_FIXED | _MAP_RANDOMIZE)) ==
	    (MAP_FIXED | _MAP_RANDOMIZE)) {
		return (EINVAL);
	}

	/*
	 * If it's not a fixed allocation and mmap ASLR is enabled, randomize
	 * it.
	 */
	if (RANDOMIZABLE_MAPPING(*addrp, flags) &&
	    secflag_enabled(curproc, PROC_SEC_ASLR))
		flags |= _MAP_RANDOMIZE;

#if defined(__sparc)
	/*
	 * See if this is an "old mmap call".  If so, remember this
	 * fact and convert the flags value given to mmap to indicate
	 * the specified address in the system call must be used.
	 * _MAP_NEW is set by all new uses of mmap.
	 */
	if ((flags & _MAP_NEW) == 0)
		flags |= MAP_FIXED;
#endif
	flags &= ~_MAP_NEW;

	/* Exactly one of MAP_PRIVATE and MAP_SHARED must be given. */
	type = flags & MAP_TYPE;
	if (type != MAP_PRIVATE && type != MAP_SHARED)
		return (EINVAL);


	if (flags & MAP_ALIGN) {
		if (flags & MAP_FIXED)
			return (EINVAL);

		/* alignment needs to be a power of 2 >= page size */
		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
		    !ISP2((uintptr_t)*addrp))
			return (EINVAL);
	}
	/*
	 * Check for bad lengths and file position.
	 * We let the VOP_MAP routine check for negative lengths
	 * since on some vnode types this might be appropriate.
	 */
	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
		return (EINVAL);

	maxprot = PROT_ALL;		/* start out allowing all accesses */
	uprot = prot | PROT_USER;

	if (fp == NULL) {
		/* Anonymous mapping: handled entirely by zmap(). */
		ASSERT(flags & MAP_ANON);
		/* discard lwpchan mappings, like munmap() */
		if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
			lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
		as_rangelock(as);
		error = zmap(as, addrp, len, uprot, flags, pos);
		as_rangeunlock(as);
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (error == 0 && (flags & MAP_SHARED)) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		return (error);
	} else if ((flags & MAP_ANON) != 0)
		return (EINVAL);	/* MAP_ANON with a file is invalid */

	vp = fp->f_vnode;

	/* Can't execute code from "noexec" mounted filesystem. */
	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
		maxprot &= ~PROT_EXEC;

	/*
	 * These checks were added as part of large files.
	 *
	 * Return ENXIO if the initial position is negative; return EOVERFLOW
	 * if (offset + len) would overflow the maximum allowed offset for the
	 * type of file descriptor being used.
	 */
	if (vp->v_type == VREG) {
		if (pos < 0)
			return (ENXIO);
		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
			return (EOVERFLOW);
	}

	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
		/* no write access allowed */
		maxprot &= ~PROT_WRITE;
	}

	/*
	 * XXX - Do we also adjust maxprot based on protections
	 * of the vnode?  E.g. if no execute permission is given
	 * on the vnode for the current user, maxprot probably
	 * should disallow PROT_EXEC also?  This is different
	 * from the write access as this would be a per vnode
	 * test as opposed to a per fd test for writability.
	 */

	/*
	 * Verify that the specified protections are not greater than
	 * the maximum allowable protections.  Also test to make sure
	 * that the file descriptor does allow for read access since
	 * "write only" mappings are hard to do since normally we do
	 * the read from the file before the page can be written.
	 */
	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
		return (EACCES);

	/*
	 * If the user specified an address, do some simple checks here
	 */
	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address.  First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}

	/*
	 * When nbl_need_check() says the vnode requires it, enter the nbl
	 * critical region and fail with EACCES if nbl_conflict() reports a
	 * conflict for the kind of access this mapping would allow.  From
	 * here on every exit goes through "done" so the critical region is
	 * left again.
	 */
	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
	    nbl_need_check(vp)) {
		int svmand;
		nbl_op_t nop;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto done;
		/* Only a shared writable mapping counts as a write operation. */
		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
			if (prot & (PROT_READ | PROT_EXEC)) {
				nop = NBL_READWRITE;
			} else {
				nop = NBL_WRITE;
			}
		} else {
			nop = NBL_READ;
		}
		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/* discard lwpchan mappings, like munmap() */
	if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
		lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

	/*
	 * Ok, now let the vnode map routine do its thing to set things up.
	 */
	error = VOP_MAP(vp, pos, as,
	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

	if (error == 0) {
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (flags & MAP_SHARED) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		if (vp->v_type == VREG &&
		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
			/*
			 * Mark this as an executable vnode
			 */
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VVMEXEC;
			mutex_exit(&vp->v_lock);
		}
	}

done:
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
865 
866 #ifdef _LP64
867 /*
868  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
869  *
870  * The "large file" mmap routine mmap64(2) is also mapped to this routine
871  * by the 64-bit version of libc.
872  *
873  * Eventually, this should be the only version, and have smmap_common()
874  * folded back into it again.  Some day.
875  */
876 caddr_t
877 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
878 {
879 	struct file *fp;
880 	int error;
881 
882 	if (fd == -1 && (flags & MAP_ANON) != 0)
883 		error = smmap_common(&addr, len, prot, flags,
884 		    NULL, (offset_t)pos);
885 	else if ((fp = getf(fd)) != NULL) {
886 		error = smmap_common(&addr, len, prot, flags,
887 		    fp, (offset_t)pos);
888 		releasef(fd);
889 	} else
890 		error = EBADF;
891 
892 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
893 }
894 #endif	/* _LP64 */
895 
896 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
897 
898 /*
899  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
900  */
901 caddr_t
902 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
903 {
904 	struct file *fp;
905 	int error;
906 	caddr_t a = (caddr_t)(uintptr_t)addr;
907 
908 	if (flags & _MAP_LOW32)
909 		error = EINVAL;
910 	else if (fd == -1 && (flags & MAP_ANON) != 0)
911 		error = smmap_common(&a, (size_t)len, prot,
912 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
913 	else if ((fp = getf(fd)) != NULL) {
914 		error = smmap_common(&a, (size_t)len, prot,
915 		    flags | _MAP_LOW32, fp, (offset_t)pos);
916 		releasef(fd);
917 	} else
918 		error = EBADF;
919 
920 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
921 
922 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
923 }
924 
/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

/*
 * Argument block handed to smmaplf32().  The 64-bit file offset arrives
 * split across the offhi and offlo cells; smmaplf32() reassembles it,
 * choosing the order of the two halves according to _BIG_ENDIAN.
 */
struct mmaplf32a {
	caddr_t addr;
	size_t len;
#ifdef _LP64
	/*
	 * 32-bit contents, 64-bit cells
	 */
	uint64_t prot;
	uint64_t flags;
	uint64_t fd;
	uint64_t offhi;
	uint64_t offlo;
#else
	/*
	 * 32-bit contents, 32-bit cells
	 */
	uint32_t prot;
	uint32_t flags;
	uint32_t fd;
	uint32_t offhi;
	uint32_t offlo;
#endif
};
956 
957 int
958 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
959 {
960 	struct file *fp;
961 	int error;
962 	caddr_t a = uap->addr;
963 	int flags = (int)uap->flags;
964 	int fd = (int)uap->fd;
965 #ifdef _BIG_ENDIAN
966 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
967 #else
968 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
969 #endif
970 
971 	if (flags & _MAP_LOW32)
972 		error = EINVAL;
973 	else if (fd == -1 && (flags & MAP_ANON) != 0)
974 		error = smmap_common(&a, uap->len, (int)uap->prot,
975 		    flags | _MAP_LOW32, NULL, off);
976 	else if ((fp = getf(fd)) != NULL) {
977 		error = smmap_common(&a, uap->len, (int)uap->prot,
978 		    flags | _MAP_LOW32, fp, off);
979 		releasef(fd);
980 	} else
981 		error = EBADF;
982 
983 	if (error == 0)
984 		rvp->r_val1 = (uintptr_t)a;
985 	return (error);
986 }
987 
988 #endif	/* _SYSCALL32_IMPL || _ILP32 */
989 
990 int
991 munmap(caddr_t addr, size_t len)
992 {
993 	struct proc *p = curproc;
994 	struct as *as = p->p_as;
995 
996 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
997 		return (set_errno(EINVAL));
998 
999 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1000 		return (set_errno(EINVAL));
1001 
1002 	/*
1003 	 * Discard lwpchan mappings.
1004 	 */
1005 	if (p->p_lcp != NULL)
1006 		lwpchan_delete_mapping(p, addr, addr + len);
1007 	if (as_unmap(as, addr, len) != 0)
1008 		return (set_errno(EINVAL));
1009 
1010 	return (0);
1011 }
1012 
1013 int
1014 mprotect(caddr_t addr, size_t len, int prot)
1015 {
1016 	struct as *as = curproc->p_as;
1017 	uint_t uprot = prot | PROT_USER;
1018 	int error;
1019 
1020 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
1021 		return (set_errno(EINVAL));
1022 
1023 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
1024 	case RANGE_OKAY:
1025 		break;
1026 	case RANGE_BADPROT:
1027 		return (set_errno(ENOTSUP));
1028 	case RANGE_BADADDR:
1029 	default:
1030 		return (set_errno(ENOMEM));
1031 	}
1032 
1033 	error = as_setprot(as, addr, len, uprot);
1034 	if (error)
1035 		return (set_errno(error));
1036 	return (0);
1037 }
1038 
1039 #define	MC_CACHE	128			/* internal result buffer */
1040 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
1041 
1042 int
1043 mincore(caddr_t addr, size_t len, char *vecp)
1044 {
1045 	struct as *as = curproc->p_as;
1046 	caddr_t ea;			/* end address of loop */
1047 	size_t rl;			/* inner result length */
1048 	char vec[MC_CACHE];		/* local vector cache */
1049 	int error;
1050 	model_t model;
1051 	long	llen;
1052 
1053 	model = get_udatamodel();
1054 	/*
1055 	 * Validate form of address parameters.
1056 	 */
1057 	if (model == DATAMODEL_NATIVE) {
1058 		llen = (long)len;
1059 	} else {
1060 		llen = (int32_t)(size32_t)len;
1061 	}
1062 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1063 		return (set_errno(EINVAL));
1064 
1065 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1066 		return (set_errno(ENOMEM));
1067 
1068 	/*
1069 	 * Loop over subranges of interval [addr : addr + len), recovering
1070 	 * results internally and then copying them out to caller.  Subrange
1071 	 * is based on the size of MC_CACHE, defined above.
1072 	 */
1073 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1074 		error = as_incore(as, addr,
1075 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1076 		if (rl != 0) {
1077 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
1078 			if (copyout(vec, vecp, rl) != 0)
1079 				return (set_errno(EFAULT));
1080 			vecp += rl;
1081 		}
1082 		if (error != 0)
1083 			return (set_errno(ENOMEM));
1084 	}
1085 	return (0);
1086 }
1087