xref: /illumos-gate/usr/src/uts/common/os/grow.c (revision d042c5a26452797afc4fe8c2ceddebff94d88745)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */
23 
24 /*
25  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
26  * Use is subject to license terms.
27  */
28 
29 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
30 /*	  All Rights Reserved  	*/
31 
32 #include <sys/types.h>
33 #include <sys/inttypes.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/errno.h>
40 #include <sys/var.h>
41 #include <sys/proc.h>
42 #include <sys/tuneable.h>
43 #include <sys/debug.h>
44 #include <sys/cmn_err.h>
45 #include <sys/cred.h>
46 #include <sys/vnode.h>
47 #include <sys/vfs.h>
48 #include <sys/vm.h>
49 #include <sys/file.h>
50 #include <sys/mman.h>
51 #include <sys/vmparam.h>
52 #include <sys/fcntl.h>
53 #include <sys/lwpchan_impl.h>
54 #include <sys/nbmlock.h>
55 
56 #include <vm/hat.h>
57 #include <vm/as.h>
58 #include <vm/seg.h>
59 #include <vm/seg_dev.h>
60 #include <vm/seg_vn.h>
61 
/*
 * Switches for automatic large page selection.  brk() only calls
 * brk_lpg() when use_brk_lpg is non-zero, and grow() only calls
 * grow_lpg() when use_stk_lpg is non-zero; in both cases the process
 * must also have SAUTOLPG set in p_flag.
 */
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/*
 * If set, we will not randomize mappings where the 'addr' argument is
 * non-NULL and is not an alignment (i.e. MAP_ALIGN is not set).
 */
int aslr_respect_mmap_hint = 1;

/* Page-size-selecting front ends for brk() and grow(); return 0 on success. */
static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);
73 
74 intptr_t
75 brk(caddr_t nva)
76 {
77 	int error;
78 	proc_t *p = curproc;
79 
80 	/*
81 	 * Serialize brk operations on an address space.
82 	 * This also serves as the lock protecting p_brksize
83 	 * and p_brkpageszc.
84 	 */
85 	as_rangelock(p->p_as);
86 
87 	/*
88 	 * As a special case to aid the implementation of sbrk(3C), if given a
89 	 * new brk of 0, return the current brk.  We'll hide this in brk(3C).
90 	 */
91 	if (nva == 0) {
92 		intptr_t base = (intptr_t)(p->p_brkbase + p->p_brksize);
93 		as_rangeunlock(p->p_as);
94 		return (base);
95 	}
96 
97 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
98 		error = brk_lpg(nva);
99 	} else {
100 		error = brk_internal(nva, p->p_brkpageszc);
101 	}
102 	as_rangeunlock(p->p_as);
103 	return ((error != 0 ? set_errno(error) : 0));
104 }
105 
106 /*
107  * Algorithm: call arch-specific map_pgsz to get best page size to use,
108  * then call brk_internal().
109  * Returns 0 on success.
110  */
111 static int
112 brk_lpg(caddr_t nva)
113 {
114 	struct proc *p = curproc;
115 	size_t pgsz, len;
116 	caddr_t addr, brkend;
117 	caddr_t bssbase = p->p_bssbase;
118 	caddr_t brkbase = p->p_brkbase;
119 	int oszc, szc;
120 	int err;
121 
122 	oszc = p->p_brkpageszc;
123 
124 	/*
125 	 * If p_brkbase has not yet been set, the first call
126 	 * to brk_internal() will initialize it.
127 	 */
128 	if (brkbase == 0) {
129 		return (brk_internal(nva, oszc));
130 	}
131 
132 	len = nva - bssbase;
133 
134 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
135 	szc = page_szc(pgsz);
136 
137 	/*
138 	 * Covers two cases:
139 	 * 1. page_szc() returns -1 for invalid page size, so we want to
140 	 * ignore it in that case.
141 	 * 2. By design we never decrease page size, as it is more stable.
142 	 */
143 	if (szc <= oszc) {
144 		err = brk_internal(nva, oszc);
145 		/* If failed, back off to base page size. */
146 		if (err != 0 && oszc != 0) {
147 			err = brk_internal(nva, 0);
148 		}
149 		return (err);
150 	}
151 
152 	err = brk_internal(nva, szc);
153 	/* If using szc failed, map with base page size and return. */
154 	if (err != 0) {
155 		if (szc != 0) {
156 			err = brk_internal(nva, 0);
157 		}
158 		return (err);
159 	}
160 
161 	/*
162 	 * Round up brk base to a large page boundary and remap
163 	 * anything in the segment already faulted in beyond that
164 	 * point.
165 	 */
166 	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
167 	brkend = brkbase + p->p_brksize;
168 	len = brkend - addr;
169 	/* Check that len is not negative. Update page size code for heap. */
170 	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
171 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
172 		p->p_brkpageszc = szc;
173 	}
174 
175 	ASSERT(err == 0);
176 	return (err);		/* should always be 0 */
177 }
178 
/*
 * Move the current process's break to nva, mapping or unmapping heap
 * pages as needed.
 *
 *	nva	requested new break address.
 *	brkszc	page size code whose page size the new heap end is rounded
 *		up to; 0 means PAGESIZE.
 *
 * The callers in this file, brk() and brk_lpg(), run with the address
 * space range lock held.
 *
 * Returns 0 on success, ENOMEM if nva lies below p_brkbase or the heap
 * would grow beyond the enforced RLIMIT_DATA value, or the error from
 * as_map() if the new mapping cannot be created.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
	caddr_t ova;			/* current break address */
	size_t size;
	int	error;
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t pgsz;
	uint_t szc;
	rctl_qty_t as_rctl;

	/*
	 * extend heap to brkszc alignment but use current p->p_brkpageszc
	 * for the newly created segment. This allows the new extension
	 * segment to be concatenated successfully with the existing brk
	 * segment.
	 */
	if ((szc = brkszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
	} else {
		pgsz = PAGESIZE;
	}

	/* Fetch the enforced RLIMIT_DATA value; p_lock is held for the read. */
	mutex_enter(&p->p_lock);
	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk() will initialize it.
	 */
	if (p->p_brkbase == 0)
		p->p_brkbase = nva;

	/*
	 * Before multiple page size support existed p_brksize was the value
	 * not rounded to the pagesize (i.e. it stored the exact user request
	 * for heap size). If pgsz is greater than PAGESIZE calculate the
	 * heap size as the real new heap size by rounding it up to pgsz.
	 * This is useful since we may want to know where the heap ends
	 * without knowing heap pagesize (e.g. some old code) and also if
	 * heap pagesize changes we can update p_brkpageszc but delay adding
	 * new mapping yet still know from p_brksize where the heap really
	 * ends. The user requested heap end is stored in libc variable.
	 */
	if (pgsz > PAGESIZE) {
		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
		size = tnva - p->p_brkbase;
		/*
		 * If rounding up to the large page wraps below the heap base
		 * or pushes a growing heap past the limit, fall back to the
		 * base page size and the unrounded size.
		 */
		if (tnva < p->p_brkbase || (size > p->p_brksize &&
		    size > (size_t)as_rctl)) {
			szc = 0;
			pgsz = PAGESIZE;
			size = nva - p->p_brkbase;
		}
	} else {
		size = nva - p->p_brkbase;
	}

	/*
	 * use PAGESIZE to roundup ova because we want to know the real value
	 * of the current heap end in case p_brkpageszc changes since the last
	 * p_brksize was computed.
	 */
	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
	    PAGESIZE);

	/*
	 * Reject a break below the heap base, or a growing heap that would
	 * exceed the limit; a request that does not grow the heap is never
	 * refused on account of the limit.
	 */
	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
	    size > as_rctl)) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
		    RCA_SAFE);
		mutex_exit(&p->p_lock);
		return (ENOMEM);
	}

	if (nva > ova) {
		struct segvn_crargs crargs =
		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

		/* Drop execute permission unless p_datprot includes it. */
		if (!(p->p_datprot & PROT_EXEC)) {
			crargs.prot &= ~PROT_EXEC;
		}

		/*
		 * Add new zfod mapping to extend UNIX data segment
		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
		 * page sizes if ova is not aligned to szc's pgsz.
		 */
		if (szc > 0) {
			caddr_t rbss;

			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
			    pgsz);
			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
				    AS_MAP_NO_LPOOB;
			} else if (ova == rbss) {
				crargs.szc = szc;
			} else {
				crargs.szc = AS_MAP_HEAP;
			}
		} else {
			crargs.szc = AS_MAP_NO_LPOOB;
		}
		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
		    &crargs);
		if (error) {
			/* p_brksize is left unchanged when the mapping fails. */
			return (error);
		}

	} else if (nva < ova) {
		/*
		 * Release mapping to shrink UNIX data segment.
		 */
		(void) as_unmap(as, nva, (size_t)(ova - nva));
	}
	/* Record the new heap size (rounded to pgsz when pgsz > PAGESIZE). */
	p->p_brksize = size;
	return (0);
}
307 
308 /*
309  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
310  * This routine assumes that the stack grows downward.
311  */
312 int
313 grow(caddr_t sp)
314 {
315 	struct proc *p = curproc;
316 	struct as *as = p->p_as;
317 	size_t oldsize = p->p_stksize;
318 	size_t newsize;
319 	int err;
320 
321 	/*
322 	 * Serialize grow operations on an address space.
323 	 * This also serves as the lock protecting p_stksize
324 	 * and p_stkpageszc.
325 	 */
326 	as_rangelock(as);
327 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
328 		err = grow_lpg(sp);
329 	} else {
330 		err = grow_internal(sp, p->p_stkpageszc);
331 	}
332 	as_rangeunlock(as);
333 
334 	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
335 		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
336 		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
337 		/*
338 		 * Set up translations so the process doesn't have to fault in
339 		 * the stack pages we just gave it.
340 		 */
341 		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
342 		    newsize - oldsize, F_INVAL, S_WRITE);
343 	}
344 	return ((err == 0 ? 1 : 0));
345 }
346 
347 /*
348  * Algorithm: call arch-specific map_pgsz to get best page size to use,
349  * then call grow_internal().
350  * Returns 0 on success.
351  */
352 static int
353 grow_lpg(caddr_t sp)
354 {
355 	struct proc *p = curproc;
356 	size_t pgsz;
357 	size_t len, newsize;
358 	caddr_t addr, saddr;
359 	caddr_t growend;
360 	int oszc, szc;
361 	int err;
362 
363 	newsize = p->p_usrstack - sp;
364 
365 	oszc = p->p_stkpageszc;
366 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
367 	szc = page_szc(pgsz);
368 
369 	/*
370 	 * Covers two cases:
371 	 * 1. page_szc() returns -1 for invalid page size, so we want to
372 	 * ignore it in that case.
373 	 * 2. By design we never decrease page size, as it is more stable.
374 	 * This shouldn't happen as the stack never shrinks.
375 	 */
376 	if (szc <= oszc) {
377 		err = grow_internal(sp, oszc);
378 		/* failed, fall back to base page size */
379 		if (err != 0 && oszc != 0) {
380 			err = grow_internal(sp, 0);
381 		}
382 		return (err);
383 	}
384 
385 	/*
386 	 * We've grown sufficiently to switch to a new page size.
387 	 * So we are going to remap the whole segment with the new page size.
388 	 */
389 	err = grow_internal(sp, szc);
390 	/* The grow with szc failed, so fall back to base page size. */
391 	if (err != 0) {
392 		if (szc != 0) {
393 			err = grow_internal(sp, 0);
394 		}
395 		return (err);
396 	}
397 
398 	/*
399 	 * Round up stack pointer to a large page boundary and remap
400 	 * any pgsz pages in the segment already faulted in beyond that
401 	 * point.
402 	 */
403 	saddr = p->p_usrstack - p->p_stksize;
404 	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
405 	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
406 	len = growend - addr;
407 	/* Check that len is not negative. Update page size code for stack. */
408 	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
409 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
410 		p->p_stkpageszc = szc;
411 	}
412 
413 	ASSERT(err == 0);
414 	return (err);		/* should always be 0 */
415 }
416 
417 /*
418  * This routine assumes that the stack grows downward.
419  * Returns 0 on success, errno on failure.
420  */
421 int
422 grow_internal(caddr_t sp, uint_t growszc)
423 {
424 	struct proc *p = curproc;
425 	size_t newsize;
426 	size_t oldsize;
427 	int    error;
428 	size_t pgsz;
429 	uint_t szc;
430 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
431 
432 	ASSERT(sp < p->p_usrstack);
433 	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
434 
435 	/*
436 	 * grow to growszc alignment but use current p->p_stkpageszc for
437 	 * the segvn_crargs szc passed to segvn_create. For memcntl to
438 	 * increase the szc, this allows the new extension segment to be
439 	 * concatenated successfully with the existing stack segment.
440 	 */
441 	if ((szc = growszc) != 0) {
442 		pgsz = page_get_pagesize(szc);
443 		ASSERT(pgsz > PAGESIZE);
444 		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
445 		if (newsize > (size_t)p->p_stk_ctl) {
446 			szc = 0;
447 			pgsz = PAGESIZE;
448 			newsize = p->p_usrstack - sp;
449 		}
450 	} else {
451 		pgsz = PAGESIZE;
452 		newsize = p->p_usrstack - sp;
453 	}
454 
455 	if (newsize > (size_t)p->p_stk_ctl) {
456 		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
457 		    RCA_UNSAFE_ALL);
458 
459 		return (ENOMEM);
460 	}
461 
462 	oldsize = p->p_stksize;
463 	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
464 
465 	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
466 		return (0);
467 	}
468 
469 	if (!(p->p_stkprot & PROT_EXEC)) {
470 		crargs.prot &= ~PROT_EXEC;
471 	}
472 	/*
473 	 * extend stack with the proposed new growszc, which is different
474 	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
475 	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
476 	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
477 	 * if not aligned to szc's pgsz.
478 	 */
479 	if (szc > 0) {
480 		caddr_t oldsp = p->p_usrstack - oldsize;
481 		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
482 		    pgsz);
483 
484 		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
485 			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
486 			    AS_MAP_NO_LPOOB;
487 		} else if (oldsp == austk) {
488 			crargs.szc = szc;
489 		} else {
490 			crargs.szc = AS_MAP_STACK;
491 		}
492 	} else {
493 		crargs.szc = AS_MAP_NO_LPOOB;
494 	}
495 	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
496 
497 	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
498 	    segvn_create, &crargs)) != 0) {
499 		if (error == EAGAIN) {
500 			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
501 			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
502 		}
503 		return (error);
504 	}
505 	p->p_stksize = newsize;
506 	return (0);
507 }
508 
509 /*
510  * Find address for user to map.  If MAP_FIXED is not specified, we can pick
511  * any address we want, but we will first try the value in *addrp if it is
512  * non-NULL and _MAP_RANDOMIZE is not set.  Thus this is implementing a way to
513  * try and get a preferred address.
514  */
515 int
516 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
517     int vacalign, uint_t flags)
518 {
519 	caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
520 	size_t lenp = len;
521 
522 	ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
523 	if (flags & MAP_FIXED) {
524 		(void) as_unmap(as, *addrp, len);
525 		return (0);
526 	} else if (basep != NULL &&
527 	    ((flags & (MAP_ALIGN | _MAP_RANDOMIZE)) == 0) &&
528 	    !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
529 		/* User supplied address was available */
530 		*addrp = basep;
531 	} else {
532 		/*
533 		 * No user supplied address or the address supplied was not
534 		 * available.
535 		 */
536 		map_addr(addrp, len, off, vacalign, flags);
537 	}
538 	if (*addrp == NULL)
539 		return (ENOMEM);
540 	return (0);
541 }
542 
543 
544 /*
545  * Used for MAP_ANON - fast way to get anonymous pages
546  */
547 static int
548 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
549     offset_t pos)
550 {
551 	struct segvn_crargs vn_a;
552 	int error;
553 
554 	if (((PROT_ALL & uprot) != uprot))
555 		return (EACCES);
556 
557 	if ((flags & MAP_FIXED) != 0) {
558 		caddr_t userlimit;
559 
560 		/*
561 		 * Use the user address.  First verify that
562 		 * the address to be used is page aligned.
563 		 * Then make some simple bounds checks.
564 		 */
565 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
566 			return (EINVAL);
567 
568 		userlimit = flags & _MAP_LOW32 ?
569 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
570 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
571 		case RANGE_OKAY:
572 			break;
573 		case RANGE_BADPROT:
574 			return (ENOTSUP);
575 		case RANGE_BADADDR:
576 		default:
577 			return (ENOMEM);
578 		}
579 	}
580 	/*
581 	 * No need to worry about vac alignment for anonymous
582 	 * pages since this is a "clone" object that doesn't
583 	 * yet exist.
584 	 */
585 	error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
586 	if (error != 0) {
587 		return (error);
588 	}
589 
590 	/*
591 	 * Use the seg_vn segment driver; passing in the NULL amp
592 	 * gives the desired "cloning" effect.
593 	 */
594 	vn_a.vp = NULL;
595 	vn_a.offset = 0;
596 	vn_a.type = flags & MAP_TYPE;
597 	vn_a.prot = uprot;
598 	vn_a.maxprot = PROT_ALL;
599 	vn_a.flags = flags & ~MAP_TYPE;
600 	vn_a.cred = CRED();
601 	vn_a.amp = NULL;
602 	vn_a.szc = 0;
603 	vn_a.lgrp_mem_policy_flags = 0;
604 
605 	return (as_map(as, *addrp, len, segvn_create, &vn_a));
606 }
607 
/*
 * Evaluates to non-zero when a mapping request may have its address
 * randomized: MAP_FIXED must not be set, and the request must not be a
 * plain address hint (MAP_ALIGN clear and addr non-zero) while
 * aslr_respect_mmap_hint is set.
 *
 * Both parameters are parenthesized in the expansion so that compound
 * argument expressions (e.g. "a | b") are tested as a whole.
 */
#define	RANDOMIZABLE_MAPPING(addr, flags)				\
	((((flags) & MAP_FIXED) == 0) &&				\
	!((((flags) & MAP_ALIGN) == 0) && ((addr) != 0) &&		\
	aslr_respect_mmap_hint))
610 
/*
 * Common implementation behind the mmap system call entry points in this
 * file (smmap64(), smmap32() and smmaplf32()).
 *
 *	addrp	in: requested address, hint, or (with MAP_ALIGN) the
 *		alignment; the entry points return *addrp to the user as
 *		the mapping address on success.
 *	len	length of the mapping; 0 is rejected with EINVAL.
 *	prot	requested PROT_* protections.
 *	flags	MAP_* flags; anything outside the accepted set is EINVAL.
 *	fp	file to map, or NULL for an anonymous (MAP_ANON) mapping.
 *	pos	file offset; must be page aligned.
 *
 * Anonymous requests are handed to zmap() under the address space range
 * lock; file requests are validated here and then handed to VOP_MAP().
 *
 * Returns 0 on success or an errno value.
 */
static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
	struct vnode *vp;
	struct as *as = curproc->p_as;
	uint_t uprot, maxprot, type;
	int error;
	int in_crit = 0;	/* set once nbl_start_crit() has been called */

	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
	    MAP_TEXT | MAP_INITDATA)) != 0) {
		/* | MAP_RENAME */	/* not implemented, let user know */
		return (EINVAL);
	}

	/* A MAP_TEXT mapping must be executable. */
	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
		return (EINVAL);
	}

	/* MAP_TEXT and MAP_INITDATA are mutually exclusive. */
	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
		return (EINVAL);
	}

	/* A fixed address cannot also be randomized. */
	if ((flags & (MAP_FIXED | _MAP_RANDOMIZE)) ==
	    (MAP_FIXED | _MAP_RANDOMIZE)) {
		return (EINVAL);
	}

	/*
	 * If it's not a fixed allocation and mmap ASLR is enabled, randomize
	 * it.
	 */
	if (RANDOMIZABLE_MAPPING(*addrp, flags) &&
	    secflag_enabled(curproc, PROC_SEC_ASLR))
		flags |= _MAP_RANDOMIZE;

#if defined(__sparc)
	/*
	 * See if this is an "old mmap call".  If so, remember this
	 * fact and convert the flags value given to mmap to indicate
	 * the specified address in the system call must be used.
	 * _MAP_NEW is set by all new uses of mmap.
	 */
	if ((flags & _MAP_NEW) == 0)
		flags |= MAP_FIXED;
#endif
	flags &= ~_MAP_NEW;

	/* Exactly one of MAP_PRIVATE and MAP_SHARED must be requested. */
	type = flags & MAP_TYPE;
	if (type != MAP_PRIVATE && type != MAP_SHARED)
		return (EINVAL);


	if (flags & MAP_ALIGN) {
		if (flags & MAP_FIXED)
			return (EINVAL);

		/* alignment needs to be a power of 2 >= page size */
		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
		    !ISP2((uintptr_t)*addrp))
			return (EINVAL);
	}
	/*
	 * Check for bad lengths and file position.
	 * We let the VOP_MAP routine check for negative lengths
	 * since on some vnode types this might be appropriate.
	 */
	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
		return (EINVAL);

	maxprot = PROT_ALL;		/* start out allowing all accesses */
	uprot = prot | PROT_USER;

	/* Anonymous mapping: no file is involved, zmap() does the work. */
	if (fp == NULL) {
		ASSERT(flags & MAP_ANON);
		/* discard lwpchan mappings, like munmap() */
		if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
			lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
		as_rangelock(as);
		error = zmap(as, addrp, len, uprot, flags, pos);
		as_rangeunlock(as);
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (error == 0 && (flags & MAP_SHARED)) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		return (error);
	} else if ((flags & MAP_ANON) != 0)
		return (EINVAL);	/* MAP_ANON may not name a file */

	vp = fp->f_vnode;

	/* Can't execute code from "noexec" mounted filesystem. */
	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
		maxprot &= ~PROT_EXEC;

	/*
	 * These checks were added as part of large files.
	 *
	 * Return ENXIO if the initial position is negative; return EOVERFLOW
	 * if (offset + len) would overflow the maximum allowed offset for the
	 * type of file descriptor being used.
	 */
	if (vp->v_type == VREG) {
		if (pos < 0)
			return (ENXIO);
		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
			return (EOVERFLOW);
	}

	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
		/* no write access allowed */
		maxprot &= ~PROT_WRITE;
	}

	/*
	 * XXX - Do we also adjust maxprot based on protections
	 * of the vnode?  E.g. if no execute permission is given
	 * on the vnode for the current user, maxprot probably
	 * should disallow PROT_EXEC also?  This is different
	 * from the write access as this would be a per vnode
	 * test as opposed to a per fd test for writability.
	 */

	/*
	 * Verify that the specified protections are not greater than
	 * the maximum allowable protections.  Also test to make sure
	 * that the file descriptor allows for read access since
	 * "write only" mappings are hard to do since normally we do
	 * the read from the file before the page can be written.
	 */
	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
		return (EACCES);

	/*
	 * If the user specified an address, do some simple checks here
	 */
	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address.  First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}

	/*
	 * When nbl_need_check() says the vnode needs it, enter the nbl
	 * critical section and refuse the mapping if it conflicts with the
	 * access implied by prot and the mapping type.  From here on every
	 * exit must go through "done" so the critical section is left.
	 */
	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
	    nbl_need_check(vp)) {
		int svmand;
		nbl_op_t nop;

		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		error = nbl_svmand(vp, fp->f_cred, &svmand);
		if (error != 0)
			goto done;
		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
			if (prot & (PROT_READ | PROT_EXEC)) {
				nop = NBL_READWRITE;
			} else {
				nop = NBL_WRITE;
			}
		} else {
			nop = NBL_READ;
		}
		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
			error = EACCES;
			goto done;
		}
	}

	/* discard lwpchan mappings, like munmap() */
	if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
		lwpchan_delete_mapping(curproc, *addrp, *addrp + len);

	/*
	 * Ok, now let the vnode map routine do its thing to set things up.
	 */
	error = VOP_MAP(vp, pos, as,
	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);

	if (error == 0) {
		/*
		 * Tell machine specific code that lwp has mapped shared memory
		 */
		if (flags & MAP_SHARED) {
			/* EMPTY */
			LWP_MMODEL_SHARED_AS(*addrp, len);
		}
		if (vp->v_type == VREG &&
		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
			/*
			 * Mark this as an executable vnode
			 */
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VVMEXEC;
			mutex_exit(&vp->v_lock);
		}
	}

done:
	/* Leave the nbl critical section if it was entered above. */
	if (in_crit)
		nbl_end_crit(vp);
	return (error);
}
835 
836 #ifdef _LP64
837 /*
838  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
839  *
840  * The "large file" mmap routine mmap64(2) is also mapped to this routine
841  * by the 64-bit version of libc.
842  *
843  * Eventually, this should be the only version, and have smmap_common()
844  * folded back into it again.  Some day.
845  */
846 caddr_t
847 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
848 {
849 	struct file *fp;
850 	int error;
851 
852 	if (fd == -1 && (flags & MAP_ANON) != 0)
853 		error = smmap_common(&addr, len, prot, flags,
854 		    NULL, (offset_t)pos);
855 	else if ((fp = getf(fd)) != NULL) {
856 		error = smmap_common(&addr, len, prot, flags,
857 		    fp, (offset_t)pos);
858 		releasef(fd);
859 	} else
860 		error = EBADF;
861 
862 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
863 }
864 #endif	/* _LP64 */
865 
866 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
867 
868 /*
869  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
870  */
871 caddr_t
872 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
873 {
874 	struct file *fp;
875 	int error;
876 	caddr_t a = (caddr_t)(uintptr_t)addr;
877 
878 	if (flags & _MAP_LOW32)
879 		error = EINVAL;
880 	else if (fd == -1 && (flags & MAP_ANON) != 0)
881 		error = smmap_common(&a, (size_t)len, prot,
882 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
883 	else if ((fp = getf(fd)) != NULL) {
884 		error = smmap_common(&a, (size_t)len, prot,
885 		    flags | _MAP_LOW32, fp, (offset_t)pos);
886 		releasef(fd);
887 	} else
888 		error = EBADF;
889 
890 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
891 
892 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
893 }
894 
895 /*
896  * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
897  *
898  * Now things really get ugly because we can't use the C-style
899  * calling convention for more than 6 args, and 64-bit parameter
900  * passing on 32-bit systems is less than clean.
901  */
902 
/*
 * Argument block read by smmaplf32().  The members after len hold 32-bit
 * values; the width of the cell each one occupies depends on whether the
 * kernel is built _LP64.
 */
struct mmaplf32a {
	caddr_t addr;		/* address argument handed to smmap_common() */
	size_t len;		/* length of the mapping */
#ifdef _LP64
	/*
	 * 32-bit contents, 64-bit cells
	 */
	uint64_t prot;		/* PROT_* bits; read as an int */
	uint64_t flags;		/* MAP_* bits; read as an int */
	uint64_t fd;		/* file descriptor; read as an int */
	/*
	 * The two offset cells are combined by smmaplf32() into one 64-bit
	 * offset; which of them supplies the upper 32 bits depends on
	 * _BIG_ENDIAN.
	 */
	uint64_t offhi;
	uint64_t offlo;
#else
	/*
	 * 32-bit contents, 32-bit cells
	 */
	uint32_t prot;		/* PROT_* bits; read as an int */
	uint32_t flags;		/* MAP_* bits; read as an int */
	uint32_t fd;		/* file descriptor; read as an int */
	/*
	 * The two offset cells are combined by smmaplf32() into one 64-bit
	 * offset; which of them supplies the upper 32 bits depends on
	 * _BIG_ENDIAN.
	 */
	uint32_t offhi;
	uint32_t offlo;
#endif
};
926 
927 int
928 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
929 {
930 	struct file *fp;
931 	int error;
932 	caddr_t a = uap->addr;
933 	int flags = (int)uap->flags;
934 	int fd = (int)uap->fd;
935 #ifdef _BIG_ENDIAN
936 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
937 #else
938 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
939 #endif
940 
941 	if (flags & _MAP_LOW32)
942 		error = EINVAL;
943 	else if (fd == -1 && (flags & MAP_ANON) != 0)
944 		error = smmap_common(&a, uap->len, (int)uap->prot,
945 		    flags | _MAP_LOW32, NULL, off);
946 	else if ((fp = getf(fd)) != NULL) {
947 		error = smmap_common(&a, uap->len, (int)uap->prot,
948 		    flags | _MAP_LOW32, fp, off);
949 		releasef(fd);
950 	} else
951 		error = EBADF;
952 
953 	if (error == 0)
954 		rvp->r_val1 = (uintptr_t)a;
955 	return (error);
956 }
957 
958 #endif	/* _SYSCALL32_IMPL || _ILP32 */
959 
960 int
961 munmap(caddr_t addr, size_t len)
962 {
963 	struct proc *p = curproc;
964 	struct as *as = p->p_as;
965 
966 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
967 		return (set_errno(EINVAL));
968 
969 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
970 		return (set_errno(EINVAL));
971 
972 	/*
973 	 * Discard lwpchan mappings.
974 	 */
975 	if (p->p_lcp != NULL)
976 		lwpchan_delete_mapping(p, addr, addr + len);
977 	if (as_unmap(as, addr, len) != 0)
978 		return (set_errno(EINVAL));
979 
980 	return (0);
981 }
982 
983 int
984 mprotect(caddr_t addr, size_t len, int prot)
985 {
986 	struct as *as = curproc->p_as;
987 	uint_t uprot = prot | PROT_USER;
988 	int error;
989 
990 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
991 		return (set_errno(EINVAL));
992 
993 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
994 	case RANGE_OKAY:
995 		break;
996 	case RANGE_BADPROT:
997 		return (set_errno(ENOTSUP));
998 	case RANGE_BADADDR:
999 	default:
1000 		return (set_errno(ENOMEM));
1001 	}
1002 
1003 	error = as_setprot(as, addr, len, uprot);
1004 	if (error)
1005 		return (set_errno(error));
1006 	return (0);
1007 }
1008 
1009 #define	MC_CACHE	128			/* internal result buffer */
1010 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
1011 
1012 int
1013 mincore(caddr_t addr, size_t len, char *vecp)
1014 {
1015 	struct as *as = curproc->p_as;
1016 	caddr_t ea;			/* end address of loop */
1017 	size_t rl;			/* inner result length */
1018 	char vec[MC_CACHE];		/* local vector cache */
1019 	int error;
1020 	model_t model;
1021 	long	llen;
1022 
1023 	model = get_udatamodel();
1024 	/*
1025 	 * Validate form of address parameters.
1026 	 */
1027 	if (model == DATAMODEL_NATIVE) {
1028 		llen = (long)len;
1029 	} else {
1030 		llen = (int32_t)(size32_t)len;
1031 	}
1032 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1033 		return (set_errno(EINVAL));
1034 
1035 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1036 		return (set_errno(ENOMEM));
1037 
1038 	/*
1039 	 * Loop over subranges of interval [addr : addr + len), recovering
1040 	 * results internally and then copying them out to caller.  Subrange
1041 	 * is based on the size of MC_CACHE, defined above.
1042 	 */
1043 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1044 		error = as_incore(as, addr,
1045 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1046 		if (rl != 0) {
1047 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
1048 			if (copyout(vec, vecp, rl) != 0)
1049 				return (set_errno(EFAULT));
1050 			vecp += rl;
1051 		}
1052 		if (error != 0)
1053 			return (set_errno(ENOMEM));
1054 	}
1055 	return (0);
1056 }
1057