/* usr/src/uts/common/os/grow.c (revision 40e5e17b3361b3eea56a9723071c406894a20b78) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/inttypes.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/var.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/vmparam.h>
#include <sys/fcntl.h>
#include <sys/lwpchan_impl.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>

int use_brk_lpg = 1;
int use_stk_lpg = 1;

static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);

int
brk(caddr_t nva)
{
	int error;
	proc_t *p = curproc;

	/*
	 * Serialize brk operations on an address space.
	 * This also serves as the lock protecting p_brksize
	 * and p_brkpageszc.
	 */
	as_rangelock(p->p_as);
	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
		error = brk_lpg(nva);
	} else {
		error = brk_internal(nva, p->p_brkpageszc);
	}
	as_rangeunlock(p->p_as);
	return ((error != 0 ? set_errno(error) : 0));
}
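/*
 * Illustration (not from the original source): a user-level sbrk(3C)
 * call typically ends up here.  libc keeps the exact, unrounded break
 * in a private variable (see the comment in brk_internal() below) and
 * traps into brk() with the new end address, e.g. with the break at
 * 0x2b000, sbrk(8192) would issue brk(0x2d000).  On success brk()
 * returns 0; on failure the error (commonly ENOMEM) is posted via
 * set_errno() and the caller's brk(2) returns -1.
 */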

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call brk_internal().
 * Returns 0 on success.
 */
static int
brk_lpg(caddr_t nva)
{
	struct proc *p = curproc;
	size_t pgsz, len;
	caddr_t addr, brkend;
	caddr_t bssbase = p->p_bssbase;
	caddr_t brkbase = p->p_brkbase;
	int oszc, szc;
	int err;

	oszc = p->p_brkpageszc;

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk_internal() will initialize it.
	 */
	if (brkbase == 0) {
		return (brk_internal(nva, oszc));
	}

	len = nva - bssbase;

	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
	szc = page_szc(pgsz);

	/*
	 * Covers two cases:
	 * 1. page_szc() returns -1 for invalid page size, so we want to
	 * ignore it in that case.
	 * 2. By design we never decrease page size, as it is more stable.
	 */
	if (szc <= oszc) {
		err = brk_internal(nva, oszc);
		/* If failed, back off to base page size. */
		if (err != 0 && oszc != 0) {
			err = brk_internal(nva, 0);
		}
		return (err);
	}

	err = brk_internal(nva, szc);
	/* If using szc failed, map with base page size and return. */
	if (err != 0) {
		if (szc != 0) {
			err = brk_internal(nva, 0);
		}
		return (err);
	}

	/*
	 * Round up brk base to a large page boundary and remap
	 * anything in the segment already faulted in beyond that
	 * point.
	 */
	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
	brkend = brkbase + p->p_brksize;
	len = brkend - addr;
	/* Check that len is not negative. Update page size code for heap. */
	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
		p->p_brkpageszc = szc;
	}

	ASSERT(err == 0);
	return (err);		/* should always be 0 */
}
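/*
 * Worked example (hypothetical numbers, for illustration only): with
 * p_bssbase = 0x2a000 and a 4 MB candidate page size (pgsz = 0x400000),
 * P2ROUNDUP() yields addr = 0x400000.  brk_internal(nva, szc) has
 * already rounded the new break up to pgsz, so if the rounded break is
 * 0xc00000 then len = 0x800000 -- two naturally aligned 4 MB pages --
 * and as_setpagesize() promotes just that region; the sub-pgsz tail
 * below 0x400000 stays at its existing page size.
 */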

/*
 * Returns 0 on success.
 */
int
brk_internal(caddr_t nva, uint_t brkszc)
{
	caddr_t ova;			/* current break address */
	size_t size;
	int	error;
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t pgsz;
	uint_t szc;
	rctl_qty_t as_rctl;

	/*
	 * extend heap to brkszc alignment but use current p->p_brkpageszc
	 * for the newly created segment. This allows the new extension
	 * segment to be concatenated successfully with the existing brk
	 * segment.
	 */
	if ((szc = brkszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
	} else {
		pgsz = PAGESIZE;
	}

	mutex_enter(&p->p_lock);
	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
	    p->p_rctls, p);
	mutex_exit(&p->p_lock);

	/*
	 * If p_brkbase has not yet been set, the first call
	 * to brk() will initialize it.
	 */
	if (p->p_brkbase == 0)
		p->p_brkbase = nva;

	/*
	 * Before multiple page size support existed p_brksize was the value
	 * not rounded to the pagesize (i.e. it stored the exact user request
	 * for heap size). If pgsz is greater than PAGESIZE calculate the
	 * heap size as the real new heap size by rounding it up to pgsz.
	 * This is useful since we may want to know where the heap ends
	 * without knowing heap pagesize (e.g. some old code) and also if
	 * heap pagesize changes we can update p_brkpageszc but delay adding
	 * new mapping yet still know from p_brksize where the heap really
	 * ends. The user requested heap end is stored in libc variable.
	 */
	if (pgsz > PAGESIZE) {
		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
		size = tnva - p->p_brkbase;
		if (tnva < p->p_brkbase || (size > p->p_brksize &&
		    size > (size_t)as_rctl)) {
			szc = 0;
			pgsz = PAGESIZE;
			size = nva - p->p_brkbase;
		}
	} else {
		size = nva - p->p_brkbase;
	}

	/*
	 * use PAGESIZE to roundup ova because we want to know the real value
	 * of the current heap end in case p_brkpageszc changes since the last
	 * p_brksize was computed.
	 */
	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
		PAGESIZE);

	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
	    size > as_rctl)) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
		    RCA_SAFE);
		mutex_exit(&p->p_lock);
		return (ENOMEM);
	}

	if (nva > ova) {
		struct segvn_crargs crargs =
		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

		if (!(p->p_datprot & PROT_EXEC)) {
			crargs.prot &= ~PROT_EXEC;
		}

		/*
		 * Add new zfod mapping to extend UNIX data segment
		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
		 * page sizes if ova is not aligned to szc's pgsz.
		 */
		if (szc > 0) {
			caddr_t rbss;

			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
			    pgsz);
			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
				    AS_MAP_NO_LPOOB;
			} else if (ova == rbss) {
				crargs.szc = szc;
			} else {
				crargs.szc = AS_MAP_HEAP;
			}
		} else {
			crargs.szc = AS_MAP_NO_LPOOB;
		}
		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
		    &crargs);
		if (error) {
			return (error);
		}

	} else if (nva < ova) {
		/*
		 * Release mapping to shrink UNIX data segment.
		 */
		(void) as_unmap(as, nva, (size_t)(ova - nva));
	}
	p->p_brksize = size;
	return (0);
}
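/*
 * Worked example (hypothetical numbers, for illustration only): with
 * p_brkbase = 0x22000, a requested break of nva = 0x6a3000 and
 * pgsz = 0x400000, tnva = P2ROUNDUP(0x6a3000, 0x400000) = 0x800000 and
 * p_brksize becomes 0x7de000 (the pgsz-rounded size) even though the
 * caller asked for less; per the comment above, the exact user-requested
 * break is remembered by libc, not by the kernel.
 */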

/*
 * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 * This routine assumes that the stack grows downward.
 */
int
grow(caddr_t sp)
{
	struct proc *p = curproc;
	struct as *as = p->p_as;
	size_t oldsize = p->p_stksize;
	size_t newsize;
	int err;

	/*
	 * Serialize grow operations on an address space.
	 * This also serves as the lock protecting p_stksize
	 * and p_stkpageszc.
	 */
	as_rangelock(as);
	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
		err = grow_lpg(sp);
	} else {
		err = grow_internal(sp, p->p_stkpageszc);
	}
	as_rangeunlock(as);

	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
		/*
		 * Set up translations so the process doesn't have to fault in
		 * the stack pages we just gave it.
		 */
		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
		    newsize - oldsize, F_INVAL, S_WRITE);
	}
	return ((err == 0 ? 1 : 0));
}
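/*
 * Usage note (not part of the original file): grow() is the kernel-side
 * helper behind automatic stack growth.  Typically the page-fault path
 * calls it when a fault address lies below the current stack segment,
 * e.g. (void) grow(fault_addr); a nonzero return means the stack now
 * covers fault_addr and the faulting access can simply be retried.
 */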

/*
 * Algorithm: call arch-specific map_pgsz to get best page size to use,
 * then call grow_internal().
 * Returns 0 on success.
 */
static int
grow_lpg(caddr_t sp)
{
	struct proc *p = curproc;
	size_t pgsz;
	size_t len, newsize;
	caddr_t addr, saddr;
	caddr_t growend;
	int oszc, szc;
	int err;

	newsize = p->p_usrstack - sp;

	oszc = p->p_stkpageszc;
	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
	szc = page_szc(pgsz);

	/*
	 * Covers two cases:
	 * 1. page_szc() returns -1 for invalid page size, so we want to
	 * ignore it in that case.
	 * 2. By design we never decrease page size, as it is more stable.
	 * This shouldn't happen as the stack never shrinks.
	 */
	if (szc <= oszc) {
		err = grow_internal(sp, oszc);
		/* failed, fall back to base page size */
		if (err != 0 && oszc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * We've grown sufficiently to switch to a new page size.
	 * So we are going to remap the whole segment with the new page size.
	 */
	err = grow_internal(sp, szc);
	/* The grow with szc failed, so fall back to base page size. */
	if (err != 0) {
		if (szc != 0) {
			err = grow_internal(sp, 0);
		}
		return (err);
	}

	/*
	 * Round up stack pointer to a large page boundary and remap
	 * any pgsz pages in the segment already faulted in beyond that
	 * point.
	 */
	saddr = p->p_usrstack - p->p_stksize;
	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
	len = growend - addr;
	/* Check that len is not negative. Update page size code for stack. */
	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
		p->p_stkpageszc = szc;
	}

	ASSERT(err == 0);
	return (err);		/* should always be 0 */
}
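/*
 * Worked example (hypothetical numbers, for illustration only): with
 * p_usrstack = 0xffc00000, p_stksize = 0x900000 and pgsz = 0x400000,
 * saddr = 0xff300000, addr = P2ROUNDUP(saddr, pgsz) = 0xff400000 and
 * growend = P2ALIGN(0xffc00000, pgsz) = 0xffc00000, so len = 0x800000
 * and only those two aligned 4 MB pages in the middle of the stack are
 * promoted; the unaligned low end keeps its current page size.
 */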

/*
 * This routine assumes that the stack grows downward.
 * Returns 0 on success, errno on failure.
 */
int
grow_internal(caddr_t sp, uint_t growszc)
{
	struct proc *p = curproc;
	size_t newsize;
	size_t oldsize;
	int    error;
	size_t pgsz;
	uint_t szc;
	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);

	ASSERT(sp < p->p_usrstack);
	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);

	/*
	 * grow to growszc alignment but use current p->p_stkpageszc for
	 * the segvn_crargs szc passed to segvn_create. For memcntl to
	 * increase the szc, this allows the new extension segment to be
	 * concatenated successfully with the existing stack segment.
	 */
	if ((szc = growszc) != 0) {
		pgsz = page_get_pagesize(szc);
		ASSERT(pgsz > PAGESIZE);
		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
		if (newsize > (size_t)p->p_stk_ctl) {
			szc = 0;
			pgsz = PAGESIZE;
			newsize = p->p_usrstack - sp;
		}
	} else {
		pgsz = PAGESIZE;
		newsize = p->p_usrstack - sp;
	}

	if (newsize > (size_t)p->p_stk_ctl) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
		    RCA_UNSAFE_ALL);

		return (ENOMEM);
	}

	oldsize = p->p_stksize;
	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);

	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
		return (0);
	}

	if (!(p->p_stkprot & PROT_EXEC)) {
		crargs.prot &= ~PROT_EXEC;
	}
	/*
	 * extend stack with the proposed new growszc, which is different
	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
	 * if not aligned to szc's pgsz.
	 */
	if (szc > 0) {
		caddr_t oldsp = p->p_usrstack - oldsize;
		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
		    pgsz);

		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
			    AS_MAP_NO_LPOOB;
		} else if (oldsp == austk) {
			crargs.szc = szc;
		} else {
			crargs.szc = AS_MAP_STACK;
		}
	} else {
		crargs.szc = AS_MAP_NO_LPOOB;
	}
	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;

	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
	    segvn_create, &crargs)) != 0) {
		if (error == EAGAIN) {
			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
		}
		return (error);
	}
	p->p_stksize = newsize;
	return (0);
}
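/*
 * Worked example (hypothetical numbers, for illustration only): with
 * p_usrstack = 0xffc00000 and a fault at sp = 0xffbfc123, sp is first
 * aligned down to 0xffbfc000; with growszc == 0 that gives
 * newsize = 0x4000, and if oldsize was 0x3000 a one-page zfod extension
 * is mapped at 0xffbfc000 before p_stksize is updated to 0x4000.
 */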

/*
 * Used for MAP_ANON - fast way to get anonymous pages
 */
static int
zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
    offset_t pos)
{
	struct segvn_crargs vn_a;

	if (((PROT_ALL & uprot) != uprot))
		return (EACCES);

	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address.  First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
		(void) as_unmap(as, *addrp, len);
	} else {
		/*
		 * No need to worry about vac alignment for anonymous
		 * pages since this is a "clone" object that doesn't
		 * yet exist.
		 */
		map_addr(addrp, len, pos, 0, flags);
		if (*addrp == NULL)
			return (ENOMEM);
	}

	/*
	 * Use the seg_vn segment driver; passing in the NULL amp
	 * gives the desired "cloning" effect.
	 */
	vn_a.vp = NULL;
	vn_a.offset = 0;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = uprot;
	vn_a.maxprot = PROT_ALL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.cred = CRED();
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	return (as_map(as, *addrp, len, segvn_create, &vn_a));
}
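/*
 * Usage sketch (illustrative, not from the original source): zmap() is
 * the path taken by anonymous mappings with no file descriptor, e.g.
 *
 *	void *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 * smmap_common() sees fp == NULL, takes as_rangelock(), and calls zmap()
 * to pick an address via map_addr() and create a zero-fill seg_vn
 * segment backed by no vnode and no preexisting anon map.
 */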

static int
smmap_common(caddr_t *addrp, size_t len,
    int prot, int flags, struct file *fp, offset_t pos)
{
	struct vnode *vp;
	struct as *as = curproc->p_as;
	uint_t uprot, maxprot, type;
	int error;

	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
	    MAP_TEXT | MAP_INITDATA)) != 0) {
		/* | MAP_RENAME */	/* not implemented, let user know */
		return (EINVAL);
	}

	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
		return (EINVAL);
	}

	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
		return (EINVAL);
	}

#if defined(__sparc)
	/*
	 * See if this is an "old mmap call".  If so, remember this
	 * fact and convert the flags value given to mmap to indicate
	 * the specified address in the system call must be used.
	 * _MAP_NEW is set by all new uses of mmap.
	 */
	if ((flags & _MAP_NEW) == 0)
		flags |= MAP_FIXED;
#endif
	flags &= ~_MAP_NEW;

	type = flags & MAP_TYPE;
	if (type != MAP_PRIVATE && type != MAP_SHARED)
		return (EINVAL);


	if (flags & MAP_ALIGN) {

		if (flags & MAP_FIXED)
			return (EINVAL);

		/* alignment needs to be a power of 2 >= page size */
		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
			!ISP2((uintptr_t)*addrp))
			return (EINVAL);
	}
	/*
	 * Check for bad lengths and file position.
	 * We let the VOP_MAP routine check for negative lengths
	 * since on some vnode types this might be appropriate.
	 */
	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
		return (EINVAL);

	maxprot = PROT_ALL;		/* start out allowing all accesses */
	uprot = prot | PROT_USER;

	if (fp == NULL) {
		ASSERT(flags & MAP_ANON);
		as_rangelock(as);
		error = zmap(as, addrp, len, uprot, flags, pos);
		as_rangeunlock(as);
		return (error);
	} else if ((flags & MAP_ANON) != 0)
		return (EINVAL);

	vp = fp->f_vnode;

	/* Can't execute code from "noexec" mounted filesystem. */
	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
		maxprot &= ~PROT_EXEC;

	/*
	 * These checks were added as part of large files.
	 *
	 * Return ENXIO if the initial position is negative; return EOVERFLOW
	 * if (offset + len) would overflow the maximum allowed offset for the
	 * type of file descriptor being used.
	 */
	if (vp->v_type == VREG) {
		if (pos < 0)
			return (ENXIO);
		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
			return (EOVERFLOW);
	}

	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
		/* no write access allowed */
		maxprot &= ~PROT_WRITE;
	}

	/*
	 * XXX - Do we also adjust maxprot based on protections
	 * of the vnode?  E.g. if no execute permission is given
	 * on the vnode for the current user, maxprot probably
	 * should disallow PROT_EXEC also?  This is different
	 * from the write access as this would be a per vnode
	 * test as opposed to a per fd test for writability.
	 */

	/*
	 * Verify that the specified protections are not greater than
	 * the maximum allowable protections.  Also test to make sure
	 * that the file descriptor allows read access, since "write only"
	 * mappings are hard to do: normally we read the page in from the
	 * file before it can be written.
	 */
	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
		return (EACCES);

	/*
	 * If the user specified an address, do some simple checks here
	 */
	if ((flags & MAP_FIXED) != 0) {
		caddr_t userlimit;

		/*
		 * Use the user address.  First verify that
		 * the address to be used is page aligned.
		 * Then make some simple bounds checks.
		 */
		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
			return (EINVAL);

		userlimit = flags & _MAP_LOW32 ?
		    (caddr_t)USERLIMIT32 : as->a_userlimit;
		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
		case RANGE_OKAY:
			break;
		case RANGE_BADPROT:
			return (ENOTSUP);
		case RANGE_BADADDR:
		default:
			return (ENOMEM);
		}
	}


	/*
	 * Ok, now let the vnode map routine do its thing to set things up.
	 */
	error = VOP_MAP(vp, pos, as,
	    addrp, len, uprot, maxprot, flags, fp->f_cred);

	if (error == 0) {
		if (vp->v_type == VREG &&
		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
			/*
			 * Mark this as an executable vnode
			 */
			mutex_enter(&vp->v_lock);
			vp->v_flag |= VVMEXEC;
			mutex_exit(&vp->v_lock);
		}
	}

	return (error);
}
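/*
 * Illustration (hypothetical call, not from the original source): the
 * maxprot trimming above is why a writable MAP_SHARED mapping of a file
 * opened O_RDONLY fails, e.g.
 *
 *	fd = open("data", O_RDONLY);
 *	mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * FWRITE is clear, so maxprot loses PROT_WRITE and the
 * (maxprot & uprot) != uprot test returns EACCES before VOP_MAP() is
 * ever called.
 */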

#ifdef _LP64
/*
 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 *
 * The "large file" mmap routine mmap64(2) is also mapped to this routine
 * by the 64-bit version of libc.
 *
 * Eventually, this should be the only version, and have smmap_common()
 * folded back into it again.  Some day.
 */
caddr_t
smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
{
	struct file *fp;
	int error;

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&addr, len, prot, flags,
		    NULL, (offset_t)pos);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&addr, len, prot, flags,
		    fp, (offset_t)pos);
		releasef(fd);
	} else
		error = EBADF;

	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
}
#endif	/* _LP64 */

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)

/*
 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 */
caddr_t
smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
{
	struct file *fp;
	int error;
	caddr_t a = (caddr_t)(uintptr_t)addr;

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&a, (size_t)len, prot,
		    flags | _MAP_LOW32, NULL, (offset_t)pos);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&a, (size_t)len, prot,
		    flags | _MAP_LOW32, fp, (offset_t)pos);
		releasef(fd);
	} else
		error = EBADF;

	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);

	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
}

/*
 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 *
 * Now things really get ugly because we can't use the C-style
 * calling convention for more than 6 args, and 64-bit parameter
 * passing on 32-bit systems is less than clean.
 */

struct mmaplf32a {
	caddr_t addr;
	size_t len;
#ifdef _LP64
	/*
	 * 32-bit contents, 64-bit cells
	 */
	uint64_t prot;
	uint64_t flags;
	uint64_t fd;
	uint64_t offhi;
	uint64_t offlo;
#else
	/*
	 * 32-bit contents, 32-bit cells
	 */
	uint32_t prot;
	uint32_t flags;
	uint32_t fd;
	uint32_t offhi;
	uint32_t offlo;
#endif
};

int
smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
{
	struct file *fp;
	int error;
	caddr_t a = uap->addr;
	int flags = (int)uap->flags;
	int fd = (int)uap->fd;
#ifdef _BIG_ENDIAN
	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
#else
	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
#endif

	if (flags & _MAP_LOW32)
		error = EINVAL;
	else if (fd == -1 && (flags & MAP_ANON) != 0)
		error = smmap_common(&a, uap->len, (int)uap->prot,
		    flags | _MAP_LOW32, NULL, off);
	else if ((fp = getf(fd)) != NULL) {
		error = smmap_common(&a, uap->len, (int)uap->prot,
		    flags | _MAP_LOW32, fp, off);
		releasef(fd);
	} else
		error = EBADF;

	if (error == 0)
		rvp->r_val1 = (uintptr_t)a;
	return (error);
}
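/*
 * Illustration (hypothetical values): a 32-bit program calling mmap64()
 * with a 6 GB file offset passes 0x180000000 split into two 32-bit
 * cells, high word 0x1 and low word 0x80000000; smmaplf32() reassembles
 * them into a single offset_t (which cell carries the high half depends
 * on _BIG_ENDIAN, per the #ifdef above) and hands the result to
 * smmap_common() with _MAP_LOW32 set so the mapping lands below the
 * 32-bit address limit.
 */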

#endif	/* _SYSCALL32_IMPL || _ILP32 */

int
munmap(caddr_t addr, size_t len)
{
	struct proc *p = curproc;
	struct as *as = p->p_as;

	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
		return (set_errno(EINVAL));

	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
		return (set_errno(EINVAL));

	/*
	 * Discard lwpchan mappings.
	 */
	if (p->p_lcp != NULL)
		lwpchan_delete_mapping(p, addr, addr + len);
	if (as_unmap(as, addr, len) != 0)
		return (set_errno(EINVAL));

	return (0);
}

int
mprotect(caddr_t addr, size_t len, int prot)
{
	struct as *as = curproc->p_as;
	uint_t uprot = prot | PROT_USER;
	int error;

	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
		return (set_errno(EINVAL));

	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
	case RANGE_OKAY:
		break;
	case RANGE_BADPROT:
		return (set_errno(ENOTSUP));
	case RANGE_BADADDR:
	default:
		return (set_errno(ENOMEM));
	}

	error = as_setprot(as, addr, len, uprot);
	if (error)
		return (set_errno(error));
	return (0);
}

#define	MC_CACHE	128			/* internal result buffer */
#define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */

int
mincore(caddr_t addr, size_t len, char *vecp)
{
	struct as *as = curproc->p_as;
	caddr_t ea;			/* end address of loop */
	size_t rl;			/* inner result length */
	char vec[MC_CACHE];		/* local vector cache */
	int error;
	model_t model;
	long	llen;

	model = get_udatamodel();
	/*
	 * Validate form of address parameters.
	 */
	if (model == DATAMODEL_NATIVE) {
		llen = (long)len;
	} else {
		llen = (int32_t)(size32_t)len;
	}
	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
		return (set_errno(EINVAL));

	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
		return (set_errno(ENOMEM));

	/*
	 * Loop over subranges of interval [addr : addr + len), recovering
	 * results internally and then copying them out to caller.  Subrange
	 * is based on the size of MC_CACHE, defined above.
	 */
	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
		error = as_incore(as, addr,
		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
		if (rl != 0) {
			rl = (rl + PAGESIZE - 1) / PAGESIZE;
			if (copyout(vec, vecp, rl) != 0)
				return (set_errno(EFAULT));
			vecp += rl;
		}
		if (error != 0)
			return (set_errno(ENOMEM));
	}
	return (0);
}
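/*
 * Illustration (hypothetical numbers): with 4 KB pages, MC_QUANTUM is
 * 128 * 4 KB = 512 KB, so mincore() over a 1 MB range runs the loop
 * twice; each pass asks as_incore() about up to 128 pages, converts the
 * byte count it reports into a page count, and copies at most 128
 * one-byte residency flags out to the user's vector before advancing.
 */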
942