xref: /titanic_44/usr/src/uts/common/os/grow.c (revision 4a634bb80136cc001d14ab96addd9915105e5223)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/types.h>
33 #include <sys/inttypes.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/errno.h>
40 #include <sys/var.h>
41 #include <sys/proc.h>
42 #include <sys/tuneable.h>
43 #include <sys/debug.h>
44 #include <sys/cmn_err.h>
45 #include <sys/cred.h>
46 #include <sys/vnode.h>
47 #include <sys/vfs.h>
48 #include <sys/vm.h>
49 #include <sys/file.h>
50 #include <sys/mman.h>
51 #include <sys/vmparam.h>
52 #include <sys/fcntl.h>
53 #include <sys/lwpchan_impl.h>
54 #include <sys/nbmlock.h>
55 
56 #include <vm/hat.h>
57 #include <vm/as.h>
58 #include <vm/seg.h>
59 #include <vm/seg_dev.h>
60 #include <vm/seg_vn.h>
61 
/*
 * Switches for the automatic large-page paths in this file.  brk() takes
 * the brk_lpg() path, and grow() the grow_lpg() path, only when the
 * matching variable is non-zero and the process has SAUTOLPG set in
 * p_flag; otherwise the *_internal() routine is called directly with the
 * process's current page size code.
 */
int use_brk_lpg = 1;
int use_stk_lpg = 1;

/* Forward declarations of the file-local large-page helpers. */
static int brk_lpg(caddr_t nva);
static int grow_lpg(caddr_t sp);
67 
68 int
69 brk(caddr_t nva)
70 {
71 	int error;
72 	proc_t *p = curproc;
73 
74 	/*
75 	 * Serialize brk operations on an address space.
76 	 * This also serves as the lock protecting p_brksize
77 	 * and p_brkpageszc.
78 	 */
79 	as_rangelock(p->p_as);
80 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
81 		error = brk_lpg(nva);
82 	} else {
83 		error = brk_internal(nva, p->p_brkpageszc);
84 	}
85 	as_rangeunlock(p->p_as);
86 	return ((error != 0 ? set_errno(error) : 0));
87 }
88 
89 /*
90  * Algorithm: call arch-specific map_pgsz to get best page size to use,
91  * then call brk_internal().
92  * Returns 0 on success.
93  */
94 static int
95 brk_lpg(caddr_t nva)
96 {
97 	struct proc *p = curproc;
98 	size_t pgsz, len;
99 	caddr_t addr, brkend;
100 	caddr_t bssbase = p->p_bssbase;
101 	caddr_t brkbase = p->p_brkbase;
102 	int oszc, szc;
103 	int err;
104 
105 	oszc = p->p_brkpageszc;
106 
107 	/*
108 	 * If p_brkbase has not yet been set, the first call
109 	 * to brk_internal() will initialize it.
110 	 */
111 	if (brkbase == 0) {
112 		return (brk_internal(nva, oszc));
113 	}
114 
115 	len = nva - bssbase;
116 
117 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
118 	szc = page_szc(pgsz);
119 
120 	/*
121 	 * Covers two cases:
122 	 * 1. page_szc() returns -1 for invalid page size, so we want to
123 	 * ignore it in that case.
124 	 * 2. By design we never decrease page size, as it is more stable.
125 	 */
126 	if (szc <= oszc) {
127 		err = brk_internal(nva, oszc);
128 		/* If failed, back off to base page size. */
129 		if (err != 0 && oszc != 0) {
130 			err = brk_internal(nva, 0);
131 		}
132 		return (err);
133 	}
134 
135 	err = brk_internal(nva, szc);
136 	/* If using szc failed, map with base page size and return. */
137 	if (err != 0) {
138 		if (szc != 0) {
139 			err = brk_internal(nva, 0);
140 		}
141 		return (err);
142 	}
143 
144 	/*
145 	 * Round up brk base to a large page boundary and remap
146 	 * anything in the segment already faulted in beyond that
147 	 * point.
148 	 */
149 	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
150 	brkend = brkbase + p->p_brksize;
151 	len = brkend - addr;
152 	/* Check that len is not negative. Update page size code for heap. */
153 	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
154 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
155 		p->p_brkpageszc = szc;
156 	}
157 
158 	ASSERT(err == 0);
159 	return (err);		/* should always be 0 */
160 }
161 
162 /*
163  * Returns 0 on success.
164  */
165 int
166 brk_internal(caddr_t nva, uint_t brkszc)
167 {
168 	caddr_t ova;			/* current break address */
169 	size_t size;
170 	int	error;
171 	struct proc *p = curproc;
172 	struct as *as = p->p_as;
173 	size_t pgsz;
174 	uint_t szc;
175 	rctl_qty_t as_rctl;
176 
177 	/*
178 	 * extend heap to brkszc alignment but use current p->p_brkpageszc
179 	 * for the newly created segment. This allows the new extension
180 	 * segment to be concatenated successfully with the existing brk
181 	 * segment.
182 	 */
183 	if ((szc = brkszc) != 0) {
184 		pgsz = page_get_pagesize(szc);
185 		ASSERT(pgsz > PAGESIZE);
186 	} else {
187 		pgsz = PAGESIZE;
188 	}
189 
190 	mutex_enter(&p->p_lock);
191 	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
192 	    p->p_rctls, p);
193 	mutex_exit(&p->p_lock);
194 
195 	/*
196 	 * If p_brkbase has not yet been set, the first call
197 	 * to brk() will initialize it.
198 	 */
199 	if (p->p_brkbase == 0)
200 		p->p_brkbase = nva;
201 
202 	/*
203 	 * Before multiple page size support existed p_brksize was the value
204 	 * not rounded to the pagesize (i.e. it stored the exact user request
205 	 * for heap size). If pgsz is greater than PAGESIZE calculate the
206 	 * heap size as the real new heap size by rounding it up to pgsz.
207 	 * This is useful since we may want to know where the heap ends
208 	 * without knowing heap pagesize (e.g. some old code) and also if
209 	 * heap pagesize changes we can update p_brkpageszc but delay adding
210 	 * new mapping yet still know from p_brksize where the heap really
211 	 * ends. The user requested heap end is stored in libc variable.
212 	 */
213 	if (pgsz > PAGESIZE) {
214 		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
215 		size = tnva - p->p_brkbase;
216 		if (tnva < p->p_brkbase || (size > p->p_brksize &&
217 		    size > (size_t)as_rctl)) {
218 			szc = 0;
219 			pgsz = PAGESIZE;
220 			size = nva - p->p_brkbase;
221 		}
222 	} else {
223 		size = nva - p->p_brkbase;
224 	}
225 
226 	/*
227 	 * use PAGESIZE to roundup ova because we want to know the real value
228 	 * of the current heap end in case p_brkpageszc changes since the last
229 	 * p_brksize was computed.
230 	 */
231 	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
232 	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
233 	    PAGESIZE);
234 
235 	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
236 	    size > as_rctl)) {
237 		mutex_enter(&p->p_lock);
238 		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
239 		    RCA_SAFE);
240 		mutex_exit(&p->p_lock);
241 		return (ENOMEM);
242 	}
243 
244 	if (nva > ova) {
245 		struct segvn_crargs crargs =
246 		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
247 
248 		if (!(p->p_datprot & PROT_EXEC)) {
249 			crargs.prot &= ~PROT_EXEC;
250 		}
251 
252 		/*
253 		 * Add new zfod mapping to extend UNIX data segment
254 		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
255 		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
256 		 * page sizes if ova is not aligned to szc's pgsz.
257 		 */
258 		if (szc > 0) {
259 			caddr_t rbss;
260 
261 			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
262 			    pgsz);
263 			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
264 				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
265 				    AS_MAP_NO_LPOOB;
266 			} else if (ova == rbss) {
267 				crargs.szc = szc;
268 			} else {
269 				crargs.szc = AS_MAP_HEAP;
270 			}
271 		} else {
272 			crargs.szc = AS_MAP_NO_LPOOB;
273 		}
274 		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
275 		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
276 		    &crargs);
277 		if (error) {
278 			return (error);
279 		}
280 
281 	} else if (nva < ova) {
282 		/*
283 		 * Release mapping to shrink UNIX data segment.
284 		 */
285 		(void) as_unmap(as, nva, (size_t)(ova - nva));
286 	}
287 	p->p_brksize = size;
288 	return (0);
289 }
290 
291 /*
292  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
293  * This routine assumes that the stack grows downward.
294  */
295 int
296 grow(caddr_t sp)
297 {
298 	struct proc *p = curproc;
299 	struct as *as = p->p_as;
300 	size_t oldsize = p->p_stksize;
301 	size_t newsize;
302 	int err;
303 
304 	/*
305 	 * Serialize grow operations on an address space.
306 	 * This also serves as the lock protecting p_stksize
307 	 * and p_stkpageszc.
308 	 */
309 	as_rangelock(as);
310 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
311 		err = grow_lpg(sp);
312 	} else {
313 		err = grow_internal(sp, p->p_stkpageszc);
314 	}
315 	as_rangeunlock(as);
316 
317 	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
318 		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
319 		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
320 		/*
321 		 * Set up translations so the process doesn't have to fault in
322 		 * the stack pages we just gave it.
323 		 */
324 		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
325 		    newsize - oldsize, F_INVAL, S_WRITE);
326 	}
327 	return ((err == 0 ? 1 : 0));
328 }
329 
330 /*
331  * Algorithm: call arch-specific map_pgsz to get best page size to use,
332  * then call grow_internal().
333  * Returns 0 on success.
334  */
335 static int
336 grow_lpg(caddr_t sp)
337 {
338 	struct proc *p = curproc;
339 	size_t pgsz;
340 	size_t len, newsize;
341 	caddr_t addr, saddr;
342 	caddr_t growend;
343 	int oszc, szc;
344 	int err;
345 
346 	newsize = p->p_usrstack - sp;
347 
348 	oszc = p->p_stkpageszc;
349 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
350 	szc = page_szc(pgsz);
351 
352 	/*
353 	 * Covers two cases:
354 	 * 1. page_szc() returns -1 for invalid page size, so we want to
355 	 * ignore it in that case.
356 	 * 2. By design we never decrease page size, as it is more stable.
357 	 * This shouldn't happen as the stack never shrinks.
358 	 */
359 	if (szc <= oszc) {
360 		err = grow_internal(sp, oszc);
361 		/* failed, fall back to base page size */
362 		if (err != 0 && oszc != 0) {
363 			err = grow_internal(sp, 0);
364 		}
365 		return (err);
366 	}
367 
368 	/*
369 	 * We've grown sufficiently to switch to a new page size.
370 	 * So we are going to remap the whole segment with the new page size.
371 	 */
372 	err = grow_internal(sp, szc);
373 	/* The grow with szc failed, so fall back to base page size. */
374 	if (err != 0) {
375 		if (szc != 0) {
376 			err = grow_internal(sp, 0);
377 		}
378 		return (err);
379 	}
380 
381 	/*
382 	 * Round up stack pointer to a large page boundary and remap
383 	 * any pgsz pages in the segment already faulted in beyond that
384 	 * point.
385 	 */
386 	saddr = p->p_usrstack - p->p_stksize;
387 	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
388 	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
389 	len = growend - addr;
390 	/* Check that len is not negative. Update page size code for stack. */
391 	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
392 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
393 		p->p_stkpageszc = szc;
394 	}
395 
396 	ASSERT(err == 0);
397 	return (err);		/* should always be 0 */
398 }
399 
400 /*
401  * This routine assumes that the stack grows downward.
402  * Returns 0 on success, errno on failure.
403  */
404 int
405 grow_internal(caddr_t sp, uint_t growszc)
406 {
407 	struct proc *p = curproc;
408 	size_t newsize;
409 	size_t oldsize;
410 	int    error;
411 	size_t pgsz;
412 	uint_t szc;
413 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
414 
415 	ASSERT(sp < p->p_usrstack);
416 	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
417 
418 	/*
419 	 * grow to growszc alignment but use current p->p_stkpageszc for
420 	 * the segvn_crargs szc passed to segvn_create. For memcntl to
421 	 * increase the szc, this allows the new extension segment to be
422 	 * concatenated successfully with the existing stack segment.
423 	 */
424 	if ((szc = growszc) != 0) {
425 		pgsz = page_get_pagesize(szc);
426 		ASSERT(pgsz > PAGESIZE);
427 		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
428 		if (newsize > (size_t)p->p_stk_ctl) {
429 			szc = 0;
430 			pgsz = PAGESIZE;
431 			newsize = p->p_usrstack - sp;
432 		}
433 	} else {
434 		pgsz = PAGESIZE;
435 		newsize = p->p_usrstack - sp;
436 	}
437 
438 	if (newsize > (size_t)p->p_stk_ctl) {
439 		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
440 		    RCA_UNSAFE_ALL);
441 
442 		return (ENOMEM);
443 	}
444 
445 	oldsize = p->p_stksize;
446 	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
447 
448 	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
449 		return (0);
450 	}
451 
452 	if (!(p->p_stkprot & PROT_EXEC)) {
453 		crargs.prot &= ~PROT_EXEC;
454 	}
455 	/*
456 	 * extend stack with the proposed new growszc, which is different
457 	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
458 	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
459 	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
460 	 * if not aligned to szc's pgsz.
461 	 */
462 	if (szc > 0) {
463 		caddr_t oldsp = p->p_usrstack - oldsize;
464 		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
465 		    pgsz);
466 
467 		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
468 			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
469 			    AS_MAP_NO_LPOOB;
470 		} else if (oldsp == austk) {
471 			crargs.szc = szc;
472 		} else {
473 			crargs.szc = AS_MAP_STACK;
474 		}
475 	} else {
476 		crargs.szc = AS_MAP_NO_LPOOB;
477 	}
478 	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
479 
480 	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
481 	    segvn_create, &crargs)) != 0) {
482 		if (error == EAGAIN) {
483 			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
484 			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
485 		}
486 		return (error);
487 	}
488 	p->p_stksize = newsize;
489 	return (0);
490 }
491 
492 /*
493  * Find address for user to map.
494  * If MAP_FIXED is not specified, we can pick any address we want, but we will
495  * first try the value in *addrp if it is non-NULL.  Thus this is implementing
496  * a way to try and get a preferred address.
497  */
498 int
499 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
500     int vacalign, uint_t flags)
501 {
502 	caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
503 	size_t lenp = len;
504 
505 	ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
506 	if (flags & MAP_FIXED) {
507 		(void) as_unmap(as, *addrp, len);
508 		return (0);
509 	} else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
510 	    !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
511 		/* User supplied address was available */
512 		*addrp = basep;
513 	} else {
514 		/*
515 		 * No user supplied address or the address supplied was not
516 		 * available.
517 		 */
518 		map_addr(addrp, len, off, vacalign, flags);
519 	}
520 	if (*addrp == NULL)
521 		return (ENOMEM);
522 	return (0);
523 }
524 
525 
526 /*
527  * Used for MAP_ANON - fast way to get anonymous pages
528  */
529 static int
530 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
531     offset_t pos)
532 {
533 	struct segvn_crargs vn_a;
534 	int error;
535 
536 	if (((PROT_ALL & uprot) != uprot))
537 		return (EACCES);
538 
539 	if ((flags & MAP_FIXED) != 0) {
540 		caddr_t userlimit;
541 
542 		/*
543 		 * Use the user address.  First verify that
544 		 * the address to be used is page aligned.
545 		 * Then make some simple bounds checks.
546 		 */
547 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
548 			return (EINVAL);
549 
550 		userlimit = flags & _MAP_LOW32 ?
551 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
552 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
553 		case RANGE_OKAY:
554 			break;
555 		case RANGE_BADPROT:
556 			return (ENOTSUP);
557 		case RANGE_BADADDR:
558 		default:
559 			return (ENOMEM);
560 		}
561 	}
562 	/*
563 	 * No need to worry about vac alignment for anonymous
564 	 * pages since this is a "clone" object that doesn't
565 	 * yet exist.
566 	 */
567 	error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
568 	if (error != 0) {
569 		return (error);
570 	}
571 
572 	/*
573 	 * Use the seg_vn segment driver; passing in the NULL amp
574 	 * gives the desired "cloning" effect.
575 	 */
576 	vn_a.vp = NULL;
577 	vn_a.offset = 0;
578 	vn_a.type = flags & MAP_TYPE;
579 	vn_a.prot = uprot;
580 	vn_a.maxprot = PROT_ALL;
581 	vn_a.flags = flags & ~MAP_TYPE;
582 	vn_a.cred = CRED();
583 	vn_a.amp = NULL;
584 	vn_a.szc = 0;
585 	vn_a.lgrp_mem_policy_flags = 0;
586 
587 	return (as_map(as, *addrp, len, segvn_create, &vn_a));
588 }
589 
590 static int
591 smmap_common(caddr_t *addrp, size_t len,
592     int prot, int flags, struct file *fp, offset_t pos)
593 {
594 	struct vnode *vp;
595 	struct as *as = curproc->p_as;
596 	uint_t uprot, maxprot, type;
597 	int error;
598 	int in_crit = 0;
599 
600 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
601 	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
602 	    MAP_TEXT | MAP_INITDATA)) != 0) {
603 		/* | MAP_RENAME */	/* not implemented, let user know */
604 		return (EINVAL);
605 	}
606 
607 	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
608 		return (EINVAL);
609 	}
610 
611 	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
612 		return (EINVAL);
613 	}
614 
615 #if defined(__sparc)
616 	/*
617 	 * See if this is an "old mmap call".  If so, remember this
618 	 * fact and convert the flags value given to mmap to indicate
619 	 * the specified address in the system call must be used.
620 	 * _MAP_NEW is turned set by all new uses of mmap.
621 	 */
622 	if ((flags & _MAP_NEW) == 0)
623 		flags |= MAP_FIXED;
624 #endif
625 	flags &= ~_MAP_NEW;
626 
627 	type = flags & MAP_TYPE;
628 	if (type != MAP_PRIVATE && type != MAP_SHARED)
629 		return (EINVAL);
630 
631 
632 	if (flags & MAP_ALIGN) {
633 
634 		if (flags & MAP_FIXED)
635 			return (EINVAL);
636 
637 		/* alignment needs to be a power of 2 >= page size */
638 		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
639 		    !ISP2((uintptr_t)*addrp))
640 			return (EINVAL);
641 	}
642 	/*
643 	 * Check for bad lengths and file position.
644 	 * We let the VOP_MAP routine check for negative lengths
645 	 * since on some vnode types this might be appropriate.
646 	 */
647 	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
648 		return (EINVAL);
649 
650 	maxprot = PROT_ALL;		/* start out allowing all accesses */
651 	uprot = prot | PROT_USER;
652 
653 	if (fp == NULL) {
654 		ASSERT(flags & MAP_ANON);
655 		as_rangelock(as);
656 		error = zmap(as, addrp, len, uprot, flags, pos);
657 		as_rangeunlock(as);
658 		return (error);
659 	} else if ((flags & MAP_ANON) != 0)
660 		return (EINVAL);
661 
662 	vp = fp->f_vnode;
663 
664 	/* Can't execute code from "noexec" mounted filesystem. */
665 	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
666 		maxprot &= ~PROT_EXEC;
667 
668 	/*
669 	 * These checks were added as part of large files.
670 	 *
671 	 * Return ENXIO if the initial position is negative; return EOVERFLOW
672 	 * if (offset + len) would overflow the maximum allowed offset for the
673 	 * type of file descriptor being used.
674 	 */
675 	if (vp->v_type == VREG) {
676 		if (pos < 0)
677 			return (ENXIO);
678 		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
679 			return (EOVERFLOW);
680 	}
681 
682 	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
683 		/* no write access allowed */
684 		maxprot &= ~PROT_WRITE;
685 	}
686 
687 	/*
688 	 * XXX - Do we also adjust maxprot based on protections
689 	 * of the vnode?  E.g. if no execute permission is given
690 	 * on the vnode for the current user, maxprot probably
691 	 * should disallow PROT_EXEC also?  This is different
692 	 * from the write access as this would be a per vnode
693 	 * test as opposed to a per fd test for writability.
694 	 */
695 
696 	/*
697 	 * Verify that the specified protections are not greater than
698 	 * the maximum allowable protections.  Also test to make sure
699 	 * that the file descriptor does allows for read access since
700 	 * "write only" mappings are hard to do since normally we do
701 	 * the read from the file before the page can be written.
702 	 */
703 	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
704 		return (EACCES);
705 
706 	/*
707 	 * If the user specified an address, do some simple checks here
708 	 */
709 	if ((flags & MAP_FIXED) != 0) {
710 		caddr_t userlimit;
711 
712 		/*
713 		 * Use the user address.  First verify that
714 		 * the address to be used is page aligned.
715 		 * Then make some simple bounds checks.
716 		 */
717 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
718 			return (EINVAL);
719 
720 		userlimit = flags & _MAP_LOW32 ?
721 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
722 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
723 		case RANGE_OKAY:
724 			break;
725 		case RANGE_BADPROT:
726 			return (ENOTSUP);
727 		case RANGE_BADADDR:
728 		default:
729 			return (ENOMEM);
730 		}
731 	}
732 
733 	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
734 	    nbl_need_check(vp)) {
735 		int svmand;
736 		nbl_op_t nop;
737 
738 		nbl_start_crit(vp, RW_READER);
739 		in_crit = 1;
740 		error = nbl_svmand(vp, fp->f_cred, &svmand);
741 		if (error != 0)
742 			goto done;
743 		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
744 			if (prot & (PROT_READ | PROT_EXEC)) {
745 				nop = NBL_READWRITE;
746 			} else {
747 				nop = NBL_WRITE;
748 			}
749 		} else {
750 			nop = NBL_READ;
751 		}
752 		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
753 			error = EACCES;
754 			goto done;
755 		}
756 	}
757 
758 	/*
759 	 * Ok, now let the vnode map routine do its thing to set things up.
760 	 */
761 	error = VOP_MAP(vp, pos, as,
762 	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
763 
764 	if (error == 0) {
765 		if (vp->v_type == VREG &&
766 		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
767 			/*
768 			 * Mark this as an executable vnode
769 			 */
770 			mutex_enter(&vp->v_lock);
771 			vp->v_flag |= VVMEXEC;
772 			mutex_exit(&vp->v_lock);
773 		}
774 	}
775 
776 done:
777 	if (in_crit)
778 		nbl_end_crit(vp);
779 	return (error);
780 }
781 
782 #ifdef _LP64
783 /*
784  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
785  *
786  * The "large file" mmap routine mmap64(2) is also mapped to this routine
787  * by the 64-bit version of libc.
788  *
789  * Eventually, this should be the only version, and have smmap_common()
790  * folded back into it again.  Some day.
791  */
792 caddr_t
793 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
794 {
795 	struct file *fp;
796 	int error;
797 
798 	if (flags & _MAP_LOW32)
799 		error = EINVAL;
800 	else if (fd == -1 && (flags & MAP_ANON) != 0)
801 		error = smmap_common(&addr, len, prot, flags,
802 		    NULL, (offset_t)pos);
803 	else if ((fp = getf(fd)) != NULL) {
804 		error = smmap_common(&addr, len, prot, flags,
805 		    fp, (offset_t)pos);
806 		releasef(fd);
807 	} else
808 		error = EBADF;
809 
810 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
811 }
812 #endif	/* _LP64 */
813 
814 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
815 
816 /*
817  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
818  */
819 caddr_t
820 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
821 {
822 	struct file *fp;
823 	int error;
824 	caddr_t a = (caddr_t)(uintptr_t)addr;
825 
826 	if (flags & _MAP_LOW32)
827 		error = EINVAL;
828 	else if (fd == -1 && (flags & MAP_ANON) != 0)
829 		error = smmap_common(&a, (size_t)len, prot,
830 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
831 	else if ((fp = getf(fd)) != NULL) {
832 		error = smmap_common(&a, (size_t)len, prot,
833 		    flags | _MAP_LOW32, fp, (offset_t)pos);
834 		releasef(fd);
835 	} else
836 		error = EBADF;
837 
838 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
839 
840 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
841 }
842 
843 /*
844  * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
845  *
846  * Now things really get ugly because we can't use the C-style
847  * calling convention for more than 6 args, and 64-bit parameter
848  * passing on 32-bit systems is less than clean.
849  */
850 
851 struct mmaplf32a {
852 	caddr_t addr;
853 	size_t len;
854 #ifdef _LP64
855 	/*
856 	 * 32-bit contents, 64-bit cells
857 	 */
858 	uint64_t prot;
859 	uint64_t flags;
860 	uint64_t fd;
861 	uint64_t offhi;
862 	uint64_t offlo;
863 #else
864 	/*
865 	 * 32-bit contents, 32-bit cells
866 	 */
867 	uint32_t prot;
868 	uint32_t flags;
869 	uint32_t fd;
870 	uint32_t offhi;
871 	uint32_t offlo;
872 #endif
873 };
874 
875 int
876 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
877 {
878 	struct file *fp;
879 	int error;
880 	caddr_t a = uap->addr;
881 	int flags = (int)uap->flags;
882 	int fd = (int)uap->fd;
883 #ifdef _BIG_ENDIAN
884 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
885 #else
886 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
887 #endif
888 
889 	if (flags & _MAP_LOW32)
890 		error = EINVAL;
891 	else if (fd == -1 && (flags & MAP_ANON) != 0)
892 		error = smmap_common(&a, uap->len, (int)uap->prot,
893 		    flags | _MAP_LOW32, NULL, off);
894 	else if ((fp = getf(fd)) != NULL) {
895 		error = smmap_common(&a, uap->len, (int)uap->prot,
896 		    flags | _MAP_LOW32, fp, off);
897 		releasef(fd);
898 	} else
899 		error = EBADF;
900 
901 	if (error == 0)
902 		rvp->r_val1 = (uintptr_t)a;
903 	return (error);
904 }
905 
906 #endif	/* _SYSCALL32_IMPL || _ILP32 */
907 
908 int
909 munmap(caddr_t addr, size_t len)
910 {
911 	struct proc *p = curproc;
912 	struct as *as = p->p_as;
913 
914 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
915 		return (set_errno(EINVAL));
916 
917 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
918 		return (set_errno(EINVAL));
919 
920 	/*
921 	 * Discard lwpchan mappings.
922 	 */
923 	if (p->p_lcp != NULL)
924 		lwpchan_delete_mapping(p, addr, addr + len);
925 	if (as_unmap(as, addr, len) != 0)
926 		return (set_errno(EINVAL));
927 
928 	return (0);
929 }
930 
931 int
932 mprotect(caddr_t addr, size_t len, int prot)
933 {
934 	struct as *as = curproc->p_as;
935 	uint_t uprot = prot | PROT_USER;
936 	int error;
937 
938 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
939 		return (set_errno(EINVAL));
940 
941 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
942 	case RANGE_OKAY:
943 		break;
944 	case RANGE_BADPROT:
945 		return (set_errno(ENOTSUP));
946 	case RANGE_BADADDR:
947 	default:
948 		return (set_errno(ENOMEM));
949 	}
950 
951 	error = as_setprot(as, addr, len, uprot);
952 	if (error)
953 		return (set_errno(error));
954 	return (0);
955 }
956 
957 #define	MC_CACHE	128			/* internal result buffer */
958 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
959 
960 int
961 mincore(caddr_t addr, size_t len, char *vecp)
962 {
963 	struct as *as = curproc->p_as;
964 	caddr_t ea;			/* end address of loop */
965 	size_t rl;			/* inner result length */
966 	char vec[MC_CACHE];		/* local vector cache */
967 	int error;
968 	model_t model;
969 	long	llen;
970 
971 	model = get_udatamodel();
972 	/*
973 	 * Validate form of address parameters.
974 	 */
975 	if (model == DATAMODEL_NATIVE) {
976 		llen = (long)len;
977 	} else {
978 		llen = (int32_t)(size32_t)len;
979 	}
980 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
981 		return (set_errno(EINVAL));
982 
983 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
984 		return (set_errno(ENOMEM));
985 
986 	/*
987 	 * Loop over subranges of interval [addr : addr + len), recovering
988 	 * results internally and then copying them out to caller.  Subrange
989 	 * is based on the size of MC_CACHE, defined above.
990 	 */
991 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
992 		error = as_incore(as, addr,
993 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
994 		if (rl != 0) {
995 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
996 			if (copyout(vec, vecp, rl) != 0)
997 				return (set_errno(EFAULT));
998 			vecp += rl;
999 		}
1000 		if (error != 0)
1001 			return (set_errno(ENOMEM));
1002 	}
1003 	return (0);
1004 }
1005