xref: /titanic_44/usr/src/uts/common/os/grow.c (revision 73a0bd151c1115bf39cc2caa30c7cbfdd86361c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 #include <sys/types.h>
31 #include <sys/inttypes.h>
32 #include <sys/param.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/signal.h>
36 #include <sys/user.h>
37 #include <sys/errno.h>
38 #include <sys/var.h>
39 #include <sys/proc.h>
40 #include <sys/tuneable.h>
41 #include <sys/debug.h>
42 #include <sys/cmn_err.h>
43 #include <sys/cred.h>
44 #include <sys/vnode.h>
45 #include <sys/vfs.h>
46 #include <sys/vm.h>
47 #include <sys/file.h>
48 #include <sys/mman.h>
49 #include <sys/vmparam.h>
50 #include <sys/fcntl.h>
51 #include <sys/lwpchan_impl.h>
52 #include <sys/nbmlock.h>
53 
54 #include <vm/hat.h>
55 #include <vm/as.h>
56 #include <vm/seg.h>
57 #include <vm/seg_dev.h>
58 #include <vm/seg_vn.h>
59 
60 int use_brk_lpg = 1;
61 int use_stk_lpg = 1;
62 
63 static int brk_lpg(caddr_t nva);
64 static int grow_lpg(caddr_t sp);
65 
66 int
67 brk(caddr_t nva)
68 {
69 	int error;
70 	proc_t *p = curproc;
71 
72 	/*
73 	 * Serialize brk operations on an address space.
74 	 * This also serves as the lock protecting p_brksize
75 	 * and p_brkpageszc.
76 	 */
77 	as_rangelock(p->p_as);
78 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
79 		error = brk_lpg(nva);
80 	} else {
81 		error = brk_internal(nva, p->p_brkpageszc);
82 	}
83 	as_rangeunlock(p->p_as);
84 	return ((error != 0 ? set_errno(error) : 0));
85 }
86 
87 /*
88  * Algorithm: call arch-specific map_pgsz to get best page size to use,
89  * then call brk_internal().
90  * Returns 0 on success.
91  */
92 static int
93 brk_lpg(caddr_t nva)
94 {
95 	struct proc *p = curproc;
96 	size_t pgsz, len;
97 	caddr_t addr, brkend;
98 	caddr_t bssbase = p->p_bssbase;
99 	caddr_t brkbase = p->p_brkbase;
100 	int oszc, szc;
101 	int err;
102 
103 	oszc = p->p_brkpageszc;
104 
105 	/*
106 	 * If p_brkbase has not yet been set, the first call
107 	 * to brk_internal() will initialize it.
108 	 */
109 	if (brkbase == 0) {
110 		return (brk_internal(nva, oszc));
111 	}
112 
113 	len = nva - bssbase;
114 
115 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
116 	szc = page_szc(pgsz);
117 
118 	/*
119 	 * Covers two cases:
120 	 * 1. page_szc() returns -1 for invalid page size, so we want to
121 	 * ignore it in that case.
122 	 * 2. By design we never decrease page size, as it is more stable.
123 	 */
124 	if (szc <= oszc) {
125 		err = brk_internal(nva, oszc);
126 		/* If failed, back off to base page size. */
127 		if (err != 0 && oszc != 0) {
128 			err = brk_internal(nva, 0);
129 		}
130 		return (err);
131 	}
132 
133 	err = brk_internal(nva, szc);
134 	/* If using szc failed, map with base page size and return. */
135 	if (err != 0) {
136 		if (szc != 0) {
137 			err = brk_internal(nva, 0);
138 		}
139 		return (err);
140 	}
141 
142 	/*
143 	 * Round up brk base to a large page boundary and remap
144 	 * anything in the segment already faulted in beyond that
145 	 * point.
146 	 */
147 	addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
148 	brkend = brkbase + p->p_brksize;
149 	len = brkend - addr;
150 	/* Check that len is not negative. Update page size code for heap. */
151 	if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
152 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
153 		p->p_brkpageszc = szc;
154 	}
155 
156 	ASSERT(err == 0);
157 	return (err);		/* should always be 0 */
158 }
159 
160 /*
161  * Returns 0 on success.
162  */
163 int
164 brk_internal(caddr_t nva, uint_t brkszc)
165 {
166 	caddr_t ova;			/* current break address */
167 	size_t size;
168 	int	error;
169 	struct proc *p = curproc;
170 	struct as *as = p->p_as;
171 	size_t pgsz;
172 	uint_t szc;
173 	rctl_qty_t as_rctl;
174 
175 	/*
176 	 * extend heap to brkszc alignment but use current p->p_brkpageszc
177 	 * for the newly created segment. This allows the new extension
178 	 * segment to be concatenated successfully with the existing brk
179 	 * segment.
180 	 */
181 	if ((szc = brkszc) != 0) {
182 		pgsz = page_get_pagesize(szc);
183 		ASSERT(pgsz > PAGESIZE);
184 	} else {
185 		pgsz = PAGESIZE;
186 	}
187 
188 	mutex_enter(&p->p_lock);
189 	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
190 	    p->p_rctls, p);
191 	mutex_exit(&p->p_lock);
192 
193 	/*
194 	 * If p_brkbase has not yet been set, the first call
195 	 * to brk() will initialize it.
196 	 */
197 	if (p->p_brkbase == 0)
198 		p->p_brkbase = nva;
199 
200 	/*
201 	 * Before multiple page size support existed p_brksize was the value
202 	 * not rounded to the pagesize (i.e. it stored the exact user request
203 	 * for heap size). If pgsz is greater than PAGESIZE calculate the
204 	 * heap size as the real new heap size by rounding it up to pgsz.
205 	 * This is useful since we may want to know where the heap ends
206 	 * without knowing heap pagesize (e.g. some old code) and also if
207 	 * heap pagesize changes we can update p_brkpageszc but delay adding
208 	 * new mapping yet still know from p_brksize where the heap really
209 	 * ends. The user requested heap end is stored in libc variable.
210 	 */
211 	if (pgsz > PAGESIZE) {
212 		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
213 		size = tnva - p->p_brkbase;
214 		if (tnva < p->p_brkbase || (size > p->p_brksize &&
215 		    size > (size_t)as_rctl)) {
216 			szc = 0;
217 			pgsz = PAGESIZE;
218 			size = nva - p->p_brkbase;
219 		}
220 	} else {
221 		size = nva - p->p_brkbase;
222 	}
223 
224 	/*
225 	 * use PAGESIZE to roundup ova because we want to know the real value
226 	 * of the current heap end in case p_brkpageszc changes since the last
227 	 * p_brksize was computed.
228 	 */
229 	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
230 	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
231 	    PAGESIZE);
232 
233 	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
234 	    size > as_rctl)) {
235 		mutex_enter(&p->p_lock);
236 		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
237 		    RCA_SAFE);
238 		mutex_exit(&p->p_lock);
239 		return (ENOMEM);
240 	}
241 
242 	if (nva > ova) {
243 		struct segvn_crargs crargs =
244 		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
245 
246 		if (!(p->p_datprot & PROT_EXEC)) {
247 			crargs.prot &= ~PROT_EXEC;
248 		}
249 
250 		/*
251 		 * Add new zfod mapping to extend UNIX data segment
252 		 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
253 		 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
254 		 * page sizes if ova is not aligned to szc's pgsz.
255 		 */
256 		if (szc > 0) {
257 			caddr_t rbss;
258 
259 			rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
260 			    pgsz);
261 			if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
262 				crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
263 				    AS_MAP_NO_LPOOB;
264 			} else if (ova == rbss) {
265 				crargs.szc = szc;
266 			} else {
267 				crargs.szc = AS_MAP_HEAP;
268 			}
269 		} else {
270 			crargs.szc = AS_MAP_NO_LPOOB;
271 		}
272 		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
273 		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
274 		    &crargs);
275 		if (error) {
276 			return (error);
277 		}
278 
279 	} else if (nva < ova) {
280 		/*
281 		 * Release mapping to shrink UNIX data segment.
282 		 */
283 		(void) as_unmap(as, nva, (size_t)(ova - nva));
284 	}
285 	p->p_brksize = size;
286 	return (0);
287 }
288 
289 /*
290  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
291  * This routine assumes that the stack grows downward.
292  */
293 int
294 grow(caddr_t sp)
295 {
296 	struct proc *p = curproc;
297 	struct as *as = p->p_as;
298 	size_t oldsize = p->p_stksize;
299 	size_t newsize;
300 	int err;
301 
302 	/*
303 	 * Serialize grow operations on an address space.
304 	 * This also serves as the lock protecting p_stksize
305 	 * and p_stkpageszc.
306 	 */
307 	as_rangelock(as);
308 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
309 		err = grow_lpg(sp);
310 	} else {
311 		err = grow_internal(sp, p->p_stkpageszc);
312 	}
313 	as_rangeunlock(as);
314 
315 	if (err == 0 && (newsize = p->p_stksize) > oldsize) {
316 		ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
317 		ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
318 		/*
319 		 * Set up translations so the process doesn't have to fault in
320 		 * the stack pages we just gave it.
321 		 */
322 		(void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
323 		    newsize - oldsize, F_INVAL, S_WRITE);
324 	}
325 	return ((err == 0 ? 1 : 0));
326 }
327 
328 /*
329  * Algorithm: call arch-specific map_pgsz to get best page size to use,
330  * then call grow_internal().
331  * Returns 0 on success.
332  */
333 static int
334 grow_lpg(caddr_t sp)
335 {
336 	struct proc *p = curproc;
337 	size_t pgsz;
338 	size_t len, newsize;
339 	caddr_t addr, saddr;
340 	caddr_t growend;
341 	int oszc, szc;
342 	int err;
343 
344 	newsize = p->p_usrstack - sp;
345 
346 	oszc = p->p_stkpageszc;
347 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
348 	szc = page_szc(pgsz);
349 
350 	/*
351 	 * Covers two cases:
352 	 * 1. page_szc() returns -1 for invalid page size, so we want to
353 	 * ignore it in that case.
354 	 * 2. By design we never decrease page size, as it is more stable.
355 	 * This shouldn't happen as the stack never shrinks.
356 	 */
357 	if (szc <= oszc) {
358 		err = grow_internal(sp, oszc);
359 		/* failed, fall back to base page size */
360 		if (err != 0 && oszc != 0) {
361 			err = grow_internal(sp, 0);
362 		}
363 		return (err);
364 	}
365 
366 	/*
367 	 * We've grown sufficiently to switch to a new page size.
368 	 * So we are going to remap the whole segment with the new page size.
369 	 */
370 	err = grow_internal(sp, szc);
371 	/* The grow with szc failed, so fall back to base page size. */
372 	if (err != 0) {
373 		if (szc != 0) {
374 			err = grow_internal(sp, 0);
375 		}
376 		return (err);
377 	}
378 
379 	/*
380 	 * Round up stack pointer to a large page boundary and remap
381 	 * any pgsz pages in the segment already faulted in beyond that
382 	 * point.
383 	 */
384 	saddr = p->p_usrstack - p->p_stksize;
385 	addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
386 	growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
387 	len = growend - addr;
388 	/* Check that len is not negative. Update page size code for stack. */
389 	if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
390 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
391 		p->p_stkpageszc = szc;
392 	}
393 
394 	ASSERT(err == 0);
395 	return (err);		/* should always be 0 */
396 }
397 
398 /*
399  * This routine assumes that the stack grows downward.
400  * Returns 0 on success, errno on failure.
401  */
402 int
403 grow_internal(caddr_t sp, uint_t growszc)
404 {
405 	struct proc *p = curproc;
406 	size_t newsize;
407 	size_t oldsize;
408 	int    error;
409 	size_t pgsz;
410 	uint_t szc;
411 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
412 
413 	ASSERT(sp < p->p_usrstack);
414 	sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
415 
416 	/*
417 	 * grow to growszc alignment but use current p->p_stkpageszc for
418 	 * the segvn_crargs szc passed to segvn_create. For memcntl to
419 	 * increase the szc, this allows the new extension segment to be
420 	 * concatenated successfully with the existing stack segment.
421 	 */
422 	if ((szc = growszc) != 0) {
423 		pgsz = page_get_pagesize(szc);
424 		ASSERT(pgsz > PAGESIZE);
425 		newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
426 		if (newsize > (size_t)p->p_stk_ctl) {
427 			szc = 0;
428 			pgsz = PAGESIZE;
429 			newsize = p->p_usrstack - sp;
430 		}
431 	} else {
432 		pgsz = PAGESIZE;
433 		newsize = p->p_usrstack - sp;
434 	}
435 
436 	if (newsize > (size_t)p->p_stk_ctl) {
437 		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
438 		    RCA_UNSAFE_ALL);
439 
440 		return (ENOMEM);
441 	}
442 
443 	oldsize = p->p_stksize;
444 	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
445 
446 	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
447 		return (0);
448 	}
449 
450 	if (!(p->p_stkprot & PROT_EXEC)) {
451 		crargs.prot &= ~PROT_EXEC;
452 	}
453 	/*
454 	 * extend stack with the proposed new growszc, which is different
455 	 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
456 	 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
457 	 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
458 	 * if not aligned to szc's pgsz.
459 	 */
460 	if (szc > 0) {
461 		caddr_t oldsp = p->p_usrstack - oldsize;
462 		caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
463 		    pgsz);
464 
465 		if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
466 			crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
467 			    AS_MAP_NO_LPOOB;
468 		} else if (oldsp == austk) {
469 			crargs.szc = szc;
470 		} else {
471 			crargs.szc = AS_MAP_STACK;
472 		}
473 	} else {
474 		crargs.szc = AS_MAP_NO_LPOOB;
475 	}
476 	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
477 
478 	if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
479 	    segvn_create, &crargs)) != 0) {
480 		if (error == EAGAIN) {
481 			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
482 			    "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
483 		}
484 		return (error);
485 	}
486 	p->p_stksize = newsize;
487 	return (0);
488 }
489 
490 /*
491  * Find address for user to map.
492  * If MAP_FIXED is not specified, we can pick any address we want, but we will
493  * first try the value in *addrp if it is non-NULL.  Thus this is implementing
494  * a way to try and get a preferred address.
495  */
496 int
497 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
498     int vacalign, uint_t flags)
499 {
500 	caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
501 	size_t lenp = len;
502 
503 	ASSERT(AS_ISCLAIMGAP(as));	/* searches should be serialized */
504 	if (flags & MAP_FIXED) {
505 		(void) as_unmap(as, *addrp, len);
506 		return (0);
507 	} else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
508 	    !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
509 		/* User supplied address was available */
510 		*addrp = basep;
511 	} else {
512 		/*
513 		 * No user supplied address or the address supplied was not
514 		 * available.
515 		 */
516 		map_addr(addrp, len, off, vacalign, flags);
517 	}
518 	if (*addrp == NULL)
519 		return (ENOMEM);
520 	return (0);
521 }
522 
523 
524 /*
525  * Used for MAP_ANON - fast way to get anonymous pages
526  */
527 static int
528 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
529     offset_t pos)
530 {
531 	struct segvn_crargs vn_a;
532 	int error;
533 
534 	if (((PROT_ALL & uprot) != uprot))
535 		return (EACCES);
536 
537 	if ((flags & MAP_FIXED) != 0) {
538 		caddr_t userlimit;
539 
540 		/*
541 		 * Use the user address.  First verify that
542 		 * the address to be used is page aligned.
543 		 * Then make some simple bounds checks.
544 		 */
545 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
546 			return (EINVAL);
547 
548 		userlimit = flags & _MAP_LOW32 ?
549 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
550 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
551 		case RANGE_OKAY:
552 			break;
553 		case RANGE_BADPROT:
554 			return (ENOTSUP);
555 		case RANGE_BADADDR:
556 		default:
557 			return (ENOMEM);
558 		}
559 	}
560 	/*
561 	 * No need to worry about vac alignment for anonymous
562 	 * pages since this is a "clone" object that doesn't
563 	 * yet exist.
564 	 */
565 	error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
566 	if (error != 0) {
567 		return (error);
568 	}
569 
570 	/*
571 	 * Use the seg_vn segment driver; passing in the NULL amp
572 	 * gives the desired "cloning" effect.
573 	 */
574 	vn_a.vp = NULL;
575 	vn_a.offset = 0;
576 	vn_a.type = flags & MAP_TYPE;
577 	vn_a.prot = uprot;
578 	vn_a.maxprot = PROT_ALL;
579 	vn_a.flags = flags & ~MAP_TYPE;
580 	vn_a.cred = CRED();
581 	vn_a.amp = NULL;
582 	vn_a.szc = 0;
583 	vn_a.lgrp_mem_policy_flags = 0;
584 
585 	return (as_map(as, *addrp, len, segvn_create, &vn_a));
586 }
587 
588 static int
589 smmap_common(caddr_t *addrp, size_t len,
590     int prot, int flags, struct file *fp, offset_t pos)
591 {
592 	struct vnode *vp;
593 	struct as *as = curproc->p_as;
594 	uint_t uprot, maxprot, type;
595 	int error;
596 	int in_crit = 0;
597 
598 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
599 	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
600 	    MAP_TEXT | MAP_INITDATA)) != 0) {
601 		/* | MAP_RENAME */	/* not implemented, let user know */
602 		return (EINVAL);
603 	}
604 
605 	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
606 		return (EINVAL);
607 	}
608 
609 	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
610 		return (EINVAL);
611 	}
612 
613 #if defined(__sparc)
614 	/*
615 	 * See if this is an "old mmap call".  If so, remember this
616 	 * fact and convert the flags value given to mmap to indicate
617 	 * the specified address in the system call must be used.
618 	 * _MAP_NEW is turned set by all new uses of mmap.
619 	 */
620 	if ((flags & _MAP_NEW) == 0)
621 		flags |= MAP_FIXED;
622 #endif
623 	flags &= ~_MAP_NEW;
624 
625 	type = flags & MAP_TYPE;
626 	if (type != MAP_PRIVATE && type != MAP_SHARED)
627 		return (EINVAL);
628 
629 
630 	if (flags & MAP_ALIGN) {
631 
632 		if (flags & MAP_FIXED)
633 			return (EINVAL);
634 
635 		/* alignment needs to be a power of 2 >= page size */
636 		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
637 		    !ISP2((uintptr_t)*addrp))
638 			return (EINVAL);
639 	}
640 	/*
641 	 * Check for bad lengths and file position.
642 	 * We let the VOP_MAP routine check for negative lengths
643 	 * since on some vnode types this might be appropriate.
644 	 */
645 	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
646 		return (EINVAL);
647 
648 	maxprot = PROT_ALL;		/* start out allowing all accesses */
649 	uprot = prot | PROT_USER;
650 
651 	if (fp == NULL) {
652 		ASSERT(flags & MAP_ANON);
653 		/* discard lwpchan mappings, like munmap() */
654 		if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
655 			lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
656 		as_rangelock(as);
657 		error = zmap(as, addrp, len, uprot, flags, pos);
658 		as_rangeunlock(as);
659 		return (error);
660 	} else if ((flags & MAP_ANON) != 0)
661 		return (EINVAL);
662 
663 	vp = fp->f_vnode;
664 
665 	/* Can't execute code from "noexec" mounted filesystem. */
666 	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
667 		maxprot &= ~PROT_EXEC;
668 
669 	/*
670 	 * These checks were added as part of large files.
671 	 *
672 	 * Return ENXIO if the initial position is negative; return EOVERFLOW
673 	 * if (offset + len) would overflow the maximum allowed offset for the
674 	 * type of file descriptor being used.
675 	 */
676 	if (vp->v_type == VREG) {
677 		if (pos < 0)
678 			return (ENXIO);
679 		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
680 			return (EOVERFLOW);
681 	}
682 
683 	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
684 		/* no write access allowed */
685 		maxprot &= ~PROT_WRITE;
686 	}
687 
688 	/*
689 	 * XXX - Do we also adjust maxprot based on protections
690 	 * of the vnode?  E.g. if no execute permission is given
691 	 * on the vnode for the current user, maxprot probably
692 	 * should disallow PROT_EXEC also?  This is different
693 	 * from the write access as this would be a per vnode
694 	 * test as opposed to a per fd test for writability.
695 	 */
696 
697 	/*
698 	 * Verify that the specified protections are not greater than
699 	 * the maximum allowable protections.  Also test to make sure
700 	 * that the file descriptor does allows for read access since
701 	 * "write only" mappings are hard to do since normally we do
702 	 * the read from the file before the page can be written.
703 	 */
704 	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
705 		return (EACCES);
706 
707 	/*
708 	 * If the user specified an address, do some simple checks here
709 	 */
710 	if ((flags & MAP_FIXED) != 0) {
711 		caddr_t userlimit;
712 
713 		/*
714 		 * Use the user address.  First verify that
715 		 * the address to be used is page aligned.
716 		 * Then make some simple bounds checks.
717 		 */
718 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
719 			return (EINVAL);
720 
721 		userlimit = flags & _MAP_LOW32 ?
722 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
723 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
724 		case RANGE_OKAY:
725 			break;
726 		case RANGE_BADPROT:
727 			return (ENOTSUP);
728 		case RANGE_BADADDR:
729 		default:
730 			return (ENOMEM);
731 		}
732 	}
733 
734 	if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
735 	    nbl_need_check(vp)) {
736 		int svmand;
737 		nbl_op_t nop;
738 
739 		nbl_start_crit(vp, RW_READER);
740 		in_crit = 1;
741 		error = nbl_svmand(vp, fp->f_cred, &svmand);
742 		if (error != 0)
743 			goto done;
744 		if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
745 			if (prot & (PROT_READ | PROT_EXEC)) {
746 				nop = NBL_READWRITE;
747 			} else {
748 				nop = NBL_WRITE;
749 			}
750 		} else {
751 			nop = NBL_READ;
752 		}
753 		if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
754 			error = EACCES;
755 			goto done;
756 		}
757 	}
758 
759 	/* discard lwpchan mappings, like munmap() */
760 	if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
761 		lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
762 
763 	/*
764 	 * Ok, now let the vnode map routine do its thing to set things up.
765 	 */
766 	error = VOP_MAP(vp, pos, as,
767 	    addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
768 
769 	if (error == 0) {
770 		if (vp->v_type == VREG &&
771 		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
772 			/*
773 			 * Mark this as an executable vnode
774 			 */
775 			mutex_enter(&vp->v_lock);
776 			vp->v_flag |= VVMEXEC;
777 			mutex_exit(&vp->v_lock);
778 		}
779 	}
780 
781 done:
782 	if (in_crit)
783 		nbl_end_crit(vp);
784 	return (error);
785 }
786 
787 #ifdef _LP64
788 /*
789  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
790  *
791  * The "large file" mmap routine mmap64(2) is also mapped to this routine
792  * by the 64-bit version of libc.
793  *
794  * Eventually, this should be the only version, and have smmap_common()
795  * folded back into it again.  Some day.
796  */
797 caddr_t
798 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
799 {
800 	struct file *fp;
801 	int error;
802 
803 	if (flags & _MAP_LOW32)
804 		error = EINVAL;
805 	else if (fd == -1 && (flags & MAP_ANON) != 0)
806 		error = smmap_common(&addr, len, prot, flags,
807 		    NULL, (offset_t)pos);
808 	else if ((fp = getf(fd)) != NULL) {
809 		error = smmap_common(&addr, len, prot, flags,
810 		    fp, (offset_t)pos);
811 		releasef(fd);
812 	} else
813 		error = EBADF;
814 
815 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
816 }
817 #endif	/* _LP64 */
818 
819 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
820 
821 /*
822  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
823  */
824 caddr_t
825 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
826 {
827 	struct file *fp;
828 	int error;
829 	caddr_t a = (caddr_t)(uintptr_t)addr;
830 
831 	if (flags & _MAP_LOW32)
832 		error = EINVAL;
833 	else if (fd == -1 && (flags & MAP_ANON) != 0)
834 		error = smmap_common(&a, (size_t)len, prot,
835 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
836 	else if ((fp = getf(fd)) != NULL) {
837 		error = smmap_common(&a, (size_t)len, prot,
838 		    flags | _MAP_LOW32, fp, (offset_t)pos);
839 		releasef(fd);
840 	} else
841 		error = EBADF;
842 
843 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
844 
845 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
846 }
847 
848 /*
849  * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
850  *
851  * Now things really get ugly because we can't use the C-style
852  * calling convention for more than 6 args, and 64-bit parameter
853  * passing on 32-bit systems is less than clean.
854  */
855 
856 struct mmaplf32a {
857 	caddr_t addr;
858 	size_t len;
859 #ifdef _LP64
860 	/*
861 	 * 32-bit contents, 64-bit cells
862 	 */
863 	uint64_t prot;
864 	uint64_t flags;
865 	uint64_t fd;
866 	uint64_t offhi;
867 	uint64_t offlo;
868 #else
869 	/*
870 	 * 32-bit contents, 32-bit cells
871 	 */
872 	uint32_t prot;
873 	uint32_t flags;
874 	uint32_t fd;
875 	uint32_t offhi;
876 	uint32_t offlo;
877 #endif
878 };
879 
880 int
881 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
882 {
883 	struct file *fp;
884 	int error;
885 	caddr_t a = uap->addr;
886 	int flags = (int)uap->flags;
887 	int fd = (int)uap->fd;
888 #ifdef _BIG_ENDIAN
889 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
890 #else
891 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
892 #endif
893 
894 	if (flags & _MAP_LOW32)
895 		error = EINVAL;
896 	else if (fd == -1 && (flags & MAP_ANON) != 0)
897 		error = smmap_common(&a, uap->len, (int)uap->prot,
898 		    flags | _MAP_LOW32, NULL, off);
899 	else if ((fp = getf(fd)) != NULL) {
900 		error = smmap_common(&a, uap->len, (int)uap->prot,
901 		    flags | _MAP_LOW32, fp, off);
902 		releasef(fd);
903 	} else
904 		error = EBADF;
905 
906 	if (error == 0)
907 		rvp->r_val1 = (uintptr_t)a;
908 	return (error);
909 }
910 
911 #endif	/* _SYSCALL32_IMPL || _ILP32 */
912 
913 int
914 munmap(caddr_t addr, size_t len)
915 {
916 	struct proc *p = curproc;
917 	struct as *as = p->p_as;
918 
919 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
920 		return (set_errno(EINVAL));
921 
922 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
923 		return (set_errno(EINVAL));
924 
925 	/*
926 	 * Discard lwpchan mappings.
927 	 */
928 	if (p->p_lcp != NULL)
929 		lwpchan_delete_mapping(p, addr, addr + len);
930 	if (as_unmap(as, addr, len) != 0)
931 		return (set_errno(EINVAL));
932 
933 	return (0);
934 }
935 
936 int
937 mprotect(caddr_t addr, size_t len, int prot)
938 {
939 	struct as *as = curproc->p_as;
940 	uint_t uprot = prot | PROT_USER;
941 	int error;
942 
943 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
944 		return (set_errno(EINVAL));
945 
946 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
947 	case RANGE_OKAY:
948 		break;
949 	case RANGE_BADPROT:
950 		return (set_errno(ENOTSUP));
951 	case RANGE_BADADDR:
952 	default:
953 		return (set_errno(ENOMEM));
954 	}
955 
956 	error = as_setprot(as, addr, len, uprot);
957 	if (error)
958 		return (set_errno(error));
959 	return (0);
960 }
961 
962 #define	MC_CACHE	128			/* internal result buffer */
963 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
964 
965 int
966 mincore(caddr_t addr, size_t len, char *vecp)
967 {
968 	struct as *as = curproc->p_as;
969 	caddr_t ea;			/* end address of loop */
970 	size_t rl;			/* inner result length */
971 	char vec[MC_CACHE];		/* local vector cache */
972 	int error;
973 	model_t model;
974 	long	llen;
975 
976 	model = get_udatamodel();
977 	/*
978 	 * Validate form of address parameters.
979 	 */
980 	if (model == DATAMODEL_NATIVE) {
981 		llen = (long)len;
982 	} else {
983 		llen = (int32_t)(size32_t)len;
984 	}
985 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
986 		return (set_errno(EINVAL));
987 
988 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
989 		return (set_errno(ENOMEM));
990 
991 	/*
992 	 * Loop over subranges of interval [addr : addr + len), recovering
993 	 * results internally and then copying them out to caller.  Subrange
994 	 * is based on the size of MC_CACHE, defined above.
995 	 */
996 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
997 		error = as_incore(as, addr,
998 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
999 		if (rl != 0) {
1000 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
1001 			if (copyout(vec, vecp, rl) != 0)
1002 				return (set_errno(EFAULT));
1003 			vecp += rl;
1004 		}
1005 		if (error != 0)
1006 			return (set_errno(ENOMEM));
1007 	}
1008 	return (0);
1009 }
1010