xref: /titanic_51/usr/src/uts/common/os/grow.c (revision c8343062f6e25afd9c2a31b65df357030e69fa55)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/types.h>
34 #include <sys/inttypes.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/systm.h>
38 #include <sys/signal.h>
39 #include <sys/user.h>
40 #include <sys/errno.h>
41 #include <sys/var.h>
42 #include <sys/proc.h>
43 #include <sys/tuneable.h>
44 #include <sys/debug.h>
45 #include <sys/cmn_err.h>
46 #include <sys/cred.h>
47 #include <sys/vnode.h>
48 #include <sys/vfs.h>
49 #include <sys/vm.h>
50 #include <sys/file.h>
51 #include <sys/mman.h>
52 #include <sys/vmparam.h>
53 #include <sys/fcntl.h>
54 #include <sys/lwpchan_impl.h>
55 
56 #include <vm/hat.h>
57 #include <vm/as.h>
58 #include <vm/seg.h>
59 #include <vm/seg_dev.h>
60 #include <vm/seg_vn.h>
61 
62 int use_brk_lpg = 1;
63 int use_stk_lpg = 1;
64 int use_zmap_lpg = 1;
65 
66 static int brk_lpg(caddr_t nva);
67 static int grow_lpg(caddr_t sp);
68 
69 int
70 brk(caddr_t nva)
71 {
72 	int error;
73 	proc_t *p = curproc;
74 
75 	/*
76 	 * Serialize brk operations on an address space.
77 	 * This also serves as the lock protecting p_brksize
78 	 * and p_brkpageszc.
79 	 */
80 	as_rangelock(p->p_as);
81 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
82 		error = brk_lpg(nva);
83 	} else {
84 		error = brk_internal(nva, p->p_brkpageszc);
85 	}
86 	as_rangeunlock(p->p_as);
87 	return ((error != 0 ? set_errno(error) : 0));
88 }
89 
90 /*
91  * Algorithm: call arch-specific map_pgsz to get best page size to use,
92  * then call brk_internal().
93  * Returns 0 on success.
94  */
95 static int
96 brk_lpg(caddr_t nva)
97 {
98 	struct proc *p = curproc;
99 	size_t pgsz, len;
100 	caddr_t addr;
101 	caddr_t bssbase = p->p_bssbase;
102 	caddr_t brkbase = p->p_brkbase;
103 	int oszc, szc;
104 	int err;
105 	int remap = 0;
106 
107 	oszc = p->p_brkpageszc;
108 
109 	/*
110 	 * If p_brkbase has not yet been set, the first call
111 	 * to brk_internal() will initialize it.
112 	 */
113 	if (brkbase == 0) {
114 		return (brk_internal(nva, oszc));
115 	}
116 
117 	len = nva - bssbase;
118 
119 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, &remap);
120 	szc = page_szc(pgsz);
121 
122 	/*
123 	 * Covers two cases:
124 	 * 1. page_szc() returns -1 for invalid page size, so we want to
125 	 * ignore it in that case.
126 	 * 2. By design we never decrease page size, as it is more stable.
127 	 */
128 	if (szc <= oszc) {
129 		err = brk_internal(nva, oszc);
130 		/* If failed, back off to base page size. */
131 		if (err != 0 && oszc != 0) {
132 			err = brk_internal(nva, 0);
133 		}
134 		return (err);
135 	}
136 
137 	if (remap == 0) {
138 		/*
139 		 * Map from the current brk end up to the new page size
140 		 * alignment using the current page size.
141 		 */
142 		addr = brkbase + p->p_brksize;
143 		addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
144 		if (addr < nva) {
145 			err = brk_internal(addr, oszc);
146 			/*
147 			 * In failure case, try again if oszc is not base page
148 			 * size, then return err.
149 			 */
150 			if (err != 0) {
151 				if (oszc != 0) {
152 					err = brk_internal(nva, 0);
153 				}
154 				return (err);
155 			}
156 		}
157 	}
158 
159 	err = brk_internal(nva, szc);
160 	/* If using szc failed, map with base page size and return. */
161 	if (err != 0) {
162 		if (szc != 0) {
163 			err = brk_internal(nva, 0);
164 		}
165 		return (err);
166 	}
167 
168 	if (remap != 0) {
169 		/*
170 		 * Round up brk base to a large page boundary and remap
171 		 * anything in the segment already faulted in beyond that
172 		 * point.
173 		 */
174 		addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
175 		len = (brkbase + p->p_brksize) - addr;
176 		/* advisory, so ignore errors */
177 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
178 	}
179 
180 	ASSERT(err == 0);
181 	return (err);		/* should always be 0 */
182 }
183 
184 /*
185  * Returns 0 on success.
186  */
187 int
188 brk_internal(caddr_t nva, uint_t brkszc)
189 {
190 	caddr_t ova;			/* current break address */
191 	size_t size;
192 	int	error;
193 	struct proc *p = curproc;
194 	struct as *as = p->p_as;
195 	size_t pgsz;
196 	uint_t szc;
197 	rctl_qty_t as_rctl;
198 
199 	/*
200 	 * extend heap to brkszc alignment but use current p->p_brkpageszc
201 	 * for the newly created segment. This allows the new extension
202 	 * segment to be concatenated successfully with the existing brk
203 	 * segment.
204 	 */
205 	if ((szc = brkszc) != 0) {
206 		pgsz = page_get_pagesize(szc);
207 		ASSERT(pgsz > PAGESIZE);
208 	} else {
209 		pgsz = PAGESIZE;
210 	}
211 
212 	mutex_enter(&p->p_lock);
213 	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
214 	    p->p_rctls, p);
215 	mutex_exit(&p->p_lock);
216 
217 	/*
218 	 * If p_brkbase has not yet been set, the first call
219 	 * to brk() will initialize it.
220 	 */
221 	if (p->p_brkbase == 0)
222 		p->p_brkbase = nva;
223 
224 	/*
225 	 * Before multiple page size support existed p_brksize was the value
226 	 * not rounded to the pagesize (i.e. it stored the exact user request
227 	 * for heap size). If pgsz is greater than PAGESIZE calculate the
228 	 * heap size as the real new heap size by rounding it up to pgsz.
229 	 * This is useful since we may want to know where the heap ends
230 	 * without knowing heap pagesize (e.g. some old code) and also if
231 	 * heap pagesize changes we can update p_brkpageszc but delay adding
232 	 * new mapping yet still know from p_brksize where the heap really
233 	 * ends. The user requested heap end is stored in libc variable.
234 	 */
235 	if (pgsz > PAGESIZE) {
236 		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
237 		size = tnva - p->p_brkbase;
238 		if (tnva < p->p_brkbase || (size > p->p_brksize &&
239 		    size > (size_t)as_rctl)) {
240 			szc = 0;
241 			pgsz = PAGESIZE;
242 			size = nva - p->p_brkbase;
243 		}
244 	} else {
245 		size = nva - p->p_brkbase;
246 	}
247 
248 	/*
249 	 * use PAGESIZE to roundup ova because we want to know the real value
250 	 * of the current heap end in case p_brkpageszc changes since the last
251 	 * p_brksize was computed.
252 	 */
253 	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
254 	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
255 		PAGESIZE);
256 
257 	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
258 	    size > as_rctl)) {
259 		mutex_enter(&p->p_lock);
260 		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
261 		    RCA_SAFE);
262 		mutex_exit(&p->p_lock);
263 		return (ENOMEM);
264 	}
265 
266 	if (nva > ova) {
267 		struct segvn_crargs crargs =
268 		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
269 
270 		if (!(p->p_datprot & PROT_EXEC)) {
271 			crargs.prot &= ~PROT_EXEC;
272 		}
273 
274 		/*
275 		 * Add new zfod mapping to extend UNIX data segment
276 		 */
277 		crargs.szc = szc;
278 		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
279 		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
280 		    &crargs);
281 		if (error) {
282 			return (error);
283 		}
284 
285 	} else if (nva < ova) {
286 		/*
287 		 * Release mapping to shrink UNIX data segment.
288 		 */
289 		(void) as_unmap(as, nva, (size_t)(ova - nva));
290 	}
291 	p->p_brksize = size;
292 	p->p_brkpageszc = szc;
293 	return (0);
294 }
295 
296 /*
297  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
298  * This routine assumes that the stack grows downward.
299  */
300 int
301 grow(caddr_t sp)
302 {
303 	struct proc *p = curproc;
304 	int err;
305 
306 	/*
307 	 * Serialize grow operations on an address space.
308 	 * This also serves as the lock protecting p_stksize
309 	 * and p_stkpageszc.
310 	 */
311 	as_rangelock(p->p_as);
312 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
313 		err = grow_lpg(sp);
314 	} else {
315 		err = grow_internal(sp, p->p_stkpageszc);
316 	}
317 	as_rangeunlock(p->p_as);
318 	return ((err == 0 ? 1 : 0));
319 }
320 
321 /*
322  * Algorithm: call arch-specific map_pgsz to get best page size to use,
323  * then call grow_internal().
324  * Returns 0 on success.
325  */
326 static int
327 grow_lpg(caddr_t sp)
328 {
329 	struct proc *p = curproc;
330 	size_t pgsz;
331 	size_t len, newsize;
332 	caddr_t addr, oldsp;
333 	int oszc, szc;
334 	int err;
335 	int remap = 0;
336 
337 	newsize = p->p_usrstack - sp;
338 
339 	oszc = p->p_stkpageszc;
340 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, &remap);
341 	szc = page_szc(pgsz);
342 
343 	/*
344 	 * Covers two cases:
345 	 * 1. page_szc() returns -1 for invalid page size, so we want to
346 	 * ignore it in that case.
347 	 * 2. By design we never decrease page size, as it is more stable.
348 	 * This shouldn't happen as the stack never shrinks.
349 	 */
350 	if (szc <= oszc) {
351 		err = grow_internal(sp, oszc);
352 		/* failed, fall back to base page size */
353 		if (err != 0 && oszc != 0) {
354 			err = grow_internal(sp, 0);
355 		}
356 		return (err);
357 	}
358 
359 	/*
360 	 * We've grown sufficiently to switch to a new page size.
361 	 * If we're not going to remap the whole segment with the new
362 	 * page size, split the grow into two operations: map to the new
363 	 * page size alignment boundary with the existing page size, then
364 	 * map the rest with the new page size.
365 	 */
366 	err = 0;
367 	if (remap == 0) {
368 		oldsp = p->p_usrstack - p->p_stksize;
369 		addr = (caddr_t)P2ALIGN((uintptr_t)oldsp, pgsz);
370 		if (addr > sp) {
371 			err = grow_internal(addr, oszc);
372 			/*
373 			 * In this case, grow with oszc failed, so grow all the
374 			 * way to sp with base page size.
375 			 */
376 			if (err != 0) {
377 				if (oszc != 0) {
378 					err = grow_internal(sp, 0);
379 				}
380 				return (err);
381 			}
382 		}
383 	}
384 
385 	err = grow_internal(sp, szc);
386 	/* The grow with szc failed, so fall back to base page size. */
387 	if (err != 0) {
388 		if (szc != 0) {
389 			err = grow_internal(sp, 0);
390 		}
391 		return (err);
392 	}
393 
394 	if (remap) {
395 		/*
396 		 * Round up stack pointer to a large page boundary and remap
397 		 * any pgsz pages in the segment already faulted in beyond that
398 		 * point.
399 		 */
400 		addr = p->p_usrstack - p->p_stksize;
401 		addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
402 		len = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz) - addr;
403 		/* advisory, so ignore errors */
404 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
405 	}
406 
407 	/* Update page size code for stack. */
408 	p->p_stkpageszc = szc;
409 
410 	ASSERT(err == 0);
411 	return (err);		/* should always be 0 */
412 }
413 
414 /*
415  * This routine assumes that the stack grows downward.
416  * Returns 0 on success, errno on failure.
417  */
418 int
419 grow_internal(caddr_t sp, uint_t growszc)
420 {
421 	struct proc *p = curproc;
422 	struct as *as = p->p_as;
423 	size_t newsize = p->p_usrstack - sp;
424 	size_t oldsize;
425 	int    error;
426 	size_t pgsz;
427 	uint_t szc;
428 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
429 
430 	ASSERT(sp < p->p_usrstack);
431 
432 	/*
433 	 * grow to growszc alignment but use current p->p_stkpageszc for
434 	 * the segvn_crargs szc passed to segvn_create. For memcntl to
435 	 * increase the szc, this allows the new extension segment to be
436 	 * concatenated successfully with the existing stack segment.
437 	 */
438 	if ((szc = growszc) != 0) {
439 		pgsz = page_get_pagesize(szc);
440 		ASSERT(pgsz > PAGESIZE);
441 		newsize = P2ROUNDUP(newsize, pgsz);
442 		if (newsize > (size_t)p->p_stk_ctl) {
443 			szc = 0;
444 			pgsz = PAGESIZE;
445 			newsize = p->p_usrstack - sp;
446 		}
447 	} else {
448 		pgsz = PAGESIZE;
449 	}
450 
451 	if (newsize > (size_t)p->p_stk_ctl) {
452 		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
453 		    RCA_UNSAFE_ALL);
454 
455 		return (ENOMEM);
456 	}
457 
458 	oldsize = p->p_stksize;
459 	newsize = P2ROUNDUP(newsize, pgsz);
460 	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
461 
462 	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
463 		return (0);
464 	}
465 
466 	if (!(p->p_stkprot & PROT_EXEC)) {
467 		crargs.prot &= ~PROT_EXEC;
468 	}
469 	/*
470 	 * extend stack with the p_stkpageszc. growszc is different than
471 	 * p_stkpageszc only on a memcntl to increase the stack pagesize.
472 	 */
473 	crargs.szc = p->p_stkpageszc;
474 	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
475 
476 	if ((error = as_map(as, p->p_usrstack - newsize, newsize - oldsize,
477 	    segvn_create, &crargs)) != 0) {
478 		if (error == EAGAIN) {
479 			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
480 			    "for pid %d (%s)", p->p_pid, u.u_comm);
481 		}
482 		return (error);
483 	}
484 	p->p_stksize = newsize;
485 
486 
487 	/*
488 	 * Set up translations so the process doesn't have to fault in
489 	 * the stack pages we just gave it.
490 	 */
491 	(void) as_fault(as->a_hat, as,
492 	    p->p_usrstack - newsize, newsize - oldsize, F_INVAL, S_WRITE);
493 
494 	return (0);
495 }
496 
497 /*
498  * Used for MAP_ANON - fast way to get anonymous pages
499  */
500 static int
501 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
502     offset_t pos)
503 {
504 	struct segvn_crargs a, b;
505 	struct proc *p = curproc;
506 	int err;
507 	size_t pgsz;
508 	size_t l0, l1, l2, l3, l4; /* 0th through 5th chunks */
509 	caddr_t ruaddr, ruaddr0; /* rounded up addresses */
510 	extern size_t auto_lpg_va_default;
511 
512 	if (((PROT_ALL & uprot) != uprot))
513 		return (EACCES);
514 
515 	if ((flags & MAP_FIXED) != 0) {
516 		caddr_t userlimit;
517 
518 		/*
519 		 * Use the user address.  First verify that
520 		 * the address to be used is page aligned.
521 		 * Then make some simple bounds checks.
522 		 */
523 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
524 			return (EINVAL);
525 
526 		userlimit = flags & _MAP_LOW32 ?
527 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
528 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
529 		case RANGE_OKAY:
530 			break;
531 		case RANGE_BADPROT:
532 			return (ENOTSUP);
533 		case RANGE_BADADDR:
534 		default:
535 			return (ENOMEM);
536 		}
537 		(void) as_unmap(as, *addrp, len);
538 	} else {
539 		/*
540 		 * No need to worry about vac alignment for anonymous
541 		 * pages since this is a "clone" object that doesn't
542 		 * yet exist.
543 		 */
544 		map_addr(addrp, len, pos, 0, flags);
545 		if (*addrp == NULL)
546 			return (ENOMEM);
547 	}
548 
549 	/*
550 	 * Use the seg_vn segment driver; passing in the NULL amp
551 	 * gives the desired "cloning" effect.
552 	 */
553 	a.vp = NULL;
554 	a.offset = 0;
555 	a.type = flags & MAP_TYPE;
556 	a.prot = uprot;
557 	a.maxprot = PROT_ALL;
558 	a.flags = flags & ~MAP_TYPE;
559 	a.cred = CRED();
560 	a.amp = NULL;
561 	a.szc = 0;
562 	a.lgrp_mem_policy_flags = 0;
563 
564 	/*
565 	 * Call arch-specific map_pgsz routine to pick best page size to map
566 	 * this segment, and break the mapping up into parts if required.
567 	 *
568 	 * The parts work like this:
569 	 *
570 	 * addr		---------
571 	 *		|	| l0
572 	 *		---------
573 	 *		|	| l1
574 	 *		---------
575 	 *		|	| l2
576 	 *		---------
577 	 *		|	| l3
578 	 *		---------
579 	 *		|	| l4
580 	 *		---------
581 	 * addr+len
582 	 *
583 	 * Starting from the middle, l2 is the number of bytes mapped by the
584 	 * selected large page.  l1 and l3 are mapped by auto_lpg_va_default
585 	 * page size pages, and l0 and l4 are mapped by base page size pages.
586 	 * If auto_lpg_va_default is the base page size, then l0 == l4 == 0.
587 	 * If the requested address or length are aligned to the selected large
588 	 * page size, l1 or l3 may also be 0.
589 	 */
590 	if (use_zmap_lpg) {
591 
592 		pgsz = map_pgsz(MAPPGSZ_VA, p, *addrp, len, NULL);
593 		if (pgsz <= PAGESIZE || len < pgsz) {
594 			return (as_map(as, *addrp, len, segvn_create, &a));
595 		}
596 
597 		ruaddr = (caddr_t)P2ROUNDUP((uintptr_t)*addrp, pgsz);
598 		if (auto_lpg_va_default != MMU_PAGESIZE) {
599 			ruaddr0 = (caddr_t)P2ROUNDUP((uintptr_t)*addrp,
600 			    auto_lpg_va_default);
601 			l0 = ruaddr0 - *addrp;
602 		} else {
603 			l0 = 0;
604 			ruaddr0 = *addrp;
605 		}
606 		l1 = ruaddr - ruaddr0;
607 		l3 = P2PHASE(len - l0 - l1, pgsz);
608 		if (auto_lpg_va_default == MMU_PAGESIZE) {
609 			l4 = 0;
610 		} else {
611 			l4 = P2PHASE(l3, auto_lpg_va_default);
612 			l3 -= l4;
613 		}
614 		l2 = len - l0 - l1 - l3 - l4;
615 
616 		if (l0) {
617 			b = a;
618 			err = as_map(as, *addrp, l0, segvn_create, &b);
619 			if (err) {
620 				return (err);
621 			}
622 		}
623 
624 		if (l1) {
625 			b = a;
626 			b.szc = page_szc(auto_lpg_va_default);
627 			err = as_map(as, ruaddr0, l1, segvn_create, &b);
628 			if (err) {
629 				goto error1;
630 			}
631 		}
632 
633 		if (l2) {
634 			b = a;
635 			b.szc = page_szc(pgsz);
636 			err = as_map(as, ruaddr, l2, segvn_create, &b);
637 			if (err) {
638 				goto error2;
639 			}
640 		}
641 
642 		if (l3) {
643 			b = a;
644 			b.szc = page_szc(auto_lpg_va_default);
645 			err = as_map(as, ruaddr + l2, l3, segvn_create, &b);
646 			if (err) {
647 				goto error3;
648 			}
649 		}
650 		if (l4) {
651 			err = as_map(as, ruaddr + l2 + l3, l4, segvn_create,
652 			    &a);
653 			if (err) {
654 error3:
655 				if (l3) {
656 					(void) as_unmap(as, ruaddr + l2, l3);
657 				}
658 error2:
659 				if (l2) {
660 					(void) as_unmap(as, ruaddr, l2);
661 				}
662 error1:
663 				if (l1) {
664 					(void) as_unmap(as, ruaddr0, l1);
665 				}
666 				if (l0) {
667 					(void) as_unmap(as, *addrp, l0);
668 				}
669 				return (err);
670 			}
671 		}
672 
673 		return (0);
674 	}
675 
676 	return (as_map(as, *addrp, len, segvn_create, &a));
677 }
678 
679 static int
680 smmap_common(caddr_t *addrp, size_t len,
681     int prot, int flags, struct file *fp, offset_t pos)
682 {
683 	struct vnode *vp;
684 	struct as *as = curproc->p_as;
685 	uint_t uprot, maxprot, type;
686 	int error;
687 
688 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
689 	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
690 	    MAP_TEXT | MAP_INITDATA)) != 0) {
691 		/* | MAP_RENAME */	/* not implemented, let user know */
692 		return (EINVAL);
693 	}
694 
695 	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
696 		return (EINVAL);
697 	}
698 
699 	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
700 		return (EINVAL);
701 	}
702 
703 #if defined(__sparc)
704 	/*
705 	 * See if this is an "old mmap call".  If so, remember this
706 	 * fact and convert the flags value given to mmap to indicate
707 	 * the specified address in the system call must be used.
708 	 * _MAP_NEW is turned set by all new uses of mmap.
709 	 */
710 	if ((flags & _MAP_NEW) == 0)
711 		flags |= MAP_FIXED;
712 #endif
713 	flags &= ~_MAP_NEW;
714 
715 	type = flags & MAP_TYPE;
716 	if (type != MAP_PRIVATE && type != MAP_SHARED)
717 		return (EINVAL);
718 
719 
720 	if (flags & MAP_ALIGN) {
721 
722 		if (flags & MAP_FIXED)
723 			return (EINVAL);
724 
725 		/* alignment needs to be a power of 2 >= page size */
726 		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
727 			!ISP2((uintptr_t)*addrp))
728 			return (EINVAL);
729 	}
730 	/*
731 	 * Check for bad lengths and file position.
732 	 * We let the VOP_MAP routine check for negative lengths
733 	 * since on some vnode types this might be appropriate.
734 	 */
735 	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
736 		return (EINVAL);
737 
738 	maxprot = PROT_ALL;		/* start out allowing all accesses */
739 	uprot = prot | PROT_USER;
740 
741 	if (fp == NULL) {
742 		ASSERT(flags & MAP_ANON);
743 		as_rangelock(as);
744 		error = zmap(as, addrp, len, uprot, flags, pos);
745 		as_rangeunlock(as);
746 		return (error);
747 	} else if ((flags & MAP_ANON) != 0)
748 		return (EINVAL);
749 
750 	vp = fp->f_vnode;
751 
752 	/* Can't execute code from "noexec" mounted filesystem. */
753 	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
754 		maxprot &= ~PROT_EXEC;
755 
756 	/*
757 	 * These checks were added as part of large files.
758 	 *
759 	 * Return ENXIO if the initial position is negative; return EOVERFLOW
760 	 * if (offset + len) would overflow the maximum allowed offset for the
761 	 * type of file descriptor being used.
762 	 */
763 	if (vp->v_type == VREG) {
764 		if (pos < 0)
765 			return (ENXIO);
766 		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
767 			return (EOVERFLOW);
768 	}
769 
770 	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
771 		/* no write access allowed */
772 		maxprot &= ~PROT_WRITE;
773 	}
774 
775 	/*
776 	 * XXX - Do we also adjust maxprot based on protections
777 	 * of the vnode?  E.g. if no execute permission is given
778 	 * on the vnode for the current user, maxprot probably
779 	 * should disallow PROT_EXEC also?  This is different
780 	 * from the write access as this would be a per vnode
781 	 * test as opposed to a per fd test for writability.
782 	 */
783 
784 	/*
785 	 * Verify that the specified protections are not greater than
786 	 * the maximum allowable protections.  Also test to make sure
787 	 * that the file descriptor does allows for read access since
788 	 * "write only" mappings are hard to do since normally we do
789 	 * the read from the file before the page can be written.
790 	 */
791 	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
792 		return (EACCES);
793 
794 	/*
795 	 * If the user specified an address, do some simple checks here
796 	 */
797 	if ((flags & MAP_FIXED) != 0) {
798 		caddr_t userlimit;
799 
800 		/*
801 		 * Use the user address.  First verify that
802 		 * the address to be used is page aligned.
803 		 * Then make some simple bounds checks.
804 		 */
805 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
806 			return (EINVAL);
807 
808 		userlimit = flags & _MAP_LOW32 ?
809 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
810 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
811 		case RANGE_OKAY:
812 			break;
813 		case RANGE_BADPROT:
814 			return (ENOTSUP);
815 		case RANGE_BADADDR:
816 		default:
817 			return (ENOMEM);
818 		}
819 	}
820 
821 
822 	/*
823 	 * Ok, now let the vnode map routine do its thing to set things up.
824 	 */
825 	error = VOP_MAP(vp, pos, as,
826 	    addrp, len, uprot, maxprot, flags, fp->f_cred);
827 
828 	if (error == 0) {
829 		if (vp->v_type == VREG &&
830 		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
831 			/*
832 			 * Mark this as an executable vnode
833 			 */
834 			mutex_enter(&vp->v_lock);
835 			vp->v_flag |= VVMEXEC;
836 			mutex_exit(&vp->v_lock);
837 		}
838 	}
839 
840 	return (error);
841 }
842 
843 #ifdef _LP64
844 /*
845  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
846  *
847  * The "large file" mmap routine mmap64(2) is also mapped to this routine
848  * by the 64-bit version of libc.
849  *
850  * Eventually, this should be the only version, and have smmap_common()
851  * folded back into it again.  Some day.
852  */
853 caddr_t
854 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
855 {
856 	struct file *fp;
857 	int error;
858 
859 	if (flags & _MAP_LOW32)
860 		error = EINVAL;
861 	else if (fd == -1 && (flags & MAP_ANON) != 0)
862 		error = smmap_common(&addr, len, prot, flags,
863 		    NULL, (offset_t)pos);
864 	else if ((fp = getf(fd)) != NULL) {
865 		error = smmap_common(&addr, len, prot, flags,
866 		    fp, (offset_t)pos);
867 		releasef(fd);
868 	} else
869 		error = EBADF;
870 
871 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
872 }
873 #endif	/* _LP64 */
874 
875 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
876 
877 /*
878  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
879  */
880 caddr_t
881 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
882 {
883 	struct file *fp;
884 	int error;
885 	caddr_t a = (caddr_t)(uintptr_t)addr;
886 
887 	if (flags & _MAP_LOW32)
888 		error = EINVAL;
889 	else if (fd == -1 && (flags & MAP_ANON) != 0)
890 		error = smmap_common(&a, (size_t)len, prot,
891 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
892 	else if ((fp = getf(fd)) != NULL) {
893 		error = smmap_common(&a, (size_t)len, prot,
894 		    flags | _MAP_LOW32, fp, (offset_t)pos);
895 		releasef(fd);
896 	} else
897 		error = EBADF;
898 
899 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
900 
901 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
902 }
903 
904 /*
905  * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
906  *
907  * Now things really get ugly because we can't use the C-style
908  * calling convention for more than 6 args, and 64-bit parameter
909  * passing on 32-bit systems is less than clean.
910  */
911 
912 struct mmaplf32a {
913 	caddr_t addr;
914 	size_t len;
915 #ifdef _LP64
916 	/*
917 	 * 32-bit contents, 64-bit cells
918 	 */
919 	uint64_t prot;
920 	uint64_t flags;
921 	uint64_t fd;
922 	uint64_t offhi;
923 	uint64_t offlo;
924 #else
925 	/*
926 	 * 32-bit contents, 32-bit cells
927 	 */
928 	uint32_t prot;
929 	uint32_t flags;
930 	uint32_t fd;
931 	uint32_t offhi;
932 	uint32_t offlo;
933 #endif
934 };
935 
936 int
937 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
938 {
939 	struct file *fp;
940 	int error;
941 	caddr_t a = uap->addr;
942 	int flags = (int)uap->flags;
943 	int fd = (int)uap->fd;
944 #ifdef _BIG_ENDIAN
945 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
946 #else
947 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
948 #endif
949 
950 	if (flags & _MAP_LOW32)
951 		error = EINVAL;
952 	else if (fd == -1 && (flags & MAP_ANON) != 0)
953 		error = smmap_common(&a, uap->len, (int)uap->prot,
954 		    flags | _MAP_LOW32, NULL, off);
955 	else if ((fp = getf(fd)) != NULL) {
956 		error = smmap_common(&a, uap->len, (int)uap->prot,
957 		    flags | _MAP_LOW32, fp, off);
958 		releasef(fd);
959 	} else
960 		error = EBADF;
961 
962 	if (error == 0)
963 		rvp->r_val1 = (uintptr_t)a;
964 	return (error);
965 }
966 
967 #endif	/* _SYSCALL32_IMPL || _ILP32 */
968 
969 int
970 munmap(caddr_t addr, size_t len)
971 {
972 	struct proc *p = curproc;
973 	struct as *as = p->p_as;
974 
975 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
976 		return (set_errno(EINVAL));
977 
978 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
979 		return (set_errno(EINVAL));
980 
981 	/*
982 	 * Discard lwpchan mappings.
983 	 */
984 	if (p->p_lcp != NULL)
985 		lwpchan_delete_mapping(p, addr, addr + len);
986 	if (as_unmap(as, addr, len) != 0)
987 		return (set_errno(EINVAL));
988 
989 	return (0);
990 }
991 
992 int
993 mprotect(caddr_t addr, size_t len, int prot)
994 {
995 	struct as *as = curproc->p_as;
996 	uint_t uprot = prot | PROT_USER;
997 	int error;
998 
999 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
1000 		return (set_errno(EINVAL));
1001 
1002 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
1003 	case RANGE_OKAY:
1004 		break;
1005 	case RANGE_BADPROT:
1006 		return (set_errno(ENOTSUP));
1007 	case RANGE_BADADDR:
1008 	default:
1009 		return (set_errno(ENOMEM));
1010 	}
1011 
1012 	error = as_setprot(as, addr, len, uprot);
1013 	if (error)
1014 		return (set_errno(error));
1015 	return (0);
1016 }
1017 
1018 #define	MC_CACHE	128			/* internal result buffer */
1019 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
1020 
1021 int
1022 mincore(caddr_t addr, size_t len, char *vecp)
1023 {
1024 	struct as *as = curproc->p_as;
1025 	caddr_t ea;			/* end address of loop */
1026 	size_t rl;			/* inner result length */
1027 	char vec[MC_CACHE];		/* local vector cache */
1028 	int error;
1029 	model_t model;
1030 	long	llen;
1031 
1032 	model = get_udatamodel();
1033 	/*
1034 	 * Validate form of address parameters.
1035 	 */
1036 	if (model == DATAMODEL_NATIVE) {
1037 		llen = (long)len;
1038 	} else {
1039 		llen = (int32_t)(size32_t)len;
1040 	}
1041 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1042 		return (set_errno(EINVAL));
1043 
1044 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1045 		return (set_errno(ENOMEM));
1046 
1047 	/*
1048 	 * Loop over subranges of interval [addr : addr + len), recovering
1049 	 * results internally and then copying them out to caller.  Subrange
1050 	 * is based on the size of MC_CACHE, defined above.
1051 	 */
1052 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1053 		error = as_incore(as, addr,
1054 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1055 		if (rl != 0) {
1056 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
1057 			if (copyout(vec, vecp, rl) != 0)
1058 				return (set_errno(EFAULT));
1059 			vecp += rl;
1060 		}
1061 		if (error != 0)
1062 			return (set_errno(ENOMEM));
1063 	}
1064 	return (0);
1065 }
1066