xref: /titanic_41/usr/src/uts/common/os/grow.c (revision 8c74a1f9477c04aa8539a84a49aa2bf629c7a14d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/types.h>
33 #include <sys/inttypes.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/signal.h>
38 #include <sys/user.h>
39 #include <sys/errno.h>
40 #include <sys/var.h>
41 #include <sys/proc.h>
42 #include <sys/tuneable.h>
43 #include <sys/debug.h>
44 #include <sys/cmn_err.h>
45 #include <sys/cred.h>
46 #include <sys/vnode.h>
47 #include <sys/vfs.h>
48 #include <sys/vm.h>
49 #include <sys/file.h>
50 #include <sys/mman.h>
51 #include <sys/vmparam.h>
52 #include <sys/fcntl.h>
53 #include <sys/lwpchan_impl.h>
54 
55 #include <vm/hat.h>
56 #include <vm/as.h>
57 #include <vm/seg.h>
58 #include <vm/seg_dev.h>
59 #include <vm/seg_vn.h>
60 
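/*
 * Tunables enabling automatic large-page selection for the heap (brk),
 * the stack (grow) and anonymous MAP_ANON mappings (zmap), respectively.
 * The brk and stack paths additionally require the per-process SAUTOLPG
 * flag; see brk() and grow() below.
 */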
61 int use_brk_lpg = 1;
62 int use_stk_lpg = 1;
63 int use_zmap_lpg = 1;
64 
65 static int brk_lpg(caddr_t nva);
66 static int grow_lpg(caddr_t sp);
67 
68 int
69 brk(caddr_t nva)
70 {
71 	int error;
72 	proc_t *p = curproc;
73 
74 	/*
75 	 * Serialize brk operations on an address space.
76 	 * This also serves as the lock protecting p_brksize
77 	 * and p_brkpageszc.
78 	 */
79 	as_rangelock(p->p_as);
80 	if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
81 		error = brk_lpg(nva);
82 	} else {
83 		error = brk_internal(nva, p->p_brkpageszc);
84 	}
85 	as_rangeunlock(p->p_as);
86 	return ((error != 0 ? set_errno(error) : 0));
87 }
88 
89 /*
90  * Algorithm: call arch-specific map_pgsz to get best page size to use,
91  * then call brk_internal().
92  * Returns 0 on success.
93  */
94 static int
95 brk_lpg(caddr_t nva)
96 {
97 	struct proc *p = curproc;
98 	size_t pgsz, len;
99 	caddr_t addr;
100 	caddr_t bssbase = p->p_bssbase;
101 	caddr_t brkbase = p->p_brkbase;
102 	int oszc, szc;
103 	int err;
104 	int remap = 0;
105 
106 	oszc = p->p_brkpageszc;
107 
108 	/*
109 	 * If p_brkbase has not yet been set, the first call
110 	 * to brk_internal() will initialize it.
111 	 */
112 	if (brkbase == 0) {
113 		return (brk_internal(nva, oszc));
114 	}
115 
116 	len = nva - bssbase;
117 
118 	pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, &remap);
119 	szc = page_szc(pgsz);
120 
121 	/*
122 	 * Covers two cases:
123 	 * 1. page_szc() returns -1 for invalid page size, so we want to
124 	 * ignore it in that case.
125 	 * 2. By design we never decrease page size, as it is more stable.
126 	 */
127 	if (szc <= oszc) {
128 		err = brk_internal(nva, oszc);
129 		/* If failed, back off to base page size. */
130 		if (err != 0 && oszc != 0) {
131 			err = brk_internal(nva, 0);
132 		}
133 		return (err);
134 	}
135 
136 	if (remap == 0) {
137 		/*
138 		 * Map from the current brk end up to the new page size
139 		 * alignment using the current page size.
140 		 */
141 		addr = brkbase + p->p_brksize;
142 		addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
143 		if (addr < nva) {
144 			err = brk_internal(addr, oszc);
145 			/*
146 			 * On failure, retry with the base page size if oszc is
147 			 * not already the base size, then return err.
148 			 */
149 			if (err != 0) {
150 				if (oszc != 0) {
151 					err = brk_internal(nva, 0);
152 				}
153 				return (err);
154 			}
155 		}
156 	}
157 
158 	err = brk_internal(nva, szc);
159 	/* If using szc failed, map with base page size and return. */
160 	if (err != 0) {
161 		if (szc != 0) {
162 			err = brk_internal(nva, 0);
163 		}
164 		return (err);
165 	}
166 
167 	if (remap != 0) {
168 		/*
169 		 * Round up brk base to a large page boundary and remap
170 		 * anything in the segment already faulted in beyond that
171 		 * point.
172 		 */
173 		addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
174 		len = (brkbase + p->p_brksize) - addr;
175 		/* advisory, so ignore errors */
176 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
177 	}
178 
179 	ASSERT(err == 0);
180 	return (err);		/* should always be 0 */
181 }
182 
183 /*
184  * Returns 0 on success.
185  */
186 int
187 brk_internal(caddr_t nva, uint_t brkszc)
188 {
189 	caddr_t ova;			/* current break address */
190 	size_t size;
191 	int	error;
192 	struct proc *p = curproc;
193 	struct as *as = p->p_as;
194 	size_t pgsz;
195 	uint_t szc;
196 	rctl_qty_t as_rctl;
197 
198 	/*
199 	 * extend heap to brkszc alignment but use current p->p_brkpageszc
200 	 * for the newly created segment. This allows the new extension
201 	 * segment to be concatenated successfully with the existing brk
202 	 * segment.
203 	 */
204 	if ((szc = brkszc) != 0) {
205 		pgsz = page_get_pagesize(szc);
206 		ASSERT(pgsz > PAGESIZE);
207 	} else {
208 		pgsz = PAGESIZE;
209 	}
210 
211 	mutex_enter(&p->p_lock);
212 	as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
213 	    p->p_rctls, p);
214 	mutex_exit(&p->p_lock);
215 
216 	/*
217 	 * If p_brkbase has not yet been set, the first call
218 	 * to brk() will initialize it.
219 	 */
220 	if (p->p_brkbase == 0)
221 		p->p_brkbase = nva;
222 
223 	/*
224 	 * Before multiple page size support existed, p_brksize was not
225 	 * rounded to the page size (i.e. it stored the exact user request
226 	 * for the heap size).  If pgsz is greater than PAGESIZE, compute the
227 	 * heap size as the real new heap size by rounding it up to pgsz.
228 	 * This is useful because we may want to know where the heap ends
229 	 * without knowing the heap page size (e.g. some old code), and if
230 	 * the heap page size changes we can update p_brkpageszc but delay
231 	 * adding the new mapping, yet still know from p_brksize where the
232 	 * heap really ends.  The user-requested heap end is stored in libc.
233 	 */
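	/*
	 * Illustrative example (hypothetical numbers): with pgsz = 4M and a
	 * 4M-aligned p_brkbase, a request for nva = p_brkbase + 5M rounds
	 * tnva up to p_brkbase + 8M, so p_brksize will record 8M while the
	 * exact 5M request is remembered only by libc.
	 */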
234 	if (pgsz > PAGESIZE) {
235 		caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
236 		size = tnva - p->p_brkbase;
237 		if (tnva < p->p_brkbase || (size > p->p_brksize &&
238 		    size > (size_t)as_rctl)) {
239 			szc = 0;
240 			pgsz = PAGESIZE;
241 			size = nva - p->p_brkbase;
242 		}
243 	} else {
244 		size = nva - p->p_brkbase;
245 	}
246 
247 	/*
248 	 * Use PAGESIZE to round up ova because we want to know the real
249 	 * value of the current heap end in case p_brkpageszc has changed
250 	 * since p_brksize was last computed.
251 	 */
252 	nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
253 	ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
254 	    PAGESIZE);
255 
256 	if ((nva < p->p_brkbase) || (size > p->p_brksize &&
257 	    size > as_rctl)) {
258 		mutex_enter(&p->p_lock);
259 		(void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
260 		    RCA_SAFE);
261 		mutex_exit(&p->p_lock);
262 		return (ENOMEM);
263 	}
264 
265 	if (nva > ova) {
266 		struct segvn_crargs crargs =
267 		    SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
268 
269 		if (!(p->p_datprot & PROT_EXEC)) {
270 			crargs.prot &= ~PROT_EXEC;
271 		}
272 
273 		/*
274 		 * Add new zfod mapping to extend UNIX data segment
275 		 */
276 		crargs.szc = szc;
277 		crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
278 		error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
279 		    &crargs);
280 		if (error) {
281 			return (error);
282 		}
283 
284 	} else if (nva < ova) {
285 		/*
286 		 * Release mapping to shrink UNIX data segment.
287 		 */
288 		(void) as_unmap(as, nva, (size_t)(ova - nva));
289 	}
290 	p->p_brksize = size;
291 	p->p_brkpageszc = szc;
292 	return (0);
293 }
294 
295 /*
296  * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
297  * This routine assumes that the stack grows downward.
298  */
299 int
300 grow(caddr_t sp)
301 {
302 	struct proc *p = curproc;
303 	int err;
304 
305 	/*
306 	 * Serialize grow operations on an address space.
307 	 * This also serves as the lock protecting p_stksize
308 	 * and p_stkpageszc.
309 	 */
310 	as_rangelock(p->p_as);
311 	if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
312 		err = grow_lpg(sp);
313 	} else {
314 		err = grow_internal(sp, p->p_stkpageszc);
315 	}
316 	as_rangeunlock(p->p_as);
317 	return ((err == 0 ? 1 : 0));
318 }
319 
320 /*
321  * Algorithm: call arch-specific map_pgsz to get best page size to use,
322  * then call grow_internal().
323  * Returns 0 on success.
324  */
325 static int
326 grow_lpg(caddr_t sp)
327 {
328 	struct proc *p = curproc;
329 	size_t pgsz;
330 	size_t len, newsize;
331 	caddr_t addr, oldsp;
332 	int oszc, szc;
333 	int err;
334 	int remap = 0;
335 
336 	newsize = p->p_usrstack - sp;
337 
338 	oszc = p->p_stkpageszc;
339 	pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, &remap);
340 	szc = page_szc(pgsz);
341 
342 	/*
343 	 * Covers two cases:
344 	 * 1. page_szc() returns -1 for invalid page size, so we want to
345 	 * ignore it in that case.
346 	 * 2. By design we never decrease page size, as it is more stable.
347 	 * This shouldn't happen as the stack never shrinks.
348 	 */
349 	if (szc <= oszc) {
350 		err = grow_internal(sp, oszc);
351 		/* failed, fall back to base page size */
352 		if (err != 0 && oszc != 0) {
353 			err = grow_internal(sp, 0);
354 		}
355 		return (err);
356 	}
357 
358 	/*
359 	 * We've grown sufficiently to switch to a new page size.
360 	 * If we're not going to remap the whole segment with the new
361 	 * page size, split the grow into two operations: map to the new
362 	 * page size alignment boundary with the existing page size, then
363 	 * map the rest with the new page size.
364 	 */
365 	err = 0;
366 	if (remap == 0) {
367 		oldsp = p->p_usrstack - p->p_stksize;
368 		addr = (caddr_t)P2ALIGN((uintptr_t)oldsp, pgsz);
369 		if (addr > sp) {
370 			err = grow_internal(addr, oszc);
371 			/*
372 			 * If the grow with oszc failed, grow all the way to
373 			 * sp with the base page size.
374 			 */
375 			if (err != 0) {
376 				if (oszc != 0) {
377 					err = grow_internal(sp, 0);
378 				}
379 				return (err);
380 			}
381 		}
382 	}
383 
384 	err = grow_internal(sp, szc);
385 	/* The grow with szc failed, so fall back to base page size. */
386 	if (err != 0) {
387 		if (szc != 0) {
388 			err = grow_internal(sp, 0);
389 		}
390 		return (err);
391 	}
392 
393 	if (remap) {
394 		/*
395 		 * Round the low end of the stack up to a large page boundary
396 		 * and remap any pgsz pages in the segment already faulted in
397 		 * above that point.
398 		 */
399 		addr = p->p_usrstack - p->p_stksize;
400 		addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
401 		len = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz) - addr;
402 		/* advisory, so ignore errors */
403 		(void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
404 	}
405 
406 	/* Update page size code for stack. */
407 	p->p_stkpageszc = szc;
408 
409 	ASSERT(err == 0);
410 	return (err);		/* should always be 0 */
411 }
412 
413 /*
414  * This routine assumes that the stack grows downward.
415  * Returns 0 on success, errno on failure.
416  */
417 int
418 grow_internal(caddr_t sp, uint_t growszc)
419 {
420 	struct proc *p = curproc;
421 	struct as *as = p->p_as;
422 	size_t newsize = p->p_usrstack - sp;
423 	size_t oldsize;
424 	int    error;
425 	size_t pgsz;
426 	uint_t szc;
427 	struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
428 
429 	ASSERT(sp < p->p_usrstack);
430 
431 	/*
432 	 * Grow to growszc alignment but use the current p->p_stkpageszc for
433 	 * the segvn_crargs szc passed to segvn_create.  When memcntl() is
434 	 * used to increase the szc, this allows the new extension segment to
435 	 * be concatenated successfully with the existing stack segment.
436 	 */
437 	if ((szc = growszc) != 0) {
438 		pgsz = page_get_pagesize(szc);
439 		ASSERT(pgsz > PAGESIZE);
440 		newsize = P2ROUNDUP(newsize, pgsz);
441 		if (newsize > (size_t)p->p_stk_ctl) {
442 			szc = 0;
443 			pgsz = PAGESIZE;
444 			newsize = p->p_usrstack - sp;
445 		}
446 	} else {
447 		pgsz = PAGESIZE;
448 	}
449 
450 	if (newsize > (size_t)p->p_stk_ctl) {
451 		(void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
452 		    RCA_UNSAFE_ALL);
453 
454 		return (ENOMEM);
455 	}
456 
457 	oldsize = p->p_stksize;
458 	newsize = P2ROUNDUP(newsize, pgsz);
459 	ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
460 
461 	if (newsize <= oldsize) {	/* prevent the stack from shrinking */
462 		return (0);
463 	}
464 
465 	if (!(p->p_stkprot & PROT_EXEC)) {
466 		crargs.prot &= ~PROT_EXEC;
467 	}
468 	/*
469 	 * extend stack with the p_stkpageszc. growszc is different than
470 	 * Extend the stack with p_stkpageszc.  growszc differs from
471 	 * p_stkpageszc only on a memcntl() to increase the stack pagesize.
472 	crargs.szc = p->p_stkpageszc;
473 	crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
474 
475 	if ((error = as_map(as, p->p_usrstack - newsize, newsize - oldsize,
476 	    segvn_create, &crargs)) != 0) {
477 		if (error == EAGAIN) {
478 			cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
479 			    "for pid %d (%s)", p->p_pid, u.u_comm);
480 		}
481 		return (error);
482 	}
483 	p->p_stksize = newsize;
484 
485 
486 	/*
487 	 * Set up translations so the process doesn't have to fault in
488 	 * the stack pages we just gave it.
489 	 */
490 	(void) as_fault(as->a_hat, as,
491 	    p->p_usrstack - newsize, newsize - oldsize, F_INVAL, S_WRITE);
492 
493 	return (0);
494 }
495 
496 /*
497  * Used for MAP_ANON - fast way to get anonymous pages
498  */
499 static int
500 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
501     offset_t pos)
502 {
503 	struct segvn_crargs a, b;
504 	struct proc *p = curproc;
505 	int err;
506 	size_t pgsz;
507 	size_t l0, l1, l2, l3, l4; /* 0th through 4th chunks */
508 	caddr_t ruaddr, ruaddr0; /* rounded up addresses */
509 	extern size_t auto_lpg_va_default;
510 
511 	if (((PROT_ALL & uprot) != uprot))
512 		return (EACCES);
513 
514 	if ((flags & MAP_FIXED) != 0) {
515 		caddr_t userlimit;
516 
517 		/*
518 		 * Use the user address.  First verify that
519 		 * the address to be used is page aligned.
520 		 * Then make some simple bounds checks.
521 		 */
522 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
523 			return (EINVAL);
524 
525 		userlimit = flags & _MAP_LOW32 ?
526 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
527 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
528 		case RANGE_OKAY:
529 			break;
530 		case RANGE_BADPROT:
531 			return (ENOTSUP);
532 		case RANGE_BADADDR:
533 		default:
534 			return (ENOMEM);
535 		}
536 		(void) as_unmap(as, *addrp, len);
537 	} else {
538 		/*
539 		 * No need to worry about vac alignment for anonymous
540 		 * pages since this is a "clone" object that doesn't
541 		 * yet exist.
542 		 */
543 		map_addr(addrp, len, pos, 0, flags);
544 		if (*addrp == NULL)
545 			return (ENOMEM);
546 	}
547 
548 	/*
549 	 * Use the seg_vn segment driver; passing in the NULL amp
550 	 * gives the desired "cloning" effect.
551 	 */
552 	a.vp = NULL;
553 	a.offset = 0;
554 	a.type = flags & MAP_TYPE;
555 	a.prot = uprot;
556 	a.maxprot = PROT_ALL;
557 	a.flags = flags & ~MAP_TYPE;
558 	a.cred = CRED();
559 	a.amp = NULL;
560 	a.szc = 0;
561 	a.lgrp_mem_policy_flags = 0;
562 
563 	/*
564 	 * Call arch-specific map_pgsz routine to pick best page size to map
565 	 * this segment, and break the mapping up into parts if required.
566 	 *
567 	 * The parts work like this:
568 	 *
569 	 * addr		---------
570 	 *		|	| l0
571 	 *		---------
572 	 *		|	| l1
573 	 *		---------
574 	 *		|	| l2
575 	 *		---------
576 	 *		|	| l3
577 	 *		---------
578 	 *		|	| l4
579 	 *		---------
580 	 * addr+len
581 	 *
582 	 * Starting from the middle, l2 is the number of bytes mapped by the
583 	 * selected large page size.  l1 and l3 are mapped by
584 	 * auto_lpg_va_default-size pages, and l0 and l4 are mapped by base
585 	 * page size pages.  If auto_lpg_va_default is the base page size,
586 	 * then l0 == l4 == 0.  If the requested address or length is aligned
587 	 * to the selected large page size, l1 or l3 may also be 0.
588 	 */
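	/*
	 * A hypothetical split, assuming an 8K base page size, a 64K
	 * auto_lpg_va_default and a selected pgsz of 4M: for addr 0x208000
	 * and len 0xa2a000, l0 = 32K (base pages up to the 64K boundary at
	 * 0x210000), l1 = 1984K (64K pages up to the 4M boundary at
	 * 0x400000), l2 = 8M of 4M pages, and l3 = 192K plus l4 = 8K cover
	 * the remainder.
	 */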
589 	if (use_zmap_lpg && a.type == MAP_PRIVATE) {
590 
591 		pgsz = map_pgsz(MAPPGSZ_VA, p, *addrp, len, NULL);
592 		if (pgsz <= PAGESIZE || len < pgsz) {
593 			return (as_map(as, *addrp, len, segvn_create, &a));
594 		}
595 
596 		ruaddr = (caddr_t)P2ROUNDUP((uintptr_t)*addrp, pgsz);
597 		if (auto_lpg_va_default != MMU_PAGESIZE) {
598 			ruaddr0 = (caddr_t)P2ROUNDUP((uintptr_t)*addrp,
599 			    auto_lpg_va_default);
600 			l0 = ruaddr0 - *addrp;
601 		} else {
602 			l0 = 0;
603 			ruaddr0 = *addrp;
604 		}
605 		l1 = ruaddr - ruaddr0;
606 		l3 = P2PHASE(len - l0 - l1, pgsz);
607 		if (auto_lpg_va_default == MMU_PAGESIZE) {
608 			l4 = 0;
609 		} else {
610 			l4 = P2PHASE(l3, auto_lpg_va_default);
611 			l3 -= l4;
612 		}
613 		l2 = len - l0 - l1 - l3 - l4;
614 
615 		if (l0) {
616 			b = a;
617 			err = as_map(as, *addrp, l0, segvn_create, &b);
618 			if (err) {
619 				return (err);
620 			}
621 		}
622 
623 		if (l1) {
624 			b = a;
625 			b.szc = page_szc(auto_lpg_va_default);
626 			err = as_map(as, ruaddr0, l1, segvn_create, &b);
627 			if (err) {
628 				goto error1;
629 			}
630 		}
631 
632 		if (l2) {
633 			b = a;
634 			b.szc = page_szc(pgsz);
635 			err = as_map(as, ruaddr, l2, segvn_create, &b);
636 			if (err) {
637 				goto error2;
638 			}
639 		}
640 
641 		if (l3) {
642 			b = a;
643 			b.szc = page_szc(auto_lpg_va_default);
644 			err = as_map(as, ruaddr + l2, l3, segvn_create, &b);
645 			if (err) {
646 				goto error3;
647 			}
648 		}
649 		if (l4) {
650 			err = as_map(as, ruaddr + l2 + l3, l4, segvn_create,
651 			    &a);
652 			if (err) {
653 error3:
654 				if (l3) {
655 					(void) as_unmap(as, ruaddr + l2, l3);
656 				}
657 error2:
658 				if (l2) {
659 					(void) as_unmap(as, ruaddr, l2);
660 				}
661 error1:
662 				if (l1) {
663 					(void) as_unmap(as, ruaddr0, l1);
664 				}
665 				if (l0) {
666 					(void) as_unmap(as, *addrp, l0);
667 				}
668 				return (err);
669 			}
670 		}
671 
672 		return (0);
673 	}
674 
675 	return (as_map(as, *addrp, len, segvn_create, &a));
676 }
677 
678 static int
679 smmap_common(caddr_t *addrp, size_t len,
680     int prot, int flags, struct file *fp, offset_t pos)
681 {
682 	struct vnode *vp;
683 	struct as *as = curproc->p_as;
684 	uint_t uprot, maxprot, type;
685 	int error;
686 
687 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
688 	    _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
689 	    MAP_TEXT | MAP_INITDATA)) != 0) {
690 		/* | MAP_RENAME */	/* not implemented, let user know */
691 		return (EINVAL);
692 	}
693 
694 	if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
695 		return (EINVAL);
696 	}
697 
698 	if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
699 		return (EINVAL);
700 	}
701 
702 #if defined(__sparc)
703 	/*
704 	 * See if this is an "old mmap call".  If so, convert the flags
705 	 * value given to mmap to indicate that the address specified in
706 	 * the system call must be used.
707 	 * _MAP_NEW is set by all new uses of mmap.
708 	 */
709 	if ((flags & _MAP_NEW) == 0)
710 		flags |= MAP_FIXED;
711 #endif
712 	flags &= ~_MAP_NEW;
713 
714 	type = flags & MAP_TYPE;
715 	if (type != MAP_PRIVATE && type != MAP_SHARED)
716 		return (EINVAL);
717 
718 
719 	if (flags & MAP_ALIGN) {
720 
721 		if (flags & MAP_FIXED)
722 			return (EINVAL);
723 
724 		/* alignment needs to be a power of 2 >= page size */
725 		if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
726 		    !ISP2((uintptr_t)*addrp))
727 			return (EINVAL);
728 	}
729 	/*
730 	 * Check for bad lengths and file position.
731 	 * We let the VOP_MAP routine check for negative lengths
732 	 * since on some vnode types this might be appropriate.
733 	 */
734 	if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
735 		return (EINVAL);
736 
737 	maxprot = PROT_ALL;		/* start out allowing all accesses */
738 	uprot = prot | PROT_USER;
739 
740 	if (fp == NULL) {
741 		ASSERT(flags & MAP_ANON);
742 		as_rangelock(as);
743 		error = zmap(as, addrp, len, uprot, flags, pos);
744 		as_rangeunlock(as);
745 		return (error);
746 	} else if ((flags & MAP_ANON) != 0)
747 		return (EINVAL);
748 
749 	vp = fp->f_vnode;
750 
751 	/* Can't execute code from "noexec" mounted filesystem. */
752 	if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
753 		maxprot &= ~PROT_EXEC;
754 
755 	/*
756 	 * These checks were added as part of large file support.
757 	 *
758 	 * Return ENXIO if the initial position is negative; return EOVERFLOW
759 	 * if (offset + len) would overflow the maximum allowed offset for the
760 	 * type of file descriptor being used.
761 	 */
762 	if (vp->v_type == VREG) {
763 		if (pos < 0)
764 			return (ENXIO);
765 		if ((offset_t)len > (OFFSET_MAX(fp) - pos))
766 			return (EOVERFLOW);
767 	}
768 
769 	if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
770 		/* no write access allowed */
771 		maxprot &= ~PROT_WRITE;
772 	}
773 
774 	/*
775 	 * XXX - Should we also adjust maxprot based on the protections of
776 	 * the vnode?  E.g. if the current user has no execute permission
777 	 * on the vnode, maxprot should probably disallow PROT_EXEC as
778 	 * well.  This differs from the write-access case, since it would
779 	 * be a per-vnode test rather than the per-fd test used for
780 	 * writability.
781 	 */
782 
783 	/*
784 	 * Verify that the specified protections are not greater than
785 	 * the maximum allowable protections.  Also make sure that the
786 	 * file descriptor allows read access, since "write only"
787 	 * mappings are hard to do: normally we read from the file
788 	 * before the page can be written.
789 	 */
790 	if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
791 		return (EACCES);
792 
793 	/*
794 	 * If the user specified an address, do some simple checks here
795 	 */
796 	if ((flags & MAP_FIXED) != 0) {
797 		caddr_t userlimit;
798 
799 		/*
800 		 * Use the user address.  First verify that
801 		 * the address to be used is page aligned.
802 		 * Then make some simple bounds checks.
803 		 */
804 		if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
805 			return (EINVAL);
806 
807 		userlimit = flags & _MAP_LOW32 ?
808 		    (caddr_t)USERLIMIT32 : as->a_userlimit;
809 		switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
810 		case RANGE_OKAY:
811 			break;
812 		case RANGE_BADPROT:
813 			return (ENOTSUP);
814 		case RANGE_BADADDR:
815 		default:
816 			return (ENOMEM);
817 		}
818 	}
819 
820 
821 	/*
822 	 * Ok, now let the vnode map routine do its thing to set things up.
823 	 */
824 	error = VOP_MAP(vp, pos, as,
825 	    addrp, len, uprot, maxprot, flags, fp->f_cred);
826 
827 	if (error == 0) {
828 		if (vp->v_type == VREG &&
829 		    (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
830 			/*
831 			 * Mark this as an executable vnode
832 			 */
833 			mutex_enter(&vp->v_lock);
834 			vp->v_flag |= VVMEXEC;
835 			mutex_exit(&vp->v_lock);
836 		}
837 	}
838 
839 	return (error);
840 }
841 
842 #ifdef _LP64
843 /*
844  * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
845  *
846  * The "large file" mmap routine mmap64(2) is also mapped to this routine
847  * by the 64-bit version of libc.
848  *
849  * Eventually, this should be the only version, and have smmap_common()
850  * folded back into it again.  Some day.
851  */
852 caddr_t
853 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
854 {
855 	struct file *fp;
856 	int error;
857 
858 	if (flags & _MAP_LOW32)
859 		error = EINVAL;
860 	else if (fd == -1 && (flags & MAP_ANON) != 0)
861 		error = smmap_common(&addr, len, prot, flags,
862 		    NULL, (offset_t)pos);
863 	else if ((fp = getf(fd)) != NULL) {
864 		error = smmap_common(&addr, len, prot, flags,
865 		    fp, (offset_t)pos);
866 		releasef(fd);
867 	} else
868 		error = EBADF;
869 
870 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
871 }
872 #endif	/* _LP64 */
873 
874 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
875 
876 /*
877  * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
878  */
879 caddr_t
880 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
881 {
882 	struct file *fp;
883 	int error;
884 	caddr_t a = (caddr_t)(uintptr_t)addr;
885 
886 	if (flags & _MAP_LOW32)
887 		error = EINVAL;
888 	else if (fd == -1 && (flags & MAP_ANON) != 0)
889 		error = smmap_common(&a, (size_t)len, prot,
890 		    flags | _MAP_LOW32, NULL, (offset_t)pos);
891 	else if ((fp = getf(fd)) != NULL) {
892 		error = smmap_common(&a, (size_t)len, prot,
893 		    flags | _MAP_LOW32, fp, (offset_t)pos);
894 		releasef(fd);
895 	} else
896 		error = EBADF;
897 
898 	ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
899 
900 	return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
901 }
902 
903 /*
904  * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
905  *
906  * Now things really get ugly because we can't use the C-style
907  * calling convention for more than 6 args, and 64-bit parameter
908  * passing on 32-bit systems is less than clean.
909  */
910 
911 struct mmaplf32a {
912 	caddr_t addr;
913 	size_t len;
914 #ifdef _LP64
915 	/*
916 	 * 32-bit contents, 64-bit cells
917 	 */
918 	uint64_t prot;
919 	uint64_t flags;
920 	uint64_t fd;
921 	uint64_t offhi;
922 	uint64_t offlo;
923 #else
924 	/*
925 	 * 32-bit contents, 32-bit cells
926 	 */
927 	uint32_t prot;
928 	uint32_t flags;
929 	uint32_t fd;
930 	uint32_t offhi;
931 	uint32_t offlo;
932 #endif
933 };
934 
935 int
936 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
937 {
938 	struct file *fp;
939 	int error;
940 	caddr_t a = uap->addr;
941 	int flags = (int)uap->flags;
942 	int fd = (int)uap->fd;
943 #ifdef _BIG_ENDIAN
944 	offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
945 #else
946 	offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
947 #endif
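	/*
	 * Presumably the offhi/offlo names describe the big-endian layout:
	 * a 32-bit little-endian caller passes the low word of the 64-bit
	 * offset first, so the two halves are swapped when "off" is
	 * reassembled above.
	 */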
948 
949 	if (flags & _MAP_LOW32)
950 		error = EINVAL;
951 	else if (fd == -1 && (flags & MAP_ANON) != 0)
952 		error = smmap_common(&a, uap->len, (int)uap->prot,
953 		    flags | _MAP_LOW32, NULL, off);
954 	else if ((fp = getf(fd)) != NULL) {
955 		error = smmap_common(&a, uap->len, (int)uap->prot,
956 		    flags | _MAP_LOW32, fp, off);
957 		releasef(fd);
958 	} else
959 		error = EBADF;
960 
961 	if (error == 0)
962 		rvp->r_val1 = (uintptr_t)a;
963 	return (error);
964 }
965 
966 #endif	/* _SYSCALL32_IMPL || _ILP32 */
967 
968 int
969 munmap(caddr_t addr, size_t len)
970 {
971 	struct proc *p = curproc;
972 	struct as *as = p->p_as;
973 
974 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
975 		return (set_errno(EINVAL));
976 
977 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
978 		return (set_errno(EINVAL));
979 
980 	/*
981 	 * Discard lwpchan mappings.
982 	 */
983 	if (p->p_lcp != NULL)
984 		lwpchan_delete_mapping(p, addr, addr + len);
985 	if (as_unmap(as, addr, len) != 0)
986 		return (set_errno(EINVAL));
987 
988 	return (0);
989 }
990 
991 int
992 mprotect(caddr_t addr, size_t len, int prot)
993 {
994 	struct as *as = curproc->p_as;
995 	uint_t uprot = prot | PROT_USER;
996 	int error;
997 
998 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
999 		return (set_errno(EINVAL));
1000 
1001 	switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
1002 	case RANGE_OKAY:
1003 		break;
1004 	case RANGE_BADPROT:
1005 		return (set_errno(ENOTSUP));
1006 	case RANGE_BADADDR:
1007 	default:
1008 		return (set_errno(ENOMEM));
1009 	}
1010 
1011 	error = as_setprot(as, addr, len, uprot);
1012 	if (error)
1013 		return (set_errno(error));
1014 	return (0);
1015 }
1016 
1017 #define	MC_CACHE	128			/* internal result buffer */
1018 #define	MC_QUANTUM	(MC_CACHE * PAGESIZE)	/* addresses covered in loop */
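/*
 * Each pass of the mincore() loop below covers at most MC_CACHE pages,
 * i.e. MC_QUANTUM bytes of address space (for example, 512K with 4K
 * pages), and yields one result byte per page.
 */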
1019 
1020 int
1021 mincore(caddr_t addr, size_t len, char *vecp)
1022 {
1023 	struct as *as = curproc->p_as;
1024 	caddr_t ea;			/* end address of loop */
1025 	size_t rl;			/* inner result length */
1026 	char vec[MC_CACHE];		/* local vector cache */
1027 	int error;
1028 	model_t model;
1029 	long	llen;
1030 
1031 	model = get_udatamodel();
1032 	/*
1033 	 * Validate form of address parameters.
1034 	 */
1035 	if (model == DATAMODEL_NATIVE) {
1036 		llen = (long)len;
1037 	} else {
1038 		llen = (int32_t)(size32_t)len;
1039 	}
1040 	if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1041 		return (set_errno(EINVAL));
1042 
1043 	if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1044 		return (set_errno(ENOMEM));
1045 
1046 	/*
1047 	 * Loop over subranges of interval [addr : addr + len), recovering
1048 	 * results internally and then copying them out to caller.  Subrange
1049 	 * is based on the size of MC_CACHE, defined above.
1050 	 */
1051 	for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1052 		error = as_incore(as, addr,
1053 		    (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1054 		if (rl != 0) {
1055 			rl = (rl + PAGESIZE - 1) / PAGESIZE;
1056 			if (copyout(vec, vecp, rl) != 0)
1057 				return (set_errno(EFAULT));
1058 			vecp += rl;
1059 		}
1060 		if (error != 0)
1061 			return (set_errno(ENOMEM));
1062 	}
1063 	return (0);
1064 }
1065