xref: /illumos-gate/usr/src/uts/common/vm/vm_as.c (revision 1ee1bcba6f9a17099e11b5bb0d1eeab71c729aa3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2015, Joyent, Inc.  All rights reserved.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 /*
32  * University Copyright- Copyright (c) 1982, 1986, 1988
33  * The Regents of the University of California
34  * All Rights Reserved
35  *
36  * University Acknowledgment- Portions of this document are derived from
37  * software developed by the University of California, Berkeley, and its
38  * contributors.
39  */
40 
41 /*
42  * VM - address spaces.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/vtrace.h>
61 
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_vn.h>
66 #include <vm/seg_dev.h>
67 #include <vm/seg_kmem.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_spt.h>
70 #include <vm/page.h>
71 
/*
 * Tick count handed to delay() by as_fault() and as_faulta() before they
 * retry a fault that came back with an EDEADLK error code.
 */
clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

/* kmem cache from which every struct as is allocated; see as_init(). */
static struct kmem_cache *as_cache;

/* Forward declarations; the definitions are not in this part of the file. */
static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);


/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
/* as_verify() returns immediately while this is 0. */
int do_as_verify = 0;
#endif
89 
90 /*
91  * Allocate a new callback data structure entry and fill in the events of
92  * interest, the address range of interest, and the callback argument.
93  * Link the entry on the as->a_callbacks list. A callback entry for the
94  * entire address space may be specified with vaddr = 0 and size = -1.
95  *
96  * CALLERS RESPONSIBILITY: If not calling from within the process context for
97  * the specified as, the caller must guarantee persistence of the specified as
98  * for the duration of this function (eg. pages being locked within the as
99  * will guarantee persistence).
100  */
101 int
102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103     caddr_t vaddr, size_t size, int sleepflag)
104 {
105 	struct as_callback 	*current_head, *cb;
106 	caddr_t 		saddr;
107 	size_t 			rsize;
108 
109 	/* callback function and an event are mandatory */
110 	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 		return (EINVAL);
112 
113 	/* Adding a callback after as_free has been called is not allowed */
114 	if (as == &kas)
115 		return (ENOMEM);
116 
117 	/*
118 	 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 	 * is the entire address space so no rounding is done in that case.
120 	 */
121 	if (size != -1) {
122 		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 		    (size_t)saddr;
125 		/* check for wraparound */
126 		if (saddr + rsize < saddr)
127 			return (ENOMEM);
128 	} else {
129 		if (vaddr != 0)
130 			return (EINVAL);
131 		saddr = vaddr;
132 		rsize = size;
133 	}
134 
135 	/* Allocate and initialize a callback entry */
136 	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 	if (cb == NULL)
138 		return (EAGAIN);
139 
140 	cb->ascb_func = cb_func;
141 	cb->ascb_arg = arg;
142 	cb->ascb_events = events;
143 	cb->ascb_saddr = saddr;
144 	cb->ascb_len = rsize;
145 
146 	/* Add the entry to the list */
147 	mutex_enter(&as->a_contents);
148 	current_head = as->a_callbacks;
149 	as->a_callbacks = cb;
150 	cb->ascb_next = current_head;
151 
152 	/*
153 	 * The call to this function may lose in a race with
154 	 * a pertinent event - eg. a thread does long term memory locking
155 	 * but before the callback is added another thread executes as_unmap.
156 	 * A broadcast here resolves that.
157 	 */
158 	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 		AS_CLRUNMAPWAIT(as);
160 		cv_broadcast(&as->a_cv);
161 	}
162 
163 	mutex_exit(&as->a_contents);
164 	return (0);
165 }
166 
167 /*
168  * Search the callback list for an entry which pertains to arg.
169  *
170  * This is called from within the client upon completion of the callback.
171  * RETURN VALUES:
172  *	AS_CALLBACK_DELETED  (callback entry found and deleted)
173  *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174  *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175  *			entry will be made in as_do_callbacks)
176  *
177  * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178  * set, it indicates that as_do_callbacks is processing this entry.  The
179  * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180  * to unblock as_do_callbacks, in case it is blocked.
181  *
182  * CALLERS RESPONSIBILITY: If not calling from within the process context for
183  * the specified as, the caller must guarantee persistence of the specified as
184  * for the duration of this function (eg. pages being locked within the as
185  * will guarantee persistence).
186  */
187 uint_t
188 as_delete_callback(struct as *as, void *arg)
189 {
190 	struct as_callback **prevcb = &as->a_callbacks;
191 	struct as_callback *cb;
192 	uint_t rc = AS_CALLBACK_NOTFOUND;
193 
194 	mutex_enter(&as->a_contents);
195 	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 		if (cb->ascb_arg != arg)
197 			continue;
198 
199 		/*
200 		 * If the events indicate AS_CALLBACK_CALLED, just clear
201 		 * AS_ALL_EVENT in the events field and wakeup the thread
202 		 * that may be waiting in as_do_callbacks.  as_do_callbacks
203 		 * will take care of removing this entry from the list.  In
204 		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
205 		 * (AS_CALLBACK_CALLED not set), just remove it from the
206 		 * list, return the memory and return AS_CALLBACK_DELETED.
207 		 */
208 		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 			/* leave AS_CALLBACK_CALLED */
210 			cb->ascb_events &= ~AS_ALL_EVENT;
211 			rc = AS_CALLBACK_DELETE_DEFERRED;
212 			cv_broadcast(&as->a_cv);
213 		} else {
214 			*prevcb = cb->ascb_next;
215 			kmem_free(cb, sizeof (struct as_callback));
216 			rc = AS_CALLBACK_DELETED;
217 		}
218 		break;
219 	}
220 	mutex_exit(&as->a_contents);
221 	return (rc);
222 }
223 
224 /*
225  * Searches the as callback list for a matching entry.
226  * Returns a pointer to the first matching callback, or NULL if
227  * nothing is found.
228  * This function never sleeps so it is ok to call it with more
229  * locks held but the (required) a_contents mutex.
230  *
231  * See also comment on as_do_callbacks below.
232  */
233 static struct as_callback *
234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235     size_t event_len)
236 {
237 	struct as_callback	*cb;
238 
239 	ASSERT(MUTEX_HELD(&as->a_contents));
240 	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
241 		/*
242 		 * If the callback has not already been called, then
243 		 * check if events or address range pertains.  An event_len
244 		 * of zero means do an unconditional callback.
245 		 */
246 		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 		    (event_addr + event_len < cb->ascb_saddr) ||
249 		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 			continue;
251 		}
252 		break;
253 	}
254 	return (cb);
255 }
256 
/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * as	   address space owning the callback list
 * cb	   entry to run; at least one bit of `events' must be set in it
 * events  event mask handed to the callback function
 *
 * a_contents must be held on entry (asserted).  It is dropped while the
 * callback function runs and is held again when this function returns.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
	struct as_callback **prevcb;
	void	*cb_arg;

	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
	/*
	 * Mark the entry as in progress before dropping the mutex:
	 * as_find_callback() skips marked entries and as_delete_callback()
	 * defers freeing them to this function.
	 */
	cb->ascb_events |= AS_CALLBACK_CALLED;
	mutex_exit(&as->a_contents);
	(*cb->ascb_func)(as, cb->ascb_arg, events);
	mutex_enter(&as->a_contents);
	/*
	 * the callback function is required to delete the callback
	 * when the callback function determines it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * the AS_ALL_EVENT in the events field when it is deleted.
	 * If the callback function called as_delete_callback,
	 * events will already be cleared and there will be no blocking.
	 */
	while ((cb->ascb_events & events) != 0) {
		cv_wait(&as->a_cv, &as->a_contents);
	}
	/*
	 * This entry needs to be taken off the list. Normally, the
	 * callback func itself does that, but unfortunately the list
	 * may have changed while the callback was running because the
	 * a_contents mutex was dropped and someone else other than the
	 * callback func itself could have called as_delete_callback,
	 * so we have to search to find this entry again.  The entry
	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
	 */
	cb_arg = cb->ascb_arg;
	prevcb = &as->a_callbacks;
	/* prevcb tracks the link that points at cb so cb can be unlinked. */
	for (cb = as->a_callbacks; cb != NULL;
	    prevcb = &cb->ascb_next, cb = *prevcb) {
		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
		    (cb_arg != cb->ascb_arg)) {
			continue;
		}
		/* First in-progress entry with the same arg: unlink, free. */
		*prevcb = cb->ascb_next;
		kmem_free(cb, sizeof (struct as_callback));
		break;
	}
}
310 
311 /*
312  * Check the callback list for a matching event and intersection of
313  * address range. If there is a match invoke the callback.  Skip an entry if:
314  *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315  *    - not event of interest
316  *    - not address range of interest
317  *
318  * An event_len of zero indicates a request for an unconditional callback
319  * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
320  * a_contents lock must be dropped before a callback, so only one callback
321  * can be done before returning. Return -1 (true) if a callback was
322  * executed and removed from the list, else return 0 (false).
323  *
324  * The logically separate parts, i.e. finding a matching callback and
325  * executing a given callback have been separated into two functions
326  * so that they can be called with different sets of locks held beyond
327  * the always-required a_contents. as_find_callback does not sleep so
328  * it is ok to call it if more locks than a_contents (i.e. the a_lock
329  * rwlock) are held. as_execute_callback on the other hand may sleep
330  * so all locks beyond a_contents must be dropped by the caller if one
331  * does not want to end comatose.
332  */
333 static int
334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335     size_t event_len)
336 {
337 	struct as_callback *cb;
338 
339 	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 		as_execute_callback(as, cb, events);
341 		return (-1);
342 	}
343 	return (0);
344 }
345 
346 /*
347  * Search for the segment containing addr. If a segment containing addr
348  * exists, that segment is returned.  If no such segment exists, and
349  * the list spans addresses greater than addr, then the first segment
350  * whose base is greater than addr is returned; otherwise, NULL is
351  * returned unless tail is true, in which case the last element of the
352  * list is returned.
353  *
354  * a_seglast is used to cache the last found segment for repeated
355  * searches to the same addr (which happens frequently).
356  */
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
359 {
360 	struct seg *seg = as->a_seglast;
361 	avl_index_t where;
362 
363 	ASSERT(AS_LOCK_HELD(as));
364 
365 	if (seg != NULL &&
366 	    seg->s_base <= addr &&
367 	    addr < seg->s_base + seg->s_size)
368 		return (seg);
369 
370 	seg = avl_find(&as->a_segtree, &addr, &where);
371 	if (seg != NULL)
372 		return (as->a_seglast = seg);
373 
374 	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 	if (seg == NULL && tail)
376 		seg = avl_last(&as->a_segtree);
377 	return (as->a_seglast = seg);
378 }
379 
380 #ifdef VERIFY_SEGLIST
381 /*
382  * verify that the linked list is coherent
383  */
384 static void
385 as_verify(struct as *as)
386 {
387 	struct seg *seg, *seglast, *p, *n;
388 	uint_t nsegs = 0;
389 
390 	if (do_as_verify == 0)
391 		return;
392 
393 	seglast = as->a_seglast;
394 
395 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 		ASSERT(seg->s_as == as);
397 		p = AS_SEGPREV(as, seg);
398 		n = AS_SEGNEXT(as, seg);
399 		ASSERT(p == NULL || p->s_as == as);
400 		ASSERT(p == NULL || p->s_base < seg->s_base);
401 		ASSERT(n == NULL || n->s_base > seg->s_base);
402 		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 		if (seg == seglast)
404 			seglast = NULL;
405 		nsegs++;
406 	}
407 	ASSERT(seglast == NULL);
408 	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 }
410 #endif /* VERIFY_SEGLIST */
411 
/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 *
 * The caller must hold the address space lock as writer (asserted).
 *
 * Returns 0 once newseg has been inserted into as->a_segtree, or -1 if
 * newseg overlaps a segment that is already in the tree.  On success
 * a_seglast is set to newseg.  a_lastgaphl is reset to NULL on every
 * path, and a_updatedir/a_updatetime are refreshed even on failure.
 */
int
as_addseg(struct as  *as, struct seg *newseg)
{
	struct seg *seg;
	caddr_t addr;
	caddr_t eaddr;
	avl_index_t where;

	ASSERT(AS_WRITE_HELD(as));

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Fast path: treat a_lastgaphl and its tree neighbour as the
	 * candidate low (lseg) and high (hseg) neighbours of newseg.  If
	 * newseg's base lies strictly between their bases, insert directly
	 * after lseg without searching the tree.  Only base addresses are
	 * compared here, and as_verify() is not run on this path.
	 */
	if (as->a_lastgaphl != NULL) {
		struct seg *hseg = NULL;
		struct seg *lseg = NULL;

		if (as->a_lastgaphl->s_base > newseg->s_base) {
			hseg = as->a_lastgaphl;
			lseg = AVL_PREV(&as->a_segtree, hseg);
		} else {
			lseg = as->a_lastgaphl;
			hseg = AVL_NEXT(&as->a_segtree, lseg);
		}

		if (hseg && lseg && lseg->s_base < newseg->s_base &&
		    hseg->s_base > newseg->s_base) {
			avl_insert_here(&as->a_segtree, newseg, lseg,
			    AVL_AFTER);
			as->a_lastgaphl = NULL;
			as->a_seglast = newseg;
			return (0);
		}
		/* The hint did not fit; drop it and do a full search. */
		as->a_lastgaphl = NULL;
	}

	addr = newseg->s_base;
	eaddr = addr + newseg->s_size;
again:

	/*
	 * Find the segment containing addr; failing that the next segment
	 * above addr; failing that the last segment in the tree.  `where'
	 * is the insertion index later handed to avl_insert().
	 */
	seg = avl_find(&as->a_segtree, &addr, &where);

	if (seg == NULL)
		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

	if (seg == NULL)
		seg = avl_last(&as->a_segtree);

	if (seg != NULL) {
		caddr_t base = seg->s_base;

		/*
		 * If top of seg is below the requested address, then
		 * the insertion point is at the end of the linked list,
		 * and seg points to the tail of the list.  Otherwise,
		 * the insertion point is immediately before seg.
		 */
		if (base + seg->s_size > addr) {
			/* newseg starts inside seg, or runs into it */
			if (addr >= base || eaddr > base) {
#ifdef __sparc
				extern struct seg_ops segnf_ops;

				/*
				 * no-fault segs must disappear if overlaid.
				 * XXX need new segment type so
				 * we don't have to check s_ops
				 */
				if (seg->s_ops == &segnf_ops) {
					seg_unmap(seg);
					goto again;
				}
#endif
				return (-1);	/* overlapping segment */
			}
		}
	}
	as->a_seglast = newseg;
	avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
	as_verify(as);
#endif
	return (0);
}
501 
502 struct seg *
503 as_removeseg(struct as *as, struct seg *seg)
504 {
505 	avl_tree_t *t;
506 
507 	ASSERT(AS_WRITE_HELD(as));
508 
509 	as->a_updatedir = 1;	/* inform /proc */
510 	gethrestime(&as->a_updatetime);
511 
512 	if (seg == NULL)
513 		return (NULL);
514 
515 	t = &as->a_segtree;
516 	if (as->a_seglast == seg)
517 		as->a_seglast = NULL;
518 	as->a_lastgaphl = NULL;
519 
520 	/*
521 	 * if this segment is at an address higher than
522 	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 	 */
524 	if (as->a_lastgap &&
525 	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 		as->a_lastgap = AVL_NEXT(t, seg);
527 
528 	/*
529 	 * remove the segment from the seg tree
530 	 */
531 	avl_remove(t, seg);
532 
533 #ifdef VERIFY_SEGLIST
534 	as_verify(as);
535 #endif
536 	return (seg);
537 }
538 
539 /*
540  * Find a segment containing addr.
541  */
542 struct seg *
543 as_segat(struct as *as, caddr_t addr)
544 {
545 	struct seg *seg = as->a_seglast;
546 
547 	ASSERT(AS_LOCK_HELD(as));
548 
549 	if (seg != NULL && seg->s_base <= addr &&
550 	    addr < seg->s_base + seg->s_size)
551 		return (seg);
552 
553 	seg = avl_find(&as->a_segtree, &addr, NULL);
554 	return (seg);
555 }
556 
557 /*
558  * Serialize all searches for holes in an address space to
559  * prevent two or more threads from allocating the same virtual
560  * address range.  The address space must not be "read/write"
561  * locked by the caller since we may block.
562  */
563 void
564 as_rangelock(struct as *as)
565 {
566 	mutex_enter(&as->a_contents);
567 	while (AS_ISCLAIMGAP(as))
568 		cv_wait(&as->a_cv, &as->a_contents);
569 	AS_SETCLAIMGAP(as);
570 	mutex_exit(&as->a_contents);
571 }
572 
/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 *
 * Counterpart of as_rangelock(): under a_contents, the claim is cleared
 * with AS_CLRCLAIMGAP() and a_cv is signalled with cv_signal().
 */
void
as_rangeunlock(struct as *as)
{
	mutex_enter(&as->a_contents);
	AS_CLRCLAIMGAP(as);
	cv_signal(&as->a_cv);
	mutex_exit(&as->a_contents);
}
584 
585 /*
586  * compar segments (or just an address) by segment address range
587  */
588 static int
589 as_segcompar(const void *x, const void *y)
590 {
591 	struct seg *a = (struct seg *)x;
592 	struct seg *b = (struct seg *)y;
593 
594 	if (a->s_base < b->s_base)
595 		return (-1);
596 	if (a->s_base >= b->s_base + b->s_size)
597 		return (1);
598 	return (0);
599 }
600 
601 
/*
 * Create the two AVL trees embedded in an address space:
 *   a_segtree - struct seg nodes linked through s_tree, ordered by
 *               as_segcompar()
 *   a_wpage   - struct watched_page nodes linked through wp_link, ordered
 *               by wp_compare()
 */
void
as_avlinit(struct as *as)
{
	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
	    offsetof(struct seg, s_tree));
	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
	    offsetof(struct watched_page, wp_link));
}
610 
/*
 * kmem cache constructor for struct as (registered in as_init()).
 * Initializes the a_contents mutex, the a_cv condition variable and the
 * a_lock rwlock, and creates the AVL trees through as_avlinit().
 * cdrarg and kmflags are not used.  Always returns 0.
 */
/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct as *as = buf;

	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
	as_avlinit(as);
	return (0);
}
623 
624 /*ARGSUSED1*/
625 static void
626 as_destructor(void *buf, void *cdrarg)
627 {
628 	struct as *as = buf;
629 
630 	avl_destroy(&as->a_segtree);
631 	mutex_destroy(&as->a_contents);
632 	cv_destroy(&as->a_cv);
633 	rw_destroy(&as->a_lock);
634 }
635 
/*
 * Create the "as_cache" kmem cache from which as_alloc() obtains struct as
 * objects, with as_constructor() and as_destructor() as its object
 * constructor and destructor.
 */
void
as_init(void)
{
	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
}
642 
643 /*
644  * Allocate and initialize an address space data structure.
645  * We call hat_alloc to allow any machine dependent
646  * information in the hat structure to be initialized.
647  */
648 struct as *
649 as_alloc(void)
650 {
651 	struct as *as;
652 
653 	as = kmem_cache_alloc(as_cache, KM_SLEEP);
654 
655 	as->a_flags		= 0;
656 	as->a_vbits		= 0;
657 	as->a_hrm		= NULL;
658 	as->a_seglast		= NULL;
659 	as->a_size		= 0;
660 	as->a_resvsize		= 0;
661 	as->a_updatedir		= 0;
662 	gethrestime(&as->a_updatetime);
663 	as->a_objectdir		= NULL;
664 	as->a_sizedir		= 0;
665 	as->a_userlimit		= (caddr_t)USERLIMIT;
666 	as->a_lastgap		= NULL;
667 	as->a_lastgaphl		= NULL;
668 	as->a_callbacks		= NULL;
669 	as->a_proc		= NULL;
670 
671 	AS_LOCK_ENTER(as, RW_WRITER);
672 	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
673 	AS_LOCK_EXIT(as);
674 
675 	return (as);
676 }
677 
/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 *
 * The sequence is: run every registered callback, take the address space
 * lock as writer, call hat_free_start() (once only, even if the function
 * restarts from `top'), unmap every segment with SEGOP_UNMAP(), call
 * hat_free_end(), release a_objectdir and return the struct to as_cache.
 *
 * A SEGOP_UNMAP() that returns EAGAIN either restarts the whole sequence
 * from `top' or retries the same segment; see the comments in the loop.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	boolean_t free_started = B_FALSE;	/* hat_free_start() done? */

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, RW_WRITER);

	if (!free_started) {
		free_started = B_TRUE;
		hat_free_start(hat);
	}
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		/* Fetch the successor before seg is unmapped. */
		next = AS_SEGNEXT(as, seg);
retry:
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				/*
				 * Callbacks are registered: drop the AS lock
				 * and restart from top, where they are run.
				 */
				AS_LOCK_EXIT(as);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Memory is currently locked. Wait for a
				 * cv_signal that it has been unlocked, then
				 * try the operation again.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			/*
			 * Reached from the first two branches only, both of
			 * which released the AS lock; `top' takes it again.
			 */
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	hat_free_end(hat);
	AS_LOCK_EXIT(as);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}
772 
/*
 * Duplicate address space `as' on behalf of forkedproc.
 *
 * A new address space is obtained from as_alloc() and inherits the
 * a_userlimit of `as'.  Each segment of `as' that does not carry S_PURGE
 * is given a counterpart through seg_alloc() + SEGOP_DUP(), and the hat is
 * duplicated with hat_dup() (HAT_DUP_SRD before the segments, HAT_DUP_ALL
 * after them).  Both address spaces are held as writer while this
 * happens.  as_clearwatch() is called on `as' at the start and
 * as_setwatch() on every exit path.
 *
 * Returns 0 on success, after storing the new address space in
 * forkedproc->p_as.  Otherwise returns -1 if seg_alloc() fails, or the
 * non-zero value from SEGOP_DUP() or from the final hat_dup(); in each
 * failure case the partially built address space is released with
 * as_free() and forkedproc->p_as is left untouched.
 */
int
as_dup(struct as *as, struct proc *forkedproc)
{
	struct as *newas;
	struct seg *seg, *newseg;
	size_t	purgesize = 0;	/* total size of skipped S_PURGE segments */
	int error;

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatch(as);
	newas = as_alloc();
	newas->a_userlimit = as->a_userlimit;
	newas->a_proc = forkedproc;

	AS_LOCK_ENTER(newas, RW_WRITER);

	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

		/* S_PURGE segments are not duplicated; only sized up. */
		if (seg->s_flags & S_PURGE) {
			purgesize += seg->s_size;
			continue;
		}

		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
		if (newseg == NULL) {
			AS_LOCK_EXIT(newas);
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			as_free(newas);
			return (-1);
		}
		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
			/*
			 * We call seg_free() on the new seg
			 * because the segment is not set up
			 * completely; i.e. it has no ops.
			 */
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			seg_free(newseg);
			AS_LOCK_EXIT(newas);
			as_free(newas);
			return (error);
		}
		newas->a_size += seg->s_size;
	}
	/* The skipped segments do not count toward the new resvsize. */
	newas->a_resvsize = as->a_resvsize - purgesize;

	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

	AS_LOCK_EXIT(newas);

	as_setwatch(as);
	AS_LOCK_EXIT(as);
	if (error != 0) {
		as_free(newas);
		return (error);
	}
	forkedproc->p_as = newas;
	return (0);
}
836 
/*
 * Handle a ``fault'' at addr for size bytes.
 *
 * hat	  passed through to SEGOP_FAULT()
 * as	  address space in which the fault occurred
 * addr	  start of the faulting range; rounded down to a page boundary
 * size	  length of the range; its end is rounded up to a page boundary
 * type	  kind of fault (F_SOFTLOCK, F_SOFTUNLOCK, F_PROT, F_INVAL)
 * rw	  access type, passed through to SEGOP_FAULT()
 *
 * The rounded range is walked segment by segment and SEGOP_FAULT() is
 * called once per segment piece.  The range may cross several segments
 * as long as each one begins exactly where the previous one ended.
 *
 * Returns 0 on success, FC_NOMAP if no segment covers the start of the
 * range or the segments are not contiguous, or the first non-zero code
 * returned by SEGOP_FAULT().  A code whose errno is EDEADLK is never
 * returned: the whole fault is retried after delay(deadlk_wait).
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
	struct seg *seg;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	size_t ssize;			/* size of the piece in one segment */
	faultcode_t res = 0;
	caddr_t addrsav;		/* start of range, for rollback */
	struct seg *segsav;		/* first segment, for rollback */
	int as_lock_held;		/* non-zero if AS lock was taken */
	klwp_t *lwp = ttolwp(curthread);



retry:
	/*
	 * Indicate that the lwp is not to be stopped while waiting for a
	 * pagefault.  This is to avoid deadlock while debugging a process
	 * via /proc over NFS (in particular).
	 */
	if (lwp != NULL)
		lwp->lwp_nostop++;

	/*
	 * same length must be used when we softlock and softunlock.  We
	 * don't support softunlocking lengths less than the original length
	 * when there is largepage support.  See seg_dev.c for more
	 * comments.
	 */
	/* Per-fault-type statistics; F_SOFTUNLOCK is not counted. */
	switch (type) {

	case F_SOFTLOCK:
		CPU_STATS_ADD_K(vm, softlock, 1);
		break;

	case F_SOFTUNLOCK:
		break;

	case F_PROT:
		CPU_STATS_ADD_K(vm, prot_fault, 1);
		break;

	case F_INVAL:
		CPU_STATS_ENTER_K();
		CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
		if (as == &kas)
			CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
		CPU_STATS_EXIT_K();
		break;
	}

	/* Kernel probe */
	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
	    tnf_opaque,	address,	addr,
	    tnf_fault_type,	fault_type,	type,
	    tnf_seg_access,	access,		rw);

	/* Round the start down and the end up to page boundaries. */
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	/*
	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
	 * correctness, but then we could be stuck holding this lock for
	 * a LONG time if the fault needs to be resolved on a slow
	 * filesystem, and then no-one will be able to exec new commands,
	 * as exec'ing requires the write lock on the as.
	 */
	/*
	 * NOTE(review): the upper bound below adds the caller's unrounded
	 * `size' to the rounded-down raddr rather than using rsize;
	 * confirm that this is intended.
	 */
	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
	    raddr + size < segkmap->s_base + segkmap->s_size) {
		seg = segkmap;
		as_lock_held = 0;
	} else {
		AS_LOCK_ENTER(as, RW_READER);

		seg = as_segat(as, raddr);
		if (seg == NULL) {
			AS_LOCK_EXIT(as);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (FC_NOMAP);
		}

		as_lock_held = 1;
	}

	/* Remember where we started in case a softlock must be undone. */
	addrsav = raddr;
	segsav = seg;

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			/* Move on; the next seg must start exactly here. */
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				res = FC_NOMAP;
				break;
			}
		}
		/* Clip this piece to the end of the current segment. */
		if (raddr + rsize > seg->s_base + seg->s_size)
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;

		res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
		if (res != 0)
			break;
	}

	/*
	 * If we were SOFTLOCKing and encountered a failure,
	 * we must SOFTUNLOCK the range we already did. (Maybe we
	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
	 * right here...)
	 */
	if (res != 0 && type == F_SOFTLOCK) {
		/*
		 * raddr was left at the start of the piece that failed, so
		 * [addrsav, raddr) is the range that was locked.
		 */
		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
			if (addrsav >= seg->s_base + seg->s_size)
				seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL);
			/*
			 * Now call the fault routine again to perform the
			 * unlock using S_OTHER instead of the rw variable
			 * since we never got a chance to touch the pages.
			 */
			if (raddr > seg->s_base + seg->s_size)
				ssize = seg->s_base + seg->s_size - addrsav;
			else
				ssize = raddr - addrsav;
			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
			    F_SOFTUNLOCK, S_OTHER);
		}
	}
	if (as_lock_held)
		AS_LOCK_EXIT(as);
	if (lwp != NULL)
		lwp->lwp_nostop--;

	/*
	 * If the lower levels returned EDEADLK for a fault,
	 * It means that we should retry the fault.  Let's wait
	 * a bit also to let the deadlock causing condition clear.
	 * This is part of a gross hack to work around a design flaw
	 * in the ufs/sds logging code and should go away when the
	 * logging code is re-designed to fix the problem. See bug
	 * 4125102 for details of the problem.
	 */
	if (FC_ERRNO(res) == EDEADLK) {
		delay(deadlk_wait);
		res = 0;
		goto retry;
	}
	return (res);
}
994 
995 
996 
997 /*
998  * Asynchronous ``fault'' at addr for size bytes.
999  */
1000 faultcode_t
1001 as_faulta(struct as *as, caddr_t addr, size_t size)
1002 {
1003 	struct seg *seg;
1004 	caddr_t raddr;			/* rounded down addr */
1005 	size_t rsize;			/* rounded up size */
1006 	faultcode_t res = 0;
1007 	klwp_t *lwp = ttolwp(curthread);
1008 
1009 retry:
1010 	/*
1011 	 * Indicate that the lwp is not to be stopped while waiting
1012 	 * for a pagefault.  This is to avoid deadlock while debugging
1013 	 * a process via /proc over NFS (in particular).
1014 	 */
1015 	if (lwp != NULL)
1016 		lwp->lwp_nostop++;
1017 
1018 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1019 	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1020 	    (size_t)raddr;
1021 
1022 	AS_LOCK_ENTER(as, RW_READER);
1023 	seg = as_segat(as, raddr);
1024 	if (seg == NULL) {
1025 		AS_LOCK_EXIT(as);
1026 		if (lwp != NULL)
1027 			lwp->lwp_nostop--;
1028 		return (FC_NOMAP);
1029 	}
1030 
1031 	for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1032 		if (raddr >= seg->s_base + seg->s_size) {
1033 			seg = AS_SEGNEXT(as, seg);
1034 			if (seg == NULL || raddr != seg->s_base) {
1035 				res = FC_NOMAP;
1036 				break;
1037 			}
1038 		}
1039 		res = SEGOP_FAULTA(seg, raddr);
1040 		if (res != 0)
1041 			break;
1042 	}
1043 	AS_LOCK_EXIT(as);
1044 	if (lwp != NULL)
1045 		lwp->lwp_nostop--;
1046 	/*
1047 	 * If the lower levels returned EDEADLK for a fault,
1048 	 * It means that we should retry the fault.  Let's wait
1049 	 * a bit also to let the deadlock causing condition clear.
1050 	 * This is part of a gross hack to work around a design flaw
1051 	 * in the ufs/sds logging code and should go away when the
1052 	 * logging code is re-designed to fix the problem. See bug
1053 	 * 4125102 for details of the problem.
1054 	 */
1055 	if (FC_ERRNO(res) == EDEADLK) {
1056 		delay(deadlk_wait);
1057 		res = 0;
1058 		goto retry;
1059 	}
1060 	return (res);
1061 }
1062 
/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 *
 * Returns 0 on success.  Returns ENOMEM if the page-rounded range wraps
 * around or is not fully covered by contiguous segments, EAGAIN if the
 * segment driver reports IE_NOMEM, and otherwise whatever non-zero error
 * the segment driver's setprot operation returned.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
	struct seg *seg;
	struct as_callback *cb;
	size_t ssize;
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0, writer = 0;
	caddr_t saveraddr;
	size_t saversize;

	/*
	 * Every restart comes back here with the as lock dropped; the
	 * rounded range is recomputed from the caller's addr/size.
	 */
setprot_top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
	    (size_t)raddr;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	/* Remember the full range for as_setwatchprot() on success. */
	saveraddr = raddr;
	saversize = rsize;

	/*
	 * Normally we only lock the as as a reader. But
	 * if due to setprot the segment driver needs to split
	 * a segment it will return IE_RETRY. Therefore we re-acquire
	 * the as lock as a writer so the segment driver can change
	 * the seg list. Also the segment driver will return IE_RETRY
	 * after it has changed the segment list so we therefore keep
	 * locking as a writer. Since these operations should be rare
	 * we want to only lock as a writer when necessary.
	 * The writer lock is also taken whenever watched pages exist.
	 */
	if (writer || avl_numnodes(&as->a_wpage) != 0) {
		AS_LOCK_ENTER(as, RW_WRITER);
	} else {
		AS_LOCK_ENTER(as, RW_READER);
	}

	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	/* Walk the range one segment-sized piece (ssize) at a time. */
	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		if (raddr >= seg->s_base + seg->s_size) {
			/* Past this segment; the next one must be adjacent. */
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		/* Clip the piece to the end of the current segment. */
		if ((raddr + rsize) > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = rsize;
retry:
		error = SEGOP_SETPROT(seg, raddr, ssize, prot);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			/* Start over holding the lock as writer. */
			AS_LOCK_EXIT(as);
			writer = 1;
			goto setprot_top;
		}

		if (error == EAGAIN) {
			/*
			 * Make sure we have a_lock as writer.
			 */
			if (writer == 0) {
				AS_LOCK_EXIT(as);
				writer = 1;
				goto setprot_top;
			}

			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_SETPROT_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_SETPROT_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Broadcast only if nobody was already
				 * waiting, then sleep until the wait flag
				 * is cleared.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			/* The as lock was dropped above; restart the walk. */
			mutex_exit(&as->a_contents);
			goto setprot_top;
		} else if (error != 0)
			break;
	}
	/*
	 * On failure just re-establish the watchpoints; on success
	 * re-establish them over the whole range with the new protection.
	 */
	if (error != 0) {
		as_setwatch(as);
	} else {
		as_setwatchprot(as, saveraddr, saversize, prot);
	}
	AS_LOCK_EXIT(as);
	return (error);
}
1225 
1226 /*
1227  * Check to make sure that the interval [addr, addr + size)
1228  * in address space `as' has at least the specified protection.
1229  * It is ok for the range to cross over several segments, as long
1230  * as they are contiguous.
1231  */
1232 int
1233 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1234 {
1235 	struct seg *seg;
1236 	size_t ssize;
1237 	caddr_t raddr;			/* rounded down addr */
1238 	size_t rsize;			/* rounded up size */
1239 	int error = 0;
1240 
1241 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1242 	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1243 	    (size_t)raddr;
1244 
1245 	if (raddr + rsize < raddr)		/* check for wraparound */
1246 		return (ENOMEM);
1247 
1248 	/*
1249 	 * This is ugly as sin...
1250 	 * Normally, we only acquire the address space readers lock.
1251 	 * However, if the address space has watchpoints present,
1252 	 * we must acquire the writer lock on the address space for
1253 	 * the benefit of as_clearwatchprot() and as_setwatchprot().
1254 	 */
1255 	if (avl_numnodes(&as->a_wpage) != 0)
1256 		AS_LOCK_ENTER(as, RW_WRITER);
1257 	else
1258 		AS_LOCK_ENTER(as, RW_READER);
1259 	as_clearwatchprot(as, raddr, rsize);
1260 	seg = as_segat(as, raddr);
1261 	if (seg == NULL) {
1262 		as_setwatch(as);
1263 		AS_LOCK_EXIT(as);
1264 		return (ENOMEM);
1265 	}
1266 
1267 	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1268 		if (raddr >= seg->s_base + seg->s_size) {
1269 			seg = AS_SEGNEXT(as, seg);
1270 			if (seg == NULL || raddr != seg->s_base) {
1271 				error = ENOMEM;
1272 				break;
1273 			}
1274 		}
1275 		if ((raddr + rsize) > (seg->s_base + seg->s_size))
1276 			ssize = seg->s_base + seg->s_size - raddr;
1277 		else
1278 			ssize = rsize;
1279 
1280 		error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1281 		if (error != 0)
1282 			break;
1283 	}
1284 	as_setwatch(as);
1285 	AS_LOCK_EXIT(as);
1286 	return (error);
1287 }
1288 
/*
 * Unmap the page-rounded interval [addr, addr + size) from address
 * space `as'.  Gaps inside the interval are skipped.  For each segment
 * piece removed, a_size is reduced, and a_resvsize is reduced as well
 * unless the segment is a /dev/null or partially-reserved mapping.
 *
 * Returns 0 on success and -1 if a segment driver's unmap operation
 * fails with an error other than EAGAIN or IE_RETRY (both of which cause
 * the operation to be retried).
 */
int
as_unmap(struct as *as, caddr_t addr, size_t size)
{
	struct seg *seg, *seg_next;
	struct as_callback *cb;
	caddr_t raddr, eaddr;
	size_t ssize, rsize = 0;
	int err;

	/*
	 * Restart point: reached with the as lock dropped; the rounded
	 * range is recomputed from the caller's addr/size.
	 */
top:
	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
	eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
	    (uintptr_t)PAGEMASK);

	AS_LOCK_ENTER(as, RW_WRITER);

	as->a_updatedir = 1;	/* inform /proc */
	gethrestime(&as->a_updatetime);

	/*
	 * Use as_findseg to find the first segment in the range, then
	 * step through the segments in order using AS_SEGNEXT.
	 */
	as_clearwatchprot(as, raddr, eaddr - raddr);

	for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
		if (eaddr <= seg->s_base)
			break;		/* eaddr was in a gap; all done */

		/* this is implied by the test above */
		ASSERT(raddr < eaddr);

		if (raddr < seg->s_base)
			raddr = seg->s_base; 	/* raddr was in a gap */

		/* ssize is the part of this segment that falls in range. */
		if (eaddr > (seg->s_base + seg->s_size))
			ssize = seg->s_base + seg->s_size - raddr;
		else
			ssize = eaddr - raddr;

		/*
		 * Save next segment pointer since seg can be
		 * destroyed during the segment unmap operation.
		 */
		seg_next = AS_SEGNEXT(as, seg);

		/*
		 * We didn't count /dev/null mappings, so ignore them here.
		 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
		 * we have to do this check here while we have seg.)
		 */
		rsize = 0;
		if (!SEG_IS_DEVNULL_MAPPING(seg) &&
		    !SEG_IS_PARTIAL_RESV(seg))
			rsize = ssize;

retry:
		err = SEGOP_UNMAP(seg, raddr, ssize);
		if (err == EAGAIN) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	Drivers intending to have pages locked for a
			 *	period considerably longer than for normal I/O
			 *	(essentially forever) may have registered for a
			 *	callback so they may unlock these pages on
			 *	request. This is needed to allow this operation
			 *	to succeed. Each entry on the callback list is
			 *	examined. If the event or address range pertains
			 *	the callback is invoked (unless it already is in
			 *	progress). The a_contents lock must be dropped
			 *	before the callback, so only one callback can
			 *	be done at a time. Go to the top and do more
			 *	until zero is returned. If zero is returned,
			 *	either there were no callbacks for this event
			 *	or they were already in progress.
			 */
			mutex_enter(&as->a_contents);
			if (as->a_callbacks &&
			    (cb = as_find_callback(as, AS_UNMAP_EVENT,
			    seg->s_base, seg->s_size))) {
				AS_LOCK_EXIT(as);
				as_execute_callback(as, cb, AS_UNMAP_EVENT);
			} else if (!AS_ISNOUNMAPWAIT(as)) {
				/*
				 * Broadcast only if nobody was already
				 * waiting, then sleep until the wait flag
				 * is cleared.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			/* The as lock was dropped above; restart the walk. */
			mutex_exit(&as->a_contents);
			goto top;
		} else if (err == IE_RETRY) {
			/* Drop the lock and start over from the top. */
			AS_LOCK_EXIT(as);
			goto top;
		} else if (err) {
			/* Hard failure: restore watchpoints and give up. */
			as_setwatch(as);
			AS_LOCK_EXIT(as);
			return (-1);
		}

		/* Piece removed; update the accounting and move on. */
		as->a_size -= ssize;
		if (rsize)
			as->a_resvsize -= rsize;
		raddr += ssize;
	}
	AS_LOCK_EXIT(as);
	return (0);
}
1427 
1428 static int
1429 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1430     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1431 {
1432 	uint_t szc;
1433 	uint_t nszc;
1434 	int error;
1435 	caddr_t a;
1436 	caddr_t eaddr;
1437 	size_t segsize;
1438 	struct seg *seg;
1439 	size_t pgsz;
1440 	int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1441 	uint_t save_szcvec;
1442 
1443 	ASSERT(AS_WRITE_HELD(as));
1444 	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1445 	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1446 	ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1447 	if (!do_off) {
1448 		vn_a->offset = 0;
1449 	}
1450 
1451 	if (szcvec <= 1) {
1452 		seg = seg_alloc(as, addr, size);
1453 		if (seg == NULL) {
1454 			return (ENOMEM);
1455 		}
1456 		vn_a->szc = 0;
1457 		error = (*crfp)(seg, vn_a);
1458 		if (error != 0) {
1459 			seg_free(seg);
1460 		} else {
1461 			as->a_size += size;
1462 			as->a_resvsize += size;
1463 		}
1464 		return (error);
1465 	}
1466 
1467 	eaddr = addr + size;
1468 	save_szcvec = szcvec;
1469 	szcvec >>= 1;
1470 	szc = 0;
1471 	nszc = 0;
1472 	while (szcvec) {
1473 		if ((szcvec & 0x1) == 0) {
1474 			nszc++;
1475 			szcvec >>= 1;
1476 			continue;
1477 		}
1478 		nszc++;
1479 		pgsz = page_get_pagesize(nszc);
1480 		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1481 		if (a != addr) {
1482 			ASSERT(a < eaddr);
1483 			segsize = a - addr;
1484 			seg = seg_alloc(as, addr, segsize);
1485 			if (seg == NULL) {
1486 				return (ENOMEM);
1487 			}
1488 			vn_a->szc = szc;
1489 			error = (*crfp)(seg, vn_a);
1490 			if (error != 0) {
1491 				seg_free(seg);
1492 				return (error);
1493 			}
1494 			as->a_size += segsize;
1495 			as->a_resvsize += segsize;
1496 			*segcreated = 1;
1497 			if (do_off) {
1498 				vn_a->offset += segsize;
1499 			}
1500 			addr = a;
1501 		}
1502 		szc = nszc;
1503 		szcvec >>= 1;
1504 	}
1505 
1506 	ASSERT(addr < eaddr);
1507 	szcvec = save_szcvec | 1; /* add 8K pages */
1508 	while (szcvec) {
1509 		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1510 		ASSERT(a >= addr);
1511 		if (a != addr) {
1512 			segsize = a - addr;
1513 			seg = seg_alloc(as, addr, segsize);
1514 			if (seg == NULL) {
1515 				return (ENOMEM);
1516 			}
1517 			vn_a->szc = szc;
1518 			error = (*crfp)(seg, vn_a);
1519 			if (error != 0) {
1520 				seg_free(seg);
1521 				return (error);
1522 			}
1523 			as->a_size += segsize;
1524 			as->a_resvsize += segsize;
1525 			*segcreated = 1;
1526 			if (do_off) {
1527 				vn_a->offset += segsize;
1528 			}
1529 			addr = a;
1530 		}
1531 		szcvec &= ~(1 << szc);
1532 		if (szcvec) {
1533 			szc = highbit(szcvec) - 1;
1534 			pgsz = page_get_pagesize(szc);
1535 		}
1536 	}
1537 	ASSERT(addr == eaddr);
1538 
1539 	return (0);
1540 }
1541 
1542 static int
1543 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1544     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1545 {
1546 	uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1547 	int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1548 	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1549 	    type, 0);
1550 	int error;
1551 	struct seg *seg;
1552 	struct vattr va;
1553 	u_offset_t eoff;
1554 	size_t save_size = 0;
1555 	extern size_t textrepl_size_thresh;
1556 
1557 	ASSERT(AS_WRITE_HELD(as));
1558 	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1559 	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1560 	ASSERT(vn_a->vp != NULL);
1561 	ASSERT(vn_a->amp == NULL);
1562 
1563 again:
1564 	if (szcvec <= 1) {
1565 		seg = seg_alloc(as, addr, size);
1566 		if (seg == NULL) {
1567 			return (ENOMEM);
1568 		}
1569 		vn_a->szc = 0;
1570 		error = (*crfp)(seg, vn_a);
1571 		if (error != 0) {
1572 			seg_free(seg);
1573 		} else {
1574 			as->a_size += size;
1575 			as->a_resvsize += size;
1576 		}
1577 		return (error);
1578 	}
1579 
1580 	va.va_mask = AT_SIZE;
1581 	if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1582 		szcvec = 0;
1583 		goto again;
1584 	}
1585 	eoff = vn_a->offset & PAGEMASK;
1586 	if (eoff >= va.va_size) {
1587 		szcvec = 0;
1588 		goto again;
1589 	}
1590 	eoff += size;
1591 	if (btopr(va.va_size) < btopr(eoff)) {
1592 		save_size = size;
1593 		size = va.va_size - (vn_a->offset & PAGEMASK);
1594 		size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1595 		szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1596 		    type, 0);
1597 		if (szcvec <= 1) {
1598 			size = save_size;
1599 			goto again;
1600 		}
1601 	}
1602 
1603 	if (size > textrepl_size_thresh) {
1604 		vn_a->flags |= _MAP_TEXTREPL;
1605 	}
1606 	error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1607 	    segcreated);
1608 	if (error != 0) {
1609 		return (error);
1610 	}
1611 	if (save_size) {
1612 		addr += size;
1613 		size = save_size - size;
1614 		szcvec = 0;
1615 		goto again;
1616 	}
1617 	return (0);
1618 }
1619 
1620 /*
1621  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1622  * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1623  */
1624 static int
1625 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1626     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1627 {
1628 	uint_t szcvec;
1629 	uchar_t type;
1630 
1631 	ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1632 	if (vn_a->type == MAP_SHARED) {
1633 		type = MAPPGSZC_SHM;
1634 	} else if (vn_a->type == MAP_PRIVATE) {
1635 		if (vn_a->szc == AS_MAP_HEAP) {
1636 			type = MAPPGSZC_HEAP;
1637 		} else if (vn_a->szc == AS_MAP_STACK) {
1638 			type = MAPPGSZC_STACK;
1639 		} else {
1640 			type = MAPPGSZC_PRIVM;
1641 		}
1642 	}
1643 	szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1644 	    (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1645 	    (vn_a->flags & MAP_TEXT), type, 0);
1646 	ASSERT(AS_WRITE_HELD(as));
1647 	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 	ASSERT(vn_a->vp == NULL);
1650 
1651 	return (as_map_segvn_segs(as, addr, size, szcvec,
1652 	    crfp, vn_a, segcreated));
1653 }
1654 
1655 int
1656 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1657 {
1658 	AS_LOCK_ENTER(as, RW_WRITER);
1659 	return (as_map_locked(as, addr, size, crfp, argsp));
1660 }
1661 
1662 int
1663 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1664     void *argsp)
1665 {
1666 	struct seg *seg = NULL;
1667 	caddr_t raddr;			/* rounded down addr */
1668 	size_t rsize;			/* rounded up size */
1669 	int error;
1670 	int unmap = 0;
1671 	/*
1672 	 * The use of a_proc is preferred to handle the case where curproc is
1673 	 * a door_call server and is allocating memory in the client's (a_proc)
1674 	 * address space.
1675 	 * When creating a shared memory segment a_proc will be NULL so we
1676 	 * fallback to curproc in that case.
1677 	 */
1678 	struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1679 	struct segvn_crargs crargs;
1680 
1681 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1682 	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1683 	    (size_t)raddr;
1684 
1685 	/*
1686 	 * check for wrap around
1687 	 */
1688 	if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1689 		AS_LOCK_EXIT(as);
1690 		return (ENOMEM);
1691 	}
1692 
1693 	as->a_updatedir = 1;	/* inform /proc */
1694 	gethrestime(&as->a_updatetime);
1695 
1696 	if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1697 		AS_LOCK_EXIT(as);
1698 
1699 		(void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1700 		    RCA_UNSAFE_ALL);
1701 
1702 		return (ENOMEM);
1703 	}
1704 
1705 	if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1706 		crargs = *(struct segvn_crargs *)argsp;
1707 		error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1708 		if (error != 0) {
1709 			AS_LOCK_EXIT(as);
1710 			if (unmap) {
1711 				(void) as_unmap(as, addr, size);
1712 			}
1713 			return (error);
1714 		}
1715 	} else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1716 		crargs = *(struct segvn_crargs *)argsp;
1717 		error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1718 		if (error != 0) {
1719 			AS_LOCK_EXIT(as);
1720 			if (unmap) {
1721 				(void) as_unmap(as, addr, size);
1722 			}
1723 			return (error);
1724 		}
1725 	} else {
1726 		seg = seg_alloc(as, addr, size);
1727 		if (seg == NULL) {
1728 			AS_LOCK_EXIT(as);
1729 			return (ENOMEM);
1730 		}
1731 
1732 		error = (*crfp)(seg, argsp);
1733 		if (error != 0) {
1734 			seg_free(seg);
1735 			AS_LOCK_EXIT(as);
1736 			return (error);
1737 		}
1738 		/*
1739 		 * Add size now so as_unmap will work if as_ctl fails.
1740 		 */
1741 		as->a_size += rsize;
1742 		as->a_resvsize += rsize;
1743 	}
1744 
1745 	as_setwatch(as);
1746 
1747 	/*
1748 	 * If the address space is locked,
1749 	 * establish memory locks for the new segment.
1750 	 */
1751 	mutex_enter(&as->a_contents);
1752 	if (AS_ISPGLCK(as)) {
1753 		mutex_exit(&as->a_contents);
1754 		AS_LOCK_EXIT(as);
1755 		error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1756 		if (error != 0)
1757 			(void) as_unmap(as, addr, size);
1758 	} else {
1759 		mutex_exit(&as->a_contents);
1760 		AS_LOCK_EXIT(as);
1761 	}
1762 	return (error);
1763 }
1764 
1765 
1766 /*
1767  * Delete all segments in the address space marked with S_PURGE.
1768  * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1769  * These segments are deleted as a first step before calls to as_gap(), so
1770  * that they don't affect mmap() or shmat().
1771  */
1772 void
1773 as_purge(struct as *as)
1774 {
1775 	struct seg *seg;
1776 	struct seg *next_seg;
1777 
1778 	/*
1779 	 * the setting of NEEDSPURGE is protect by as_rangelock(), so
1780 	 * no need to grab a_contents mutex for this check
1781 	 */
1782 	if ((as->a_flags & AS_NEEDSPURGE) == 0)
1783 		return;
1784 
1785 	AS_LOCK_ENTER(as, RW_WRITER);
1786 	next_seg = NULL;
1787 	seg = AS_SEGFIRST(as);
1788 	while (seg != NULL) {
1789 		next_seg = AS_SEGNEXT(as, seg);
1790 		if (seg->s_flags & S_PURGE)
1791 			SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1792 		seg = next_seg;
1793 	}
1794 	AS_LOCK_EXIT(as);
1795 
1796 	mutex_enter(&as->a_contents);
1797 	as->a_flags &= ~AS_NEEDSPURGE;
1798 	mutex_exit(&as->a_contents);
1799 }
1800 
/*
 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
 * range of addresses at least "minlen" long, where the base of the range is
 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
 * if align was 4M and off was 16k, the user wants a hole which will start
 * 16k into a 4M page.
 *
 * If flags specifies AH_HI, the hole will have the highest possible address
 * in the range.  We use the as->a_lastgap field to figure out where to
 * start looking for a gap.
 *
 * Otherwise, the gap will have the lowest possible address.
 *
 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
 * the hole that is within range, and 0 is returned. On failure, -1 is returned
 * and *basep and *lenp are restored to the values passed in.
 *
 * NOTE: This routine is not correct when base+len overflows caddr_t.
 */
int
as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
    uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
{
	caddr_t lobound = *basep;
	caddr_t hibound = lobound + *lenp;
	struct seg *lseg, *hseg;
	caddr_t lo, hi;
	int forward;
	caddr_t save_base;
	size_t save_len;
	size_t save_minlen;
	size_t save_redzone;
	int fast_path = 1;

	/* Keep the caller's values for the slow pass and for failure. */
	save_base = *basep;
	save_len = *lenp;
	save_minlen = minlen;
	save_redzone = redzone;

	/*
	 * For the first pass/fast_path, just add align and redzone into
	 * minlen since if we get an allocation, we can guarantee that it
	 * will fit the alignment and redzone requested.
	 * This increases the chance that hibound will be adjusted to
	 * a_lastgap->s_base which will likely allow us to find an
	 * acceptable hole in the address space quicker.
	 * If we can't find a hole with this fast_path, then we look for
	 * smaller holes in which the alignment and offset may allow
	 * the allocation to fit.
	 */
	minlen += align;
	minlen += 2 * redzone;
	redzone = 0;

	AS_LOCK_ENTER(as, RW_READER);
	/*
	 * An address space with no segments is one big hole; only the
	 * requested range itself needs validating.
	 * NOTE(review): this path is only ever tried with the enlarged
	 * fast-path minlen; it returns before the slow-path retry below.
	 */
	if (AS_SEGFIRST(as) == NULL) {
		if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
		    align, redzone, off)) {
			AS_LOCK_EXIT(as);
			return (0);
		} else {
			AS_LOCK_EXIT(as);
			*basep = save_base;
			*lenp = save_len;
			return (-1);
		}
	}

retry:
	/*
	 * Set up to iterate over all the inter-segment holes in the given
	 * direction.  lseg is NULL for the lowest-addressed hole and hseg is
	 * NULL for the highest-addressed hole.  If moving backwards, we start
	 * from the segment that as_findseg() returns for hibound.
	 */
	forward = (flags & AH_DIR) == AH_LO;
	if (forward) {
		hseg = as_findseg(as, lobound, 1);
		lseg = AS_SEGPREV(as, hseg);
	} else {

		/*
		 * If allocating at least as much as the last allocation,
		 * use a_lastgap's base as a better estimate of hibound.
		 * NOTE(review): hibound is not restored before the
		 * slow-path retry, so a lowered value carries over.
		 */
		if (as->a_lastgap &&
		    minlen >= as->a_lastgap->s_size &&
		    hibound >= as->a_lastgap->s_base)
			hibound = as->a_lastgap->s_base;

		hseg = as_findseg(as, hibound, 1);
		if (hseg->s_base + hseg->s_size < hibound) {
			/* hibound lies above the segment that was found. */
			lseg = hseg;
			hseg = NULL;
		} else {
			lseg = AS_SEGPREV(as, hseg);
		}
	}

	for (;;) {
		/*
		 * Set lo and hi to the hole's boundaries.  (We should really
		 * use MAXADDR in place of hibound in the expression below,
		 * but can't express it easily; using hibound in its place is
		 * harmless.)
		 */
		lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
		hi = (hseg == NULL) ? hibound : hseg->s_base;
		/*
		 * If the iteration has moved past the interval from lobound
		 * to hibound it's pointless to continue.
		 */
		if ((forward && lo > hibound) || (!forward && hi < lobound))
			break;
		else if (lo > hibound || hi < lobound)
			goto cont;
		/*
		 * Candidate hole lies at least partially within the allowable
		 * range.  Restrict it to fall completely within that range,
		 * i.e., to [max(lo, lobound), min(hi, hibound)].
		 */
		if (lo < lobound)
			lo = lobound;
		if (hi > hibound)
			hi = hibound;
		/*
		 * Verify that the candidate hole is big enough and meets
		 * hardware constraints.  If the hole is too small, no need
		 * to do the further checks since they will fail.
		 */
		*basep = lo;
		*lenp = hi - lo;
		if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
		    minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
		    ((flags & AH_CONTAIN) == 0 ||
		    (*basep <= addr && *basep + *lenp > addr))) {
			/*
			 * Record the hint segments: a_lastgap only for
			 * backward searches, a_lastgaphl always.
			 */
			if (!forward)
				as->a_lastgap = hseg;
			if (hseg != NULL)
				as->a_lastgaphl = hseg;
			else
				as->a_lastgaphl = lseg;
			AS_LOCK_EXIT(as);
			return (0);
		}
	cont:
		/*
		 * Move to the next hole.
		 */
		if (forward) {
			lseg = hseg;
			if (lseg == NULL)
				break;
			hseg = AS_SEGNEXT(as, hseg);
		} else {
			hseg = lseg;
			if (hseg == NULL)
				break;
			lseg = AS_SEGPREV(as, lseg);
		}
	}
	/*
	 * The fast path found nothing.  If alignment or a redzone was
	 * requested, try once more with the caller's real minlen and redzone.
	 */
	if (fast_path && (align != 0 || save_redzone != 0)) {
		fast_path = 0;
		minlen = save_minlen;
		redzone = save_redzone;
		goto retry;
	}
	*basep = save_base;
	*lenp = save_len;
	AS_LOCK_EXIT(as);
	return (-1);
}
1975 
1976 /*
1977  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1978  *
1979  * If flags specifies AH_HI, the hole will have the highest possible address
1980  * in the range.  We use the as->a_lastgap field to figure out where to
1981  * start looking for a gap.
1982  *
1983  * Otherwise, the gap will have the lowest possible address.
1984  *
1985  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1986  *
1987  * If an adequate hole is found, base and len are set to reflect the part of
1988  * the hole that is within range, and 0 is returned, otherwise,
1989  * -1 is returned.
1990  *
1991  * NOTE: This routine is not correct when base+len overflows caddr_t.
1992  */
1993 int
1994 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1995     caddr_t addr)
1996 {
1997 
1998 	return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1999 }
2000 
2001 /*
2002  * Return the next range within [base, base + len) that is backed
2003  * with "real memory".  Skip holes and non-seg_vn segments.
2004  * We're lazy and only return one segment at a time.
2005  */
2006 int
2007 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2008 {
2009 	extern struct seg_ops segspt_shmops;	/* needs a header file */
2010 	struct seg *seg;
2011 	caddr_t addr, eaddr;
2012 	caddr_t segend;
2013 
2014 	AS_LOCK_ENTER(as, RW_READER);
2015 
2016 	addr = *basep;
2017 	eaddr = addr + *lenp;
2018 
2019 	seg = as_findseg(as, addr, 0);
2020 	if (seg != NULL)
2021 		addr = MAX(seg->s_base, addr);
2022 
2023 	for (;;) {
2024 		if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2025 			AS_LOCK_EXIT(as);
2026 			return (EINVAL);
2027 		}
2028 
2029 		if (seg->s_ops == &segvn_ops) {
2030 			segend = seg->s_base + seg->s_size;
2031 			break;
2032 		}
2033 
2034 		/*
2035 		 * We do ISM by looking into the private data
2036 		 * to determine the real size of the segment.
2037 		 */
2038 		if (seg->s_ops == &segspt_shmops) {
2039 			segend = seg->s_base + spt_realsize(seg);
2040 			if (addr < segend)
2041 				break;
2042 		}
2043 
2044 		seg = AS_SEGNEXT(as, seg);
2045 
2046 		if (seg != NULL)
2047 			addr = seg->s_base;
2048 	}
2049 
2050 	*basep = addr;
2051 
2052 	if (segend > eaddr)
2053 		*lenp = eaddr - addr;
2054 	else
2055 		*lenp = segend - addr;
2056 
2057 	AS_LOCK_EXIT(as);
2058 	return (0);
2059 }
2060 
2061 /*
2062  * Swap the pages associated with the address space as out to
2063  * secondary storage, returning the number of bytes actually
2064  * swapped.
2065  *
2066  * The value returned is intended to correlate well with the process's
2067  * memory requirements.  Its usefulness for this purpose depends on
2068  * how well the segment-level routines do at returning accurate
2069  * information.
2070  */
2071 size_t
2072 as_swapout(struct as *as)
2073 {
2074 	struct seg *seg;
2075 	size_t swpcnt = 0;
2076 
2077 	/*
2078 	 * Kernel-only processes have given up their address
2079 	 * spaces.  Of course, we shouldn't be attempting to
2080 	 * swap out such processes in the first place...
2081 	 */
2082 	if (as == NULL)
2083 		return (0);
2084 
2085 	AS_LOCK_ENTER(as, RW_READER);
2086 
2087 	/*
2088 	 * Free all mapping resources associated with the address
2089 	 * space.  The segment-level swapout routines capitalize
2090 	 * on this unmapping by scavanging pages that have become
2091 	 * unmapped here.
2092 	 */
2093 	hat_swapout(as->a_hat);
2094 
2095 	/*
2096 	 * Call the swapout routines of all segments in the address
2097 	 * space to do the actual work, accumulating the amount of
2098 	 * space reclaimed.
2099 	 */
2100 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2101 		struct seg_ops *ov = seg->s_ops;
2102 
2103 		/*
2104 		 * We have to check to see if the seg has
2105 		 * an ops vector because the seg may have
2106 		 * been in the middle of being set up when
2107 		 * the process was picked for swapout.
2108 		 */
2109 		if ((ov != NULL) && (ov->swapout != NULL))
2110 			swpcnt += SEGOP_SWAPOUT(seg);
2111 	}
2112 	AS_LOCK_EXIT(as);
2113 	return (swpcnt);
2114 }
2115 
2116 /*
2117  * Determine whether data from the mappings in interval [addr, addr + size)
2118  * are in the primary memory (core) cache.
2119  */
2120 int
2121 as_incore(struct as *as, caddr_t addr,
2122     size_t size, char *vec, size_t *sizep)
2123 {
2124 	struct seg *seg;
2125 	size_t ssize;
2126 	caddr_t raddr;		/* rounded down addr */
2127 	size_t rsize;		/* rounded up size */
2128 	size_t isize;			/* iteration size */
2129 	int error = 0;		/* result, assume success */
2130 
2131 	*sizep = 0;
2132 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2133 	rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2134 	    (size_t)raddr;
2135 
2136 	if (raddr + rsize < raddr)		/* check for wraparound */
2137 		return (ENOMEM);
2138 
2139 	AS_LOCK_ENTER(as, RW_READER);
2140 	seg = as_segat(as, raddr);
2141 	if (seg == NULL) {
2142 		AS_LOCK_EXIT(as);
2143 		return (-1);
2144 	}
2145 
2146 	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2147 		if (raddr >= seg->s_base + seg->s_size) {
2148 			seg = AS_SEGNEXT(as, seg);
2149 			if (seg == NULL || raddr != seg->s_base) {
2150 				error = -1;
2151 				break;
2152 			}
2153 		}
2154 		if ((raddr + rsize) > (seg->s_base + seg->s_size))
2155 			ssize = seg->s_base + seg->s_size - raddr;
2156 		else
2157 			ssize = rsize;
2158 		*sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2159 		if (isize != ssize) {
2160 			error = -1;
2161 			break;
2162 		}
2163 		vec += btopr(ssize);
2164 	}
2165 	AS_LOCK_EXIT(as);
2166 	return (error);
2167 }
2168 
2169 static void
2170 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2171     ulong_t *bitmap, size_t position, size_t npages)
2172 {
2173 	caddr_t	range_start;
2174 	size_t	pos1 = position;
2175 	size_t	pos2;
2176 	size_t	size;
2177 	size_t  end_pos = npages + position;
2178 
2179 	while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2180 		size = ptob((pos2 - pos1));
2181 		range_start = (caddr_t)((uintptr_t)addr +
2182 		    ptob(pos1 - position));
2183 
2184 		(void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2185 		    (ulong_t *)NULL, (size_t)NULL);
2186 		pos1 = pos2;
2187 	}
2188 }
2189 
2190 static void
2191 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2192     caddr_t raddr, size_t rsize)
2193 {
2194 	struct seg *seg = as_segat(as, raddr);
2195 	size_t ssize;
2196 
2197 	while (rsize != 0) {
2198 		if (raddr >= seg->s_base + seg->s_size)
2199 			seg = AS_SEGNEXT(as, seg);
2200 
2201 		if ((raddr + rsize) > (seg->s_base + seg->s_size))
2202 			ssize = seg->s_base + seg->s_size - raddr;
2203 		else
2204 			ssize = rsize;
2205 
2206 		as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2207 
2208 		rsize -= ssize;
2209 		raddr += ssize;
2210 	}
2211 }
2212 
2213 /*
2214  * Cache control operations over the interval [addr, addr + size) in
2215  * address space "as".
2216  */
2217 /*ARGSUSED*/
2218 int
2219 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2220     uintptr_t arg, ulong_t *lock_map, size_t pos)
2221 {
2222 	struct seg *seg;	/* working segment */
2223 	caddr_t raddr;		/* rounded down addr */
2224 	caddr_t initraddr;	/* saved initial rounded down addr */
2225 	size_t rsize;		/* rounded up size */
2226 	size_t initrsize;	/* saved initial rounded up size */
2227 	size_t ssize;		/* size of seg */
2228 	int error = 0;			/* result */
2229 	size_t mlock_size;	/* size of bitmap */
2230 	ulong_t *mlock_map;	/* pointer to bitmap used */
2231 				/* to represent the locked */
2232 				/* pages. */
2233 retry:
2234 	if (error == IE_RETRY)
2235 		AS_LOCK_ENTER(as, RW_WRITER);
2236 	else
2237 		AS_LOCK_ENTER(as, RW_READER);
2238 
2239 	/*
2240 	 * If these are address space lock/unlock operations, loop over
2241 	 * all segments in the address space, as appropriate.
2242 	 */
2243 	if (func == MC_LOCKAS) {
2244 		size_t npages, idx;
2245 		size_t rlen = 0;	/* rounded as length */
2246 
2247 		idx = pos;
2248 
2249 		if (arg & MCL_FUTURE) {
2250 			mutex_enter(&as->a_contents);
2251 			AS_SETPGLCK(as);
2252 			mutex_exit(&as->a_contents);
2253 		}
2254 		if ((arg & MCL_CURRENT) == 0) {
2255 			AS_LOCK_EXIT(as);
2256 			return (0);
2257 		}
2258 
2259 		seg = AS_SEGFIRST(as);
2260 		if (seg == NULL) {
2261 			AS_LOCK_EXIT(as);
2262 			return (0);
2263 		}
2264 
2265 		do {
2266 			raddr = (caddr_t)((uintptr_t)seg->s_base &
2267 			    (uintptr_t)PAGEMASK);
2268 			rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2269 			    PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2270 		} while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2271 
2272 		mlock_size = BT_BITOUL(btopr(rlen));
2273 		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2274 		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2275 				AS_LOCK_EXIT(as);
2276 				return (EAGAIN);
2277 		}
2278 
2279 		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2280 			error = SEGOP_LOCKOP(seg, seg->s_base,
2281 			    seg->s_size, attr, MC_LOCK, mlock_map, pos);
2282 			if (error != 0)
2283 				break;
2284 			pos += seg_pages(seg);
2285 		}
2286 
2287 		if (error) {
2288 			for (seg = AS_SEGFIRST(as); seg != NULL;
2289 			    seg = AS_SEGNEXT(as, seg)) {
2290 
2291 				raddr = (caddr_t)((uintptr_t)seg->s_base &
2292 				    (uintptr_t)PAGEMASK);
2293 				npages = seg_pages(seg);
2294 				as_segunlock(seg, raddr, attr, mlock_map,
2295 				    idx, npages);
2296 				idx += npages;
2297 			}
2298 		}
2299 
2300 		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2301 		AS_LOCK_EXIT(as);
2302 		goto lockerr;
2303 	} else if (func == MC_UNLOCKAS) {
2304 		mutex_enter(&as->a_contents);
2305 		AS_CLRPGLCK(as);
2306 		mutex_exit(&as->a_contents);
2307 
2308 		for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2309 			error = SEGOP_LOCKOP(seg, seg->s_base,
2310 			    seg->s_size, attr, MC_UNLOCK, NULL, 0);
2311 			if (error != 0)
2312 				break;
2313 		}
2314 
2315 		AS_LOCK_EXIT(as);
2316 		goto lockerr;
2317 	}
2318 
2319 	/*
2320 	 * Normalize addresses and sizes.
2321 	 */
2322 	initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2323 	initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2324 	    (size_t)raddr;
2325 
2326 	if (raddr + rsize < raddr) {		/* check for wraparound */
2327 		AS_LOCK_EXIT(as);
2328 		return (ENOMEM);
2329 	}
2330 
2331 	/*
2332 	 * Get initial segment.
2333 	 */
2334 	if ((seg = as_segat(as, raddr)) == NULL) {
2335 		AS_LOCK_EXIT(as);
2336 		return (ENOMEM);
2337 	}
2338 
2339 	if (func == MC_LOCK) {
2340 		mlock_size = BT_BITOUL(btopr(rsize));
2341 		if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2342 		    sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2343 				AS_LOCK_EXIT(as);
2344 				return (EAGAIN);
2345 		}
2346 	}
2347 
2348 	/*
2349 	 * Loop over all segments.  If a hole in the address range is
2350 	 * discovered, then fail.  For each segment, perform the appropriate
2351 	 * control operation.
2352 	 */
2353 	while (rsize != 0) {
2354 
2355 		/*
2356 		 * Make sure there's no hole, calculate the portion
2357 		 * of the next segment to be operated over.
2358 		 */
2359 		if (raddr >= seg->s_base + seg->s_size) {
2360 			seg = AS_SEGNEXT(as, seg);
2361 			if (seg == NULL || raddr != seg->s_base) {
2362 				if (func == MC_LOCK) {
2363 					as_unlockerr(as, attr, mlock_map,
2364 					    initraddr, initrsize - rsize);
2365 					kmem_free(mlock_map,
2366 					    mlock_size * sizeof (ulong_t));
2367 				}
2368 				AS_LOCK_EXIT(as);
2369 				return (ENOMEM);
2370 			}
2371 		}
2372 		if ((raddr + rsize) > (seg->s_base + seg->s_size))
2373 			ssize = seg->s_base + seg->s_size - raddr;
2374 		else
2375 			ssize = rsize;
2376 
2377 		/*
2378 		 * Dispatch on specific function.
2379 		 */
2380 		switch (func) {
2381 
2382 		/*
2383 		 * Synchronize cached data from mappings with backing
2384 		 * objects.
2385 		 */
2386 		case MC_SYNC:
2387 			if (error = SEGOP_SYNC(seg, raddr, ssize,
2388 			    attr, (uint_t)arg)) {
2389 				AS_LOCK_EXIT(as);
2390 				return (error);
2391 			}
2392 			break;
2393 
2394 		/*
2395 		 * Lock pages in memory.
2396 		 */
2397 		case MC_LOCK:
2398 			if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2399 			    attr, func, mlock_map, pos)) {
2400 				as_unlockerr(as, attr, mlock_map, initraddr,
2401 				    initrsize - rsize + ssize);
2402 				kmem_free(mlock_map, mlock_size *
2403 				    sizeof (ulong_t));
2404 				AS_LOCK_EXIT(as);
2405 				goto lockerr;
2406 			}
2407 			break;
2408 
2409 		/*
2410 		 * Unlock mapped pages.
2411 		 */
2412 		case MC_UNLOCK:
2413 			(void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2414 			    (ulong_t *)NULL, (size_t)NULL);
2415 			break;
2416 
2417 		/*
2418 		 * Store VM advise for mapped pages in segment layer.
2419 		 */
2420 		case MC_ADVISE:
2421 			error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2422 
2423 			/*
2424 			 * Check for regular errors and special retry error
2425 			 */
2426 			if (error) {
2427 				if (error == IE_RETRY) {
2428 					/*
2429 					 * Need to acquire writers lock, so
2430 					 * have to drop readers lock and start
2431 					 * all over again
2432 					 */
2433 					AS_LOCK_EXIT(as);
2434 					goto retry;
2435 				} else if (error == IE_REATTACH) {
2436 					/*
2437 					 * Find segment for current address
2438 					 * because current segment just got
2439 					 * split or concatenated
2440 					 */
2441 					seg = as_segat(as, raddr);
2442 					if (seg == NULL) {
2443 						AS_LOCK_EXIT(as);
2444 						return (ENOMEM);
2445 					}
2446 				} else {
2447 					/*
2448 					 * Regular error
2449 					 */
2450 					AS_LOCK_EXIT(as);
2451 					return (error);
2452 				}
2453 			}
2454 			break;
2455 
2456 		case MC_INHERIT_ZERO:
2457 			if (seg->s_ops->inherit == NULL) {
2458 				error = ENOTSUP;
2459 			} else {
2460 				error = SEGOP_INHERIT(seg, raddr, ssize,
2461 				    SEGP_INH_ZERO);
2462 			}
2463 			if (error != 0) {
2464 				AS_LOCK_EXIT(as);
2465 				return (error);
2466 			}
2467 			break;
2468 
2469 		/*
2470 		 * Can't happen.
2471 		 */
2472 		default:
2473 			panic("as_ctl: bad operation %d", func);
2474 			/*NOTREACHED*/
2475 		}
2476 
2477 		rsize -= ssize;
2478 		raddr += ssize;
2479 	}
2480 
2481 	if (func == MC_LOCK)
2482 		kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2483 	AS_LOCK_EXIT(as);
2484 	return (0);
2485 lockerr:
2486 
2487 	/*
2488 	 * If the lower levels returned EDEADLK for a segment lockop,
2489 	 * it means that we should retry the operation.  Let's wait
2490 	 * a bit also to let the deadlock causing condition clear.
2491 	 * This is part of a gross hack to work around a design flaw
2492 	 * in the ufs/sds logging code and should go away when the
2493 	 * logging code is re-designed to fix the problem. See bug
2494 	 * 4125102 for details of the problem.
2495 	 */
2496 	if (error == EDEADLK) {
2497 		delay(deadlk_wait);
2498 		error = 0;
2499 		goto retry;
2500 	}
2501 	return (error);
2502 }
2503 
2504 int
2505 fc_decode(faultcode_t fault_err)
2506 {
2507 	int error = 0;
2508 
2509 	switch (FC_CODE(fault_err)) {
2510 	case FC_OBJERR:
2511 		error = FC_ERRNO(fault_err);
2512 		break;
2513 	case FC_PROT:
2514 		error = EACCES;
2515 		break;
2516 	default:
2517 		error = EFAULT;
2518 		break;
2519 	}
2520 	return (error);
2521 }
2522 
/*
 * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
 * lists from each segment and copy them to one contiguous shadow list (plist)
 * as expected by the caller.  Save pointers to per segment shadow lists at
 * the tail of plist so that they can be used during as_pageunlock().
 *
 * Called with the address space lock held; the lock is dropped on every
 * return path.  "seg" is the segment containing "addr", and the
 * page-aligned range [addr, addr + size) must extend past its end.
 *
 * On success returns 0 with *ppp set either to the combined shadow list
 * or, when the F_SOFTLOCK fallback was used, to NULL.  Returns EFAULT if
 * the range contains a hole, the pagelock error if it is neither ENOTSUP
 * nor EFAULT, or the decoded as_fault() error from the fallback.
 */
static int
as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
    caddr_t addr, size_t size, enum seg_rw rw)
{
	/* Saved copies of the arguments; each pass restarts from these. */
	caddr_t sv_addr = addr;
	size_t sv_size = size;
	struct seg *sv_seg = seg;
	ulong_t segcnt = 1;	/* number of segments spanned by the range */
	ulong_t cnt;		/* index of the segment being processed */
	size_t ssize;		/* bytes of the range in the current segment */
	pgcnt_t npages = btop(size);
	page_t **plist;		/* npages page slots + segcnt list pointers */
	page_t **pl;
	int error;
	caddr_t eaddr;
	faultcode_t fault_err = 0;
	pgcnt_t pl_off;		/* next free page slot in plist */
	extern struct seg_ops segspt_shmops;

	ASSERT(AS_LOCK_HELD(as));
	ASSERT(seg != NULL);
	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
	ASSERT(addr + size > seg->s_base + seg->s_size);
	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));

	/*
	 * Count the number of segments covered by the range we are about to
	 * lock. The segment count is used to size the shadow list we return
	 * back to the caller.
	 */
	for (; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {

			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || addr != seg->s_base) {
				/* Hole in the range. */
				AS_LOCK_EXIT(as);
				return (EFAULT);
			}
			/*
			 * Do a quick check if subsequent segments
			 * will most likely support pagelock.
			 */
			if (seg->s_ops == &segvn_ops) {
				vnode_t *vp;

				/*
				 * segvn segments that have a vnode (or
				 * whose getvp op fails) take the slow path.
				 */
				if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
				    vp != NULL) {
					AS_LOCK_EXIT(as);
					goto slow;
				}
			} else if (seg->s_ops != &segspt_shmops) {
				/* Neither segvn nor ISM: slow path. */
				AS_LOCK_EXIT(as);
				goto slow;
			}
			segcnt++;
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
	}
	ASSERT(segcnt > 1);

	/*
	 * Layout: plist[0 .. npages-1] holds the combined page pointers,
	 * plist[npages + i] holds the shadow list returned by segment i.
	 */
	plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);

	addr = sv_addr;
	size = sv_size;
	seg = sv_seg;

	/* Second pass: pagelock each segment's piece of the range. */
	for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (addr + size > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = size;
		}
		/* The segment stores its shadow list pointer in its tail slot. */
		pl = &plist[npages + cnt];
		error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
		    L_PAGELOCK, rw);
		if (error) {
			/* Leaves size != 0 and addr at the failing piece. */
			break;
		}
		ASSERT(plist[npages + cnt] != NULL);
		ASSERT(pl_off + btop(ssize) <= npages);
		/* Append this segment's pages to the contiguous front part. */
		bcopy(plist[npages + cnt], &plist[pl_off],
		    btop(ssize) * sizeof (page_t *));
		pl_off += btop(ssize);
	}

	if (size == 0) {
		/* Every piece was locked; hand the list to the caller. */
		AS_LOCK_EXIT(as);
		ASSERT(cnt == segcnt - 1);
		*ppp = plist;
		return (0);
	}

	/*
	 * one of pagelock calls failed. The error type is in error variable.
	 * Unlock what we've locked so far and retry with F_SOFTLOCK if error
	 * type is either EFAULT or ENOTSUP. Otherwise just return the error
	 * back to the caller.
	 */

	/* addr is where the failing piece began; unlock everything before it. */
	eaddr = addr;
	seg = sv_seg;

	for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
		if (addr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			ASSERT(seg != NULL && addr == seg->s_base);
			cnt++;
			ASSERT(cnt < segcnt);
		}
		if (eaddr > seg->s_base + seg->s_size) {
			ssize = seg->s_base + seg->s_size - addr;
		} else {
			ssize = eaddr - addr;
		}
		pl = &plist[npages + cnt];
		ASSERT(*pl != NULL);
		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
		    L_PAGEUNLOCK, rw);
	}

	AS_LOCK_EXIT(as);

	kmem_free(plist, (npages + segcnt) * sizeof (page_t *));

	if (error != ENOTSUP && error != EFAULT) {
		return (error);
	}

slow:
	/*
	 * If we are here because pagelock failed due to the need to cow fault
	 * in the pages we want to lock F_SOFTLOCK will do this job and in
	 * next as_pagelock() call for this address range pagelock will
	 * hopefully succeed.
	 */
	/* The address space lock has already been dropped on every path here. */
	fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
	if (fault_err != 0) {
		return (fc_decode(fault_err));
	}
	/* A NULL shadow list tells as_pageunlock() to use F_SOFTUNLOCK. */
	*ppp = NULL;

	return (0);
}
2683 
2684 /*
2685  * lock pages in a given address space. Return shadow list. If
2686  * the list is NULL, the MMU mapping is also locked.
2687  */
2688 int
2689 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2690     size_t size, enum seg_rw rw)
2691 {
2692 	size_t rsize;
2693 	caddr_t raddr;
2694 	faultcode_t fault_err;
2695 	struct seg *seg;
2696 	int err;
2697 
2698 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2699 	    "as_pagelock_start: addr %p size %ld", addr, size);
2700 
2701 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2702 	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2703 	    (size_t)raddr;
2704 
2705 	/*
2706 	 * if the request crosses two segments let
2707 	 * as_fault handle it.
2708 	 */
2709 	AS_LOCK_ENTER(as, RW_READER);
2710 
2711 	seg = as_segat(as, raddr);
2712 	if (seg == NULL) {
2713 		AS_LOCK_EXIT(as);
2714 		return (EFAULT);
2715 	}
2716 	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2717 	if (raddr + rsize > seg->s_base + seg->s_size) {
2718 		return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2719 	}
2720 	if (raddr + rsize <= raddr) {
2721 		AS_LOCK_EXIT(as);
2722 		return (EFAULT);
2723 	}
2724 
2725 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2726 	    "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2727 
2728 	/*
2729 	 * try to lock pages and pass back shadow list
2730 	 */
2731 	err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2732 
2733 	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2734 
2735 	AS_LOCK_EXIT(as);
2736 
2737 	if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2738 		return (err);
2739 	}
2740 
2741 	/*
2742 	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2743 	 * to no pagelock support for this segment or pages need to be cow
2744 	 * faulted in. If fault is needed F_SOFTLOCK will do this job for
2745 	 * this as_pagelock() call and in the next as_pagelock() call for the
2746 	 * same address range pagelock call will hopefull succeed.
2747 	 */
2748 	fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2749 	if (fault_err != 0) {
2750 		return (fc_decode(fault_err));
2751 	}
2752 	*ppp = NULL;
2753 
2754 	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2755 	return (0);
2756 }
2757 
2758 /*
2759  * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2760  * lists from the end of plist and call pageunlock interface for each segment.
2761  * Drop as lock and free plist.
2762  */
2763 static void
2764 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2765     struct page **plist, enum seg_rw rw)
2766 {
2767 	ulong_t cnt;
2768 	caddr_t eaddr = addr + size;
2769 	pgcnt_t npages = btop(size);
2770 	size_t ssize;
2771 	page_t **pl;
2772 
2773 	ASSERT(AS_LOCK_HELD(as));
2774 	ASSERT(seg != NULL);
2775 	ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2776 	ASSERT(addr + size > seg->s_base + seg->s_size);
2777 	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2778 	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2779 	ASSERT(plist != NULL);
2780 
2781 	for (cnt = 0; addr < eaddr; addr += ssize) {
2782 		if (addr >= seg->s_base + seg->s_size) {
2783 			seg = AS_SEGNEXT(as, seg);
2784 			ASSERT(seg != NULL && addr == seg->s_base);
2785 			cnt++;
2786 		}
2787 		if (eaddr > seg->s_base + seg->s_size) {
2788 			ssize = seg->s_base + seg->s_size - addr;
2789 		} else {
2790 			ssize = eaddr - addr;
2791 		}
2792 		pl = &plist[npages + cnt];
2793 		ASSERT(*pl != NULL);
2794 		(void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2795 		    L_PAGEUNLOCK, rw);
2796 	}
2797 	ASSERT(cnt > 0);
2798 	AS_LOCK_EXIT(as);
2799 
2800 	cnt++;
2801 	kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2802 }
2803 
2804 /*
2805  * unlock pages in a given address range
2806  */
2807 void
2808 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2809     enum seg_rw rw)
2810 {
2811 	struct seg *seg;
2812 	size_t rsize;
2813 	caddr_t raddr;
2814 
2815 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2816 	    "as_pageunlock_start: addr %p size %ld", addr, size);
2817 
2818 	/*
2819 	 * if the shadow list is NULL, as_pagelock was
2820 	 * falling back to as_fault
2821 	 */
2822 	if (pp == NULL) {
2823 		(void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2824 		return;
2825 	}
2826 
2827 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2828 	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2829 	    (size_t)raddr;
2830 
2831 	AS_LOCK_ENTER(as, RW_READER);
2832 	seg = as_segat(as, raddr);
2833 	ASSERT(seg != NULL);
2834 
2835 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2836 	    "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2837 
2838 	ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2839 	if (raddr + rsize <= seg->s_base + seg->s_size) {
2840 		SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2841 	} else {
2842 		as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2843 		return;
2844 	}
2845 	AS_LOCK_EXIT(as);
2846 	TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2847 }
2848 
/*
 * Set the page size code "szc" on every segment covering
 * [addr, addr + size).
 *
 * Both addr and size must be aligned to the page size that szc selects.
 * When "wait" is true and a segment reports EAGAIN, the caller sleeps
 * until the as is signalled and then restarts from the top.
 *
 * Returns 0 on success; EINVAL if addr/size are misaligned or a segment
 * answers ENOTSUP; ENOMEM if the range wraps or is not fully mapped;
 * EAGAIN if a segment answers IE_NOMEM (or EAGAIN when not waiting);
 * otherwise the segment driver's error.
 */
int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
	struct seg *seg;
	size_t ssize;			/* bytes handled in current segment */
	caddr_t raddr;			/* rounded down addr */
	size_t rsize;			/* rounded up size */
	int error = 0;
	size_t pgsz = page_get_pagesize(szc);

	/* Restart point: the whole range is reprocessed from addr. */
setpgsz_top:
	if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
		return (EINVAL);
	}

	raddr = addr;
	rsize = size;

	if (raddr + rsize < raddr)		/* check for wraparound */
		return (ENOMEM);

	AS_LOCK_ENTER(as, RW_WRITER);
	as_clearwatchprot(as, raddr, rsize);
	seg = as_segat(as, raddr);
	if (seg == NULL) {
		as_setwatch(as);
		AS_LOCK_EXIT(as);
		return (ENOMEM);
	}

	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
		/* Advance to the next segment; a gap is an error. */
		if (raddr >= seg->s_base + seg->s_size) {
			seg = AS_SEGNEXT(as, seg);
			if (seg == NULL || raddr != seg->s_base) {
				error = ENOMEM;
				break;
			}
		}
		/* Clip the request to the current segment. */
		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
			ssize = seg->s_base + seg->s_size - raddr;
		} else {
			ssize = rsize;
		}

		/* Re-issue point for this same segment piece (lock still held). */
retry:
		error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);

		if (error == IE_NOMEM) {
			error = EAGAIN;
			break;
		}

		if (error == IE_RETRY) {
			/* Drop the lock and start over from the top. */
			AS_LOCK_EXIT(as);
			goto setpgsz_top;
		}

		if (error == ENOTSUP) {
			error = EINVAL;
			break;
		}

		if (wait && (error == EAGAIN)) {
			/*
			 * Memory is currently locked.  It must be unlocked
			 * before this operation can succeed through a retry.
			 * The possible reasons for locked memory and
			 * corresponding strategies for unlocking are:
			 * (1) Normal I/O
			 *	wait for a signal that the I/O operation
			 *	has completed and the memory is unlocked.
			 * (2) Asynchronous I/O
			 *	The aio subsystem does not unlock pages when
			 *	the I/O is completed. Those pages are unlocked
			 *	when the application calls aiowait/aioerror.
			 *	So, to prevent blocking forever, cv_broadcast()
			 *	is done to wake up aio_cleanup_thread.
			 *	Subsequently, segvn_reclaim will be called, and
			 *	that will do AS_CLRUNMAPWAIT() and wake us up.
			 * (3) Long term page locking:
			 *	This is not relevant for as_setpagesize()
			 *	because we cannot change the page size for
			 *	driver memory. The attempt to do so will
			 *	fail with a different error than EAGAIN so
			 *	there's no need to trigger as callbacks like
			 *	as_unmap, as_setprot or as_free would do.
			 */
			mutex_enter(&as->a_contents);
			if (!AS_ISNOUNMAPWAIT(as)) {
				if (AS_ISUNMAPWAIT(as) == 0) {
					cv_broadcast(&as->a_cv);
				}
				AS_SETUNMAPWAIT(as);
				/*
				 * Give up the as lock before sleeping;
				 * a_contents is still held for cv_wait().
				 */
				AS_LOCK_EXIT(as);
				while (AS_ISUNMAPWAIT(as)) {
					cv_wait(&as->a_cv, &as->a_contents);
				}
			} else {
				/*
				 * We may have raced with
				 * segvn_reclaim()/segspt_reclaim(). In this
				 * case clean nounmapwait flag and retry since
				 * softlockcnt in this segment may be already
				 * 0.  We don't drop as writer lock so our
				 * number of retries without sleeping should
				 * be very small. See segvn_reclaim() for
				 * more comments.
				 */
				AS_CLRNOUNMAPWAIT(as);
				mutex_exit(&as->a_contents);
				goto retry;
			}
			/* Woken up: as lock already dropped, restart. */
			mutex_exit(&as->a_contents);
			goto setpgsz_top;
		} else if (error != 0) {
			break;
		}
	}
	as_setwatch(as);
	AS_LOCK_EXIT(as);
	return (error);
}
2972 
2973 /*
2974  * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2975  * in its chunk where s_szc is less than the szc we want to set.
2976  */
2977 static int
2978 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2979     int *retry)
2980 {
2981 	struct seg *seg;
2982 	size_t ssize;
2983 	int error;
2984 
2985 	ASSERT(AS_WRITE_HELD(as));
2986 
2987 	seg = as_segat(as, raddr);
2988 	if (seg == NULL) {
2989 		panic("as_iset3_default_lpsize: no seg");
2990 	}
2991 
2992 	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2993 		if (raddr >= seg->s_base + seg->s_size) {
2994 			seg = AS_SEGNEXT(as, seg);
2995 			if (seg == NULL || raddr != seg->s_base) {
2996 				panic("as_iset3_default_lpsize: as changed");
2997 			}
2998 		}
2999 		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3000 			ssize = seg->s_base + seg->s_size - raddr;
3001 		} else {
3002 			ssize = rsize;
3003 		}
3004 
3005 		if (szc > seg->s_szc) {
3006 			error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3007 			/* Only retry on EINVAL segments that have no vnode. */
3008 			if (error == EINVAL) {
3009 				vnode_t *vp = NULL;
3010 				if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3011 				    (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3012 				    vp == NULL)) {
3013 					*retry = 1;
3014 				} else {
3015 					*retry = 0;
3016 				}
3017 			}
3018 			if (error) {
3019 				return (error);
3020 			}
3021 		}
3022 	}
3023 	return (0);
3024 }
3025 
3026 /*
3027  * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3028  * pagesize on each segment in its range, but if any fails with EINVAL,
3029  * then it reduces the pagesizes to the next size in the bitmap and
3030  * retries as_iset3_default_lpsize(). The reason why the code retries
3031  * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3032  * match the bigger sizes, and (b) it's hard to get this offset (to begin
3033  * with) to pass to map_pgszcvec().
3034  */
3035 static int
3036 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3037     uint_t szcvec)
3038 {
3039 	int error;
3040 	int retry;
3041 
3042 	ASSERT(AS_WRITE_HELD(as));
3043 
3044 	for (;;) {
3045 		error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3046 		if (error == EINVAL && retry) {
3047 			szcvec &= ~(1 << szc);
3048 			if (szcvec <= 1) {
3049 				return (EINVAL);
3050 			}
3051 			szc = highbit(szcvec) - 1;
3052 		} else {
3053 			return (error);
3054 		}
3055 	}
3056 }
3057 
3058 /*
3059  * as_iset1_default_lpsize() breaks its chunk into areas where existing
3060  * segments have a smaller szc than we want to set. For each such area,
3061  * it calls as_iset2_default_lpsize()
3062  */
3063 static int
3064 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3065     uint_t szcvec)
3066 {
3067 	struct seg *seg;
3068 	size_t ssize;
3069 	caddr_t setaddr = raddr;
3070 	size_t setsize = 0;
3071 	int set;
3072 	int error;
3073 
3074 	ASSERT(AS_WRITE_HELD(as));
3075 
3076 	seg = as_segat(as, raddr);
3077 	if (seg == NULL) {
3078 		panic("as_iset1_default_lpsize: no seg");
3079 	}
3080 	if (seg->s_szc < szc) {
3081 		set = 1;
3082 	} else {
3083 		set = 0;
3084 	}
3085 
3086 	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3087 		if (raddr >= seg->s_base + seg->s_size) {
3088 			seg = AS_SEGNEXT(as, seg);
3089 			if (seg == NULL || raddr != seg->s_base) {
3090 				panic("as_iset1_default_lpsize: as changed");
3091 			}
3092 			if (seg->s_szc >= szc && set) {
3093 				ASSERT(setsize != 0);
3094 				error = as_iset2_default_lpsize(as,
3095 				    setaddr, setsize, szc, szcvec);
3096 				if (error) {
3097 					return (error);
3098 				}
3099 				set = 0;
3100 			} else if (seg->s_szc < szc && !set) {
3101 				setaddr = raddr;
3102 				setsize = 0;
3103 				set = 1;
3104 			}
3105 		}
3106 		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3107 			ssize = seg->s_base + seg->s_size - raddr;
3108 		} else {
3109 			ssize = rsize;
3110 		}
3111 	}
3112 	error = 0;
3113 	if (set) {
3114 		ASSERT(setsize != 0);
3115 		error = as_iset2_default_lpsize(as, setaddr, setsize,
3116 		    szc, szcvec);
3117 	}
3118 	return (error);
3119 }
3120 
3121 /*
3122  * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3123  * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3124  * chunk to as_iset1_default_lpsize().
3125  */
3126 static int
3127 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3128     int type)
3129 {
3130 	int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3131 	uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3132 	    flags, rtype, 1);
3133 	uint_t szc;
3134 	uint_t nszc;
3135 	int error;
3136 	caddr_t a;
3137 	caddr_t eaddr;
3138 	size_t segsize;
3139 	size_t pgsz;
3140 	uint_t save_szcvec;
3141 
3142 	ASSERT(AS_WRITE_HELD(as));
3143 	ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3144 	ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3145 
3146 	szcvec &= ~1;
3147 	if (szcvec <= 1) {	/* skip if base page size */
3148 		return (0);
3149 	}
3150 
3151 	/* Get the pagesize of the first larger page size. */
3152 	szc = lowbit(szcvec) - 1;
3153 	pgsz = page_get_pagesize(szc);
3154 	eaddr = addr + size;
3155 	addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3156 	eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3157 
3158 	save_szcvec = szcvec;
3159 	szcvec >>= (szc + 1);
3160 	nszc = szc;
3161 	while (szcvec) {
3162 		if ((szcvec & 0x1) == 0) {
3163 			nszc++;
3164 			szcvec >>= 1;
3165 			continue;
3166 		}
3167 		nszc++;
3168 		pgsz = page_get_pagesize(nszc);
3169 		a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3170 		if (a != addr) {
3171 			ASSERT(szc > 0);
3172 			ASSERT(a < eaddr);
3173 			segsize = a - addr;
3174 			error = as_iset1_default_lpsize(as, addr, segsize, szc,
3175 			    save_szcvec);
3176 			if (error) {
3177 				return (error);
3178 			}
3179 			addr = a;
3180 		}
3181 		szc = nszc;
3182 		szcvec >>= 1;
3183 	}
3184 
3185 	ASSERT(addr < eaddr);
3186 	szcvec = save_szcvec;
3187 	while (szcvec) {
3188 		a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3189 		ASSERT(a >= addr);
3190 		if (a != addr) {
3191 			ASSERT(szc > 0);
3192 			segsize = a - addr;
3193 			error = as_iset1_default_lpsize(as, addr, segsize, szc,
3194 			    save_szcvec);
3195 			if (error) {
3196 				return (error);
3197 			}
3198 			addr = a;
3199 		}
3200 		szcvec &= ~(1 << szc);
3201 		if (szcvec) {
3202 			szc = highbit(szcvec) - 1;
3203 			pgsz = page_get_pagesize(szc);
3204 		}
3205 	}
3206 	ASSERT(addr == eaddr);
3207 
3208 	return (0);
3209 }
3210 
3211 /*
3212  * Set the default large page size for the range. Called via memcntl with
3213  * page size set to 0. as_set_default_lpsize breaks the range down into
3214  * chunks with the same type/flags, ignores-non segvn segments, and passes
3215  * each chunk to as_iset_default_lpsize().
3216  */
3217 int
3218 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3219 {
3220 	struct seg *seg;
3221 	caddr_t raddr;
3222 	size_t rsize;
3223 	size_t ssize;
3224 	int rtype, rflags;
3225 	int stype, sflags;
3226 	int error;
3227 	caddr_t	setaddr;
3228 	size_t setsize;
3229 	int segvn;
3230 
3231 	if (size == 0)
3232 		return (0);
3233 
3234 	AS_LOCK_ENTER(as, RW_WRITER);
3235 again:
3236 	error = 0;
3237 
3238 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3239 	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3240 	    (size_t)raddr;
3241 
3242 	if (raddr + rsize < raddr) {		/* check for wraparound */
3243 		AS_LOCK_EXIT(as);
3244 		return (ENOMEM);
3245 	}
3246 	as_clearwatchprot(as, raddr, rsize);
3247 	seg = as_segat(as, raddr);
3248 	if (seg == NULL) {
3249 		as_setwatch(as);
3250 		AS_LOCK_EXIT(as);
3251 		return (ENOMEM);
3252 	}
3253 	if (seg->s_ops == &segvn_ops) {
3254 		rtype = SEGOP_GETTYPE(seg, addr);
3255 		rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3256 		rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3257 		segvn = 1;
3258 	} else {
3259 		segvn = 0;
3260 	}
3261 	setaddr = raddr;
3262 	setsize = 0;
3263 
3264 	for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3265 		if (raddr >= (seg->s_base + seg->s_size)) {
3266 			seg = AS_SEGNEXT(as, seg);
3267 			if (seg == NULL || raddr != seg->s_base) {
3268 				error = ENOMEM;
3269 				break;
3270 			}
3271 			if (seg->s_ops == &segvn_ops) {
3272 				stype = SEGOP_GETTYPE(seg, raddr);
3273 				sflags = stype & (MAP_TEXT | MAP_INITDATA);
3274 				stype &= (MAP_SHARED | MAP_PRIVATE);
3275 				if (segvn && (rflags != sflags ||
3276 				    rtype != stype)) {
3277 					/*
3278 					 * The next segment is also segvn but
3279 					 * has different flags and/or type.
3280 					 */
3281 					ASSERT(setsize != 0);
3282 					error = as_iset_default_lpsize(as,
3283 					    setaddr, setsize, rflags, rtype);
3284 					if (error) {
3285 						break;
3286 					}
3287 					rflags = sflags;
3288 					rtype = stype;
3289 					setaddr = raddr;
3290 					setsize = 0;
3291 				} else if (!segvn) {
3292 					rflags = sflags;
3293 					rtype = stype;
3294 					setaddr = raddr;
3295 					setsize = 0;
3296 					segvn = 1;
3297 				}
3298 			} else if (segvn) {
3299 				/* The next segment is not segvn. */
3300 				ASSERT(setsize != 0);
3301 				error = as_iset_default_lpsize(as,
3302 				    setaddr, setsize, rflags, rtype);
3303 				if (error) {
3304 					break;
3305 				}
3306 				segvn = 0;
3307 			}
3308 		}
3309 		if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3310 			ssize = seg->s_base + seg->s_size - raddr;
3311 		} else {
3312 			ssize = rsize;
3313 		}
3314 	}
3315 	if (error == 0 && segvn) {
3316 		/* The last chunk when rsize == 0. */
3317 		ASSERT(setsize != 0);
3318 		error = as_iset_default_lpsize(as, setaddr, setsize,
3319 		    rflags, rtype);
3320 	}
3321 
3322 	if (error == IE_RETRY) {
3323 		goto again;
3324 	} else if (error == IE_NOMEM) {
3325 		error = EAGAIN;
3326 	} else if (error == ENOTSUP) {
3327 		error = EINVAL;
3328 	} else if (error == EAGAIN) {
3329 		mutex_enter(&as->a_contents);
3330 		if (!AS_ISNOUNMAPWAIT(as)) {
3331 			if (AS_ISUNMAPWAIT(as) == 0) {
3332 				cv_broadcast(&as->a_cv);
3333 			}
3334 			AS_SETUNMAPWAIT(as);
3335 			AS_LOCK_EXIT(as);
3336 			while (AS_ISUNMAPWAIT(as)) {
3337 				cv_wait(&as->a_cv, &as->a_contents);
3338 			}
3339 			mutex_exit(&as->a_contents);
3340 			AS_LOCK_ENTER(as, RW_WRITER);
3341 		} else {
3342 			/*
3343 			 * We may have raced with
3344 			 * segvn_reclaim()/segspt_reclaim(). In this case
3345 			 * clean nounmapwait flag and retry since softlockcnt
3346 			 * in this segment may be already 0.  We don't drop as
3347 			 * writer lock so our number of retries without
3348 			 * sleeping should be very small. See segvn_reclaim()
3349 			 * for more comments.
3350 			 */
3351 			AS_CLRNOUNMAPWAIT(as);
3352 			mutex_exit(&as->a_contents);
3353 		}
3354 		goto again;
3355 	}
3356 
3357 	as_setwatch(as);
3358 	AS_LOCK_EXIT(as);
3359 	return (error);
3360 }
3361 
3362 /*
3363  * Setup all of the uninitialized watched pages that we can.
3364  */
3365 void
3366 as_setwatch(struct as *as)
3367 {
3368 	struct watched_page *pwp;
3369 	struct seg *seg;
3370 	caddr_t vaddr;
3371 	uint_t prot;
3372 	int  err, retrycnt;
3373 
3374 	if (avl_numnodes(&as->a_wpage) == 0)
3375 		return;
3376 
3377 	ASSERT(AS_WRITE_HELD(as));
3378 
3379 	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3380 	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3381 		retrycnt = 0;
3382 	retry:
3383 		vaddr = pwp->wp_vaddr;
3384 		if (pwp->wp_oprot != 0 ||	/* already set up */
3385 		    (seg = as_segat(as, vaddr)) == NULL ||
3386 		    SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3387 			continue;
3388 
3389 		pwp->wp_oprot = prot;
3390 		if (pwp->wp_read)
3391 			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3392 		if (pwp->wp_write)
3393 			prot &= ~PROT_WRITE;
3394 		if (pwp->wp_exec)
3395 			prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3396 		if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3397 			err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3398 			if (err == IE_RETRY) {
3399 				pwp->wp_oprot = 0;
3400 				ASSERT(retrycnt == 0);
3401 				retrycnt++;
3402 				goto retry;
3403 			}
3404 		}
3405 		pwp->wp_prot = prot;
3406 	}
3407 }
3408 
3409 /*
3410  * Clear all of the watched pages in the address space.
3411  */
3412 void
3413 as_clearwatch(struct as *as)
3414 {
3415 	struct watched_page *pwp;
3416 	struct seg *seg;
3417 	caddr_t vaddr;
3418 	uint_t prot;
3419 	int err, retrycnt;
3420 
3421 	if (avl_numnodes(&as->a_wpage) == 0)
3422 		return;
3423 
3424 	ASSERT(AS_WRITE_HELD(as));
3425 
3426 	for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3427 	    pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3428 		retrycnt = 0;
3429 	retry:
3430 		vaddr = pwp->wp_vaddr;
3431 		if (pwp->wp_oprot == 0 ||	/* not set up */
3432 		    (seg = as_segat(as, vaddr)) == NULL)
3433 			continue;
3434 
3435 		if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3436 			err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3437 			if (err == IE_RETRY) {
3438 				ASSERT(retrycnt == 0);
3439 				retrycnt++;
3440 				goto retry;
3441 			}
3442 		}
3443 		pwp->wp_oprot = 0;
3444 		pwp->wp_prot = 0;
3445 	}
3446 }
3447 
3448 /*
3449  * Force a new setup for all the watched pages in the range.
3450  */
3451 static void
3452 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3453 {
3454 	struct watched_page *pwp;
3455 	struct watched_page tpw;
3456 	caddr_t eaddr = addr + size;
3457 	caddr_t vaddr;
3458 	struct seg *seg;
3459 	int err, retrycnt;
3460 	uint_t	wprot;
3461 	avl_index_t where;
3462 
3463 	if (avl_numnodes(&as->a_wpage) == 0)
3464 		return;
3465 
3466 	ASSERT(AS_WRITE_HELD(as));
3467 
3468 	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3469 	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3470 		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3471 
3472 	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3473 		retrycnt = 0;
3474 		vaddr = pwp->wp_vaddr;
3475 
3476 		wprot = prot;
3477 		if (pwp->wp_read)
3478 			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3479 		if (pwp->wp_write)
3480 			wprot &= ~PROT_WRITE;
3481 		if (pwp->wp_exec)
3482 			wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3483 		if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3484 		retry:
3485 			seg = as_segat(as, vaddr);
3486 			if (seg == NULL) {
3487 				panic("as_setwatchprot: no seg");
3488 				/*NOTREACHED*/
3489 			}
3490 			err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3491 			if (err == IE_RETRY) {
3492 				ASSERT(retrycnt == 0);
3493 				retrycnt++;
3494 				goto retry;
3495 			}
3496 		}
3497 		pwp->wp_oprot = prot;
3498 		pwp->wp_prot = wprot;
3499 
3500 		pwp = AVL_NEXT(&as->a_wpage, pwp);
3501 	}
3502 }
3503 
3504 /*
3505  * Clear all of the watched pages in the range.
3506  */
3507 static void
3508 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3509 {
3510 	caddr_t eaddr = addr + size;
3511 	struct watched_page *pwp;
3512 	struct watched_page tpw;
3513 	uint_t prot;
3514 	struct seg *seg;
3515 	int err, retrycnt;
3516 	avl_index_t where;
3517 
3518 	if (avl_numnodes(&as->a_wpage) == 0)
3519 		return;
3520 
3521 	tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3522 	if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3523 		pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3524 
3525 	ASSERT(AS_WRITE_HELD(as));
3526 
3527 	while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3528 
3529 		if ((prot = pwp->wp_oprot) != 0) {
3530 			retrycnt = 0;
3531 
3532 			if (prot != pwp->wp_prot) {
3533 			retry:
3534 				seg = as_segat(as, pwp->wp_vaddr);
3535 				if (seg == NULL)
3536 					continue;
3537 				err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3538 				    PAGESIZE, prot);
3539 				if (err == IE_RETRY) {
3540 					ASSERT(retrycnt == 0);
3541 					retrycnt++;
3542 					goto retry;
3543 
3544 				}
3545 			}
3546 			pwp->wp_oprot = 0;
3547 			pwp->wp_prot = 0;
3548 		}
3549 
3550 		pwp = AVL_NEXT(&as->a_wpage, pwp);
3551 	}
3552 }
3553 
3554 void
3555 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3556 {
3557 	struct proc *p;
3558 
3559 	mutex_enter(&pidlock);
3560 	for (p = practive; p; p = p->p_next) {
3561 		if (p->p_as == as) {
3562 			mutex_enter(&p->p_lock);
3563 			if (p->p_as == as)
3564 				sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3565 			mutex_exit(&p->p_lock);
3566 		}
3567 	}
3568 	mutex_exit(&pidlock);
3569 }
3570 
3571 /*
3572  * return memory object ID
3573  */
3574 int
3575 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3576 {
3577 	struct seg	*seg;
3578 	int		sts;
3579 
3580 	AS_LOCK_ENTER(as, RW_READER);
3581 	seg = as_segat(as, addr);
3582 	if (seg == NULL) {
3583 		AS_LOCK_EXIT(as);
3584 		return (EFAULT);
3585 	}
3586 	/*
3587 	 * catch old drivers which may not support getmemid
3588 	 */
3589 	if (seg->s_ops->getmemid == NULL) {
3590 		AS_LOCK_EXIT(as);
3591 		return (ENODEV);
3592 	}
3593 
3594 	sts = SEGOP_GETMEMID(seg, addr, memidp);
3595 
3596 	AS_LOCK_EXIT(as);
3597 	return (sts);
3598 }
3599