xref: /titanic_50/usr/src/uts/common/vm/seg_vn.c (revision b9bc7f7832704fda46b4d6b04f3f7be1227dc644)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
40 
41 /*
42  * VM - shared or copy-on-write from a vnode/anonymous memory.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/param.h>
47 #include <sys/t_lock.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/debug.h>
52 #include <sys/cred.h>
53 #include <sys/vmsystm.h>
54 #include <sys/tuneable.h>
55 #include <sys/bitmap.h>
56 #include <sys/swap.h>
57 #include <sys/kmem.h>
58 #include <sys/sysmacros.h>
59 #include <sys/vtrace.h>
60 #include <sys/cmn_err.h>
61 #include <sys/callb.h>
62 #include <sys/vm.h>
63 #include <sys/dumphdr.h>
64 #include <sys/lgrp.h>
65 
66 #include <vm/hat.h>
67 #include <vm/as.h>
68 #include <vm/seg.h>
69 #include <vm/seg_vn.h>
70 #include <vm/pvn.h>
71 #include <vm/anon.h>
72 #include <vm/page.h>
73 #include <vm/vpage.h>
74 #include <sys/proc.h>
75 #include <sys/task.h>
76 #include <sys/project.h>
77 #include <sys/zone.h>
78 #include <sys/shm_impl.h>
79 /*
80  * Private seg op routines.
81  */
82 static int	segvn_dup(struct seg *seg, struct seg *newseg);
83 static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
84 static void	segvn_free(struct seg *seg);
85 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
86 		    caddr_t addr, size_t len, enum fault_type type,
87 		    enum seg_rw rw);
88 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
89 static int	segvn_setprot(struct seg *seg, caddr_t addr,
90 		    size_t len, uint_t prot);
91 static int	segvn_checkprot(struct seg *seg, caddr_t addr,
92 		    size_t len, uint_t prot);
93 static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
94 static size_t	segvn_swapout(struct seg *seg);
95 static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
96 		    int attr, uint_t flags);
97 static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
98 		    char *vec);
99 static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
100 		    int attr, int op, ulong_t *lockmap, size_t pos);
101 static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
102 		    uint_t *protv);
103 static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
104 static int	segvn_gettype(struct seg *seg, caddr_t addr);
105 static int	segvn_getvp(struct seg *seg, caddr_t addr,
106 		    struct vnode **vpp);
107 static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
108 		    uint_t behav);
109 static void	segvn_dump(struct seg *seg);
110 static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
111 		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
112 static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
113 		    uint_t szc);
114 static int	segvn_getmemid(struct seg *seg, caddr_t addr,
115 		    memid_t *memidp);
116 static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
117 static int	segvn_capable(struct seg *seg, segcapability_t capable);
118 
119 struct	seg_ops segvn_ops = {
120 	segvn_dup,
121 	segvn_unmap,
122 	segvn_free,
123 	segvn_fault,
124 	segvn_faulta,
125 	segvn_setprot,
126 	segvn_checkprot,
127 	segvn_kluster,
128 	segvn_swapout,
129 	segvn_sync,
130 	segvn_incore,
131 	segvn_lockop,
132 	segvn_getprot,
133 	segvn_getoffset,
134 	segvn_gettype,
135 	segvn_getvp,
136 	segvn_advise,
137 	segvn_dump,
138 	segvn_pagelock,
139 	segvn_setpagesize,
140 	segvn_getmemid,
141 	segvn_getpolicy,
142 	segvn_capable,
143 };
144 
145 /*
146  * Common zfod structures, provided as a shorthand for others to use.
147  */
148 static segvn_crargs_t zfod_segvn_crargs =
149 	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
150 static segvn_crargs_t kzfod_segvn_crargs =
151 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
152 	PROT_ALL & ~PROT_USER);
153 static segvn_crargs_t stack_noexec_crargs =
154 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
155 
156 caddr_t	zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
157 caddr_t	kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
158 caddr_t	stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
159 caddr_t	stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
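
/*
 * Illustrative example (not code from this file; the address and length
 * below are made up): a caller that wants an anonymous zero-fill-on-demand
 * mapping passes one of the argsp pointers above straight to as_map():
 *
 *	caddr_t addr = (caddr_t)0x10000000;
 *	size_t len = 16 * PAGESIZE;
 *	int error;
 *
 *	error = as_map(curproc->p_as, addr, len, segvn_create, zfod_argsp);
 */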
160 
161 #define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */
162 
163 size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */
164 
165 static int	segvn_concat(struct seg *, struct seg *, int);
166 static int	segvn_extend_prev(struct seg *, struct seg *,
167 		    struct segvn_crargs *, size_t);
168 static int	segvn_extend_next(struct seg *, struct seg *,
169 		    struct segvn_crargs *, size_t);
170 static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
171 static void	segvn_pagelist_rele(page_t **);
172 static void	segvn_setvnode_mpss(vnode_t *);
173 static void	segvn_relocate_pages(page_t **, page_t *);
174 static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
175 static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
176     uint_t, page_t **, page_t **, uint_t *, int *);
177 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
178     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
179 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
180     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
181 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
182     u_offset_t, struct vpage *, page_t **, uint_t,
183     enum fault_type, enum seg_rw, int, int);
184 static void	segvn_vpage(struct seg *);
185 
186 static void segvn_purge(struct seg *seg);
187 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
188     enum seg_rw);
189 
190 static int sameprot(struct seg *, caddr_t, size_t);
191 
192 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
193 static int segvn_clrszc(struct seg *);
194 static struct seg *segvn_split_seg(struct seg *, caddr_t);
195 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
196     ulong_t, uint_t);
197 
198 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
199     size_t, void *, u_offset_t);
200 
201 static int segvn_slock_anonpages(page_t *, int);
202 static void segvn_sunlock_anonpages(page_t *, int);
203 
204 static struct kmem_cache *segvn_cache;
205 static struct kmem_cache **segvn_szc_cache;
206 
207 #ifdef VM_STATS
208 static struct segvnvmstats_str {
209 	ulong_t	fill_vp_pages[31];
210 	ulong_t fltvnpages[49];
211 	ulong_t	fullszcpages[10];
212 	ulong_t	relocatepages[3];
213 	ulong_t	fltanpages[17];
214 	ulong_t pagelock[3];
215 	ulong_t	demoterange[3];
216 } segvnvmstats;
217 #endif /* VM_STATS */
218 
219 #define	SDR_RANGE	1		/* demote entire range */
220 #define	SDR_END		2		/* demote non aligned ends only */
221 
222 #define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	    \
223 		if ((len) != 0) { 		      	      		      \
224 			lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);  \
225 			ASSERT(lpgaddr >= (seg)->s_base);	      	      \
226 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +    \
227 			    (len)), pgsz);				      \
228 			ASSERT(lpgeaddr > lpgaddr);		      	      \
229 			ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);    \
230 		} else {					      	      \
231 			lpgeaddr = lpgaddr = (addr);	      		      \
232 		}							      \
233 	}
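
/*
 * Worked example for CALC_LPG_REGION (illustrative values only): with a 4M
 * large page (pgsz == 0x400000), addr == 0x10123000 and len == 0x2000, the
 * macro computes lpgaddr == 0x10000000 (addr rounded down to pgsz) and
 * lpgeaddr == 0x10400000 (addr + len rounded up to pgsz), i.e. the smallest
 * pgsz-aligned region that covers [addr, addr + len).
 */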
234 
235 /*ARGSUSED*/
236 static int
237 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
238 {
239 	struct segvn_data *svd = buf;
240 
241 	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
242 	mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
243 	svd->svn_trnext = svd->svn_trprev = NULL;
244 	return (0);
245 }
246 
247 /*ARGSUSED1*/
248 static void
249 segvn_cache_destructor(void *buf, void *cdrarg)
250 {
251 	struct segvn_data *svd = buf;
252 
253 	rw_destroy(&svd->lock);
254 	mutex_destroy(&svd->segp_slock);
255 }
256 
257 /*ARGSUSED*/
258 static int
259 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
260 {
261 	bzero(buf, sizeof (svntr_t));
262 	return (0);
263 }
264 
265 /*
266  * Patching this variable to non-zero allows the system to run with
267  * stacks marked as "not executable".  It's a bit of a kludge, but is
268  * provided as a tweakable for platforms that export those ABIs
269  * (e.g. sparc V8) that have executable stacks enabled by default.
270  * There are also some restrictions for platforms that don't actually
271  * implement 'noexec' protections.
272  *
273  * Once enabled, the system is (therefore) unable to provide a fully
274  * ABI-compliant execution environment, though practically speaking,
275  * most everything works.  The exceptions are generally some interpreters
276  * and debuggers that create executable code on the stack and jump
277  * into it (without explicitly mprotecting the address range to include
278  * PROT_EXEC).
279  *
280  * One important class of applications that gets disabled consists of
281  * those that have been transformed into malicious agents using one of
282  * the numerous "buffer overflow" attacks.  See 4007890.
283  */
284 int noexec_user_stack = 0;
285 int noexec_user_stack_log = 1;
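
/*
 * Example (administrative usage, not code in this file): on platforms whose
 * ABI gives user stacks PROT_EXEC by default, non-executable user stacks can
 * be enabled by setting these tunables in /etc/system and rebooting:
 *
 *	set noexec_user_stack = 1
 *	set noexec_user_stack_log = 1
 */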
286 
287 int segvn_lpg_disable = 0;
288 uint_t segvn_maxpgszc = 0;
289 
290 ulong_t segvn_vmpss_clrszc_cnt;
291 ulong_t segvn_vmpss_clrszc_err;
292 ulong_t segvn_fltvnpages_clrszc_cnt;
293 ulong_t segvn_fltvnpages_clrszc_err;
294 ulong_t segvn_setpgsz_align_err;
295 ulong_t segvn_setpgsz_anon_align_err;
296 ulong_t segvn_setpgsz_getattr_err;
297 ulong_t segvn_setpgsz_eof_err;
298 ulong_t segvn_faultvnmpss_align_err1;
299 ulong_t segvn_faultvnmpss_align_err2;
300 ulong_t segvn_faultvnmpss_align_err3;
301 ulong_t segvn_faultvnmpss_align_err4;
302 ulong_t segvn_faultvnmpss_align_err5;
303 ulong_t	segvn_vmpss_pageio_deadlk_err;
304 
305 int segvn_use_regions = 1;
306 
307 /*
308  * Segvn supports text replication optimization for NUMA platforms. Text
309  * replicas are represented by anon maps (amp). There's one amp per text file
310  * region per lgroup. A process chooses the amp for each of its text mappings
311  * based on the lgroup assignment of its main thread (t_tid = 1). All
312  * processes that want a replica on a particular lgroup for the same text file
313  * mapping share the same amp. amps are looked up in svntr_hashtab hash table
314  * with vp,off,size,szc used as a key. Text replication segments are read only
315  * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by
316  * forcing COW faults from vnode to amp and mapping amp pages instead of vnode
317  * pages. A replication amp is assigned to a segment when it takes its first
318  * pagefault. To handle main thread lgroup rehoming, segvn_trasync_thread
319  * periodically rechecks whether the process still maps an amp local to the
320  * main thread. If not, the async thread forces the process to remap to an
321  * amp in the new home lgroup of the main thread. The current text replication
322  * implementation only benefits workloads that do most of their work in the
323  * main thread of a process, or whose threads all run in the same lgroup. To
324  * extend the text replication benefit to other kinds of multithreaded
325  * workloads, further work would be needed in the hat layer to
326  * allow the same virtual address in the same hat to simultaneously map
327  * different physical addresses (i.e. page table replication would be needed
328  * for x86).
329  *
330  * amp pages are used instead of vnode pages as long as the segment has a
331  * very simple life cycle: it's created via segvn_create(), handles S_EXEC
332  * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
333  * happens, such as a protection change, a real COW fault, a pagesize change,
334  * an MC_LOCK request or a partial unmap, we turn off text replication by
335  * converting the segment back to a vnode-only segment (unmap the segment's
336  * address range and set svd->amp to NULL).
337  *
338  * The original file can be changed after an amp is inserted into
339  * svntr_hashtab. Processes that are launched after the file has already
340  * been changed can't use the replicas created prior to the file change. To
341  * implement this, hash entries are timestamped. Replicas can
342  * only be used if the current file modification time matches the timestamp
343  * saved when the hash entry was created. However, timestamps alone are not
344  * sufficient to detect file modification via mmap(MAP_SHARED) mappings, so
345  * we deal with file changes via MAP_SHARED mappings differently. When
346  * writable MAP_SHARED mappings are created to vnodes marked as executable,
347  * we mark all existing replicas for this vnode as not usable for future
348  * text mappings, and we don't create new replicas for files that currently
349  * have potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE)
350  * is true).
351  */
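
/*
 * Illustrative sketch only (hypothetical lookup loop; the svntr member names
 * used here are assumptions taken from seg_vn.h, not code from this file):
 * conceptually a replica lookup walks the bucket selected by hashing the
 * text mapping's identity and matches on the full (vp, off, size, szc) key
 * while holding that bucket's tr_lock:
 *
 *	svntr_t *svntrp;
 *
 *	ASSERT(MUTEX_HELD(&svntr_hashtab[hash].tr_lock));
 *	for (svntrp = svntr_hashtab[hash].tr_head; svntrp != NULL;
 *	    svntrp = svntrp->tr_next) {
 *		if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
 *		    svntrp->tr_eoff == off + size && svntrp->tr_szc == szc)
 *			break;
 *	}
 */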
352 
353 #define	SEGVN_TEXTREPL_MAXBYTES_FACTOR	(20)
354 size_t	segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
355 
356 static ulong_t			svntr_hashtab_sz = 512;
357 static svntr_bucket_t		*svntr_hashtab = NULL;
358 static struct kmem_cache	*svntr_cache;
359 static svntr_stats_t		*segvn_textrepl_stats;
360 static ksema_t 			segvn_trasync_sem;
361 
362 int				segvn_disable_textrepl = 1;
363 size_t				textrepl_size_thresh = (size_t)-1;
364 size_t				segvn_textrepl_bytes = 0;
365 size_t				segvn_textrepl_max_bytes = 0;
366 clock_t				segvn_update_textrepl_interval = 0;
367 int				segvn_update_tr_time = 10;
368 int				segvn_disable_textrepl_update = 0;
369 
370 static void segvn_textrepl(struct seg *);
371 static void segvn_textunrepl(struct seg *, int);
372 static void segvn_inval_trcache(vnode_t *);
373 static void segvn_trasync_thread(void);
374 static void segvn_trupdate_wakeup(void *);
375 static void segvn_trupdate(void);
376 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
377     ulong_t);
378 
379 /*
380  * Initialize segvn data structures
381  */
382 void
383 segvn_init(void)
384 {
385 	uint_t maxszc;
386 	uint_t szc;
387 	size_t pgsz;
388 
389 	segvn_cache = kmem_cache_create("segvn_cache",
390 	    sizeof (struct segvn_data), 0,
391 	    segvn_cache_constructor, segvn_cache_destructor, NULL,
392 	    NULL, NULL, 0);
393 
394 	if (segvn_lpg_disable == 0) {
395 		szc = maxszc = page_num_pagesizes() - 1;
396 		if (szc == 0) {
397 			segvn_lpg_disable = 1;
398 		}
399 		if (page_get_pagesize(0) != PAGESIZE) {
400 			panic("segvn_init: bad szc 0");
401 			/*NOTREACHED*/
402 		}
403 		while (szc != 0) {
404 			pgsz = page_get_pagesize(szc);
405 			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
406 				panic("segvn_init: bad szc %d", szc);
407 				/*NOTREACHED*/
408 			}
409 			szc--;
410 		}
411 		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
412 			segvn_maxpgszc = maxszc;
413 	}
414 
415 	if (segvn_maxpgszc) {
416 		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
417 		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
418 		    KM_SLEEP);
419 	}
420 
421 	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
422 		char	str[32];
423 
424 		(void) sprintf(str, "segvn_szc_cache%d", szc);
425 		segvn_szc_cache[szc] = kmem_cache_create(str,
426 		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
427 		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
428 	}
429 
430 
431 	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
432 		segvn_use_regions = 0;
433 
434 	/*
435 	 * For now shared regions and text replication segvn support
436 	 * are mutually exclusive. This is acceptable because a
437 	 * significant benefit from text replication has so far only
438 	 * been observed on AMD64 NUMA platforms (due to their
439 	 * relatively small L2$ size), and we don't currently support
440 	 * shared regions on x86.
441 	 */
442 	if (segvn_use_regions && !segvn_disable_textrepl) {
443 		segvn_disable_textrepl = 1;
444 	}
445 
446 #if defined(_LP64)
447 	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
448 	    !segvn_disable_textrepl) {
449 		ulong_t i;
450 		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
451 
452 		svntr_cache = kmem_cache_create("svntr_cache",
453 		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
454 		    NULL, NULL, NULL, 0);
455 		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
456 		for (i = 0; i < svntr_hashtab_sz; i++) {
457 			mutex_init(&svntr_hashtab[i].tr_lock,  NULL,
458 			    MUTEX_DEFAULT, NULL);
459 		}
460 		segvn_textrepl_max_bytes = ptob(physmem) /
461 		    segvn_textrepl_max_bytes_factor;
462 		segvn_textrepl_stats = kmem_zalloc(NCPU *
463 		    sizeof (svntr_stats_t), KM_SLEEP);
464 		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
465 		(void) thread_create(NULL, 0, segvn_trasync_thread,
466 		    NULL, 0, &p0, TS_RUN, minclsyspri);
467 	}
468 #endif
469 }
470 
471 #define	SEGVN_PAGEIO	((void *)0x1)
472 #define	SEGVN_NOPAGEIO	((void *)0x2)
473 
474 static void
475 segvn_setvnode_mpss(vnode_t *vp)
476 {
477 	int err;
478 
479 	ASSERT(vp->v_mpssdata == NULL ||
480 	    vp->v_mpssdata == SEGVN_PAGEIO ||
481 	    vp->v_mpssdata == SEGVN_NOPAGEIO);
482 
483 	if (vp->v_mpssdata == NULL) {
484 		if (vn_vmpss_usepageio(vp)) {
485 			err = VOP_PAGEIO(vp, (page_t *)NULL,
486 			    (u_offset_t)0, 0, 0, CRED(), NULL);
487 		} else {
488 			err = ENOSYS;
489 		}
490 		/*
491 		 * set v_mpssdata just once per vnode life
492 		 * so that it never changes.
493 		 */
494 		mutex_enter(&vp->v_lock);
495 		if (vp->v_mpssdata == NULL) {
496 			if (err == EINVAL) {
497 				vp->v_mpssdata = SEGVN_PAGEIO;
498 			} else {
499 				vp->v_mpssdata = SEGVN_NOPAGEIO;
500 			}
501 		}
502 		mutex_exit(&vp->v_lock);
503 	}
504 }
505 
506 int
507 segvn_create(struct seg *seg, void *argsp)
508 {
509 	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
510 	struct segvn_data *svd;
511 	size_t swresv = 0;
512 	struct cred *cred;
513 	struct anon_map *amp;
514 	int error = 0;
515 	size_t pgsz;
516 	lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
517 	int use_rgn = 0;
518 	int trok = 0;
519 
520 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
521 
522 	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
523 		panic("segvn_create type");
524 		/*NOTREACHED*/
525 	}
526 
527 	/*
528 	 * Check arguments.  If a shared anon structure is given then
529 	 * it is illegal to also specify a vp.
530 	 */
531 	if (a->amp != NULL && a->vp != NULL) {
532 		panic("segvn_create anon_map");
533 		/*NOTREACHED*/
534 	}
535 
536 	if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
537 	    a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
538 	    segvn_use_regions) {
539 		use_rgn = 1;
540 	}
541 
542 	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
543 	if (a->type == MAP_SHARED)
544 		a->flags &= ~MAP_NORESERVE;
545 
546 	if (a->szc != 0) {
547 		if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
548 		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
549 		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
550 			a->szc = 0;
551 		} else {
552 			if (a->szc > segvn_maxpgszc)
553 				a->szc = segvn_maxpgszc;
554 			pgsz = page_get_pagesize(a->szc);
555 			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
556 			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
557 				a->szc = 0;
558 			} else if (a->vp != NULL) {
559 				extern struct vnode kvp;
560 				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
561 					/*
562 					 * paranoid check.
563 					 * hat_page_demote() is not supported
564 					 * on swapfs pages.
565 					 */
566 					a->szc = 0;
567 				} else if (map_addr_vacalign_check(seg->s_base,
568 				    a->offset & PAGEMASK)) {
569 					a->szc = 0;
570 				}
571 			} else if (a->amp != NULL) {
572 				pgcnt_t anum = btopr(a->offset);
573 				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
574 				if (!IS_P2ALIGNED(anum, pgcnt)) {
575 					a->szc = 0;
576 				}
577 			}
578 		}
579 	}
580 
581 	/*
582 	 * If segment may need private pages, reserve them now.
583 	 */
584 	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
585 	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
586 		if (anon_resv(seg->s_size) == 0)
587 			return (EAGAIN);
588 		swresv = seg->s_size;
589 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
590 		    seg, swresv, 1);
591 	}
592 
593 	/*
594 	 * Reserve any mapping structures that may be required.
595 	 *
596 	 * Don't do it for segments that may use regions. It's currently a
597 	 * noop in the hat implementations anyway.
598 	 */
599 	if (!use_rgn) {
600 		hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
601 	}
602 
603 	if (a->cred) {
604 		cred = a->cred;
605 		crhold(cred);
606 	} else {
607 		crhold(cred = CRED());
608 	}
609 
610 	/* Inform the vnode of the new mapping */
611 	if (a->vp != NULL) {
612 		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
613 		    seg->s_as, seg->s_base, seg->s_size, a->prot,
614 		    a->maxprot, a->type, cred, NULL);
615 		if (error) {
616 			if (swresv != 0) {
617 				anon_unresv(swresv);
618 				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
619 				    "anon proc:%p %lu %u", seg, swresv, 0);
620 			}
621 			crfree(cred);
622 			if (!use_rgn) {
623 				hat_unload(seg->s_as->a_hat, seg->s_base,
624 				    seg->s_size, HAT_UNLOAD_UNMAP);
625 			}
626 			return (error);
627 		}
628 		/*
629 		 * svntr_hashtab will be NULL if we support shared regions.
630 		 */
631 		trok = ((a->flags & MAP_TEXT) &&
632 		    (seg->s_size > textrepl_size_thresh ||
633 		    (a->flags & _MAP_TEXTREPL)) &&
634 		    lgrp_optimizations() && svntr_hashtab != NULL &&
635 		    a->type == MAP_PRIVATE && swresv == 0 &&
636 		    !(a->flags & MAP_NORESERVE) &&
637 		    seg->s_as != &kas && a->vp->v_type == VREG);
638 
639 		ASSERT(!trok || !use_rgn);
640 	}
641 
642 	/*
643 	 * If more than one segment in the address space, and they're adjacent
644 	 * virtually, try to concatenate them.  Don't concatenate if an
645 	 * explicit anon_map structure was supplied (e.g., SystemV shared
646 	 * memory) or if we'll use text replication for this segment.
647 	 */
648 	if (a->amp == NULL && !use_rgn && !trok) {
649 		struct seg *pseg, *nseg;
650 		struct segvn_data *psvd, *nsvd;
651 		lgrp_mem_policy_t ppolicy, npolicy;
652 		uint_t	lgrp_mem_policy_flags = 0;
653 		extern lgrp_mem_policy_t lgrp_mem_default_policy;
654 
655 		/*
656 		 * Memory policy flags (lgrp_mem_policy_flags) are valid when
657 		 * extending stack/heap segments.
658 		 */
659 		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
660 		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
661 			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
662 		} else {
663 			/*
664 			 * Get policy when not extending it from another segment
665 			 */
666 			mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
667 		}
668 
669 		/*
670 		 * First, try to concatenate the previous and new segments
671 		 */
672 		pseg = AS_SEGPREV(seg->s_as, seg);
673 		if (pseg != NULL &&
674 		    pseg->s_base + pseg->s_size == seg->s_base &&
675 		    pseg->s_ops == &segvn_ops) {
676 			/*
677 			 * Get memory allocation policy from previous segment.
678 			 * When extension is specified (e.g. for heap) apply
679 			 * this policy to the new segment regardless of the
680 			 * outcome of segment concatenation.  Extension occurs
681 			 * for a non-default policy; otherwise the default policy
682 			 * is used, based on the extended segment size.
683 			 */
684 			psvd = (struct segvn_data *)pseg->s_data;
685 			ppolicy = psvd->policy_info.mem_policy;
686 			if (lgrp_mem_policy_flags ==
687 			    LGRP_MP_FLAG_EXTEND_UP) {
688 				if (ppolicy != lgrp_mem_default_policy) {
689 					mpolicy = ppolicy;
690 				} else {
691 					mpolicy = lgrp_mem_policy_default(
692 					    pseg->s_size + seg->s_size,
693 					    a->type);
694 				}
695 			}
696 
697 			if (mpolicy == ppolicy &&
698 			    (pseg->s_size + seg->s_size <=
699 			    segvn_comb_thrshld || psvd->amp == NULL) &&
700 			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
701 				/*
702 				 * success! now try to concatenate
703 				 * with following seg
704 				 */
705 				crfree(cred);
706 				nseg = AS_SEGNEXT(pseg->s_as, pseg);
707 				if (nseg != NULL &&
708 				    nseg != pseg &&
709 				    nseg->s_ops == &segvn_ops &&
710 				    pseg->s_base + pseg->s_size ==
711 				    nseg->s_base)
712 					(void) segvn_concat(pseg, nseg, 0);
713 				ASSERT(pseg->s_szc == 0 ||
714 				    (a->szc == pseg->s_szc &&
715 				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
716 				    IS_P2ALIGNED(pseg->s_size, pgsz)));
717 				return (0);
718 			}
719 		}
720 
721 		/*
722 		 * Failed, so try to concatenate with following seg
723 		 */
724 		nseg = AS_SEGNEXT(seg->s_as, seg);
725 		if (nseg != NULL &&
726 		    seg->s_base + seg->s_size == nseg->s_base &&
727 		    nseg->s_ops == &segvn_ops) {
728 			/*
729 			 * Get memory allocation policy from next segment.
730 			 * When extension is specified (e.g. for stack) apply
731 			 * this policy to the new segment regardless of the
732 			 * outcome of segment concatenation.  Extension occurs
733 			 * for a non-default policy; otherwise the default policy
734 			 * is used, based on the extended segment size.
735 			 */
736 			nsvd = (struct segvn_data *)nseg->s_data;
737 			npolicy = nsvd->policy_info.mem_policy;
738 			if (lgrp_mem_policy_flags ==
739 			    LGRP_MP_FLAG_EXTEND_DOWN) {
740 				if (npolicy != lgrp_mem_default_policy) {
741 					mpolicy = npolicy;
742 				} else {
743 					mpolicy = lgrp_mem_policy_default(
744 					    nseg->s_size + seg->s_size,
745 					    a->type);
746 				}
747 			}
748 
749 			if (mpolicy == npolicy &&
750 			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
751 				crfree(cred);
752 				ASSERT(nseg->s_szc == 0 ||
753 				    (a->szc == nseg->s_szc &&
754 				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
755 				    IS_P2ALIGNED(nseg->s_size, pgsz)));
756 				return (0);
757 			}
758 		}
759 	}
760 
761 	if (a->vp != NULL) {
762 		VN_HOLD(a->vp);
763 		if (a->type == MAP_SHARED)
764 			lgrp_shm_policy_init(NULL, a->vp);
765 	}
766 	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
767 
768 	seg->s_ops = &segvn_ops;
769 	seg->s_data = (void *)svd;
770 	seg->s_szc = a->szc;
771 
772 	svd->seg = seg;
773 	svd->vp = a->vp;
774 	/*
775 	 * Anonymous mappings have no backing file, so the offset is meaningless.
776 	 */
777 	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
778 	svd->prot = a->prot;
779 	svd->maxprot = a->maxprot;
780 	svd->pageprot = 0;
781 	svd->type = a->type;
782 	svd->vpage = NULL;
783 	svd->cred = cred;
784 	svd->advice = MADV_NORMAL;
785 	svd->pageadvice = 0;
786 	svd->flags = (ushort_t)a->flags;
787 	svd->softlockcnt = 0;
788 	svd->rcookie = HAT_INVALID_REGION_COOKIE;
789 
790 	if (a->szc != 0 && a->vp != NULL) {
791 		segvn_setvnode_mpss(a->vp);
792 	}
793 	if (svd->type == MAP_SHARED && svd->vp != NULL &&
794 	    (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
795 		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
796 		segvn_inval_trcache(svd->vp);
797 	}
798 
799 	amp = a->amp;
800 	if ((svd->amp = amp) == NULL) {
801 		svd->anon_index = 0;
802 		if (svd->type == MAP_SHARED) {
803 			svd->swresv = 0;
804 			/*
805 			 * Shared mappings to a vp need no other setup.
806 			 * If we have a shared mapping to an anon_map object
807 			 * which hasn't been allocated yet,  allocate the
808 			 * struct now so that it will be properly shared
809 			 * by remembering the swap reservation there.
810 			 */
811 			if (a->vp == NULL) {
812 				svd->amp = anonmap_alloc(seg->s_size, swresv,
813 				    ANON_SLEEP);
814 				svd->amp->a_szc = seg->s_szc;
815 			}
816 		} else {
817 			/*
818 			 * Private mapping (with or without a vp).
819 			 * Allocate anon_map when needed.
820 			 */
821 			svd->swresv = swresv;
822 		}
823 	} else {
824 		pgcnt_t anon_num;
825 
826 		/*
827 		 * Mapping to an existing anon_map structure without a vp.
828 		 * For now we will ensure that the segment size isn't larger
829 		 * than the size - offset gives us.  Later on we may wish to
830 		 * have the anon array dynamically allocated itself so that
831 		 * we don't always have to allocate all the anon pointer slots.
832 		 * This of course involves adding extra code to check that we
833 		 * aren't trying to use an anon pointer slot beyond the end
834 		 * of the currently allocated anon array.
835 		 */
836 		if ((amp->size - a->offset) < seg->s_size) {
837 			panic("segvn_create anon_map size");
838 			/*NOTREACHED*/
839 		}
840 
841 		anon_num = btopr(a->offset);
842 
843 		if (a->type == MAP_SHARED) {
844 			/*
845 			 * SHARED mapping to a given anon_map.
846 			 */
847 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
848 			amp->refcnt++;
849 			if (a->szc > amp->a_szc) {
850 				amp->a_szc = a->szc;
851 			}
852 			ANON_LOCK_EXIT(&amp->a_rwlock);
853 			svd->anon_index = anon_num;
854 			svd->swresv = 0;
855 		} else {
856 			/*
857 			 * PRIVATE mapping to a given anon_map.
858 			 * Make sure that all the needed anon
859 			 * structures are created (so that we will
860 			 * share the underlying pages if nothing
861 			 * is written by this mapping) and then
862 			 * duplicate the anon array as is done
863 			 * when a privately mapped segment is dup'ed.
864 			 */
865 			struct anon *ap;
866 			caddr_t addr;
867 			caddr_t eaddr;
868 			ulong_t	anon_idx;
869 			int hat_flag = HAT_LOAD;
870 
871 			if (svd->flags & MAP_TEXT) {
872 				hat_flag |= HAT_LOAD_TEXT;
873 			}
874 
875 			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
876 			svd->amp->a_szc = seg->s_szc;
877 			svd->anon_index = 0;
878 			svd->swresv = swresv;
879 
880 			/*
881 			 * Prevent 2 threads from allocating anon
882 			 * slots simultaneously.
883 			 */
884 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
885 			eaddr = seg->s_base + seg->s_size;
886 
887 			for (anon_idx = anon_num, addr = seg->s_base;
888 			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
889 				page_t *pp;
890 
891 				if ((ap = anon_get_ptr(amp->ahp,
892 				    anon_idx)) != NULL)
893 					continue;
894 
895 				/*
896 				 * Allocate the anon struct now.
897 				 * Might as well load up translation
898 				 * to the page while we're at it...
899 				 */
900 				pp = anon_zero(seg, addr, &ap, cred);
901 				if (ap == NULL || pp == NULL) {
902 					panic("segvn_create anon_zero");
903 					/*NOTREACHED*/
904 				}
905 
906 				/*
907 				 * Re-acquire the anon_map lock and
908 				 * initialize the anon array entry.
909 				 */
910 				ASSERT(anon_get_ptr(amp->ahp,
911 				    anon_idx) == NULL);
912 				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
913 				    ANON_SLEEP);
914 
915 				ASSERT(seg->s_szc == 0);
916 				ASSERT(!IS_VMODSORT(pp->p_vnode));
917 
918 				ASSERT(use_rgn == 0);
919 				hat_memload(seg->s_as->a_hat, addr, pp,
920 				    svd->prot & ~PROT_WRITE, hat_flag);
921 
922 				page_unlock(pp);
923 			}
924 			ASSERT(seg->s_szc == 0);
925 			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
926 			    0, seg->s_size);
927 			ANON_LOCK_EXIT(&amp->a_rwlock);
928 		}
929 	}
930 
931 	/*
932 	 * Set default memory allocation policy for segment
933 	 *
934 	 * Always set policy for private memory at least for initialization
935 	 * even if this is a shared memory segment
936 	 */
937 	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
938 
939 	if (svd->type == MAP_SHARED)
940 		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
941 		    svd->vp, svd->offset, seg->s_size);
942 
943 	if (use_rgn) {
944 		ASSERT(!trok);
945 		ASSERT(svd->amp == NULL);
946 		svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
947 		    seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
948 		    (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
949 		    HAT_REGION_TEXT);
950 	}
951 
952 	ASSERT(!trok || !(svd->prot & PROT_WRITE));
953 	svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;
954 
955 	return (0);
956 }
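
/*
 * Illustrative sketch (hypothetical caller, not code from this file): a
 * VOP_MAP()/mmap() style caller typically fills in a segvn_crargs and hands
 * it to as_map() together with segvn_create.  The variable names below are
 * made up; the fields are the ones consumed by segvn_create() above.
 *
 *	struct segvn_crargs vn_a;
 *
 *	vn_a.vp = vp;
 *	vn_a.offset = off;
 *	vn_a.type = flags & MAP_TYPE;
 *	vn_a.prot = prot;
 *	vn_a.maxprot = maxprot;
 *	vn_a.flags = flags & ~MAP_TYPE;
 *	vn_a.cred = cr;
 *	vn_a.amp = NULL;
 *	vn_a.szc = 0;
 *	vn_a.lgrp_mem_policy_flags = 0;
 *	error = as_map(as, addr, len, segvn_create, &vn_a);
 */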
957 
958 /*
959  * Concatenate two existing segments, if possible.
960  * Return 0 on success, -1 if the two segments are not compatible
961  * or -2 on memory allocation failure.
962  * If amp_cat == 1 then try to concatenate segments with anon maps.
963  */
964 static int
965 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
966 {
967 	struct segvn_data *svd1 = seg1->s_data;
968 	struct segvn_data *svd2 = seg2->s_data;
969 	struct anon_map *amp1 = svd1->amp;
970 	struct anon_map *amp2 = svd2->amp;
971 	struct vpage *vpage1 = svd1->vpage;
972 	struct vpage *vpage2 = svd2->vpage, *nvpage = NULL;
973 	size_t size, nvpsize;
974 	pgcnt_t npages1, npages2;
975 
976 	ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as);
977 	ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
978 	ASSERT(seg1->s_ops == seg2->s_ops);
979 
980 	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) ||
981 	    HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
982 		return (-1);
983 	}
984 
985 	/* both segments exist, try to merge them */
986 #define	incompat(x)	(svd1->x != svd2->x)
987 	if (incompat(vp) || incompat(maxprot) ||
988 	    (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) ||
989 	    (!svd1->pageprot && !svd2->pageprot && incompat(prot)) ||
990 	    incompat(type) || incompat(cred) || incompat(flags) ||
991 	    seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) ||
992 	    (svd2->softlockcnt > 0))
993 		return (-1);
994 #undef incompat
995 
996 	/*
997 	 * vp == NULL implies zfod, offset doesn't matter
998 	 */
999 	if (svd1->vp != NULL &&
1000 	    svd1->offset + seg1->s_size != svd2->offset) {
1001 		return (-1);
1002 	}
1003 
1004 	/*
1005 	 * Don't concatenate if either segment uses text replication.
1006 	 */
1007 	if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) {
1008 		return (-1);
1009 	}
1010 
1011 	/*
1012 	 * Fail early if we're not supposed to concatenate
1013 	 * segments with non NULL amp.
1014 	 */
1015 	if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
1016 		return (-1);
1017 	}
1018 
1019 	if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
1020 		if (amp1 != amp2) {
1021 			return (-1);
1022 		}
1023 		if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
1024 		    svd2->anon_index) {
1025 			return (-1);
1026 		}
1027 		ASSERT(amp1 == NULL || amp1->refcnt >= 2);
1028 	}
1029 
1030 	/*
1031 	 * If either seg has vpages, create a new merged vpage array.
1032 	 */
1033 	if (vpage1 != NULL || vpage2 != NULL) {
1034 		struct vpage *vp;
1035 
1036 		npages1 = seg_pages(seg1);
1037 		npages2 = seg_pages(seg2);
1038 		nvpsize = vpgtob(npages1 + npages2);
1039 
1040 		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
1041 			return (-2);
1042 		}
1043 
1044 		if (vpage1 != NULL) {
1045 			bcopy(vpage1, nvpage, vpgtob(npages1));
1046 		} else {
1047 			for (vp = nvpage; vp < nvpage + npages1; vp++) {
1048 				VPP_SETPROT(vp, svd1->prot);
1049 				VPP_SETADVICE(vp, svd1->advice);
1050 			}
1051 		}
1052 
1053 		if (vpage2 != NULL) {
1054 			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
1055 		} else {
1056 			for (vp = nvpage + npages1;
1057 			    vp < nvpage + npages1 + npages2; vp++) {
1058 				VPP_SETPROT(vp, svd2->prot);
1059 				VPP_SETADVICE(vp, svd2->advice);
1060 			}
1061 		}
1062 	}
1063 
1064 	/*
1065 	 * If either segment has private pages, create a new merged anon
1066  * array. If merging shared anon segments, just decrement the anon map's
1067 	 * refcnt.
1068 	 */
1069 	if (amp1 != NULL && svd1->type == MAP_SHARED) {
1070 		ASSERT(amp1 == amp2 && svd1->vp == NULL);
1071 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1072 		ASSERT(amp1->refcnt >= 2);
1073 		amp1->refcnt--;
1074 		ANON_LOCK_EXIT(&amp1->a_rwlock);
1075 		svd2->amp = NULL;
1076 	} else if (amp1 != NULL || amp2 != NULL) {
1077 		struct anon_hdr *nahp;
1078 		struct anon_map *namp = NULL;
1079 		size_t asize;
1080 
1081 		ASSERT(svd1->type == MAP_PRIVATE);
1082 
1083 		asize = seg1->s_size + seg2->s_size;
1084 		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
1085 			if (nvpage != NULL) {
1086 				kmem_free(nvpage, nvpsize);
1087 			}
1088 			return (-2);
1089 		}
1090 		if (amp1 != NULL) {
1091 			/*
1092 			 * XXX anon rwlock is not really needed because
1093 			 * this is a private segment and we are writers.
1094 			 */
1095 			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1096 			ASSERT(amp1->refcnt == 1);
1097 			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
1098 			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
1099 				anon_release(nahp, btop(asize));
1100 				ANON_LOCK_EXIT(&amp1->a_rwlock);
1101 				if (nvpage != NULL) {
1102 					kmem_free(nvpage, nvpsize);
1103 				}
1104 				return (-2);
1105 			}
1106 		}
1107 		if (amp2 != NULL) {
1108 			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1109 			ASSERT(amp2->refcnt == 1);
1110 			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
1111 			    nahp, btop(seg1->s_size), btop(seg2->s_size),
1112 			    ANON_NOSLEEP)) {
1113 				anon_release(nahp, btop(asize));
1114 				ANON_LOCK_EXIT(&amp2->a_rwlock);
1115 				if (amp1 != NULL) {
1116 					ANON_LOCK_EXIT(&amp1->a_rwlock);
1117 				}
1118 				if (nvpage != NULL) {
1119 					kmem_free(nvpage, nvpsize);
1120 				}
1121 				return (-2);
1122 			}
1123 		}
1124 		if (amp1 != NULL) {
1125 			namp = amp1;
1126 			anon_release(amp1->ahp, btop(amp1->size));
1127 		}
1128 		if (amp2 != NULL) {
1129 			if (namp == NULL) {
1130 				ASSERT(amp1 == NULL);
1131 				namp = amp2;
1132 				anon_release(amp2->ahp, btop(amp2->size));
1133 			} else {
1134 				amp2->refcnt--;
1135 				ANON_LOCK_EXIT(&amp2->a_rwlock);
1136 				anonmap_free(amp2);
1137 			}
1138 			svd2->amp = NULL; /* needed for seg_free */
1139 		}
1140 		namp->ahp = nahp;
1141 		namp->size = asize;
1142 		svd1->amp = namp;
1143 		svd1->anon_index = 0;
1144 		ANON_LOCK_EXIT(&namp->a_rwlock);
1145 	}
1146 	/*
1147 	 * Now free the old vpage structures.
1148 	 */
1149 	if (nvpage != NULL) {
1150 		if (vpage1 != NULL) {
1151 			kmem_free(vpage1, vpgtob(npages1));
1152 		}
1153 		if (vpage2 != NULL) {
1154 			svd2->vpage = NULL;
1155 			kmem_free(vpage2, vpgtob(npages2));
1156 		}
1157 		if (svd2->pageprot) {
1158 			svd1->pageprot = 1;
1159 		}
1160 		if (svd2->pageadvice) {
1161 			svd1->pageadvice = 1;
1162 		}
1163 		svd1->vpage = nvpage;
1164 	}
1165 
1166 	/* all looks ok, merge segments */
1167 	svd1->swresv += svd2->swresv;
1168 	svd2->swresv = 0;  /* so seg_free doesn't release swap space */
1169 	size = seg2->s_size;
1170 	seg_free(seg2);
1171 	seg1->s_size += size;
1172 	return (0);
1173 }
1174 
1175 /*
1176  * Extend the previous segment (seg1) to include the
1177  * new segment (seg2 + a), if possible.
1178  * Return 0 on success.
1179  */
1180 static int
1181 segvn_extend_prev(seg1, seg2, a, swresv)
1182 	struct seg *seg1, *seg2;
1183 	struct segvn_crargs *a;
1184 	size_t swresv;
1185 {
1186 	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
1187 	size_t size;
1188 	struct anon_map *amp1;
1189 	struct vpage *new_vpage;
1190 
1191 	/*
1192 	 * We don't need any segment level locks for "segvn" data
1193 	 * since the address space is "write" locked.
1194 	 */
1195 	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock));
1196 
1197 	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
1198 		return (-1);
1199 	}
1200 
1201 	/* second segment is new, try to extend first */
1202 	/* XXX - should also check cred */
1203 	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
1204 	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
1205 	    svd1->type != a->type || svd1->flags != a->flags ||
1206 	    seg1->s_szc != a->szc)
1207 		return (-1);
1208 
1209 	/* vp == NULL implies zfod, offset doesn't matter */
1210 	if (svd1->vp != NULL &&
1211 	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
1212 		return (-1);
1213 
1214 	if (svd1->tr_state != SEGVN_TR_OFF) {
1215 		return (-1);
1216 	}
1217 
1218 	amp1 = svd1->amp;
1219 	if (amp1) {
1220 		pgcnt_t newpgs;
1221 
1222 		/*
1223 		 * Segment has private pages, can data structures
1224 		 * be expanded?
1225 		 *
1226 		 * Acquire the anon_map lock to prevent it from changing,
1227 		 * if it is shared.  This ensures that the anon_map
1228 		 * will not change while a thread which has a read/write
1229 		 * lock on an address space references it.
1230 		 * XXX - Don't need the anon_map lock at all if "refcnt"
1231 		 * is 1.
1232 		 *
1233 		 * Can't grow a MAP_SHARED segment with an anonmap because
1234 		 * there may be existing anon slots where we want to extend
1235 		 * the segment and we wouldn't know what to do with them
1236 		 * (e.g., for tmpfs right thing is to just leave them there,
1237 		 * for /dev/zero they should be cleared out).
1238 		 */
1239 		if (svd1->type == MAP_SHARED)
1240 			return (-1);
1241 
1242 		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
1243 		if (amp1->refcnt > 1) {
1244 			ANON_LOCK_EXIT(&amp1->a_rwlock);
1245 			return (-1);
1246 		}
1247 		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
1248 		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);
1249 
1250 		if (newpgs == 0) {
1251 			ANON_LOCK_EXIT(&amp1->a_rwlock);
1252 			return (-1);
1253 		}
1254 		amp1->size = ptob(newpgs);
1255 		ANON_LOCK_EXIT(&amp1->a_rwlock);
1256 	}
1257 	if (svd1->vpage != NULL) {
1258 		struct vpage *vp, *evp;
1259 		new_vpage =
1260 		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1261 		    KM_NOSLEEP);
1262 		if (new_vpage == NULL)
1263 			return (-1);
1264 		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
1265 		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
1266 		svd1->vpage = new_vpage;
1267 
1268 		vp = new_vpage + seg_pages(seg1);
1269 		evp = vp + seg_pages(seg2);
1270 		for (; vp < evp; vp++)
1271 			VPP_SETPROT(vp, a->prot);
1272 	}
1273 	size = seg2->s_size;
1274 	seg_free(seg2);
1275 	seg1->s_size += size;
1276 	svd1->swresv += swresv;
1277 	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
1278 	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
1279 	    (svd1->vp->v_flag & VVMEXEC)) {
1280 		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
1281 		segvn_inval_trcache(svd1->vp);
1282 	}
1283 	return (0);
1284 }
1285 
1286 /*
1287  * Extend the next segment (seg2) to include the
1288  * new segment (seg1 + a), if possible.
1289  * Return 0 on success.
1290  */
1291 static int
1292 segvn_extend_next(
1293 	struct seg *seg1,
1294 	struct seg *seg2,
1295 	struct segvn_crargs *a,
1296 	size_t swresv)
1297 {
1298 	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
1299 	size_t size;
1300 	struct anon_map *amp2;
1301 	struct vpage *new_vpage;
1302 
1303 	/*
1304 	 * We don't need any segment level locks for "segvn" data
1305 	 * since the address space is "write" locked.
1306 	 */
1307 	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock));
1308 
1309 	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
1310 		return (-1);
1311 	}
1312 
1313 	/* first segment is new, try to extend second */
1314 	/* XXX - should also check cred */
1315 	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
1316 	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
1317 	    svd2->type != a->type || svd2->flags != a->flags ||
1318 	    seg2->s_szc != a->szc)
1319 		return (-1);
1320 	/* vp == NULL implies zfod, offset doesn't matter */
1321 	if (svd2->vp != NULL &&
1322 	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
1323 		return (-1);
1324 
1325 	if (svd2->tr_state != SEGVN_TR_OFF) {
1326 		return (-1);
1327 	}
1328 
1329 	amp2 = svd2->amp;
1330 	if (amp2) {
1331 		pgcnt_t newpgs;
1332 
1333 		/*
1334 		 * Segment has private pages, can data structures
1335 		 * be expanded?
1336 		 *
1337 		 * Acquire the anon_map lock to prevent it from changing,
1338 		 * if it is shared.  This ensures that the anon_map
1339 		 * will not change while a thread which has a read/write
1340 		 * lock on an address space references it.
1341 		 *
1342 		 * XXX - Don't need the anon_map lock at all if "refcnt"
1343 		 * is 1.
1344 		 */
1345 		if (svd2->type == MAP_SHARED)
1346 			return (-1);
1347 
1348 		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
1349 		if (amp2->refcnt > 1) {
1350 			ANON_LOCK_EXIT(&amp2->a_rwlock);
1351 			return (-1);
1352 		}
1353 		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
1354 		    btop(seg2->s_size), btop(seg1->s_size),
1355 		    ANON_NOSLEEP | ANON_GROWDOWN);
1356 
1357 		if (newpgs == 0) {
1358 			ANON_LOCK_EXIT(&amp2->a_rwlock);
1359 			return (-1);
1360 		}
1361 		amp2->size = ptob(newpgs);
1362 		ANON_LOCK_EXIT(&amp2->a_rwlock);
1363 	}
1364 	if (svd2->vpage != NULL) {
1365 		struct vpage *vp, *evp;
1366 		new_vpage =
1367 		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
1368 		    KM_NOSLEEP);
1369 		if (new_vpage == NULL) {
1370 			/* Not merging segments so adjust anon_index back */
1371 			if (amp2)
1372 				svd2->anon_index += seg_pages(seg1);
1373 			return (-1);
1374 		}
1375 		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
1376 		    vpgtob(seg_pages(seg2)));
1377 		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
1378 		svd2->vpage = new_vpage;
1379 
1380 		vp = new_vpage;
1381 		evp = vp + seg_pages(seg1);
1382 		for (; vp < evp; vp++)
1383 			VPP_SETPROT(vp, a->prot);
1384 	}
1385 	size = seg1->s_size;
1386 	seg_free(seg1);
1387 	seg2->s_size += size;
1388 	seg2->s_base -= size;
1389 	svd2->offset -= size;
1390 	svd2->swresv += swresv;
1391 	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
1392 	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
1393 	    (svd2->vp->v_flag & VVMEXEC)) {
1394 		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
1395 		segvn_inval_trcache(svd2->vp);
1396 	}
1397 	return (0);
1398 }
1399 
1400 static int
1401 segvn_dup(struct seg *seg, struct seg *newseg)
1402 {
1403 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1404 	struct segvn_data *newsvd;
1405 	pgcnt_t npages = seg_pages(seg);
1406 	int error = 0;
1407 	uint_t prot;
1408 	size_t len;
1409 	struct anon_map *amp;
1410 
1411 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1412 
1413 	/*
1414 	 * If segment has anon reserved, reserve more for the new seg.
1415 	 * For a MAP_NORESERVE segment swresv will be a count of all the
1416 	 * allocated anon slots; thus we reserve for the child as many slots
1417 	 * as the parent has allocated. This semantic prevents the child or
1418  * parent from dying during a copy-on-write fault caused by trying
1419 	 * to write a shared pre-existing anon page.
1420 	 */
1421 	if ((len = svd->swresv) != 0) {
1422 		if (anon_resv(svd->swresv) == 0)
1423 			return (ENOMEM);
1424 
1425 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
1426 		    seg, len, 0);
1427 	}
1428 
1429 	newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
1430 
1431 	newseg->s_ops = &segvn_ops;
1432 	newseg->s_data = (void *)newsvd;
1433 	newseg->s_szc = seg->s_szc;
1434 
1435 	newsvd->seg = newseg;
1436 	if ((newsvd->vp = svd->vp) != NULL) {
1437 		VN_HOLD(svd->vp);
1438 		if (svd->type == MAP_SHARED)
1439 			lgrp_shm_policy_init(NULL, svd->vp);
1440 	}
1441 	newsvd->offset = svd->offset;
1442 	newsvd->prot = svd->prot;
1443 	newsvd->maxprot = svd->maxprot;
1444 	newsvd->pageprot = svd->pageprot;
1445 	newsvd->type = svd->type;
1446 	newsvd->cred = svd->cred;
1447 	crhold(newsvd->cred);
1448 	newsvd->advice = svd->advice;
1449 	newsvd->pageadvice = svd->pageadvice;
1450 	newsvd->swresv = svd->swresv;
1451 	newsvd->flags = svd->flags;
1452 	newsvd->softlockcnt = 0;
1453 	newsvd->policy_info = svd->policy_info;
1454 	newsvd->rcookie = HAT_INVALID_REGION_COOKIE;
1455 
1456 	if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) {
1457 		/*
1458 		 * Not attaching to a shared anon object.
1459 		 */
1460 		ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) ||
1461 		    svd->tr_state == SEGVN_TR_OFF);
1462 		if (svd->tr_state == SEGVN_TR_ON) {
1463 			ASSERT(newsvd->vp != NULL && amp != NULL);
1464 			newsvd->tr_state = SEGVN_TR_INIT;
1465 		} else {
1466 			newsvd->tr_state = svd->tr_state;
1467 		}
1468 		newsvd->amp = NULL;
1469 		newsvd->anon_index = 0;
1470 	} else {
1471 		/* regions for now are only used on pure vnode segments */
1472 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
1473 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
1474 		newsvd->tr_state = SEGVN_TR_OFF;
1475 		if (svd->type == MAP_SHARED) {
1476 			newsvd->amp = amp;
1477 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1478 			amp->refcnt++;
1479 			ANON_LOCK_EXIT(&amp->a_rwlock);
1480 			newsvd->anon_index = svd->anon_index;
1481 		} else {
1482 			int reclaim = 1;
1483 
1484 			/*
1485 			 * Allocate and initialize new anon_map structure.
1486 			 */
1487 			newsvd->amp = anonmap_alloc(newseg->s_size, 0,
1488 			    ANON_SLEEP);
1489 			newsvd->amp->a_szc = newseg->s_szc;
1490 			newsvd->anon_index = 0;
1491 
1492 			/*
1493 			 * We don't have to acquire the anon_map lock
1494 			 * for the new segment (since it belongs to an
1495 			 * address space that is still not associated
1496 			 * with any process), or the segment in the old
1497 			 * address space (since all threads in it
1498 			 * are stopped while duplicating the address space).
1499 			 */
1500 
1501 			/*
1502 			 * The goal of the following code is to make sure that
1503 			 * softlocked pages do not end up as copy on write
1504 			 * pages.  This would cause problems where one
1505 			 * thread writes to a page that is COW and a different
1506 			 * thread in the same process has softlocked it.  The
1507 			 * softlock lock would move away from this process
1508 			 * because the write would cause this process to get
1509 			 * a copy (without the softlock).
1510 			 *
1511 			 * The strategy here is to just break the
1512 			 * sharing on pages that could possibly be
1513 			 * softlocked.
1514 			 */
1515 retry:
1516 			if (svd->softlockcnt) {
1517 				struct anon *ap, *newap;
1518 				size_t i;
1519 				uint_t vpprot;
1520 				page_t *anon_pl[1+1], *pp;
1521 				caddr_t addr;
1522 				ulong_t old_idx = svd->anon_index;
1523 				ulong_t new_idx = 0;
1524 
1525 				/*
1526 				 * The softlock count might be non zero
1527 				 * because some pages are still stuck in the
1528 				 * cache for lazy reclaim. Flush the cache
1529 				 * now. This should drop the count to zero.
1530 				 * [or there is really I/O going on to these
1531 				 * pages]. Note, we have the writers lock so
1532 				 * nothing gets inserted during the flush.
1533 				 */
1534 				if (reclaim == 1) {
1535 					segvn_purge(seg);
1536 					reclaim = 0;
1537 					goto retry;
1538 				}
1539 				i = btopr(seg->s_size);
1540 				addr = seg->s_base;
1541 				/*
1542 				 * XXX break cow sharing using PAGESIZE
1543 				 * pages. They will be relocated into larger
1544 				 * pages at fault time.
1545 				 */
1546 				while (i-- > 0) {
1547 					if (ap = anon_get_ptr(amp->ahp,
1548 					    old_idx)) {
1549 						error = anon_getpage(&ap,
1550 						    &vpprot, anon_pl, PAGESIZE,
1551 						    seg, addr, S_READ,
1552 						    svd->cred);
1553 						if (error) {
1554 							newsvd->vpage = NULL;
1555 							goto out;
1556 						}
1557 						/*
1558 						 * prot need not be computed
1559 						 * below 'cause anon_private is
1560 						 * going to ignore it anyway
1561 						 * as child doesn't inherit
1562 						 * pagelock from parent.
1563 						 */
1564 						prot = svd->pageprot ?
1565 						    VPP_PROT(
1566 						    &svd->vpage[
1567 						    seg_page(seg, addr)])
1568 						    : svd->prot;
1569 						pp = anon_private(&newap,
1570 						    newseg, addr, prot,
1571 						    anon_pl[0],	0,
1572 						    newsvd->cred);
1573 						if (pp == NULL) {
1574 							/* no mem abort */
1575 							newsvd->vpage = NULL;
1576 							error = ENOMEM;
1577 							goto out;
1578 						}
1579 						(void) anon_set_ptr(
1580 						    newsvd->amp->ahp, new_idx,
1581 						    newap, ANON_SLEEP);
1582 						page_unlock(pp);
1583 					}
1584 					addr += PAGESIZE;
1585 					old_idx++;
1586 					new_idx++;
1587 				}
1588 			} else {	/* common case */
1589 				if (seg->s_szc != 0) {
1590 					/*
1591 					 * If at least one of the anon slots of a
1592 					 * large page exists then make sure
1593 					 * all anon slots of a large page
1594 					 * exist to avoid partial cow sharing
1595 					 * of a large page in the future.
1596 					 */
1597 					anon_dup_fill_holes(amp->ahp,
1598 					    svd->anon_index, newsvd->amp->ahp,
1599 					    0, seg->s_size, seg->s_szc,
1600 					    svd->vp != NULL);
1601 				} else {
1602 					anon_dup(amp->ahp, svd->anon_index,
1603 					    newsvd->amp->ahp, 0, seg->s_size);
1604 				}
1605 
1606 				hat_clrattr(seg->s_as->a_hat, seg->s_base,
1607 				    seg->s_size, PROT_WRITE);
1608 			}
1609 		}
1610 	}
1611 	/*
1612 	 * If necessary, create a vpage structure for the new segment.
1613 	 * Do not copy any page lock indications.
1614 	 */
1615 	if (svd->vpage != NULL) {
1616 		uint_t i;
1617 		struct vpage *ovp = svd->vpage;
1618 		struct vpage *nvp;
1619 
1620 		nvp = newsvd->vpage =
1621 		    kmem_alloc(vpgtob(npages), KM_SLEEP);
1622 		for (i = 0; i < npages; i++) {
1623 			*nvp = *ovp++;
1624 			VPP_CLRPPLOCK(nvp++);
1625 		}
1626 	} else
1627 		newsvd->vpage = NULL;
1628 
1629 	/* Inform the vnode of the new mapping */
1630 	if (newsvd->vp != NULL) {
1631 		error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset,
1632 		    newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot,
1633 		    newsvd->maxprot, newsvd->type, newsvd->cred, NULL);
1634 	}
1635 out:
1636 	if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1637 		ASSERT(newsvd->amp == NULL);
1638 		ASSERT(newsvd->tr_state == SEGVN_TR_OFF);
1639 		newsvd->rcookie = svd->rcookie;
1640 		hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
1641 	}
1642 	return (error);
1643 }
1644 
1645 
1646 /*
1647  * callback function to invoke free_vp_pages() for only those pages actually
1648  * processed by the HAT when a shared region is destroyed.
1649  */
1650 extern int free_pages;
1651 
1652 static void
1653 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
1654     size_t r_size, void *r_obj, u_offset_t r_objoff)
1655 {
1656 	u_offset_t off;
1657 	size_t len;
1658 	vnode_t *vp = (vnode_t *)r_obj;
1659 
1660 	ASSERT(eaddr > saddr);
1661 	ASSERT(saddr >= r_saddr);
1662 	ASSERT(saddr < r_saddr + r_size);
1663 	ASSERT(eaddr > r_saddr);
1664 	ASSERT(eaddr <= r_saddr + r_size);
1665 	ASSERT(vp != NULL);
1666 
1667 	if (!free_pages) {
1668 		return;
1669 	}
1670 
1671 	len = eaddr - saddr;
1672 	off = (saddr - r_saddr) + r_objoff;
1673 	free_vp_pages(vp, off, len);
1674 }
1675 
1676 /*
1677  * callback function used by segvn_unmap to invoke free_vp_pages() for only
1678  * those pages actually processed by the HAT
1679  */
1680 static void
1681 segvn_hat_unload_callback(hat_callback_t *cb)
1682 {
1683 	struct seg		*seg = cb->hcb_data;
1684 	struct segvn_data	*svd = (struct segvn_data *)seg->s_data;
1685 	size_t			len;
1686 	u_offset_t		off;
1687 
1688 	ASSERT(svd->vp != NULL);
1689 	ASSERT(cb->hcb_end_addr > cb->hcb_start_addr);
1690 	ASSERT(cb->hcb_start_addr >= seg->s_base);
1691 
1692 	len = cb->hcb_end_addr - cb->hcb_start_addr;
1693 	off = cb->hcb_start_addr - seg->s_base;
1694 	free_vp_pages(svd->vp, svd->offset + off, len);
1695 }
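
/*
 * For reference, segvn_unmap() below wires this callback up roughly as:
 *
 *	callback.hcb_data = seg;
 *	callback.hcb_function = segvn_hat_unload_callback;
 *	hat_unload_callback(seg->s_as->a_hat, addr, len, HAT_UNLOAD_UNMAP,
 *	    &callback);
 *
 * so free_vp_pages() is applied only to the ranges the HAT actually unloads.
 */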
1696 
1697 static int
1698 segvn_unmap(struct seg *seg, caddr_t addr, size_t len)
1699 {
1700 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
1701 	struct segvn_data *nsvd;
1702 	struct seg *nseg;
1703 	struct anon_map *amp;
1704 	pgcnt_t	opages;		/* old segment size in pages */
1705 	pgcnt_t	npages;		/* new segment size in pages */
1706 	pgcnt_t	dpages;		/* pages being deleted (unmapped) */
1707 	hat_callback_t callback;	/* used for free_vp_pages() */
1708 	hat_callback_t *cbp = NULL;
1709 	caddr_t nbase;
1710 	size_t nsize;
1711 	size_t oswresv;
1712 	int reclaim = 1;
1713 
1714 	/*
1715 	 * We don't need any segment level locks for "segvn" data
1716 	 * since the address space is "write" locked.
1717 	 */
1718 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1719 
1720 	/*
1721 	 * Fail the unmap if pages are SOFTLOCKed through this mapping.
1722 	 * softlockcnt is protected from change by the as write lock.
1723 	 */
1724 retry:
1725 	if (svd->softlockcnt > 0) {
1726 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
1727 		/*
1728 		 * Since we hold the writer's lock, nobody can fill
1729 		 * the cache during the purge. The flush either succeeds
1730 		 * or we still have pending I/Os.
1731 		 */
1732 		if (reclaim == 1) {
1733 			segvn_purge(seg);
1734 			reclaim = 0;
1735 			goto retry;
1736 		}
1737 		return (EAGAIN);
1738 	}
1739 
1740 	/*
1741 	 * Check for bad sizes
1742 	 */
1743 	if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size ||
1744 	    (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) {
1745 		panic("segvn_unmap");
1746 		/*NOTREACHED*/
1747 	}
1748 
1749 	if (seg->s_szc != 0) {
1750 		size_t pgsz = page_get_pagesize(seg->s_szc);
1751 		int err;
1752 		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
1753 			ASSERT(seg->s_base != addr || seg->s_size != len);
1754 			if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1755 				ASSERT(svd->amp == NULL);
1756 				ASSERT(svd->tr_state == SEGVN_TR_OFF);
1757 				hat_leave_region(seg->s_as->a_hat,
1758 				    svd->rcookie, HAT_REGION_TEXT);
1759 				svd->rcookie = HAT_INVALID_REGION_COOKIE;
1760 				/*
1761 				 * We could pass a flag to segvn_demote_range()
1762 				 * below to tell it not to do any unloads, but
1763 				 * this case is rare enough not to bother for
1764 				 * now.
1765 				 */
1766 			} else if (svd->tr_state == SEGVN_TR_INIT) {
1767 				svd->tr_state = SEGVN_TR_OFF;
1768 			} else if (svd->tr_state == SEGVN_TR_ON) {
1769 				ASSERT(svd->amp != NULL);
1770 				segvn_textunrepl(seg, 1);
1771 				ASSERT(svd->amp == NULL);
1772 				ASSERT(svd->tr_state == SEGVN_TR_OFF);
1773 			}
1774 			VM_STAT_ADD(segvnvmstats.demoterange[0]);
1775 			err = segvn_demote_range(seg, addr, len, SDR_END, 0);
1776 			if (err == 0) {
1777 				return (IE_RETRY);
1778 			}
1779 			return (err);
1780 		}
1781 	}
1782 
1783 	/* Inform the vnode of the unmapping. */
1784 	if (svd->vp) {
1785 		int error;
1786 
1787 		error = VOP_DELMAP(svd->vp,
1788 		    (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base),
1789 		    seg->s_as, addr, len, svd->prot, svd->maxprot,
1790 		    svd->type, svd->cred, NULL);
1791 
1792 		if (error == EAGAIN)
1793 			return (error);
1794 	}
1795 
1796 	/*
1797 	 * Remove any page locks set through this mapping.
1798 	 * If text replication is not off, no page locks could have been
1799 	 * established via this mapping.
1800 	 */
1801 	if (svd->tr_state == SEGVN_TR_OFF) {
1802 		(void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0);
1803 	}
1804 
1805 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
1806 		ASSERT(svd->amp == NULL);
1807 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
1808 		ASSERT(svd->type == MAP_PRIVATE);
1809 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
1810 		    HAT_REGION_TEXT);
1811 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
1812 	} else if (svd->tr_state == SEGVN_TR_ON) {
1813 		ASSERT(svd->amp != NULL);
1814 		ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE));
1815 		segvn_textunrepl(seg, 1);
1816 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
1817 	} else {
1818 		if (svd->tr_state != SEGVN_TR_OFF) {
1819 			ASSERT(svd->tr_state == SEGVN_TR_INIT);
1820 			svd->tr_state = SEGVN_TR_OFF;
1821 		}
1822 		/*
1823 		 * Unload any hardware translations in the range to be taken
1824 		 * out. A callback restricts free_vp_pages() to unloaded pages.
1825 		 */
1826 		if (svd->vp != NULL && free_pages != 0) {
1827 			callback.hcb_data = seg;
1828 			callback.hcb_function = segvn_hat_unload_callback;
1829 			cbp = &callback;
1830 		}
1831 		hat_unload_callback(seg->s_as->a_hat, addr, len,
1832 		    HAT_UNLOAD_UNMAP, cbp);
1833 
1834 		if (svd->type == MAP_SHARED && svd->vp != NULL &&
1835 		    (svd->vp->v_flag & VVMEXEC) &&
1836 		    ((svd->prot & PROT_WRITE) || svd->pageprot)) {
1837 			segvn_inval_trcache(svd->vp);
1838 		}
1839 	}
1840 
1841 	/*
1842 	 * Check for entire segment
1843 	 */
1844 	if (addr == seg->s_base && len == seg->s_size) {
1845 		seg_free(seg);
1846 		return (0);
1847 	}
1848 
1849 	opages = seg_pages(seg);
1850 	dpages = btop(len);
1851 	npages = opages - dpages;
1852 	amp = svd->amp;
1853 	ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);
1854 
1855 	/*
1856 	 * Check for beginning of segment
1857 	 */
1858 	if (addr == seg->s_base) {
1859 		if (svd->vpage != NULL) {
1860 			size_t nbytes;
1861 			struct vpage *ovpage;
1862 
1863 			ovpage = svd->vpage;	/* keep pointer to vpage */
1864 
1865 			nbytes = vpgtob(npages);
1866 			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
1867 			bcopy(&ovpage[dpages], svd->vpage, nbytes);
1868 
1869 			/* free up old vpage */
1870 			kmem_free(ovpage, vpgtob(opages));
1871 		}
1872 		if (amp != NULL) {
1873 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1874 			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
1875 				/*
1876 				 * Free up now unused parts of anon_map array.
1877 				 */
1878 				if (amp->a_szc == seg->s_szc) {
1879 					if (seg->s_szc != 0) {
1880 						anon_free_pages(amp->ahp,
1881 						    svd->anon_index, len,
1882 						    seg->s_szc);
1883 					} else {
1884 						anon_free(amp->ahp,
1885 						    svd->anon_index,
1886 						    len);
1887 					}
1888 				} else {
1889 					ASSERT(svd->type == MAP_SHARED);
1890 					ASSERT(amp->a_szc > seg->s_szc);
1891 					anon_shmap_free_pages(amp,
1892 					    svd->anon_index, len);
1893 				}
1894 
1895 				/*
1896 				 * Unreserve swap space for the
1897 				 * unmapped chunk of this segment if
1898 				 * it is MAP_SHARED.
1899 				 */
1900 				if (svd->type == MAP_SHARED) {
1901 					anon_unresv(len);
1902 					amp->swresv -= len;
1903 				}
1904 			}
1905 			ANON_LOCK_EXIT(&amp->a_rwlock);
1906 			svd->anon_index += dpages;
1907 		}
1908 		if (svd->vp != NULL)
1909 			svd->offset += len;
1910 
1911 		if (svd->swresv) {
1912 			if (svd->flags & MAP_NORESERVE) {
1913 				ASSERT(amp);
1914 				oswresv = svd->swresv;
1915 
1916 				svd->swresv = ptob(anon_pages(amp->ahp,
1917 				    svd->anon_index, npages));
1918 				anon_unresv(oswresv - svd->swresv);
1919 			} else {
1920 				anon_unresv(len);
1921 				svd->swresv -= len;
1922 			}
1923 			TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
1924 			    seg, len, 0);
1925 		}
1926 
1927 		seg->s_base += len;
1928 		seg->s_size -= len;
1929 		return (0);
1930 	}
1931 
1932 	/*
1933 	 * Check for end of segment
1934 	 */
1935 	if (addr + len == seg->s_base + seg->s_size) {
1936 		if (svd->vpage != NULL) {
1937 			size_t nbytes;
1938 			struct vpage *ovpage;
1939 
1940 			ovpage = svd->vpage;	/* keep pointer to vpage */
1941 
1942 			nbytes = vpgtob(npages);
1943 			svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
1944 			bcopy(ovpage, svd->vpage, nbytes);
1945 
1946 			/* free up old vpage */
1947 			kmem_free(ovpage, vpgtob(opages));
1948 
1949 		}
1950 		if (amp != NULL) {
1951 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1952 			if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
1953 				/*
1954 				 * Free up now unused parts of anon_map array.
1955 				 */
1956 				ulong_t an_idx = svd->anon_index + npages;
1957 				if (amp->a_szc == seg->s_szc) {
1958 					if (seg->s_szc != 0) {
1959 						anon_free_pages(amp->ahp,
1960 						    an_idx, len,
1961 						    seg->s_szc);
1962 					} else {
1963 						anon_free(amp->ahp, an_idx,
1964 						    len);
1965 					}
1966 				} else {
1967 					ASSERT(svd->type == MAP_SHARED);
1968 					ASSERT(amp->a_szc > seg->s_szc);
1969 					anon_shmap_free_pages(amp,
1970 					    an_idx, len);
1971 				}
1972 
1973 				/*
1974 				 * Unreserve swap space for the
1975 				 * unmapped chunk of this segment if
1976 				 * it is MAP_SHARED.
1977 				 */
1978 				if (svd->type == MAP_SHARED) {
1979 					anon_unresv(len);
1980 					amp->swresv -= len;
1981 				}
1982 			}
1983 			ANON_LOCK_EXIT(&amp->a_rwlock);
1984 		}
1985 
1986 		if (svd->swresv) {
1987 			if (svd->flags & MAP_NORESERVE) {
1988 				ASSERT(amp);
1989 				oswresv = svd->swresv;
1990 				svd->swresv = ptob(anon_pages(amp->ahp,
1991 				    svd->anon_index, npages));
1992 				anon_unresv(oswresv - svd->swresv);
1993 			} else {
1994 				anon_unresv(len);
1995 				svd->swresv -= len;
1996 			}
1997 			TRACE_3(TR_FAC_VM, TR_ANON_PROC,
1998 			    "anon proc:%p %lu %u", seg, len, 0);
1999 		}
2000 
2001 		seg->s_size -= len;
2002 		return (0);
2003 	}
2004 
2005 	/*
2006 	 * The section to be unmapped is in the middle of the segment,
2007 	 * so we have to split it into two segments.  nseg is made for
2008 	 * the high end while seg is cut down at the low end.
2009 	 */
2010 	nbase = addr + len;				/* new seg base */
2011 	nsize = (seg->s_base + seg->s_size) - nbase;	/* new seg size */
2012 	seg->s_size = addr - seg->s_base;		/* shrink old seg */
2013 	nseg = seg_alloc(seg->s_as, nbase, nsize);
2014 	if (nseg == NULL) {
2015 		panic("segvn_unmap seg_alloc");
2016 		/*NOTREACHED*/
2017 	}
2018 	nseg->s_ops = seg->s_ops;
2019 	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
2020 	nseg->s_data = (void *)nsvd;
2021 	nseg->s_szc = seg->s_szc;
2022 	*nsvd = *svd;
2023 	nsvd->seg = nseg;
2024 	nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base);
2025 	nsvd->swresv = 0;
2026 	nsvd->softlockcnt = 0;
2027 	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
2028 
2029 	if (svd->vp != NULL) {
2030 		VN_HOLD(nsvd->vp);
2031 		if (nsvd->type == MAP_SHARED)
2032 			lgrp_shm_policy_init(NULL, nsvd->vp);
2033 	}
2034 	crhold(svd->cred);
2035 
2036 	if (svd->vpage == NULL) {
2037 		nsvd->vpage = NULL;
2038 	} else {
2039 		/* need to split vpage into two arrays */
2040 		size_t nbytes;
2041 		struct vpage *ovpage;
2042 
2043 		ovpage = svd->vpage;		/* keep pointer to vpage */
2044 
2045 		npages = seg_pages(seg);	/* seg has shrunk */
2046 		nbytes = vpgtob(npages);
2047 		svd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2048 
2049 		bcopy(ovpage, svd->vpage, nbytes);
2050 
2051 		npages = seg_pages(nseg);
2052 		nbytes = vpgtob(npages);
2053 		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
2054 
2055 		bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes);
2056 
2057 		/* free up old vpage */
2058 		kmem_free(ovpage, vpgtob(opages));
2059 	}
2060 
2061 	if (amp == NULL) {
2062 		nsvd->amp = NULL;
2063 		nsvd->anon_index = 0;
2064 	} else {
2065 		/*
2066 		 * Need to create a new anon map for the new segment.
2067 		 * We'll also allocate a new smaller array for the old
2068 		 * smaller segment to save space.
2069 		 */
2070 		opages = btop((uintptr_t)(addr - seg->s_base));
2071 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2072 		if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
2073 			/*
2074 			 * Free up now unused parts of anon_map array.
2075 			 */
2076 			ulong_t an_idx = svd->anon_index + opages;
2077 			if (amp->a_szc == seg->s_szc) {
2078 				if (seg->s_szc != 0) {
2079 					anon_free_pages(amp->ahp, an_idx, len,
2080 					    seg->s_szc);
2081 				} else {
2082 					anon_free(amp->ahp, an_idx,
2083 					    len);
2084 				}
2085 			} else {
2086 				ASSERT(svd->type == MAP_SHARED);
2087 				ASSERT(amp->a_szc > seg->s_szc);
2088 				anon_shmap_free_pages(amp, an_idx, len);
2089 			}
2090 
2091 			/*
2092 			 * Unreserve swap space for the
2093 			 * unmapped chunk of this segment if
2094 			 * it is MAP_SHARED.
2095 			 */
2096 			if (svd->type == MAP_SHARED) {
2097 				anon_unresv(len);
2098 				amp->swresv -= len;
2099 			}
2100 		}
2101 		nsvd->anon_index = svd->anon_index +
2102 		    btop((uintptr_t)(nseg->s_base - seg->s_base));
2103 		if (svd->type == MAP_SHARED) {
2104 			amp->refcnt++;
2105 			nsvd->amp = amp;
2106 		} else {
2107 			struct anon_map *namp;
2108 			struct anon_hdr *nahp;
2109 
2110 			ASSERT(svd->type == MAP_PRIVATE);
2111 			nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
2112 			namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
2113 			namp->a_szc = seg->s_szc;
2114 			(void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp,
2115 			    0, btop(seg->s_size), ANON_SLEEP);
2116 			(void) anon_copy_ptr(amp->ahp, nsvd->anon_index,
2117 			    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
2118 			anon_release(amp->ahp, btop(amp->size));
2119 			svd->anon_index = 0;
2120 			nsvd->anon_index = 0;
2121 			amp->ahp = nahp;
2122 			amp->size = seg->s_size;
2123 			nsvd->amp = namp;
2124 		}
2125 		ANON_LOCK_EXIT(&amp->a_rwlock);
2126 	}
2127 	if (svd->swresv) {
2128 		if (svd->flags & MAP_NORESERVE) {
2129 			ASSERT(amp);
2130 			oswresv = svd->swresv;
2131 			svd->swresv = ptob(anon_pages(amp->ahp,
2132 			    svd->anon_index, btop(seg->s_size)));
2133 			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
2134 			    nsvd->anon_index, btop(nseg->s_size)));
2135 			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
2136 			anon_unresv(oswresv - (svd->swresv + nsvd->swresv));
2137 		} else {
2138 			if (seg->s_size + nseg->s_size + len != svd->swresv) {
2139 				panic("segvn_unmap: "
2140 				    "cannot split swap reservation");
2141 				/*NOTREACHED*/
2142 			}
2143 			anon_unresv(len);
2144 			svd->swresv = seg->s_size;
2145 			nsvd->swresv = nseg->s_size;
2146 		}
2147 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2148 		    seg, len, 0);
2149 	}
2150 
2151 	return (0);			/* I'm glad that's all over with! */
2152 }
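
/*
 * Summary of the partial-unmap cases handled by segvn_unmap() above
 * (descriptive only):
 *
 *	- unmap at the start of the segment: shift s_base, offset and
 *	  anon_index forward and shrink s_size;
 *	- unmap at the end of the segment: simply shrink s_size;
 *	- unmap in the middle: shrink the original seg to cover the low end
 *	  and allocate a new seg (nseg/nsvd) for the high end, splitting the
 *	  vpage array, anon map and swap reservation between the two.
 */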
2153 
2154 static void
2155 segvn_free(struct seg *seg)
2156 {
2157 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2158 	pgcnt_t npages = seg_pages(seg);
2159 	struct anon_map *amp;
2160 	size_t len;
2161 
2162 	/*
2163 	 * We don't need any segment level locks for "segvn" data
2164 	 * since the address space is "write" locked.
2165 	 */
2166 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2167 	ASSERT(svd->tr_state == SEGVN_TR_OFF);
2168 
2169 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2170 
2171 	/*
2172 	 * Be sure to unlock pages. XXX Why do things get freed instead
2173 	 * of unmapped? XXX
2174 	 */
2175 	(void) segvn_lockop(seg, seg->s_base, seg->s_size,
2176 	    0, MC_UNLOCK, NULL, 0);
2177 
2178 	/*
2179 	 * Deallocate the vpage and anon pointers if necessary and possible.
2180 	 */
2181 	if (svd->vpage != NULL) {
2182 		kmem_free(svd->vpage, vpgtob(npages));
2183 		svd->vpage = NULL;
2184 	}
2185 	if ((amp = svd->amp) != NULL) {
2186 		/*
2187 		 * If there are no more references to this anon_map
2188 		 * structure, then deallocate the structure after freeing
2189 		 * up all the anon slot pointers that we can.
2190 		 */
2191 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2192 		ASSERT(amp->a_szc >= seg->s_szc);
2193 		if (--amp->refcnt == 0) {
2194 			if (svd->type == MAP_PRIVATE) {
2195 				/*
2196 				 * Private - we only need to anon_free
2197 				 * the part that this segment refers to.
2198 				 */
2199 				if (seg->s_szc != 0) {
2200 					anon_free_pages(amp->ahp,
2201 					    svd->anon_index, seg->s_size,
2202 					    seg->s_szc);
2203 				} else {
2204 					anon_free(amp->ahp, svd->anon_index,
2205 					    seg->s_size);
2206 				}
2207 			} else {
2208 				/*
2209 				 * Shared - anon_free the entire
2210 				 * anon_map's worth of stuff and
2211 				 * release any swap reservation.
2212 				 */
2213 				if (amp->a_szc != 0) {
2214 					anon_shmap_free_pages(amp, 0,
2215 					    amp->size);
2216 				} else {
2217 					anon_free(amp->ahp, 0, amp->size);
2218 				}
2219 				if ((len = amp->swresv) != 0) {
2220 					anon_unresv(len);
2221 					TRACE_3(TR_FAC_VM, TR_ANON_PROC,
2222 					    "anon proc:%p %lu %u", seg, len, 0);
2223 				}
2224 			}
2225 			svd->amp = NULL;
2226 			ANON_LOCK_EXIT(&amp->a_rwlock);
2227 			anonmap_free(amp);
2228 		} else if (svd->type == MAP_PRIVATE) {
2229 			/*
2230 			 * We had a private mapping which still has
2231 			 * a held anon_map so just free up all the
2232 			 * anon slot pointers that we were using.
2233 			 */
2234 			if (seg->s_szc != 0) {
2235 				anon_free_pages(amp->ahp, svd->anon_index,
2236 				    seg->s_size, seg->s_szc);
2237 			} else {
2238 				anon_free(amp->ahp, svd->anon_index,
2239 				    seg->s_size);
2240 			}
2241 			ANON_LOCK_EXIT(&amp->a_rwlock);
2242 		} else {
2243 			ANON_LOCK_EXIT(&amp->a_rwlock);
2244 		}
2245 	}
2246 
2247 	/*
2248 	 * Release swap reservation.
2249 	 */
2250 	if ((len = svd->swresv) != 0) {
2251 		anon_unresv(svd->swresv);
2252 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
2253 		    seg, len, 0);
2254 		svd->swresv = 0;
2255 	}
2256 	/*
2257 	 * Release claim on vnode, credentials, and finally free the
2258 	 * private data.
2259 	 */
2260 	if (svd->vp != NULL) {
2261 		if (svd->type == MAP_SHARED)
2262 			lgrp_shm_policy_fini(NULL, svd->vp);
2263 		VN_RELE(svd->vp);
2264 		svd->vp = NULL;
2265 	}
2266 	crfree(svd->cred);
2267 	svd->cred = NULL;
2268 
2269 	seg->s_data = NULL;
2270 	kmem_cache_free(segvn_cache, svd);
2271 }
2272 
2273 #ifdef DEBUG
2274 uint32_t segvn_slock_mtbf = 0;
2275 #endif
2276 
2277 ulong_t segvn_lpglck_limit = 0;
2278 
2279 /*
2280  * Support routines used by segvn_pagelock() and softlock faults for anonymous
2281  * pages, to implement availrmem accounting in a way that makes sure the
2282  * same memory is accounted just once for all softlock/pagelock purposes.
2283  * Without this, availrmem could quickly and incorrectly be exhausted by
2284  * several pagelocks to different parts of the same large page, since each
2285  * pagelock has to decrement availrmem by the size of the entire large
2286  * page. Note those pages are not COW shared until softunlock/pageunlock, so
2287  * we don't need to use COW-style accounting here.  We also need to make sure
2288  * the entire large page is accounted for even if the softlock range is less
2289  * than the entire large page, because large anon pages can't be demoted while
2290  * any constituent page is locked. The caller calls this routine for every
2291  * page_t it locks. The very first page in the range may not be the root page
2292  * of a large page. For all other pages it's guaranteed we will visit the
2293  * root of a particular large page before any other constituent page, as we
2294  * are locking sequential pages belonging to the same anon map. So we do all
2295  * the locking when the root is encountered, except for the very first page.
2296  * Since softlocking is not supported (except for the S_READ_NOCOW special
2297  * case) for vmpss segments, and since vnode pages can be demoted without
2298  * locking all constituent pages, vnode pages don't come here.  Unlocking
2299  * relies on the fact that the pagesize can't change while any constituent
2300  * page of a large page is locked at least SE_SHARED. This allows the
2301  * unlocking code to find the right root and return to availrmem the same
2302  * amount that was subtracted from it when the page was locked.
2303  */
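/*
 * Worked example (illustrative): with an 8-page large anon page, softlocking
 * its constituent pages in address order calls segvn_slock_anonpages() eight
 * times.  Only the call that finds the root page with p_slckcnt == 0 charges
 * availrmem, and it charges the full 8 pages; the other calls just bump or
 * skip p_slckcnt.  segvn_sunlock_anonpages() mirrors this and credits the 8
 * pages back to availrmem only when the root's p_slckcnt drops to 0.
 */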
2304 static int
2305 segvn_slock_anonpages(page_t *pp, int first)
2306 {
2307 	pgcnt_t		pages;
2308 	pfn_t		pfn;
2309 	uchar_t		szc = pp->p_szc;
2310 
2311 	ASSERT(PAGE_LOCKED(pp));
2312 	ASSERT(pp->p_vnode != NULL);
2313 	ASSERT(IS_SWAPFSVP(pp->p_vnode));
2314 
2315 	/*
2316 	 * pagesize won't change as long as any constituent page is locked.
2317 	 */
2318 	pages = page_get_pagecnt(pp->p_szc);
2319 	pfn = page_pptonum(pp);
2320 
2321 	if (!first) {
2322 		if (!IS_P2ALIGNED(pfn, pages)) {
2323 #ifdef DEBUG
2324 			pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
2325 			pfn = page_pptonum(pp);
2326 			ASSERT(IS_P2ALIGNED(pfn, pages));
2327 			ASSERT(pp->p_szc == szc);
2328 			ASSERT(pp->p_vnode != NULL);
2329 			ASSERT(IS_SWAPFSVP(pp->p_vnode));
2330 			ASSERT(pp->p_slckcnt != 0);
2331 #endif /* DEBUG */
2332 			return (1);
2333 		}
2334 	} else if (!IS_P2ALIGNED(pfn, pages)) {
2335 		pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
2336 #ifdef DEBUG
2337 		pfn = page_pptonum(pp);
2338 		ASSERT(IS_P2ALIGNED(pfn, pages));
2339 		ASSERT(pp->p_szc == szc);
2340 		ASSERT(pp->p_vnode != NULL);
2341 		ASSERT(IS_SWAPFSVP(pp->p_vnode));
2342 #endif /* DEBUG */
2343 	}
2344 
2345 #ifdef DEBUG
2346 	if (segvn_slock_mtbf && !(gethrtime() % segvn_slock_mtbf)) {
2347 		return (0);
2348 	}
2349 #endif /* DEBUG */
2350 
2351 	/*
2352 	 * pp is a root page.
2353 	 * We haven't locked this large page yet.
2354 	 */
2355 	page_struct_lock(pp);
2356 	if (pp->p_slckcnt != 0) {
2357 		if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) {
2358 			pp->p_slckcnt++;
2359 			page_struct_unlock(pp);
2360 			return (1);
2361 		}
2362 		page_struct_unlock(pp);
2363 		segvn_lpglck_limit++;
2364 		return (0);
2365 	}
2366 	mutex_enter(&freemem_lock);
2367 	if (availrmem < tune.t_minarmem + pages) {
2368 		mutex_exit(&freemem_lock);
2369 		page_struct_unlock(pp);
2370 		return (0);
2371 	}
2372 	pp->p_slckcnt++;
2373 	availrmem -= pages;
2374 	mutex_exit(&freemem_lock);
2375 	page_struct_unlock(pp);
2376 	return (1);
2377 }
2378 
2379 static void
2380 segvn_sunlock_anonpages(page_t *pp, int first)
2381 {
2382 	pgcnt_t		pages;
2383 	pfn_t		pfn;
2384 
2385 	ASSERT(PAGE_LOCKED(pp));
2386 	ASSERT(pp->p_vnode != NULL);
2387 	ASSERT(IS_SWAPFSVP(pp->p_vnode));
2388 
2389 	/*
2390 	 * pagesize won't change as long as any constituent page is locked.
2391 	 */
2392 	pages = page_get_pagecnt(pp->p_szc);
2393 	pfn = page_pptonum(pp);
2394 
2395 	if (!first) {
2396 		if (!IS_P2ALIGNED(pfn, pages)) {
2397 			return;
2398 		}
2399 	} else if (!IS_P2ALIGNED(pfn, pages)) {
2400 		pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
2401 #ifdef DEBUG
2402 		pfn = page_pptonum(pp);
2403 		ASSERT(IS_P2ALIGNED(pfn, pages));
2404 #endif /* DEBUG */
2405 	}
2406 	ASSERT(pp->p_vnode != NULL);
2407 	ASSERT(IS_SWAPFSVP(pp->p_vnode));
2408 	ASSERT(pp->p_slckcnt != 0);
2409 	page_struct_lock(pp);
2410 	if (--pp->p_slckcnt == 0) {
2411 		mutex_enter(&freemem_lock);
2412 		availrmem += pages;
2413 		mutex_exit(&freemem_lock);
2414 	}
2415 	page_struct_unlock(pp);
2416 }
2417 
2418 /*
2419  * Do an F_SOFTUNLOCK call over the range requested.  The range must have
2420  * already been F_SOFTLOCK'ed.
2421  * Caller must always match addr and len of a softunlock with a previous
2422  * softlock with exactly the same addr and len.
2423  */
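/*
 * For example (illustrative): a caller that softlocks [addr, addr + len)
 * must later softunlock with the identical addr and len; segvn_softunlock()
 * relies on that to drop softlockcnt by btop(len) and, for vnode-backed
 * segments, to return the same btop(len) pages to availrmem that the
 * softlock charged.
 */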
2424 static void
2425 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
2426 {
2427 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2428 	page_t *pp;
2429 	caddr_t adr;
2430 	struct vnode *vp;
2431 	u_offset_t offset;
2432 	ulong_t anon_index;
2433 	struct anon_map *amp;
2434 	struct anon *ap = NULL;
2435 
2436 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2437 	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
2438 
2439 	if ((amp = svd->amp) != NULL)
2440 		anon_index = svd->anon_index + seg_page(seg, addr);
2441 
2442 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
2443 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
2444 		hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie);
2445 	} else {
2446 		hat_unlock(seg->s_as->a_hat, addr, len);
2447 	}
2448 	for (adr = addr; adr < addr + len; adr += PAGESIZE) {
2449 		if (amp != NULL) {
2450 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2451 			if ((ap = anon_get_ptr(amp->ahp, anon_index++))
2452 			    != NULL) {
2453 				swap_xlate(ap, &vp, &offset);
2454 			} else {
2455 				vp = svd->vp;
2456 				offset = svd->offset +
2457 				    (uintptr_t)(adr - seg->s_base);
2458 			}
2459 			ANON_LOCK_EXIT(&amp->a_rwlock);
2460 		} else {
2461 			vp = svd->vp;
2462 			offset = svd->offset +
2463 			    (uintptr_t)(adr - seg->s_base);
2464 		}
2465 
2466 		/*
2467 		 * Use page_find() instead of page_lookup() to
2468 		 * find the page since we know that it is locked.
2469 		 */
2470 		pp = page_find(vp, offset);
2471 		if (pp == NULL) {
2472 			panic(
2473 			    "segvn_softunlock: addr %p, ap %p, vp %p, off %llx",
2474 			    (void *)adr, (void *)ap, (void *)vp, offset);
2475 			/*NOTREACHED*/
2476 		}
2477 
2478 		if (rw == S_WRITE) {
2479 			hat_setrefmod(pp);
2480 			if (seg->s_as->a_vbits)
2481 				hat_setstat(seg->s_as, adr, PAGESIZE,
2482 				    P_REF | P_MOD);
2483 		} else if (rw != S_OTHER) {
2484 			hat_setref(pp);
2485 			if (seg->s_as->a_vbits)
2486 				hat_setstat(seg->s_as, adr, PAGESIZE, P_REF);
2487 		}
2488 		TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2489 		    "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
2490 		if (svd->vp == NULL) {
2491 			segvn_sunlock_anonpages(pp, adr == addr);
2492 		}
2493 		page_unlock(pp);
2494 	}
2495 	mutex_enter(&freemem_lock); /* for availrmem */
2496 	if (svd->vp != NULL) {
2497 		availrmem += btop(len);
2498 	}
2499 	segvn_pages_locked -= btop(len);
2500 	svd->softlockcnt -= btop(len);
2501 	mutex_exit(&freemem_lock);
2502 	if (svd->softlockcnt == 0) {
2503 		/*
2504 		 * All SOFTLOCKS are gone. Wake up any waiting
2505 		 * unmappers so they can try again to unmap.
2506 		 * Check for waiters first without the mutex
2507 		 * held so we don't always grab the mutex on
2508 		 * softunlocks.
2509 		 */
2510 		if (AS_ISUNMAPWAIT(seg->s_as)) {
2511 			mutex_enter(&seg->s_as->a_contents);
2512 			if (AS_ISUNMAPWAIT(seg->s_as)) {
2513 				AS_CLRUNMAPWAIT(seg->s_as);
2514 				cv_broadcast(&seg->s_as->a_cv);
2515 			}
2516 			mutex_exit(&seg->s_as->a_contents);
2517 		}
2518 	}
2519 }
2520 
2521 #define	PAGE_HANDLED	((page_t *)-1)
2522 
2523 /*
2524  * Release all the pages in the NULL terminated ppp list
2525  * which haven't already been converted to PAGE_HANDLED.
2526  */
2527 static void
2528 segvn_pagelist_rele(page_t **ppp)
2529 {
2530 	for (; *ppp != NULL; ppp++) {
2531 		if (*ppp != PAGE_HANDLED)
2532 			page_unlock(*ppp);
2533 	}
2534 }
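
/*
 * Descriptive note: segvn_faultpage() below overwrites a pl[] entry with
 * PAGE_HANDLED once it has consumed that page, so on an error path
 * segvn_pagelist_rele() unlocks only the entries that were never handled.
 */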
2535 
2536 static int stealcow = 1;
2537 
2538 /*
2539  * Workaround for viking chip bug.  See bug id 1220902.
2540  * To fix this down in pagefault() would require importing so
2541  * Fixing this down in pagefault() would require importing so
2542  * much as- and segvn-level code as to be unmaintainable.
2543 int enable_mbit_wa = 0;
2544 
2545 /*
2546  * Handles all the dirty work of getting the right
2547  * anonymous pages and loading up the translations.
2548  * This routine is called only from segvn_fault()
2549  * when looping over the range of addresses requested.
2550  *
2551  * The basic algorithm here is:
2552  * 	If this is an anon_zero case
2553  *		Call anon_zero to allocate page
2554  *		Load up translation
2555  *		Return
2556  *	endif
2557  *	If this is an anon page
2558  *		Use anon_getpage to get the page
2559  *	else
2560  *		Find page in pl[] list passed in
2561  *	endif
2562  *	If not a cow
2563  *		Load up the translation to the page
2564  *		return
2565  *	endif
2566  *	Call anon_private to handle cow
2567  *	Load up (writable) translation to new page
2568  *	Load up (writable) translation to new page
 */
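/*
 * In terms of the code below (descriptive note): the "anon_zero case" is a
 * first touch of anonymous memory (svd->vp == NULL and no anon slot yet);
 * the non-COW path loads "opp", found either via anon_getpage() or in the
 * pl[] list passed in; and the COW path ends in anon_private(), which copies
 * opp into a new anon page before the writable translation is loaded with
 * hat_memload().
 */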
2569 static faultcode_t
2570 segvn_faultpage(
2571 	struct hat *hat,		/* the hat to use for mapping */
2572 	struct seg *seg,		/* seg_vn of interest */
2573 	caddr_t addr,			/* address in as */
2574 	u_offset_t off,			/* offset in vp */
2575 	struct vpage *vpage,		/* pointer to vpage for vp, off */
2576 	page_t *pl[],			/* object source page pointer */
2577 	uint_t vpprot,			/* access allowed to object pages */
2578 	enum fault_type type,		/* type of fault */
2579 	enum seg_rw rw,			/* type of access at fault */
2580 	int brkcow,			/* we may need to break cow */
2581 	int first)			/* first page for this fault if 1 */
2582 {
2583 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
2584 	page_t *pp, **ppp;
2585 	uint_t pageflags = 0;
2586 	page_t *anon_pl[1 + 1];
2587 	page_t *opp = NULL;		/* original page */
2588 	uint_t prot;
2589 	int err;
2590 	int cow;
2591 	int claim;
2592 	int steal = 0;
2593 	ulong_t anon_index;
2594 	struct anon *ap, *oldap;
2595 	struct anon_map *amp;
2596 	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
2597 	int anon_lock = 0;
2598 	anon_sync_obj_t cookie;
2599 
2600 	if (svd->flags & MAP_TEXT) {
2601 		hat_flag |= HAT_LOAD_TEXT;
2602 	}
2603 
2604 	ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
2605 	ASSERT(seg->s_szc == 0);
2606 	ASSERT(svd->tr_state != SEGVN_TR_INIT);
2607 
2608 	/*
2609 	 * Initialize protection value for this page.
2610 	 * If we have per page protection values check it now.
2611 	 */
2612 	if (svd->pageprot) {
2613 		uint_t protchk;
2614 
2615 		switch (rw) {
2616 		case S_READ:
2617 			protchk = PROT_READ;
2618 			break;
2619 		case S_WRITE:
2620 			protchk = PROT_WRITE;
2621 			break;
2622 		case S_EXEC:
2623 			protchk = PROT_EXEC;
2624 			break;
2625 		case S_OTHER:
2626 		default:
2627 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
2628 			break;
2629 		}
2630 
2631 		prot = VPP_PROT(vpage);
2632 		if ((prot & protchk) == 0)
2633 			return (FC_PROT);	/* illegal access type */
2634 	} else {
2635 		prot = svd->prot;
2636 	}
2637 
2638 	if (type == F_SOFTLOCK && svd->vp != NULL) {
2639 		mutex_enter(&freemem_lock);
2640 		if (availrmem <= tune.t_minarmem) {
2641 			mutex_exit(&freemem_lock);
2642 			return (FC_MAKE_ERR(ENOMEM));	/* out of real memory */
2643 		} else {
2644 			availrmem--;
2645 			svd->softlockcnt++;
2646 			segvn_pages_locked++;
2647 		}
2648 		mutex_exit(&freemem_lock);
2649 	}
2650 
2651 	/*
2652 	 * Always acquire the anon array lock to prevent 2 threads from
2653 	 * allocating separate anon slots for the same "addr".
2654 	 */
2655 
2656 	if ((amp = svd->amp) != NULL) {
2657 		ASSERT(RW_READ_HELD(&amp->a_rwlock));
2658 		anon_index = svd->anon_index + seg_page(seg, addr);
2659 		anon_array_enter(amp, anon_index, &cookie);
2660 		anon_lock = 1;
2661 	}
2662 
2663 	if (svd->vp == NULL && amp != NULL) {
2664 		if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) {
2665 			/*
2666 			 * Allocate a (normally) writable anonymous page of
2667 			 * zeroes. If no advance reservations, reserve now.
2668 			 */
2669 			if (svd->flags & MAP_NORESERVE) {
2670 				if (anon_resv_zone(ptob(1),
2671 				    seg->s_as->a_proc->p_zone)) {
2672 					atomic_add_long(&svd->swresv, ptob(1));
2673 				} else {
2674 					err = ENOMEM;
2675 					goto out;
2676 				}
2677 			}
2678 			if ((pp = anon_zero(seg, addr, &ap,
2679 			    svd->cred)) == NULL) {
2680 				err = ENOMEM;
2681 				goto out;	/* out of swap space */
2682 			}
2683 			/*
2684 			 * Re-acquire the anon_map lock and
2685 			 * initialize the anon array entry.
2686 			 */
2687 			(void) anon_set_ptr(amp->ahp, anon_index, ap,
2688 			    ANON_SLEEP);
2689 
2690 			ASSERT(pp->p_szc == 0);
2691 
2692 			/*
2693 			 * Handle pages that have been marked for migration
2694 			 */
2695 			if (lgrp_optimizations())
2696 				page_migrate(seg, addr, &pp, 1);
2697 
2698 			if (type == F_SOFTLOCK) {
2699 				if (!segvn_slock_anonpages(pp, first)) {
2700 					page_unlock(pp);
2701 					err = ENOMEM;
2702 					goto out;
2703 				} else {
2704 					mutex_enter(&freemem_lock);
2705 					svd->softlockcnt++;
2706 					segvn_pages_locked++;
2707 					mutex_exit(&freemem_lock);
2708 				}
2709 			}
2710 
2711 			if (enable_mbit_wa) {
2712 				if (rw == S_WRITE)
2713 					hat_setmod(pp);
2714 				else if (!hat_ismod(pp))
2715 					prot &= ~PROT_WRITE;
2716 			}
2717 			/*
2718 			 * If AS_PAGLCK is set in a_flags (via memcntl(2)
2719 			 * with MC_LOCKAS, MCL_FUTURE) and this is a
2720 			 * MAP_NORESERVE segment, we may need to
2721 			 * permanently lock the page as it is being faulted
2722 			 * for the first time. The following text applies
2723 			 * only to MAP_NORESERVE segments:
2724 			 *
2725 			 * As per memcntl(2), if this segment was created
2726 			 * after MCL_FUTURE was applied (a "future"
2727 			 * segment), its pages must be locked.  If this
2728 			 * segment existed at MCL_FUTURE application (a
2729 			 * "past" segment), the interface is unclear.
2730 			 *
2731 			 * We decide to lock only if vpage is present:
2732 			 *
2733 			 * - "future" segments will have a vpage array (see
2734 			 *    as_map), and so will be locked as required
2735 			 *
2736 			 * - "past" segments may not have a vpage array,
2737 			 *    depending on whether events (such as
2738 			 *    mprotect) have occurred. Locking if vpage
2739 			 *    exists will preserve legacy behavior.  Not
2740 			 *    locking if vpage is absent, will not break
2741 			 *    the interface or legacy behavior.  Note that
2742 			 *    allocating vpage here if it's absent requires
2743 			 *    upgrading the segvn reader lock, the cost of
2744 			 *    which does not seem worthwhile.
2745 			 *
2746 			 * Usually testing and setting VPP_ISPPLOCK and
2747 			 * VPP_SETPPLOCK requires holding the segvn lock as
2748 			 * writer, but in this case all readers are
2749 			 * serializing on the anon array lock.
2750 			 */
2751 			if (AS_ISPGLCK(seg->s_as) && vpage != NULL &&
2752 			    (svd->flags & MAP_NORESERVE) &&
2753 			    !VPP_ISPPLOCK(vpage)) {
2754 				proc_t *p = seg->s_as->a_proc;
2755 				ASSERT(svd->type == MAP_PRIVATE);
2756 				mutex_enter(&p->p_lock);
2757 				if (rctl_incr_locked_mem(p, NULL, PAGESIZE,
2758 				    1) == 0) {
2759 					claim = VPP_PROT(vpage) & PROT_WRITE;
2760 					if (page_pp_lock(pp, claim, 0)) {
2761 						VPP_SETPPLOCK(vpage);
2762 					} else {
2763 						rctl_decr_locked_mem(p, NULL,
2764 						    PAGESIZE, 1);
2765 					}
2766 				}
2767 				mutex_exit(&p->p_lock);
2768 			}
2769 
2770 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2771 			hat_memload(hat, addr, pp, prot, hat_flag);
2772 
2773 			if (!(hat_flag & HAT_LOAD_LOCK))
2774 				page_unlock(pp);
2775 
2776 			anon_array_exit(&cookie);
2777 			return (0);
2778 		}
2779 	}
2780 
2781 	/*
2782 	 * Obtain the page structure via anon_getpage() if it is
2783 	 * a private copy of an object (the result of a previous
2784 	 * copy-on-write).
2785 	 */
2786 	if (amp != NULL) {
2787 		if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
2788 			err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
2789 			    seg, addr, rw, svd->cred);
2790 			if (err)
2791 				goto out;
2792 
2793 			if (svd->type == MAP_SHARED) {
2794 				/*
2795 				 * If this is a shared mapping to an
2796 				 * anon_map, then ignore the write
2797 				 * permissions returned by anon_getpage().
2798 				 * They apply to the private mappings
2799 				 * of this anon_map.
2800 				 */
2801 				vpprot |= PROT_WRITE;
2802 			}
2803 			opp = anon_pl[0];
2804 		}
2805 	}
2806 
2807 	/*
2808 	 * Search the pl[] list passed in if it is from the
2809 	 * original object (i.e., not a private copy).
2810 	 */
2811 	if (opp == NULL) {
2812 		/*
2813 		 * Find original page.  We must be bringing it in
2814 		 * from the list in pl[].
2815 		 */
2816 		for (ppp = pl; (opp = *ppp) != NULL; ppp++) {
2817 			if (opp == PAGE_HANDLED)
2818 				continue;
2819 			ASSERT(opp->p_vnode == svd->vp); /* XXX */
2820 			if (opp->p_offset == off)
2821 				break;
2822 		}
2823 		if (opp == NULL) {
2824 			panic("segvn_faultpage not found");
2825 			/*NOTREACHED*/
2826 		}
2827 		*ppp = PAGE_HANDLED;
2828 
2829 	}
2830 
2831 	ASSERT(PAGE_LOCKED(opp));
2832 
2833 	TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
2834 	    "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0);
2835 
2836 	/*
2837 	 * The fault is treated as a copy-on-write fault if a
2838 	 * write occurs on a private segment and the object
2839 	 * page (i.e., mapping) is write protected.  We assume
2840 	 * that fatal protection checks have already been made.
2841 	 */
2842 
2843 	if (brkcow) {
2844 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
2845 		cow = !(vpprot & PROT_WRITE);
2846 	} else if (svd->tr_state == SEGVN_TR_ON) {
2847 		/*
2848 		 * If we are doing text replication, COW on first touch.
2849 		 */
2850 		ASSERT(amp != NULL);
2851 		ASSERT(svd->vp != NULL);
2852 		ASSERT(rw != S_WRITE);
2853 		cow = (ap == NULL);
2854 	} else {
2855 		cow = 0;
2856 	}
2857 
2858 	/*
2859 	 * If not a copy-on-write case load the translation
2860 	 * and return.
2861 	 */
2862 	if (cow == 0) {
2863 
2864 		/*
2865 		 * Handle pages that have been marked for migration
2866 		 */
2867 		if (lgrp_optimizations())
2868 			page_migrate(seg, addr, &opp, 1);
2869 
2870 		if (type == F_SOFTLOCK && svd->vp == NULL) {
2871 
2872 			ASSERT(opp->p_szc == 0 ||
2873 			    (svd->type == MAP_SHARED &&
2874 			    amp != NULL && amp->a_szc != 0));
2875 
2876 			if (!segvn_slock_anonpages(opp, first)) {
2877 				page_unlock(opp);
2878 				err = ENOMEM;
2879 				goto out;
2880 			} else {
2881 				mutex_enter(&freemem_lock);
2882 				svd->softlockcnt++;
2883 				segvn_pages_locked++;
2884 				mutex_exit(&freemem_lock);
2885 			}
2886 		}
2887 		if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
2888 			if (rw == S_WRITE)
2889 				hat_setmod(opp);
2890 			else if (rw != S_OTHER && !hat_ismod(opp))
2891 				prot &= ~PROT_WRITE;
2892 		}
2893 
2894 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
2895 		    (!svd->pageprot && svd->prot == (prot & vpprot)));
2896 		ASSERT(amp == NULL ||
2897 		    svd->rcookie == HAT_INVALID_REGION_COOKIE);
2898 		hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag,
2899 		    svd->rcookie);
2900 
2901 		if (!(hat_flag & HAT_LOAD_LOCK))
2902 			page_unlock(opp);
2903 
2904 		if (anon_lock) {
2905 			anon_array_exit(&cookie);
2906 		}
2907 		return (0);
2908 	}
2909 
2910 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2911 
2912 	hat_setref(opp);
2913 
2914 	ASSERT(amp != NULL && anon_lock);
2915 
2916 	/*
2917 	 * Steal the page only if it isn't a private page
2918 	 * since stealing a private page is not worth the effort.
2919 	 */
2920 	if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL)
2921 		steal = 1;
2922 
2923 	/*
2924 	 * Steal the original page if the following conditions are true:
2925 	 *
2926 	 * We are low on memory, the page is not private, page is not large,
2927 	 * not shared, not modified, not `locked' or if we have it `locked'
2928 	 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies
2929 	 * that the page is not shared) and if it doesn't have any
2930 	 * translations. page_struct_lock isn't needed to look at p_cowcnt
2931 	 * and p_lckcnt because we first get exclusive lock on page.
2932 	 */
2933 	(void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
2934 
2935 	if (stealcow && freemem < minfree && steal && opp->p_szc == 0 &&
2936 	    page_tryupgrade(opp) && !hat_ismod(opp) &&
2937 	    ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) ||
2938 	    (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 &&
2939 	    vpage != NULL && VPP_ISPPLOCK(vpage)))) {
2940 		/*
2941 		 * Check if this page has other translations
2942 		 * after unloading our translation.
2943 		 */
2944 		if (hat_page_is_mapped(opp)) {
2945 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
2946 			hat_unload(seg->s_as->a_hat, addr, PAGESIZE,
2947 			    HAT_UNLOAD);
2948 		}
2949 
2950 		/*
2951 		 * hat_unload() might sync back someone else's recent
2952 		 * modification, so check again.
2953 		 */
2954 		if (!hat_ismod(opp) && !hat_page_is_mapped(opp))
2955 			pageflags |= STEAL_PAGE;
2956 	}
2957 
2958 	/*
2959 	 * If we have a vpage pointer, see if it indicates that we have
2960 	 * ``locked'' the page we map -- if so, tell anon_private to
2961 	 * transfer the locking resource to the new page.
2962 	 *
2963 	 * See Statement at the beginning of segvn_lockop regarding
2964 	 * the way lockcnts/cowcnts are handled during COW.
2965 	 *
2966 	 */
2967 	if (vpage != NULL && VPP_ISPPLOCK(vpage))
2968 		pageflags |= LOCK_PAGE;
2969 
2970 	/*
2971 	 * Allocate a private page and perform the copy.
2972 	 * For MAP_NORESERVE reserve swap space now, unless this
2973 	 * is a cow fault on an existing anon page in which case
2974 	 * MAP_NORESERVE will have made advance reservations.
2975 	 */
2976 	if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) {
2977 		if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) {
2978 			atomic_add_long(&svd->swresv, ptob(1));
2979 		} else {
2980 			page_unlock(opp);
2981 			err = ENOMEM;
2982 			goto out;
2983 		}
2984 	}
2985 	oldap = ap;
2986 	pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred);
2987 	if (pp == NULL) {
2988 		err = ENOMEM;	/* out of swap space */
2989 		goto out;
2990 	}
2991 
2992 	/*
2993 	 * If we copied away from an anonymous page, then
2994 	 * we are one step closer to freeing up an anon slot.
2995 	 *
2996 	 * NOTE:  The original anon slot must be released while
2997 	 * holding the "anon_map" lock.  This is necessary to prevent
2998 	 * other threads from obtaining a pointer to the anon slot
2999 	 * which may be freed if its "refcnt" is 1.
3000 	 */
3001 	if (oldap != NULL)
3002 		anon_decref(oldap);
3003 
3004 	(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3005 
3006 	/*
3007 	 * Handle pages that have been marked for migration
3008 	 */
3009 	if (lgrp_optimizations())
3010 		page_migrate(seg, addr, &pp, 1);
3011 
3012 	ASSERT(pp->p_szc == 0);
3013 	if (type == F_SOFTLOCK && svd->vp == NULL) {
3014 		if (!segvn_slock_anonpages(pp, first)) {
3015 			page_unlock(pp);
3016 			err = ENOMEM;
3017 			goto out;
3018 		} else {
3019 			mutex_enter(&freemem_lock);
3020 			svd->softlockcnt++;
3021 			segvn_pages_locked++;
3022 			mutex_exit(&freemem_lock);
3023 		}
3024 	}
3025 
3026 	ASSERT(!IS_VMODSORT(pp->p_vnode));
3027 	if (enable_mbit_wa) {
3028 		if (rw == S_WRITE)
3029 			hat_setmod(pp);
3030 		else if (!hat_ismod(pp))
3031 			prot &= ~PROT_WRITE;
3032 	}
3033 
3034 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
3035 	hat_memload(hat, addr, pp, prot, hat_flag);
3036 
3037 	if (!(hat_flag & HAT_LOAD_LOCK))
3038 		page_unlock(pp);
3039 
3040 	ASSERT(anon_lock);
3041 	anon_array_exit(&cookie);
3042 	return (0);
3043 out:
3044 	if (anon_lock)
3045 		anon_array_exit(&cookie);
3046 
3047 	if (type == F_SOFTLOCK && svd->vp != NULL) {
3048 		mutex_enter(&freemem_lock);
3049 		availrmem++;
3050 		segvn_pages_locked--;
3051 		svd->softlockcnt--;
3052 		mutex_exit(&freemem_lock);
3053 	}
3054 	return (FC_MAKE_ERR(err));
3055 }
3056 
3057 /*
3058  * Relocate a bunch of smaller targ pages into one large repl page. All targ
3059  * pages must be complete pages smaller than the replacement page.
3060  * It's assumed that no page's szc can change since they are all PAGESIZE or
3061  * complete large pages locked SHARED.
3062  */
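/*
 * Illustrative example: for a replacement page spanning 8 PAGESIZE pages,
 * targ[] may hold eight PAGESIZE pages or, say, two 4-page large pages; each
 * targ chunk is page_relocate()d onto the matching slice of the replacement
 * page, and on return targ[0..7] point at the constituent pages of the
 * single large replacement page, downgraded to SE_SHARED.
 */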
3063 static void
3064 segvn_relocate_pages(page_t **targ, page_t *replacement)
3065 {
3066 	page_t *pp;
3067 	pgcnt_t repl_npgs, curnpgs;
3068 	pgcnt_t i;
3069 	uint_t repl_szc = replacement->p_szc;
3070 	page_t *first_repl = replacement;
3071 	page_t *repl;
3072 	spgcnt_t npgs;
3073 
3074 	VM_STAT_ADD(segvnvmstats.relocatepages[0]);
3075 
3076 	ASSERT(repl_szc != 0);
3077 	npgs = repl_npgs = page_get_pagecnt(repl_szc);
3078 
3079 	i = 0;
3080 	while (repl_npgs) {
3081 		spgcnt_t nreloc;
3082 		int err;
3083 		ASSERT(replacement != NULL);
3084 		pp = targ[i];
3085 		ASSERT(pp->p_szc < repl_szc);
3086 		ASSERT(PAGE_EXCL(pp));
3087 		ASSERT(!PP_ISFREE(pp));
3088 		curnpgs = page_get_pagecnt(pp->p_szc);
3089 		if (curnpgs == 1) {
3090 			VM_STAT_ADD(segvnvmstats.relocatepages[1]);
3091 			repl = replacement;
3092 			page_sub(&replacement, repl);
3093 			ASSERT(PAGE_EXCL(repl));
3094 			ASSERT(!PP_ISFREE(repl));
3095 			ASSERT(repl->p_szc == repl_szc);
3096 		} else {
3097 			page_t *repl_savepp;
3098 			int j;
3099 			VM_STAT_ADD(segvnvmstats.relocatepages[2]);
3100 			repl_savepp = replacement;
3101 			for (j = 0; j < curnpgs; j++) {
3102 				repl = replacement;
3103 				page_sub(&replacement, repl);
3104 				ASSERT(PAGE_EXCL(repl));
3105 				ASSERT(!PP_ISFREE(repl));
3106 				ASSERT(repl->p_szc == repl_szc);
3107 				ASSERT(page_pptonum(targ[i + j]) ==
3108 				    page_pptonum(targ[i]) + j);
3109 			}
3110 			repl = repl_savepp;
3111 			ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs));
3112 		}
3113 		err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL);
3114 		if (err || nreloc != curnpgs) {
3115 			panic("segvn_relocate_pages: "
3116 			    "page_relocate failed err=%d curnpgs=%ld "
3117 			    "nreloc=%ld", err, curnpgs, nreloc);
3118 		}
3119 		ASSERT(curnpgs <= repl_npgs);
3120 		repl_npgs -= curnpgs;
3121 		i += curnpgs;
3122 	}
3123 	ASSERT(replacement == NULL);
3124 
3125 	repl = first_repl;
3126 	repl_npgs = npgs;
3127 	for (i = 0; i < repl_npgs; i++) {
3128 		ASSERT(PAGE_EXCL(repl));
3129 		ASSERT(!PP_ISFREE(repl));
3130 		targ[i] = repl;
3131 		page_downgrade(targ[i]);
3132 		repl++;
3133 	}
3134 }
3135 
3136 /*
3137  * Check if all pages in the ppa array are complete pages smaller than szc
3138  * pages and if their roots will still be aligned relative to their current
3139  * size when the entire ppa array is relocated into one szc page. If these
3140  * conditions are not met, return 0.
3141  *
3142  * If all pages are properly aligned, attempt to upgrade their locks
3143  * to exclusive mode. If that fails, set *upgrdfail to 1 and return 0.
3144  * upgrdfail was set to 0 by the caller.
3145  *
3146  * Return 1 if all pages are aligned and locked exclusively.
3147  *
3148  * If all pages in the ppa array happen to be physically contiguous, forming
3149  * one szc page, and all exclusive locks are successfully obtained, promote
3150  * the page size to szc and set *pszc to szc. Return 1 with pages locked shared.
3151  */
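/*
 * Descriptive note on the outcomes the caller has to distinguish:
 * (a) return 0 with *upgrdfail still 0 -- an alignment check failed, so
 *     relocating into an szc page is pointless; (b) return 0 with
 *     *upgrdfail set to 1 -- pages were aligned but a lock upgrade failed,
 *     and *pszc holds that page's szc; (c) return 1 -- either all pages are
 *     now locked SE_EXCL, or they were already contiguous, were promoted to
 *     szc (*pszc set), and remain locked SE_SHARED.
 */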
3152 static int
3153 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc)
3154 {
3155 	page_t *pp;
3156 	pfn_t pfn;
3157 	pgcnt_t totnpgs = page_get_pagecnt(szc);
3158 	pfn_t first_pfn;
3159 	int contig = 1;
3160 	pgcnt_t i;
3161 	pgcnt_t j;
3162 	uint_t curszc;
3163 	pgcnt_t curnpgs;
3164 	int root = 0;
3165 
3166 	ASSERT(szc > 0);
3167 
3168 	VM_STAT_ADD(segvnvmstats.fullszcpages[0]);
3169 
3170 	for (i = 0; i < totnpgs; i++) {
3171 		pp = ppa[i];
3172 		ASSERT(PAGE_SHARED(pp));
3173 		ASSERT(!PP_ISFREE(pp));
3174 		pfn = page_pptonum(pp);
3175 		if (i == 0) {
3176 			if (!IS_P2ALIGNED(pfn, totnpgs)) {
3177 				contig = 0;
3178 			} else {
3179 				first_pfn = pfn;
3180 			}
3181 		} else if (contig && pfn != first_pfn + i) {
3182 			contig = 0;
3183 		}
3184 		if (pp->p_szc == 0) {
3185 			if (root) {
3186 				VM_STAT_ADD(segvnvmstats.fullszcpages[1]);
3187 				return (0);
3188 			}
3189 		} else if (!root) {
3190 			if ((curszc = pp->p_szc) >= szc) {
3191 				VM_STAT_ADD(segvnvmstats.fullszcpages[2]);
3192 				return (0);
3193 			}
3194 			if (curszc == 0) {
3195 				/*
3196 				 * p_szc changed, which means we don't have all
3197 				 * pages locked. Return failure.
3198 				 */
3199 				VM_STAT_ADD(segvnvmstats.fullszcpages[3]);
3200 				return (0);
3201 			}
3202 			curnpgs = page_get_pagecnt(curszc);
3203 			if (!IS_P2ALIGNED(pfn, curnpgs) ||
3204 			    !IS_P2ALIGNED(i, curnpgs)) {
3205 				VM_STAT_ADD(segvnvmstats.fullszcpages[4]);
3206 				return (0);
3207 			}
3208 			root = 1;
3209 		} else {
3210 			ASSERT(i > 0);
3211 			VM_STAT_ADD(segvnvmstats.fullszcpages[5]);
3212 			if (pp->p_szc != curszc) {
3213 				VM_STAT_ADD(segvnvmstats.fullszcpages[6]);
3214 				return (0);
3215 			}
3216 			if (pfn - 1 != page_pptonum(ppa[i - 1])) {
3217 				panic("segvn_full_szcpages: "
3218 				    "large page not physically contiguous");
3219 			}
3220 			if (P2PHASE(pfn, curnpgs) == curnpgs - 1) {
3221 				root = 0;
3222 			}
3223 		}
3224 	}
3225 
3226 	for (i = 0; i < totnpgs; i++) {
3227 		ASSERT(ppa[i]->p_szc < szc);
3228 		if (!page_tryupgrade(ppa[i])) {
3229 			for (j = 0; j < i; j++) {
3230 				page_downgrade(ppa[j]);
3231 			}
3232 			*pszc = ppa[i]->p_szc;
3233 			*upgrdfail = 1;
3234 			VM_STAT_ADD(segvnvmstats.fullszcpages[7]);
3235 			return (0);
3236 		}
3237 	}
3238 
3239 	/*
3240 	 * When a page is put on a free cachelist its szc is set to 0.  If the
3241 	 * file system reclaimed pages from the cachelist, targ pages will be
3242 	 * physically contiguous with 0 p_szc.  In this case just upgrade the
3243 	 * szc of the targ pages without any relocations.
3244 	 * To avoid any hat issues with previous small mappings,
3245 	 * hat_pageunload() the target pages first.
3246 	 */
3247 	if (contig) {
3248 		VM_STAT_ADD(segvnvmstats.fullszcpages[8]);
3249 		for (i = 0; i < totnpgs; i++) {
3250 			(void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD);
3251 		}
3252 		for (i = 0; i < totnpgs; i++) {
3253 			ppa[i]->p_szc = szc;
3254 		}
3255 		for (i = 0; i < totnpgs; i++) {
3256 			ASSERT(PAGE_EXCL(ppa[i]));
3257 			page_downgrade(ppa[i]);
3258 		}
3259 		if (pszc != NULL) {
3260 			*pszc = szc;
3261 		}
3262 	}
3263 	VM_STAT_ADD(segvnvmstats.fullszcpages[9]);
3264 	return (1);
3265 }
3266 
3267 /*
3268  * Create physically contiguous pages for the [vp, off] - [vp, off +
3269  * page_size(szc)) range and, for a private segment, return them in the ppa
3270  * array.  Pages are created either via IO or relocations.
3271  *
3272  * Return 1 on success and 0 on failure.
3273  *
3274  * If physically contiguous pages already exist for this range, return 1
3275  * without filling the ppa array.  The caller initializes ppa[0] to NULL to
3276  * detect this case, and then fills the ppa array itself via VOP_GETPAGE().
3277  */
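/*
 * Additional output parameters (descriptive note): on a 0 return, *downsize
 * set to 1 tells the caller that a page lock could not be obtained and that
 * retrying with the smaller *ret_pszc size may be worthwhile; if the failure
 * was an alignment problem, *downsize stays 0 and the caller is expected to
 * fall back to small mappings for the region.
 */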
3278 
3279 static int
3280 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off,
3281     uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc,
3282     int *downsize)
3283 
3284 {
3285 	page_t *pplist = *ppplist;
3286 	size_t pgsz = page_get_pagesize(szc);
3287 	pgcnt_t pages = btop(pgsz);
3288 	ulong_t start_off = off;
3289 	u_offset_t eoff = off + pgsz;
3290 	spgcnt_t nreloc;
3291 	u_offset_t io_off = off;
3292 	size_t io_len;
3293 	page_t *io_pplist = NULL;
3294 	page_t *done_pplist = NULL;
3295 	pgcnt_t pgidx = 0;
3296 	page_t *pp;
3297 	page_t *newpp;
3298 	page_t *targpp;
3299 	int io_err = 0;
3300 	int i;
3301 	pfn_t pfn;
3302 	ulong_t ppages;
3303 	page_t *targ_pplist = NULL;
3304 	page_t *repl_pplist = NULL;
3305 	page_t *tmp_pplist;
3306 	int nios = 0;
3307 	uint_t pszc;
3308 	struct vattr va;
3309 
3310 	VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]);
3311 
3312 	ASSERT(szc != 0);
3313 	ASSERT(pplist->p_szc == szc);
3314 
3315 	/*
3316 	 * downsize will be set to 1 only if we fail to lock pages. This will
3317 	 * allow subsequent faults to try to relocate the page again. If we
3318 	 * fail due to misalignment, don't downsize; let the caller map the
3319 	 * whole region with small mappings to avoid more faults into the area
3320 	 * where we can't get large pages anyway.
3321 	 */
3322 	*downsize = 0;
3323 
3324 	while (off < eoff) {
3325 		newpp = pplist;
3326 		ASSERT(newpp != NULL);
3327 		ASSERT(PAGE_EXCL(newpp));
3328 		ASSERT(!PP_ISFREE(newpp));
3329 		/*
3330 		 * We pass NULL for nrelocp to page_lookup_create()
3331 		 * so that it doesn't relocate. We relocate here
3332 		 * later only after we make sure we can lock all
3333 		 * pages in the range we handle and they are all
3334 		 * aligned.
3335 		 */
3336 		pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0);
3337 		ASSERT(pp != NULL);
3338 		ASSERT(!PP_ISFREE(pp));
3339 		ASSERT(pp->p_vnode == vp);
3340 		ASSERT(pp->p_offset == off);
3341 		if (pp == newpp) {
3342 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]);
3343 			page_sub(&pplist, pp);
3344 			ASSERT(PAGE_EXCL(pp));
3345 			ASSERT(page_iolock_assert(pp));
3346 			page_list_concat(&io_pplist, &pp);
3347 			off += PAGESIZE;
3348 			continue;
3349 		}
3350 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]);
3351 		pfn = page_pptonum(pp);
3352 		pszc = pp->p_szc;
3353 		if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL &&
3354 		    IS_P2ALIGNED(pfn, pages)) {
3355 			ASSERT(repl_pplist == NULL);
3356 			ASSERT(done_pplist == NULL);
3357 			ASSERT(pplist == *ppplist);
3358 			page_unlock(pp);
3359 			page_free_replacement_page(pplist);
3360 			page_create_putback(pages);
3361 			*ppplist = NULL;
3362 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]);
3363 			return (1);
3364 		}
3365 		if (pszc >= szc) {
3366 			page_unlock(pp);
3367 			segvn_faultvnmpss_align_err1++;
3368 			goto out;
3369 		}
3370 		ppages = page_get_pagecnt(pszc);
3371 		if (!IS_P2ALIGNED(pfn, ppages)) {
3372 			ASSERT(pszc > 0);
3373 			/*
3374 			 * sizing down to pszc won't help.
3375 			 */
3376 			page_unlock(pp);
3377 			segvn_faultvnmpss_align_err2++;
3378 			goto out;
3379 		}
3380 		pfn = page_pptonum(newpp);
3381 		if (!IS_P2ALIGNED(pfn, ppages)) {
3382 			ASSERT(pszc > 0);
3383 			/*
3384 			 * sizing down to pszc won't help.
3385 			 */
3386 			page_unlock(pp);
3387 			segvn_faultvnmpss_align_err3++;
3388 			goto out;
3389 		}
3390 		if (!PAGE_EXCL(pp)) {
3391 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]);
3392 			page_unlock(pp);
3393 			*downsize = 1;
3394 			*ret_pszc = pp->p_szc;
3395 			goto out;
3396 		}
3397 		targpp = pp;
3398 		if (io_pplist != NULL) {
3399 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]);
3400 			io_len = off - io_off;
3401 			/*
3402 			 * Some file systems like NFS don't check EOF
3403 			 * conditions in VOP_PAGEIO(). Check it here
3404 			 * now that pages are locked SE_EXCL. Any file
3405 			 * truncation will wait until the pages are
3406 			 * unlocked so no need to worry that file will
3407 			 * be truncated after we check its size here.
3408 			 * XXX fix NFS to remove this check.
3409 			 */
3410 			va.va_mask = AT_SIZE;
3411 			if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL)) {
3412 				VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]);
3413 				page_unlock(targpp);
3414 				goto out;
3415 			}
3416 			if (btopr(va.va_size) < btopr(io_off + io_len)) {
3417 				VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]);
3418 				*downsize = 1;
3419 				*ret_pszc = 0;
3420 				page_unlock(targpp);
3421 				goto out;
3422 			}
3423 			io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
3424 			    B_READ, svd->cred, NULL);
3425 			if (io_err) {
3426 				VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]);
3427 				page_unlock(targpp);
3428 				if (io_err == EDEADLK) {
3429 					segvn_vmpss_pageio_deadlk_err++;
3430 				}
3431 				goto out;
3432 			}
3433 			nios++;
3434 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]);
3435 			while (io_pplist != NULL) {
3436 				pp = io_pplist;
3437 				page_sub(&io_pplist, pp);
3438 				ASSERT(page_iolock_assert(pp));
3439 				page_io_unlock(pp);
3440 				pgidx = (pp->p_offset - start_off) >>
3441 				    PAGESHIFT;
3442 				ASSERT(pgidx < pages);
3443 				ppa[pgidx] = pp;
3444 				page_list_concat(&done_pplist, &pp);
3445 			}
3446 		}
3447 		pp = targpp;
3448 		ASSERT(PAGE_EXCL(pp));
3449 		ASSERT(pp->p_szc <= pszc);
3450 		if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) {
3451 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]);
3452 			page_unlock(pp);
3453 			*downsize = 1;
3454 			*ret_pszc = pp->p_szc;
3455 			goto out;
3456 		}
3457 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]);
3458 		/*
3459 		 * page szc could have changed before the entire group was
3460 		 * locked. Reread page szc.
3461 		 */
3462 		pszc = pp->p_szc;
3463 		ppages = page_get_pagecnt(pszc);
3464 
3465 		/* link just the roots */
3466 		page_list_concat(&targ_pplist, &pp);
3467 		page_sub(&pplist, newpp);
3468 		page_list_concat(&repl_pplist, &newpp);
3469 		off += PAGESIZE;
3470 		while (--ppages != 0) {
3471 			newpp = pplist;
3472 			page_sub(&pplist, newpp);
3473 			off += PAGESIZE;
3474 		}
3475 		io_off = off;
3476 	}
3477 	if (io_pplist != NULL) {
3478 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]);
3479 		io_len = eoff - io_off;
3480 		va.va_mask = AT_SIZE;
3481 		if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) {
3482 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]);
3483 			goto out;
3484 		}
3485 		if (btopr(va.va_size) < btopr(io_off + io_len)) {
3486 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]);
3487 			*downsize = 1;
3488 			*ret_pszc = 0;
3489 			goto out;
3490 		}
3491 		io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len,
3492 		    B_READ, svd->cred, NULL);
3493 		if (io_err) {
3494 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]);
3495 			if (io_err == EDEADLK) {
3496 				segvn_vmpss_pageio_deadlk_err++;
3497 			}
3498 			goto out;
3499 		}
3500 		nios++;
3501 		while (io_pplist != NULL) {
3502 			pp = io_pplist;
3503 			page_sub(&io_pplist, pp);
3504 			ASSERT(page_iolock_assert(pp));
3505 			page_io_unlock(pp);
3506 			pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
3507 			ASSERT(pgidx < pages);
3508 			ppa[pgidx] = pp;
3509 		}
3510 	}
3511 	/*
3512 	 * We're now bound to succeed or panic.
3513 	 * Remove pages from done_pplist; it's not needed anymore.
3514 	 */
3515 	while (done_pplist != NULL) {
3516 		pp = done_pplist;
3517 		page_sub(&done_pplist, pp);
3518 	}
3519 	VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]);
3520 	ASSERT(pplist == NULL);
3521 	*ppplist = NULL;
3522 	while (targ_pplist != NULL) {
3523 		int ret;
3524 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]);
3525 		ASSERT(repl_pplist);
3526 		pp = targ_pplist;
3527 		page_sub(&targ_pplist, pp);
3528 		pgidx = (pp->p_offset - start_off) >> PAGESHIFT;
3529 		newpp = repl_pplist;
3530 		page_sub(&repl_pplist, newpp);
3531 #ifdef DEBUG
3532 		pfn = page_pptonum(pp);
3533 		pszc = pp->p_szc;
3534 		ppages = page_get_pagecnt(pszc);
3535 		ASSERT(IS_P2ALIGNED(pfn, ppages));
3536 		pfn = page_pptonum(newpp);
3537 		ASSERT(IS_P2ALIGNED(pfn, ppages));
3538 		ASSERT(P2PHASE(pfn, pages) == pgidx);
3539 #endif
3540 		nreloc = 0;
3541 		ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL);
3542 		if (ret != 0 || nreloc == 0) {
3543 			panic("segvn_fill_vp_pages: "
3544 			    "page_relocate failed");
3545 		}
3546 		pp = newpp;
3547 		while (nreloc-- != 0) {
3548 			ASSERT(PAGE_EXCL(pp));
3549 			ASSERT(pp->p_vnode == vp);
3550 			ASSERT(pgidx ==
3551 			    ((pp->p_offset - start_off) >> PAGESHIFT));
3552 			ppa[pgidx++] = pp;
3553 			pp++;
3554 		}
3555 	}
3556 
3557 	if (svd->type == MAP_PRIVATE) {
3558 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]);
3559 		for (i = 0; i < pages; i++) {
3560 			ASSERT(ppa[i] != NULL);
3561 			ASSERT(PAGE_EXCL(ppa[i]));
3562 			ASSERT(ppa[i]->p_vnode == vp);
3563 			ASSERT(ppa[i]->p_offset ==
3564 			    start_off + (i << PAGESHIFT));
3565 			page_downgrade(ppa[i]);
3566 		}
3567 		ppa[pages] = NULL;
3568 	} else {
3569 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]);
3570 		/*
3571 		 * The caller will still call VOP_GETPAGE() for shared segments
3572 		 * to check FS write permissions.  For private segments we map
3573 		 * the file read only anyway, so no VOP_GETPAGE() is needed.
3574 		 */
3575 		for (i = 0; i < pages; i++) {
3576 			ASSERT(ppa[i] != NULL);
3577 			ASSERT(PAGE_EXCL(ppa[i]));
3578 			ASSERT(ppa[i]->p_vnode == vp);
3579 			ASSERT(ppa[i]->p_offset ==
3580 			    start_off + (i << PAGESHIFT));
3581 			page_unlock(ppa[i]);
3582 		}
3583 		ppa[0] = NULL;
3584 	}
3585 
3586 	return (1);
3587 out:
3588 	/*
3589 	 * Do the cleanup.  Unlock target pages we didn't relocate; they are
3590 	 * linked on targ_pplist by their root pages.  Reassemble unused
3591 	 * replacement and I/O pages back onto pplist.
3592 	 */
3593 	if (io_pplist != NULL) {
3594 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]);
3595 		pp = io_pplist;
3596 		do {
3597 			ASSERT(pp->p_vnode == vp);
3598 			ASSERT(pp->p_offset == io_off);
3599 			ASSERT(page_iolock_assert(pp));
3600 			page_io_unlock(pp);
3601 			page_hashout(pp, NULL);
3602 			io_off += PAGESIZE;
3603 		} while ((pp = pp->p_next) != io_pplist);
3604 		page_list_concat(&io_pplist, &pplist);
3605 		pplist = io_pplist;
3606 	}
3607 	tmp_pplist = NULL;
3608 	while (targ_pplist != NULL) {
3609 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]);
3610 		pp = targ_pplist;
3611 		ASSERT(PAGE_EXCL(pp));
3612 		page_sub(&targ_pplist, pp);
3613 
3614 		pszc = pp->p_szc;
3615 		ppages = page_get_pagecnt(pszc);
3616 		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
3617 
3618 		if (pszc != 0) {
3619 			group_page_unlock(pp);
3620 		}
3621 		page_unlock(pp);
3622 
3623 		pp = repl_pplist;
3624 		ASSERT(pp != NULL);
3625 		ASSERT(PAGE_EXCL(pp));
3626 		ASSERT(pp->p_szc == szc);
3627 		page_sub(&repl_pplist, pp);
3628 
3629 		ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages));
3630 
3631 		/* relink replacement page */
3632 		page_list_concat(&tmp_pplist, &pp);
3633 		while (--ppages != 0) {
3634 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]);
3635 			pp++;
3636 			ASSERT(PAGE_EXCL(pp));
3637 			ASSERT(pp->p_szc == szc);
3638 			page_list_concat(&tmp_pplist, &pp);
3639 		}
3640 	}
3641 	if (tmp_pplist != NULL) {
3642 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]);
3643 		page_list_concat(&tmp_pplist, &pplist);
3644 		pplist = tmp_pplist;
3645 	}
3646 	/*
3647 	 * At this point all pages are either on done_pplist or
3648 	 * pplist.  They can't all be on done_pplist, otherwise
3649 	 * we would already have succeeded.
3650 	 */
3651 	ASSERT(pplist != NULL);
3652 	if (nios != 0) {
3653 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]);
3654 		pp = pplist;
3655 		do {
3656 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]);
3657 			ASSERT(pp->p_szc == szc);
3658 			ASSERT(PAGE_EXCL(pp));
3659 			ASSERT(pp->p_vnode != vp);
3660 			pp->p_szc = 0;
3661 		} while ((pp = pp->p_next) != pplist);
3662 
3663 		pp = done_pplist;
3664 		do {
3665 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]);
3666 			ASSERT(pp->p_szc == szc);
3667 			ASSERT(PAGE_EXCL(pp));
3668 			ASSERT(pp->p_vnode == vp);
3669 			pp->p_szc = 0;
3670 		} while ((pp = pp->p_next) != done_pplist);
3671 
3672 		while (pplist != NULL) {
3673 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]);
3674 			pp = pplist;
3675 			page_sub(&pplist, pp);
3676 			page_free(pp, 0);
3677 		}
3678 
3679 		while (done_pplist != NULL) {
3680 			VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]);
3681 			pp = done_pplist;
3682 			page_sub(&done_pplist, pp);
3683 			page_unlock(pp);
3684 		}
3685 		*ppplist = NULL;
3686 		return (0);
3687 	}
3688 	ASSERT(pplist == *ppplist);
3689 	if (io_err) {
3690 		VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]);
3691 		/*
3692 		 * Don't downsize on an I/O error.
3693 		 * See if VOP_GETPAGE() succeeds instead.
3694 		 * pplist may still be used in this case
3695 		 * for relocations.
3696 		 */
3697 		return (0);
3698 	}
3699 	VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]);
3700 	page_free_replacement_page(pplist);
3701 	page_create_putback(pages);
3702 	*ppplist = NULL;
3703 	return (0);
3704 }
3705 
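/*
 * Tunable for the large page fault paths below: when non-zero, segvn may
 * fall back to intermediate page sizes (between the base page size and the
 * segment's page size) instead of dropping straight to the base page size
 * when a full-size large page can't be used.
 */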
3706 int segvn_anypgsz = 0;
3707 
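/*
 * Undo the F_SOFTLOCK accounting set up earlier in the fault path: return
 * the pages to availrmem and drop the global and per-segment softlock
 * counts.  Expects 'svd' to be in scope at the point of use.
 */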
3708 #define	SEGVN_RESTORE_SOFTLOCK(type, pages) 		\
3709 		if ((type) == F_SOFTLOCK) {		\
3710 			mutex_enter(&freemem_lock);	\
3711 			availrmem += (pages);		\
3712 			segvn_pages_locked -= (pages);	\
3713 			svd->softlockcnt -= (pages);	\
3714 			mutex_exit(&freemem_lock);	\
3715 		}
3716 
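/*
 * Keep modified-bit bookkeeping consistent for IS_VMODSORT vnodes when a
 * large page worth of translations is loaded at once: a write fault marks
 * every constituent page modified; otherwise, if the translation would be
 * writable, withdraw PROT_WRITE unless every constituent page is already
 * marked modified, so that the first real write still faults and gets
 * accounted for.
 */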
3717 #define	SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot)		\
3718 		if (IS_VMODSORT((ppa)[0]->p_vnode)) {			\
3719 			if ((rw) == S_WRITE) {				\
3720 				for (i = 0; i < (pages); i++) {		\
3721 					ASSERT((ppa)[i]->p_vnode ==	\
3722 					    (ppa)[0]->p_vnode);		\
3723 					hat_setmod((ppa)[i]);		\
3724 				}					\
3725 			} else if ((rw) != S_OTHER &&			\
3726 			    ((prot) & (vpprot) & PROT_WRITE)) {		\
3727 				for (i = 0; i < (pages); i++) {		\
3728 					ASSERT((ppa)[i]->p_vnode ==	\
3729 					    (ppa)[0]->p_vnode);		\
3730 					if (!hat_ismod((ppa)[i])) {	\
3731 						prot &= ~PROT_WRITE;	\
3732 						break;			\
3733 					}				\
3734 				}					\
3735 			}						\
3736 		}
3737 
3738 #ifdef  VM_STATS
3739 
3740 #define	SEGVN_VMSTAT_FLTVNPAGES(idx)					\
3741 		VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]);
3742 
3743 #else /* VM_STATS */
3744 
3745 #define	SEGVN_VMSTAT_FLTVNPAGES(idx)
3746 
3747 #endif
3748 
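/*
 * Attempt to fault in large pages for a vnode backed segment.  The large
 * page region [lpgaddr, lpgeaddr) is processed one large page at a time.
 * When a large page can't be used (allocation or relocation failure,
 * misalignment with existing mappings to the same file, or EOF) the mapping
 * size is reduced; if another thread has meanwhile created a larger page,
 * the size is bumped back up and the address is retried.
 */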
3749 static faultcode_t
3750 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
3751     caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
3752     caddr_t eaddr, int brkcow)
3753 {
3754 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
3755 	struct anon_map *amp = svd->amp;
3756 	uchar_t segtype = svd->type;
3757 	uint_t szc = seg->s_szc;
3758 	size_t pgsz = page_get_pagesize(szc);
3759 	size_t maxpgsz = pgsz;
3760 	pgcnt_t pages = btop(pgsz);
3761 	pgcnt_t maxpages = pages;
3762 	size_t ppasize = (pages + 1) * sizeof (page_t *);
3763 	caddr_t a = lpgaddr;
3764 	caddr_t	maxlpgeaddr = lpgeaddr;
3765 	u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base);
3766 	ulong_t aindx = svd->anon_index + seg_page(seg, a);
3767 	struct vpage *vpage = (svd->vpage != NULL) ?
3768 	    &svd->vpage[seg_page(seg, a)] : NULL;
3769 	vnode_t *vp = svd->vp;
3770 	page_t **ppa;
3771 	uint_t	pszc;
3772 	size_t	ppgsz;
3773 	pgcnt_t	ppages;
3774 	faultcode_t err = 0;
3775 	int ierr;
3776 	int vop_size_err = 0;
3777 	uint_t protchk, prot, vpprot;
3778 	ulong_t i;
3779 	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
3780 	anon_sync_obj_t an_cookie;
3781 	enum seg_rw arw;
3782 	int alloc_failed = 0;
3783 	int adjszc_chk;
3784 	struct vattr va;
3785 	int xhat = 0;
3786 	page_t *pplist;
3787 	pfn_t pfn;
3788 	int physcontig;
3789 	int upgrdfail;
3790 	int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */
3791 	int tron = (svd->tr_state == SEGVN_TR_ON);
3792 
3793 	ASSERT(szc != 0);
3794 	ASSERT(vp != NULL);
3795 	ASSERT(brkcow == 0 || amp != NULL);
3796 	ASSERT(tron == 0 || amp != NULL);
3797 	ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
3798 	ASSERT(!(svd->flags & MAP_NORESERVE));
3799 	ASSERT(type != F_SOFTUNLOCK);
3800 	ASSERT(IS_P2ALIGNED(a, maxpgsz));
3801 	ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages));
3802 	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
3803 	ASSERT(seg->s_szc < NBBY * sizeof (int));
3804 	ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz);
3805 	ASSERT(svd->tr_state != SEGVN_TR_INIT);
3806 
3807 	VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]);
3808 	VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]);
3809 
3810 	if (svd->flags & MAP_TEXT) {
3811 		hat_flag |= HAT_LOAD_TEXT;
3812 	}
3813 
3814 	if (svd->pageprot) {
3815 		switch (rw) {
3816 		case S_READ:
3817 			protchk = PROT_READ;
3818 			break;
3819 		case S_WRITE:
3820 			protchk = PROT_WRITE;
3821 			break;
3822 		case S_EXEC:
3823 			protchk = PROT_EXEC;
3824 			break;
3825 		case S_OTHER:
3826 		default:
3827 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
3828 			break;
3829 		}
3830 	} else {
3831 		prot = svd->prot;
3832 		/* caller has already done segment level protection check. */
3833 	}
3834 
3835 	if (seg->s_as->a_hat != hat) {
3836 		xhat = 1;
3837 	}
3838 
3839 	if (rw == S_WRITE && segtype == MAP_PRIVATE) {
3840 		SEGVN_VMSTAT_FLTVNPAGES(2);
3841 		arw = S_READ;
3842 	} else {
3843 		arw = rw;
3844 	}
3845 
3846 	ppa = kmem_alloc(ppasize, KM_SLEEP);
3847 
3848 	VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]);
3849 
3850 	for (;;) {
3851 		adjszc_chk = 0;
3852 		for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) {
3853 			if (adjszc_chk) {
3854 				while (szc < seg->s_szc) {
3855 					uintptr_t e;
3856 					uint_t tszc;
3857 					tszc = segvn_anypgsz_vnode ? szc + 1 :
3858 					    seg->s_szc;
3859 					ppgsz = page_get_pagesize(tszc);
3860 					if (!IS_P2ALIGNED(a, ppgsz) ||
3861 					    ((alloc_failed >> tszc) & 0x1)) {
3862 						break;
3863 					}
3864 					SEGVN_VMSTAT_FLTVNPAGES(4);
3865 					szc = tszc;
3866 					pgsz = ppgsz;
3867 					pages = btop(pgsz);
3868 					e = P2ROUNDUP((uintptr_t)eaddr, pgsz);
3869 					lpgeaddr = (caddr_t)e;
3870 				}
3871 			}
3872 
3873 		again:
3874 			if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) {
3875 				ASSERT(IS_P2ALIGNED(aindx, maxpages));
3876 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3877 				anon_array_enter(amp, aindx, &an_cookie);
3878 				if (anon_get_ptr(amp->ahp, aindx) != NULL) {
3879 					SEGVN_VMSTAT_FLTVNPAGES(5);
3880 					ASSERT(anon_pages(amp->ahp, aindx,
3881 					    maxpages) == maxpages);
3882 					anon_array_exit(&an_cookie);
3883 					ANON_LOCK_EXIT(&amp->a_rwlock);
3884 					err = segvn_fault_anonpages(hat, seg,
3885 					    a, a + maxpgsz, type, rw,
3886 					    MAX(a, addr),
3887 					    MIN(a + maxpgsz, eaddr), brkcow);
3888 					if (err != 0) {
3889 						SEGVN_VMSTAT_FLTVNPAGES(6);
3890 						goto out;
3891 					}
3892 					if (szc < seg->s_szc) {
3893 						szc = seg->s_szc;
3894 						pgsz = maxpgsz;
3895 						pages = maxpages;
3896 						lpgeaddr = maxlpgeaddr;
3897 					}
3898 					goto next;
3899 				} else {
3900 					ASSERT(anon_pages(amp->ahp, aindx,
3901 					    maxpages) == 0);
3902 					SEGVN_VMSTAT_FLTVNPAGES(7);
3903 					anon_array_exit(&an_cookie);
3904 					ANON_LOCK_EXIT(&amp->a_rwlock);
3905 				}
3906 			}
3907 			ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz));
3908 			ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz));
3909 
3910 			if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
3911 				ASSERT(vpage != NULL);
3912 				prot = VPP_PROT(vpage);
3913 				ASSERT(sameprot(seg, a, maxpgsz));
3914 				if ((prot & protchk) == 0) {
3915 					SEGVN_VMSTAT_FLTVNPAGES(8);
3916 					err = FC_PROT;
3917 					goto out;
3918 				}
3919 			}
3920 			if (type == F_SOFTLOCK) {
3921 				mutex_enter(&freemem_lock);
3922 				if (availrmem < tune.t_minarmem + pages) {
3923 					mutex_exit(&freemem_lock);
3924 					err = FC_MAKE_ERR(ENOMEM);
3925 					goto out;
3926 				} else {
3927 					availrmem -= pages;
3928 					segvn_pages_locked += pages;
3929 					svd->softlockcnt += pages;
3930 				}
3931 				mutex_exit(&freemem_lock);
3932 			}
3933 
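			/*
			 * For the non-COW case see whether a physically
			 * contiguous large page already backs this file
			 * range.  If not, allocate replacement pages and,
			 * for vnodes marked SEGVN_PAGEIO, try to assemble
			 * one via segvn_fill_vp_pages().
			 */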
3934 			pplist = NULL;
3935 			physcontig = 0;
3936 			ppa[0] = NULL;
3937 			if (!brkcow && !tron && szc &&
3938 			    !page_exists_physcontig(vp, off, szc,
3939 			    segtype == MAP_PRIVATE ? ppa : NULL)) {
3940 				SEGVN_VMSTAT_FLTVNPAGES(9);
3941 				if (page_alloc_pages(vp, seg, a, &pplist, NULL,
3942 				    szc, 0, 0) && type != F_SOFTLOCK) {
3943 					SEGVN_VMSTAT_FLTVNPAGES(10);
3944 					pszc = 0;
3945 					ierr = -1;
3946 					alloc_failed |= (1 << szc);
3947 					break;
3948 				}
3949 				if (pplist != NULL &&
3950 				    vp->v_mpssdata == SEGVN_PAGEIO) {
3951 					int downsize;
3952 					SEGVN_VMSTAT_FLTVNPAGES(11);
3953 					physcontig = segvn_fill_vp_pages(svd,
3954 					    vp, off, szc, ppa, &pplist,
3955 					    &pszc, &downsize);
3956 					ASSERT(!physcontig || pplist == NULL);
3957 					if (!physcontig && downsize &&
3958 					    type != F_SOFTLOCK) {
3959 						ASSERT(pplist == NULL);
3960 						SEGVN_VMSTAT_FLTVNPAGES(12);
3961 						ierr = -1;
3962 						break;
3963 					}
3964 					ASSERT(!physcontig ||
3965 					    segtype == MAP_PRIVATE ||
3966 					    ppa[0] == NULL);
3967 					if (physcontig && ppa[0] == NULL) {
3968 						physcontig = 0;
3969 					}
3970 				}
3971 			} else if (!brkcow && !tron && szc && ppa[0] != NULL) {
3972 				SEGVN_VMSTAT_FLTVNPAGES(13);
3973 				ASSERT(segtype == MAP_PRIVATE);
3974 				physcontig = 1;
3975 			}
3976 
3977 			if (!physcontig) {
3978 				SEGVN_VMSTAT_FLTVNPAGES(14);
3979 				ppa[0] = NULL;
3980 				ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz,
3981 				    &vpprot, ppa, pgsz, seg, a, arw,
3982 				    svd->cred, NULL);
3983 #ifdef DEBUG
3984 				if (ierr == 0) {
3985 					for (i = 0; i < pages; i++) {
3986 						ASSERT(PAGE_LOCKED(ppa[i]));
3987 						ASSERT(!PP_ISFREE(ppa[i]));
3988 						ASSERT(ppa[i]->p_vnode == vp);
3989 						ASSERT(ppa[i]->p_offset ==
3990 						    off + (i << PAGESHIFT));
3991 					}
3992 				}
3993 #endif /* DEBUG */
3994 				if (segtype == MAP_PRIVATE) {
3995 					SEGVN_VMSTAT_FLTVNPAGES(15);
3996 					vpprot &= ~PROT_WRITE;
3997 				}
3998 			} else {
3999 				ASSERT(segtype == MAP_PRIVATE);
4000 				SEGVN_VMSTAT_FLTVNPAGES(16);
4001 				vpprot = PROT_ALL & ~PROT_WRITE;
4002 				ierr = 0;
4003 			}
4004 
4005 			if (ierr != 0) {
4006 				SEGVN_VMSTAT_FLTVNPAGES(17);
4007 				if (pplist != NULL) {
4008 					SEGVN_VMSTAT_FLTVNPAGES(18);
4009 					page_free_replacement_page(pplist);
4010 					page_create_putback(pages);
4011 				}
4012 				SEGVN_RESTORE_SOFTLOCK(type, pages);
4013 				if (a + pgsz <= eaddr) {
4014 					SEGVN_VMSTAT_FLTVNPAGES(19);
4015 					err = FC_MAKE_ERR(ierr);
4016 					goto out;
4017 				}
4018 				va.va_mask = AT_SIZE;
4019 				if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL)) {
4020 					SEGVN_VMSTAT_FLTVNPAGES(20);
4021 					err = FC_MAKE_ERR(EIO);
4022 					goto out;
4023 				}
4024 				if (btopr(va.va_size) >= btopr(off + pgsz)) {
4025 					SEGVN_VMSTAT_FLTVNPAGES(21);
4026 					err = FC_MAKE_ERR(ierr);
4027 					goto out;
4028 				}
4029 				if (btopr(va.va_size) <
4030 				    btopr(off + (eaddr - a))) {
4031 					SEGVN_VMSTAT_FLTVNPAGES(22);
4032 					err = FC_MAKE_ERR(ierr);
4033 					goto out;
4034 				}
4035 				if (brkcow || tron || type == F_SOFTLOCK) {
4036 					/* can't reduce map area */
4037 					SEGVN_VMSTAT_FLTVNPAGES(23);
4038 					vop_size_err = 1;
4039 					goto out;
4040 				}
4041 				SEGVN_VMSTAT_FLTVNPAGES(24);
4042 				ASSERT(szc != 0);
4043 				pszc = 0;
4044 				ierr = -1;
4045 				break;
4046 			}
4047 
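			/*
			 * With the vnode pages locked, recheck whether anon
			 * slots have appeared for this range (i.e. a COW has
			 * already happened).  If so, drop the pages and the
			 * preallocated replacement and either retry this
			 * address or size the mapping back up to the segment
			 * page size.
			 */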
4048 			if (amp != NULL) {
4049 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4050 				anon_array_enter(amp, aindx, &an_cookie);
4051 			}
4052 			if (amp != NULL &&
4053 			    anon_get_ptr(amp->ahp, aindx) != NULL) {
4054 				ulong_t taindx = P2ALIGN(aindx, maxpages);
4055 
4056 				SEGVN_VMSTAT_FLTVNPAGES(25);
4057 				ASSERT(anon_pages(amp->ahp, taindx,
4058 				    maxpages) == maxpages);
4059 				for (i = 0; i < pages; i++) {
4060 					page_unlock(ppa[i]);
4061 				}
4062 				anon_array_exit(&an_cookie);
4063 				ANON_LOCK_EXIT(&amp->a_rwlock);
4064 				if (pplist != NULL) {
4065 					page_free_replacement_page(pplist);
4066 					page_create_putback(pages);
4067 				}
4068 				SEGVN_RESTORE_SOFTLOCK(type, pages);
4069 				if (szc < seg->s_szc) {
4070 					SEGVN_VMSTAT_FLTVNPAGES(26);
4071 					/*
4072 					 * For private segments SOFTLOCK
4073 					 * either always breaks COW (any rw
4074 					 * type except S_READ_NOCOW) or the
4075 					 * address space is locked as writer
4076 					 * (S_READ_NOCOW case) and anon slots
4077 					 * can't show up on a second check.
4078 					 * Therefore if we are here for the
4079 					 * SOFTLOCK case it must be a COW
4080 					 * break, but a COW break never
4081 					 * reduces szc.  Text replication
4082 					 * (tron) in this case works as a COW
4083 					 * break.  Thus the assert below.
4084 					 */
4085 					ASSERT(!brkcow && !tron &&
4086 					    type != F_SOFTLOCK);
4087 					pszc = seg->s_szc;
4088 					ierr = -2;
4089 					break;
4090 				}
4091 				ASSERT(IS_P2ALIGNED(a, maxpgsz));
4092 				goto again;
4093 			}
4094 #ifdef DEBUG
4095 			if (amp != NULL) {
4096 				ulong_t taindx = P2ALIGN(aindx, maxpages);
4097 				ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
4098 			}
4099 #endif /* DEBUG */
4100 
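			/*
			 * COW break (or text replication) case: create
			 * private anon copies of this large page via
			 * anon_map_privatepages() and load them.
			 */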
4101 			if (brkcow || tron) {
4102 				ASSERT(amp != NULL);
4103 				ASSERT(pplist == NULL);
4104 				ASSERT(szc == seg->s_szc);
4105 				ASSERT(IS_P2ALIGNED(a, maxpgsz));
4106 				ASSERT(IS_P2ALIGNED(aindx, maxpages));
4107 				SEGVN_VMSTAT_FLTVNPAGES(27);
4108 				ierr = anon_map_privatepages(amp, aindx, szc,
4109 				    seg, a, prot, ppa, vpage, segvn_anypgsz,
4110 				    tron ? PG_LOCAL : 0, svd->cred);
4111 				if (ierr != 0) {
4112 					SEGVN_VMSTAT_FLTVNPAGES(28);
4113 					anon_array_exit(&an_cookie);
4114 					ANON_LOCK_EXIT(&amp->a_rwlock);
4115 					SEGVN_RESTORE_SOFTLOCK(type, pages);
4116 					err = FC_MAKE_ERR(ierr);
4117 					goto out;
4118 				}
4119 
4120 				ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
4121 				/*
4122 				 * p_szc can't be changed for locked
4123 				 * swapfs pages.
4124 				 */
4125 				ASSERT(svd->rcookie ==
4126 				    HAT_INVALID_REGION_COOKIE);
4127 				hat_memload_array(hat, a, pgsz, ppa, prot,
4128 				    hat_flag);
4129 
4130 				if (!(hat_flag & HAT_LOAD_LOCK)) {
4131 					SEGVN_VMSTAT_FLTVNPAGES(29);
4132 					for (i = 0; i < pages; i++) {
4133 						page_unlock(ppa[i]);
4134 					}
4135 				}
4136 				anon_array_exit(&an_cookie);
4137 				ANON_LOCK_EXIT(&amp->a_rwlock);
4138 				goto next;
4139 			}
4140 
4141 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
4142 			    (!svd->pageprot && svd->prot == (prot & vpprot)));
4143 
4144 			pfn = page_pptonum(ppa[0]);
4145 			/*
4146 			 * hat_page_demote() needs an SE_EXCL lock on one of
4147 			 * the constituent page_t's and it decreases the root's
4148 			 * p_szc last.  This means that if the root's p_szc is
4149 			 * equal to szc and all its constituent pages are
4150 			 * locked, any hat_page_demote() that could have
4151 			 * changed p_szc to szc has already completed and no
4152 			 * new hat_page_demote() can start for this large page.
4153 			 */
4154 
4155 			/*
4156 			 * We need to make sure the same mapping size is used
4157 			 * for the same address range if there's a possibility
4158 			 * the address is already mapped, because the hat layer
4159 			 * panics when a translation is loaded for a range that
4160 			 * is already mapped with a different page size.  We
4161 			 * achieve this by always using the largest page size
4162 			 * possible subject to the constraints of page size,
4163 			 * segment page size and page alignment.  Since mappings
4164 			 * are invalidated when those constraints change, making
4165 			 * it impossible to use a previously used mapping size,
4166 			 * no mapping size conflicts should happen.
4167 			 */
4168 
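			/*
			 * Decide how to map what we got back: if the pages
			 * already form a properly aligned large page of the
			 * szc we need, load it directly; if a larger page
			 * exists, consider sizing up; otherwise either load
			 * small mappings or relocate the pages into the
			 * preallocated large page.
			 */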
4169 		chkszc:
4170 			if ((pszc = ppa[0]->p_szc) == szc &&
4171 			    IS_P2ALIGNED(pfn, pages)) {
4172 
4173 				SEGVN_VMSTAT_FLTVNPAGES(30);
4174 #ifdef DEBUG
4175 				for (i = 0; i < pages; i++) {
4176 					ASSERT(PAGE_LOCKED(ppa[i]));
4177 					ASSERT(!PP_ISFREE(ppa[i]));
4178 					ASSERT(page_pptonum(ppa[i]) ==
4179 					    pfn + i);
4180 					ASSERT(ppa[i]->p_szc == szc);
4181 					ASSERT(ppa[i]->p_vnode == vp);
4182 					ASSERT(ppa[i]->p_offset ==
4183 					    off + (i << PAGESHIFT));
4184 				}
4185 #endif /* DEBUG */
4186 				/*
4187 				 * All pages are of the szc we need and they
4188 				 * are all locked, so they can't change szc.
4189 				 * Load the translations.
4190 				 *
4191 				 * If the page got promoted since the last
4192 				 * check we don't need pplist anymore.
4193 				 */
4194 				if (pplist != NULL) {
4195 					page_free_replacement_page(pplist);
4196 					page_create_putback(pages);
4197 				}
4198 				if (PP_ISMIGRATE(ppa[0])) {
4199 					page_migrate(seg, a, ppa, pages);
4200 				}
4201 				SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4202 				    prot, vpprot);
4203 				if (!xhat) {
4204 					hat_memload_array_region(hat, a, pgsz,
4205 					    ppa, prot & vpprot, hat_flag,
4206 					    svd->rcookie);
4207 				} else {
4208 					/*
4209 					 * avoid large xhat mappings to FS
4210 					 * pages so that hat_page_demote()
4211 					 * doesn't need to check for xhat
4212 					 * large mappings.
4213 					 * Don't use regions with xhats.
4214 					 */
4215 					for (i = 0; i < pages; i++) {
4216 						hat_memload(hat,
4217 						    a + (i << PAGESHIFT),
4218 						    ppa[i], prot & vpprot,
4219 						    hat_flag);
4220 					}
4221 				}
4222 
4223 				if (!(hat_flag & HAT_LOAD_LOCK)) {
4224 					for (i = 0; i < pages; i++) {
4225 						page_unlock(ppa[i]);
4226 					}
4227 				}
4228 				if (amp != NULL) {
4229 					anon_array_exit(&an_cookie);
4230 					ANON_LOCK_EXIT(&amp->a_rwlock);
4231 				}
4232 				goto next;
4233 			}
4234 
4235 			/*
4236 			 * See if upsize is possible.
4237 			 */
4238 			if (pszc > szc && szc < seg->s_szc &&
4239 			    (segvn_anypgsz_vnode || pszc >= seg->s_szc)) {
4240 				pgcnt_t aphase;
4241 				uint_t pszc1 = MIN(pszc, seg->s_szc);
4242 				ppgsz = page_get_pagesize(pszc1);
4243 				ppages = btop(ppgsz);
4244 				aphase = btop(P2PHASE((uintptr_t)a, ppgsz));
4245 
4246 				ASSERT(type != F_SOFTLOCK);
4247 
4248 				SEGVN_VMSTAT_FLTVNPAGES(31);
4249 				if (aphase != P2PHASE(pfn, ppages)) {
4250 					segvn_faultvnmpss_align_err4++;
4251 				} else {
4252 					SEGVN_VMSTAT_FLTVNPAGES(32);
4253 					if (pplist != NULL) {
4254 						page_t *pl = pplist;
4255 						page_free_replacement_page(pl);
4256 						page_create_putback(pages);
4257 					}
4258 					for (i = 0; i < pages; i++) {
4259 						page_unlock(ppa[i]);
4260 					}
4261 					if (amp != NULL) {
4262 						anon_array_exit(&an_cookie);
4263 						ANON_LOCK_EXIT(&amp->a_rwlock);
4264 					}
4265 					pszc = pszc1;
4266 					ierr = -2;
4267 					break;
4268 				}
4269 			}
4270 
4271 			/*
4272 			 * check if we should use smallest mapping size.
4273 			 */
4274 			upgrdfail = 0;
4275 			if (szc == 0 || xhat ||
4276 			    (pszc >= szc &&
4277 			    !IS_P2ALIGNED(pfn, pages)) ||
4278 			    (pszc < szc &&
4279 			    !segvn_full_szcpages(ppa, szc, &upgrdfail,
4280 			    &pszc))) {
4281 
4282 				if (upgrdfail && type != F_SOFTLOCK) {
4283 					/*
4284 					 * segvn_full_szcpages failed to lock
4285 					 * all pages EXCL. Size down.
4286 					 */
4287 					ASSERT(pszc < szc);
4288 
4289 					SEGVN_VMSTAT_FLTVNPAGES(33);
4290 
4291 					if (pplist != NULL) {
4292 						page_t *pl = pplist;
4293 						page_free_replacement_page(pl);
4294 						page_create_putback(pages);
4295 					}
4296 
4297 					for (i = 0; i < pages; i++) {
4298 						page_unlock(ppa[i]);
4299 					}
4300 					if (amp != NULL) {
4301 						anon_array_exit(&an_cookie);
4302 						ANON_LOCK_EXIT(&amp->a_rwlock);
4303 					}
4304 					ierr = -1;
4305 					break;
4306 				}
4307 				if (szc != 0 && !xhat && !upgrdfail) {
4308 					segvn_faultvnmpss_align_err5++;
4309 				}
4310 				SEGVN_VMSTAT_FLTVNPAGES(34);
4311 				if (pplist != NULL) {
4312 					page_free_replacement_page(pplist);
4313 					page_create_putback(pages);
4314 				}
4315 				SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4316 				    prot, vpprot);
4317 				if (upgrdfail && segvn_anypgsz_vnode) {
4318 					/* SOFTLOCK case */
4319 					hat_memload_array_region(hat, a, pgsz,
4320 					    ppa, prot & vpprot, hat_flag,
4321 					    svd->rcookie);
4322 				} else {
4323 					for (i = 0; i < pages; i++) {
4324 						hat_memload_region(hat,
4325 						    a + (i << PAGESHIFT),
4326 						    ppa[i], prot & vpprot,
4327 						    hat_flag, svd->rcookie);
4328 					}
4329 				}
4330 				if (!(hat_flag & HAT_LOAD_LOCK)) {
4331 					for (i = 0; i < pages; i++) {
4332 						page_unlock(ppa[i]);
4333 					}
4334 				}
4335 				if (amp != NULL) {
4336 					anon_array_exit(&an_cookie);
4337 					ANON_LOCK_EXIT(&amp->a_rwlock);
4338 				}
4339 				goto next;
4340 			}
4341 
4342 			if (pszc == szc) {
4343 				/*
4344 				 * segvn_full_szcpages() upgraded pages szc.
4345 				 */
4346 				ASSERT(pszc == ppa[0]->p_szc);
4347 				ASSERT(IS_P2ALIGNED(pfn, pages));
4348 				goto chkszc;
4349 			}
4350 
4351 			if (pszc > szc) {
4352 				kmutex_t *szcmtx;
4353 				SEGVN_VMSTAT_FLTVNPAGES(35);
4354 				/*
4355 				 * p_szc of ppa[0] can change since we haven't
4356 				 * locked all constituent pages.  Call
4357 				 * page_szc_lock() to prevent szc changes.
4358 				 * This should be a rare case that happens when
4359 				 * multiple segments use different page sizes
4360 				 * to map the same file offsets.
4361 				 */
4362 				szcmtx = page_szc_lock(ppa[0]);
4363 				pszc = ppa[0]->p_szc;
4364 				ASSERT(szcmtx != NULL || pszc == 0);
4365 				ASSERT(ppa[0]->p_szc <= pszc);
4366 				if (pszc <= szc) {
4367 					SEGVN_VMSTAT_FLTVNPAGES(36);
4368 					if (szcmtx != NULL) {
4369 						mutex_exit(szcmtx);
4370 					}
4371 					goto chkszc;
4372 				}
4373 				if (pplist != NULL) {
4374 					/*
4375 					 * The page got promoted since the last
4376 					 * check.  We don't need the
4377 					 * preallocated large page anymore.
4378 					 */
4379 					SEGVN_VMSTAT_FLTVNPAGES(37);
4380 					page_free_replacement_page(pplist);
4381 					page_create_putback(pages);
4382 				}
4383 				SEGVN_UPDATE_MODBITS(ppa, pages, rw,
4384 				    prot, vpprot);
4385 				hat_memload_array_region(hat, a, pgsz, ppa,
4386 				    prot & vpprot, hat_flag, svd->rcookie);
4387 				mutex_exit(szcmtx);
4388 				if (!(hat_flag & HAT_LOAD_LOCK)) {
4389 					for (i = 0; i < pages; i++) {
4390 						page_unlock(ppa[i]);
4391 					}
4392 				}
4393 				if (amp != NULL) {
4394 					anon_array_exit(&an_cookie);
4395 					ANON_LOCK_EXIT(&amp->a_rwlock);
4396 				}
4397 				goto next;
4398 			}
4399 
4400 			/*
4401 			 * If the page got demoted since the last check we may
4402 			 * not have allocated a large replacement page yet.
4403 			 * Allocate one now.
4404 			 */
4405 			if (pplist == NULL &&
4406 			    page_alloc_pages(vp, seg, a, &pplist, NULL,
4407 			    szc, 0, 0) && type != F_SOFTLOCK) {
4408 				SEGVN_VMSTAT_FLTVNPAGES(38);
4409 				for (i = 0; i < pages; i++) {
4410 					page_unlock(ppa[i]);
4411 				}
4412 				if (amp != NULL) {
4413 					anon_array_exit(&an_cookie);
4414 					ANON_LOCK_EXIT(&amp->a_rwlock);
4415 				}
4416 				ierr = -1;
4417 				alloc_failed |= (1 << szc);
4418 				break;
4419 			}
4420 
4421 			SEGVN_VMSTAT_FLTVNPAGES(39);
4422 
4423 			if (pplist != NULL) {
4424 				segvn_relocate_pages(ppa, pplist);
4425 #ifdef DEBUG
4426 			} else {
4427 				ASSERT(type == F_SOFTLOCK);
4428 				SEGVN_VMSTAT_FLTVNPAGES(40);
4429 #endif /* DEBUG */
4430 			}
4431 
4432 			SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);
4433 
4434 			if (pplist == NULL && segvn_anypgsz_vnode == 0) {
4435 				ASSERT(type == F_SOFTLOCK);
4436 				for (i = 0; i < pages; i++) {
4437 					ASSERT(ppa[i]->p_szc < szc);
4438 					hat_memload_region(hat,
4439 					    a + (i << PAGESHIFT),
4440 					    ppa[i], prot & vpprot, hat_flag,
4441 					    svd->rcookie);
4442 				}
4443 			} else {
4444 				ASSERT(pplist != NULL || type == F_SOFTLOCK);
4445 				hat_memload_array_region(hat, a, pgsz, ppa,
4446 				    prot & vpprot, hat_flag, svd->rcookie);
4447 			}
4448 			if (!(hat_flag & HAT_LOAD_LOCK)) {
4449 				for (i = 0; i < pages; i++) {
4450 					ASSERT(PAGE_SHARED(ppa[i]));
4451 					page_unlock(ppa[i]);
4452 				}
4453 			}
4454 			if (amp != NULL) {
4455 				anon_array_exit(&an_cookie);
4456 				ANON_LOCK_EXIT(&amp->a_rwlock);
4457 			}
4458 
4459 		next:
4460 			if (vpage != NULL) {
4461 				vpage += pages;
4462 			}
4463 			adjszc_chk = 1;
4464 		}
4465 		if (a == lpgeaddr)
4466 			break;
4467 		ASSERT(a < lpgeaddr);
4468 
4469 		ASSERT(!brkcow && !tron && type != F_SOFTLOCK);
4470 
4471 		/*
4472 		 * ierr == -1 means we failed to map with a large page
4473 		 * (either due to allocation/relocation failures or
4474 		 * misalignment with other mappings to this file).
4475 		 *
4476 		 * ierr == -2 means some other thread allocated a large page
4477 		 * after we gave up trying to map with a large page.  Retry
4478 		 * with a larger mapping.
4479 		 */
4480 		ASSERT(ierr == -1 || ierr == -2);
4481 		ASSERT(ierr == -2 || szc != 0);
4482 		ASSERT(ierr == -1 || szc < seg->s_szc);
4483 		if (ierr == -2) {
4484 			SEGVN_VMSTAT_FLTVNPAGES(41);
4485 			ASSERT(pszc > szc && pszc <= seg->s_szc);
4486 			szc = pszc;
4487 		} else if (segvn_anypgsz_vnode) {
4488 			SEGVN_VMSTAT_FLTVNPAGES(42);
4489 			szc--;
4490 		} else {
4491 			SEGVN_VMSTAT_FLTVNPAGES(43);
4492 			ASSERT(pszc < szc);
4493 			/*
4494 			 * Another process created a pszc large page,
4495 			 * but we still have to drop to szc 0.
4496 			 */
4497 			szc = 0;
4498 		}
4499 
4500 		pgsz = page_get_pagesize(szc);
4501 		pages = btop(pgsz);
4502 		if (ierr == -2) {
4503 			/*
4504 			 * Size up case. Note lpgaddr may only be needed for
4505 			 * softlock case so we don't adjust it here.
4506 			 */
4507 			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
4508 			ASSERT(a >= lpgaddr);
4509 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4510 			off = svd->offset + (uintptr_t)(a - seg->s_base);
4511 			aindx = svd->anon_index + seg_page(seg, a);
4512 			vpage = (svd->vpage != NULL) ?
4513 			    &svd->vpage[seg_page(seg, a)] : NULL;
4514 		} else {
4515 			/*
4516 			 * Size down case. Note lpgaddr may only be needed for
4517 			 * softlock case so we don't adjust it here.
4518 			 */
4519 			ASSERT(IS_P2ALIGNED(a, pgsz));
4520 			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
4521 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4522 			ASSERT(a < lpgeaddr);
4523 			if (a < addr) {
4524 				SEGVN_VMSTAT_FLTVNPAGES(44);
4525 				/*
4526 				 * The beginning of the large page region can
4527 				 * be pulled to the right to make a smaller
4528 				 * region. We haven't yet faulted a single
4529 				 * page.
4530 				 */
4531 				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
4532 				ASSERT(a >= lpgaddr);
4533 				off = svd->offset +
4534 				    (uintptr_t)(a - seg->s_base);
4535 				aindx = svd->anon_index + seg_page(seg, a);
4536 				vpage = (svd->vpage != NULL) ?
4537 				    &svd->vpage[seg_page(seg, a)] : NULL;
4538 			}
4539 		}
4540 	}
4541 out:
4542 	kmem_free(ppa, ppasize);
4543 	if (!err && !vop_size_err) {
4544 		SEGVN_VMSTAT_FLTVNPAGES(45);
4545 		return (0);
4546 	}
4547 	if (type == F_SOFTLOCK && a > lpgaddr) {
4548 		SEGVN_VMSTAT_FLTVNPAGES(46);
4549 		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
4550 	}
4551 	if (!vop_size_err) {
4552 		SEGVN_VMSTAT_FLTVNPAGES(47);
4553 		return (err);
4554 	}
4555 	ASSERT(brkcow || tron || type == F_SOFTLOCK);
4556 	/*
4557 	 * The large page end is mapped beyond the end of the file and it's a
4558 	 * COW fault (possibly a text replication induced COW) or a softlock,
4559 	 * so we can't reduce the map area.  For now just demote the segment.
4560 	 * This should really only happen if the end of the file changed after
4561 	 * the mapping was established, since when large page segments are
4562 	 * created we make sure they don't extend beyond the end of the file.
4563 	 */
4564 	SEGVN_VMSTAT_FLTVNPAGES(48);
4565 
4566 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4567 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
4568 	err = 0;
4569 	if (seg->s_szc != 0) {
4570 		segvn_fltvnpages_clrszc_cnt++;
4571 		ASSERT(svd->softlockcnt == 0);
4572 		err = segvn_clrszc(seg);
4573 		if (err != 0) {
4574 			segvn_fltvnpages_clrszc_err++;
4575 		}
4576 	}
4577 	ASSERT(err || seg->s_szc == 0);
4578 	SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock);
4579 	/* segvn_fault will do its job as if szc had been zero to begin with */
4580 	return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err));
4581 }
4582 
4583 /*
4584  * This routine will attempt to fault in one large page.
4585  * It will use smaller pages if that fails.
4586  * It should only be called for pure anonymous segments.
4587  */
4588 static faultcode_t
4589 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr,
4590     caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr,
4591     caddr_t eaddr, int brkcow)
4592 {
4593 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
4594 	struct anon_map *amp = svd->amp;
4595 	uchar_t segtype = svd->type;
4596 	uint_t szc = seg->s_szc;
4597 	size_t pgsz = page_get_pagesize(szc);
4598 	size_t maxpgsz = pgsz;
4599 	pgcnt_t pages = btop(pgsz);
4600 	uint_t ppaszc = szc;
4601 	caddr_t a = lpgaddr;
4602 	ulong_t aindx = svd->anon_index + seg_page(seg, a);
4603 	struct vpage *vpage = (svd->vpage != NULL) ?
4604 	    &svd->vpage[seg_page(seg, a)] : NULL;
4605 	page_t **ppa;
4606 	uint_t	ppa_szc;
4607 	faultcode_t err;
4608 	int ierr;
4609 	uint_t protchk, prot, vpprot;
4610 	ulong_t i;
4611 	int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
4612 	anon_sync_obj_t cookie;
4613 	int first = 1;
4614 	int adjszc_chk;
4615 	int purged = 0;
4616 	int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0;
4617 
4618 	ASSERT(szc != 0);
4619 	ASSERT(amp != NULL);
4620 	ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
4621 	ASSERT(!(svd->flags & MAP_NORESERVE));
4622 	ASSERT(type != F_SOFTUNLOCK);
4623 	ASSERT(IS_P2ALIGNED(a, maxpgsz));
4624 	ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF);
4625 	ASSERT(svd->tr_state != SEGVN_TR_INIT);
4626 
4627 	ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
4628 
4629 	VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]);
4630 	VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]);
4631 
4632 	if (svd->flags & MAP_TEXT) {
4633 		hat_flag |= HAT_LOAD_TEXT;
4634 	}
4635 
4636 	if (svd->pageprot) {
4637 		switch (rw) {
4638 		case S_READ:
4639 			protchk = PROT_READ;
4640 			break;
4641 		case S_WRITE:
4642 			protchk = PROT_WRITE;
4643 			break;
4644 		case S_EXEC:
4645 			protchk = PROT_EXEC;
4646 			break;
4647 		case S_OTHER:
4648 		default:
4649 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
4650 			break;
4651 		}
4652 		VM_STAT_ADD(segvnvmstats.fltanpages[2]);
4653 	} else {
4654 		prot = svd->prot;
4655 		/* caller has already done segment level protection check. */
4656 	}
4657 
4658 	ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP);
4659 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
4660 	for (;;) {
4661 		adjszc_chk = 0;
4662 		for (; a < lpgeaddr; a += pgsz, aindx += pages) {
4663 			if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
4664 				VM_STAT_ADD(segvnvmstats.fltanpages[3]);
4665 				ASSERT(vpage != NULL);
4666 				prot = VPP_PROT(vpage);
4667 				ASSERT(sameprot(seg, a, maxpgsz));
4668 				if ((prot & protchk) == 0) {
4669 					err = FC_PROT;
4670 					goto error;
4671 				}
4672 			}
4673 			if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) &&
4674 			    pgsz < maxpgsz) {
4675 				ASSERT(a > lpgaddr);
4676 				szc = seg->s_szc;
4677 				pgsz = maxpgsz;
4678 				pages = btop(pgsz);
4679 				ASSERT(IS_P2ALIGNED(aindx, pages));
4680 				lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr,
4681 				    pgsz);
4682 			}
4683 			if (type == F_SOFTLOCK && svd->vp != NULL) {
4684 				mutex_enter(&freemem_lock);
4685 				if (availrmem < tune.t_minarmem + pages) {
4686 					mutex_exit(&freemem_lock);
4687 					err = FC_MAKE_ERR(ENOMEM);
4688 					goto error;
4689 				} else {
4690 					availrmem -= pages;
4691 					segvn_pages_locked += pages;
4692 					svd->softlockcnt += pages;
4693 				}
4694 				mutex_exit(&freemem_lock);
4695 			}
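			/*
			 * anon_map_getpages() returns the constituent pages
			 * of the large page (creating or COWing anon pages
			 * as needed) and reports the page size it found in
			 * ppa_szc; ierr of -1 or -2 asks us to size the
			 * mapping down or up, a positive ierr is a real
			 * error.
			 */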
4696 			anon_array_enter(amp, aindx, &cookie);
4697 			ppa_szc = (uint_t)-1;
4698 			ierr = anon_map_getpages(amp, aindx, szc, seg, a,
4699 			    prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow,
4700 			    segvn_anypgsz, pgflags, svd->cred);
4701 			if (ierr != 0) {
4702 				anon_array_exit(&cookie);
4703 				VM_STAT_ADD(segvnvmstats.fltanpages[4]);
4704 				if (type == F_SOFTLOCK && svd->vp != NULL) {
4705 					VM_STAT_ADD(segvnvmstats.fltanpages[5]);
4706 					mutex_enter(&freemem_lock);
4707 					availrmem += pages;
4708 					segvn_pages_locked -= pages;
4709 					svd->softlockcnt -= pages;
4710 					mutex_exit(&freemem_lock);
4711 				}
4712 				if (ierr > 0) {
4713 					VM_STAT_ADD(segvnvmstats.fltanpages[6]);
4714 					err = FC_MAKE_ERR(ierr);
4715 					goto error;
4716 				}
4717 				break;
4718 			}
4719 
4720 			ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
4721 
4722 			ASSERT(segtype == MAP_SHARED ||
4723 			    ppa[0]->p_szc <= szc);
4724 			ASSERT(segtype == MAP_PRIVATE ||
4725 			    ppa[0]->p_szc >= szc);
4726 
4727 			/*
4728 			 * Handle pages that have been marked for migration
4729 			 */
4730 			if (lgrp_optimizations())
4731 				page_migrate(seg, a, ppa, pages);
4732 
4733 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
4734 			if (type == F_SOFTLOCK && svd->vp == NULL) {
4735 				/*
4736 				 * If all pages in the ppa array belong to the
4737 				 * same large page, call segvn_slock_anonpages()
4738 				 * just for ppa[0].
4739 				 */
4740 				for (i = 0; i < pages; i++) {
4741 					if (!segvn_slock_anonpages(ppa[i],
4742 					    i == 0 && first)) {
4743 						ulong_t j;
4744 						for (j = 0; j < i; j++) {
4745 							segvn_sunlock_anonpages(
4746 							    ppa[j], j == 0 &&
4747 							    first);
4748 							page_unlock(ppa[j]);
4749 						}
4750 						for (j = i; j < pages; j++) {
4751 							page_unlock(ppa[j]);
4752 						}
4753 						anon_array_exit(&cookie);
4754 						err = FC_MAKE_ERR(ENOMEM);
4755 						goto error;
4756 					}
4757 					if (i == 0 && ppa[0]->p_szc >= szc) {
4758 						ASSERT(!(page_pptonum(ppa[0]) &
4759 						    (pages - 1)));
4760 						break;
4761 					}
4762 				}
4763 				first = 0;
4764 				mutex_enter(&freemem_lock);
4765 				svd->softlockcnt += pages;
4766 				segvn_pages_locked += pages;
4767 				mutex_exit(&freemem_lock);
4768 			}
4769 
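			/*
			 * Shared anonymous mappings are never copy-on-write,
			 * so don't withhold write permission from the
			 * translation.
			 */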
4770 			if (segtype == MAP_SHARED) {
4771 				vpprot |= PROT_WRITE;
4772 			}
4773 
4774 			hat_memload_array(hat, a, pgsz, ppa,
4775 			    prot & vpprot, hat_flag);
4776 
4777 			if (hat_flag & HAT_LOAD_LOCK) {
4778 				VM_STAT_ADD(segvnvmstats.fltanpages[7]);
4779 			} else {
4780 				VM_STAT_ADD(segvnvmstats.fltanpages[8]);
4781 				for (i = 0; i < pages; i++)
4782 					page_unlock(ppa[i]);
4783 			}
4784 			if (vpage != NULL)
4785 				vpage += pages;
4786 
4787 			anon_array_exit(&cookie);
4788 			adjszc_chk = 1;
4789 		}
4790 		if (a == lpgeaddr)
4791 			break;
4792 		ASSERT(a < lpgeaddr);
4793 		/*
4794 		 * ierr == -1 means we failed to allocate a large page,
4795 		 * so do a size down operation.
4796 		 *
4797 		 * ierr == -2 means some other process that privately shares
4798 		 * pages with this process has allocated a larger page and we
4799 		 * need to retry with larger pages.  So do a size up
4800 		 * operation.  This relies on the fact that large pages are
4801 		 * never partially shared, i.e. if we share any constituent
4802 		 * page of a large page with another process we must share the
4803 		 * entire large page.  Note this cannot happen for the SOFTLOCK
4804 		 * case, unless the current address (a) is at the beginning of
4805 		 * the next page size boundary, because the other process
4806 		 * couldn't have relocated locked pages.
4807 		 */
4808 		ASSERT(ierr == -1 || ierr == -2);
4809 		/*
4810 		 * For the very first relocation failure try to purge this
4811 		 * segment's cache so that the relocator can obtain an
4812 		 * exclusive lock on pages we want to relocate.
4813 		 */
4814 		if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 &&
4815 		    svd->softlockcnt != 0) {
4816 			purged = 1;
4817 			segvn_purge(seg);
4818 			continue;
4819 		}
4820 
4821 		if (segvn_anypgsz) {
4822 			ASSERT(ierr == -2 || szc != 0);
4823 			ASSERT(ierr == -1 || szc < seg->s_szc);
4824 			szc = (ierr == -1) ? szc - 1 : szc + 1;
4825 		} else {
4826 			/*
4827 			 * For non-COW faults and segvn_anypgsz == 0
4828 			 * we need to be careful not to loop forever
4829 			 * if an existing page is found with a szc other
4830 			 * than 0 or seg->s_szc.  This could be due
4831 			 * to page relocations on behalf of DR or,
4832 			 * more likely, large page creation.  For this
4833 			 * case simply re-size to the existing page's szc
4834 			 * as returned by anon_map_getpages().
4835 			 */
4836 			if (ppa_szc == (uint_t)-1) {
4837 				szc = (ierr == -1) ? 0 : seg->s_szc;
4838 			} else {
4839 				ASSERT(ppa_szc <= seg->s_szc);
4840 				ASSERT(ierr == -2 || ppa_szc < szc);
4841 				ASSERT(ierr == -1 || ppa_szc > szc);
4842 				szc = ppa_szc;
4843 			}
4844 		}
4845 
4846 		pgsz = page_get_pagesize(szc);
4847 		pages = btop(pgsz);
4848 		ASSERT(type != F_SOFTLOCK || ierr == -1 ||
4849 		    (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
4850 		if (type == F_SOFTLOCK) {
4851 			/*
4852 			 * For softlocks we cannot reduce the fault area
4853 			 * (calculated based on the largest page size for this
4854 			 * segment) for a size down, and a is already aligned
4855 			 * to the next page size, as asserted above, for size
4856 			 * ups.  Therefore just continue in the softlock case.
4857 			 */
4858 			VM_STAT_ADD(segvnvmstats.fltanpages[9]);
4859 			continue; /* keep lint happy */
4860 		} else if (ierr == -2) {
4861 
4862 			/*
4863 			 * Size up case. Note lpgaddr may only be needed for
4864 			 * softlock case so we don't adjust it here.
4865 			 */
4866 			VM_STAT_ADD(segvnvmstats.fltanpages[10]);
4867 			a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
4868 			ASSERT(a >= lpgaddr);
4869 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4870 			aindx = svd->anon_index + seg_page(seg, a);
4871 			vpage = (svd->vpage != NULL) ?
4872 			    &svd->vpage[seg_page(seg, a)] : NULL;
4873 		} else {
4874 			/*
4875 			 * Size down case. Note lpgaddr may only be needed for
4876 			 * softlock case so we don't adjust it here.
4877 			 */
4878 			VM_STAT_ADD(segvnvmstats.fltanpages[11]);
4879 			ASSERT(IS_P2ALIGNED(a, pgsz));
4880 			ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
4881 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
4882 			ASSERT(a < lpgeaddr);
4883 			if (a < addr) {
4884 				/*
4885 				 * The beginning of the large page region can
4886 				 * be pulled to the right to make a smaller
4887 				 * region. We haven't yet faulted a single
4888 				 * page.
4889 				 */
4890 				VM_STAT_ADD(segvnvmstats.fltanpages[12]);
4891 				a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
4892 				ASSERT(a >= lpgaddr);
4893 				aindx = svd->anon_index + seg_page(seg, a);
4894 				vpage = (svd->vpage != NULL) ?
4895 				    &svd->vpage[seg_page(seg, a)] : NULL;
4896 			}
4897 		}
4898 	}
4899 	VM_STAT_ADD(segvnvmstats.fltanpages[13]);
4900 	ANON_LOCK_EXIT(&amp->a_rwlock);
4901 	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
4902 	return (0);
4903 error:
4904 	VM_STAT_ADD(segvnvmstats.fltanpages[14]);
4905 	ANON_LOCK_EXIT(&amp->a_rwlock);
4906 	kmem_cache_free(segvn_szc_cache[ppaszc], ppa);
4907 	if (type == F_SOFTLOCK && a > lpgaddr) {
4908 		VM_STAT_ADD(segvnvmstats.fltanpages[15]);
4909 		segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER);
4910 	}
4911 	return (err);
4912 }
4913 
4914 int fltadvice = 1;	/* set to free behind pages for sequential access */
4915 
4916 /*
4917  * This routine is called via a machine specific fault handling routine.
4918  * It is also called by software routines wishing to lock or unlock
4919  * a range of addresses.
4920  *
4921  * Here is the basic algorithm:
4922  *	If unlocking
4923  *		Call segvn_softunlock
4924  *		Return
4925  *	endif
4926  *	Checking and set up work
4927  *	If we will need some non-anonymous pages
4928  *		Call VOP_GETPAGE over the range of non-anonymous pages
4929  *	endif
4930  *	Loop over all addresses requested
4931  *		Call segvn_faultpage passing in page list
4932  *		    to load up translations and handle anonymous pages
4933  *	endloop
4934  *	Load up translation to any additional pages in page list not
4935  *	    already handled that fit into this segment
4936  *	    already handled that fit into this segment
 */
4937 static faultcode_t
4938 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
4939     enum fault_type type, enum seg_rw rw)
4940 {
4941 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
4942 	page_t **plp, **ppp, *pp;
4943 	u_offset_t off;
4944 	caddr_t a;
4945 	struct vpage *vpage;
4946 	uint_t vpprot, prot;
4947 	int err;
4948 	page_t *pl[PVN_GETPAGE_NUM + 1];
4949 	size_t plsz, pl_alloc_sz;
4950 	size_t page;
4951 	ulong_t anon_index;
4952 	struct anon_map *amp;
4953 	int dogetpage = 0;
4954 	caddr_t	lpgaddr, lpgeaddr;
4955 	size_t pgsz;
4956 	anon_sync_obj_t cookie;
4957 	int brkcow = BREAK_COW_SHARE(rw, type, svd->type);
4958 
4959 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
4960 	ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE);
4961 
4962 	/*
4963 	 * First handle the easy stuff
4964 	 */
4965 	if (type == F_SOFTUNLOCK) {
4966 		if (rw == S_READ_NOCOW) {
4967 			rw = S_READ;
4968 			ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
4969 		}
4970 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
4971 		pgsz = (seg->s_szc == 0) ? PAGESIZE :
4972 		    page_get_pagesize(seg->s_szc);
4973 		VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]);
4974 		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
4975 		segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw);
4976 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4977 		return (0);
4978 	}
4979 
4980 	ASSERT(svd->tr_state == SEGVN_TR_OFF ||
4981 	    !HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
4982 	if (brkcow == 0) {
4983 		if (svd->tr_state == SEGVN_TR_INIT) {
4984 			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
4985 			if (svd->tr_state == SEGVN_TR_INIT) {
4986 				ASSERT(svd->vp != NULL && svd->amp == NULL);
4987 				ASSERT(svd->flags & MAP_TEXT);
4988 				ASSERT(svd->type == MAP_PRIVATE);
4989 				segvn_textrepl(seg);
4990 				ASSERT(svd->tr_state != SEGVN_TR_INIT);
4991 				ASSERT(svd->tr_state != SEGVN_TR_ON ||
4992 				    svd->amp != NULL);
4993 			}
4994 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
4995 		}
4996 	} else if (svd->tr_state != SEGVN_TR_OFF) {
4997 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
4998 
4999 		if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) {
5000 			ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
5001 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5002 			return (FC_PROT);
5003 		}
5004 
5005 		if (svd->tr_state == SEGVN_TR_ON) {
5006 			ASSERT(svd->vp != NULL && svd->amp != NULL);
5007 			segvn_textunrepl(seg, 0);
5008 			ASSERT(svd->amp == NULL &&
5009 			    svd->tr_state == SEGVN_TR_OFF);
5010 		} else if (svd->tr_state != SEGVN_TR_OFF) {
5011 			svd->tr_state = SEGVN_TR_OFF;
5012 		}
5013 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
5014 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5015 	}
5016 
5017 top:
5018 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
5019 
5020 	/*
5021 	 * If we have the same protections for the entire segment,
5022 	 * ensure that the access being attempted is legitimate.
5023 	 */
5024 
5025 	if (svd->pageprot == 0) {
5026 		uint_t protchk;
5027 
5028 		switch (rw) {
5029 		case S_READ:
5030 		case S_READ_NOCOW:
5031 			protchk = PROT_READ;
5032 			break;
5033 		case S_WRITE:
5034 			protchk = PROT_WRITE;
5035 			break;
5036 		case S_EXEC:
5037 			protchk = PROT_EXEC;
5038 			break;
5039 		case S_OTHER:
5040 		default:
5041 			protchk = PROT_READ | PROT_WRITE | PROT_EXEC;
5042 			break;
5043 		}
5044 
5045 		if ((svd->prot & protchk) == 0) {
5046 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5047 			return (FC_PROT);	/* illegal access type */
5048 		}
5049 	}
5050 
5051 	if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
5052 		/* this must be a SOFTLOCK S_READ fault */
5053 		ASSERT(svd->amp == NULL);
5054 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
5055 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5056 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5057 		if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
5058 			/*
5059 			 * This must be the first ever non-S_READ_NOCOW
5060 			 * softlock for this segment.
5061 			 */
5062 			ASSERT(svd->softlockcnt == 0);
5063 			hat_leave_region(seg->s_as->a_hat, svd->rcookie,
5064 			    HAT_REGION_TEXT);
5065 			svd->rcookie = HAT_INVALID_REGION_COOKIE;
5066 		}
5067 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5068 		goto top;
5069 	}
5070 
5071 	/*
5072 	 * We can't allow the long term use of softlocks for vmpss segments,
5073 	 * because in some file truncation cases we should be able to demote
5074 	 * the segment, which requires that there are no softlocks.  The
5075 	 * only case where it's ok to allow a SOFTLOCK fault against a vmpss
5076 	 * segment is S_READ_NOCOW, where the caller holds the address space
5077 	 * locked as writer and calls softunlock before dropping the as lock.
5078 	 * S_READ_NOCOW is used by /proc to read memory from another user.
5079 	 *
5080 	 * Another deadlock between SOFTLOCK and file truncation can happen
5081 	 * because segvn_fault_vnodepages() calls the FS one pagesize at
5082 	 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages()
5083 	 * can cause a deadlock because the first set of page_t's remain
5084 	 * locked SE_SHARED.  To avoid this, we demote segments on a first
5085 	 * SOFTLOCK if they have a length greater than the segment's
5086 	 * page size.
5087 	 *
5088 	 * So for now, we only avoid demoting a segment on a SOFTLOCK when
5089 	 * the access type is S_READ_NOCOW and the fault length is less than
5090 	 * or equal to the segment's page size. While this is quite restrictive,
5091 	 * it should be the most common case of SOFTLOCK against a vmpss
5092 	 * segment.
5093 	 *
5094 	 * For S_READ_NOCOW, it's safe not to do a copy on write because the
5095 	 * caller makes sure no COW will be caused by another thread for a
5096 	 * softlocked page.
5097 	 */
5098 	if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) {
5099 		int demote = 0;
5100 
5101 		if (rw != S_READ_NOCOW) {
5102 			demote = 1;
5103 		}
5104 		if (!demote && len > PAGESIZE) {
5105 			pgsz = page_get_pagesize(seg->s_szc);
5106 			CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr,
5107 			    lpgeaddr);
5108 			if (lpgeaddr - lpgaddr > pgsz) {
5109 				demote = 1;
5110 			}
5111 		}
5112 
5113 		ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
5114 
5115 		if (demote) {
5116 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5117 			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5118 			if (seg->s_szc != 0) {
5119 				segvn_vmpss_clrszc_cnt++;
5120 				ASSERT(svd->softlockcnt == 0);
5121 				err = segvn_clrszc(seg);
5122 				if (err) {
5123 					segvn_vmpss_clrszc_err++;
5124 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5125 					return (FC_MAKE_ERR(err));
5126 				}
5127 			}
5128 			ASSERT(seg->s_szc == 0);
5129 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5130 			goto top;
5131 		}
5132 	}
5133 
5134 	/*
5135 	 * Check to see if we need to allocate an anon_map structure.
5136 	 */
5137 	if (svd->amp == NULL && (svd->vp == NULL || brkcow)) {
5138 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
5139 		/*
5140 		 * Drop the "read" lock on the segment and acquire
5141 		 * the "write" version since we have to allocate the
5142 		 * anon_map.
5143 		 */
5144 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5145 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5146 
5147 		if (svd->amp == NULL) {
5148 			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
5149 			svd->amp->a_szc = seg->s_szc;
5150 		}
5151 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5152 
5153 		/*
5154 		 * Start all over again since segment protections
5155 		 * may have changed after we dropped the "read" lock.
5156 		 */
5157 		goto top;
5158 	}
5159 
5160 	/*
5161 	 * S_READ_NOCOW vs S_READ distinction was
5162 	 * only needed for the code above. After
5163 	 * that we treat it as S_READ.
5164 	 */
5165 	if (rw == S_READ_NOCOW) {
5166 		ASSERT(type == F_SOFTLOCK);
5167 		ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
5168 		rw = S_READ;
5169 	}
5170 
5171 	amp = svd->amp;
5172 
5173 	/*
5174 	 * MADV_SEQUENTIAL work is ignored for large page segments.
5175 	 */
5176 	if (seg->s_szc != 0) {
5177 		pgsz = page_get_pagesize(seg->s_szc);
5178 		ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
5179 		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
5180 		if (svd->vp == NULL) {
5181 			err = segvn_fault_anonpages(hat, seg, lpgaddr,
5182 			    lpgeaddr, type, rw, addr, addr + len, brkcow);
5183 		} else {
5184 			err = segvn_fault_vnodepages(hat, seg, lpgaddr,
5185 			    lpgeaddr, type, rw, addr, addr + len, brkcow);
5186 			if (err == IE_RETRY) {
5187 				ASSERT(seg->s_szc == 0);
5188 				ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock));
5189 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5190 				goto top;
5191 			}
5192 		}
5193 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5194 		return (err);
5195 	}
5196 
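	/*
	 * From here on we are handling the fault with base pages
	 * (seg->s_szc == 0); the large page cases returned above.
	 */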
5197 	page = seg_page(seg, addr);
5198 	if (amp != NULL) {
5199 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
5200 		anon_index = svd->anon_index + page;
5201 
5202 		if (type == F_PROT && rw == S_READ &&
5203 		    svd->tr_state == SEGVN_TR_OFF &&
5204 		    svd->type == MAP_PRIVATE && svd->pageprot == 0) {
5205 			size_t index = anon_index;
5206 			struct anon *ap;
5207 
5208 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5209 			/*
5210 			 * The fast path could apply to S_WRITE also, except
5211 			 * that the protection fault could be caused by a lazy
5212 			 * TLB flush when going RO -> RW.  In that case the PTE
5213 			 * is already RW, but a stale RO entry in another CPU's
5214 			 * TLB causes the fault.  Since hat_chgprot() does nothing
5215 			 * if the PTE doesn't change, we may end up faulting
5216 			 * indefinitely until the RO TLB entry gets replaced.
5217 			 */
5218 			for (a = addr; a < addr + len; a += PAGESIZE, index++) {
5219 				anon_array_enter(amp, index, &cookie);
5220 				ap = anon_get_ptr(amp->ahp, index);
5221 				anon_array_exit(&cookie);
5222 				if ((ap == NULL) || (ap->an_refcnt != 1)) {
5223 					ANON_LOCK_EXIT(&amp->a_rwlock);
5224 					goto slow;
5225 				}
5226 			}
5227 			hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot);
5228 			ANON_LOCK_EXIT(&amp->a_rwlock);
5229 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5230 			return (0);
5231 		}
5232 	}
5233 slow:
5234 
5235 	if (svd->vpage == NULL)
5236 		vpage = NULL;
5237 	else
5238 		vpage = &svd->vpage[page];
5239 
5240 	off = svd->offset + (uintptr_t)(addr - seg->s_base);
5241 
5242 	/*
5243 	 * If MADV_SEQUENTIAL has been set for the particular page we
5244 	 * are faulting on, free behind all pages in the segment and put
5245 	 * them on the free list.
5246 	 */
5247 
5248 	if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) {
5249 		struct vpage *vpp;
5250 		ulong_t fanon_index;
5251 		size_t fpage;
5252 		u_offset_t pgoff, fpgoff;
5253 		struct vnode *fvp;
5254 		struct anon *fap = NULL;
5255 
5256 		if (svd->advice == MADV_SEQUENTIAL ||
5257 		    (svd->pageadvice &&
5258 		    VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) {
5259 			pgoff = off - PAGESIZE;
5260 			fpage = page - 1;
5261 			if (vpage != NULL)
5262 				vpp = &svd->vpage[fpage];
5263 			if (amp != NULL)
5264 				fanon_index = svd->anon_index + fpage;
5265 
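			/*
			 * Walk backwards from the page preceding the
			 * faulting page towards the start of the segment,
			 * pushing out pages that still carry sequential
			 * advice with an async, freeing VOP_PUTPAGE().
			 * The walk stops when the advice changes or a page
			 * can't be looked up; pages with lock or cow
			 * claims are simply skipped.
			 */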
5266 			while (pgoff > svd->offset) {
5267 				if (svd->advice != MADV_SEQUENTIAL &&
5268 				    (!svd->pageadvice || (vpage &&
5269 				    VPP_ADVICE(vpp) != MADV_SEQUENTIAL)))
5270 					break;
5271 
5272 				/*
5273 				 * If this is an anon page, we must find the
5274 				 * correct <vp, offset> for it
5275 				 */
5276 				fap = NULL;
5277 				if (amp != NULL) {
5278 					ANON_LOCK_ENTER(&amp->a_rwlock,
5279 					    RW_READER);
5280 					anon_array_enter(amp, fanon_index,
5281 					    &cookie);
5282 					fap = anon_get_ptr(amp->ahp,
5283 					    fanon_index);
5284 					if (fap != NULL) {
5285 						swap_xlate(fap, &fvp, &fpgoff);
5286 					} else {
5287 						fpgoff = pgoff;
5288 						fvp = svd->vp;
5289 					}
5290 					anon_array_exit(&cookie);
5291 					ANON_LOCK_EXIT(&amp->a_rwlock);
5292 				} else {
5293 					fpgoff = pgoff;
5294 					fvp = svd->vp;
5295 				}
5296 				if (fvp == NULL)
5297 					break;	/* XXX */
5298 				/*
5299 				 * Skip pages that are free or have an
5300 				 * "exclusive" lock.
5301 				 */
5302 				pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED);
5303 				if (pp == NULL)
5304 					break;
5305 				/*
5306 				 * We don't need the page_struct_lock for this
5307 				 * test since it is only advisory; even if we
5308 				 * acquired it, someone could race in and lock
5309 				 * the page after we unlock and before the
5310 				 * PUTPAGE, and then VOP_PUTPAGE does nothing.
5311 				 */
5312 				if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
5313 					/*
5314 					 * Hold the vnode before releasing
5315 					 * the page lock to prevent it from
5316 					 * being freed and re-used by some
5317 					 * other thread.
5318 					 */
5319 					VN_HOLD(fvp);
5320 					page_unlock(pp);
5321 					/*
5322 					 * We should build a page list
5323 					 * to kluster putpages XXX
5324 					 */
5325 					(void) VOP_PUTPAGE(fvp,
5326 					    (offset_t)fpgoff, PAGESIZE,
5327 					    (B_DONTNEED|B_FREE|B_ASYNC),
5328 					    svd->cred, NULL);
5329 					VN_RELE(fvp);
5330 				} else {
5331 					/*
5332 					 * XXX - Should the loop terminate if
5333 					 * the page is `locked'?
5334 					 */
5335 					page_unlock(pp);
5336 				}
5337 				--vpp;
5338 				--fanon_index;
5339 				pgoff -= PAGESIZE;
5340 			}
5341 		}
5342 	}
5343 
5344 	plp = pl;
5345 	*plp = NULL;
5346 	pl_alloc_sz = 0;
5347 
5348 	/*
5349 	 * See if we need to call VOP_GETPAGE for
5350 	 * *any* of the range being faulted on.
5351 	 * We can skip all of this work if there
5352 	 * was no original vnode.
5353 	 */
5354 	if (svd->vp != NULL) {
5355 		u_offset_t vp_off;
5356 		size_t vp_len;
5357 		struct anon *ap;
5358 		vnode_t *vp;
5359 
5360 		vp_off = off;
5361 		vp_len = len;
5362 
5363 		if (amp == NULL)
5364 			dogetpage = 1;
5365 		else {
5366 			/*
5367 			 * Only acquire the reader lock to prevent amp->ahp
5368 			 * from being changed.  It's ok to miss pages, hence
5369 			 * we don't call anon_array_enter().
5370 			 */
5371 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5372 			ap = anon_get_ptr(amp->ahp, anon_index);
5373 
5374 			if (len <= PAGESIZE)
5375 				/* inline non_anon() */
5376 				dogetpage = (ap == NULL);
5377 			else
5378 				dogetpage = non_anon(amp->ahp, anon_index,
5379 				    &vp_off, &vp_len);
5380 			ANON_LOCK_EXIT(&amp->a_rwlock);
5381 		}
5382 
5383 		if (dogetpage) {
5384 			enum seg_rw arw;
5385 			struct as *as = seg->s_as;
5386 
5387 			if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) {
5388 				/*
5389 				 * Page list won't fit in local array,
5390 				 * allocate one of the needed size.
5391 				 */
5392 				pl_alloc_sz =
5393 				    (btop(len) + 1) * sizeof (page_t *);
5394 				plp = kmem_alloc(pl_alloc_sz, KM_SLEEP);
5395 				plp[0] = NULL;
5396 				plsz = len;
5397 			} else if (rw == S_WRITE && svd->type == MAP_PRIVATE ||
5398 			    svd->tr_state == SEGVN_TR_ON || rw == S_OTHER ||
5399 			    (((size_t)(addr + PAGESIZE) <
5400 			    (size_t)(seg->s_base + seg->s_size)) &&
5401 			    hat_probe(as->a_hat, addr + PAGESIZE))) {
5402 				/*
5403 				 * Ask VOP_GETPAGE to return the exact number
5404 				 * of pages if
5405 				 * (a) this is a COW fault, or
5406 				 * (b) this is a software fault, or
5407 				 * (c) next page is already mapped.
5408 				 */
5409 				plsz = len;
5410 			} else {
5411 				/*
5412 				 * Ask VOP_GETPAGE to return adjacent pages
5413 				 * within the segment.
5414 				 */
5415 				plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t)
5416 				    ((seg->s_base + seg->s_size) - addr));
5417 				ASSERT((addr + plsz) <=
5418 				    (seg->s_base + seg->s_size));
5419 			}
5420 
5421 			/*
5422 			 * Need to get some non-anonymous pages.
5423 			 * We make only a single call to VOP_GETPAGE to
5424 			 * avoid certain deadlock conditions that can arise
5425 			 * while locking.  In this case non_anon() should
5426 			 * have picked up the smallest range that includes
5427 			 * all the non-anonymous pages in the requested
5428 			 * range.  We have to be careful about which rw
5429 			 * flag to pass in because on a private mapping
5430 			 * the underlying object is never allowed to be
5431 			 * written.
5432 			 */
5433 			if (rw == S_WRITE && svd->type == MAP_PRIVATE) {
5434 				arw = S_READ;
5435 			} else {
5436 				arw = rw;
5437 			}
5438 			vp = svd->vp;
5439 			TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
5440 			    "segvn_getpage:seg %p addr %p vp %p",
5441 			    seg, addr, vp);
5442 			err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len,
5443 			    &vpprot, plp, plsz, seg, addr + (vp_off - off), arw,
5444 			    svd->cred, NULL);
5445 			if (err) {
5446 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5447 				segvn_pagelist_rele(plp);
5448 				if (pl_alloc_sz)
5449 					kmem_free(plp, pl_alloc_sz);
5450 				return (FC_MAKE_ERR(err));
5451 			}
5452 			if (svd->type == MAP_PRIVATE)
5453 				vpprot &= ~PROT_WRITE;
5454 		}
5455 	}
5456 
5457 	/*
5458 	 * N.B. at this time the plp array has all the needed non-anon
5459 	 * pages in addition to (possibly) having some adjacent pages.
5460 	 */
5461 
5462 	/*
5463 	 * Always acquire the anon_array_lock to prevent
5464 	 * 2 threads from allocating separate anon slots for
5465 	 * the same "addr".
5466 	 *
5467 	 * If this is a copy-on-write fault and we don't already
5468 	 * have the anon_array_lock, acquire it to prevent the
5469 	 * fault routine from handling multiple copy-on-write faults
5470 	 * on the same "addr" in the same address space.
5471 	 *
5472 	 * Only one thread should deal with the fault since after
5473 	 * it is handled, the other threads can acquire a translation
5474 	 * to the newly created private page.  This prevents two or
5475 	 * more threads from creating different private pages for the
5476 	 * same fault.
5477 	 *
5478 	 * We grab "serialization" lock here if this is a MAP_PRIVATE segment
5479 	 * to prevent deadlock between this thread and another thread
5480 	 * which has soft-locked this page and wants to acquire serial_lock.
5481 	 * ( bug 4026339 )
5482 	 *
5483 	 * The fix for bug 4026339 becomes unnecessary when using the
5484 	 * locking scheme with a per-amp rwlock and a global set of hash
5485 	 * locks, anon_array_lock.  If we steal a vnode page when low
5486 	 * on memory and upgrade the page lock through page_rename,
5487 	 * then the page is PAGE_HANDLED and nothing needs to be done
5488 	 * for this page after returning from segvn_faultpage.
5489 	 *
5490 	 * But really, the page lock should be downgraded after
5491 	 * the stolen page is page_rename'd.
5492 	 */
5493 
5494 	if (amp != NULL)
5495 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5496 
5497 	/*
5498 	 * Ok, now loop over the address range and handle faults
5499 	 */
5500 	for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
5501 		err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
5502 		    type, rw, brkcow, a == addr);
5503 		if (err) {
5504 			if (amp != NULL)
5505 				ANON_LOCK_EXIT(&amp->a_rwlock);
5506 			if (type == F_SOFTLOCK && a > addr) {
5507 				segvn_softunlock(seg, addr, (a - addr),
5508 				    S_OTHER);
5509 			}
5510 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5511 			segvn_pagelist_rele(plp);
5512 			if (pl_alloc_sz)
5513 				kmem_free(plp, pl_alloc_sz);
5514 			return (err);
5515 		}
5516 		if (vpage) {
5517 			vpage++;
5518 		} else if (svd->vpage) {
5519 			page = seg_page(seg, addr);
5520 			vpage = &svd->vpage[++page];
5521 		}
5522 	}
5523 
5524 	/* Didn't get pages from the underlying fs so we're done */
5525 	if (!dogetpage)
5526 		goto done;
5527 
5528 	/*
5529 	 * Now handle any other pages in the list returned.
5530 	 * If the page can be used, load up the translations now.
5531 	 * Note that the for loop will only be entered if "plp"
5532 	 * is pointing to a non-NULL page pointer which means that
5533 	 * VOP_GETPAGE() was called and vpprot has been initialized.
5534 	 */
5535 	if (svd->pageprot == 0)
5536 		prot = svd->prot & vpprot;
5537 
5538 
5539 	/*
5540 	 * Large Files: diff must be an unsigned value because we have
5541 	 * supported > 2GB segment sizes since 2.5.1, and when a large
5542 	 * file of size > 2GB gets mapped into the address space the
5543 	 * diff value can exceed 2GB.
5544 	 */
5545 
5546 	for (ppp = plp; (pp = *ppp) != NULL; ppp++) {
5547 		size_t diff;
5548 		struct anon *ap;
5549 		int anon_index;
5550 		anon_sync_obj_t cookie;
5551 		int hat_flag = HAT_LOAD_ADV;
5552 
5553 		if (svd->flags & MAP_TEXT) {
5554 			hat_flag |= HAT_LOAD_TEXT;
5555 		}
5556 
5557 		if (pp == PAGE_HANDLED)
5558 			continue;
5559 
5560 		if (svd->tr_state != SEGVN_TR_ON &&
5561 		    pp->p_offset >=  svd->offset &&
5562 		    pp->p_offset < svd->offset + seg->s_size) {
5563 
5564 			diff = pp->p_offset - svd->offset;
5565 
5566 			/*
5567 			 * Large Files: the following assertion validates
5568 			 * the offset arithmetic above.
5569 			 */
5570 			ASSERT(svd->vp == pp->p_vnode);
5571 
5572 			page = btop(diff);
5573 			if (svd->pageprot)
5574 				prot = VPP_PROT(&svd->vpage[page]) & vpprot;
5575 
5576 			/*
5577 			 * Prevent other threads in the address space from
5578 			 * creating private pages (i.e., allocating anon slots)
5579 			 * while we are in the process of loading translations
5580 			 * to additional pages returned by the underlying
5581 			 * object.
5582 			 */
5583 			if (amp != NULL) {
5584 				anon_index = svd->anon_index + page;
5585 				anon_array_enter(amp, anon_index, &cookie);
5586 				ap = anon_get_ptr(amp->ahp, anon_index);
5587 			}
5588 			if ((amp == NULL) || (ap == NULL)) {
5589 				if (IS_VMODSORT(pp->p_vnode) ||
5590 				    enable_mbit_wa) {
5591 					if (rw == S_WRITE)
5592 						hat_setmod(pp);
5593 					else if (rw != S_OTHER &&
5594 					    !hat_ismod(pp))
5595 						prot &= ~PROT_WRITE;
5596 				}
5597 				/*
5598 				 * Skip mapping read ahead pages marked
5599 				 * for migration, so they will get migrated
5600 				 * properly on fault
5601 				 */
5602 				ASSERT(amp == NULL ||
5603 				    svd->rcookie == HAT_INVALID_REGION_COOKIE);
5604 				if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) {
5605 					hat_memload_region(hat,
5606 					    seg->s_base + diff,
5607 					    pp, prot, hat_flag,
5608 					    svd->rcookie);
5609 				}
5610 			}
5611 			if (amp != NULL)
5612 				anon_array_exit(&cookie);
5613 		}
5614 		page_unlock(pp);
5615 	}
5616 done:
5617 	if (amp != NULL)
5618 		ANON_LOCK_EXIT(&amp->a_rwlock);
5619 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5620 	if (pl_alloc_sz)
5621 		kmem_free(plp, pl_alloc_sz);
5622 	return (0);
5623 }
5624 
5625 /*
5626  * This routine is used to start I/O on pages asynchronously.  XXX it will
5627  * only create PAGESIZE pages. At fault time they will be relocated into
5628  * larger pages.
5629  */
5630 static faultcode_t
5631 segvn_faulta(struct seg *seg, caddr_t addr)
5632 {
5633 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
5634 	int err;
5635 	struct anon_map *amp;
5636 	vnode_t *vp;
5637 
5638 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5639 
5640 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
5641 	if ((amp = svd->amp) != NULL) {
5642 		struct anon *ap;
5643 
5644 		/*
5645 		 * Reader lock to prevent amp->ahp from being changed.
5646 		 * This is advisory; it's ok to miss a page, so we
5647 		 * don't bother calling anon_array_enter().
5648 		 */
5649 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5650 		if ((ap = anon_get_ptr(amp->ahp,
5651 		    svd->anon_index + seg_page(seg, addr))) != NULL) {
5652 
5653 			err = anon_getpage(&ap, NULL, NULL,
5654 			    0, seg, addr, S_READ, svd->cred);
5655 
5656 			ANON_LOCK_EXIT(&amp->a_rwlock);
5657 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5658 			if (err)
5659 				return (FC_MAKE_ERR(err));
5660 			return (0);
5661 		}
5662 		ANON_LOCK_EXIT(&amp->a_rwlock);
5663 	}
5664 
5665 	if (svd->vp == NULL) {
5666 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5667 		return (0);			/* zfod page - do nothing now */
5668 	}
5669 
5670 	vp = svd->vp;
5671 	TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
5672 	    "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp);
5673 	err = VOP_GETPAGE(vp,
5674 	    (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)),
5675 	    PAGESIZE, NULL, NULL, 0, seg, addr,
5676 	    S_OTHER, svd->cred, NULL);
5677 
5678 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5679 	if (err)
5680 		return (FC_MAKE_ERR(err));
5681 	return (0);
5682 }
5683 
5684 static int
5685 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
5686 {
5687 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
5688 	struct vpage *svp, *evp;
5689 	struct vnode *vp;
5690 	size_t pgsz;
5691 	pgcnt_t pgcnt;
5692 	anon_sync_obj_t cookie;
5693 	int unload_done = 0;
5694 
5695 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5696 
5697 	if ((svd->maxprot & prot) != prot)
5698 		return (EACCES);			/* violated maxprot */
5699 
5700 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
5701 
5702 	/* return if prot is the same */
5703 	if (!svd->pageprot && svd->prot == prot) {
5704 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5705 		return (0);
5706 	}
5707 
5708 	/*
5709 	 * Since we change protections we first have to flush the cache.
5710 	 * This makes sure all the pagelock calls have to recheck
5711 	 * protections.
5712 	 */
5713 	if (svd->softlockcnt > 0) {
5714 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
5715 		/*
5716 		 * Since we hold the segvn writer's lock, nobody can fill
5717 		 * the cache with entries belonging to this seg during
5718 		 * the purge. The flush either succeeds or we still have
5719 		 * pending I/Os.
5720 		 */
5721 		segvn_purge(seg);
5722 		if (svd->softlockcnt > 0) {
5723 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5724 			return (EAGAIN);
5725 		}
5726 	}
5727 
5728 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
5729 		ASSERT(svd->amp == NULL);
5730 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
5731 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
5732 		    HAT_REGION_TEXT);
5733 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
5734 		unload_done = 1;
5735 	} else if (svd->tr_state == SEGVN_TR_INIT) {
5736 		svd->tr_state = SEGVN_TR_OFF;
5737 	} else if (svd->tr_state == SEGVN_TR_ON) {
5738 		ASSERT(svd->amp != NULL);
5739 		segvn_textunrepl(seg, 0);
5740 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
5741 		unload_done = 1;
5742 	}
5743 
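	/*
	 * Write access is being enabled on a shared mapping of a vnode
	 * that is flagged VVMEXEC (mapped for execute); invalidate
	 * segvn's text replication cache for that vnode.
	 */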
5744 	if ((prot & PROT_WRITE) && svd->type == MAP_SHARED &&
5745 	    svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) {
5746 		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
5747 		segvn_inval_trcache(svd->vp);
5748 	}
5749 	if (seg->s_szc != 0) {
5750 		int err;
5751 		pgsz = page_get_pagesize(seg->s_szc);
5752 		pgcnt = pgsz >> PAGESHIFT;
5753 		ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
5754 		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
5755 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5756 			ASSERT(seg->s_base != addr || seg->s_size != len);
5757 			/*
5758 			 * If we are holding the as lock as a reader then
5759 			 * we need to return IE_RETRY and let the as
5760 			 * layer drop and re-acquire the lock as a writer.
5761 			 */
5762 			if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock))
5763 				return (IE_RETRY);
5764 			VM_STAT_ADD(segvnvmstats.demoterange[1]);
5765 			if (svd->type == MAP_PRIVATE || svd->vp != NULL) {
5766 				err = segvn_demote_range(seg, addr, len,
5767 				    SDR_END, 0);
5768 			} else {
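				/*
				 * MAP_SHARED anon segment: pass the set of
				 * usable page sizes so the demoted ends can
				 * keep a smaller large page size where the
				 * alignment still allows it.
				 */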
5769 				uint_t szcvec = map_pgszcvec(seg->s_base,
5770 				    pgsz, (uintptr_t)seg->s_base,
5771 				    (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0);
5772 				err = segvn_demote_range(seg, addr, len,
5773 				    SDR_END, szcvec);
5774 			}
5775 			if (err == 0)
5776 				return (IE_RETRY);
5777 			if (err == ENOMEM)
5778 				return (IE_NOMEM);
5779 			return (err);
5780 		}
5781 	}
5782 
5783 
5784 	/*
5785 	 * If it's a private mapping and we're making it writable
5786 	 * and no swap space has been reserved, we have to reserve
5787 	 * it all now.  If it's a private mapping to a file (i.e., vp != NULL)
5788 	 * and we're removing write permission on the entire segment and
5789 	 * we haven't modified any pages, we can release the swap space.
5790 	 */
5791 	if (svd->type == MAP_PRIVATE) {
5792 		if (prot & PROT_WRITE) {
5793 			size_t sz;
5794 			if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) {
5795 				if (anon_resv_zone(seg->s_size,
5796 				    seg->s_as->a_proc->p_zone) == 0) {
5797 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5798 					return (IE_NOMEM);
5799 				}
5800 				sz = svd->swresv = seg->s_size;
5801 				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
5802 				    "anon proc:%p %lu %u",
5803 				    seg, sz, 1);
5804 			}
5805 		} else {
5806 			/*
5807 			 * Swap space is released only if this segment
5808 			 * does not map anonymous memory, since read faults
5809 			 * on such segments still need an anon slot to read
5810 			 * in the data.
5811 			 */
5812 			if (svd->swresv != 0 && svd->vp != NULL &&
5813 			    svd->amp == NULL && addr == seg->s_base &&
5814 			    len == seg->s_size && svd->pageprot == 0) {
5815 				anon_unresv_zone(svd->swresv,
5816 				    seg->s_as->a_proc->p_zone);
5817 				svd->swresv = 0;
5818 				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
5819 				    "anon proc:%p %lu %u", seg, 0, 0);
5820 			}
5821 		}
5822 	}
5823 
5824 	if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) {
5825 		if (svd->prot == prot) {
5826 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5827 			return (0);			/* all done */
5828 		}
5829 		svd->prot = (uchar_t)prot;
5830 	} else if (svd->type == MAP_PRIVATE) {
5831 		struct anon *ap = NULL;
5832 		page_t *pp;
5833 		u_offset_t offset, off;
5834 		struct anon_map *amp;
5835 		ulong_t anon_idx = 0;
5836 
5837 		/*
5838 		 * Either a vpage structure already exists or the change does
5839 		 * not cover the entire segment.  Establish a vpage structure
5840 		 * if none is there.  Then, for each page in the range,
5841 		 * adjust its individual permissions.  Note that write-
5842 		 * enabling a MAP_PRIVATE page can affect the claims for
5843 		 * locked down memory.  Overcommitting memory terminates
5844 		 * the operation.
5845 		 */
5846 		segvn_vpage(seg);
5847 		svd->pageprot = 1;
5848 		if ((amp = svd->amp) != NULL) {
5849 			anon_idx = svd->anon_index + seg_page(seg, addr);
5850 			ASSERT(seg->s_szc == 0 ||
5851 			    IS_P2ALIGNED(anon_idx, pgcnt));
5852 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5853 		}
5854 
5855 		offset = svd->offset + (uintptr_t)(addr - seg->s_base);
5856 		evp = &svd->vpage[seg_page(seg, addr + len)];
5857 
5858 		/*
5859 		 * See the comment at the beginning of segvn_lockop regarding
5860 		 * the way cowcnts and lckcnts are handled.
5861 		 */
5862 		for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
5863 
5864 			if (seg->s_szc != 0) {
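				/*
				 * Large page segment: lock/cow claims are
				 * adjusted one large page at a time, so
				 * segvn_claim_pages() is only called at
				 * pgcnt aligned anon indices and covers all
				 * constituent pages of that large page.
				 */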
5865 				if (amp != NULL) {
5866 					anon_array_enter(amp, anon_idx,
5867 					    &cookie);
5868 				}
5869 				if (IS_P2ALIGNED(anon_idx, pgcnt) &&
5870 				    !segvn_claim_pages(seg, svp, offset,
5871 				    anon_idx, prot)) {
5872 					if (amp != NULL) {
5873 						anon_array_exit(&cookie);
5874 					}
5875 					break;
5876 				}
5877 				if (amp != NULL) {
5878 					anon_array_exit(&cookie);
5879 				}
5880 				anon_idx++;
5881 			} else {
5882 				if (amp != NULL) {
5883 					anon_array_enter(amp, anon_idx,
5884 					    &cookie);
5885 					ap = anon_get_ptr(amp->ahp, anon_idx++);
5886 				}
5887 
5888 				if (VPP_ISPPLOCK(svp) &&
5889 				    VPP_PROT(svp) != prot) {
5890 
5891 					if (amp == NULL || ap == NULL) {
5892 						vp = svd->vp;
5893 						off = offset;
5894 					} else
5895 						swap_xlate(ap, &vp, &off);
5896 					if (amp != NULL)
5897 						anon_array_exit(&cookie);
5898 
5899 					if ((pp = page_lookup(vp, off,
5900 					    SE_SHARED)) == NULL) {
5901 						panic("segvn_setprot: no page");
5902 						/*NOTREACHED*/
5903 					}
5904 					ASSERT(seg->s_szc == 0);
5905 					if ((VPP_PROT(svp) ^ prot) &
5906 					    PROT_WRITE) {
5907 						if (prot & PROT_WRITE) {
5908 						    if (!page_addclaim(pp)) {
5909 							page_unlock(pp);
5910 							break;
5911 						    }
5912 						} else {
5913 						    if (!page_subclaim(pp)) {
5914 							page_unlock(pp);
5915 							break;
5916 						    }
5917 						}
5918 					}
5919 					page_unlock(pp);
5920 				} else if (amp != NULL)
5921 					anon_array_exit(&cookie);
5922 			}
5923 			VPP_SETPROT(svp, prot);
5924 			offset += PAGESIZE;
5925 		}
5926 		if (amp != NULL)
5927 			ANON_LOCK_EXIT(&amp->a_rwlock);
5928 
5929 		/*
5930 		 * Did we terminate prematurely?  If so, simply unload
5931 		 * the translations to the things we've updated so far.
5932 		 */
5933 		if (svp != evp) {
5934 			if (unload_done) {
5935 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5936 				return (IE_NOMEM);
5937 			}
5938 			len = (svp - &svd->vpage[seg_page(seg, addr)]) *
5939 			    PAGESIZE;
5940 			ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz));
5941 			if (len != 0)
5942 				hat_unload(seg->s_as->a_hat, addr,
5943 				    len, HAT_UNLOAD);
5944 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5945 			return (IE_NOMEM);
5946 		}
5947 	} else {
5948 		segvn_vpage(seg);
5949 		svd->pageprot = 1;
5950 		evp = &svd->vpage[seg_page(seg, addr + len)];
5951 		for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
5952 			VPP_SETPROT(svp, prot);
5953 		}
5954 	}
5955 
5956 	if (unload_done) {
5957 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5958 		return (0);
5959 	}
5960 
5961 	if (((prot & PROT_WRITE) != 0 &&
5962 	    (svd->vp != NULL || svd->type == MAP_PRIVATE)) ||
5963 	    (prot & ~PROT_USER) == PROT_NONE) {
5964 		/*
5965 		 * Either this is private or shared data with write access,
5966 		 * in which case we need to throw out all former translations
5967 		 * so that the right translations get set up on fault and we
5968 		 * don't allow write access to any copy-on-write pages that
5969 		 * might be around, and so that we prevent write access to
5970 		 * pages representing holes in a file; or we don't have
5971 		 * permission to access the memory at all, in which case we
5972 		 * have to unload any current translations that might exist.
5973 		 */
5974 		hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
5975 	} else {
5976 		/*
5977 		 * A shared mapping or a private mapping in which write
5978 		 * protection is going to be denied - just change all the
5979 		 * protections over the range of addresses in question.
5980 		 * segvn does not support any attributes other
5981 		 * than prot, so we can use hat_chgattr.
5982 		 */
5983 		hat_chgattr(seg->s_as->a_hat, addr, len, prot);
5984 	}
5985 
5986 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
5987 
5988 	return (0);
5989 }
5990 
5991 /*
5992  * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize,
5993  * to determine if the seg is capable of mapping the requested szc.
5994  */
5995 static int
5996 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
5997 {
5998 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
5999 	struct segvn_data *nsvd;
6000 	struct anon_map *amp = svd->amp;
6001 	struct seg *nseg;
6002 	caddr_t eaddr = addr + len, a;
6003 	size_t pgsz = page_get_pagesize(szc);
6004 	pgcnt_t pgcnt = page_get_pagecnt(szc);
6005 	int err;
6006 	u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base);
6007 	extern struct vnode kvp;
6008 
6009 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6010 	ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
6011 
6012 	if (seg->s_szc == szc || segvn_lpg_disable != 0) {
6013 		return (0);
6014 	}
6015 
6016 	/*
6017 	 * addr should always be pgsz aligned but eaddr may be misaligned if
6018 	 * it's at the end of the segment.
6019 	 *
6020 	 * XXX we should assert this condition since as_setpagesize() logic
6021 	 * guarantees it.
6022 	 */
6023 	if (!IS_P2ALIGNED(addr, pgsz) ||
6024 	    (!IS_P2ALIGNED(eaddr, pgsz) &&
6025 	    eaddr != seg->s_base + seg->s_size)) {
6026 
6027 		segvn_setpgsz_align_err++;
6028 		return (EINVAL);
6029 	}
6030 
6031 	if (amp != NULL && svd->type == MAP_SHARED) {
6032 		ulong_t an_idx = svd->anon_index + seg_page(seg, addr);
6033 		if (!IS_P2ALIGNED(an_idx, pgcnt)) {
6034 
6035 			segvn_setpgsz_anon_align_err++;
6036 			return (EINVAL);
6037 		}
6038 	}
6039 
6040 	if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas ||
6041 	    szc > segvn_maxpgszc) {
6042 		return (EINVAL);
6043 	}
6044 
6045 	/* paranoid check */
6046 	if (svd->vp != NULL &&
6047 	    (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) {
6048 		return (EINVAL);
6049 	}
6050 
6051 	if (seg->s_szc == 0 && svd->vp != NULL &&
6052 	    map_addr_vacalign_check(addr, off)) {
6053 		return (EINVAL);
6054 	}
6055 
6056 	/*
6057 	 * Check that protections are the same within new page
6058 	 * size boundaries.
6059 	 */
6060 	if (svd->pageprot) {
6061 		for (a = addr; a < eaddr; a += pgsz) {
6062 			if ((a + pgsz) > eaddr) {
6063 				if (!sameprot(seg, a, eaddr - a)) {
6064 					return (EINVAL);
6065 				}
6066 			} else {
6067 				if (!sameprot(seg, a, pgsz)) {
6068 					return (EINVAL);
6069 				}
6070 			}
6071 		}
6072 	}
6073 
6074 	/*
6075 	 * Since we are changing page size we first have to flush
6076 	 * the cache. This makes sure all the pagelock calls have
6077 	 * to recheck protections.
6078 	 */
6079 	if (svd->softlockcnt > 0) {
6080 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
6081 		/*
6082 		 * Since we hold the segvn writer's lock, nobody can fill
6083 		 * the cache with entries belonging to this seg during
6084 		 * the purge. The flush either succeeds or we still have
6085 		 * pending I/Os.
6086 		 */
6087 		segvn_purge(seg);
6088 		if (svd->softlockcnt > 0) {
6089 			return (EAGAIN);
6090 		}
6091 	}
6092 
6093 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
6094 		ASSERT(svd->amp == NULL);
6095 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
6096 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
6097 		    HAT_REGION_TEXT);
6098 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
6099 	} else if (svd->tr_state == SEGVN_TR_INIT) {
6100 		svd->tr_state = SEGVN_TR_OFF;
6101 	} else if (svd->tr_state == SEGVN_TR_ON) {
6102 		ASSERT(svd->amp != NULL);
6103 		segvn_textunrepl(seg, 1);
6104 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
6105 		amp = NULL;
6106 	}
6107 
6108 	/*
6109 	 * Operation for sub range of existing segment.
6110 	 */
6111 	if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) {
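		/*
		 * If the page size is being lowered just demote the
		 * affected range.  Otherwise split the segment so that
		 * [addr, eaddr) stands alone and let the caller retry
		 * the operation on the new segment.
		 */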
6112 		if (szc < seg->s_szc) {
6113 			VM_STAT_ADD(segvnvmstats.demoterange[2]);
6114 			err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0);
6115 			if (err == 0) {
6116 				return (IE_RETRY);
6117 			}
6118 			if (err == ENOMEM) {
6119 				return (IE_NOMEM);
6120 			}
6121 			return (err);
6122 		}
6123 		if (addr != seg->s_base) {
6124 			nseg = segvn_split_seg(seg, addr);
6125 			if (eaddr != (nseg->s_base + nseg->s_size)) {
6126 				/* eaddr is szc aligned */
6127 				(void) segvn_split_seg(nseg, eaddr);
6128 			}
6129 			return (IE_RETRY);
6130 		}
6131 		if (eaddr != (seg->s_base + seg->s_size)) {
6132 			/* eaddr is szc aligned */
6133 			(void) segvn_split_seg(seg, eaddr);
6134 		}
6135 		return (IE_RETRY);
6136 	}
6137 
6138 	/*
6139 	 * Break any low level sharing and reset seg->s_szc to 0.
6140 	 */
6141 	if ((err = segvn_clrszc(seg)) != 0) {
6142 		if (err == ENOMEM) {
6143 			err = IE_NOMEM;
6144 		}
6145 		return (err);
6146 	}
6147 	ASSERT(seg->s_szc == 0);
6148 
6149 	/*
6150 	 * If the end of the current segment is not pgsz aligned
6151 	 * then attempt to concatenate with the next segment.
6152 	 */
6153 	if (!IS_P2ALIGNED(eaddr, pgsz)) {
6154 		nseg = AS_SEGNEXT(seg->s_as, seg);
6155 		if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) {
6156 			return (ENOMEM);
6157 		}
6158 		if (nseg->s_ops != &segvn_ops) {
6159 			return (EINVAL);
6160 		}
6161 		nsvd = (struct segvn_data *)nseg->s_data;
6162 		if (nsvd->softlockcnt > 0) {
6163 			segvn_purge(nseg);
6164 			if (nsvd->softlockcnt > 0) {
6165 				return (EAGAIN);
6166 			}
6167 		}
6168 		err = segvn_clrszc(nseg);
6169 		if (err == ENOMEM) {
6170 			err = IE_NOMEM;
6171 		}
6172 		if (err != 0) {
6173 			return (err);
6174 		}
6175 		ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
6176 		err = segvn_concat(seg, nseg, 1);
6177 		if (err == -1) {
6178 			return (EINVAL);
6179 		}
6180 		if (err == -2) {
6181 			return (IE_NOMEM);
6182 		}
6183 		return (IE_RETRY);
6184 	}
6185 
6186 	/*
6187 	 * May need to re-align anon array to
6188 	 * new szc.
6189 	 */
6190 	if (amp != NULL) {
6191 		if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
6192 			struct anon_hdr *nahp;
6193 
6194 			ASSERT(svd->type == MAP_PRIVATE);
6195 
6196 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6197 			ASSERT(amp->refcnt == 1);
6198 			nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
6199 			if (nahp == NULL) {
6200 				ANON_LOCK_EXIT(&amp->a_rwlock);
6201 				return (IE_NOMEM);
6202 			}
6203 			if (anon_copy_ptr(amp->ahp, svd->anon_index,
6204 			    nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
6205 				anon_release(nahp, btop(amp->size));
6206 				ANON_LOCK_EXIT(&amp->a_rwlock);
6207 				return (IE_NOMEM);
6208 			}
6209 			anon_release(amp->ahp, btop(amp->size));
6210 			amp->ahp = nahp;
6211 			svd->anon_index = 0;
6212 			ANON_LOCK_EXIT(&amp->a_rwlock);
6213 		}
6214 	}
6215 	if (svd->vp != NULL && szc != 0) {
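		/*
		 * A vnode backed segment may only use a larger page size
		 * if the mapping does not extend beyond the end of the
		 * file.  If the segment also has an anon map, fill any
		 * copy-on-write holes (anon_fill_cow_holes()) first.
		 */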
6216 		struct vattr va;
6217 		u_offset_t eoffpage = svd->offset;
6218 		va.va_mask = AT_SIZE;
6219 		eoffpage += seg->s_size;
6220 		eoffpage = btopr(eoffpage);
6221 		if (VOP_GETATTR(svd->vp, &va, 0, svd->cred, NULL) != 0) {
6222 			segvn_setpgsz_getattr_err++;
6223 			return (EINVAL);
6224 		}
6225 		if (btopr(va.va_size) < eoffpage) {
6226 			segvn_setpgsz_eof_err++;
6227 			return (EINVAL);
6228 		}
6229 		if (amp != NULL) {
6230 			/*
6231 			 * anon_fill_cow_holes() may call VOP_GETPAGE().
6232 			 * Don't take the anon map lock here to avoid holding
6233 			 * it across VOP_GETPAGE() calls that may call back
6234 			 * into segvn for klustering checks.  We don't really
6235 			 * need the anon map lock here since it's a private
6236 			 * segment and we hold the as-level lock as writer.
6237 			 */
6238 			if ((err = anon_fill_cow_holes(seg, seg->s_base,
6239 			    amp->ahp, svd->anon_index, svd->vp, svd->offset,
6240 			    seg->s_size, szc, svd->prot, svd->vpage,
6241 			    svd->cred)) != 0) {
6242 				return (EINVAL);
6243 			}
6244 		}
6245 		segvn_setvnode_mpss(svd->vp);
6246 	}
6247 
6248 	if (amp != NULL) {
6249 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6250 		if (svd->type == MAP_PRIVATE) {
6251 			amp->a_szc = szc;
6252 		} else if (szc > amp->a_szc) {
6253 			amp->a_szc = szc;
6254 		}
6255 		ANON_LOCK_EXIT(&amp->a_rwlock);
6256 	}
6257 
6258 	seg->s_szc = szc;
6259 
6260 	return (0);
6261 }
6262 
6263 static int
6264 segvn_clrszc(struct seg *seg)
6265 {
6266 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6267 	struct anon_map *amp = svd->amp;
6268 	size_t pgsz;
6269 	pgcnt_t pages;
6270 	int err = 0;
6271 	caddr_t a = seg->s_base;
6272 	caddr_t ea = a + seg->s_size;
6273 	ulong_t an_idx = svd->anon_index;
6274 	vnode_t *vp = svd->vp;
6275 	struct vpage *vpage = svd->vpage;
6276 	page_t *anon_pl[1 + 1], *pp;
6277 	struct anon *ap, *oldap;
6278 	uint_t prot = svd->prot, vpprot;
6279 	int pageflag = 0;
6280 
6281 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
6282 	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
6283 	ASSERT(svd->softlockcnt == 0);
6284 
6285 	if (vp == NULL && amp == NULL) {
6286 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6287 		seg->s_szc = 0;
6288 		return (0);
6289 	}
6290 
6291 	if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
6292 		ASSERT(svd->amp == NULL);
6293 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
6294 		hat_leave_region(seg->s_as->a_hat, svd->rcookie,
6295 		    HAT_REGION_TEXT);
6296 		svd->rcookie = HAT_INVALID_REGION_COOKIE;
6297 	} else if (svd->tr_state == SEGVN_TR_ON) {
6298 		ASSERT(svd->amp != NULL);
6299 		segvn_textunrepl(seg, 1);
6300 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
6301 		amp = NULL;
6302 	} else {
6303 		if (svd->tr_state != SEGVN_TR_OFF) {
6304 			ASSERT(svd->tr_state == SEGVN_TR_INIT);
6305 			svd->tr_state = SEGVN_TR_OFF;
6306 		}
6307 
6308 		/*
6309 		 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6310 		 * unload argument is 0 when we are freeing the segment
6311 		 * and unload was already done.
6312 		 */
6313 		hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
6314 		    HAT_UNLOAD_UNMAP);
6315 	}
6316 
6317 	if (amp == NULL || svd->type == MAP_SHARED) {
6318 		seg->s_szc = 0;
6319 		return (0);
6320 	}
6321 
6322 	pgsz = page_get_pagesize(seg->s_szc);
6323 	pages = btop(pgsz);
6324 
6325 	/*
6326 	 * XXX anon rwlock is not really needed because this is a
6327 	 * private segment and we are writers.
6328 	 */
6329 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6330 
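	/*
	 * Walk the segment one (old) large page at a time.  Large page
	 * anon mappings are demoted to PAGESIZE pages; otherwise any
	 * anon page that is still shared (an_refcnt > 1) is replaced
	 * with a private copy before a_szc/s_szc are reset to 0.
	 */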
6331 	for (; a < ea; a += pgsz, an_idx += pages) {
6332 		if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
6333 			ASSERT(vpage != NULL || svd->pageprot == 0);
6334 			if (vpage != NULL) {
6335 				ASSERT(sameprot(seg, a, pgsz));
6336 				prot = VPP_PROT(vpage);
6337 				pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0;
6338 			}
6339 			if (seg->s_szc != 0) {
6340 				ASSERT(vp == NULL || anon_pages(amp->ahp,
6341 				    an_idx, pages) == pages);
6342 				if ((err = anon_map_demotepages(amp, an_idx,
6343 				    seg, a, prot, vpage, svd->cred)) != 0) {
6344 					goto out;
6345 				}
6346 			} else {
6347 				if (oldap->an_refcnt == 1) {
6348 					continue;
6349 				}
6350 				if ((err = anon_getpage(&oldap, &vpprot,
6351 				    anon_pl, PAGESIZE, seg, a, S_READ,
6352 				    svd->cred))) {
6353 					goto out;
6354 				}
6355 				if ((pp = anon_private(&ap, seg, a, prot,
6356 				    anon_pl[0], pageflag, svd->cred)) == NULL) {
6357 					err = ENOMEM;
6358 					goto out;
6359 				}
6360 				anon_decref(oldap);
6361 				(void) anon_set_ptr(amp->ahp, an_idx, ap,
6362 				    ANON_SLEEP);
6363 				page_unlock(pp);
6364 			}
6365 		}
6366 		vpage = (vpage == NULL) ? NULL : vpage + pages;
6367 	}
6368 
6369 	amp->a_szc = 0;
6370 	seg->s_szc = 0;
6371 out:
6372 	ANON_LOCK_EXIT(&amp->a_rwlock);
6373 	return (err);
6374 }
6375 
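/*
 * Adjust the lock/cow claims of all locked constituent pages of a large
 * page when the write permission of a MAP_PRIVATE mapping changes.
 * Returns non-zero if the claims were adjusted (or no adjustment was
 * needed) and 0 if a claim failed, i.e. the change would overcommit
 * locked memory.
 */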
6376 static int
6377 segvn_claim_pages(
6378 	struct seg *seg,
6379 	struct vpage *svp,
6380 	u_offset_t off,
6381 	ulong_t anon_idx,
6382 	uint_t prot)
6383 {
6384 	pgcnt_t	pgcnt = page_get_pagecnt(seg->s_szc);
6385 	size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
6386 	page_t	**ppa;
6387 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6388 	struct anon_map *amp = svd->amp;
6389 	struct vpage *evp = svp + pgcnt;
6390 	caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
6391 	    + seg->s_base;
6392 	struct anon *ap;
6393 	struct vnode *vp = svd->vp;
6394 	page_t *pp;
6395 	pgcnt_t pg_idx, i;
6396 	int err = 0;
6397 	anoff_t aoff;
6398 	int anon = (amp != NULL) ? 1 : 0;
6399 
6400 	ASSERT(svd->type == MAP_PRIVATE);
6401 	ASSERT(svd->vpage != NULL);
6402 	ASSERT(seg->s_szc != 0);
6403 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
6404 	ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt));
6405 	ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT));
6406 
6407 	if (VPP_PROT(svp) == prot)
6408 		return (1);
6409 	if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE))
6410 		return (1);
6411 
6412 	ppa = kmem_alloc(ppasize, KM_SLEEP);
6413 	if (anon && vp != NULL) {
6414 		if (anon_get_ptr(amp->ahp, anon_idx) == NULL) {
6415 			anon = 0;
6416 			ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt));
6417 		}
6418 		ASSERT(!anon ||
6419 		    anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt);
6420 	}
6421 
6422 	for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) {
6423 		if (!VPP_ISPPLOCK(svp))
6424 			continue;
6425 		if (anon) {
6426 			ap = anon_get_ptr(amp->ahp, anon_idx);
6427 			if (ap == NULL) {
6428 				panic("segvn_claim_pages: no anon slot");
6429 			}
6430 			swap_xlate(ap, &vp, &aoff);
6431 			off = (u_offset_t)aoff;
6432 		}
6433 		ASSERT(vp != NULL);
6434 		if ((pp = page_lookup(vp,
6435 		    (u_offset_t)off, SE_SHARED)) == NULL) {
6436 			panic("segvn_claim_pages: no page");
6437 		}
6438 		ppa[pg_idx++] = pp;
6439 		off += PAGESIZE;
6440 	}
6441 
6442 	if (ppa[0] == NULL) {
6443 		kmem_free(ppa, ppasize);
6444 		return (1);
6445 	}
6446 
6447 	ASSERT(pg_idx <= pgcnt);
6448 	ppa[pg_idx] = NULL;
6449 
6450 	if (prot & PROT_WRITE)
6451 		err = page_addclaim_pages(ppa);
6452 	else
6453 		err = page_subclaim_pages(ppa);
6454 
6455 	for (i = 0; i < pg_idx; i++) {
6456 		ASSERT(ppa[i] != NULL);
6457 		page_unlock(ppa[i]);
6458 	}
6459 
6460 	kmem_free(ppa, ppasize);
6461 	return (err);
6462 }
6463 
6464 /*
6465  * Returns the right (upper address) segment if a split occurred.
6466  * If the address is equal to the beginning or end of its segment it
6467  * returns the current segment.
6468  */
6469 static struct seg *
6470 segvn_split_seg(struct seg *seg, caddr_t addr)
6471 {
6472 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6473 	struct seg *nseg;
6474 	size_t nsize;
6475 	struct segvn_data *nsvd;
6476 
6477 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6478 	ASSERT(svd->tr_state == SEGVN_TR_OFF);
6479 
6480 	ASSERT(addr >= seg->s_base);
6481 	ASSERT(addr <= seg->s_base + seg->s_size);
6482 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6483 
6484 	if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
6485 		return (seg);
6486 
6487 	nsize = seg->s_base + seg->s_size - addr;
6488 	seg->s_size = addr - seg->s_base;
6489 	nseg = seg_alloc(seg->s_as, addr, nsize);
6490 	ASSERT(nseg != NULL);
6491 	nseg->s_ops = seg->s_ops;
6492 	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
6493 	nseg->s_data = (void *)nsvd;
6494 	nseg->s_szc = seg->s_szc;
6495 	*nsvd = *svd;
6496 	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
6497 	nsvd->seg = nseg;
6498 	rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);
6499 
6500 	if (nsvd->vp != NULL) {
6501 		VN_HOLD(nsvd->vp);
6502 		nsvd->offset = svd->offset +
6503 		    (uintptr_t)(nseg->s_base - seg->s_base);
6504 		if (nsvd->type == MAP_SHARED)
6505 			lgrp_shm_policy_init(NULL, nsvd->vp);
6506 	} else {
6507 		/*
6508 		 * The offset for an anonymous segment has no significance in
6509 		 * terms of an offset into a file. If we were to use the above
6510 		 * calculation instead, the structures read out of
6511 		 * /proc/<pid>/xmap would be more difficult to decipher since
6512 		 * it would be unclear whether two seemingly contiguous
6513 		 * prxmap_t structures represented different segments or a
6514 		 * single segment that had been split up into multiple prxmap_t
6515 		 * structures (e.g. if some part of the segment had not yet
6516 		 * been faulted in).
6517 		 */
6518 		nsvd->offset = 0;
6519 	}
6520 
6521 	ASSERT(svd->softlockcnt == 0);
6522 	crhold(svd->cred);
6523 
6524 	if (svd->vpage != NULL) {
6525 		size_t bytes = vpgtob(seg_pages(seg));
6526 		size_t nbytes = vpgtob(seg_pages(nseg));
6527 		struct vpage *ovpage = svd->vpage;
6528 
6529 		svd->vpage = kmem_alloc(bytes, KM_SLEEP);
6530 		bcopy(ovpage, svd->vpage, bytes);
6531 		nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP);
6532 		bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes);
6533 		kmem_free(ovpage, bytes + nbytes);
6534 	}
6535 	if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
6536 		struct anon_map *oamp = svd->amp, *namp;
6537 		struct anon_hdr *nahp;
6538 
6539 		ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER);
6540 		ASSERT(oamp->refcnt == 1);
6541 		nahp = anon_create(btop(seg->s_size), ANON_SLEEP);
6542 		(void) anon_copy_ptr(oamp->ahp, svd->anon_index,
6543 		    nahp, 0, btop(seg->s_size), ANON_SLEEP);
6544 
6545 		namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
6546 		namp->a_szc = nseg->s_szc;
6547 		(void) anon_copy_ptr(oamp->ahp,
6548 		    svd->anon_index + btop(seg->s_size),
6549 		    namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);
6550 		anon_release(oamp->ahp, btop(oamp->size));
6551 		oamp->ahp = nahp;
6552 		oamp->size = seg->s_size;
6553 		svd->anon_index = 0;
6554 		nsvd->amp = namp;
6555 		nsvd->anon_index = 0;
6556 		ANON_LOCK_EXIT(&oamp->a_rwlock);
6557 	} else if (svd->amp != NULL) {
6558 		pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
6559 		ASSERT(svd->amp == nsvd->amp);
6560 		ASSERT(seg->s_szc <= svd->amp->a_szc);
6561 		nsvd->anon_index = svd->anon_index + seg_pages(seg);
6562 		ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt));
6563 		ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER);
6564 		svd->amp->refcnt++;
6565 		ANON_LOCK_EXIT(&svd->amp->a_rwlock);
6566 	}
6567 
6568 	/*
6569 	 * Split amount of swap reserve
6570 	 */
6571 	if (svd->swresv) {
6572 		/*
6573 		 * For MAP_NORESERVE, only allocate swap reserve for pages
6574 		 * being used.  Other segments get enough to cover the
6575 		 * whole segment.
6576 		 */
6577 		if (svd->flags & MAP_NORESERVE) {
6578 			size_t	oswresv;
6579 
6580 			ASSERT(svd->amp);
6581 			oswresv = svd->swresv;
6582 			svd->swresv = ptob(anon_pages(svd->amp->ahp,
6583 			    svd->anon_index, btop(seg->s_size)));
6584 			nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
6585 			    nsvd->anon_index, btop(nseg->s_size)));
6586 			ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
6587 		} else {
6588 			ASSERT(svd->swresv == seg->s_size + nseg->s_size);
6589 			svd->swresv = seg->s_size;
6590 			nsvd->swresv = nseg->s_size;
6591 		}
6592 	}
6593 
6594 	return (nseg);
6595 }
6596 
6597 /*
6598  * Called on memory operations (unmap, setprot, setpagesize) for a subset
6599  * of a large page segment to demote either the whole memory range
6600  * (SDR_RANGE) or its ends (SDR_END), as specified by addr/len.
6601  *
6602  * Returns 0 on success.  Returns errno, including ENOMEM, on failure.
6603  */
6604 static int
6605 segvn_demote_range(
6606 	struct seg *seg,
6607 	caddr_t addr,
6608 	size_t len,
6609 	int flag,
6610 	uint_t szcvec)
6611 {
6612 	caddr_t eaddr = addr + len;
6613 	caddr_t lpgaddr, lpgeaddr;
6614 	struct seg *nseg;
6615 	struct seg *badseg1 = NULL;
6616 	struct seg *badseg2 = NULL;
6617 	size_t pgsz;
6618 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6619 	int err;
6620 	uint_t szc = seg->s_szc;
6621 	uint_t tszcvec;
6622 
6623 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
6624 	ASSERT(svd->tr_state == SEGVN_TR_OFF);
6625 	ASSERT(szc != 0);
6626 	pgsz = page_get_pagesize(szc);
6627 	ASSERT(seg->s_base != addr || seg->s_size != len);
6628 	ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
6629 	ASSERT(svd->softlockcnt == 0);
6630 	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6631 	ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED));
6632 
6633 	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
6634 	ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr);
6635 	if (flag == SDR_RANGE) {
6636 		/* demote entire range */
6637 		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
6638 		(void) segvn_split_seg(nseg, lpgeaddr);
6639 		ASSERT(badseg1->s_base == lpgaddr);
6640 		ASSERT(badseg1->s_size == lpgeaddr - lpgaddr);
6641 	} else if (addr != lpgaddr) {
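		/*
		 * Only the ends need demoting.  badseg1 covers the large
		 * page at the low end (or two adjacent large pages when
		 * eaddr also falls into the next one); if the high end
		 * lies in a separate large page it becomes badseg2.
		 */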
6642 		ASSERT(flag == SDR_END);
6643 		badseg1 = nseg = segvn_split_seg(seg, lpgaddr);
6644 		if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz &&
6645 		    eaddr < lpgaddr + 2 * pgsz) {
6646 			(void) segvn_split_seg(nseg, lpgeaddr);
6647 			ASSERT(badseg1->s_base == lpgaddr);
6648 			ASSERT(badseg1->s_size == 2 * pgsz);
6649 		} else {
6650 			nseg = segvn_split_seg(nseg, lpgaddr + pgsz);
6651 			ASSERT(badseg1->s_base == lpgaddr);
6652 			ASSERT(badseg1->s_size == pgsz);
6653 			if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) {
6654 				ASSERT(lpgeaddr - lpgaddr > 2 * pgsz);
6655 				nseg = segvn_split_seg(nseg, lpgeaddr - pgsz);
6656 				badseg2 = nseg;
6657 				(void) segvn_split_seg(nseg, lpgeaddr);
6658 				ASSERT(badseg2->s_base == lpgeaddr - pgsz);
6659 				ASSERT(badseg2->s_size == pgsz);
6660 			}
6661 		}
6662 	} else {
6663 		ASSERT(flag == SDR_END);
6664 		ASSERT(eaddr < lpgeaddr);
6665 		badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz);
6666 		(void) segvn_split_seg(nseg, lpgeaddr);
6667 		ASSERT(badseg1->s_base == lpgeaddr - pgsz);
6668 		ASSERT(badseg1->s_size == pgsz);
6669 	}
6670 
6671 	ASSERT(badseg1 != NULL);
6672 	ASSERT(badseg1->s_szc == szc);
6673 	ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz ||
6674 	    badseg1->s_size == 2 * pgsz);
6675 	ASSERT(sameprot(badseg1, badseg1->s_base, pgsz));
6676 	ASSERT(badseg1->s_size == pgsz ||
6677 	    sameprot(badseg1, badseg1->s_base + pgsz, pgsz));
6678 	if (err = segvn_clrszc(badseg1)) {
6679 		return (err);
6680 	}
6681 	ASSERT(badseg1->s_szc == 0);
6682 
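	/*
	 * For MAP_SHARED segments szcvec lists the other usable page
	 * sizes.  If a smaller large page size still fits badseg1,
	 * promote it back to that size and recursively demote only the
	 * part of [addr, eaddr) that is misaligned with respect to it.
	 */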
6683 	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
6684 		uint_t tszc = highbit(tszcvec) - 1;
6685 		caddr_t ta = MAX(addr, badseg1->s_base);
6686 		caddr_t te;
6687 		size_t tpgsz = page_get_pagesize(tszc);
6688 
6689 		ASSERT(svd->type == MAP_SHARED);
6690 		ASSERT(flag == SDR_END);
6691 		ASSERT(tszc < szc && tszc > 0);
6692 
6693 		if (eaddr > badseg1->s_base + badseg1->s_size) {
6694 			te = badseg1->s_base + badseg1->s_size;
6695 		} else {
6696 			te = eaddr;
6697 		}
6698 
6699 		ASSERT(ta <= te);
6700 		badseg1->s_szc = tszc;
6701 		if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) {
6702 			if (badseg2 != NULL) {
6703 				err = segvn_demote_range(badseg1, ta, te - ta,
6704 				    SDR_END, tszcvec);
6705 				if (err != 0) {
6706 					return (err);
6707 				}
6708 			} else {
6709 				return (segvn_demote_range(badseg1, ta,
6710 				    te - ta, SDR_END, tszcvec));
6711 			}
6712 		}
6713 	}
6714 
6715 	if (badseg2 == NULL)
6716 		return (0);
6717 	ASSERT(badseg2->s_szc == szc);
6718 	ASSERT(badseg2->s_size == pgsz);
6719 	ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size));
6720 	if (err = segvn_clrszc(badseg2)) {
6721 		return (err);
6722 	}
6723 	ASSERT(badseg2->s_szc == 0);
6724 
6725 	if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
6726 		uint_t tszc = highbit(tszcvec) - 1;
6727 		size_t tpgsz = page_get_pagesize(tszc);
6728 
6729 		ASSERT(svd->type == MAP_SHARED);
6730 		ASSERT(flag == SDR_END);
6731 		ASSERT(tszc < szc && tszc > 0);
6732 		ASSERT(badseg2->s_base > addr);
6733 		ASSERT(eaddr > badseg2->s_base);
6734 		ASSERT(eaddr < badseg2->s_base + badseg2->s_size);
6735 
6736 		badseg2->s_szc = tszc;
6737 		if (!IS_P2ALIGNED(eaddr, tpgsz)) {
6738 			return (segvn_demote_range(badseg2, badseg2->s_base,
6739 			    eaddr - badseg2->s_base, SDR_END, tszcvec));
6740 		}
6741 	}
6742 
6743 	return (0);
6744 }
6745 
6746 static int
6747 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
6748 {
6749 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6750 	struct vpage *vp, *evp;
6751 
6752 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6753 
6754 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6755 	/*
6756 	 * If segment protection can be used, simply check against them.
6757 	 */
6758 	if (svd->pageprot == 0) {
6759 		int err;
6760 
6761 		err = ((svd->prot & prot) != prot) ? EACCES : 0;
6762 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6763 		return (err);
6764 	}
6765 
6766 	/*
6767 	 * Have to check down to the vpage level.
6768 	 */
6769 	evp = &svd->vpage[seg_page(seg, addr + len)];
6770 	for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) {
6771 		if ((VPP_PROT(vp) & prot) != prot) {
6772 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6773 			return (EACCES);
6774 		}
6775 	}
6776 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6777 	return (0);
6778 }
6779 
6780 static int
6781 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
6782 {
6783 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6784 	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
6785 
6786 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6787 
6788 	if (pgno != 0) {
6789 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6790 		if (svd->pageprot == 0) {
6791 			do {
6792 				protv[--pgno] = svd->prot;
6793 			} while (pgno != 0);
6794 		} else {
6795 			size_t pgoff = seg_page(seg, addr);
6796 
6797 			do {
6798 				pgno--;
6799 				protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]);
6800 			} while (pgno != 0);
6801 		}
6802 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
6803 	}
6804 	return (0);
6805 }
6806 
6807 static u_offset_t
6808 segvn_getoffset(struct seg *seg, caddr_t addr)
6809 {
6810 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6811 
6812 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6813 
6814 	return (svd->offset + (uintptr_t)(addr - seg->s_base));
6815 }
6816 
6817 /*ARGSUSED*/
6818 static int
6819 segvn_gettype(struct seg *seg, caddr_t addr)
6820 {
6821 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6822 
6823 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6824 
6825 	return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT |
6826 	    MAP_INITDATA)));
6827 }
6828 
6829 /*ARGSUSED*/
6830 static int
6831 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
6832 {
6833 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6834 
6835 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6836 
6837 	*vpp = svd->vp;
6838 	return (0);
6839 }
6840 
6841 /*
6842  * Check to see if it makes sense to do kluster/read ahead to
6843  * addr + delta relative to the mapping at addr.  We assume here
6844  * that delta is a signed PAGESIZE'd multiple (which can be negative).
6845  *
6846  * For segvn, we currently "approve" of the action if we are
6847  * still in the segment and it maps from the same vp/off,
6848  * or if the advice stored in segvn_data or vpages allows it.  Klustering
6849  * is disallowed if MADV_RANDOM applies, or MADV_SEQUENTIAL with delta < 0.
6850  */
6851 static int
6852 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
6853 {
6854 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6855 	struct anon *oap, *ap;
6856 	ssize_t pd;
6857 	size_t page;
6858 	struct vnode *vp1, *vp2;
6859 	u_offset_t off1, off2;
6860 	struct anon_map *amp;
6861 
6862 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6863 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
6864 	    SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
6865 
6866 	if (addr + delta < seg->s_base ||
6867 	    addr + delta >= (seg->s_base + seg->s_size))
6868 		return (-1);		/* exceeded segment bounds */
6869 
6870 	pd = delta / (ssize_t)PAGESIZE;	/* divide to preserve sign bit */
6871 	page = seg_page(seg, addr);
6872 
6873 	/*
6874 	 * Check to see if either of the pages addr or addr + delta
6875 	 * have advice set that prevents klustering (if MADV_RANDOM advice
6876 	 * is set for the entire segment, or MADV_SEQUENTIAL is set and delta
6877 	 * is negative).
6878 	 */
6879 	if (svd->advice == MADV_RANDOM ||
6880 	    svd->advice == MADV_SEQUENTIAL && delta < 0)
6881 		return (-1);
6882 	else if (svd->pageadvice && svd->vpage) {
6883 		struct vpage *bvpp, *evpp;
6884 
6885 		bvpp = &svd->vpage[page];
6886 		evpp = &svd->vpage[page + pd];
6887 		if (VPP_ADVICE(bvpp) == MADV_RANDOM ||
6888 		    VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0)
6889 			return (-1);
6890 		if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) &&
6891 		    VPP_ADVICE(evpp) == MADV_RANDOM)
6892 			return (-1);
6893 	}
6894 
6895 	if (svd->type == MAP_SHARED)
6896 		return (0);		/* shared mapping - all ok */
6897 
6898 	if ((amp = svd->amp) == NULL)
6899 		return (0);		/* off original vnode */
6900 
6901 	page += svd->anon_index;
6902 
6903 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
6904 
6905 	oap = anon_get_ptr(amp->ahp, page);
6906 	ap = anon_get_ptr(amp->ahp, page + pd);
6907 
6908 	ANON_LOCK_EXIT(&amp->a_rwlock);
6909 
6910 	if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) {
6911 		return (-1);		/* one with and one without an anon */
6912 	}
6913 
6914 	if (oap == NULL) {		/* implies that ap == NULL */
6915 		return (0);		/* off original vnode */
6916 	}
6917 
6918 	/*
6919 	 * Now we know we have two anon pointers - check to
6920 	 * see if they happen to be properly allocated.
6921 	 */
6922 
6923 	/*
6924 	 * XXX We cheat here and don't lock the anon slots. We can't because
6925 	 * we may have been called from the anon layer which might already
6926 	 * have locked them. We are holding a refcnt on the slots so they
6927 	 * can't disappear. The worst that will happen is we'll get the wrong
6928 	 * names (vp, off) for the slots and make a poor klustering decision.
6929 	 */
6930 	swap_xlate(ap, &vp1, &off1);
6931 	swap_xlate(oap, &vp2, &off2);
6932 
6933 
6934 	if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta)
6935 		return (-1);
6936 	return (0);
6937 }
6938 
6939 /*
6940  * Swap the pages of seg out to secondary storage, returning the
6941  * number of bytes of storage freed.
6942  *
6943  * The basic idea is first to unload all translations and then to call
6944  * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the
6945  * swap device.  Pages to which other segments have mappings will remain
6946  * mapped and won't be swapped.  Our caller (as_swapout) has already
6947  * performed the unloading step.
6948  *
6949  * The value returned is intended to correlate well with the process's
6950  * memory requirements.  However, there are some caveats:
6951  * 1)	When given a shared segment as argument, this routine will
6952  *	only succeed in swapping out pages for the last sharer of the
6953  *	segment.  (Previous callers will only have decremented mapping
6954  *	reference counts.)
6955  * 2)	We assume that the hat layer maintains a large enough translation
6956  *	cache to capture process reference patterns.
6957  */
6958 static size_t
6959 segvn_swapout(struct seg *seg)
6960 {
6961 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6962 	struct anon_map *amp;
6963 	pgcnt_t pgcnt = 0;
6964 	pgcnt_t npages;
6965 	pgcnt_t page;
6966 	ulong_t anon_index;
6967 
6968 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
6969 
6970 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
6971 	/*
6972 	 * Find pages unmapped by our caller and force them
6973 	 * out to the virtual swap device.
6974 	 */
6975 	if ((amp = svd->amp) != NULL)
6976 		anon_index = svd->anon_index;
6977 	npages = seg->s_size >> PAGESHIFT;
6978 	for (page = 0; page < npages; page++) {
6979 		page_t *pp;
6980 		struct anon *ap;
6981 		struct vnode *vp;
6982 		u_offset_t off;
6983 		anon_sync_obj_t cookie;
6984 
6985 		/*
6986 		 * Obtain <vp, off> pair for the page, then look it up.
6987 		 *
6988 		 * Note that this code is willing to consider regular
6989 		 * pages as well as anon pages.  Is this appropriate here?
6990 		 */
6991 		ap = NULL;
6992 		if (amp != NULL) {
6993 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
6994 			if (anon_array_try_enter(amp, anon_index + page,
6995 			    &cookie)) {
6996 				ANON_LOCK_EXIT(&amp->a_rwlock);
6997 				continue;
6998 			}
6999 			ap = anon_get_ptr(amp->ahp, anon_index + page);
7000 			if (ap != NULL) {
7001 				swap_xlate(ap, &vp, &off);
7002 			} else {
7003 				vp = svd->vp;
7004 				off = svd->offset + ptob(page);
7005 			}
7006 			anon_array_exit(&cookie);
7007 			ANON_LOCK_EXIT(&amp->a_rwlock);
7008 		} else {
7009 			vp = svd->vp;
7010 			off = svd->offset + ptob(page);
7011 		}
7012 		if (vp == NULL) {		/* untouched zfod page */
7013 			ASSERT(ap == NULL);
7014 			continue;
7015 		}
7016 
7017 		pp = page_lookup_nowait(vp, off, SE_SHARED);
7018 		if (pp == NULL)
7019 			continue;
7020 
7021 
7022 		/*
7023 		 * Examine the page to see whether it can be tossed out,
7024 		 * keeping track of how many we've found.
7025 		 */
7026 		if (!page_tryupgrade(pp)) {
7027 			/*
7028 			 * If the page has an i/o lock and no mappings,
7029 			 * it's very likely that the page is being
7030 			 * written out as a result of klustering.
7031 			 * Assume this is so and take credit for it here.
7032 			 */
7033 			if (!page_io_trylock(pp)) {
7034 				if (!hat_page_is_mapped(pp))
7035 					pgcnt++;
7036 			} else {
7037 				page_io_unlock(pp);
7038 			}
7039 			page_unlock(pp);
7040 			continue;
7041 		}
7042 		ASSERT(!page_iolock_assert(pp));
7043 
7044 
7045 		/*
7046 		 * Skip if page is locked or has mappings.
7047 		 * We don't need the page_struct_lock to look at lckcnt
7048 		 * and cowcnt because the page is exclusively locked.
7049 		 */
7050 		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
7051 		    hat_page_is_mapped(pp)) {
7052 			page_unlock(pp);
7053 			continue;
7054 		}
7055 
7056 		/*
7057 		 * dispose skips large pages so try to demote first.
7058 		 */
7059 		if (pp->p_szc != 0 && !page_try_demote_pages(pp)) {
7060 			page_unlock(pp);
7061 			/*
7062 			 * XXX should skip the remaining page_t's of this
7063 			 * large page.
7064 			 */
7065 			continue;
7066 		}
7067 
7068 		ASSERT(pp->p_szc == 0);
7069 
7070 		/*
7071 		 * No longer mapped -- we can toss it out.  How
7072 		 * we do so depends on whether or not it's dirty.
7073 		 */
7074 		if (hat_ismod(pp) && pp->p_vnode) {
7075 			/*
7076 			 * We must clean the page before it can be
7077 			 * freed.  Setting B_FREE will cause pvn_done
7078 			 * to free the page when the i/o completes.
7079 			 * XXX:	This also causes it to be accounted
7080 			 *	as a pageout instead of a swap: need
7081 			 *	B_SWAPOUT bit to use instead of B_FREE.
7082 			 *
7083 			 * Hold the vnode before releasing the page lock
7084 			 * to prevent it from being freed and re-used by
7085 			 * some other thread.
7086 			 */
7087 			VN_HOLD(vp);
7088 			page_unlock(pp);
7089 
7090 			/*
7091 			 * Queue all i/o requests for the pageout thread
7092 			 * to avoid saturating the pageout devices.
7093 			 */
7094 			if (!queue_io_request(vp, off))
7095 				VN_RELE(vp);
7096 		} else {
7097 			/*
7098 			 * The page was clean, free it.
7099 			 *
7100 			 * XXX:	Can we ever encounter modified pages
7101 			 *	with no associated vnode here?
7102 			 */
7103 			ASSERT(pp->p_vnode != NULL);
7104 			/*LINTED: constant in conditional context*/
7105 			VN_DISPOSE(pp, B_FREE, 0, kcred);
7106 		}
7107 
7108 		/*
7109 		 * Credit now even if i/o is in progress.
7110 		 */
7111 		pgcnt++;
7112 	}
7113 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7114 
7115 	/*
7116 	 * Wakeup pageout to initiate i/o on all queued requests.
7117 	 */
7118 	cv_signal_pageout();
7119 	return (ptob(pgcnt));
7120 }
7121 
7122 /*
7123  * Synchronize primary storage cache with real object in virtual memory.
7124  *
7125  * XXX - Anonymous pages should not be sync'ed out at all.
7126  */
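/*
 * Illustrative entry path (assumed; not shown in this file): a user-level
 * msync(3C) call such as
 *
 *	msync(addr, len, MS_SYNC | MS_INVALIDATE);
 *
 * reaches this routine through memcntl()/as_ctl().  The MS_ASYNC and
 * MS_INVALIDATE flags are translated below into the B_ASYNC and B_INVAL
 * bflags handed to VOP_PUTPAGE().
 */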
7127 static int
7128 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags)
7129 {
7130 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7131 	struct vpage *vpp;
7132 	page_t *pp;
7133 	u_offset_t offset;
7134 	struct vnode *vp;
7135 	u_offset_t off;
7136 	caddr_t eaddr;
7137 	int bflags;
7138 	int err = 0;
7139 	int segtype;
7140 	int pageprot;
7141 	int prot;
7142 	ulong_t anon_index;
7143 	struct anon_map *amp;
7144 	struct anon *ap;
7145 	anon_sync_obj_t cookie;
7146 
7147 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7148 
7149 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7150 
7151 	if (svd->softlockcnt > 0) {
7152 		/*
7153 		 * flush all pages from seg cache
7154 		 * otherwise we may deadlock in swap_putpage
7155 		 * for B_INVAL page (4175402).
7156 		 *
7157 		 * Even if we grab segvn WRITER's lock or segp_slock
7158 		 * here, there might be another thread which could've
7159 		 * successfully performed lookup/insert just before
7160 		 * we acquired the lock here.  So, grabbing either
7161 		 * lock here is not of much use.  Until we devise
7162 		 * a strategy at upper layers to solve the
7163 		 * synchronization issues completely, we expect
7164 		 * applications to handle this appropriately.
7165 		 */
7166 		segvn_purge(seg);
7167 		if (svd->softlockcnt > 0) {
7168 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7169 			return (EAGAIN);
7170 		}
7171 	}
7172 
7173 	vpp = svd->vpage;
7174 	offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7175 	bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) |
7176 	    ((flags & MS_INVALIDATE) ? B_INVAL : 0);
7177 
7178 	if (attr) {
7179 		pageprot = attr & ~(SHARED|PRIVATE);
7180 		segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE;
7181 
7182 		/*
7183 		 * We are done if the segment types don't match
7184 		 * or if we have segment level protections and
7185 		 * they don't match.
7186 		 */
7187 		if (svd->type != segtype) {
7188 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7189 			return (0);
7190 		}
7191 		if (vpp == NULL) {
7192 			if (svd->prot != pageprot) {
7193 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7194 				return (0);
7195 			}
7196 			prot = svd->prot;
7197 		} else
7198 			vpp = &svd->vpage[seg_page(seg, addr)];
7199 
7200 	} else if (svd->vp && svd->amp == NULL &&
7201 	    (flags & MS_INVALIDATE) == 0) {
7202 
7203 		/*
7204 		 * No attributes, no anonymous pages and MS_INVALIDATE flag
7205 		 * is not on, just use one big request.
7206 		 */
7207 		err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len,
7208 		    bflags, svd->cred, NULL);
7209 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7210 		return (err);
7211 	}
7212 
7213 	if ((amp = svd->amp) != NULL)
7214 		anon_index = svd->anon_index + seg_page(seg, addr);
7215 
7216 	for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) {
7217 		ap = NULL;
7218 		if (amp != NULL) {
7219 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7220 			anon_array_enter(amp, anon_index, &cookie);
7221 			ap = anon_get_ptr(amp->ahp, anon_index++);
7222 			if (ap != NULL) {
7223 				swap_xlate(ap, &vp, &off);
7224 			} else {
7225 				vp = svd->vp;
7226 				off = offset;
7227 			}
7228 			anon_array_exit(&cookie);
7229 			ANON_LOCK_EXIT(&amp->a_rwlock);
7230 		} else {
7231 			vp = svd->vp;
7232 			off = offset;
7233 		}
7234 		offset += PAGESIZE;
7235 
7236 		if (vp == NULL)		/* untouched zfod page */
7237 			continue;
7238 
7239 		if (attr) {
7240 			if (vpp) {
7241 				prot = VPP_PROT(vpp);
7242 				vpp++;
7243 			}
7244 			if (prot != pageprot) {
7245 				continue;
7246 			}
7247 		}
7248 
7249 		/*
7250 		 * See if any of these pages are locked --  if so, then we
7251 		 * will have to truncate an invalidate request at the first
7252 		 * locked one. We don't need the page_struct_lock to test
7253 		 * as this is only advisory; even if we acquire it someone
7254 		 * might race in and lock the page after we unlock and before
7255 		 * we do the PUTPAGE, then PUTPAGE simply does nothing.
7256 		 */
7257 		if (flags & MS_INVALIDATE) {
7258 			if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
7259 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
7260 					page_unlock(pp);
7261 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7262 					return (EBUSY);
7263 				}
7264 				if (ap != NULL && pp->p_szc != 0 &&
7265 				    page_tryupgrade(pp)) {
7266 					if (pp->p_lckcnt == 0 &&
7267 					    pp->p_cowcnt == 0) {
7268 						/*
7269 						 * swapfs VN_DISPOSE() won't
7270 						 * invalidate large pages.
7271 						 * Attempt to demote.
7272 						 * XXX can't help it if it
7273 						 * fails. But for swapfs
7274 						 * pages it is no big deal.
7275 						 */
7276 						(void) page_try_demote_pages(
7277 						    pp);
7278 					}
7279 				}
7280 				page_unlock(pp);
7281 			}
7282 		} else if (svd->type == MAP_SHARED && amp != NULL) {
7283 			/*
7284 			 * Avoid writing ISM's large pages out to disk
7285 			 * because segspt_free_pages() relies on the an_pvp of
7286 			 * such pages' anon slots being NULL.
7287 			 */
7288 
7289 			ASSERT(svd->vp == NULL);
7290 			/*
7291 			 * swapfs uses page_lookup_nowait if not freeing or
7292 			 * invalidating and skips a page if
7293 			 * page_lookup_nowait returns NULL.
7294 			 */
7295 			pp = page_lookup_nowait(vp, off, SE_SHARED);
7296 			if (pp == NULL) {
7297 				continue;
7298 			}
7299 			if (pp->p_szc != 0) {
7300 				page_unlock(pp);
7301 				continue;
7302 			}
7303 
7304 			/*
7305 			 * Note ISM pages are created large so (vp, off)'s
7306 			 * page cannot suddenly become large after we unlock
7307 			 * pp.
7308 			 */
7309 			page_unlock(pp);
7310 		}
7311 		/*
7312 		 * XXX - Should ultimately try to kluster
7313 		 * calls to VOP_PUTPAGE() for performance.
7314 		 */
7315 		VN_HOLD(vp);
7316 		err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE,
7317 		    bflags, svd->cred, NULL);
7318 		VN_RELE(vp);
7319 		if (err)
7320 			break;
7321 	}
7322 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7323 	return (err);
7324 }
7325 
7326 /*
7327  * Determine if we have data corresponding to pages in the
7328  * primary storage virtual memory cache (i.e., "in core").
7329  */
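/*
 * Illustrative entry path (assumed; not shown in this file): mincore(3C),
 * e.g.
 *
 *	mincore(addr, len, vec);
 *
 * lands here via the address-space layer.  Each byte written to *vec is a
 * bitwise OR of the SEG_PAGE_* flags gathered below (SEG_PAGE_VNODEBACKED,
 * SEG_PAGE_INCORE, SEG_PAGE_VNODE, SEG_PAGE_ANON, SEG_PAGE_SOFTLOCK,
 * SEG_PAGE_HASCOW, SEG_PAGE_LOCKED).
 */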
7330 static size_t
7331 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
7332 {
7333 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7334 	struct vnode *vp, *avp;
7335 	u_offset_t offset, aoffset;
7336 	size_t p, ep;
7337 	int ret;
7338 	struct vpage *vpp;
7339 	page_t *pp;
7340 	uint_t start;
7341 	struct anon_map *amp;		/* XXX - for locknest */
7342 	struct anon *ap;
7343 	uint_t attr;
7344 	anon_sync_obj_t cookie;
7345 
7346 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7347 
7348 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7349 	if (svd->amp == NULL && svd->vp == NULL) {
7350 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7351 		bzero(vec, btopr(len));
7352 		return (len);	/* no anonymous pages created yet */
7353 	}
7354 
7355 	p = seg_page(seg, addr);
7356 	ep = seg_page(seg, addr + len);
7357 	start = svd->vp ? SEG_PAGE_VNODEBACKED : 0;
7358 
7359 	amp = svd->amp;
7360 	for (; p < ep; p++, addr += PAGESIZE) {
7361 		vpp = (svd->vpage) ? &svd->vpage[p]: NULL;
7362 		ret = start;
7363 		ap = NULL;
7364 		avp = NULL;
7365 		/* Grab the vnode/offset for the anon slot */
7366 		if (amp != NULL) {
7367 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7368 			anon_array_enter(amp, svd->anon_index + p, &cookie);
7369 			ap = anon_get_ptr(amp->ahp, svd->anon_index + p);
7370 			if (ap != NULL) {
7371 				swap_xlate(ap, &avp, &aoffset);
7372 			}
7373 			anon_array_exit(&cookie);
7374 			ANON_LOCK_EXIT(&amp->a_rwlock);
7375 		}
7376 		if ((avp != NULL) && page_exists(avp, aoffset)) {
7377 			/* A page exists for the anon slot */
7378 			ret |= SEG_PAGE_INCORE;
7379 
7380 			/*
7381 			 * If page is mapped and writable
7382 			 */
7383 			attr = (uint_t)0;
7384 			if ((hat_getattr(seg->s_as->a_hat, addr,
7385 			    &attr) != -1) && (attr & PROT_WRITE)) {
7386 				ret |= SEG_PAGE_ANON;
7387 			}
7388 			/*
7389 			 * Don't get page_struct lock for lckcnt and cowcnt,
7390 			 * since this is purely advisory.
7391 			 */
7392 			if ((pp = page_lookup_nowait(avp, aoffset,
7393 			    SE_SHARED)) != NULL) {
7394 				if (pp->p_lckcnt)
7395 					ret |= SEG_PAGE_SOFTLOCK;
7396 				if (pp->p_cowcnt)
7397 					ret |= SEG_PAGE_HASCOW;
7398 				page_unlock(pp);
7399 			}
7400 		}
7401 
7402 		/* Gather vnode statistics */
7403 		vp = svd->vp;
7404 		offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7405 
7406 		if (vp != NULL) {
7407 			/*
7408 			 * Try to obtain a "shared" lock on the page
7409 			 * without blocking.  If this fails, determine
7410 			 * if the page is in memory.
7411 			 */
7412 			pp = page_lookup_nowait(vp, offset, SE_SHARED);
7413 			if ((pp == NULL) && (page_exists(vp, offset))) {
7414 				/* Page is incore, and is named */
7415 				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
7416 			}
7417 			/*
7418 			 * Don't get page_struct lock for lckcnt and cowcnt,
7419 			 * since this is purely advisory.
7420 			 */
7421 			if (pp != NULL) {
7422 				ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE);
7423 				if (pp->p_lckcnt)
7424 					ret |= SEG_PAGE_SOFTLOCK;
7425 				if (pp->p_cowcnt)
7426 					ret |= SEG_PAGE_HASCOW;
7427 				page_unlock(pp);
7428 			}
7429 		}
7430 
7431 		/* Gather virtual page information */
7432 		if (vpp) {
7433 			if (VPP_ISPPLOCK(vpp))
7434 				ret |= SEG_PAGE_LOCKED;
7435 			vpp++;
7436 		}
7437 
7438 		*vec++ = (char)ret;
7439 	}
7440 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7441 	return (len);
7442 }
7443 
7444 /*
7445  * Statement for p_cowcnts/p_lckcnts.
7446  *
7447  * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region
7448  * irrespective of the following factors or anything else:
7449  *
7450  *	(1) anon slots are populated or not
7451  *	(2) cow is broken or not
7452  *	(3) refcnt on ap is 1 or greater than 1
7453  *
7454  * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock
7455  * and munlock.
7456  *
7457  *
7458  * Handling p_cowcnts/p_lckcnts during copy-on-write fault:
7459  *
7460  *	if vpage has PROT_WRITE
7461  *		transfer cowcnt on the oldpage -> cowcnt on the newpage
7462  *	else
7463  *		transfer lckcnt on the oldpage -> lckcnt on the newpage
7464  *
7465  *	During copy-on-write, decrement p_cowcnt on the oldpage and increment
7466  *	p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
7467  *
7468  *	We may also break COW if softlocking on read access in the physio case.
7469  *	In this case, vpage may not have PROT_WRITE. So, we need to decrement
7470  *	p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
7471  *	vpage doesn't have PROT_WRITE.
7472  *
7473  *
7474  * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
7475  *
7476  * 	If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
7477  *	increment p_lckcnt by calling page_subclaim() which takes care of
7478  * 	availrmem accounting and p_lckcnt overflow.
7479  *
7480  *	If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
7481  *	increment p_cowcnt by calling page_addclaim() which takes care of
7482  *	availrmem availability and p_cowcnt overflow.
7483  */
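/*
 * Concrete example of the rules above: mlock() of one MAP_PRIVATE,
 * PROT_WRITE page bumps that page's p_cowcnt, whether or not its anon slot
 * has been populated.  If a copy-on-write fault later replaces the page,
 * the cowcnt moves from the old page to the new one, so a subsequent
 * munlock() finds the count on the page the mapping actually references.
 */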
7484 
7485 /*
7486  * Lock down (or unlock) pages mapped by this segment.
7487  *
7488  * XXX only creates PAGESIZE pages if anon slots are not initialized.
7489  * At fault time they will be relocated into larger pages.
7490  */
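/*
 * Typical entry path (assumed; not shown in this file): mlock(3C) and
 * munlock(3C), e.g.
 *
 *	mlock(addr, len);
 *
 * arrive here as MC_LOCK/MC_UNLOCK operations via memcntl()/as_ctl().
 * Locked bytes are charged against the locked-memory resource control of
 * either the calling process or, for SysV shared memory, the shm's project
 * (see the rctl_incr_locked_mem()/rctl_decr_locked_mem() calls below).
 */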
7491 static int
7492 segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
7493     int attr, int op, ulong_t *lockmap, size_t pos)
7494 {
7495 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7496 	struct vpage *vpp;
7497 	struct vpage *evp;
7498 	page_t *pp;
7499 	u_offset_t offset;
7500 	u_offset_t off;
7501 	int segtype;
7502 	int pageprot;
7503 	int claim;
7504 	struct vnode *vp;
7505 	ulong_t anon_index;
7506 	struct anon_map *amp;
7507 	struct anon *ap;
7508 	struct vattr va;
7509 	anon_sync_obj_t cookie;
7510 	struct kshmid *sp = NULL;
7511 	struct proc	*p = curproc;
7512 	kproject_t	*proj = NULL;
7513 	int chargeproc = 1;
7514 	size_t locked_bytes = 0;
7515 	size_t unlocked_bytes = 0;
7516 	int err = 0;
7517 
7518 	/*
7519 	 * Hold write lock on address space because may split or concatenate
7520 	 * segments
7521 	 */
7522 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7523 
7524 	/*
7525 	 * If this is a shm, use shm's project and zone, else use
7526 	 * project and zone of calling process
7527 	 */
7528 
7529 	/* Determine if this segment backs a sysV shm */
7530 	if (svd->amp != NULL && svd->amp->a_sp != NULL) {
7531 		ASSERT(svd->type == MAP_SHARED);
7532 		ASSERT(svd->tr_state == SEGVN_TR_OFF);
7533 		sp = svd->amp->a_sp;
7534 		proj = sp->shm_perm.ipc_proj;
7535 		chargeproc = 0;
7536 	}
7537 
7538 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
7539 	if (attr) {
7540 		pageprot = attr & ~(SHARED|PRIVATE);
7541 		segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE;
7542 
7543 		/*
7544 		 * We are done if the segment types don't match
7545 		 * or if we have segment level protections and
7546 		 * they don't match.
7547 		 */
7548 		if (svd->type != segtype) {
7549 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7550 			return (0);
7551 		}
7552 		if (svd->pageprot == 0 && svd->prot != pageprot) {
7553 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7554 			return (0);
7555 		}
7556 	}
7557 
7558 	if (op == MC_LOCK) {
7559 		if (svd->tr_state == SEGVN_TR_INIT) {
7560 			svd->tr_state = SEGVN_TR_OFF;
7561 		} else if (svd->tr_state == SEGVN_TR_ON) {
7562 			ASSERT(svd->amp != NULL);
7563 			segvn_textunrepl(seg, 0);
7564 			ASSERT(svd->amp == NULL &&
7565 			    svd->tr_state == SEGVN_TR_OFF);
7566 		}
7567 	}
7568 
7569 	/*
7570 	 * If we're locking, then we must create a vpage structure if
7571 	 * none exists.  If we're unlocking, then check to see if there
7572 	 * is a vpage --  if not, then we could not have locked anything.
7573 	 */
7574 
7575 	if ((vpp = svd->vpage) == NULL) {
7576 		if (op == MC_LOCK)
7577 			segvn_vpage(seg);
7578 		else {
7579 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7580 			return (0);
7581 		}
7582 	}
7583 
7584 	/*
7585 	 * The anonymous data vector (i.e., previously
7586 	 * unreferenced mapping to swap space) can be allocated
7587 	 * by lazily testing for its existence.
7588 	 */
7589 	if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
7590 		ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
7591 		svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
7592 		svd->amp->a_szc = seg->s_szc;
7593 	}
7594 
7595 	if ((amp = svd->amp) != NULL) {
7596 		anon_index = svd->anon_index + seg_page(seg, addr);
7597 	}
7598 
7599 	offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7600 	evp = &svd->vpage[seg_page(seg, addr + len)];
7601 
7602 	if (sp != NULL)
7603 		mutex_enter(&sp->shm_mlock);
7604 
7605 	/* determine number of unlocked bytes in range for lock operation */
7606 	if (op == MC_LOCK) {
7607 
7608 		if (sp == NULL) {
7609 			for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7610 			    vpp++) {
7611 				if (!VPP_ISPPLOCK(vpp))
7612 					unlocked_bytes += PAGESIZE;
7613 			}
7614 		} else {
7615 			ulong_t		i_idx, i_edx;
7616 			anon_sync_obj_t	i_cookie;
7617 			struct anon	*i_ap;
7618 			struct vnode	*i_vp;
7619 			u_offset_t	i_off;
7620 
7621 			/* Only count sysV pages once for locked memory */
7622 			i_edx = svd->anon_index + seg_page(seg, addr + len);
7623 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7624 			for (i_idx = anon_index; i_idx < i_edx; i_idx++) {
7625 				anon_array_enter(amp, i_idx, &i_cookie);
7626 				i_ap = anon_get_ptr(amp->ahp, i_idx);
7627 				if (i_ap == NULL) {
7628 					unlocked_bytes += PAGESIZE;
7629 					anon_array_exit(&i_cookie);
7630 					continue;
7631 				}
7632 				swap_xlate(i_ap, &i_vp, &i_off);
7633 				anon_array_exit(&i_cookie);
7634 				pp = page_lookup(i_vp, i_off, SE_SHARED);
7635 				if (pp == NULL) {
7636 					unlocked_bytes += PAGESIZE;
7637 					continue;
7638 				} else if (pp->p_lckcnt == 0)
7639 					unlocked_bytes += PAGESIZE;
7640 				page_unlock(pp);
7641 			}
7642 			ANON_LOCK_EXIT(&amp->a_rwlock);
7643 		}
7644 
7645 		mutex_enter(&p->p_lock);
7646 		err = rctl_incr_locked_mem(p, proj, unlocked_bytes,
7647 		    chargeproc);
7648 		mutex_exit(&p->p_lock);
7649 
7650 		if (err) {
7651 			if (sp != NULL)
7652 				mutex_exit(&sp->shm_mlock);
7653 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7654 			return (err);
7655 		}
7656 	}
7657 	/*
7658 	 * Loop over all pages in the range.  Process if we're locking and
7659 	 * page has not already been locked in this mapping; or if we're
7660 	 * unlocking and the page has been locked.
7661 	 */
7662 	for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7663 	    vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
7664 		if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
7665 		    ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
7666 		    (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {
7667 
7668 			if (amp != NULL)
7669 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7670 			/*
7671 			 * If this isn't a MAP_NORESERVE segment and
7672 			 * we're locking, allocate anon slots if they
7673 			 * don't exist.  The page is brought in later on.
7674 			 */
7675 			if (op == MC_LOCK && svd->vp == NULL &&
7676 			    ((svd->flags & MAP_NORESERVE) == 0) &&
7677 			    amp != NULL &&
7678 			    ((ap = anon_get_ptr(amp->ahp, anon_index))
7679 			    == NULL)) {
7680 				anon_array_enter(amp, anon_index, &cookie);
7681 
7682 				if ((ap = anon_get_ptr(amp->ahp,
7683 				    anon_index)) == NULL) {
7684 					pp = anon_zero(seg, addr, &ap,
7685 					    svd->cred);
7686 					if (pp == NULL) {
7687 						anon_array_exit(&cookie);
7688 						ANON_LOCK_EXIT(&amp->a_rwlock);
7689 						err = ENOMEM;
7690 						goto out;
7691 					}
7692 					ASSERT(anon_get_ptr(amp->ahp,
7693 					    anon_index) == NULL);
7694 					(void) anon_set_ptr(amp->ahp,
7695 					    anon_index, ap, ANON_SLEEP);
7696 					page_unlock(pp);
7697 				}
7698 				anon_array_exit(&cookie);
7699 			}
7700 
7701 			/*
7702 			 * Get name for page, accounting for
7703 			 * existence of private copy.
7704 			 */
7705 			ap = NULL;
7706 			if (amp != NULL) {
7707 				anon_array_enter(amp, anon_index, &cookie);
7708 				ap = anon_get_ptr(amp->ahp, anon_index);
7709 				if (ap != NULL) {
7710 					swap_xlate(ap, &vp, &off);
7711 				} else {
7712 					if (svd->vp == NULL &&
7713 					    (svd->flags & MAP_NORESERVE)) {
7714 						anon_array_exit(&cookie);
7715 						ANON_LOCK_EXIT(&amp->a_rwlock);
7716 						continue;
7717 					}
7718 					vp = svd->vp;
7719 					off = offset;
7720 				}
7721 				anon_array_exit(&cookie);
7722 				ANON_LOCK_EXIT(&amp->a_rwlock);
7723 			} else {
7724 				vp = svd->vp;
7725 				off = offset;
7726 			}
7727 
7728 			/*
7729 			 * Get page frame.  It's ok if the page is
7730 			 * not available when we're unlocking, as this
7731 			 * may simply mean that a page we locked got
7732 			 * truncated out of existence after we locked it.
7733 			 *
7734 			 * Invoke VOP_GETPAGE() to obtain the page struct
7735 			 * since we may need to read it from disk if its
7736 			 * been paged out.
7737 			 */
7738 			if (op != MC_LOCK)
7739 				pp = page_lookup(vp, off, SE_SHARED);
7740 			else {
7741 				page_t *pl[1 + 1];
7742 				int error;
7743 
7744 				ASSERT(vp != NULL);
7745 
7746 				error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
7747 				    (uint_t *)NULL, pl, PAGESIZE, seg, addr,
7748 				    S_OTHER, svd->cred, NULL);
7749 
7750 				/*
7751 				 * If the error is EDEADLK then we must bounce
7752 				 * up and drop all vm subsystem locks and then
7753 				 * retry the operation later
7754 				 * This behavior is a temporary measure because
7755 				 * ufs/sds logging is badly designed and will
7756 				 * deadlock if we don't allow this bounce to
7757 				 * happen.  The real solution is to re-design
7758 				 * the logging code to work properly.  See bug
7759 				 * 4125102 for details of the problem.
7760 				 */
7761 				if (error == EDEADLK) {
7762 					err = error;
7763 					goto out;
7764 				}
7765 				/*
7766 				 * Quit if we fail to fault in the page.  Treat
7767 				 * the failure as an error, unless the addr
7768 				 * is mapped beyond the end of a file.
7769 				 */
7770 				if (error && svd->vp) {
7771 					va.va_mask = AT_SIZE;
7772 					if (VOP_GETATTR(svd->vp, &va, 0,
7773 					    svd->cred, NULL) != 0) {
7774 						err = EIO;
7775 						goto out;
7776 					}
7777 					if (btopr(va.va_size) >=
7778 					    btopr(off + 1)) {
7779 						err = EIO;
7780 						goto out;
7781 					}
7782 					goto out;
7783 
7784 				} else if (error) {
7785 					err = EIO;
7786 					goto out;
7787 				}
7788 				pp = pl[0];
7789 				ASSERT(pp != NULL);
7790 			}
7791 
7792 			/*
7793 			 * See Statement at the beginning of this routine.
7794 			 *
7795 			 * claim is always set if MAP_PRIVATE and PROT_WRITE
7796 			 * irrespective of following factors:
7797 			 *
7798 			 * (1) anon slots are populated or not
7799 			 * (2) cow is broken or not
7800 			 * (3) refcnt on ap is 1 or greater than 1
7801 			 *
7802 			 * See 4140683 for details
7803 			 */
7804 			claim = ((VPP_PROT(vpp) & PROT_WRITE) &&
7805 			    (svd->type == MAP_PRIVATE));
7806 
7807 			/*
7808 			 * Perform page-level operation appropriate to
7809 			 * operation.  If locking, undo the SOFTLOCK
7810 			 * performed to bring the page into memory
7811 			 * after setting the lock.  If unlocking,
7812 			 * and no page was found, account for the claim
7813 			 * separately.
7814 			 */
7815 			if (op == MC_LOCK) {
7816 				int ret = 1;	/* Assume success */
7817 
7818 				ASSERT(!VPP_ISPPLOCK(vpp));
7819 
7820 				ret = page_pp_lock(pp, claim, 0);
7821 				if (ret == 0) {
7822 					/* locking page failed */
7823 					page_unlock(pp);
7824 					err = EAGAIN;
7825 					goto out;
7826 				}
7827 				VPP_SETPPLOCK(vpp);
7828 				if (sp != NULL) {
7829 					if (pp->p_lckcnt == 1)
7830 						locked_bytes += PAGESIZE;
7831 				} else
7832 					locked_bytes += PAGESIZE;
7833 
7834 				if (lockmap != (ulong_t *)NULL)
7835 					BT_SET(lockmap, pos);
7836 
7837 				page_unlock(pp);
7838 			} else {
7839 				ASSERT(VPP_ISPPLOCK(vpp));
7840 				if (pp != NULL) {
7841 					/* sysV pages should be locked */
7842 					ASSERT(sp == NULL || pp->p_lckcnt > 0);
7843 					page_pp_unlock(pp, claim, 0);
7844 					if (sp != NULL) {
7845 						if (pp->p_lckcnt == 0)
7846 							unlocked_bytes
7847 							    += PAGESIZE;
7848 					} else
7849 						unlocked_bytes += PAGESIZE;
7850 					page_unlock(pp);
7851 				} else {
7852 					ASSERT(sp == NULL);
7853 					unlocked_bytes += PAGESIZE;
7854 				}
7855 				VPP_CLRPPLOCK(vpp);
7856 			}
7857 		}
7858 	}
7859 out:
7860 	if (op == MC_LOCK) {
7861 		/* Credit back bytes that did not get locked */
7862 		if ((unlocked_bytes - locked_bytes) > 0) {
7863 			if (proj == NULL)
7864 				mutex_enter(&p->p_lock);
7865 			rctl_decr_locked_mem(p, proj,
7866 			    (unlocked_bytes - locked_bytes), chargeproc);
7867 			if (proj == NULL)
7868 				mutex_exit(&p->p_lock);
7869 		}
7870 
7871 	} else {
7872 		/* Account bytes that were unlocked */
7873 		if (unlocked_bytes > 0) {
7874 			if (proj == NULL)
7875 				mutex_enter(&p->p_lock);
7876 			rctl_decr_locked_mem(p, proj, unlocked_bytes,
7877 			    chargeproc);
7878 			if (proj == NULL)
7879 				mutex_exit(&p->p_lock);
7880 		}
7881 	}
7882 	if (sp != NULL)
7883 		mutex_exit(&sp->shm_mlock);
7884 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7885 
7886 	return (err);
7887 }
7888 
7889 /*
7890  * Set advice from user for specified pages
7891  * There are 9 types of advice:
7892  *	MADV_NORMAL	- Normal (default) behavior (whatever that is)
7893  *	MADV_RANDOM	- Random page references
7894  *				do not allow readahead or 'klustering'
7895  *	MADV_SEQUENTIAL	- Sequential page references
7896  *				Pages previous to the one currently being
7897  *				accessed (determined by fault) are 'not needed'
7898  *				and are freed immediately
7899  *	MADV_WILLNEED	- Pages are likely to be used (fault ahead in mctl)
7900  *	MADV_DONTNEED	- Pages are not needed (synced out in mctl)
7901  *	MADV_FREE	- Contents can be discarded
7902  *	MADV_ACCESS_DEFAULT- Default access
7903  *	MADV_ACCESS_LWP	- Next LWP will access heavily
7904  *	MADV_ACCESS_MANY- Many LWPs or processes will access heavily
7905  */
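/*
 * Illustrative entry path (assumed; not shown in this file): madvise(3C),
 * e.g.
 *
 *	madvise(addr, len, MADV_SEQUENTIAL);
 *
 * reaches this routine via memcntl()/as_ctl().  Note that the IE_RETRY and
 * IE_REATTACH values returned below are internal codes telling as_ctl() to
 * retry the operation or re-lookup the (possibly split or concatenated)
 * segment; they are not user-visible errno values.
 */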
7906 static int
7907 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
7908 {
7909 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7910 	size_t page;
7911 	int err = 0;
7912 	int already_set;
7913 	struct anon_map *amp;
7914 	ulong_t anon_index;
7915 	struct seg *next;
7916 	lgrp_mem_policy_t policy;
7917 	struct seg *prev;
7918 	struct vnode *vp;
7919 
7920 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7921 
7922 	/*
7923 	 * In case of MADV_FREE, we won't be modifying any segment private
7924 	 * data structures; so, we only need to grab READER's lock
7925 	 */
7926 	if (behav != MADV_FREE) {
7927 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
7928 		if (svd->tr_state != SEGVN_TR_OFF) {
7929 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7930 			return (0);
7931 		}
7932 	} else {
7933 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7934 	}
7935 
7936 	/*
7937 	 * Large pages are assumed to be only turned on when accesses to the
7938 	 * segment's address range have spatial and temporal locality. That
7939 	 * justifies ignoring MADV_SEQUENTIAL for large page segments.
7940 	 * Also, ignore advice affecting lgroup memory allocation
7941 	 * if we don't need to do lgroup optimizations on this system.
7942 	 */
7943 
7944 	if ((behav == MADV_SEQUENTIAL &&
7945 	    (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) ||
7946 	    (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT ||
7947 	    behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) {
7948 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7949 		return (0);
7950 	}
7951 
7952 	if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT ||
7953 	    behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) {
7954 		/*
7955 		 * Since we are going to unload hat mappings
7956 		 * we first have to flush the cache. Otherwise
7957 		 * this might lead to system panic if another
7958 		 * thread is doing physio on the range whose
7959 		 * mappings are unloaded by madvise(3C).
7960 		 */
7961 		if (svd->softlockcnt > 0) {
7962 			/*
7963 			 * Since we do have the segvn writers lock
7964 			 * nobody can fill the cache with entries
7965 			 * belonging to this seg during the purge.
7966 			 * The flush either succeeds or we still
7967 			 * have pending I/Os. In the latter case,
7968 			 * madvise(3C) fails.
7969 			 */
7970 			segvn_purge(seg);
7971 			if (svd->softlockcnt > 0) {
7972 				/*
7973 				 * Since madvise(3C) is advisory and
7974 				 * it's not part of UNIX98, madvise(3C)
7975 				 * failure here doesn't cause any hardship.
7976 				 * Note that we don't block in "as" layer.
7977 				 */
7978 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7979 				return (EAGAIN);
7980 			}
7981 		}
7982 	}
7983 
7984 	amp = svd->amp;
7985 	vp = svd->vp;
7986 	if (behav == MADV_FREE) {
7987 		/*
7988 		 * MADV_FREE is not supported for segments with
7989 		 * underlying object; if anonmap is NULL, anon slots
7990 		 * are not yet populated and there is nothing for
7991 		 * us to do. As MADV_FREE is advisory, we don't
7992 		 * return error in either case.
7993 		 */
7994 		if (vp != NULL || amp == NULL) {
7995 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7996 			return (0);
7997 		}
7998 
7999 		page = seg_page(seg, addr);
8000 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8001 		anon_disclaim(amp, svd->anon_index + page, len);
8002 		ANON_LOCK_EXIT(&amp->a_rwlock);
8003 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8004 		return (0);
8005 	}
8006 
8007 	/*
8008 	 * If advice is to be applied to entire segment,
8009 	 * use advice field in seg_data structure
8010 	 * otherwise use appropriate vpage entry.
8011 	 */
8012 	if ((addr == seg->s_base) && (len == seg->s_size)) {
8013 		switch (behav) {
8014 		case MADV_ACCESS_LWP:
8015 		case MADV_ACCESS_MANY:
8016 		case MADV_ACCESS_DEFAULT:
8017 			/*
8018 			 * Set memory allocation policy for this segment
8019 			 */
8020 			policy = lgrp_madv_to_policy(behav, len, svd->type);
8021 			if (svd->type == MAP_SHARED)
8022 				already_set = lgrp_shm_policy_set(policy, amp,
8023 				    svd->anon_index, vp, svd->offset, len);
8024 			else {
8025 				/*
8026 				 * For private memory, need writers lock on
8027 				 * address space because the segment may be
8028 				 * split or concatenated when changing policy
8029 				 */
8030 				if (AS_READ_HELD(seg->s_as,
8031 				    &seg->s_as->a_lock)) {
8032 					SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8033 					return (IE_RETRY);
8034 				}
8035 
8036 				already_set = lgrp_privm_policy_set(policy,
8037 				    &svd->policy_info, len);
8038 			}
8039 
8040 			/*
8041 			 * If policy set already and it shouldn't be reapplied,
8042 			 * don't do anything.
8043 			 */
8044 			if (already_set &&
8045 			    !LGRP_MEM_POLICY_REAPPLICABLE(policy))
8046 				break;
8047 
8048 			/*
8049 			 * Mark any existing pages in given range for
8050 			 * migration
8051 			 */
8052 			page_mark_migrate(seg, addr, len, amp, svd->anon_index,
8053 			    vp, svd->offset, 1);
8054 
8055 			/*
8056 			 * If same policy set already or this is a shared
8057 			 * memory segment, don't need to try to concatenate
8058 			 * segment with adjacent ones.
8059 			 */
8060 			if (already_set || svd->type == MAP_SHARED)
8061 				break;
8062 
8063 			/*
8064 			 * Try to concatenate this segment with previous
8065 			 * one and next one, since we changed policy for
8066 			 * this one and it may be compatible with adjacent
8067 			 * ones now.
8068 			 */
8069 			prev = AS_SEGPREV(seg->s_as, seg);
8070 			next = AS_SEGNEXT(seg->s_as, seg);
8071 
8072 			if (next && next->s_ops == &segvn_ops &&
8073 			    addr + len == next->s_base)
8074 				(void) segvn_concat(seg, next, 1);
8075 
8076 			if (prev && prev->s_ops == &segvn_ops &&
8077 			    addr == prev->s_base + prev->s_size) {
8078 				/*
8079 				 * Drop lock for private data of current
8080 				 * segment before concatenating (deleting) it
8081 				 * and return IE_REATTACH to tell as_ctl() that
8082 				 * current segment has changed
8083 				 */
8084 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8085 				if (!segvn_concat(prev, seg, 1))
8086 					err = IE_REATTACH;
8087 
8088 				return (err);
8089 			}
8090 			break;
8091 
8092 		case MADV_SEQUENTIAL:
8093 			/*
8094 			 * unloading the mappings guarantees that
8095 			 * segvn_fault will detect sequential access
8096 			 */
8097 			ASSERT(seg->s_szc == 0);
8098 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
8099 			hat_unload(seg->s_as->a_hat, addr, len,
8100 			    HAT_UNLOAD);
8101 			/* FALLTHROUGH */
8102 		case MADV_NORMAL:
8103 		case MADV_RANDOM:
8104 			svd->advice = (uchar_t)behav;
8105 			svd->pageadvice = 0;
8106 			break;
8107 		case MADV_WILLNEED:	/* handled in memcntl */
8108 		case MADV_DONTNEED:	/* handled in memcntl */
8109 		case MADV_FREE:		/* handled above */
8110 			break;
8111 		default:
8112 			err = EINVAL;
8113 		}
8114 	} else {
8115 		caddr_t			eaddr;
8116 		struct seg		*new_seg;
8117 		struct segvn_data	*new_svd;
8118 		u_offset_t		off;
8119 		caddr_t			oldeaddr;
8120 
8121 		page = seg_page(seg, addr);
8122 
8123 		segvn_vpage(seg);
8124 
8125 		switch (behav) {
8126 			struct vpage *bvpp, *evpp;
8127 
8128 		case MADV_ACCESS_LWP:
8129 		case MADV_ACCESS_MANY:
8130 		case MADV_ACCESS_DEFAULT:
8131 			/*
8132 			 * Set memory allocation policy for portion of this
8133 			 * segment
8134 			 */
8135 
8136 			/*
8137 			 * Align address and length of advice to page
8138 			 * boundaries for large pages
8139 			 */
8140 			if (seg->s_szc != 0) {
8141 				size_t	pgsz;
8142 
8143 				pgsz = page_get_pagesize(seg->s_szc);
8144 				addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
8145 				len = P2ROUNDUP(len, pgsz);
8146 			}
8147 
8148 			/*
8149 			 * Check to see whether policy is set already
8150 			 */
8151 			policy = lgrp_madv_to_policy(behav, len, svd->type);
8152 
8153 			anon_index = svd->anon_index + page;
8154 			off = svd->offset + (uintptr_t)(addr - seg->s_base);
8155 
8156 			if (svd->type == MAP_SHARED)
8157 				already_set = lgrp_shm_policy_set(policy, amp,
8158 				    anon_index, vp, off, len);
8159 			else
8160 				already_set =
8161 				    (policy == svd->policy_info.mem_policy);
8162 
8163 			/*
8164 			 * If policy set already and it shouldn't be reapplied,
8165 			 * don't do anything.
8166 			 */
8167 			if (already_set &&
8168 			    !LGRP_MEM_POLICY_REAPPLICABLE(policy))
8169 				break;
8170 
8171 			/*
8172 			 * For private memory, need writers lock on
8173 			 * address space because the segment may be
8174 			 * split or concatenated when changing policy
8175 			 */
8176 			if (svd->type == MAP_PRIVATE &&
8177 			    AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) {
8178 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8179 				return (IE_RETRY);
8180 			}
8181 
8182 			/*
8183 			 * Mark any existing pages in given range for
8184 			 * migration
8185 			 */
8186 			page_mark_migrate(seg, addr, len, amp, svd->anon_index,
8187 			    vp, svd->offset, 1);
8188 
8189 			/*
8190 			 * Don't need to try to split or concatenate
8191 			 * segments, since policy is same or this is a shared
8192 			 * memory segment
8193 			 */
8194 			if (already_set || svd->type == MAP_SHARED)
8195 				break;
8196 
8197 			if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
8198 				ASSERT(svd->amp == NULL);
8199 				ASSERT(svd->tr_state == SEGVN_TR_OFF);
8200 				ASSERT(svd->softlockcnt == 0);
8201 				hat_leave_region(seg->s_as->a_hat, svd->rcookie,
8202 				    HAT_REGION_TEXT);
8203 				svd->rcookie = HAT_INVALID_REGION_COOKIE;
8204 			}
8205 
8206 			/*
8207 			 * Split off new segment if advice only applies to a
8208 			 * portion of existing segment starting in middle
8209 			 */
8210 			new_seg = NULL;
8211 			eaddr = addr + len;
8212 			oldeaddr = seg->s_base + seg->s_size;
8213 			if (addr > seg->s_base) {
8214 				/*
8215 				 * Must flush I/O page cache
8216 				 * before splitting segment
8217 				 */
8218 				if (svd->softlockcnt > 0)
8219 					segvn_purge(seg);
8220 
8221 				/*
8222 				 * Split segment and return IE_REATTACH to tell
8223 				 * as_ctl() that current segment changed
8224 				 */
8225 				new_seg = segvn_split_seg(seg, addr);
8226 				new_svd = (struct segvn_data *)new_seg->s_data;
8227 				err = IE_REATTACH;
8228 
8229 				/*
8230 				 * If new segment ends where old one
8231 				 * did, try to concatenate the new
8232 				 * segment with next one.
8233 				 */
8234 				if (eaddr == oldeaddr) {
8235 					/*
8236 					 * Set policy for new segment
8237 					 */
8238 					(void) lgrp_privm_policy_set(policy,
8239 					    &new_svd->policy_info,
8240 					    new_seg->s_size);
8241 
8242 					next = AS_SEGNEXT(new_seg->s_as,
8243 					    new_seg);
8244 
8245 					if (next &&
8246 					    next->s_ops == &segvn_ops &&
8247 					    eaddr == next->s_base)
8248 						(void) segvn_concat(new_seg,
8249 						    next, 1);
8250 				}
8251 			}
8252 
8253 			/*
8254 			 * Split off end of existing segment if advice only
8255 			 * applies to a portion of segment ending before
8256 			 * end of the existing segment
8257 			 */
8258 			if (eaddr < oldeaddr) {
8259 				/*
8260 				 * Must flush I/O page cache
8261 				 * before splitting segment
8262 				 */
8263 				if (svd->softlockcnt > 0)
8264 					segvn_purge(seg);
8265 
8266 				/*
8267 				 * If beginning of old segment was already
8268 				 * split off, use new segment to split end off
8269 				 * from.
8270 				 */
8271 				if (new_seg != NULL && new_seg != seg) {
8272 					/*
8273 					 * Split segment
8274 					 */
8275 					(void) segvn_split_seg(new_seg, eaddr);
8276 
8277 					/*
8278 					 * Set policy for new segment
8279 					 */
8280 					(void) lgrp_privm_policy_set(policy,
8281 					    &new_svd->policy_info,
8282 					    new_seg->s_size);
8283 				} else {
8284 					/*
8285 					 * Split segment and return IE_REATTACH
8286 					 * to tell as_ctl() that current
8287 					 * segment changed
8288 					 */
8289 					(void) segvn_split_seg(seg, eaddr);
8290 					err = IE_REATTACH;
8291 
8292 					(void) lgrp_privm_policy_set(policy,
8293 					    &svd->policy_info, seg->s_size);
8294 
8295 					/*
8296 					 * If new segment starts where old one
8297 					 * did, try to concatenate it with
8298 					 * previous segment.
8299 					 */
8300 					if (addr == seg->s_base) {
8301 						prev = AS_SEGPREV(seg->s_as,
8302 						    seg);
8303 
8304 						/*
8305 						 * Drop lock for private data
8306 						 * of current segment before
8307 						 * concatenating (deleting) it
8308 						 */
8309 						if (prev &&
8310 						    prev->s_ops ==
8311 						    &segvn_ops &&
8312 						    addr == prev->s_base +
8313 						    prev->s_size) {
8314 							SEGVN_LOCK_EXIT(
8315 							    seg->s_as,
8316 							    &svd->lock);
8317 							(void) segvn_concat(
8318 							    prev, seg, 1);
8319 							return (err);
8320 						}
8321 					}
8322 				}
8323 			}
8324 			break;
8325 		case MADV_SEQUENTIAL:
8326 			ASSERT(seg->s_szc == 0);
8327 			ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
8328 			hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD);
8329 			/* FALLTHROUGH */
8330 		case MADV_NORMAL:
8331 		case MADV_RANDOM:
8332 			bvpp = &svd->vpage[page];
8333 			evpp = &svd->vpage[page + (len >> PAGESHIFT)];
8334 			for (; bvpp < evpp; bvpp++)
8335 				VPP_SETADVICE(bvpp, behav);
8336 			svd->advice = MADV_NORMAL;
8337 			break;
8338 		case MADV_WILLNEED:	/* handled in memcntl */
8339 		case MADV_DONTNEED:	/* handled in memcntl */
8340 		case MADV_FREE:		/* handled above */
8341 			break;
8342 		default:
8343 			err = EINVAL;
8344 		}
8345 	}
8346 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8347 	return (err);
8348 }
8349 
8350 /*
8351  * Create a vpage structure for this seg.
8352  */
8353 static void
8354 segvn_vpage(struct seg *seg)
8355 {
8356 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8357 	struct vpage *vp, *evp;
8358 
8359 	ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
8360 
8361 	/*
8362 	 * If no vpage structure exists, allocate one.  Copy the protections
8363 	 * and the advice from the segment itself to the individual pages.
8364 	 */
8365 	if (svd->vpage == NULL) {
8366 		svd->pageadvice = 1;
8367 		svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage),
8368 		    KM_SLEEP);
8369 		evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)];
8370 		for (vp = svd->vpage; vp < evp; vp++) {
8371 			VPP_SETPROT(vp, svd->prot);
8372 			VPP_SETADVICE(vp, svd->advice);
8373 		}
8374 	}
8375 }
8376 
8377 /*
8378  * Dump the pages belonging to this segvn segment.
8379  */
8380 static void
8381 segvn_dump(struct seg *seg)
8382 {
8383 	struct segvn_data *svd;
8384 	page_t *pp;
8385 	struct anon_map *amp;
8386 	ulong_t	anon_index;
8387 	struct vnode *vp;
8388 	u_offset_t off, offset;
8389 	pfn_t pfn;
8390 	pgcnt_t page, npages;
8391 	caddr_t addr;
8392 
8393 	npages = seg_pages(seg);
8394 	svd = (struct segvn_data *)seg->s_data;
8395 	vp = svd->vp;
8396 	off = offset = svd->offset;
8397 	addr = seg->s_base;
8398 
8399 	if ((amp = svd->amp) != NULL) {
8400 		anon_index = svd->anon_index;
8401 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8402 	}
8403 
8404 	for (page = 0; page < npages; page++, offset += PAGESIZE) {
8405 		struct anon *ap;
8406 		int we_own_it = 0;
8407 
8408 		if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
8409 			swap_xlate_nopanic(ap, &vp, &off);
8410 		} else {
8411 			vp = svd->vp;
8412 			off = offset;
8413 		}
8414 
8415 		/*
8416 		 * If pp == NULL, the page either does not exist
8417 		 * or is exclusively locked.  So determine if it
8418 		 * exists before searching for it.
8419 		 */
8420 
8421 		if ((pp = page_lookup_nowait(vp, off, SE_SHARED)))
8422 			we_own_it = 1;
8423 		else
8424 			pp = page_exists(vp, off);
8425 
8426 		if (pp) {
8427 			pfn = page_pptonum(pp);
8428 			dump_addpage(seg->s_as, addr, pfn);
8429 			if (we_own_it)
8430 				page_unlock(pp);
8431 		}
8432 		addr += PAGESIZE;
8433 		dump_timeleft = dump_timeout;
8434 	}
8435 
8436 	if (amp != NULL)
8437 		ANON_LOCK_EXIT(&amp->a_rwlock);
8438 }
8439 
8440 /*
8441  * lock/unlock anon pages over a given range. Return shadow list
8442  */
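/*
 * This is the fast path used by as_pagelock() for physio: on success *ppp
 * points at a shadow list of held pages that is also cached in seg_pcache
 * (see seg_plookup()/seg_pinsert() below), so repeated I/O to the same range
 * can skip the per-page work.  Vnode-backed segments (ENOTSUP) and segments
 * with no anon map yet (EFAULT) are punted back to the caller, which falls
 * back to the slower fault path.
 */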
8443 static int
8444 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
8445     enum lock_type type, enum seg_rw rw)
8446 {
8447 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8448 	size_t np, adjustpages = 0, npages = (len >> PAGESHIFT);
8449 	ulong_t anon_index;
8450 	uint_t protchk;
8451 	uint_t error;
8452 	struct anon_map *amp;
8453 	struct page **pplist, **pl, *pp;
8454 	caddr_t a;
8455 	size_t page;
8456 	caddr_t lpgaddr, lpgeaddr;
8457 	pgcnt_t szc0_npages = 0;
8458 
8459 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
8460 	    "segvn_pagelock: start seg %p addr %p", seg, addr);
8461 
8462 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8463 	if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) {
8464 		/*
8465 		 * We are adjusting the pagelock region to the large page size
8466 		 * boundary because the unlocked part of a large page cannot
8467 		 * be freed anyway unless all constituent pages of a large
8468 		 * page are locked. Therefore this adjustment allows us to
8469 		 * decrement availrmem by the right value (note we don't want
8470 		 * to just decrement availrmem by the large page size without
8471 		 * adjusting addr and len because then we may end up
8472 		 * decrementing availrmem by large page size for every
8473 		 * constituent page locked by a new as_pagelock call).
8474 		 * as_pageunlock caller must always match as_pagelock call's
8475 		 * addr and len.
8476 		 *
8477 		 * Note segment's page size cannot change while we are holding
8478 		 * as lock.  And then it cannot change while softlockcnt is
8479 		 * not 0. This will allow us to correctly recalculate large
8480 		 * page size region for the matching pageunlock/reclaim call.
8481 		 *
8482 		 * for pageunlock *ppp points to the pointer of page_t that
8483 		 * corresponds to the real unadjusted start address. Similar
8484 		 * for pagelock *ppp must point to the pointer of page_t that
8485 		 * corresponds to the real unadjusted start address.
8486 		 */
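		/*
		 * Worked example (sizes are illustrative): for an 8K request
		 * that starts in the middle of a mapping built from 4M pages,
		 * CALC_LPG_REGION widens lpgaddr/lpgeaddr to the enclosing 4M
		 * boundaries, and adjustpages records how many PAGESIZE pages
		 * the original addr lies past lpgaddr so that *ppp can still
		 * point at the caller's unadjusted start address.
		 */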
8487 		size_t pgsz = page_get_pagesize(seg->s_szc);
8488 		CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
8489 		adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT;
8490 	}
8491 
8492 	if (type == L_PAGEUNLOCK) {
8493 
8494 		/*
8495 		 * update hat ref bits for /proc. We need to make sure
8496 		 * that threads tracing the ref and mod bits of the
8497 		 * address space get the right data.
8498 		 * Note: page ref and mod bits are updated at reclaim time
8499 		 */
8500 		if (seg->s_as->a_vbits) {
8501 			for (a = addr; a < addr + len; a += PAGESIZE) {
8502 				if (rw == S_WRITE) {
8503 					hat_setstat(seg->s_as, a,
8504 					    PAGESIZE, P_REF | P_MOD);
8505 				} else {
8506 					hat_setstat(seg->s_as, a,
8507 					    PAGESIZE, P_REF);
8508 				}
8509 			}
8510 		}
8511 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8512 		if (seg->s_szc != 0) {
8513 			VM_STAT_ADD(segvnvmstats.pagelock[0]);
8514 			seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr,
8515 			    *ppp - adjustpages, rw, segvn_reclaim);
8516 		} else {
8517 			seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim);
8518 		}
8519 
8520 		/*
8521 		 * If someone is blocked while unmapping, we purge
8522 		 * segment page cache and thus reclaim pplist synchronously
8523 		 * without waiting for seg_pasync_thread. This speeds up
8524 		 * unmapping in cases where munmap(2) is called, while
8525 		 * raw async i/o is still in progress or where a thread
8526 		 * exits on data fault in a multithreaded application.
8527 		 */
8528 		if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) {
8529 			/*
8530 			 * Even if we grab segvn WRITER's lock or segp_slock
8531 			 * here, there might be another thread which could've
8532 			 * successfully performed lookup/insert just before
8533 			 * we acquired the lock here.  So, grabbing either
8534 			 * lock here is not of much use.  Until we devise
8535 			 * a strategy at upper layers to solve the
8536 			 * synchronization issues completely, we expect
8537 			 * applications to handle this appropriately.
8538 			 */
8539 			segvn_purge(seg);
8540 		}
8541 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8542 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
8543 		    "segvn_pagelock: unlock seg %p addr %p", seg, addr);
8544 		return (0);
8545 	} else if (type == L_PAGERECLAIM) {
8546 		VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]);
8547 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8548 		(void) segvn_reclaim(seg, addr, len, *ppp, rw);
8549 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8550 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END,
8551 		    "segvn_pagelock: reclaim seg %p addr %p", seg, addr);
8552 		return (0);
8553 	}
8554 
8555 	if (seg->s_szc != 0) {
8556 		VM_STAT_ADD(segvnvmstats.pagelock[2]);
8557 		addr = lpgaddr;
8558 		len = lpgeaddr - lpgaddr;
8559 		npages = (len >> PAGESHIFT);
8560 	}
8561 
8562 	/*
8563 	 * for now we only support pagelock to anon memory. We'd have to check
8564 	 * protections for vnode objects and call into the vnode driver.
8565 	 * That's too much for a fast path. Let the fault entry point handle it.
8566 	 */
8567 	if (svd->vp != NULL) {
8568 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8569 		    "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr);
8570 		*ppp = NULL;
8571 		return (ENOTSUP);
8572 	}
8573 
8574 	/*
8575 	 * if anonmap is not yet created, let the fault entry point populate it
8576 	 * with anon ptrs.
8577 	 */
8578 	if ((amp = svd->amp) == NULL) {
8579 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8580 		    "segvn_pagelock: anonmap null seg %p addr %p", seg, addr);
8581 		*ppp = NULL;
8582 		return (EFAULT);
8583 	}
8584 
8585 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8586 
8587 	/*
8588 	 * we acquire segp_slock to prevent duplicate entries
8589 	 * in seg_pcache
8590 	 */
8591 	mutex_enter(&svd->segp_slock);
8592 
8593 	/*
8594 	 * try to find pages in segment page cache
8595 	 */
8596 	pplist = seg_plookup(seg, addr, len, rw);
8597 	if (pplist != NULL) {
8598 		mutex_exit(&svd->segp_slock);
8599 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8600 		*ppp = pplist + adjustpages;
8601 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
8602 		    "segvn_pagelock: cache hit seg %p addr %p", seg, addr);
8603 		return (0);
8604 	}
8605 
8606 	if (rw == S_READ) {
8607 		protchk = PROT_READ;
8608 	} else {
8609 		protchk = PROT_WRITE;
8610 	}
8611 
8612 	if (svd->pageprot == 0) {
8613 		if ((svd->prot & protchk) == 0) {
8614 			mutex_exit(&svd->segp_slock);
8615 			error = EFAULT;
8616 			goto out;
8617 		}
8618 	} else {
8619 		/*
8620 		 * check page protections
8621 		 */
8622 		for (a = addr; a < addr + len; a += PAGESIZE) {
8623 			struct vpage *vp;
8624 
8625 			vp = &svd->vpage[seg_page(seg, a)];
8626 			if ((VPP_PROT(vp) & protchk) == 0) {
8627 				mutex_exit(&svd->segp_slock);
8628 				error = EFAULT;
8629 				goto out;
8630 			}
8631 		}
8632 	}
8633 
8634 	/*
8635 	 * Avoid per page overhead of segvn_slock_anonpages() for small
8636 	 * pages. For large pages segvn_slock_anonpages() only does real
8637 	 * work once per large page.  The tradeoff is that we may decrement
8638 	 * availrmem more than once for the same page but this is ok
8639 	 * for small pages.
8640 	 */
8641 	if (seg->s_szc == 0) {
8642 		mutex_enter(&freemem_lock);
8643 		if (availrmem < tune.t_minarmem + npages) {
8644 			mutex_exit(&freemem_lock);
8645 			mutex_exit(&svd->segp_slock);
8646 			error = ENOMEM;
8647 			goto out;
8648 		}
8649 		availrmem -= npages;
8650 		mutex_exit(&freemem_lock);
8651 	}
8652 
8653 	pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
8654 	pl = pplist;
8655 	*ppp = pplist + adjustpages;
8656 
8657 	page = seg_page(seg, addr);
8658 	anon_index = svd->anon_index + page;
8659 
8660 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8661 	for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
8662 		struct anon *ap;
8663 		struct vnode *vp;
8664 		u_offset_t off;
8665 		anon_sync_obj_t cookie;
8666 
8667 		anon_array_enter(amp, anon_index, &cookie);
8668 		ap = anon_get_ptr(amp->ahp, anon_index);
8669 		if (ap == NULL) {
8670 			anon_array_exit(&cookie);
8671 			break;
8672 		} else {
8673 			/*
8674 			 * We must never use seg_pcache for COW pages
8675 			 * because we might end up with original page still
8676 			 * lying in seg_pcache even after private page is
8677 			 * created. This leads to data corruption as
8678 			 * aio_write refers to the page still in cache
8679 			 * while all other accesses refer to the private
8680 			 * page.
8681 			 */
8682 			if (ap->an_refcnt != 1) {
8683 				anon_array_exit(&cookie);
8684 				break;
8685 			}
8686 		}
8687 		swap_xlate(ap, &vp, &off);
8688 		anon_array_exit(&cookie);
8689 
8690 		pp = page_lookup_nowait(vp, off, SE_SHARED);
8691 		if (pp == NULL) {
8692 			break;
8693 		}
8694 		if (seg->s_szc != 0 || pp->p_szc != 0) {
8695 			if (!segvn_slock_anonpages(pp, a == addr)) {
8696 				page_unlock(pp);
8697 				break;
8698 			}
8699 		} else {
8700 			szc0_npages++;
8701 		}
8702 		*pplist++ = pp;
8703 	}
8704 	ANON_LOCK_EXIT(&amp->a_rwlock);
8705 
8706 	ASSERT(npages >= szc0_npages);
8707 
8708 	if (a >= addr + len) {
8709 		mutex_enter(&freemem_lock);
8710 		if (seg->s_szc == 0 && npages != szc0_npages) {
8711 			ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0);
8712 			availrmem += (npages - szc0_npages);
8713 		}
8714 		svd->softlockcnt += npages;
8715 		segvn_pages_locked += npages;
8716 		mutex_exit(&freemem_lock);
8717 		(void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
8718 		    segvn_reclaim);
8719 		mutex_exit(&svd->segp_slock);
8720 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8721 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
8722 		    "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
8723 		return (0);
8724 	}
8725 
8726 	mutex_exit(&svd->segp_slock);
8727 	if (seg->s_szc == 0) {
8728 		mutex_enter(&freemem_lock);
8729 		availrmem += npages;
8730 		mutex_exit(&freemem_lock);
8731 	}
8732 	error = EFAULT;
8733 	pplist = pl;
8734 	np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
8735 	while (np > (uint_t)0) {
8736 		ASSERT(PAGE_LOCKED(*pplist));
8737 		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8738 			segvn_sunlock_anonpages(*pplist, pplist == pl);
8739 		}
8740 		page_unlock(*pplist);
8741 		np--;
8742 		pplist++;
8743 	}
8744 	kmem_free(pl, sizeof (page_t *) * npages);
8745 out:
8746 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8747 	*ppp = NULL;
8748 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8749 	    "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
8750 	return (error);
8751 }
8752 
8753 /*
8754  * purge any cached pages in the I/O page cache
8755  */
8756 static void
8757 segvn_purge(struct seg *seg)
8758 {
8759 	seg_ppurge(seg);
8760 }
8761 
8762 static int
8763 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
8764 	enum seg_rw rw)
8765 {
8766 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8767 	pgcnt_t np, npages;
8768 	struct page **pl;
8769 	pgcnt_t szc0_npages = 0;
8770 
8771 #ifdef lint
8772 	addr = addr;
8773 #endif
8774 
8775 	npages = np = (len >> PAGESHIFT);
8776 	ASSERT(npages);
8777 	pl = pplist;
8778 	if (seg->s_szc != 0) {
8779 		size_t pgsz = page_get_pagesize(seg->s_szc);
8780 		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
8781 			panic("segvn_reclaim: unaligned addr or len");
8782 			/*NOTREACHED*/
8783 		}
8784 	}
8785 
8786 	ASSERT(svd->vp == NULL && svd->amp != NULL);
8787 
8788 	while (np > (uint_t)0) {
8789 		if (rw == S_WRITE) {
8790 			hat_setrefmod(*pplist);
8791 		} else {
8792 			hat_setref(*pplist);
8793 		}
8794 		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8795 			segvn_sunlock_anonpages(*pplist, pplist == pl);
8796 		} else {
8797 			szc0_npages++;
8798 		}
8799 		page_unlock(*pplist);
8800 		np--;
8801 		pplist++;
8802 	}
8803 	kmem_free(pl, sizeof (page_t *) * npages);
8804 
8805 	mutex_enter(&freemem_lock);
8806 	segvn_pages_locked -= npages;
8807 	svd->softlockcnt -= npages;
8808 	if (szc0_npages != 0) {
8809 		availrmem += szc0_npages;
8810 	}
8811 	mutex_exit(&freemem_lock);
8812 	if (svd->softlockcnt <= 0) {
8813 		if (AS_ISUNMAPWAIT(seg->s_as)) {
8814 			mutex_enter(&seg->s_as->a_contents);
8815 			if (AS_ISUNMAPWAIT(seg->s_as)) {
8816 				AS_CLRUNMAPWAIT(seg->s_as);
8817 				cv_broadcast(&seg->s_as->a_cv);
8818 			}
8819 			mutex_exit(&seg->s_as->a_contents);
8820 		}
8821 	}
8822 	return (0);
8823 }
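/*
 * In summary, segvn_reclaim() undoes what segvn_pagelock() set up for a
 * cached page list: for every page it propagates the ref bit (and the mod
 * bit for S_WRITE) with hat_setref()/hat_setrefmod(), drops the shared
 * anon-page lock taken for large-page mappings, and page_unlock()s the page.
 * It then frees the list, returns the availrmem charged for szc-0 pages,
 * decrements softlockcnt and, once that count reaches zero, wakes any waiter
 * blocked on the address space (AS_ISUNMAPWAIT).
 */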
8824 /*
8825  * get a memory ID for an addr in a given segment
8826  *
8827  * XXX only creates PAGESIZE pages if anon slots are not initialized.
8828  * At fault time they will be relocated into larger pages.
8829  */
8830 static int
8831 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
8832 {
8833 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8834 	struct anon 	*ap = NULL;
8835 	ulong_t		anon_index;
8836 	struct anon_map	*amp;
8837 	anon_sync_obj_t cookie;
8838 
8839 	if (svd->type == MAP_PRIVATE) {
8840 		memidp->val[0] = (uintptr_t)seg->s_as;
8841 		memidp->val[1] = (uintptr_t)addr;
8842 		return (0);
8843 	}
8844 
8845 	if (svd->type == MAP_SHARED) {
8846 		if (svd->vp) {
8847 			memidp->val[0] = (uintptr_t)svd->vp;
8848 			memidp->val[1] = (u_longlong_t)svd->offset +
8849 			    (uintptr_t)(addr - seg->s_base);
8850 			return (0);
8851 		} else {
8852 
8853 			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8854 			if ((amp = svd->amp) != NULL) {
8855 				anon_index = svd->anon_index +
8856 				    seg_page(seg, addr);
8857 			}
8858 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8859 
8860 			ASSERT(amp != NULL);
8861 
8862 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8863 			anon_array_enter(amp, anon_index, &cookie);
8864 			ap = anon_get_ptr(amp->ahp, anon_index);
8865 			if (ap == NULL) {
8866 				page_t		*pp;
8867 
8868 				pp = anon_zero(seg, addr, &ap, svd->cred);
8869 				if (pp == NULL) {
8870 					anon_array_exit(&cookie);
8871 					ANON_LOCK_EXIT(&amp->a_rwlock);
8872 					return (ENOMEM);
8873 				}
8874 				ASSERT(anon_get_ptr(amp->ahp, anon_index)
8875 				    == NULL);
8876 				(void) anon_set_ptr(amp->ahp, anon_index,
8877 				    ap, ANON_SLEEP);
8878 				page_unlock(pp);
8879 			}
8880 
8881 			anon_array_exit(&cookie);
8882 			ANON_LOCK_EXIT(&amp->a_rwlock);
8883 
8884 			memidp->val[0] = (uintptr_t)ap;
8885 			memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
8886 			return (0);
8887 		}
8888 	}
8889 	return (EINVAL);
8890 }
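/*
 * The memid encoding produced above, in short:
 *
 *	MAP_PRIVATE:		val[0] = address space,	val[1] = virtual address
 *	MAP_SHARED + vnode:	val[0] = vnode,		val[1] = file offset
 *	MAP_SHARED anon-only:	val[0] = anon slot,	val[1] = offset in page
 *
 * In the anon-only case a zero-filled PAGESIZE page is created on the fly
 * when the anon slot hasn't been initialized yet (see the XXX note above).
 */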
8891 
8892 static int
8893 sameprot(struct seg *seg, caddr_t a, size_t len)
8894 {
8895 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8896 	struct vpage *vpage;
8897 	spgcnt_t pages = btop(len);
8898 	uint_t prot;
8899 
8900 	if (svd->pageprot == 0)
8901 		return (1);
8902 
8903 	ASSERT(svd->vpage != NULL);
8904 
8905 	vpage = &svd->vpage[seg_page(seg, a)];
8906 	prot = VPP_PROT(vpage);
8907 	vpage++;
8908 	pages--;
8909 	while (pages-- > 0) {
8910 		if (prot != VPP_PROT(vpage))
8911 			return (0);
8912 		vpage++;
8913 	}
8914 	return (1);
8915 }
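/*
 * sameprot() returns 1 when every vpage in [a, a + len) carries identical
 * per-page protections (or when the segment has no per-page protections at
 * all).  Illustrative use by a large-page code path (hypothetical caller and
 * variable names, shown only to make the contract concrete):
 *
 *	if (!sameprot(seg, lpgaddr, pgsz)) {
 *		... protections differ within the large page; demote to
 *		    PAGESIZE mappings ...
 *	}
 */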
8916 
8917 /*
8918  * Get memory allocation policy info for specified address in given segment
8919  */
8920 static lgrp_mem_policy_info_t *
8921 segvn_getpolicy(struct seg *seg, caddr_t addr)
8922 {
8923 	struct anon_map		*amp;
8924 	ulong_t			anon_index;
8925 	lgrp_mem_policy_info_t	*policy_info;
8926 	struct segvn_data	*svn_data;
8927 	u_offset_t		vn_off;
8928 	vnode_t			*vp;
8929 
8930 	ASSERT(seg != NULL);
8931 
8932 	svn_data = (struct segvn_data *)seg->s_data;
8933 	if (svn_data == NULL)
8934 		return (NULL);
8935 
8936 	/*
8937 	 * Get policy info for private or shared memory
8938 	 */
8939 	if (svn_data->type != MAP_SHARED) {
8940 		if (svn_data->tr_state != SEGVN_TR_ON) {
8941 			policy_info = &svn_data->policy_info;
8942 		} else {
8943 			policy_info = &svn_data->tr_policy_info;
8944 			ASSERT(policy_info->mem_policy ==
8945 			    LGRP_MEM_POLICY_NEXT_SEG);
8946 		}
8947 	} else {
8948 		amp = svn_data->amp;
8949 		anon_index = svn_data->anon_index + seg_page(seg, addr);
8950 		vp = svn_data->vp;
8951 		vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base);
8952 		policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off);
8953 	}
8954 
8955 	return (policy_info);
8956 }
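/*
 * In short: private segments report the policy stored in the segment itself
 * (or the NEXT_SEG policy while text replication is active), whereas shared
 * segments defer to the shared policy tree keyed by anon_map/anon index and
 * vnode/offset via lgrp_shm_policy_get().
 */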
8957 
8958 /*ARGSUSED*/
8959 static int
8960 segvn_capable(struct seg *seg, segcapability_t capability)
8961 {
8962 	return (0);
8963 }
8964 
8965 /*
8966  * Bind a text vnode segment to an amp. If we bind successfully, mappings are
8967  * established to the pages of a per-vnode, per-lgroup amp instead of to the
8968  * vnode's pages. There's one amp per vnode text mapping per lgroup. Many
8969  * processes may share the same text replication amp. If a suitable amp
8970  * doesn't already exist in the svntr hash table, create a new one.  We may
8971  * fail to bind to an amp if the segment is not eligible for text replication.
8972  * The code below first checks for these conditions. If binding is successful,
8973  * the segment's tr_state is set to SEGVN_TR_ON and svd->amp points to the amp
8974  * to use. Otherwise tr_state is set to SEGVN_TR_OFF and svd->amp remains NULL.
8975  */
8976 static void
8977 segvn_textrepl(struct seg *seg)
8978 {
8979 	struct segvn_data	*svd = (struct segvn_data *)seg->s_data;
8980 	vnode_t			*vp = svd->vp;
8981 	u_offset_t		off = svd->offset;
8982 	size_t			size = seg->s_size;
8983 	u_offset_t		eoff = off + size;
8984 	uint_t			szc = seg->s_szc;
8985 	ulong_t			hash = SVNTR_HASH_FUNC(vp);
8986 	svntr_t			*svntrp;
8987 	struct vattr		va;
8988 	proc_t			*p = seg->s_as->a_proc;
8989 	lgrp_id_t		lgrp_id;
8990 	lgrp_id_t		olid;
8991 	int			first;
8992 	struct anon_map		*amp;
8993 
8994 	ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8995 	ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
8996 	ASSERT(p != NULL);
8997 	ASSERT(svd->tr_state == SEGVN_TR_INIT);
8998 	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
8999 	ASSERT(svd->flags & MAP_TEXT);
9000 	ASSERT(svd->type == MAP_PRIVATE);
9001 	ASSERT(vp != NULL && svd->amp == NULL);
9002 	ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE));
9003 	ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0);
9004 	ASSERT(seg->s_as != &kas);
9005 	ASSERT(off < eoff);
9006 	ASSERT(svntr_hashtab != NULL);
9007 
9008 	/*
9009 	 * If NUMA optimizations are no longer desired, bail out.
9010 	 */
9011 	if (!lgrp_optimizations()) {
9012 		svd->tr_state = SEGVN_TR_OFF;
9013 		return;
9014 	}
9015 
9016 	/*
9017 	 * Avoid creating anon maps with size bigger than the file size.
9018 	 * If the VOP_GETATTR() call fails, bail out.
9019 	 */
9020 	va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME;
9021 	if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL) != 0) {
9022 		svd->tr_state = SEGVN_TR_OFF;
9023 		SEGVN_TR_ADDSTAT(gaerr);
9024 		return;
9025 	}
9026 	if (btopr(va.va_size) < btopr(eoff)) {
9027 		svd->tr_state = SEGVN_TR_OFF;
9028 		SEGVN_TR_ADDSTAT(overmap);
9029 		return;
9030 	}
9031 
9032 	/*
9033 	 * VVMEXEC may not be set yet if exec() prefaults the text segment. Set
9034 	 * this flag now, before vn_is_mapped(V_WRITE), so that a MAP_SHARED
9035 	 * mapping that checks whether the trcache for this vnode needs to be
9036 	 * invalidated can't miss us.
9037 	 */
9038 	if (!(vp->v_flag & VVMEXEC)) {
9039 		mutex_enter(&vp->v_lock);
9040 		vp->v_flag |= VVMEXEC;
9041 		mutex_exit(&vp->v_lock);
9042 	}
9043 	mutex_enter(&svntr_hashtab[hash].tr_lock);
9044 	/*
9045 	 * Bail out if potentially writable MAP_SHARED mappings exist to this
9046 	 * vnode.  We don't want to use old file contents from existing
9047 	 * replicas if this mapping was established after the original file
9048 	 * was changed.
9049 	 */
9050 	if (vn_is_mapped(vp, V_WRITE)) {
9051 		mutex_exit(&svntr_hashtab[hash].tr_lock);
9052 		svd->tr_state = SEGVN_TR_OFF;
9053 		SEGVN_TR_ADDSTAT(wrcnt);
9054 		return;
9055 	}
9056 	svntrp = svntr_hashtab[hash].tr_head;
9057 	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
9058 		ASSERT(svntrp->tr_refcnt != 0);
9059 		if (svntrp->tr_vp != vp) {
9060 			continue;
9061 		}
9062 
9063 		/*
9064 		 * Bail out if the file or its attributes were changed after
9065 		 * this replication entry was created since we need to use the
9066 		 * latest file contents. Note that mtime test alone is not
9067 		 * sufficient because a user can explicitly change mtime via
9068 		 * utimes(2) interfaces back to the old value after modifying
9069 		 * the file contents. To detect this case we also have to test
9070 		 * ctime which among other things records the time of the last
9071 		 * mtime change by utimes(2). ctime is not changed when the file
9072 		 * is only read or executed so we expect that typically existing
9073 		 * replication amps can be used most of the time.
9074 		 */
9075 		if (!svntrp->tr_valid ||
9076 		    svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec ||
9077 		    svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec ||
9078 		    svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec ||
9079 		    svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) {
9080 			mutex_exit(&svntr_hashtab[hash].tr_lock);
9081 			svd->tr_state = SEGVN_TR_OFF;
9082 			SEGVN_TR_ADDSTAT(stale);
9083 			return;
9084 		}
9085 		/*
9086 		 * If off, eoff and szc match the current segment, we found an
9087 		 * existing entry we can use.
9088 		 */
9089 		if (svntrp->tr_off == off && svntrp->tr_eoff == eoff &&
9090 		    svntrp->tr_szc == szc) {
9091 			break;
9092 		}
9093 		/*
9094 		 * Don't create entries that differ but overlap in file offsets,
9095 		 * to avoid replicating the same file pages more than once per
9096 		 * lgroup.
9097 		 */
9098 		if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) ||
9099 		    (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) {
9100 			mutex_exit(&svntr_hashtab[hash].tr_lock);
9101 			svd->tr_state = SEGVN_TR_OFF;
9102 			SEGVN_TR_ADDSTAT(overlap);
9103 			return;
9104 		}
9105 	}
9106 	/*
9107 	 * If we didn't find an existing entry, create a new one.
9108 	 */
9109 	if (svntrp == NULL) {
9110 		svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP);
9111 		if (svntrp == NULL) {
9112 			mutex_exit(&svntr_hashtab[hash].tr_lock);
9113 			svd->tr_state = SEGVN_TR_OFF;
9114 			SEGVN_TR_ADDSTAT(nokmem);
9115 			return;
9116 		}
9117 #ifdef DEBUG
9118 		{
9119 			lgrp_id_t i;
9120 			for (i = 0; i < NLGRPS_MAX; i++) {
9121 				ASSERT(svntrp->tr_amp[i] == NULL);
9122 			}
9123 		}
9124 #endif /* DEBUG */
9125 		svntrp->tr_vp = vp;
9126 		svntrp->tr_off = off;
9127 		svntrp->tr_eoff = eoff;
9128 		svntrp->tr_szc = szc;
9129 		svntrp->tr_valid = 1;
9130 		svntrp->tr_mtime = va.va_mtime;
9131 		svntrp->tr_ctime = va.va_ctime;
9132 		svntrp->tr_refcnt = 0;
9133 		svntrp->tr_next = svntr_hashtab[hash].tr_head;
9134 		svntr_hashtab[hash].tr_head = svntrp;
9135 	}
9136 	first = 1;
9137 again:
9138 	/*
9139 	 * We want to pick a replica with pages on the main thread's (t_tid = 1,
9140 	 * aka T1) lgrp. Currently text replication is only optimized for
9141 	 * workloads that either have all threads of a process on the same
9142 	 * lgrp or execute their large text primarily on the main thread.
9143 	 */
9144 	lgrp_id = p->p_t1_lgrpid;
9145 	if (lgrp_id == LGRP_NONE) {
9146 		/*
9147 		 * In case exec() prefaults text on a non-main thread, use the
9148 		 * current thread's lgrpid.  It will become the main thread soon
9149 		 * anyway.
9150 		 */
9151 		lgrp_id = lgrp_home_id(curthread);
9152 	}
9153 	/*
9154 	 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet.  Otherwise
9155 	 * just set it to NLGRPS_MAX if it's different from current process T1
9156 	 * home lgrp.  p_tr_lgrpid is used to detect if process uses text
9157 	 * replication and T1 new home is different from lgrp used for text
9158 	 * replication. When this happens, the asynchronous segvn thread rechecks
9159 	 * whether segments should change the lgrps used for text replication.  If
9160 	 * we fail to set p_tr_lgrpid with cas32, set it to NLGRPS_MAX without cas
9161 	 * if it's not already NLGRPS_MAX and not equal to the lgrp_id we want to
9162 	 * use.  We don't need to use cas in this case because another thread
9163 	 * that races in between our non atomic check and set may only change
9164 	 * p_tr_lgrpid to NLGRPS_MAX at this point.
9165 	 */
9166 	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
9167 	olid = p->p_tr_lgrpid;
9168 	if (lgrp_id != olid && olid != NLGRPS_MAX) {
9169 		lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX;
9170 		if (cas32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != olid) {
9171 			olid = p->p_tr_lgrpid;
9172 			ASSERT(olid != LGRP_NONE);
9173 			if (olid != lgrp_id && olid != NLGRPS_MAX) {
9174 				p->p_tr_lgrpid = NLGRPS_MAX;
9175 			}
9176 		}
9177 		ASSERT(p->p_tr_lgrpid != LGRP_NONE);
9178 		membar_producer();
9179 		/*
9180 		 * lgrp_move_thread() won't schedule async recheck after
9181 		 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not
9182 		 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid
9183 		 * is not LGRP_NONE.
9184 		 */
9185 		if (first && p->p_t1_lgrpid != LGRP_NONE &&
9186 		    p->p_t1_lgrpid != lgrp_id) {
9187 			first = 0;
9188 			goto again;
9189 		}
9190 	}
9191 	/*
9192 	 * If no amp has been created yet for lgrp_id, create a new one as long
9193 	 * as we have enough memory to afford it.
9194 	 */
9195 	if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) {
9196 		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
9197 		if (trmem > segvn_textrepl_max_bytes) {
9198 			SEGVN_TR_ADDSTAT(normem);
9199 			goto fail;
9200 		}
9201 		if (anon_try_resv_zone(size, NULL) == 0) {
9202 			SEGVN_TR_ADDSTAT(noanon);
9203 			goto fail;
9204 		}
9205 		amp = anonmap_alloc(size, size, ANON_NOSLEEP);
9206 		if (amp == NULL) {
9207 			anon_unresv_zone(size, NULL);
9208 			SEGVN_TR_ADDSTAT(nokmem);
9209 			goto fail;
9210 		}
9211 		ASSERT(amp->refcnt == 1);
9212 		amp->a_szc = szc;
9213 		svntrp->tr_amp[lgrp_id] = amp;
9214 		SEGVN_TR_ADDSTAT(newamp);
9215 	}
9216 	svntrp->tr_refcnt++;
9217 	ASSERT(svd->svn_trnext == NULL);
9218 	ASSERT(svd->svn_trprev == NULL);
9219 	svd->svn_trnext = svntrp->tr_svnhead;
9220 	svd->svn_trprev = NULL;
9221 	if (svntrp->tr_svnhead != NULL) {
9222 		svntrp->tr_svnhead->svn_trprev = svd;
9223 	}
9224 	svntrp->tr_svnhead = svd;
9225 	ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size);
9226 	ASSERT(amp->refcnt >= 1);
9227 	svd->amp = amp;
9228 	svd->anon_index = 0;
9229 	svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG;
9230 	svd->tr_policy_info.mem_lgrpid = lgrp_id;
9231 	svd->tr_state = SEGVN_TR_ON;
9232 	mutex_exit(&svntr_hashtab[hash].tr_lock);
9233 	SEGVN_TR_ADDSTAT(repl);
9234 	return;
9235 fail:
9236 	ASSERT(segvn_textrepl_bytes >= size);
9237 	atomic_add_long(&segvn_textrepl_bytes, -size);
9238 	ASSERT(svntrp != NULL);
9239 	ASSERT(svntrp->tr_amp[lgrp_id] == NULL);
9240 	if (svntrp->tr_refcnt == 0) {
9241 		ASSERT(svntrp == svntr_hashtab[hash].tr_head);
9242 		svntr_hashtab[hash].tr_head = svntrp->tr_next;
9243 		mutex_exit(&svntr_hashtab[hash].tr_lock);
9244 		kmem_cache_free(svntr_cache, svntrp);
9245 	} else {
9246 		mutex_exit(&svntr_hashtab[hash].tr_lock);
9247 	}
9248 	svd->tr_state = SEGVN_TR_OFF;
9249 }
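/*
 * Rough decision flow of segvn_textrepl() above (summary only; stale and
 * overlapping svntr entries also force SEGVN_TR_OFF):
 *
 *	lgrp optimizations still enabled?	no  -> SEGVN_TR_OFF
 *	VOP_GETATTR() ok and file covers seg?	no  -> SEGVN_TR_OFF
 *	writable mappings to the vnode?		yes -> SEGVN_TR_OFF
 *	matching svntr entry (off/eoff/szc)?	no  -> allocate a new entry
 *	amp for the chosen lgrp exists?		no  -> reserve swap, allocate amp
 *	otherwise:				SEGVN_TR_ON, svd->amp = replica amp
 */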
9250 
9251 /*
9252  * Convert the seg back to a regular vnode mapping seg by unbinding it from
9253  * its text replication amp.  This is most typically called when the segment
9254  * is unmapped, but also when the segment no longer qualifies for text
9255  * replication (e.g. due to protection changes). If unload_unmap is set, use
9256  * the HAT_UNLOAD_UNMAP flag in hat_unload_callback().  If we are the last
9257  * user of the svntr, free all its anon maps and remove it from the hash table.
9258  */
9259 static void
9260 segvn_textunrepl(struct seg *seg, int unload_unmap)
9261 {
9262 	struct segvn_data	*svd = (struct segvn_data *)seg->s_data;
9263 	vnode_t			*vp = svd->vp;
9264 	u_offset_t		off = svd->offset;
9265 	size_t			size = seg->s_size;
9266 	u_offset_t		eoff = off + size;
9267 	uint_t			szc = seg->s_szc;
9268 	ulong_t			hash = SVNTR_HASH_FUNC(vp);
9269 	svntr_t			*svntrp;
9270 	svntr_t			**prv_svntrp;
9271 	lgrp_id_t		lgrp_id = svd->tr_policy_info.mem_lgrpid;
9272 	lgrp_id_t		i;
9273 
9274 	ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
9275 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
9276 	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
9277 	ASSERT(svd->tr_state == SEGVN_TR_ON);
9278 	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
9279 	ASSERT(svd->amp != NULL);
9280 	ASSERT(svd->amp->refcnt >= 1);
9281 	ASSERT(svd->anon_index == 0);
9282 	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
9283 	ASSERT(svntr_hashtab != NULL);
9284 
9285 	mutex_enter(&svntr_hashtab[hash].tr_lock);
9286 	prv_svntrp = &svntr_hashtab[hash].tr_head;
9287 	for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
9288 		ASSERT(svntrp->tr_refcnt != 0);
9289 		if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
9290 		    svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
9291 			break;
9292 		}
9293 	}
9294 	if (svntrp == NULL) {
9295 		panic("segvn_textunrepl: svntr record not found");
9296 	}
9297 	if (svntrp->tr_amp[lgrp_id] != svd->amp) {
9298 		panic("segvn_textunrepl: amp mismatch");
9299 	}
9300 	svd->tr_state = SEGVN_TR_OFF;
9301 	svd->amp = NULL;
9302 	if (svd->svn_trprev == NULL) {
9303 		ASSERT(svntrp->tr_svnhead == svd);
9304 		svntrp->tr_svnhead = svd->svn_trnext;
9305 		if (svntrp->tr_svnhead != NULL) {
9306 			svntrp->tr_svnhead->svn_trprev = NULL;
9307 		}
9308 		svd->svn_trnext = NULL;
9309 	} else {
9310 		svd->svn_trprev->svn_trnext = svd->svn_trnext;
9311 		if (svd->svn_trnext != NULL) {
9312 			svd->svn_trnext->svn_trprev = svd->svn_trprev;
9313 			svd->svn_trnext = NULL;
9314 		}
9315 		svd->svn_trprev = NULL;
9316 	}
9317 	if (--svntrp->tr_refcnt) {
9318 		mutex_exit(&svntr_hashtab[hash].tr_lock);
9319 		goto done;
9320 	}
9321 	*prv_svntrp = svntrp->tr_next;
9322 	mutex_exit(&svntr_hashtab[hash].tr_lock);
9323 	for (i = 0; i < NLGRPS_MAX; i++) {
9324 		struct anon_map *amp = svntrp->tr_amp[i];
9325 		if (amp == NULL) {
9326 			continue;
9327 		}
9328 		ASSERT(amp->refcnt == 1);
9329 		ASSERT(amp->swresv == size);
9330 		ASSERT(amp->size == size);
9331 		ASSERT(amp->a_szc == szc);
9332 		if (amp->a_szc != 0) {
9333 			anon_free_pages(amp->ahp, 0, size, szc);
9334 		} else {
9335 			anon_free(amp->ahp, 0, size);
9336 		}
9337 		svntrp->tr_amp[i] = NULL;
9338 		ASSERT(segvn_textrepl_bytes >= size);
9339 		atomic_add_long(&segvn_textrepl_bytes, -size);
9340 		anon_unresv_zone(amp->swresv, NULL);
9341 		amp->refcnt = 0;
9342 		anonmap_free(amp);
9343 	}
9344 	kmem_cache_free(svntr_cache, svntrp);
9345 done:
9346 	hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
9347 	    unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
9348 }
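/*
 * Teardown order used above: unlink svd from the svntr's segment list while
 * the bucket lock is held, free the per-lgroup replica amps (and their swap
 * reservations) only when the last reference to the svntr goes away, and in
 * all cases finally unload the stale translations with hat_unload_callback().
 */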
9349 
9350 /*
9351  * This is called when a MAP_SHARED writable mapping is created to a vnode
9352  * that is currently used for execution (VVMEXEC flag is set). In this case we
9353  * need to prevent further use of existing replicas.
9354  */
9355 static void
9356 segvn_inval_trcache(vnode_t *vp)
9357 {
9358 	ulong_t			hash = SVNTR_HASH_FUNC(vp);
9359 	svntr_t			*svntrp;
9360 
9361 	ASSERT(vp->v_flag & VVMEXEC);
9362 
9363 	if (svntr_hashtab == NULL) {
9364 		return;
9365 	}
9366 
9367 	mutex_enter(&svntr_hashtab[hash].tr_lock);
9368 	svntrp = svntr_hashtab[hash].tr_head;
9369 	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
9370 		ASSERT(svntrp->tr_refcnt != 0);
9371 		if (svntrp->tr_vp == vp && svntrp->tr_valid) {
9372 			svntrp->tr_valid = 0;
9373 		}
9374 	}
9375 	mutex_exit(&svntr_hashtab[hash].tr_lock);
9376 }
9377 
9378 static void
9379 segvn_trasync_thread(void)
9380 {
9381 	callb_cpr_t cpr_info;
9382 	kmutex_t cpr_lock;	/* just for CPR stuff */
9383 
9384 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
9385 
9386 	CALLB_CPR_INIT(&cpr_info, &cpr_lock,
9387 	    callb_generic_cpr, "segvn_async");
9388 
9389 	if (segvn_update_textrepl_interval == 0) {
9390 		segvn_update_textrepl_interval = segvn_update_tr_time * hz;
9391 	} else {
9392 		segvn_update_textrepl_interval *= hz;
9393 	}
9394 	(void) timeout(segvn_trupdate_wakeup, NULL,
9395 	    segvn_update_textrepl_interval);
9396 
9397 	for (;;) {
9398 		mutex_enter(&cpr_lock);
9399 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
9400 		mutex_exit(&cpr_lock);
9401 		sema_p(&segvn_trasync_sem);
9402 		mutex_enter(&cpr_lock);
9403 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
9404 		mutex_exit(&cpr_lock);
9405 		segvn_trupdate();
9406 	}
9407 }
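/*
 * The loop above is the usual CPR-safe service-thread pattern: the thread is
 * marked safe to suspend while it blocks on segvn_trasync_sem and marked busy
 * again before doing real work in segvn_trupdate().  segvn_trupdate_wakeup()
 * below decides when to post the semaphore and rearms the periodic timeout.
 */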
9408 
9409 static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;
9410 
9411 static void
9412 segvn_trupdate_wakeup(void *dummy)
9413 {
9414 	uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations();
9415 
9416 	if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) {
9417 		segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs;
9418 		sema_v(&segvn_trasync_sem);
9419 	}
9420 
9421 	if (!segvn_disable_textrepl_update &&
9422 	    segvn_update_textrepl_interval != 0) {
9423 		(void) timeout(segvn_trupdate_wakeup, dummy,
9424 		    segvn_update_textrepl_interval);
9425 	}
9426 }
9427 
9428 static void
9429 segvn_trupdate(void)
9430 {
9431 	ulong_t		hash;
9432 	svntr_t		*svntrp;
9433 	segvn_data_t	*svd;
9434 
9435 	ASSERT(svntr_hashtab != NULL);
9436 
9437 	for (hash = 0; hash < svntr_hashtab_sz; hash++) {
9438 		mutex_enter(&svntr_hashtab[hash].tr_lock);
9439 		svntrp = svntr_hashtab[hash].tr_head;
9440 		for (; svntrp != NULL; svntrp = svntrp->tr_next) {
9441 			ASSERT(svntrp->tr_refcnt != 0);
9442 			svd = svntrp->tr_svnhead;
9443 			for (; svd != NULL; svd = svd->svn_trnext) {
9444 				segvn_trupdate_seg(svd->seg, svd, svntrp,
9445 				    hash);
9446 			}
9447 		}
9448 		mutex_exit(&svntr_hashtab[hash].tr_lock);
9449 	}
9450 }
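/*
 * segvn_trupdate() walks every svntr hash bucket and every segment linked to
 * each entry, delegating the real work to segvn_trupdate_seg().  Holding the
 * bucket lock across the walk is safe because segvn_trupdate_seg() only drops
 * and retakes that same lock around the HAT unload, as described below.
 */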
9451 
9452 static void
9453 segvn_trupdate_seg(struct seg *seg,
9454 	segvn_data_t *svd,
9455 	svntr_t *svntrp,
9456 	ulong_t hash)
9457 {
9458 	proc_t			*p;
9459 	lgrp_id_t		lgrp_id;
9460 	struct as		*as;
9461 	size_t			size;
9462 	struct anon_map		*amp;
9463 
9464 	ASSERT(svd->vp != NULL);
9465 	ASSERT(svd->vp == svntrp->tr_vp);
9466 	ASSERT(svd->offset == svntrp->tr_off);
9467 	ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff);
9468 	ASSERT(seg != NULL);
9469 	ASSERT(svd->seg == seg);
9470 	ASSERT(seg->s_data == (void *)svd);
9471 	ASSERT(seg->s_szc == svntrp->tr_szc);
9472 	ASSERT(svd->tr_state == SEGVN_TR_ON);
9473 	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
9474 	ASSERT(svd->amp != NULL);
9475 	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
9476 	ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE);
9477 	ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX);
9478 	ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp);
9479 	ASSERT(svntrp->tr_refcnt != 0);
9480 	ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));
9481 
9482 	as = seg->s_as;
9483 	ASSERT(as != NULL && as != &kas);
9484 	p = as->a_proc;
9485 	ASSERT(p != NULL);
9486 	ASSERT(p->p_tr_lgrpid != LGRP_NONE);
9487 	lgrp_id = p->p_t1_lgrpid;
9488 	if (lgrp_id == LGRP_NONE) {
9489 		return;
9490 	}
9491 	ASSERT(lgrp_id < NLGRPS_MAX);
9492 	if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
9493 		return;
9494 	}
9495 
9496 	/*
9497 	 * Use tryenter locking since we are locking as/seg and svntr hash
9498 	 * lock in reverse from syncrounous thread order.
9499 	 */
9500 	if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) {
9501 		SEGVN_TR_ADDSTAT(nolock);
9502 		if (segvn_lgrp_trthr_migrs_snpsht) {
9503 			segvn_lgrp_trthr_migrs_snpsht = 0;
9504 		}
9505 		return;
9506 	}
9507 	if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
9508 		AS_LOCK_EXIT(as, &as->a_lock);
9509 		SEGVN_TR_ADDSTAT(nolock);
9510 		if (segvn_lgrp_trthr_migrs_snpsht) {
9511 			segvn_lgrp_trthr_migrs_snpsht = 0;
9512 		}
9513 		return;
9514 	}
9515 	size = seg->s_size;
9516 	if (svntrp->tr_amp[lgrp_id] == NULL) {
9517 		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
9518 		if (trmem > segvn_textrepl_max_bytes) {
9519 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9520 			AS_LOCK_EXIT(as, &as->a_lock);
9521 			atomic_add_long(&segvn_textrepl_bytes, -size);
9522 			SEGVN_TR_ADDSTAT(normem);
9523 			return;
9524 		}
9525 		if (anon_try_resv_zone(size, NULL) == 0) {
9526 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9527 			AS_LOCK_EXIT(as, &as->a_lock);
9528 			atomic_add_long(&segvn_textrepl_bytes, -size);
9529 			SEGVN_TR_ADDSTAT(noanon);
9530 			return;
9531 		}
9532 		amp = anonmap_alloc(size, size, KM_NOSLEEP);
9533 		if (amp == NULL) {
9534 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9535 			AS_LOCK_EXIT(as, &as->a_lock);
9536 			atomic_add_long(&segvn_textrepl_bytes, -size);
9537 			anon_unresv_zone(size, NULL);
9538 			SEGVN_TR_ADDSTAT(nokmem);
9539 			return;
9540 		}
9541 		ASSERT(amp->refcnt == 1);
9542 		amp->a_szc = seg->s_szc;
9543 		svntrp->tr_amp[lgrp_id] = amp;
9544 	}
9545 	/*
9546 	 * We don't need to drop the bucket lock, but here we give other
9547 	 * threads a chance.  svntr and svd can't be unlinked as long as the
9548 	 * segment lock is held as a writer and the AS lock is held as well.
9549 	 * After we retake the bucket lock we'll continue from where we left
9550 	 * off. We'll be able to reach the end of either list since new
9551 	 * entries are always added to the beginning of the lists.
9552 	 */
9553 	mutex_exit(&svntr_hashtab[hash].tr_lock);
9554 	hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
9555 	mutex_enter(&svntr_hashtab[hash].tr_lock);
9556 
9557 	ASSERT(svd->tr_state == SEGVN_TR_ON);
9558 	ASSERT(svd->amp != NULL);
9559 	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
9560 	ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
9561 	ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);
9562 
9563 	svd->tr_policy_info.mem_lgrpid = lgrp_id;
9564 	svd->amp = svntrp->tr_amp[lgrp_id];
9565 	p->p_tr_lgrpid = NLGRPS_MAX;
9566 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9567 	AS_LOCK_EXIT(as, &as->a_lock);
9568 
9569 	ASSERT(svntrp->tr_refcnt != 0);
9570 	ASSERT(svd->vp == svntrp->tr_vp);
9571 	ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
9572 	ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
9573 	ASSERT(svd->seg == seg);
9574 	ASSERT(svd->tr_state == SEGVN_TR_ON);
9575 
9576 	SEGVN_TR_ADDSTAT(asyncrepl);
9577 }
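/*
 * Once segvn_trupdate_seg() has switched svd->amp to the replica for the
 * thread's current home lgroup and unloaded the old translations, subsequent
 * faults repopulate the mappings from the new per-lgroup amp.  p_tr_lgrpid is
 * parked at NLGRPS_MAX so that later home-lgroup changes keep scheduling the
 * asynchronous recheck (see the lgrp_move_thread() comment in
 * segvn_textrepl()).
 */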
9578