xref: /titanic_41/usr/src/uts/common/vm/seg_spt.c (revision 23a276b1252962c987a613be470dde26561247b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/user.h>
30 #include <sys/mman.h>
31 #include <sys/kmem.h>
32 #include <sys/sysmacros.h>
33 #include <sys/cmn_err.h>
34 #include <sys/systm.h>
35 #include <sys/tuneable.h>
36 #include <vm/hat.h>
37 #include <vm/seg.h>
38 #include <vm/as.h>
39 #include <vm/anon.h>
40 #include <vm/page.h>
41 #include <sys/buf.h>
42 #include <sys/swap.h>
43 #include <sys/atomic.h>
44 #include <vm/seg_spt.h>
45 #include <sys/debug.h>
46 #include <sys/vtrace.h>
47 #include <sys/shm.h>
48 #include <sys/lgrp.h>
49 #include <sys/vmsystm.h>
50 
51 #include <sys/tnf_probe.h>
52 
53 #define	SEGSPTADDR	(caddr_t)0x0
54 
55 /*
56  * # pages used for spt
57  */
58 static size_t	spt_used;
59 
60 /*
61  * segspt_minfree is the memory left for system after ISM
62  * locked its pages; it is set up to 5% of availrmem in
63  * sptcreate when ISM is created.  ISM should not use more
64  * than ~90% of availrmem; if it does, then the performance
65  * of the system may decrease. Machines with large memories may
66  * be able to use up more memory for ISM so we set the default
67  * segspt_minfree to 5% (which gives ISM max 95% of availrmem).
68  * If somebody wants even more memory for ISM (risking hanging
69  * the system) they can patch the segspt_minfree to smaller number.
70  */
71 pgcnt_t segspt_minfree = 0;
72 
73 static int segspt_create(struct seg *seg, caddr_t argsp);
74 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
75 static void segspt_free(struct seg *seg);
76 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
77 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
78 
/*
 * Catch-all handler for segment operations that the spt segment does
 * not implement; it panics unconditionally.
 */
static void
segspt_badop()
{
	panic("segspt_badop called");
	/*NOTREACHED*/
}

/*
 * Cast segspt_badop to a function pointer returning type t so it can
 * fill any slot of a seg_ops table.  The expansion is fully
 * parenthesized so it is a single primary expression wherever it is
 * used.
 */
#define	SEGSPT_BADOP(t)	((t(*)())segspt_badop)
87 
88 struct seg_ops segspt_ops = {
89 	SEGSPT_BADOP(int),		/* dup */
90 	segspt_unmap,
91 	segspt_free,
92 	SEGSPT_BADOP(int),		/* fault */
93 	SEGSPT_BADOP(faultcode_t),	/* faulta */
94 	SEGSPT_BADOP(int),		/* setprot */
95 	SEGSPT_BADOP(int),		/* checkprot */
96 	SEGSPT_BADOP(int),		/* kluster */
97 	SEGSPT_BADOP(size_t),		/* swapout */
98 	SEGSPT_BADOP(int),		/* sync */
99 	SEGSPT_BADOP(size_t),		/* incore */
100 	SEGSPT_BADOP(int),		/* lockop */
101 	SEGSPT_BADOP(int),		/* getprot */
102 	SEGSPT_BADOP(u_offset_t), 	/* getoffset */
103 	SEGSPT_BADOP(int),		/* gettype */
104 	SEGSPT_BADOP(int),		/* getvp */
105 	SEGSPT_BADOP(int),		/* advise */
106 	SEGSPT_BADOP(void),		/* dump */
107 	SEGSPT_BADOP(int),		/* pagelock */
108 	SEGSPT_BADOP(int),		/* setpgsz */
109 	SEGSPT_BADOP(int),		/* getmemid */
110 	segspt_getpolicy,		/* getpolicy */
111 	SEGSPT_BADOP(int),		/* capable */
112 };
113 
114 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
115 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
116 static void segspt_shmfree(struct seg *seg);
117 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
118 		caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
119 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
120 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
121 			register size_t len, register uint_t prot);
122 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
123 			uint_t prot);
124 static int	segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
125 static size_t	segspt_shmswapout(struct seg *seg);
126 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
127 			register char *vec);
128 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
129 			int attr, uint_t flags);
130 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
131 			int attr, int op, ulong_t *lockmap, size_t pos);
132 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
133 			uint_t *protv);
134 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
135 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
136 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
137 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
138 			uint_t behav);
139 static void segspt_shmdump(struct seg *seg);
140 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
141 			struct page ***, enum lock_type, enum seg_rw);
142 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
143 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
144 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
145 static int segspt_shmcapable(struct seg *, segcapability_t);
146 
147 struct seg_ops segspt_shmops = {
148 	segspt_shmdup,
149 	segspt_shmunmap,
150 	segspt_shmfree,
151 	segspt_shmfault,
152 	segspt_shmfaulta,
153 	segspt_shmsetprot,
154 	segspt_shmcheckprot,
155 	segspt_shmkluster,
156 	segspt_shmswapout,
157 	segspt_shmsync,
158 	segspt_shmincore,
159 	segspt_shmlockop,
160 	segspt_shmgetprot,
161 	segspt_shmgetoffset,
162 	segspt_shmgettype,
163 	segspt_shmgetvp,
164 	segspt_shmadvise,	/* advise */
165 	segspt_shmdump,
166 	segspt_shmpagelock,
167 	segspt_shmsetpgsz,
168 	segspt_shmgetmemid,
169 	segspt_shmgetpolicy,
170 	segspt_shmcapable,
171 };
172 
173 static void segspt_purge(struct seg *seg);
174 static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **,
175 		enum seg_rw);
176 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
177 		page_t **ppa);
178 
179 
180 
181 /*ARGSUSED*/
182 int
183 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
184     uint_t prot, uint_t flags, uint_t share_szc)
185 {
186 	int 	err;
187 	struct  as	*newas;
188 	struct	segspt_crargs sptcargs;
189 
190 #ifdef DEBUG
191 	TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
192                 	tnf_ulong, size, size );
193 #endif
194 	if (segspt_minfree == 0)	/* leave min 5% of availrmem for */
195 		segspt_minfree = availrmem/20;	/* for the system */
196 
197 	if (!hat_supported(HAT_SHARED_PT, (void *)0))
198 		return (EINVAL);
199 
200 	/*
201 	 * get a new as for this shared memory segment
202 	 */
203 	newas = as_alloc();
204 	sptcargs.amp = amp;
205 	sptcargs.prot = prot;
206 	sptcargs.flags = flags;
207 	sptcargs.szc = share_szc;
208 
209 	/*
210 	 * create a shared page table (spt) segment
211 	 */
212 
213 	if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
214 		as_free(newas);
215 		return (err);
216 	}
217 	*sptseg = sptcargs.seg_spt;
218 	return (0);
219 }
220 
221 void
222 sptdestroy(struct as *as, struct anon_map *amp)
223 {
224 
225 #ifdef DEBUG
226 	TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
227 #endif
228 	(void) as_unmap(as, SEGSPTADDR, amp->size);
229 	as_free(as);
230 }
231 
232 /*
233  * called from seg_free().
234  * free (i.e., unlock, unmap, return to free list)
235  *  all the pages in the given seg.
236  */
237 void
238 segspt_free(struct seg	*seg)
239 {
240 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
241 
242 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
243 
244 	if (sptd != NULL) {
245 		if (sptd->spt_realsize)
246 			segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
247 
248 		if (sptd->spt_ppa_lckcnt)
249 			kmem_free(sptd->spt_ppa_lckcnt,
250 				sizeof (*sptd->spt_ppa_lckcnt)
251 				* btopr(sptd->spt_amp->size));
252 		kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
253 		mutex_destroy(&sptd->spt_lock);
254 		kmem_free(sptd, sizeof (*sptd));
255 	}
256 }
257 
258 /*ARGSUSED*/
259 static int
260 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
261 	uint_t flags)
262 {
263 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
264 
265 	return (0);
266 }
267 
268 /*ARGSUSED*/
269 static size_t
270 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
271 {
272 	caddr_t	eo_seg;
273 	pgcnt_t	npages;
274 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
275 	struct seg	*sptseg;
276 	struct spt_data *sptd;
277 
278 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
279 #ifdef lint
280 	seg = seg;
281 #endif
282 	sptseg = shmd->shm_sptseg;
283 	sptd = sptseg->s_data;
284 
285 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
286 		eo_seg = addr + len;
287 		while (addr < eo_seg) {
288 			/* page exists, and it's locked. */
289 			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
290 				SEG_PAGE_ANON;
291 			addr += PAGESIZE;
292 		}
293 		return (len);
294 	} else {
295 		struct  anon_map *amp = shmd->shm_amp;
296 		struct  anon	*ap;
297 		page_t		*pp;
298 		pgcnt_t 	anon_index;
299 		struct vnode 	*vp;
300 		u_offset_t 	off;
301 		ulong_t		i;
302 		int		ret;
303 		anon_sync_obj_t	cookie;
304 
305 		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
306 		anon_index = seg_page(seg, addr);
307 		npages = btopr(len);
308 		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
309 			return (EINVAL);
310 		}
311 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
312 		for (i = 0; i < npages; i++, anon_index++) {
313 			ret = 0;
314 			anon_array_enter(amp, anon_index, &cookie);
315 			ap = anon_get_ptr(amp->ahp, anon_index);
316 			if (ap != NULL) {
317 				swap_xlate(ap, &vp, &off);
318 				anon_array_exit(&cookie);
319 				pp = page_lookup_nowait(vp, off, SE_SHARED);
320 				if (pp != NULL) {
321 					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
322 					page_unlock(pp);
323 				}
324 			} else {
325 				anon_array_exit(&cookie);
326 			}
327 			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
328 				ret |= SEG_PAGE_LOCKED;
329 			}
330 			*vec++ = (char)ret;
331 		}
332 		ANON_LOCK_EXIT(&amp->a_rwlock);
333 		return (len);
334 	}
335 }
336 
337 static int
338 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
339 {
340 	size_t share_size;
341 
342 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
343 
344 	/*
345 	 * seg.s_size may have been rounded up to the largest page size
346 	 * in shmat().
347 	 * XXX This should be cleanedup. sptdestroy should take a length
348 	 * argument which should be the same as sptcreate. Then
349 	 * this rounding would not be needed (or is done in shm.c)
350 	 * Only the check for full segment will be needed.
351 	 *
352 	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
353 	 * to be useful at all.
354 	 */
355 	share_size = page_get_pagesize(seg->s_szc);
356 	ssize = P2ROUNDUP(ssize, share_size);
357 
358 	if (raddr == seg->s_base && ssize == seg->s_size) {
359 		seg_free(seg);
360 		return (0);
361 	} else
362 		return (EINVAL);
363 }
364 
365 int
366 segspt_create(struct seg *seg, caddr_t argsp)
367 {
368 	int		err;
369 	caddr_t		addr = seg->s_base;
370 	struct spt_data *sptd;
371 	struct 	segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
372 	struct anon_map *amp = sptcargs->amp;
373 	struct	cred	*cred = CRED();
374 	ulong_t		i, j, anon_index = 0;
375 	pgcnt_t		npages = btopr(amp->size);
376 	struct vnode	*vp;
377 	page_t		**ppa;
378 	uint_t		hat_flags;
379 	size_t		pgsz;
380 	pgcnt_t		pgcnt;
381 	caddr_t		a;
382 	pgcnt_t		pidx;
383 	size_t		sz;
384 
385 	/*
386 	 * We are holding the a_lock on the underlying dummy as,
387 	 * so we can make calls to the HAT layer.
388 	 */
389 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
390 
391 #ifdef DEBUG
392 	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
393                                 tnf_opaque, addr, addr,
394 				tnf_ulong, len, seg->s_size);
395 #endif
396 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
397 		if (err = anon_swap_adjust(npages))
398 			return (err);
399 	}
400 	err = ENOMEM;
401 
402 	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
403 		goto out1;
404 
405 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
406 		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
407 		    KM_NOSLEEP)) == NULL)
408 			goto out2;
409 	}
410 
411 	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
412 
413 	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
414 		goto out3;
415 
416 	seg->s_ops = &segspt_ops;
417 	sptd->spt_vp = vp;
418 	sptd->spt_amp = amp;
419 	sptd->spt_prot = sptcargs->prot;
420 	sptd->spt_flags = sptcargs->flags;
421 	seg->s_data = (caddr_t)sptd;
422 	sptd->spt_ppa = NULL;
423 	sptd->spt_ppa_lckcnt = NULL;
424 	seg->s_szc = sptcargs->szc;
425 
426 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
427 	if (seg->s_szc > amp->a_szc) {
428 		amp->a_szc = seg->s_szc;
429 	}
430 	ANON_LOCK_EXIT(&amp->a_rwlock);
431 
432 	/*
433 	 * Set policy to affect initial allocation of pages in
434 	 * anon_map_createpages()
435 	 */
436 	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
437 	    NULL, 0, ptob(npages));
438 
439 	if (sptcargs->flags & SHM_PAGEABLE) {
440 		size_t  share_sz;
441 		pgcnt_t new_npgs, more_pgs;
442 		struct anon_hdr *nahp;
443 
444 		share_sz = page_get_pagesize(seg->s_szc);
445 		if (!IS_P2ALIGNED(amp->size, share_sz)) {
446 			/*
447 			 * We are rounding up the size of the anon array
448 			 * on 4 M boundary because we always create 4 M
449 			 * of page(s) when locking, faulting pages and we
450 			 * don't have to check for all corner cases e.g.
451 			 * if there is enough space to allocate 4 M
452 			 * page.
453 			 */
454 			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
455 			more_pgs = new_npgs - npages;
456 
457 			if (anon_resv(ptob(more_pgs)) == 0) {
458 				err = ENOMEM;
459 				goto out4;
460 			}
461 			nahp = anon_create(new_npgs, ANON_SLEEP);
462 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
463 			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
464 			    ANON_SLEEP);
465 			anon_release(amp->ahp, npages);
466 			amp->ahp = nahp;
467 			amp->swresv = amp->size = ptob(new_npgs);
468 			ANON_LOCK_EXIT(&amp->a_rwlock);
469 			npages = new_npgs;
470 		}
471 
472 		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
473 		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
474 		sptd->spt_pcachecnt = 0;
475 		sptd->spt_realsize = ptob(npages);
476 		sptcargs->seg_spt = seg;
477 		return (0);
478 	}
479 
480 	/*
481 	 * get array of pages for each anon slot in amp
482 	 */
483 	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
484 	    seg, addr, S_CREATE, cred)) != 0)
485 		goto out4;
486 
487 	/*
488 	 * addr is initial address corresponding to the first page on ppa list
489 	 */
490 	for (i = 0; i < npages; i++) {
491 		/* attempt to lock all pages */
492 		if (!page_pp_lock(ppa[i], 0, 1)) {
493 			/*
494 			 * if unable to lock any page, unlock all
495 			 * of them and return error
496 			 */
497 			for (j = 0; j < i; j++)
498 				page_pp_unlock(ppa[j], 0, 1);
499 			for (i = 0; i < npages; i++) {
500 				page_unlock(ppa[i]);
501 			}
502 			err = ENOMEM;
503 			goto out4;
504 		}
505 	}
506 
507 	/*
508 	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
509 	 * for the entire life of the segment. For example platforms
510 	 * that do not support Dynamic Reconfiguration.
511 	 */
512 	hat_flags = HAT_LOAD_SHARE;
513 	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
514 		hat_flags |= HAT_LOAD_LOCK;
515 
516 	/*
517 	 * Load translations one lare page at a time
518 	 * to make sure we don't create mappings bigger than
519 	 * segment's size code in case underlying pages
520 	 * are shared with segvn's segment that uses bigger
521 	 * size code than we do.
522 	 */
523 	pgsz = page_get_pagesize(seg->s_szc);
524 	pgcnt = page_get_pagecnt(seg->s_szc);
525 	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
526 		sz = MIN(pgsz, ptob(npages - pidx));
527 		hat_memload_array(seg->s_as->a_hat, a, sz,
528 		    &ppa[pidx], sptd->spt_prot, hat_flags);
529 	}
530 
531 	/*
532 	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
533 	 * we will leave the pages locked SE_SHARED for the life
534 	 * of the ISM segment. This will prevent any calls to
535 	 * hat_pageunload() on this ISM segment for those platforms.
536 	 */
537 	if (!(hat_flags & HAT_LOAD_LOCK)) {
538 		/*
539 		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
540 		 * we no longer need to hold the SE_SHARED lock on the pages,
541 		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
542 		 * SE_SHARED lock on the pages as necessary.
543 		 */
544 		for (i = 0; i < npages; i++)
545 			page_unlock(ppa[i]);
546 	}
547 	sptd->spt_pcachecnt = 0;
548 	kmem_free(ppa, ((sizeof (page_t *)) * npages));
549 	sptd->spt_realsize = ptob(npages);
550 	atomic_add_long(&spt_used, npages);
551 	sptcargs->seg_spt = seg;
552 	return (0);
553 
554 out4:
555 	seg->s_data = NULL;
556 	kmem_free(vp, sizeof (*vp));
557 out3:
558 	mutex_destroy(&sptd->spt_lock);
559 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
560 		kmem_free(ppa, (sizeof (*ppa) * npages));
561 out2:
562 	kmem_free(sptd, sizeof (*sptd));
563 out1:
564 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
565 		anon_swap_restore(npages);
566 	return (err);
567 }
568 
569 /*ARGSUSED*/
570 void
571 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
572 {
573 	struct page 	*pp;
574 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
575 	pgcnt_t		npages;
576 	ulong_t		anon_idx;
577 	struct anon_map *amp;
578 	struct anon 	*ap;
579 	struct vnode 	*vp;
580 	u_offset_t 	off;
581 	uint_t		hat_flags;
582 	int		root = 0;
583 	pgcnt_t		pgs, curnpgs = 0;
584 	page_t		*rootpp;
585 
586 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
587 
588 	len = P2ROUNDUP(len, PAGESIZE);
589 
590 	npages = btop(len);
591 
592 	hat_flags = HAT_UNLOAD_UNLOCK;
593 	if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
594 	    (sptd->spt_flags & SHM_PAGEABLE)) {
595 		hat_flags = HAT_UNLOAD;
596 	}
597 
598 	hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
599 
600 	amp = sptd->spt_amp;
601 	if (sptd->spt_flags & SHM_PAGEABLE)
602 		npages = btop(amp->size);
603 
604 	ASSERT(amp);
605 	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
606 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
607 			if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
608 				panic("segspt_free_pages: null app");
609 				/*NOTREACHED*/
610 			}
611 		} else {
612 			if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
613 			    == NULL)
614 				continue;
615 		}
616 		ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
617 		swap_xlate(ap, &vp, &off);
618 
619 		/*
620 		 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
621 		 * the pages won't be having SE_SHARED lock at this
622 		 * point.
623 		 *
624 		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
625 		 * the pages are still held SE_SHARED locked from the
626 		 * original segspt_create()
627 		 *
628 		 * Our goal is to get SE_EXCL lock on each page, remove
629 		 * permanent lock on it and invalidate the page.
630 		 */
631 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
632 			if (hat_flags == HAT_UNLOAD)
633 				pp = page_lookup(vp, off, SE_EXCL);
634 			else {
635 				if ((pp = page_find(vp, off)) == NULL) {
636 					panic("segspt_free_pages: "
637 					    "page not locked");
638 					/*NOTREACHED*/
639 				}
640 				if (!page_tryupgrade(pp)) {
641 					page_unlock(pp);
642 					pp = page_lookup(vp, off, SE_EXCL);
643 				}
644 			}
645 			if (pp == NULL) {
646 				panic("segspt_free_pages: "
647 				    "page not in the system");
648 				/*NOTREACHED*/
649 			}
650 			page_pp_unlock(pp, 0, 1);
651 		} else {
652 			if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
653 				continue;
654 			page_pp_unlock(pp, 0, 0);
655 		}
656 		/*
657 		 * It's logical to invalidate the pages here as in most cases
658 		 * these were created by segspt.
659 		 */
660 		if (pp->p_szc != 0) {
661 			/*
662 			 * For DISM swap is released in shm_rm_amp.
663 			 */
664 			if ((sptd->spt_flags & SHM_PAGEABLE) == 0 &&
665 			    ap->an_pvp != NULL) {
666 				panic("segspt_free_pages: pvp non NULL");
667 				/*NOTREACHED*/
668 			}
669 			if (root == 0) {
670 				ASSERT(curnpgs == 0);
671 				root = 1;
672 				rootpp = pp;
673 				pgs = curnpgs = page_get_pagecnt(pp->p_szc);
674 				ASSERT(pgs > 1);
675 				ASSERT(IS_P2ALIGNED(pgs, pgs));
676 				ASSERT(!(page_pptonum(pp) & (pgs - 1)));
677 				curnpgs--;
678 			} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
679 				ASSERT(curnpgs == 1);
680 				ASSERT(page_pptonum(pp) ==
681 				    page_pptonum(rootpp) + (pgs - 1));
682 				page_destroy_pages(rootpp);
683 				root = 0;
684 				curnpgs = 0;
685 			} else {
686 				ASSERT(curnpgs > 1);
687 				ASSERT(page_pptonum(pp) ==
688 				    page_pptonum(rootpp) + (pgs - curnpgs));
689 				curnpgs--;
690 			}
691 		} else {
692 			if (root != 0 || curnpgs != 0) {
693 				panic("segspt_free_pages: bad large page");
694 				/*NOTREACHED*/
695 			}
696 			/*LINTED: constant in conditional context */
697 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
698 		}
699 	}
700 
701 	if (root != 0 || curnpgs != 0) {
702 		panic("segspt_free_pages: bad large page");
703 		/*NOTREACHED*/
704 	}
705 
706 	/*
707 	 * mark that pages have been released
708 	 */
709 	sptd->spt_realsize = 0;
710 
711 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
712 		atomic_add_long(&spt_used, -npages);
713 		anon_swap_restore(npages);
714 	}
715 }
716 
717 /*
718  * Get memory allocation policy info for specified address in given segment
719  */
720 static lgrp_mem_policy_info_t *
721 segspt_getpolicy(struct seg *seg, caddr_t addr)
722 {
723 	struct anon_map		*amp;
724 	ulong_t			anon_index;
725 	lgrp_mem_policy_info_t	*policy_info;
726 	struct spt_data		*spt_data;
727 
728 	ASSERT(seg != NULL);
729 
730 	/*
731 	 * Get anon_map from segspt
732 	 *
733 	 * Assume that no lock needs to be held on anon_map, since
734 	 * it should be protected by its reference count which must be
735 	 * nonzero for an existing segment
736 	 * Need to grab readers lock on policy tree though
737 	 */
738 	spt_data = (struct spt_data *)seg->s_data;
739 	if (spt_data == NULL)
740 		return (NULL);
741 	amp = spt_data->spt_amp;
742 	ASSERT(amp->refcnt != 0);
743 
744 	/*
745 	 * Get policy info
746 	 *
747 	 * Assume starting anon index of 0
748 	 */
749 	anon_index = seg_page(seg, addr);
750 	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
751 
752 	return (policy_info);
753 }
754 
755 /*
756  * DISM only.
757  * Return locked pages over a given range.
758  *
759  * We will cache all DISM locked pages and save the pplist for the
760  * entire segment in the ppa field of the underlying DISM segment structure.
761  * Later, during a call to segspt_reclaim() we will use this ppa array
762  * to page_unlock() all of the pages and then we will free this ppa list.
763  */
764 /*ARGSUSED*/
765 static int
766 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
767     struct page ***ppp, enum lock_type type, enum seg_rw rw)
768 {
769 	struct  shm_data *shmd = (struct shm_data *)seg->s_data;
770 	struct  seg	*sptseg = shmd->shm_sptseg;
771 	struct  spt_data *sptd = sptseg->s_data;
772 	pgcnt_t pg_idx, npages, tot_npages, npgs;
773 	struct  page **pplist, **pl, **ppa, *pp;
774 	struct  anon_map *amp;
775 	spgcnt_t	an_idx;
776 	int 	ret = ENOTSUP;
777 	uint_t	pl_built = 0;
778 	struct  anon *ap;
779 	struct  vnode *vp;
780 	u_offset_t off;
781 	pgcnt_t claim_availrmem = 0;
782 	uint_t	szc;
783 
784 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
785 
786 	/*
787 	 * We want to lock/unlock the entire ISM segment. Therefore,
788 	 * we will be using the underlying sptseg and it's base address
789 	 * and length for the caching arguments.
790 	 */
791 	ASSERT(sptseg);
792 	ASSERT(sptd);
793 
794 	pg_idx = seg_page(seg, addr);
795 	npages = btopr(len);
796 
797 	/*
798 	 * check if the request is larger than number of pages covered
799 	 * by amp
800 	 */
801 	if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
802 		*ppp = NULL;
803 		return (ENOTSUP);
804 	}
805 
806 	if (type == L_PAGEUNLOCK) {
807 		ASSERT(sptd->spt_ppa != NULL);
808 
809 		seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
810 		    sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
811 
812 		/*
813 		 * If someone is blocked while unmapping, we purge
814 		 * segment page cache and thus reclaim pplist synchronously
815 		 * without waiting for seg_pasync_thread. This speeds up
816 		 * unmapping in cases where munmap(2) is called, while
817 		 * raw async i/o is still in progress or where a thread
818 		 * exits on data fault in a multithreaded application.
819 		 */
820 		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
821 			segspt_purge(seg);
822 		}
823 		return (0);
824 	} else if (type == L_PAGERECLAIM) {
825 		ASSERT(sptd->spt_ppa != NULL);
826 		(void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
827 		    sptd->spt_ppa, sptd->spt_prot);
828 		return (0);
829 	}
830 
831 	if (sptd->spt_flags & DISM_PPA_CHANGED) {
832 		segspt_purge(seg);
833 		/*
834 		 * for DISM ppa needs to be rebuild since
835 		 * number of locked pages could be changed
836 		 */
837 		*ppp = NULL;
838 		return (ENOTSUP);
839 	}
840 
841 	/*
842 	 * First try to find pages in segment page cache, without
843 	 * holding the segment lock.
844 	 */
845 	pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
846 	    sptd->spt_prot);
847 	if (pplist != NULL) {
848 		ASSERT(sptd->spt_ppa != NULL);
849 		ASSERT(sptd->spt_ppa == pplist);
850 		ppa = sptd->spt_ppa;
851 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
852 			if (ppa[an_idx] == NULL) {
853 				seg_pinactive(seg, seg->s_base,
854 				    sptd->spt_amp->size, ppa,
855 				    sptd->spt_prot, segspt_reclaim);
856 				*ppp = NULL;
857 				return (ENOTSUP);
858 			}
859 			if ((szc = ppa[an_idx]->p_szc) != 0) {
860 				npgs = page_get_pagecnt(szc);
861 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
862 			} else {
863 				an_idx++;
864 			}
865 		}
866 		/*
867 		 * Since we cache the entire DISM segment, we want to
868 		 * set ppp to point to the first slot that corresponds
869 		 * to the requested addr, i.e. pg_idx.
870 		 */
871 		*ppp = &(sptd->spt_ppa[pg_idx]);
872 		return (0);
873 	}
874 
875 	/* The L_PAGELOCK case... */
876 	mutex_enter(&sptd->spt_lock);
877 	/*
878 	 * try to find pages in segment page cache with mutex
879 	 */
880 	pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
881 	    sptd->spt_prot);
882 	if (pplist != NULL) {
883 		ASSERT(sptd->spt_ppa != NULL);
884 		ASSERT(sptd->spt_ppa == pplist);
885 		ppa = sptd->spt_ppa;
886 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
887 			if (ppa[an_idx] == NULL) {
888 				mutex_exit(&sptd->spt_lock);
889 				seg_pinactive(seg, seg->s_base,
890 				    sptd->spt_amp->size, ppa,
891 				    sptd->spt_prot, segspt_reclaim);
892 				*ppp = NULL;
893 				return (ENOTSUP);
894 			}
895 			if ((szc = ppa[an_idx]->p_szc) != 0) {
896 				npgs = page_get_pagecnt(szc);
897 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
898 			} else {
899 				an_idx++;
900 			}
901 		}
902 		/*
903 		 * Since we cache the entire DISM segment, we want to
904 		 * set ppp to point to the first slot that corresponds
905 		 * to the requested addr, i.e. pg_idx.
906 		 */
907 		mutex_exit(&sptd->spt_lock);
908 		*ppp = &(sptd->spt_ppa[pg_idx]);
909 		return (0);
910 	}
911 	if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
912 	    SEGP_FAIL) {
913 		mutex_exit(&sptd->spt_lock);
914 		*ppp = NULL;
915 		return (ENOTSUP);
916 	}
917 
918 	/*
919 	 * No need to worry about protections because DISM pages are always rw.
920 	 */
921 	pl = pplist = NULL;
922 	amp = sptd->spt_amp;
923 
924 	/*
925 	 * Do we need to build the ppa array?
926 	 */
927 	if (sptd->spt_ppa == NULL) {
928 		pgcnt_t lpg_cnt = 0;
929 
930 		pl_built = 1;
931 		tot_npages = btopr(sptd->spt_amp->size);
932 
933 		ASSERT(sptd->spt_pcachecnt == 0);
934 		pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
935 		pl = pplist;
936 
937 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
938 		for (an_idx = 0; an_idx < tot_npages; ) {
939 			ap = anon_get_ptr(amp->ahp, an_idx);
940 			/*
941 			 * Cache only mlocked pages. For large pages
942 			 * if one (constituent) page is mlocked
943 			 * all pages for that large page
944 			 * are cached also. This is for quick
945 			 * lookups of ppa array;
946 			 */
947 			if ((ap != NULL) && (lpg_cnt != 0 ||
948 			    (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
949 
950 				swap_xlate(ap, &vp, &off);
951 				pp = page_lookup(vp, off, SE_SHARED);
952 				ASSERT(pp != NULL);
953 				if (lpg_cnt == 0) {
954 					lpg_cnt++;
955 					/*
956 					 * For a small page, we are done --
957 					 * lpg_count is reset to 0 below.
958 					 *
959 					 * For a large page, we are guaranteed
960 					 * to find the anon structures of all
961 					 * constituent pages and a non-zero
962 					 * lpg_cnt ensures that we don't test
963 					 * for mlock for these. We are done
964 					 * when lpg_count reaches (npgs + 1).
965 					 * If we are not the first constituent
966 					 * page, restart at the first one.
967 					 */
968 					npgs = page_get_pagecnt(pp->p_szc);
969 					if (!IS_P2ALIGNED(an_idx, npgs)) {
970 						an_idx = P2ALIGN(an_idx, npgs);
971 						page_unlock(pp);
972 						continue;
973 					}
974 				}
975 				if (++lpg_cnt > npgs)
976 					lpg_cnt = 0;
977 
978 				/*
979 				 * availrmem is decremented only
980 				 * for unlocked pages
981 				 */
982 				if (sptd->spt_ppa_lckcnt[an_idx] == 0)
983 					claim_availrmem++;
984 				pplist[an_idx] = pp;
985 			}
986 			an_idx++;
987 		}
988 		ANON_LOCK_EXIT(&amp->a_rwlock);
989 
990 		mutex_enter(&freemem_lock);
991 		if (availrmem < tune.t_minarmem + claim_availrmem) {
992 			mutex_exit(&freemem_lock);
993 			ret = FC_MAKE_ERR(ENOMEM);
994 			claim_availrmem = 0;
995 			goto insert_fail;
996 		} else {
997 			availrmem -= claim_availrmem;
998 		}
999 		mutex_exit(&freemem_lock);
1000 
1001 		sptd->spt_ppa = pl;
1002 	} else {
1003 		/*
1004 		 * We already have a valid ppa[].
1005 		 */
1006 		pl = sptd->spt_ppa;
1007 	}
1008 
1009 	ASSERT(pl != NULL);
1010 
1011 	ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
1012 	    pl, sptd->spt_prot, SEGP_FORCE_WIRED | SEGP_ASYNC_FLUSH,
1013 	    segspt_reclaim);
1014 	if (ret == SEGP_FAIL) {
1015 		/*
1016 		 * seg_pinsert failed. We return
1017 		 * ENOTSUP, so that the as_pagelock() code will
1018 		 * then try the slower F_SOFTLOCK path.
1019 		 */
1020 		if (pl_built) {
1021 			/*
1022 			 * No one else has referenced the ppa[].
1023 			 * We created it and we need to destroy it.
1024 			 */
1025 			sptd->spt_ppa = NULL;
1026 		}
1027 		ret = ENOTSUP;
1028 		goto insert_fail;
1029 	}
1030 
1031 	/*
1032 	 * In either case, we increment softlockcnt on the 'real' segment.
1033 	 */
1034 	sptd->spt_pcachecnt++;
1035 	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1);
1036 
1037 	ppa = sptd->spt_ppa;
1038 	for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1039 		if (ppa[an_idx] == NULL) {
1040 			mutex_exit(&sptd->spt_lock);
1041 			seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
1042 			    pl, sptd->spt_prot, segspt_reclaim);
1043 			*ppp = NULL;
1044 			return (ENOTSUP);
1045 		}
1046 		if ((szc = ppa[an_idx]->p_szc) != 0) {
1047 			npgs = page_get_pagecnt(szc);
1048 			an_idx = P2ROUNDUP(an_idx + 1, npgs);
1049 		} else {
1050 			an_idx++;
1051 		}
1052 	}
1053 	/*
1054 	 * We can now drop the sptd->spt_lock since the ppa[]
1055 	 * exists and he have incremented pacachecnt.
1056 	 */
1057 	mutex_exit(&sptd->spt_lock);
1058 
1059 	/*
1060 	 * Since we cache the entire segment, we want to
1061 	 * set ppp to point to the first slot that corresponds
1062 	 * to the requested addr, i.e. pg_idx.
1063 	 */
1064 	*ppp = &(sptd->spt_ppa[pg_idx]);
1065 	return (ret);
1066 
1067 insert_fail:
1068 	/*
1069 	 * We will only reach this code if we tried and failed.
1070 	 *
1071 	 * And we can drop the lock on the dummy seg, once we've failed
1072 	 * to set up a new ppa[].
1073 	 */
1074 	mutex_exit(&sptd->spt_lock);
1075 
1076 	if (pl_built) {
1077 		mutex_enter(&freemem_lock);
1078 		availrmem += claim_availrmem;
1079 		mutex_exit(&freemem_lock);
1080 
1081 		/*
1082 		 * We created pl and we need to destroy it.
1083 		 */
1084 		pplist = pl;
1085 		for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1086 			if (pplist[an_idx] != NULL)
1087 				page_unlock(pplist[an_idx]);
1088 		}
1089 		kmem_free(pl, sizeof (page_t *) * tot_npages);
1090 	}
1091 
1092 	if (shmd->shm_softlockcnt <= 0) {
1093 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1094 			mutex_enter(&seg->s_as->a_contents);
1095 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1096 				AS_CLRUNMAPWAIT(seg->s_as);
1097 				cv_broadcast(&seg->s_as->a_cv);
1098 			}
1099 			mutex_exit(&seg->s_as->a_contents);
1100 		}
1101 	}
1102 	*ppp = NULL;
1103 	return (ret);
1104 }
1105 
1106 
1107 
1108 /*
1109  * return locked pages over a given range.
1110  *
1111  * We will cache the entire ISM segment and save the pplist for the
1112  * entire segment in the ppa field of the underlying ISM segment structure.
1113  * Later, during a call to segspt_reclaim() we will use this ppa array
1114  * to page_unlock() all of the pages and then we will free this ppa list.
1115  */
1116 /*ARGSUSED*/
1117 static int
1118 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1119     struct page ***ppp, enum lock_type type, enum seg_rw rw)
1120 {
1121 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1122 	struct seg	*sptseg = shmd->shm_sptseg;
1123 	struct spt_data *sptd = sptseg->s_data;
1124 	pgcnt_t np, page_index, npages;
1125 	caddr_t a, spt_base;
1126 	struct page **pplist, **pl, *pp;
1127 	struct anon_map *amp;
1128 	ulong_t anon_index;
1129 	int ret = ENOTSUP;
1130 	uint_t	pl_built = 0;
1131 	struct anon *ap;
1132 	struct vnode *vp;
1133 	u_offset_t off;
1134 
1135 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1136 
1137 	/*
1138 	 * We want to lock/unlock the entire ISM segment. Therefore,
1139 	 * we will be using the underlying sptseg and it's base address
1140 	 * and length for the caching arguments.
1141 	 */
1142 	ASSERT(sptseg);
1143 	ASSERT(sptd);
1144 
1145 	if (sptd->spt_flags & SHM_PAGEABLE) {
1146 		return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1147 	}
1148 
1149 	page_index = seg_page(seg, addr);
1150 	npages = btopr(len);
1151 
1152 	/*
1153 	 * check if the request is larger than number of pages covered
1154 	 * by amp
1155 	 */
1156 	if (page_index + npages > btopr(sptd->spt_amp->size)) {
1157 		*ppp = NULL;
1158 		return (ENOTSUP);
1159 	}
1160 
1161 	if (type == L_PAGEUNLOCK) {
1162 
1163 		ASSERT(sptd->spt_ppa != NULL);
1164 
1165 		seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
1166 		    sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
1167 
1168 		/*
1169 		 * If someone is blocked while unmapping, we purge
1170 		 * segment page cache and thus reclaim pplist synchronously
1171 		 * without waiting for seg_pasync_thread. This speeds up
1172 		 * unmapping in cases where munmap(2) is called, while
1173 		 * raw async i/o is still in progress or where a thread
1174 		 * exits on data fault in a multithreaded application.
1175 		 */
1176 		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1177 			segspt_purge(seg);
1178 		}
1179 		return (0);
1180 	} else if (type == L_PAGERECLAIM) {
1181 		ASSERT(sptd->spt_ppa != NULL);
1182 
1183 		(void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
1184 		    sptd->spt_ppa, sptd->spt_prot);
1185 		return (0);
1186 	}
1187 
1188 	/*
1189 	 * First try to find pages in segment page cache, without
1190 	 * holding the segment lock.
1191 	 */
1192 	pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
1193 	    sptd->spt_prot);
1194 	if (pplist != NULL) {
1195 		ASSERT(sptd->spt_ppa == pplist);
1196 		ASSERT(sptd->spt_ppa[page_index]);
1197 		/*
1198 		 * Since we cache the entire ISM segment, we want to
1199 		 * set ppp to point to the first slot that corresponds
1200 		 * to the requested addr, i.e. page_index.
1201 		 */
1202 		*ppp = &(sptd->spt_ppa[page_index]);
1203 		return (0);
1204 	}
1205 
1206 	/* The L_PAGELOCK case... */
1207 	mutex_enter(&sptd->spt_lock);
1208 
1209 	/*
1210 	 * try to find pages in segment page cache
1211 	 */
1212 	pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
1213 	    sptd->spt_prot);
1214 	if (pplist != NULL) {
1215 		ASSERT(sptd->spt_ppa == pplist);
1216 		/*
1217 		 * Since we cache the entire segment, we want to
1218 		 * set ppp to point to the first slot that corresponds
1219 		 * to the requested addr, i.e. page_index.
1220 		 */
1221 		mutex_exit(&sptd->spt_lock);
1222 		*ppp = &(sptd->spt_ppa[page_index]);
1223 		return (0);
1224 	}
1225 
1226 	if (seg_pinsert_check(seg, sptd->spt_amp->size, SEGP_FORCE_WIRED) ==
1227 	    SEGP_FAIL) {
1228 		mutex_exit(&sptd->spt_lock);
1229 		*ppp = NULL;
1230 		return (ENOTSUP);
1231 	}
1232 
1233 	/*
1234 	 * No need to worry about protections because ISM pages
1235 	 * are always rw.
1236 	 */
1237 	pl = pplist = NULL;
1238 
1239 	/*
1240 	 * Do we need to build the ppa array?
1241 	 */
1242 	if (sptd->spt_ppa == NULL) {
1243 		ASSERT(sptd->spt_ppa == pplist);
1244 
1245 		spt_base = sptseg->s_base;
1246 		pl_built = 1;
1247 
1248 		/*
1249 		 * availrmem is decremented once during anon_swap_adjust()
1250 		 * and is incremented during the anon_unresv(), which is
1251 		 * called from shm_rm_amp() when the segment is destroyed.
1252 		 */
1253 		amp = sptd->spt_amp;
1254 		ASSERT(amp != NULL);
1255 
1256 		/* pcachecnt is protected by sptd->spt_lock */
1257 		ASSERT(sptd->spt_pcachecnt == 0);
1258 		pplist = kmem_zalloc(sizeof (page_t *)
1259 		    * btopr(sptd->spt_amp->size), KM_SLEEP);
1260 		pl = pplist;
1261 
1262 		anon_index = seg_page(sptseg, spt_base);
1263 
1264 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1265 		for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1266 		    a += PAGESIZE, anon_index++, pplist++) {
1267 			ap = anon_get_ptr(amp->ahp, anon_index);
1268 			ASSERT(ap != NULL);
1269 			swap_xlate(ap, &vp, &off);
1270 			pp = page_lookup(vp, off, SE_SHARED);
1271 			ASSERT(pp != NULL);
1272 			*pplist = pp;
1273 		}
1274 		ANON_LOCK_EXIT(&amp->a_rwlock);
1275 
1276 		if (a < (spt_base + sptd->spt_amp->size)) {
1277 			ret = ENOTSUP;
1278 			goto insert_fail;
1279 		}
1280 		sptd->spt_ppa = pl;
1281 	} else {
1282 		/*
1283 		 * We already have a valid ppa[].
1284 		 */
1285 		pl = sptd->spt_ppa;
1286 	}
1287 
1288 	ASSERT(pl != NULL);
1289 
1290 	ret = seg_pinsert(seg, seg->s_base, sptd->spt_amp->size,
1291 	    pl, sptd->spt_prot, SEGP_FORCE_WIRED, segspt_reclaim);
1292 	if (ret == SEGP_FAIL) {
1293 		/*
1294 		 * seg_pinsert failed. We return
1295 		 * ENOTSUP, so that the as_pagelock() code will
1296 		 * then try the slower F_SOFTLOCK path.
1297 		 */
1298 		if (pl_built) {
1299 			/*
1300 			 * No one else has referenced the ppa[].
1301 			 * We created it and we need to destroy it.
1302 			 */
1303 			sptd->spt_ppa = NULL;
1304 		}
1305 		ret = ENOTSUP;
1306 		goto insert_fail;
1307 	}
1308 
1309 	/*
1310 	 * In either case, we increment softlockcnt on the 'real' segment.
1311 	 */
1312 	sptd->spt_pcachecnt++;
1313 	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), 1);
1314 
1315 	/*
1316 	 * We can now drop the sptd->spt_lock since the ppa[]
1317 	 * exists and he have incremented pacachecnt.
1318 	 */
1319 	mutex_exit(&sptd->spt_lock);
1320 
1321 	/*
1322 	 * Since we cache the entire segment, we want to
1323 	 * set ppp to point to the first slot that corresponds
1324 	 * to the requested addr, i.e. page_index.
1325 	 */
1326 	*ppp = &(sptd->spt_ppa[page_index]);
1327 	return (ret);
1328 
1329 insert_fail:
1330 	/*
1331 	 * We will only reach this code if we tried and failed.
1332 	 *
1333 	 * And we can drop the lock on the dummy seg, once we've failed
1334 	 * to set up a new ppa[].
1335 	 */
1336 	mutex_exit(&sptd->spt_lock);
1337 
1338 	if (pl_built) {
1339 		/*
1340 		 * We created pl and we need to destroy it.
1341 		 */
1342 		pplist = pl;
1343 		np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1344 		while (np) {
1345 			page_unlock(*pplist);
1346 			np--;
1347 			pplist++;
1348 		}
1349 		kmem_free(pl, sizeof (page_t *) *
1350 				btopr(sptd->spt_amp->size));
1351 	}
1352 	if (shmd->shm_softlockcnt <= 0) {
1353 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1354 			mutex_enter(&seg->s_as->a_contents);
1355 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1356 				AS_CLRUNMAPWAIT(seg->s_as);
1357 				cv_broadcast(&seg->s_as->a_cv);
1358 			}
1359 			mutex_exit(&seg->s_as->a_contents);
1360 		}
1361 	}
1362 	*ppp = NULL;
1363 	return (ret);
1364 }
1365 
/*
 * Purge this segment's entries from the I/O (segment) page cache by
 * handing the segment to seg_ppurge().
 */
static void
segspt_purge(struct seg *seg)
{
	seg_ppurge(seg);
}
1374 
1375 static int
1376 segspt_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
1377 	enum seg_rw rw)
1378 {
1379 	struct	shm_data *shmd = (struct shm_data *)seg->s_data;
1380 	struct	seg	*sptseg;
1381 	struct	spt_data *sptd;
1382 	pgcnt_t npages, i, free_availrmem = 0;
1383 	int	done = 0;
1384 
1385 #ifdef lint
1386 	addr = addr;
1387 #endif
1388 	sptseg = shmd->shm_sptseg;
1389 	sptd = sptseg->s_data;
1390 	npages = (len >> PAGESHIFT);
1391 	ASSERT(npages);
1392 	ASSERT(sptd->spt_pcachecnt != 0);
1393 	ASSERT(sptd->spt_ppa == pplist);
1394 	ASSERT(npages == btopr(sptd->spt_amp->size));
1395 
1396 	/*
1397 	 * Acquire the lock on the dummy seg and destroy the
1398 	 * ppa array IF this is the last pcachecnt.
1399 	 */
1400 	mutex_enter(&sptd->spt_lock);
1401 	if (--sptd->spt_pcachecnt == 0) {
1402 		for (i = 0; i < npages; i++) {
1403 			if (pplist[i] == NULL) {
1404 				continue;
1405 			}
1406 			if (rw == S_WRITE) {
1407 				hat_setrefmod(pplist[i]);
1408 			} else {
1409 				hat_setref(pplist[i]);
1410 			}
1411 			if ((sptd->spt_flags & SHM_PAGEABLE) &&
1412 				(sptd->spt_ppa_lckcnt[i] == 0))
1413 				free_availrmem++;
1414 			page_unlock(pplist[i]);
1415 		}
1416 		if (sptd->spt_flags & SHM_PAGEABLE) {
1417 			mutex_enter(&freemem_lock);
1418 			availrmem += free_availrmem;
1419 			mutex_exit(&freemem_lock);
1420 		}
1421 		/*
1422 		 * Since we want to cach/uncache the entire ISM segment,
1423 		 * we will track the pplist in a segspt specific field
1424 		 * ppa, that is initialized at the time we add an entry to
1425 		 * the cache.
1426 		 */
1427 		ASSERT(sptd->spt_pcachecnt == 0);
1428 		kmem_free(pplist, sizeof (page_t *) * npages);
1429 		sptd->spt_ppa = NULL;
1430 		sptd->spt_flags &= ~DISM_PPA_CHANGED;
1431 		done = 1;
1432 	}
1433 	mutex_exit(&sptd->spt_lock);
1434 	/*
1435 	 * Now decrement softlockcnt.
1436 	 */
1437 	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -1);
1438 
1439 	if (shmd->shm_softlockcnt <= 0) {
1440 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1441 			mutex_enter(&seg->s_as->a_contents);
1442 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1443 				AS_CLRUNMAPWAIT(seg->s_as);
1444 				cv_broadcast(&seg->s_as->a_cv);
1445 			}
1446 			mutex_exit(&seg->s_as->a_contents);
1447 		}
1448 	}
1449 	return (done);
1450 }
1451 
1452 /*
1453  * Do a F_SOFTUNLOCK call over the range requested.
1454  * The range must have already been F_SOFTLOCK'ed.
1455  *
1456  * The calls to acquire and release the anon map lock mutex were
1457  * removed in order to avoid a deadly embrace during a DR
1458  * memory delete operation.  (Eg. DR blocks while waiting for a
1459  * exclusive lock on a page that is being used for kaio; the
1460  * thread that will complete the kaio and call segspt_softunlock
1461  * blocks on the anon map lock; another thread holding the anon
1462  * map lock blocks on another page lock via the segspt_shmfault
1463  * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1464  *
1465  * The appropriateness of the removal is based upon the following:
1466  * 1. If we are holding a segment's reader lock and the page is held
1467  * shared, then the corresponding element in anonmap which points to
1468  * anon struct cannot change and there is no need to acquire the
1469  * anonymous map lock.
1470  * 2. Threads in segspt_softunlock have a reader lock on the segment
1471  * and already have the shared page lock, so we are guaranteed that
1472  * the anon map slot cannot change and therefore can call anon_get_ptr()
1473  * without grabbing the anonymous map lock.
1474  * 3. Threads that softlock a shared page break copy-on-write, even if
1475  * its a read.  Thus cow faults can be ignored with respect to soft
1476  * unlocking, since the breaking of cow means that the anon slot(s) will
1477  * not be shared.
1478  */
1479 static void
1480 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1481 	size_t len, enum seg_rw rw)
1482 {
1483 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1484 	struct seg	*sptseg;
1485 	struct spt_data *sptd;
1486 	page_t *pp;
1487 	caddr_t adr;
1488 	struct vnode *vp;
1489 	u_offset_t offset;
1490 	ulong_t anon_index;
1491 	struct anon_map *amp;		/* XXX - for locknest */
1492 	struct anon *ap = NULL;
1493 	pgcnt_t npages;
1494 
1495 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1496 
1497 	sptseg = shmd->shm_sptseg;
1498 	sptd = sptseg->s_data;
1499 
1500 	/*
1501 	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1502 	 * and therefore their pages are SE_SHARED locked
1503 	 * for the entire life of the segment.
1504 	 */
1505 	if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1506 		((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1507 		goto softlock_decrement;
1508 	}
1509 
1510 	/*
1511 	 * Any thread is free to do a page_find and
1512 	 * page_unlock() on the pages within this seg.
1513 	 *
1514 	 * We are already holding the as->a_lock on the user's
1515 	 * real segment, but we need to hold the a_lock on the
1516 	 * underlying dummy as. This is mostly to satisfy the
1517 	 * underlying HAT layer.
1518 	 */
1519 	AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1520 	hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1521 	AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1522 
1523 	amp = sptd->spt_amp;
1524 	ASSERT(amp != NULL);
1525 	anon_index = seg_page(sptseg, sptseg_addr);
1526 
1527 	for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1528 		ap = anon_get_ptr(amp->ahp, anon_index++);
1529 		ASSERT(ap != NULL);
1530 		swap_xlate(ap, &vp, &offset);
1531 
1532 		/*
1533 		 * Use page_find() instead of page_lookup() to
1534 		 * find the page since we know that it has a
1535 		 * "shared" lock.
1536 		 */
1537 		pp = page_find(vp, offset);
1538 		ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1539 		if (pp == NULL) {
1540 			panic("segspt_softunlock: "
1541 			    "addr %p, ap %p, vp %p, off %llx",
1542 			    (void *)adr, (void *)ap, (void *)vp, offset);
1543 			/*NOTREACHED*/
1544 		}
1545 
1546 		if (rw == S_WRITE) {
1547 			hat_setrefmod(pp);
1548 		} else if (rw != S_OTHER) {
1549 			hat_setref(pp);
1550 		}
1551 		page_unlock(pp);
1552 	}
1553 
1554 softlock_decrement:
1555 	npages = btopr(len);
1556 	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1557 	if (shmd->shm_softlockcnt == 0) {
1558 		/*
1559 		 * All SOFTLOCKS are gone. Wakeup any waiting
1560 		 * unmappers so they can try again to unmap.
1561 		 * Check for waiters first without the mutex
1562 		 * held so we don't always grab the mutex on
1563 		 * softunlocks.
1564 		 */
1565 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1566 			mutex_enter(&seg->s_as->a_contents);
1567 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1568 				AS_CLRUNMAPWAIT(seg->s_as);
1569 				cv_broadcast(&seg->s_as->a_cv);
1570 			}
1571 			mutex_exit(&seg->s_as->a_contents);
1572 		}
1573 	}
1574 }
1575 
1576 int
1577 segspt_shmattach(struct seg *seg, caddr_t *argsp)
1578 {
1579 	struct shm_data *shmd_arg = (struct shm_data *)argsp;
1580 	struct shm_data *shmd;
1581 	struct anon_map *shm_amp = shmd_arg->shm_amp;
1582 	struct spt_data *sptd;
1583 	int error = 0;
1584 
1585 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1586 
1587 	shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1588 	if (shmd == NULL)
1589 		return (ENOMEM);
1590 
1591 	shmd->shm_sptas = shmd_arg->shm_sptas;
1592 	shmd->shm_amp = shm_amp;
1593 	shmd->shm_sptseg = shmd_arg->shm_sptseg;
1594 
1595 	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1596 	    NULL, 0, seg->s_size);
1597 
1598 	seg->s_data = (void *)shmd;
1599 	seg->s_ops = &segspt_shmops;
1600 	seg->s_szc = shmd->shm_sptseg->s_szc;
1601 	sptd = shmd->shm_sptseg->s_data;
1602 
1603 	if (sptd->spt_flags & SHM_PAGEABLE) {
1604 		if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1605 		    KM_NOSLEEP)) == NULL) {
1606 			seg->s_data = (void *)NULL;
1607 			kmem_free(shmd, (sizeof (*shmd)));
1608 			return (ENOMEM);
1609 		}
1610 		shmd->shm_lckpgs = 0;
1611 		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1612 			if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1613 			    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1614 			    seg->s_size, seg->s_szc)) != 0) {
1615 				kmem_free(shmd->shm_vpage,
1616 					btopr(shm_amp->size));
1617 			}
1618 		}
1619 	} else {
1620 		error = hat_share(seg->s_as->a_hat, seg->s_base,
1621 				shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1622 				seg->s_size, seg->s_szc);
1623 	}
1624 	if (error) {
1625 		seg->s_szc = 0;
1626 		seg->s_data = (void *)NULL;
1627 		kmem_free(shmd, (sizeof (*shmd)));
1628 	} else {
1629 		ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1630 		shm_amp->refcnt++;
1631 		ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1632 	}
1633 	return (error);
1634 }
1635 
1636 int
1637 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1638 {
1639 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1640 	int reclaim = 1;
1641 
1642 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1643 retry:
1644 	if (shmd->shm_softlockcnt > 0) {
1645 		if (reclaim == 1) {
1646 			segspt_purge(seg);
1647 			reclaim = 0;
1648 			goto retry;
1649 		}
1650 		return (EAGAIN);
1651 	}
1652 
1653 	if (ssize != seg->s_size) {
1654 #ifdef DEBUG
1655 		cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1656 		    ssize, seg->s_size);
1657 #endif
1658 		return (EINVAL);
1659 	}
1660 
1661 	(void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1662 	    NULL, 0);
1663 	hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1664 
1665 	seg_free(seg);
1666 
1667 	return (0);
1668 }
1669 
1670 void
1671 segspt_shmfree(struct seg *seg)
1672 {
1673 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1674 	struct anon_map *shm_amp = shmd->shm_amp;
1675 
1676 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1677 
1678 	(void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1679 		MC_UNLOCK, NULL, 0);
1680 
1681 	/*
1682 	 * Need to increment refcnt when attaching
1683 	 * and decrement when detaching because of dup().
1684 	 */
1685 	ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1686 	shm_amp->refcnt--;
1687 	ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1688 
1689 	if (shmd->shm_vpage) {	/* only for DISM */
1690 		kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1691 		shmd->shm_vpage = NULL;
1692 	}
1693 	kmem_free(shmd, sizeof (*shmd));
1694 }
1695 
1696 /*ARGSUSED*/
1697 int
1698 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1699 {
1700 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1701 
1702 	/*
1703 	 * Shared page table is more than shared mapping.
1704 	 *  Individual process sharing page tables can't change prot
1705 	 *  because there is only one set of page tables.
1706 	 *  This will be allowed after private page table is
1707 	 *  supported.
1708 	 */
1709 /* need to return correct status error? */
1710 	return (0);
1711 }
1712 
1713 
1714 faultcode_t
1715 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1716     size_t len, enum fault_type type, enum seg_rw rw)
1717 {
1718 	struct  shm_data 	*shmd = (struct shm_data *)seg->s_data;
1719 	struct  seg		*sptseg = shmd->shm_sptseg;
1720 	struct  as		*curspt = shmd->shm_sptas;
1721 	struct  spt_data 	*sptd = sptseg->s_data;
1722 	pgcnt_t npages;
1723 	size_t  size;
1724 	caddr_t segspt_addr, shm_addr;
1725 	page_t  **ppa;
1726 	int	i;
1727 	ulong_t an_idx = 0;
1728 	int	err = 0;
1729 	int	dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1730 	size_t	pgsz;
1731 	pgcnt_t	pgcnt;
1732 	caddr_t	a;
1733 	pgcnt_t	pidx;
1734 
1735 #ifdef lint
1736 	hat = hat;
1737 #endif
1738 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1739 
1740 	/*
1741 	 * Because of the way spt is implemented
1742 	 * the realsize of the segment does not have to be
1743 	 * equal to the segment size itself. The segment size is
1744 	 * often in multiples of a page size larger than PAGESIZE.
1745 	 * The realsize is rounded up to the nearest PAGESIZE
1746 	 * based on what the user requested. This is a bit of
1747 	 * ungliness that is historical but not easily fixed
1748 	 * without re-designing the higher levels of ISM.
1749 	 */
1750 	ASSERT(addr >= seg->s_base);
1751 	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1752 		return (FC_NOMAP);
1753 	/*
1754 	 * For all of the following cases except F_PROT, we need to
1755 	 * make any necessary adjustments to addr and len
1756 	 * and get all of the necessary page_t's into an array called ppa[].
1757 	 *
1758 	 * The code in shmat() forces base addr and len of ISM segment
1759 	 * to be aligned to largest page size supported. Therefore,
1760 	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1761 	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1762 	 * in large pagesize chunks, or else we will screw up the HAT
1763 	 * layer by calling hat_memload_array() with differing page sizes
1764 	 * over a given virtual range.
1765 	 */
1766 	pgsz = page_get_pagesize(sptseg->s_szc);
1767 	pgcnt = page_get_pagecnt(sptseg->s_szc);
1768 	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1769 	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1770 	npages = btopr(size);
1771 
1772 	/*
1773 	 * Now we need to convert from addr in segshm to addr in segspt.
1774 	 */
1775 	an_idx = seg_page(seg, shm_addr);
1776 	segspt_addr = sptseg->s_base + ptob(an_idx);
1777 
1778 	ASSERT((segspt_addr + ptob(npages)) <=
1779 		(sptseg->s_base + sptd->spt_realsize));
1780 	ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
1781 
1782 	switch (type) {
1783 
1784 	case F_SOFTLOCK:
1785 
1786 		mutex_enter(&freemem_lock);
1787 		if (availrmem < tune.t_minarmem + npages) {
1788 			mutex_exit(&freemem_lock);
1789 			return (FC_MAKE_ERR(ENOMEM));
1790 		} else {
1791 			availrmem -= npages;
1792 		}
1793 		mutex_exit(&freemem_lock);
1794 		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
1795 		/*
1796 		 * Fall through to the F_INVAL case to load up the hat layer
1797 		 * entries with the HAT_LOAD_LOCK flag.
1798 		 */
1799 		/* FALLTHRU */
1800 	case F_INVAL:
1801 
1802 		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
1803 			return (FC_NOMAP);
1804 
1805 		ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1806 
1807 		err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
1808 		if (err != 0) {
1809 			if (type == F_SOFTLOCK) {
1810 				mutex_enter(&freemem_lock);
1811 				availrmem += npages;
1812 				mutex_exit(&freemem_lock);
1813 				atomic_add_long((ulong_t *)(
1814 				    &(shmd->shm_softlockcnt)), -npages);
1815 			}
1816 			goto dism_err;
1817 		}
1818 		AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
1819 		a = segspt_addr;
1820 		pidx = 0;
1821 		if (type == F_SOFTLOCK) {
1822 
1823 			/*
1824 			 * Load up the translation keeping it
1825 			 * locked and don't unlock the page.
1826 			 */
1827 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1828 				hat_memload_array(sptseg->s_as->a_hat,
1829 				    a, pgsz, &ppa[pidx], sptd->spt_prot,
1830 				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
1831 			}
1832 		} else {
1833 			if (hat == seg->s_as->a_hat) {
1834 
1835 				/*
1836 				 * Migrate pages marked for migration
1837 				 */
1838 				if (lgrp_optimizations())
1839 					page_migrate(seg, shm_addr, ppa,
1840 					    npages);
1841 
1842 				/* CPU HAT */
1843 				for (; pidx < npages;
1844 				    a += pgsz, pidx += pgcnt) {
1845 					hat_memload_array(sptseg->s_as->a_hat,
1846 					    a, pgsz, &ppa[pidx],
1847 					    sptd->spt_prot,
1848 					    HAT_LOAD_SHARE);
1849 				}
1850 			} else {
1851 				/* XHAT. Pass real address */
1852 				hat_memload_array(hat, shm_addr,
1853 				    size, ppa, sptd->spt_prot, HAT_LOAD_SHARE);
1854 			}
1855 
1856 			/*
1857 			 * And now drop the SE_SHARED lock(s).
1858 			 */
1859 			if (dyn_ism_unmap) {
1860 				for (i = 0; i < npages; i++) {
1861 					page_unlock(ppa[i]);
1862 				}
1863 			}
1864 		}
1865 
1866 		if (!dyn_ism_unmap) {
1867 			if (hat_share(seg->s_as->a_hat, shm_addr,
1868 			    curspt->a_hat, segspt_addr, ptob(npages),
1869 			    seg->s_szc) != 0) {
1870 				panic("hat_share err in DISM fault");
1871 				/* NOTREACHED */
1872 			}
1873 			if (type == F_INVAL) {
1874 				for (i = 0; i < npages; i++) {
1875 					page_unlock(ppa[i]);
1876 				}
1877 			}
1878 		}
1879 		AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
1880 dism_err:
1881 		kmem_free(ppa, npages * sizeof (page_t *));
1882 		return (err);
1883 
1884 	case F_SOFTUNLOCK:
1885 
1886 		mutex_enter(&freemem_lock);
1887 		availrmem += npages;
1888 		mutex_exit(&freemem_lock);
1889 
1890 		/*
1891 		 * This is a bit ugly, we pass in the real seg pointer,
1892 		 * but the segspt_addr is the virtual address within the
1893 		 * dummy seg.
1894 		 */
1895 		segspt_softunlock(seg, segspt_addr, size, rw);
1896 		return (0);
1897 
1898 	case F_PROT:
1899 
1900 		/*
1901 		 * This takes care of the unusual case where a user
1902 		 * allocates a stack in shared memory and a register
1903 		 * window overflow is written to that stack page before
1904 		 * it is otherwise modified.
1905 		 *
1906 		 * We can get away with this because ISM segments are
1907 		 * always rw. Other than this unusual case, there
1908 		 * should be no instances of protection violations.
1909 		 */
1910 		return (0);
1911 
1912 	default:
1913 #ifdef DEBUG
1914 		panic("segspt_dismfault default type?");
1915 #else
1916 		return (FC_NOMAP);
1917 #endif
1918 	}
1919 }
1920 
1921 
1922 faultcode_t
1923 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
1924     size_t len, enum fault_type type, enum seg_rw rw)
1925 {
1926 	struct shm_data 	*shmd = (struct shm_data *)seg->s_data;
1927 	struct seg		*sptseg = shmd->shm_sptseg;
1928 	struct as		*curspt = shmd->shm_sptas;
1929 	struct spt_data 	*sptd   = sptseg->s_data;
1930 	pgcnt_t npages;
1931 	size_t size;
1932 	caddr_t sptseg_addr, shm_addr;
1933 	page_t *pp, **ppa;
1934 	int	i;
1935 	u_offset_t offset;
1936 	ulong_t anon_index = 0;
1937 	struct vnode *vp;
1938 	struct anon_map *amp;		/* XXX - for locknest */
1939 	struct anon *ap = NULL;
1940 	anon_sync_obj_t cookie;
1941 	size_t		pgsz;
1942 	pgcnt_t		pgcnt;
1943 	caddr_t		a;
1944 	pgcnt_t		pidx;
1945 	size_t		sz;
1946 
1947 #ifdef lint
1948 	hat = hat;
1949 #endif
1950 
1951 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
1952 
1953 	if (sptd->spt_flags & SHM_PAGEABLE) {
1954 		return (segspt_dismfault(hat, seg, addr, len, type, rw));
1955 	}
1956 
1957 	/*
1958 	 * Because of the way spt is implemented
1959 	 * the realsize of the segment does not have to be
1960 	 * equal to the segment size itself. The segment size is
1961 	 * often in multiples of a page size larger than PAGESIZE.
1962 	 * The realsize is rounded up to the nearest PAGESIZE
1963 	 * based on what the user requested. This is a bit of
1964 	 * ungliness that is historical but not easily fixed
1965 	 * without re-designing the higher levels of ISM.
1966 	 */
1967 	ASSERT(addr >= seg->s_base);
1968 	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1969 		return (FC_NOMAP);
1970 	/*
1971 	 * For all of the following cases except F_PROT, we need to
1972 	 * make any necessary adjustments to addr and len
1973 	 * and get all of the necessary page_t's into an array called ppa[].
1974 	 *
1975 	 * The code in shmat() forces base addr and len of ISM segment
1976 	 * to be aligned to largest page size supported. Therefore,
1977 	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1978 	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1979 	 * in large pagesize chunks, or else we will screw up the HAT
1980 	 * layer by calling hat_memload_array() with differing page sizes
1981 	 * over a given virtual range.
1982 	 */
1983 	pgsz = page_get_pagesize(sptseg->s_szc);
1984 	pgcnt = page_get_pagecnt(sptseg->s_szc);
1985 	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1986 	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1987 	npages = btopr(size);
1988 
1989 	/*
1990 	 * Now we need to convert from addr in segshm to addr in segspt.
1991 	 */
1992 	anon_index = seg_page(seg, shm_addr);
1993 	sptseg_addr = sptseg->s_base + ptob(anon_index);
1994 
1995 	/*
1996 	 * And now we may have to adjust npages downward if we have
1997 	 * exceeded the realsize of the segment or initial anon
1998 	 * allocations.
1999 	 */
2000 	if ((sptseg_addr + ptob(npages)) >
2001 	    (sptseg->s_base + sptd->spt_realsize))
2002 		size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2003 
2004 	npages = btopr(size);
2005 
2006 	ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2007 	ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2008 
2009 	switch (type) {
2010 
2011 	case F_SOFTLOCK:
2012 
2013 		/*
2014 		 * availrmem is decremented once during anon_swap_adjust()
2015 		 * and is incremented during the anon_unresv(), which is
2016 		 * called from shm_rm_amp() when the segment is destroyed.
2017 		 */
2018 		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2019 		/*
2020 		 * Some platforms assume that ISM pages are SE_SHARED
2021 		 * locked for the entire life of the segment.
2022 		 */
2023 		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2024 			return (0);
2025 		/*
2026 		 * Fall through to the F_INVAL case to load up the hat layer
2027 		 * entries with the HAT_LOAD_LOCK flag.
2028 		 */
2029 
2030 		/* FALLTHRU */
2031 	case F_INVAL:
2032 
2033 		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2034 			return (FC_NOMAP);
2035 
2036 		/*
2037 		 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2038 		 * may still rely on this call to hat_share(). That
2039 		 * would imply that those hat's can fault on a
2040 		 * HAT_LOAD_LOCK translation, which would seem
2041 		 * contradictory.
2042 		 */
2043 		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2044 			if (hat_share(seg->s_as->a_hat, seg->s_base,
2045 			    curspt->a_hat, sptseg->s_base,
2046 			    sptseg->s_size, sptseg->s_szc) != 0) {
2047 				panic("hat_share error in ISM fault");
2048 				/*NOTREACHED*/
2049 			}
2050 			return (0);
2051 		}
2052 		ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2053 
2054 		/*
2055 		 * I see no need to lock the real seg,
2056 		 * here, because all of our work will be on the underlying
2057 		 * dummy seg.
2058 		 *
2059 		 * sptseg_addr and npages now account for large pages.
2060 		 */
2061 		amp = sptd->spt_amp;
2062 		ASSERT(amp != NULL);
2063 		anon_index = seg_page(sptseg, sptseg_addr);
2064 
2065 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2066 		for (i = 0; i < npages; i++) {
2067 			anon_array_enter(amp, anon_index, &cookie);
2068 			ap = anon_get_ptr(amp->ahp, anon_index++);
2069 			ASSERT(ap != NULL);
2070 			swap_xlate(ap, &vp, &offset);
2071 			anon_array_exit(&cookie);
2072 			pp = page_lookup(vp, offset, SE_SHARED);
2073 			ASSERT(pp != NULL);
2074 			ppa[i] = pp;
2075 		}
2076 		ANON_LOCK_EXIT(&amp->a_rwlock);
2077 		ASSERT(i == npages);
2078 
2079 		/*
2080 		 * We are already holding the as->a_lock on the user's
2081 		 * real segment, but we need to hold the a_lock on the
2082 		 * underlying dummy as. This is mostly to satisfy the
2083 		 * underlying HAT layer.
2084 		 */
2085 		AS_LOCK_ENTER(sptseg->s_as, &sptseg->s_as->a_lock, RW_READER);
2086 		a = sptseg_addr;
2087 		pidx = 0;
2088 		if (type == F_SOFTLOCK) {
2089 			/*
2090 			 * Load up the translation keeping it
2091 			 * locked and don't unlock the page.
2092 			 */
2093 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2094 				sz = MIN(pgsz, ptob(npages - pidx));
2095 				hat_memload_array(sptseg->s_as->a_hat, a,
2096 				    sz, &ppa[pidx], sptd->spt_prot,
2097 				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2098 			}
2099 		} else {
2100 			if (hat == seg->s_as->a_hat) {
2101 
2102 				/*
2103 				 * Migrate pages marked for migration.
2104 				 */
2105 				if (lgrp_optimizations())
2106 					page_migrate(seg, shm_addr, ppa,
2107 					    npages);
2108 
2109 				/* CPU HAT */
2110 				for (; pidx < npages;
2111 				    a += pgsz, pidx += pgcnt) {
2112 					sz = MIN(pgsz, ptob(npages - pidx));
2113 					hat_memload_array(sptseg->s_as->a_hat,
2114 					    a, sz, &ppa[pidx],
2115 					    sptd->spt_prot, HAT_LOAD_SHARE);
2116 				}
2117 			} else {
2118 				/* XHAT. Pass real address */
2119 				hat_memload_array(hat, shm_addr,
2120 				    ptob(npages), ppa, sptd->spt_prot,
2121 				    HAT_LOAD_SHARE);
2122 			}
2123 
2124 			/*
2125 			 * And now drop the SE_SHARED lock(s).
2126 			 */
2127 			for (i = 0; i < npages; i++)
2128 				page_unlock(ppa[i]);
2129 		}
2130 		AS_LOCK_EXIT(sptseg->s_as, &sptseg->s_as->a_lock);
2131 
2132 		kmem_free(ppa, sizeof (page_t *) * npages);
2133 		return (0);
2134 	case F_SOFTUNLOCK:
2135 
2136 		/*
2137 		 * This is a bit ugly, we pass in the real seg pointer,
2138 		 * but the sptseg_addr is the virtual address within the
2139 		 * dummy seg.
2140 		 */
2141 		segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2142 		return (0);
2143 
2144 	case F_PROT:
2145 
2146 		/*
2147 		 * This takes care of the unusual case where a user
2148 		 * allocates a stack in shared memory and a register
2149 		 * window overflow is written to that stack page before
2150 		 * it is otherwise modified.
2151 		 *
2152 		 * We can get away with this because ISM segments are
2153 		 * always rw. Other than this unusual case, there
2154 		 * should be no instances of protection violations.
2155 		 */
2156 		return (0);
2157 
2158 	default:
2159 #ifdef DEBUG
2160 		cmn_err(CE_WARN, "segspt_shmfault default type?");
2161 #endif
2162 		return (FC_NOMAP);
2163 	}
2164 }
2165 
2166 /*ARGSUSED*/
2167 static faultcode_t
2168 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2169 {
2170 	return (0);
2171 }
2172 
2173 /*ARGSUSED*/
2174 static int
2175 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2176 {
2177 	return (0);
2178 }
2179 
2180 /*ARGSUSED*/
2181 static size_t
2182 segspt_shmswapout(struct seg *seg)
2183 {
2184 	return (0);
2185 }
2186 
2187 /*
2188  * duplicate the shared page tables
2189  */
2190 int
2191 segspt_shmdup(struct seg *seg, struct seg *newseg)
2192 {
2193 	struct shm_data		*shmd = (struct shm_data *)seg->s_data;
2194 	struct anon_map 	*amp = shmd->shm_amp;
2195 	struct shm_data 	*shmd_new;
2196 	struct seg		*spt_seg = shmd->shm_sptseg;
2197 	struct spt_data		*sptd = spt_seg->s_data;
2198 	int			error = 0;
2199 
2200 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
2201 
2202 	shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2203 	newseg->s_data = (void *)shmd_new;
2204 	shmd_new->shm_sptas = shmd->shm_sptas;
2205 	shmd_new->shm_amp = amp;
2206 	shmd_new->shm_sptseg = shmd->shm_sptseg;
2207 	newseg->s_ops = &segspt_shmops;
2208 	newseg->s_szc = seg->s_szc;
2209 	ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2210 
2211 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2212 	amp->refcnt++;
2213 	ANON_LOCK_EXIT(&amp->a_rwlock);
2214 
2215 	if (sptd->spt_flags & SHM_PAGEABLE) {
2216 		shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2217 		shmd_new->shm_lckpgs = 0;
2218 		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2219 			if ((error = hat_share(newseg->s_as->a_hat,
2220 			    newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2221 			    seg->s_size, seg->s_szc)) != 0) {
2222 				kmem_free(shmd_new->shm_vpage,
2223 				btopr(amp->size));
2224 			}
2225 		}
2226 		return (error);
2227 	} else {
2228 		return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2229 		    shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2230 		    seg->s_szc));
2231 
2232 	}
2233 }
2234 
2235 /*ARGSUSED*/
2236 int
2237 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2238 {
2239 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2240 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2241 
2242 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2243 
2244 	/*
2245 	 * ISM segment is always rw.
2246 	 */
2247 	return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2248 }
2249 
2250 /*
2251  * Return an array of locked large pages, for empty slots allocate
2252  * private zero-filled anon pages.
2253  */
2254 static int
2255 spt_anon_getpages(
2256 	struct seg *sptseg,
2257 	caddr_t sptaddr,
2258 	size_t len,
2259 	page_t *ppa[])
2260 {
2261 	struct  spt_data *sptd = sptseg->s_data;
2262 	struct  anon_map *amp = sptd->spt_amp;
2263 	enum 	seg_rw rw = sptd->spt_prot;
2264 	uint_t	szc = sptseg->s_szc;
2265 	size_t	pg_sz, share_sz = page_get_pagesize(szc);
2266 	pgcnt_t	lp_npgs;
2267 	caddr_t	lp_addr, e_sptaddr;
2268 	uint_t	vpprot, ppa_szc = 0;
2269 	struct  vpage *vpage = NULL;
2270 	ulong_t	j, ppa_idx;
2271 	int	err, ierr = 0;
2272 	pgcnt_t	an_idx;
2273 	anon_sync_obj_t cookie;
2274 
2275 	ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2276 	ASSERT(len != 0);
2277 
2278 	pg_sz = share_sz;
2279 	lp_npgs = btop(pg_sz);
2280 	lp_addr = sptaddr;
2281 	e_sptaddr = sptaddr + len;
2282 	an_idx = seg_page(sptseg, sptaddr);
2283 	ppa_idx = 0;
2284 
2285 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2286 	/*CONSTCOND*/
2287 	while (1) {
2288 		for (; lp_addr < e_sptaddr;
2289 			an_idx += lp_npgs, lp_addr += pg_sz,
2290 			ppa_idx += lp_npgs) {
2291 
2292 			anon_array_enter(amp, an_idx, &cookie);
2293 			ppa_szc = (uint_t)-1;
2294 			ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2295 			    lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2296 			    &ppa_szc, vpage, rw, 0, segvn_anypgsz, kcred);
2297 			anon_array_exit(&cookie);
2298 
2299 			if (ierr != 0) {
2300 				if (ierr > 0) {
2301 					err = FC_MAKE_ERR(ierr);
2302 					goto lpgs_err;
2303 				}
2304 				break;
2305 			}
2306 		}
2307 		if (lp_addr == e_sptaddr) {
2308 			break;
2309 		}
2310 		ASSERT(lp_addr < e_sptaddr);
2311 
2312 		/*
2313 		 * ierr == -1 means we failed to allocate a large page.
2314 		 * so do a size down operation.
2315 		 *
2316 		 * ierr == -2 means some other process that privately shares
2317 		 * pages with this process has allocated a larger page and we
2318 		 * need to retry with larger pages. So do a size up
2319 		 * operation. This relies on the fact that large pages are
2320 		 * never partially shared i.e. if we share any constituent
2321 		 * page of a large page with another process we must share the
2322 		 * entire large page. Note this cannot happen for SOFTLOCK
2323 		 * case, unless current address (lpaddr) is at the beginning
2324 		 * of the next page size boundary because the other process
2325 		 * couldn't have relocated locked pages.
2326 		 */
2327 		ASSERT(ierr == -1 || ierr == -2);
2328 		if (segvn_anypgsz) {
2329 			ASSERT(ierr == -2 || szc != 0);
2330 			ASSERT(ierr == -1 || szc < sptseg->s_szc);
2331 			szc = (ierr == -1) ? szc - 1 : szc + 1;
2332 		} else {
2333 			/*
2334 			 * For faults and segvn_anypgsz == 0
2335 			 * we need to be careful not to loop forever
2336 			 * if existing page is found with szc other
2337 			 * than 0 or seg->s_szc. This could be due
2338 			 * to page relocations on behalf of DR or
2339 			 * more likely large page creation. For this
2340 			 * case simply re-size to existing page's szc
2341 			 * if returned by anon_map_getpages().
2342 			 */
2343 			if (ppa_szc == (uint_t)-1) {
2344 				szc = (ierr == -1) ? 0 : sptseg->s_szc;
2345 			} else {
2346 				ASSERT(ppa_szc <= sptseg->s_szc);
2347 				ASSERT(ierr == -2 || ppa_szc < szc);
2348 				ASSERT(ierr == -1 || ppa_szc > szc);
2349 				szc = ppa_szc;
2350 			}
2351 		}
2352 		pg_sz = page_get_pagesize(szc);
2353 		lp_npgs = btop(pg_sz);
2354 		ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2355 	}
2356 	ANON_LOCK_EXIT(&amp->a_rwlock);
2357 	return (0);
2358 
2359 lpgs_err:
2360 	ANON_LOCK_EXIT(&amp->a_rwlock);
2361 	for (j = 0; j < ppa_idx; j++)
2362 		page_unlock(ppa[j]);
2363 	return (err);
2364 }
2365 
2366 int
2367 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2368     page_t **ppa, ulong_t *lockmap, size_t pos)
2369 {
2370 	struct shm_data *shmd = seg->s_data;
2371 	struct spt_data *sptd = shmd->shm_sptseg->s_data;
2372 	ulong_t	i;
2373 	int	kernel;
2374 
2375 	for (i = 0; i < npages; anon_index++, pos++, i++) {
2376 		if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2377 			if (sptd->spt_ppa_lckcnt[anon_index] <
2378 			    (ushort_t)DISM_LOCK_MAX) {
2379 				if (++sptd->spt_ppa_lckcnt[anon_index] ==
2380 				    (ushort_t)DISM_LOCK_MAX) {
2381 					cmn_err(CE_WARN,
2382 					    "DISM page lock limit "
2383 					    "reached on DISM offset 0x%lx\n",
2384 					    anon_index << PAGESHIFT);
2385 				}
2386 				kernel = (sptd->spt_ppa &&
2387 				    sptd->spt_ppa[anon_index]) ? 1 : 0;
2388 				if (!page_pp_lock(ppa[i], 0, kernel)) {
2389 					/* unlock rest of the pages */
2390 					for (; i < npages; i++)
2391 						page_unlock(ppa[i]);
2392 					sptd->spt_ppa_lckcnt[anon_index]--;
2393 					return (EAGAIN);
2394 				}
2395 				shmd->shm_lckpgs++;
2396 				shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2397 				if (lockmap != NULL)
2398 					BT_SET(lockmap, pos);
2399 			}
2400 		}
2401 		page_unlock(ppa[i]);
2402 	}
2403 	return (0);
2404 }
2405 
2406 /*ARGSUSED*/
2407 static int
2408 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2409     int attr, int op, ulong_t *lockmap, size_t pos)
2410 {
2411 	struct shm_data *shmd = seg->s_data;
2412 	struct seg	*sptseg = shmd->shm_sptseg;
2413 	struct spt_data *sptd = sptseg->s_data;
2414 	pgcnt_t		npages, a_npages;
2415 	page_t		**ppa;
2416 	pgcnt_t 	an_idx, a_an_idx, ppa_idx;
2417 	caddr_t		spt_addr, a_addr;	/* spt and aligned address */
2418 	size_t		a_len;			/* aligned len */
2419 	size_t		share_sz;
2420 	ulong_t		i;
2421 	int		sts = 0;
2422 
2423 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2424 
2425 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2426 		return (0);
2427 	}
2428 
2429 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2430 	an_idx = seg_page(seg, addr);
2431 	npages = btopr(len);
2432 
2433 	if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2434 		return (ENOMEM);
2435 	}
2436 
2437 	if (op == MC_LOCK) {
2438 		/*
2439 		 * Need to align addr and size request if they are not
2440 		 * aligned so we can always allocate large page(s) however
2441 		 * we only lock what was requested in initial request.
2442 		 */
2443 		share_sz = page_get_pagesize(sptseg->s_szc);
2444 		a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2445 		a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2446 				share_sz);
2447 		a_npages = btop(a_len);
2448 		a_an_idx = seg_page(seg, a_addr);
2449 		spt_addr = sptseg->s_base + ptob(a_an_idx);
2450 		ppa_idx = an_idx - a_an_idx;
2451 
2452 		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2453 			KM_NOSLEEP)) == NULL) {
2454 			return (ENOMEM);
2455 		}
2456 
2457 		/*
2458 		 * Don't cache any new pages for IO and
2459 		 * flush any cached pages.
2460 		 */
2461 		mutex_enter(&sptd->spt_lock);
2462 		if (sptd->spt_ppa != NULL)
2463 			sptd->spt_flags |= DISM_PPA_CHANGED;
2464 
2465 		sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2466 		if (sts != 0) {
2467 			mutex_exit(&sptd->spt_lock);
2468 			kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2469 			return (sts);
2470 		}
2471 
2472 		sts = spt_lockpages(seg, an_idx, npages,
2473 		    &ppa[ppa_idx], lockmap, pos);
2474 		/*
2475 		 * unlock remaining pages for requests which are not
2476 		 * aligned or not in 4 M chunks
2477 		 */
2478 		for (i = 0; i < ppa_idx; i++)
2479 			page_unlock(ppa[i]);
2480 		for (i = ppa_idx + npages; i < a_npages; i++)
2481 			page_unlock(ppa[i]);
2482 		if (sptd->spt_ppa != NULL)
2483 			sptd->spt_flags |= DISM_PPA_CHANGED;
2484 		mutex_exit(&sptd->spt_lock);
2485 
2486 		kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2487 
2488 	} else if (op == MC_UNLOCK) { /* unlock */
2489 		struct anon_map *amp;
2490 		struct anon 	*ap;
2491 		struct vnode 	*vp;
2492 		u_offset_t 	off;
2493 		struct page	*pp;
2494 		int		kernel;
2495 		anon_sync_obj_t cookie;
2496 
2497 		amp = sptd->spt_amp;
2498 		mutex_enter(&sptd->spt_lock);
2499 		if (shmd->shm_lckpgs == 0) {
2500 			mutex_exit(&sptd->spt_lock);
2501 			return (0);
2502 		}
2503 		/*
2504 		 * Don't cache new IO pages.
2505 		 */
2506 		if (sptd->spt_ppa != NULL)
2507 			sptd->spt_flags |= DISM_PPA_CHANGED;
2508 
2509 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2510 		for (i = 0; i < npages; i++, an_idx++) {
2511 			if (shmd->shm_vpage[an_idx] & DISM_PG_LOCKED) {
2512 				anon_array_enter(amp, an_idx, &cookie);
2513 				ap = anon_get_ptr(amp->ahp, an_idx);
2514 				ASSERT(ap);
2515 				ASSERT(sptd->spt_ppa_lckcnt[an_idx] > 0);
2516 
2517 				swap_xlate(ap, &vp, &off);
2518 				anon_array_exit(&cookie);
2519 				pp = page_lookup(vp, off, SE_SHARED);
2520 				ASSERT(pp);
2521 				/*
2522 				 * the availrmem is decremented only for
2523 				 * pages which are not in seg pcache,
2524 				 * for pages in seg pcache availrmem was
2525 				 * decremented in _dismpagelock() (if
2526 				 * they were not locked here)
2527 				 */
2528 				kernel = (sptd->spt_ppa &&
2529 				    sptd->spt_ppa[an_idx]) ? 1 : 0;
2530 				page_pp_unlock(pp, 0, kernel);
2531 				page_unlock(pp);
2532 				shmd->shm_vpage[an_idx] &= ~DISM_PG_LOCKED;
2533 				sptd->spt_ppa_lckcnt[an_idx]--;
2534 				shmd->shm_lckpgs--;
2535 			}
2536 		}
2537 		ANON_LOCK_EXIT(&amp->a_rwlock);
2538 		if (sptd->spt_ppa != NULL)
2539 			sptd->spt_flags |= DISM_PPA_CHANGED;
2540 		mutex_exit(&sptd->spt_lock);
2541 	}
2542 	return (sts);
2543 }
2544 
2545 /*ARGSUSED*/
2546 int
2547 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2548 {
2549 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2550 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2551 	spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2552 
2553 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2554 
2555 	/*
2556 	 * ISM segment is always rw.
2557 	 */
2558 	while (--pgno >= 0)
2559 		*protv++ = sptd->spt_prot;
2560 	return (0);
2561 }
2562 
2563 /*ARGSUSED*/
2564 u_offset_t
2565 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2566 {
2567 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2568 
2569 	/* Offset does not matter in ISM memory */
2570 
2571 	return ((u_offset_t)0);
2572 }
2573 
2574 /* ARGSUSED */
2575 int
2576 segspt_shmgettype(struct seg *seg, caddr_t addr)
2577 {
2578 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2579 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2580 
2581 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2582 
2583 	/*
2584 	 * The shared memory mapping is always MAP_SHARED, SWAP is only
2585 	 * reserved for DISM
2586 	 */
2587 	return (MAP_SHARED |
2588 		((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2589 }
2590 
2591 /*ARGSUSED*/
2592 int
2593 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2594 {
2595 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2596 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2597 
2598 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2599 
2600 	*vpp = sptd->spt_vp;
2601 	return (0);
2602 }
2603 
2604 /*ARGSUSED*/
2605 static int
2606 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2607 {
2608 	struct shm_data 	*shmd = (struct shm_data *)seg->s_data;
2609 	struct spt_data	*sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2610 	struct anon_map	*amp;
2611 	pgcnt_t		pg_idx;
2612 
2613 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
2614 
2615 	if (behav == MADV_FREE) {
2616 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
2617 			return (0);
2618 
2619 		amp = sptd->spt_amp;
2620 		pg_idx = seg_page(seg, addr);
2621 
2622 		mutex_enter(&sptd->spt_lock);
2623 		if (sptd->spt_ppa != NULL)
2624 			sptd->spt_flags |= DISM_PPA_CHANGED;
2625 		mutex_exit(&sptd->spt_lock);
2626 
2627 		/*
2628 		 * Purge all DISM cached pages
2629 		 */
2630 		seg_ppurge_seg(segspt_reclaim);
2631 
2632 		mutex_enter(&sptd->spt_lock);
2633 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2634 		anon_disclaim(amp, pg_idx, len, ANON_PGLOOKUP_BLK);
2635 		ANON_LOCK_EXIT(&amp->a_rwlock);
2636 		mutex_exit(&sptd->spt_lock);
2637 	} else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
2638 	    behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
2639 		int			already_set;
2640 		ulong_t			anon_index;
2641 		lgrp_mem_policy_t	policy;
2642 		caddr_t			shm_addr;
2643 		size_t			share_size;
2644 		size_t			size;
2645 		struct seg		*sptseg = shmd->shm_sptseg;
2646 		caddr_t			sptseg_addr;
2647 
2648 		/*
2649 		 * Align address and length to page size of underlying segment
2650 		 */
2651 		share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
2652 		shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
2653 		size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
2654 		    share_size);
2655 
2656 		amp = shmd->shm_amp;
2657 		anon_index = seg_page(seg, shm_addr);
2658 
2659 		/*
2660 		 * And now we may have to adjust size downward if we have
2661 		 * exceeded the realsize of the segment or initial anon
2662 		 * allocations.
2663 		 */
2664 		sptseg_addr = sptseg->s_base + ptob(anon_index);
2665 		if ((sptseg_addr + size) >
2666 		    (sptseg->s_base + sptd->spt_realsize))
2667 			size = (sptseg->s_base + sptd->spt_realsize) -
2668 			    sptseg_addr;
2669 
2670 		/*
2671 		 * Set memory allocation policy for this segment
2672 		 */
2673 		policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
2674 		already_set = lgrp_shm_policy_set(policy, amp, anon_index,
2675 		    NULL, 0, len);
2676 
2677 		/*
2678 		 * If random memory allocation policy set already,
2679 		 * don't bother reapplying it.
2680 		 */
2681 		if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
2682 			return (0);
2683 
2684 		/*
2685 		 * Mark any existing pages in the given range for
2686 		 * migration, flushing the I/O page cache, and using
2687 		 * underlying segment to calculate anon index and get
2688 		 * anonmap and vnode pointer from
2689 		 */
2690 		if (shmd->shm_softlockcnt > 0)
2691 			segspt_purge(seg);
2692 
2693 		page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
2694 	}
2695 
2696 	return (0);
2697 }
2698 
/*
 * dump entry point for this segment type.  Nothing is done for an ISM
 * segment.
 */
/*ARGSUSED*/
void
segspt_shmdump(struct seg *seg)
{
	/* no-op for ISM segment */
}
2705 
2706 /*ARGSUSED*/
2707 static faultcode_t
2708 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
2709 {
2710 	return (ENOTSUP);
2711 }
2712 
2713 /*
2714  * get a memory ID for an addr in a given segment
2715  */
2716 static int
2717 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2718 {
2719 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2720 	struct anon 	*ap;
2721 	size_t		anon_index;
2722 	struct anon_map	*amp = shmd->shm_amp;
2723 	struct spt_data	*sptd = shmd->shm_sptseg->s_data;
2724 	struct seg	*sptseg = shmd->shm_sptseg;
2725 	anon_sync_obj_t	cookie;
2726 
2727 	anon_index = seg_page(seg, addr);
2728 
2729 	if (addr > (seg->s_base + sptd->spt_realsize)) {
2730 		return (EFAULT);
2731 	}
2732 
2733 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2734 	anon_array_enter(amp, anon_index, &cookie);
2735 	ap = anon_get_ptr(amp->ahp, anon_index);
2736 	if (ap == NULL) {
2737 		struct page *pp;
2738 		caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
2739 
2740 		pp = anon_zero(sptseg, spt_addr, &ap, kcred);
2741 		if (pp == NULL) {
2742 			anon_array_exit(&cookie);
2743 			ANON_LOCK_EXIT(&amp->a_rwlock);
2744 			return (ENOMEM);
2745 		}
2746 		(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
2747 		page_unlock(pp);
2748 	}
2749 	anon_array_exit(&cookie);
2750 	ANON_LOCK_EXIT(&amp->a_rwlock);
2751 	memidp->val[0] = (uintptr_t)ap;
2752 	memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
2753 	return (0);
2754 }
2755 
2756 /*
2757  * Get memory allocation policy info for specified address in given segment
2758  */
2759 static lgrp_mem_policy_info_t *
2760 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
2761 {
2762 	struct anon_map		*amp;
2763 	ulong_t			anon_index;
2764 	lgrp_mem_policy_info_t	*policy_info;
2765 	struct shm_data		*shm_data;
2766 
2767 	ASSERT(seg != NULL);
2768 
2769 	/*
2770 	 * Get anon_map from segshm
2771 	 *
2772 	 * Assume that no lock needs to be held on anon_map, since
2773 	 * it should be protected by its reference count which must be
2774 	 * nonzero for an existing segment
2775 	 * Need to grab readers lock on policy tree though
2776 	 */
2777 	shm_data = (struct shm_data *)seg->s_data;
2778 	if (shm_data == NULL)
2779 		return (NULL);
2780 	amp = shm_data->shm_amp;
2781 	ASSERT(amp->refcnt != 0);
2782 
2783 	/*
2784 	 * Get policy info
2785 	 *
2786 	 * Assume starting anon index of 0
2787 	 */
2788 	anon_index = seg_page(seg, addr);
2789 	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
2790 
2791 	return (policy_info);
2792 }
2793 
2794 /*ARGSUSED*/
2795 static int
2796 segspt_shmcapable(struct seg *seg, segcapability_t capability)
2797 {
2798 	return (0);
2799 }
2800