xref: /illumos-gate/usr/src/uts/common/vm/seg_spt.c (revision bde334a8dbd66dfa70ce4d7fc9dcad6e1ae45fe4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2018 Joyent, Inc.
24  * Copyright (c) 2016 by Delphix. All rights reserved.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/user.h>
29 #include <sys/mman.h>
30 #include <sys/kmem.h>
31 #include <sys/sysmacros.h>
32 #include <sys/cmn_err.h>
33 #include <sys/systm.h>
34 #include <sys/tuneable.h>
35 #include <vm/hat.h>
36 #include <vm/seg.h>
37 #include <vm/as.h>
38 #include <vm/anon.h>
39 #include <vm/page.h>
40 #include <sys/buf.h>
41 #include <sys/swap.h>
42 #include <sys/atomic.h>
43 #include <vm/seg_spt.h>
44 #include <sys/debug.h>
45 #include <sys/vtrace.h>
46 #include <sys/shm.h>
47 #include <sys/shm_impl.h>
48 #include <sys/lgrp.h>
49 #include <sys/vmsystm.h>
50 #include <sys/policy.h>
51 #include <sys/project.h>
52 #include <sys/tnf_probe.h>
53 #include <sys/zone.h>
54 
55 #define	SEGSPTADDR	(caddr_t)0x0
56 
57 /*
58  * # pages used for spt
59  */
60 size_t	spt_used;
61 
62 /*
63  * segspt_minfree is the memory left for the system after ISM
64  * has locked its pages; it is set to 5% of availrmem in
65  * sptcreate() when ISM is created.  ISM should not use more
66  * than ~90% of availrmem; if it does, system performance may
67  * decrease.  Machines with large memories may be able to use
68  * more memory for ISM, so we set the default segspt_minfree
69  * to 5% (which gives ISM at most 95% of availrmem).
70  * If somebody wants even more memory for ISM (risking hanging
71  * the system) they can patch segspt_minfree to a smaller number.
72  */
73 pgcnt_t segspt_minfree = 0;
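/*
 * Worked example of the default sizing described above (illustrative
 * numbers only): if availrmem is 1,000,000 pages when sptcreate() first
 * runs, segspt_minfree is set to availrmem / 20 = 50,000 pages, which
 * reserves 5% of availrmem for the rest of the system and leaves ISM
 * up to the remaining 95%.
 */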
74 
75 static int segspt_create(struct seg **segpp, void *argsp);
76 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
77 static void segspt_free(struct seg *seg);
78 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
79 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
80 
81 /* ARGSUSED */
82 __NORETURN static int
83 segspt_badop_dup(struct seg *seg __unused, struct seg *newseg __unused)
84 {
85 	panic("%s called", __func__);
86 }
87 
88 /* ARGSUSED */
89 __NORETURN static faultcode_t
90 segspt_badop_fault(struct hat *hat, struct seg *seg, caddr_t addr,
91     size_t len, enum fault_type type, enum seg_rw rw)
92 {
93 	panic("%s called", __func__);
94 }
95 
96 /* ARGSUSED */
97 __NORETURN static faultcode_t
98 segspt_badop_faulta(struct seg *seg __unused, caddr_t addr __unused)
99 {
100 	panic("%s called", __func__);
101 }
102 
103 /* ARGSUSED */
104 __NORETURN static int
105 segspt_badop_prot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
106 {
107 	panic("%s called", __func__);
108 }
109 
110 /* ARGSUSED */
111 __NORETURN static int
112 segspt_badop_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
113 {
114 	panic("%s called", __func__);
115 }
116 
117 /* ARGSUSED */
118 __NORETURN static int
119 segspt_badop_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
120 {
121 	panic("%s called", __func__);
122 }
123 
124 /* ARGSUSED */
125 __NORETURN static size_t
126 segspt_badop_swapout(struct seg *seg)
127 {
128 	panic("%s called", __func__);
129 }
130 
131 /* ARGSUSED */
132 __NORETURN static int
133 segspt_badop_sync(struct seg *seg, caddr_t addr, size_t len, int attr,
134     uint_t flags)
135 {
136 	panic("%s called", __func__);
137 }
138 
139 /* ARGSUSED */
140 __NORETURN
141 static size_t
142 segspt_badop_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
143 {
144 	panic("%s called", __func__);
145 }
146 
147 /* ARGSUSED */
148 __NORETURN static int
149 segspt_badop_lockop(struct seg *seg, caddr_t addr, size_t len, int attr,
150     int op, ulong_t *lockmap, size_t pos)
151 {
152 	panic("%s called", __func__);
153 }
154 
155 /* ARGSUSED */
156 __NORETURN static int
157 segspt_badop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
158 {
159 	panic("%s called", __func__);
160 }
161 
162 /* ARGSUSED */
163 __NORETURN static u_offset_t
164 segspt_badop_getoffset(struct seg *seg, caddr_t addr)
165 {
166 	panic("%s called", __func__);
167 }
168 
169 /* ARGSUSED */
170 __NORETURN static int
171 segspt_badop_gettype(struct seg *seg, caddr_t addr)
172 {
173 	panic("%s called", __func__);
174 }
175 
176 /* ARGSUSED */
177 __NORETURN static int
178 segspt_badop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
179 {
180 	panic("%s called", __func__);
181 }
182 
183 /* ARGSUSED */
184 __NORETURN static int
185 segspt_badop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
186 {
187 	panic("%s called", __func__);
188 }
189 
190 /* ARGSUSED */
191 __NORETURN static void
192 segspt_badop_dump(struct seg *seg)
193 {
194 	panic("%s called", __func__);
195 }
196 
197 /* ARGSUSED */
198 __NORETURN static int
199 segspt_badop_pagelock(struct seg *seg, caddr_t addr, size_t len,
200     struct page ***ppp, enum lock_type type, enum seg_rw rw)
201 {
202 	panic("%s called", __func__);
203 }
204 
205 /* ARGSUSED */
206 __NORETURN static int
207 segspt_badop_setpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
208 {
209 	panic("%s called", __func__);
210 }
211 
212 /* ARGSUSED */
213 __NORETURN static int
214 segspt_badop_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
215 {
216 	panic("%s called", __func__);
217 }
218 
219 /* ARGSUSED */
220 __NORETURN static int
221 segspt_badop_capable(struct seg *seg, segcapability_t capability)
222 {
223 	panic("%s called", __func__);
224 }
225 
226 struct seg_ops segspt_ops = {
227 	segspt_badop_dup,		/* dup */
228 	segspt_unmap,
229 	segspt_free,
230 	segspt_badop_fault,		/* fault */
231 	segspt_badop_faulta,		/* faulta */
232 	segspt_badop_prot,		/* setprot */
233 	segspt_badop_checkprot,		/* checkprot */
234 	segspt_badop_kluster,		/* kluster */
235 	segspt_badop_swapout,		/* swapout */
236 	segspt_badop_sync,		/* sync */
237 	segspt_badop_incore,		/* incore */
238 	segspt_badop_lockop,		/* lockop */
239 	segspt_badop_getprot,		/* getprot */
240 	segspt_badop_getoffset,		/* getoffset */
241 	segspt_badop_gettype,		/* gettype */
242 	segspt_badop_getvp,		/* getvp */
243 	segspt_badop_advise,		/* advise */
244 	segspt_badop_dump,		/* dump */
245 	segspt_badop_pagelock,		/* pagelock */
246 	segspt_badop_setpgsz,		/* setpgsz */
247 	segspt_badop_getmemid,		/* getmemid */
248 	segspt_getpolicy,		/* getpolicy */
249 	segspt_badop_capable,		/* capable */
250 	seg_inherit_notsup		/* inherit */
251 };
252 
253 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
254 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
255 static void segspt_shmfree(struct seg *seg);
256 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
257 		caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
258 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
259 static int segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len,
260 		uint_t prot);
261 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
262 		uint_t prot);
263 static int	segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
264 static size_t	segspt_shmswapout(struct seg *seg);
265 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
266 		char *vec);
267 static int segspt_shmsync(struct seg *seg, caddr_t addr, size_t len,
268 		int attr, uint_t flags);
269 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
270 		int attr, int op, ulong_t *lockmap, size_t pos);
271 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
272 		uint_t *protv);
273 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
274 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
275 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
276 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
277 		uint_t behav);
278 static void segspt_shmdump(struct seg *seg);
279 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
280 		struct page ***, enum lock_type, enum seg_rw);
281 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
282 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
283 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
284 static int segspt_shmcapable(struct seg *, segcapability_t);
285 
286 struct seg_ops segspt_shmops = {
287 	segspt_shmdup,
288 	segspt_shmunmap,
289 	segspt_shmfree,
290 	segspt_shmfault,
291 	segspt_shmfaulta,
292 	segspt_shmsetprot,
293 	segspt_shmcheckprot,
294 	segspt_shmkluster,
295 	segspt_shmswapout,
296 	segspt_shmsync,
297 	segspt_shmincore,
298 	segspt_shmlockop,
299 	segspt_shmgetprot,
300 	segspt_shmgetoffset,
301 	segspt_shmgettype,
302 	segspt_shmgetvp,
303 	segspt_shmadvise,	/* advise */
304 	segspt_shmdump,
305 	segspt_shmpagelock,
306 	segspt_shmsetpgsz,
307 	segspt_shmgetmemid,
308 	segspt_shmgetpolicy,
309 	segspt_shmcapable,
310 	seg_inherit_notsup
311 };
312 
313 static void segspt_purge(struct seg *seg);
314 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
315 		enum seg_rw, int);
316 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
317 		page_t **ppa);
318 
319 
320 
321 /*ARGSUSED*/
322 int
323 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
324     uint_t prot, uint_t flags, uint_t share_szc)
325 {
326 	int	err;
327 	struct	as	*newas;
328 	struct	segspt_crargs sptcargs;
329 
330 #ifdef DEBUG
331 	TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
332 			tnf_ulong, size, size );
333 #endif
334 	if (segspt_minfree == 0)	/* leave min 5% of availrmem */
335 		segspt_minfree = availrmem/20;	/* for the system */
336 
337 	if (!hat_supported(HAT_SHARED_PT, (void *)0))
338 		return (EINVAL);
339 
340 	/*
341 	 * get a new as for this shared memory segment
342 	 */
343 	newas = as_alloc();
344 	newas->a_proc = NULL;
345 	sptcargs.amp = amp;
346 	sptcargs.prot = prot;
347 	sptcargs.flags = flags;
348 	sptcargs.szc = share_szc;
349 	/*
350 	 * create a shared page table (spt) segment
351 	 */
352 
353 	if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
354 		as_free(newas);
355 		return (err);
356 	}
357 	*sptseg = sptcargs.seg_spt;
358 	return (0);
359 }
360 
361 void
362 sptdestroy(struct as *as, struct anon_map *amp)
363 {
364 
365 #ifdef DEBUG
366 	TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
367 #endif
368 	(void) as_unmap(as, SEGSPTADDR, amp->size);
369 	as_free(as);
370 }
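/*
 * Usage sketch (hypothetical caller, loosely modeled on the sysv shared
 * memory code that owns the anon_map; everything other than sptcreate()
 * and sptdestroy() below is an assumption, not part of this file):
 *
 *	struct seg *sptseg;
 *	int err;
 *
 *	err = sptcreate(amp->size, &sptseg, amp, prot, flags, share_szc);
 *	if (err == 0) {
 *		... use sptseg->s_as as the shared page table as ...
 *		sptdestroy(sptseg->s_as, amp);
 *	}
 */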
371 
372 /*
373  * called from seg_free().
374  * free (i.e., unlock, unmap, return to free list)
375  *  all the pages in the given seg.
376  */
377 void
378 segspt_free(struct seg	*seg)
379 {
380 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
381 
382 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
383 
384 	if (sptd != NULL) {
385 		if (sptd->spt_realsize)
386 			segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
387 
388 		if (sptd->spt_ppa_lckcnt) {
389 			kmem_free(sptd->spt_ppa_lckcnt,
390 			    sizeof (*sptd->spt_ppa_lckcnt)
391 			    * btopr(sptd->spt_amp->size));
392 		}
393 		kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
394 		cv_destroy(&sptd->spt_cv);
395 		mutex_destroy(&sptd->spt_lock);
396 		kmem_free(sptd, sizeof (*sptd));
397 	}
398 }
399 
400 /*ARGSUSED*/
401 static int
402 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
403     uint_t flags)
404 {
405 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
406 
407 	return (0);
408 }
409 
410 /*ARGSUSED*/
411 static size_t
412 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
413 {
414 	caddr_t	eo_seg;
415 	pgcnt_t	npages;
416 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
417 	struct seg	*sptseg;
418 	struct spt_data *sptd;
419 
420 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
421 #ifdef lint
422 	seg = seg;
423 #endif
424 	sptseg = shmd->shm_sptseg;
425 	sptd = sptseg->s_data;
426 
427 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
428 		eo_seg = addr + len;
429 		while (addr < eo_seg) {
430 			/* page exists, and it's locked. */
431 			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
432 			    SEG_PAGE_ANON;
433 			addr += PAGESIZE;
434 		}
435 		return (len);
436 	} else {
437 		struct  anon_map *amp = shmd->shm_amp;
438 		struct  anon	*ap;
439 		page_t		*pp;
440 		pgcnt_t		anon_index;
441 		struct vnode	*vp;
442 		u_offset_t	off;
443 		ulong_t		i;
444 		int		ret;
445 		anon_sync_obj_t	cookie;
446 
447 		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
448 		anon_index = seg_page(seg, addr);
449 		npages = btopr(len);
450 		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
451 			return (EINVAL);
452 		}
453 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
454 		for (i = 0; i < npages; i++, anon_index++) {
455 			ret = 0;
456 			anon_array_enter(amp, anon_index, &cookie);
457 			ap = anon_get_ptr(amp->ahp, anon_index);
458 			if (ap != NULL) {
459 				swap_xlate(ap, &vp, &off);
460 				anon_array_exit(&cookie);
461 				pp = page_lookup_nowait(vp, off, SE_SHARED);
462 				if (pp != NULL) {
463 					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
464 					page_unlock(pp);
465 				}
466 			} else {
467 				anon_array_exit(&cookie);
468 			}
469 			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
470 				ret |= SEG_PAGE_LOCKED;
471 			}
472 			*vec++ = (char)ret;
473 		}
474 		ANON_LOCK_EXIT(&amp->a_rwlock);
475 		return (len);
476 	}
477 }
478 
479 static int
480 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
481 {
482 	size_t share_size;
483 
484 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
485 
486 	/*
487 	 * seg.s_size may have been rounded up to the largest page size
488 	 * in shmat().
489 	 * XXX This should be cleaned up. sptdestroy should take a length
490 	 * argument which should be the same as sptcreate's. Then
491 	 * this rounding would not be needed (or would be done in shm.c).
492 	 * Only the check for the full segment would be needed.
493 	 *
494 	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
495 	 * to be useful at all.
496 	 */
497 	share_size = page_get_pagesize(seg->s_szc);
498 	ssize = P2ROUNDUP(ssize, share_size);
499 
500 	if (raddr == seg->s_base && ssize == seg->s_size) {
501 		seg_free(seg);
502 		return (0);
503 	} else
504 		return (EINVAL);
505 }
506 
507 int
508 segspt_create(struct seg **segpp, void *argsp)
509 {
510 	struct seg	*seg = *segpp;
511 	int		err;
512 	caddr_t		addr = seg->s_base;
513 	struct spt_data *sptd;
514 	struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
515 	struct anon_map *amp = sptcargs->amp;
516 	struct kshmid	*sp = amp->a_sp;
517 	struct	cred	*cred = CRED();
518 	ulong_t		i, j, anon_index = 0;
519 	pgcnt_t		npages = btopr(amp->size);
520 	struct vnode	*vp;
521 	page_t		**ppa;
522 	uint_t		hat_flags;
523 	size_t		pgsz;
524 	pgcnt_t		pgcnt;
525 	caddr_t		a;
526 	pgcnt_t		pidx;
527 	size_t		sz;
528 	proc_t		*procp = curproc;
529 	rctl_qty_t	lockedbytes = 0;
530 	kproject_t	*proj;
531 
532 	/*
533 	 * We are holding the a_lock on the underlying dummy as,
534 	 * so we can make calls to the HAT layer.
535 	 */
536 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
537 	ASSERT(sp != NULL);
538 
539 #ifdef DEBUG
540 	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
541 	    tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
542 #endif
543 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
544 		if (err = anon_swap_adjust(npages))
545 			return (err);
546 	}
547 	err = ENOMEM;
548 
549 	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
550 		goto out1;
551 
552 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
553 		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
554 		    KM_NOSLEEP)) == NULL)
555 			goto out2;
556 	}
557 
558 	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
559 
560 	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
561 		goto out3;
562 
563 	seg->s_ops = &segspt_ops;
564 	sptd->spt_vp = vp;
565 	sptd->spt_amp = amp;
566 	sptd->spt_prot = sptcargs->prot;
567 	sptd->spt_flags = sptcargs->flags;
568 	seg->s_data = (caddr_t)sptd;
569 	sptd->spt_ppa = NULL;
570 	sptd->spt_ppa_lckcnt = NULL;
571 	seg->s_szc = sptcargs->szc;
572 	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
573 	sptd->spt_gen = 0;
574 
575 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
576 	if (seg->s_szc > amp->a_szc) {
577 		amp->a_szc = seg->s_szc;
578 	}
579 	ANON_LOCK_EXIT(&amp->a_rwlock);
580 
581 	/*
582 	 * Set policy to affect initial allocation of pages in
583 	 * anon_map_createpages()
584 	 */
585 	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
586 	    NULL, 0, ptob(npages));
587 
588 	if (sptcargs->flags & SHM_PAGEABLE) {
589 		size_t  share_sz;
590 		pgcnt_t new_npgs, more_pgs;
591 		struct anon_hdr *nahp;
592 		zone_t *zone;
593 
594 		share_sz = page_get_pagesize(seg->s_szc);
595 		if (!IS_P2ALIGNED(amp->size, share_sz)) {
596 			/*
597 			 * We round the size of the anon array up to a
598 			 * 4M boundary because we always create 4M worth
599 			 * of pages when locking and faulting pages, so we
600 			 * don't have to check for corner cases, e.g.
601 			 * whether there is enough space to allocate a
602 			 * full 4M page.
603 			 */
604 			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
605 			more_pgs = new_npgs - npages;
606 
607 			/*
608 			 * The zone will never be NULL, as a fully created
609 			 * shm always has an owning zone.
610 			 */
611 			zone = sp->shm_perm.ipc_zone_ref.zref_zone;
612 			ASSERT(zone != NULL);
613 			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
614 				err = ENOMEM;
615 				goto out4;
616 			}
617 
618 			nahp = anon_create(new_npgs, ANON_SLEEP);
619 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
620 			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
621 			    ANON_SLEEP);
622 			anon_release(amp->ahp, npages);
623 			amp->ahp = nahp;
624 			ASSERT(amp->swresv == ptob(npages));
625 			amp->swresv = amp->size = ptob(new_npgs);
626 			ANON_LOCK_EXIT(&amp->a_rwlock);
627 			npages = new_npgs;
628 		}
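		/*
		 * Arithmetic sketch of the rounding above (illustrative
		 * values): with a 4M share_sz and an amp->size of 10M,
		 * P2ROUNDUP() yields 12M, so new_npgs = btop(12M) and
		 * more_pgs covers the extra 2M worth of anon slots whose
		 * swap is reserved against the owning zone.
		 */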
629 
630 		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
631 		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
632 		sptd->spt_pcachecnt = 0;
633 		sptd->spt_realsize = ptob(npages);
634 		sptcargs->seg_spt = seg;
635 		return (0);
636 	}
637 
638 	/*
639 	 * get array of pages for each anon slot in amp
640 	 */
641 	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
642 	    seg, addr, S_CREATE, cred)) != 0)
643 		goto out4;
644 
645 	mutex_enter(&sp->shm_mlock);
646 
647 	/* May be partially locked, so count bytes to charge for locking */
648 	for (i = 0; i < npages; i++)
649 		if (ppa[i]->p_lckcnt == 0)
650 			lockedbytes += PAGESIZE;
651 
652 	proj = sp->shm_perm.ipc_proj;
653 
654 	if (lockedbytes > 0) {
655 		mutex_enter(&procp->p_lock);
656 		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
657 			mutex_exit(&procp->p_lock);
658 			mutex_exit(&sp->shm_mlock);
659 			for (i = 0; i < npages; i++)
660 				page_unlock(ppa[i]);
661 			err = ENOMEM;
662 			goto out4;
663 		}
664 		mutex_exit(&procp->p_lock);
665 	}
666 
667 	/*
668 	 * addr is the initial address corresponding to the first page on ppa list
669 	 */
670 	for (i = 0; i < npages; i++) {
671 		/* attempt to lock all pages */
672 		if (page_pp_lock(ppa[i], 0, 1) == 0) {
673 			/*
674 			 * if unable to lock any page, unlock all
675 			 * of them and return error
676 			 */
677 			for (j = 0; j < i; j++)
678 				page_pp_unlock(ppa[j], 0, 1);
679 			for (i = 0; i < npages; i++)
680 				page_unlock(ppa[i]);
681 			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
682 			mutex_exit(&sp->shm_mlock);
683 			err = ENOMEM;
684 			goto out4;
685 		}
686 	}
687 	mutex_exit(&sp->shm_mlock);
688 
689 	/*
690 	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
691 	 * for the entire life of the segment, for example platforms
692 	 * that do not support Dynamic Reconfiguration.
693 	 */
694 	hat_flags = HAT_LOAD_SHARE;
695 	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
696 		hat_flags |= HAT_LOAD_LOCK;
697 
698 	/*
699 	 * Load translations one large page at a time
700 	 * to make sure we don't create mappings bigger than the
701 	 * segment's size code, in case the underlying pages
702 	 * are shared with a segvn segment that uses a bigger
703 	 * size code than we do.
704 	 */
705 	pgsz = page_get_pagesize(seg->s_szc);
706 	pgcnt = page_get_pagecnt(seg->s_szc);
707 	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
708 		sz = MIN(pgsz, ptob(npages - pidx));
709 		hat_memload_array(seg->s_as->a_hat, a, sz,
710 		    &ppa[pidx], sptd->spt_prot, hat_flags);
711 	}
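	/*
	 * Example of the loop above with illustrative sizes: if the
	 * segment's size code corresponds to 4M pages and PAGESIZE is 8K,
	 * then pgsz is 4M and pgcnt is 512, so each hat_memload_array()
	 * call maps at most 512 constituent pages, with the final call
	 * trimmed by the MIN() to whatever remains of the segment.
	 */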
712 
713 	/*
714 	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
715 	 * we will leave the pages locked SE_SHARED for the life
716 	 * of the ISM segment. This will prevent any calls to
717 	 * hat_pageunload() on this ISM segment for those platforms.
718 	 */
719 	if (!(hat_flags & HAT_LOAD_LOCK)) {
720 		/*
721 		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
722 		 * we no longer need to hold the SE_SHARED lock on the pages,
723 		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
724 		 * SE_SHARED lock on the pages as necessary.
725 		 */
726 		for (i = 0; i < npages; i++)
727 			page_unlock(ppa[i]);
728 	}
729 	sptd->spt_pcachecnt = 0;
730 	kmem_free(ppa, ((sizeof (page_t *)) * npages));
731 	sptd->spt_realsize = ptob(npages);
732 	atomic_add_long(&spt_used, npages);
733 	sptcargs->seg_spt = seg;
734 	return (0);
735 
736 out4:
737 	seg->s_data = NULL;
738 	kmem_free(vp, sizeof (*vp));
739 	cv_destroy(&sptd->spt_cv);
740 out3:
741 	mutex_destroy(&sptd->spt_lock);
742 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
743 		kmem_free(ppa, (sizeof (*ppa) * npages));
744 out2:
745 	kmem_free(sptd, sizeof (*sptd));
746 out1:
747 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
748 		anon_swap_restore(npages);
749 	return (err);
750 }
751 
752 /*ARGSUSED*/
753 void
754 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
755 {
756 	struct page	*pp;
757 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
758 	pgcnt_t		npages;
759 	ulong_t		anon_idx;
760 	struct anon_map *amp;
761 	struct anon	*ap;
762 	struct vnode	*vp;
763 	u_offset_t	off;
764 	uint_t		hat_flags;
765 	int		root = 0;
766 	pgcnt_t		pgs, curnpgs = 0;
767 	page_t		*rootpp;
768 	rctl_qty_t	unlocked_bytes = 0;
769 	kproject_t	*proj;
770 	kshmid_t	*sp;
771 
772 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
773 
774 	len = P2ROUNDUP(len, PAGESIZE);
775 
776 	npages = btop(len);
777 
778 	hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
779 	if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
780 	    (sptd->spt_flags & SHM_PAGEABLE)) {
781 		hat_flags = HAT_UNLOAD_UNMAP;
782 	}
783 
784 	hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
785 
786 	amp = sptd->spt_amp;
787 	if (sptd->spt_flags & SHM_PAGEABLE)
788 		npages = btop(amp->size);
789 
790 	ASSERT(amp != NULL);
791 
792 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
793 		sp = amp->a_sp;
794 		proj = sp->shm_perm.ipc_proj;
795 		mutex_enter(&sp->shm_mlock);
796 	}
797 	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
798 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
799 			if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
800 				panic("segspt_free_pages: null app");
801 				/*NOTREACHED*/
802 			}
803 		} else {
804 			if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
805 			    == NULL)
806 				continue;
807 		}
808 		ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
809 		swap_xlate(ap, &vp, &off);
810 
811 		/*
812 		 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
813 		 * the pages are not held SE_SHARED locked at this
814 		 * point.
815 		 *
816 		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
817 		 * the pages are still held SE_SHARED locked from the
818 		 * original segspt_create()
819 		 *
820 		 * Our goal is to get SE_EXCL lock on each page, remove
821 		 * permanent lock on it and invalidate the page.
822 		 */
823 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
824 			if (hat_flags == HAT_UNLOAD_UNMAP)
825 				pp = page_lookup(vp, off, SE_EXCL);
826 			else {
827 				if ((pp = page_find(vp, off)) == NULL) {
828 					panic("segspt_free_pages: "
829 					    "page not locked");
830 					/*NOTREACHED*/
831 				}
832 				if (!page_tryupgrade(pp)) {
833 					page_unlock(pp);
834 					pp = page_lookup(vp, off, SE_EXCL);
835 				}
836 			}
837 			if (pp == NULL) {
838 				panic("segspt_free_pages: "
839 				    "page not in the system");
840 				/*NOTREACHED*/
841 			}
842 			ASSERT(pp->p_lckcnt > 0);
843 			page_pp_unlock(pp, 0, 1);
844 			if (pp->p_lckcnt == 0)
845 				unlocked_bytes += PAGESIZE;
846 		} else {
847 			if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
848 				continue;
849 		}
850 		/*
851 		 * It's logical to invalidate the pages here as in most cases
852 		 * these were created by segspt.
853 		 */
854 		if (pp->p_szc != 0) {
855 			if (root == 0) {
856 				ASSERT(curnpgs == 0);
857 				root = 1;
858 				rootpp = pp;
859 				pgs = curnpgs = page_get_pagecnt(pp->p_szc);
860 				ASSERT(pgs > 1);
861 				ASSERT(IS_P2ALIGNED(pgs, pgs));
862 				ASSERT(!(page_pptonum(pp) & (pgs - 1)));
863 				curnpgs--;
864 			} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
865 				ASSERT(curnpgs == 1);
866 				ASSERT(page_pptonum(pp) ==
867 				    page_pptonum(rootpp) + (pgs - 1));
868 				page_destroy_pages(rootpp);
869 				root = 0;
870 				curnpgs = 0;
871 			} else {
872 				ASSERT(curnpgs > 1);
873 				ASSERT(page_pptonum(pp) ==
874 				    page_pptonum(rootpp) + (pgs - curnpgs));
875 				curnpgs--;
876 			}
877 		} else {
878 			if (root != 0 || curnpgs != 0) {
879 				panic("segspt_free_pages: bad large page");
880 				/*NOTREACHED*/
881 			}
882 			/*
883 			 * Before destroying the pages, we need to take care
884 			 * of the rctl locked memory accounting. For that
885 			 * we need to calculate the unlocked_bytes.
886 			 */
887 			if (pp->p_lckcnt > 0)
888 				unlocked_bytes += PAGESIZE;
889 			/*LINTED: constant in conditional context */
890 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
891 		}
892 	}
893 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
894 		if (unlocked_bytes > 0)
895 			rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
896 		mutex_exit(&sp->shm_mlock);
897 	}
898 	if (root != 0 || curnpgs != 0) {
899 		panic("segspt_free_pages: bad large page");
900 		/*NOTREACHED*/
901 	}
902 
903 	/*
904 	 * mark that pages have been released
905 	 */
906 	sptd->spt_realsize = 0;
907 
908 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
909 		atomic_add_long(&spt_used, -npages);
910 		anon_swap_restore(npages);
911 	}
912 }
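/*
 * Worked example of the large-page teardown bookkeeping above
 * (illustrative count): if page_get_pagecnt(pp->p_szc) is 8, the first
 * constituent page seen sets rootpp and curnpgs = 8 (immediately
 * decremented to 7), the next six constituents decrement curnpgs to 1,
 * and the final constituent (page number rootpp + 7) triggers
 * page_destroy_pages(rootpp) and resets root and curnpgs to 0.
 */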
913 
914 /*
915  * Get memory allocation policy info for specified address in given segment
916  */
917 static lgrp_mem_policy_info_t *
918 segspt_getpolicy(struct seg *seg, caddr_t addr)
919 {
920 	struct anon_map		*amp;
921 	ulong_t			anon_index;
922 	lgrp_mem_policy_info_t	*policy_info;
923 	struct spt_data		*spt_data;
924 
925 	ASSERT(seg != NULL);
926 
927 	/*
928 	 * Get anon_map from segspt
929 	 *
930 	 * Assume that no lock needs to be held on anon_map, since
931 	 * it should be protected by its reference count which must be
932 	 * nonzero for an existing segment
933 	 * Need to grab readers lock on policy tree though
934 	 */
935 	spt_data = (struct spt_data *)seg->s_data;
936 	if (spt_data == NULL)
937 		return (NULL);
938 	amp = spt_data->spt_amp;
939 	ASSERT(amp->refcnt != 0);
940 
941 	/*
942 	 * Get policy info
943 	 *
944 	 * Assume starting anon index of 0
945 	 */
946 	anon_index = seg_page(seg, addr);
947 	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
948 
949 	return (policy_info);
950 }
951 
952 /*
953  * DISM only.
954  * Return locked pages over a given range.
955  *
956  * We will cache all DISM locked pages and save the pplist for the
957  * entire segment in the ppa field of the underlying DISM segment structure.
958  * Later, during a call to segspt_reclaim() we will use this ppa array
959  * to page_unlock() all of the pages and then we will free this ppa list.
960  */
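/*
 * Calling-convention sketch (hedged; the authoritative flow lives in the
 * as layer, not here): as_pagelock() is expected to call this entry with
 * type == L_PAGELOCK to obtain the cached page array and later with
 * type == L_PAGEUNLOCK to drop its reference; an ENOTSUP return tells
 * the caller to fall back to the slower F_SOFTLOCK fault path.
 */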
961 /*ARGSUSED*/
962 static int
963 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
964     struct page ***ppp, enum lock_type type, enum seg_rw rw)
965 {
966 	struct  shm_data *shmd = (struct shm_data *)seg->s_data;
967 	struct  seg	*sptseg = shmd->shm_sptseg;
968 	struct  spt_data *sptd = sptseg->s_data;
969 	pgcnt_t pg_idx, npages, tot_npages, npgs;
970 	struct  page **pplist, **pl, **ppa, *pp;
971 	struct  anon_map *amp;
972 	spgcnt_t	an_idx;
973 	int	ret = ENOTSUP;
974 	uint_t	pl_built = 0;
975 	struct  anon *ap;
976 	struct  vnode *vp;
977 	u_offset_t off;
978 	pgcnt_t claim_availrmem = 0;
979 	uint_t	szc;
980 
981 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
982 	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
983 
984 	/*
985 	 * We want to lock/unlock the entire ISM segment. Therefore,
986 	 * we will be using the underlying sptseg and its base address
987 	 * and length for the caching arguments.
988 	 */
989 	ASSERT(sptseg);
990 	ASSERT(sptd);
991 
992 	pg_idx = seg_page(seg, addr);
993 	npages = btopr(len);
994 
995 	/*
996 	 * check if the request is larger than number of pages covered
997 	 * by amp
998 	 */
999 	if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
1000 		*ppp = NULL;
1001 		return (ENOTSUP);
1002 	}
1003 
1004 	if (type == L_PAGEUNLOCK) {
1005 		ASSERT(sptd->spt_ppa != NULL);
1006 
1007 		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1008 		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1009 
1010 		/*
1011 		 * If someone is blocked while unmapping, we purge
1012 		 * segment page cache and thus reclaim pplist synchronously
1013 		 * without waiting for seg_pasync_thread. This speeds up
1014 		 * unmapping in cases where munmap(2) is called, while
1015 		 * raw async i/o is still in progress or where a thread
1016 		 * exits on data fault in a multithreaded application.
1017 		 */
1018 		if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
1019 		    (AS_ISUNMAPWAIT(seg->s_as) &&
1020 		    shmd->shm_softlockcnt > 0)) {
1021 			segspt_purge(seg);
1022 		}
1023 		return (0);
1024 	}
1025 
1026 	/* The L_PAGELOCK case ... */
1027 
1028 	if (sptd->spt_flags & DISM_PPA_CHANGED) {
1029 		segspt_purge(seg);
1030 		/*
1031 		 * For DISM the ppa needs to be rebuilt since
1032 		 * the number of locked pages could have changed.
1033 		 */
1034 		*ppp = NULL;
1035 		return (ENOTSUP);
1036 	}
1037 
1038 	/*
1039 	 * First try to find pages in segment page cache, without
1040 	 * holding the segment lock.
1041 	 */
1042 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1043 	    S_WRITE, SEGP_FORCE_WIRED);
1044 	if (pplist != NULL) {
1045 		ASSERT(sptd->spt_ppa != NULL);
1046 		ASSERT(sptd->spt_ppa == pplist);
1047 		ppa = sptd->spt_ppa;
1048 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1049 			if (ppa[an_idx] == NULL) {
1050 				seg_pinactive(seg, NULL, seg->s_base,
1051 				    sptd->spt_amp->size, ppa,
1052 				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1053 				*ppp = NULL;
1054 				return (ENOTSUP);
1055 			}
1056 			if ((szc = ppa[an_idx]->p_szc) != 0) {
1057 				npgs = page_get_pagecnt(szc);
1058 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
1059 			} else {
1060 				an_idx++;
1061 			}
1062 		}
1063 		/*
1064 		 * Since we cache the entire DISM segment, we want to
1065 		 * set ppp to point to the first slot that corresponds
1066 		 * to the requested addr, i.e. pg_idx.
1067 		 */
1068 		*ppp = &(sptd->spt_ppa[pg_idx]);
1069 		return (0);
1070 	}
1071 
1072 	mutex_enter(&sptd->spt_lock);
1073 	/*
1074 	 * try to find pages in segment page cache with mutex
1075 	 */
1076 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1077 	    S_WRITE, SEGP_FORCE_WIRED);
1078 	if (pplist != NULL) {
1079 		ASSERT(sptd->spt_ppa != NULL);
1080 		ASSERT(sptd->spt_ppa == pplist);
1081 		ppa = sptd->spt_ppa;
1082 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1083 			if (ppa[an_idx] == NULL) {
1084 				mutex_exit(&sptd->spt_lock);
1085 				seg_pinactive(seg, NULL, seg->s_base,
1086 				    sptd->spt_amp->size, ppa,
1087 				    S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1088 				*ppp = NULL;
1089 				return (ENOTSUP);
1090 			}
1091 			if ((szc = ppa[an_idx]->p_szc) != 0) {
1092 				npgs = page_get_pagecnt(szc);
1093 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
1094 			} else {
1095 				an_idx++;
1096 			}
1097 		}
1098 		/*
1099 		 * Since we cache the entire DISM segment, we want to
1100 		 * set ppp to point to the first slot that corresponds
1101 		 * to the requested addr, i.e. pg_idx.
1102 		 */
1103 		mutex_exit(&sptd->spt_lock);
1104 		*ppp = &(sptd->spt_ppa[pg_idx]);
1105 		return (0);
1106 	}
1107 	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1108 	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
1109 		mutex_exit(&sptd->spt_lock);
1110 		*ppp = NULL;
1111 		return (ENOTSUP);
1112 	}
1113 
1114 	/*
1115 	 * No need to worry about protections because DISM pages are always rw.
1116 	 */
1117 	pl = pplist = NULL;
1118 	amp = sptd->spt_amp;
1119 
1120 	/*
1121 	 * Do we need to build the ppa array?
1122 	 */
1123 	if (sptd->spt_ppa == NULL) {
1124 		pgcnt_t lpg_cnt = 0;
1125 
1126 		pl_built = 1;
1127 		tot_npages = btopr(sptd->spt_amp->size);
1128 
1129 		ASSERT(sptd->spt_pcachecnt == 0);
1130 		pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
1131 		pl = pplist;
1132 
1133 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1134 		for (an_idx = 0; an_idx < tot_npages; ) {
1135 			ap = anon_get_ptr(amp->ahp, an_idx);
1136 			/*
1137 			 * Cache only mlocked pages. For large pages,
1138 			 * if one (constituent) page is mlocked then
1139 			 * all pages for that large page
1140 			 * are cached also. This is for quick
1141 			 * lookups of the ppa array.
1142 			 */
1143 			if ((ap != NULL) && (lpg_cnt != 0 ||
1144 			    (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1145 
1146 				swap_xlate(ap, &vp, &off);
1147 				pp = page_lookup(vp, off, SE_SHARED);
1148 				ASSERT(pp != NULL);
1149 				if (lpg_cnt == 0) {
1150 					lpg_cnt++;
1151 					/*
1152 					 * For a small page, we are done --
1153 					 * lpg_cnt is reset to 0 below.
1154 					 *
1155 					 * For a large page, we are guaranteed
1156 					 * to find the anon structures of all
1157 					 * constituent pages and a non-zero
1158 					 * lpg_cnt ensures that we don't test
1159 					 * for mlock for these. We are done
1160 					 * when lpg_cnt reaches (npgs + 1).
1161 					 * If we are not the first constituent
1162 					 * page, restart at the first one.
1163 					 */
1164 					npgs = page_get_pagecnt(pp->p_szc);
1165 					if (!IS_P2ALIGNED(an_idx, npgs)) {
1166 						an_idx = P2ALIGN(an_idx, npgs);
1167 						page_unlock(pp);
1168 						continue;
1169 					}
1170 				}
1171 				if (++lpg_cnt > npgs)
1172 					lpg_cnt = 0;
1173 
1174 				/*
1175 				 * availrmem is decremented only
1176 				 * for unlocked pages
1177 				 */
1178 				if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1179 					claim_availrmem++;
1180 				pplist[an_idx] = pp;
1181 			}
1182 			an_idx++;
1183 		}
1184 		ANON_LOCK_EXIT(&amp->a_rwlock);
1185 
1186 		if (claim_availrmem) {
1187 			mutex_enter(&freemem_lock);
1188 			if (availrmem < tune.t_minarmem + claim_availrmem) {
1189 				mutex_exit(&freemem_lock);
1190 				ret = ENOTSUP;
1191 				claim_availrmem = 0;
1192 				goto insert_fail;
1193 			} else {
1194 				availrmem -= claim_availrmem;
1195 			}
1196 			mutex_exit(&freemem_lock);
1197 		}
1198 
1199 		sptd->spt_ppa = pl;
1200 	} else {
1201 		/*
1202 		 * We already have a valid ppa[].
1203 		 */
1204 		pl = sptd->spt_ppa;
1205 	}
1206 
1207 	ASSERT(pl != NULL);
1208 
1209 	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1210 	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1211 	    segspt_reclaim);
1212 	if (ret == SEGP_FAIL) {
1213 		/*
1214 		 * seg_pinsert failed. We return
1215 		 * ENOTSUP, so that the as_pagelock() code will
1216 		 * then try the slower F_SOFTLOCK path.
1217 		 */
1218 		if (pl_built) {
1219 			/*
1220 			 * No one else has referenced the ppa[].
1221 			 * We created it and we need to destroy it.
1222 			 */
1223 			sptd->spt_ppa = NULL;
1224 		}
1225 		ret = ENOTSUP;
1226 		goto insert_fail;
1227 	}
1228 
1229 	/*
1230 	 * In either case, we increment softlockcnt on the 'real' segment.
1231 	 */
1232 	sptd->spt_pcachecnt++;
1233 	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1234 
1235 	ppa = sptd->spt_ppa;
1236 	for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1237 		if (ppa[an_idx] == NULL) {
1238 			mutex_exit(&sptd->spt_lock);
1239 			seg_pinactive(seg, NULL, seg->s_base,
1240 			    sptd->spt_amp->size,
1241 			    pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1242 			*ppp = NULL;
1243 			return (ENOTSUP);
1244 		}
1245 		if ((szc = ppa[an_idx]->p_szc) != 0) {
1246 			npgs = page_get_pagecnt(szc);
1247 			an_idx = P2ROUNDUP(an_idx + 1, npgs);
1248 		} else {
1249 			an_idx++;
1250 		}
1251 	}
1252 	/*
1253 	 * We can now drop the sptd->spt_lock since the ppa[]
1254 	 * exists and we have incremented pcachecnt.
1255 	 */
1256 	mutex_exit(&sptd->spt_lock);
1257 
1258 	/*
1259 	 * Since we cache the entire segment, we want to
1260 	 * set ppp to point to the first slot that corresponds
1261 	 * to the requested addr, i.e. pg_idx.
1262 	 */
1263 	*ppp = &(sptd->spt_ppa[pg_idx]);
1264 	return (0);
1265 
1266 insert_fail:
1267 	/*
1268 	 * We will only reach this code if we tried and failed.
1269 	 *
1270 	 * And we can drop the lock on the dummy seg, once we've failed
1271 	 * to set up a new ppa[].
1272 	 */
1273 	mutex_exit(&sptd->spt_lock);
1274 
1275 	if (pl_built) {
1276 		if (claim_availrmem) {
1277 			mutex_enter(&freemem_lock);
1278 			availrmem += claim_availrmem;
1279 			mutex_exit(&freemem_lock);
1280 		}
1281 
1282 		/*
1283 		 * We created pl and we need to destroy it.
1284 		 */
1285 		pplist = pl;
1286 		for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1287 			if (pplist[an_idx] != NULL)
1288 				page_unlock(pplist[an_idx]);
1289 		}
1290 		kmem_free(pl, sizeof (page_t *) * tot_npages);
1291 	}
1292 
1293 	if (shmd->shm_softlockcnt <= 0) {
1294 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1295 			mutex_enter(&seg->s_as->a_contents);
1296 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1297 				AS_CLRUNMAPWAIT(seg->s_as);
1298 				cv_broadcast(&seg->s_as->a_cv);
1299 			}
1300 			mutex_exit(&seg->s_as->a_contents);
1301 		}
1302 	}
1303 	*ppp = NULL;
1304 	return (ret);
1305 }
1306 
1307 
1308 
1309 /*
1310  * return locked pages over a given range.
1311  *
1312  * We will cache the entire ISM segment and save the pplist for the
1313  * entire segment in the ppa field of the underlying ISM segment structure.
1314  * Later, during a call to segspt_reclaim() we will use this ppa array
1315  * to page_unlock() all of the pages and then we will free this ppa list.
1316  */
1317 /*ARGSUSED*/
1318 static int
1319 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1320     struct page ***ppp, enum lock_type type, enum seg_rw rw)
1321 {
1322 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1323 	struct seg	*sptseg = shmd->shm_sptseg;
1324 	struct spt_data *sptd = sptseg->s_data;
1325 	pgcnt_t np, page_index, npages;
1326 	caddr_t a, spt_base;
1327 	struct page **pplist, **pl, *pp;
1328 	struct anon_map *amp;
1329 	ulong_t anon_index;
1330 	int ret = ENOTSUP;
1331 	uint_t	pl_built = 0;
1332 	struct anon *ap;
1333 	struct vnode *vp;
1334 	u_offset_t off;
1335 
1336 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1337 	ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1338 
1339 
1340 	/*
1341 	 * We want to lock/unlock the entire ISM segment. Therefore,
1342 	 * we will be using the underlying sptseg and its base address
1343 	 * and length for the caching arguments.
1344 	 */
1345 	ASSERT(sptseg);
1346 	ASSERT(sptd);
1347 
1348 	if (sptd->spt_flags & SHM_PAGEABLE) {
1349 		return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1350 	}
1351 
1352 	page_index = seg_page(seg, addr);
1353 	npages = btopr(len);
1354 
1355 	/*
1356 	 * check if the request is larger than number of pages covered
1357 	 * by amp
1358 	 */
1359 	if (page_index + npages > btopr(sptd->spt_amp->size)) {
1360 		*ppp = NULL;
1361 		return (ENOTSUP);
1362 	}
1363 
1364 	if (type == L_PAGEUNLOCK) {
1365 
1366 		ASSERT(sptd->spt_ppa != NULL);
1367 
1368 		seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1369 		    sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1370 
1371 		/*
1372 		 * If someone is blocked while unmapping, we purge
1373 		 * segment page cache and thus reclaim pplist synchronously
1374 		 * without waiting for seg_pasync_thread. This speeds up
1375 		 * unmapping in cases where munmap(2) is called, while
1376 		 * raw async i/o is still in progress or where a thread
1377 		 * exits on data fault in a multithreaded application.
1378 		 */
1379 		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1380 			segspt_purge(seg);
1381 		}
1382 		return (0);
1383 	}
1384 
1385 	/* The L_PAGELOCK case... */
1386 
1387 	/*
1388 	 * First try to find pages in segment page cache, without
1389 	 * holding the segment lock.
1390 	 */
1391 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1392 	    S_WRITE, SEGP_FORCE_WIRED);
1393 	if (pplist != NULL) {
1394 		ASSERT(sptd->spt_ppa == pplist);
1395 		ASSERT(sptd->spt_ppa[page_index]);
1396 		/*
1397 		 * Since we cache the entire ISM segment, we want to
1398 		 * set ppp to point to the first slot that corresponds
1399 		 * to the requested addr, i.e. page_index.
1400 		 */
1401 		*ppp = &(sptd->spt_ppa[page_index]);
1402 		return (0);
1403 	}
1404 
1405 	mutex_enter(&sptd->spt_lock);
1406 
1407 	/*
1408 	 * try to find pages in segment page cache
1409 	 */
1410 	pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1411 	    S_WRITE, SEGP_FORCE_WIRED);
1412 	if (pplist != NULL) {
1413 		ASSERT(sptd->spt_ppa == pplist);
1414 		/*
1415 		 * Since we cache the entire segment, we want to
1416 		 * set ppp to point to the first slot that corresponds
1417 		 * to the requested addr, i.e. page_index.
1418 		 */
1419 		mutex_exit(&sptd->spt_lock);
1420 		*ppp = &(sptd->spt_ppa[page_index]);
1421 		return (0);
1422 	}
1423 
1424 	if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1425 	    SEGP_FORCE_WIRED) == SEGP_FAIL) {
1426 		mutex_exit(&sptd->spt_lock);
1427 		*ppp = NULL;
1428 		return (ENOTSUP);
1429 	}
1430 
1431 	/*
1432 	 * No need to worry about protections because ISM pages
1433 	 * are always rw.
1434 	 */
1435 	pl = pplist = NULL;
1436 
1437 	/*
1438 	 * Do we need to build the ppa array?
1439 	 */
1440 	if (sptd->spt_ppa == NULL) {
1441 		ASSERT(sptd->spt_ppa == pplist);
1442 
1443 		spt_base = sptseg->s_base;
1444 		pl_built = 1;
1445 
1446 		/*
1447 		 * availrmem is decremented once during anon_swap_adjust()
1448 		 * and is incremented during the anon_unresv(), which is
1449 		 * called from shm_rm_amp() when the segment is destroyed.
1450 		 */
1451 		amp = sptd->spt_amp;
1452 		ASSERT(amp != NULL);
1453 
1454 		/* pcachecnt is protected by sptd->spt_lock */
1455 		ASSERT(sptd->spt_pcachecnt == 0);
1456 		pplist = kmem_zalloc(sizeof (page_t *)
1457 		    * btopr(sptd->spt_amp->size), KM_SLEEP);
1458 		pl = pplist;
1459 
1460 		anon_index = seg_page(sptseg, spt_base);
1461 
1462 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1463 		for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1464 		    a += PAGESIZE, anon_index++, pplist++) {
1465 			ap = anon_get_ptr(amp->ahp, anon_index);
1466 			ASSERT(ap != NULL);
1467 			swap_xlate(ap, &vp, &off);
1468 			pp = page_lookup(vp, off, SE_SHARED);
1469 			ASSERT(pp != NULL);
1470 			*pplist = pp;
1471 		}
1472 		ANON_LOCK_EXIT(&amp->a_rwlock);
1473 
1474 		if (a < (spt_base + sptd->spt_amp->size)) {
1475 			ret = ENOTSUP;
1476 			goto insert_fail;
1477 		}
1478 		sptd->spt_ppa = pl;
1479 	} else {
1480 		/*
1481 		 * We already have a valid ppa[].
1482 		 */
1483 		pl = sptd->spt_ppa;
1484 	}
1485 
1486 	ASSERT(pl != NULL);
1487 
1488 	ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1489 	    sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1490 	    segspt_reclaim);
1491 	if (ret == SEGP_FAIL) {
1492 		/*
1493 		 * seg_pinsert failed. We return
1494 		 * ENOTSUP, so that the as_pagelock() code will
1495 		 * then try the slower F_SOFTLOCK path.
1496 		 */
1497 		if (pl_built) {
1498 			/*
1499 			 * No one else has referenced the ppa[].
1500 			 * We created it and we need to destroy it.
1501 			 */
1502 			sptd->spt_ppa = NULL;
1503 		}
1504 		ret = ENOTSUP;
1505 		goto insert_fail;
1506 	}
1507 
1508 	/*
1509 	 * In either case, we increment softlockcnt on the 'real' segment.
1510 	 */
1511 	sptd->spt_pcachecnt++;
1512 	atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1513 
1514 	/*
1515 	 * We can now drop the sptd->spt_lock since the ppa[]
1516 	 * exists and we have incremented pcachecnt.
1517 	 */
1518 	mutex_exit(&sptd->spt_lock);
1519 
1520 	/*
1521 	 * Since we cache the entire segment, we want to
1522 	 * set ppp to point to the first slot that corresponds
1523 	 * to the requested addr, i.e. page_index.
1524 	 */
1525 	*ppp = &(sptd->spt_ppa[page_index]);
1526 	return (0);
1527 
1528 insert_fail:
1529 	/*
1530 	 * We will only reach this code if we tried and failed.
1531 	 *
1532 	 * And we can drop the lock on the dummy seg, once we've failed
1533 	 * to set up a new ppa[].
1534 	 */
1535 	mutex_exit(&sptd->spt_lock);
1536 
1537 	if (pl_built) {
1538 		/*
1539 		 * We created pl and we need to destroy it.
1540 		 */
1541 		pplist = pl;
1542 		np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1543 		while (np) {
1544 			page_unlock(*pplist);
1545 			np--;
1546 			pplist++;
1547 		}
1548 		kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1549 	}
1550 	if (shmd->shm_softlockcnt <= 0) {
1551 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1552 			mutex_enter(&seg->s_as->a_contents);
1553 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1554 				AS_CLRUNMAPWAIT(seg->s_as);
1555 				cv_broadcast(&seg->s_as->a_cv);
1556 			}
1557 			mutex_exit(&seg->s_as->a_contents);
1558 		}
1559 	}
1560 	*ppp = NULL;
1561 	return (ret);
1562 }
1563 
1564 /*
1565  * purge any cached pages in the I/O page cache
1566  */
1567 static void
1568 segspt_purge(struct seg *seg)
1569 {
1570 	seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1571 }
1572 
1573 static int
1574 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1575     enum seg_rw rw, int async)
1576 {
1577 	struct seg *seg = (struct seg *)ptag;
1578 	struct	shm_data *shmd = (struct shm_data *)seg->s_data;
1579 	struct	seg	*sptseg;
1580 	struct	spt_data *sptd;
1581 	pgcnt_t npages, i, free_availrmem = 0;
1582 	int	done = 0;
1583 
1584 #ifdef lint
1585 	addr = addr;
1586 #endif
1587 	sptseg = shmd->shm_sptseg;
1588 	sptd = sptseg->s_data;
1589 	npages = (len >> PAGESHIFT);
1590 	ASSERT(npages);
1591 	ASSERT(sptd->spt_pcachecnt != 0);
1592 	ASSERT(sptd->spt_ppa == pplist);
1593 	ASSERT(npages == btopr(sptd->spt_amp->size));
1594 	ASSERT(async || AS_LOCK_HELD(seg->s_as));
1595 
1596 	/*
1597 	 * Acquire the lock on the dummy seg and destroy the
1598 	 * ppa array IF this is the last pcachecnt.
1599 	 */
1600 	mutex_enter(&sptd->spt_lock);
1601 	if (--sptd->spt_pcachecnt == 0) {
1602 		for (i = 0; i < npages; i++) {
1603 			if (pplist[i] == NULL) {
1604 				continue;
1605 			}
1606 			if (rw == S_WRITE) {
1607 				hat_setrefmod(pplist[i]);
1608 			} else {
1609 				hat_setref(pplist[i]);
1610 			}
1611 			if ((sptd->spt_flags & SHM_PAGEABLE) &&
1612 			    (sptd->spt_ppa_lckcnt[i] == 0))
1613 				free_availrmem++;
1614 			page_unlock(pplist[i]);
1615 		}
1616 		if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1617 			mutex_enter(&freemem_lock);
1618 			availrmem += free_availrmem;
1619 			mutex_exit(&freemem_lock);
1620 		}
1621 		/*
1622 		 * Since we want to cache/uncache the entire ISM segment,
1623 		 * we will track the pplist in a segspt-specific field,
1624 		 * ppa, which is initialized at the time we add an entry to
1625 		 * the cache.
1626 		 */
1627 		ASSERT(sptd->spt_pcachecnt == 0);
1628 		kmem_free(pplist, sizeof (page_t *) * npages);
1629 		sptd->spt_ppa = NULL;
1630 		sptd->spt_flags &= ~DISM_PPA_CHANGED;
1631 		sptd->spt_gen++;
1632 		cv_broadcast(&sptd->spt_cv);
1633 		done = 1;
1634 	}
1635 	mutex_exit(&sptd->spt_lock);
1636 
1637 	/*
1638 	 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1639 	 * may not hold AS lock (in this case async argument is not 0). This
1640 	 * means that if softlockcnt drops to 0 after the decrement below, the
1641 	 * address space may get freed. We can't allow that since after the
1642 	 * softlock count drops to 0 we still need to access the as structure
1643 	 * for a possible wakeup of unmap waiters. To prevent this, we take
1644 	 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1645 	 * this mutex as a barrier to make sure this routine completes before
1646 	 * segment is freed.
1647 	 *
1648 	 * The second complication we have to deal with in the async case is
1649 	 * the possibility of a missed wake up of an unmap wait thread. When
1650 	 * we don't hold the as lock here we may take the a_contents lock
1651 	 * before an unmap wait thread that was first to see that softlockcnt
1652 	 * was still not 0. As a result we would fail to wake it up. To avoid
1653 	 * this race we set the nounmapwait flag in the as structure if we
1654 	 * drop softlockcnt to 0 while async is not 0; the unmapwait thread
1655 	 * will not block if this flag is set.
1656 	 */
1657 	if (async)
1658 		mutex_enter(&shmd->shm_segfree_syncmtx);
1659 
1660 	/*
1661 	 * Now decrement softlockcnt.
1662 	 */
1663 	ASSERT(shmd->shm_softlockcnt > 0);
1664 	atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1665 
1666 	if (shmd->shm_softlockcnt <= 0) {
1667 		if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1668 			mutex_enter(&seg->s_as->a_contents);
1669 			if (async)
1670 				AS_SETNOUNMAPWAIT(seg->s_as);
1671 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1672 				AS_CLRUNMAPWAIT(seg->s_as);
1673 				cv_broadcast(&seg->s_as->a_cv);
1674 			}
1675 			mutex_exit(&seg->s_as->a_contents);
1676 		}
1677 	}
1678 
1679 	if (async)
1680 		mutex_exit(&shmd->shm_segfree_syncmtx);
1681 
1682 	return (done);
1683 }
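/*
 * Shape of the reclaim callback invocation (a sketch inferred from the
 * seg_pinsert()/seg_pinactive() calls in this file; the pcache layer
 * itself lives elsewhere):
 *
 *	(void) segspt_reclaim((void *)seg, seg->s_base,
 *	    sptd->spt_amp->size, sptd->spt_ppa, S_WRITE, async);
 *
 * where async is nonzero when invoked from the pcache async thread or
 * via seg_ppurge_wiredpp(), in which case the AS lock may not be held.
 */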
1684 
1685 /*
1686  * Do a F_SOFTUNLOCK call over the range requested.
1687  * The range must have already been F_SOFTLOCK'ed.
1688  *
1689  * The calls to acquire and release the anon map lock mutex were
1690  * removed in order to avoid a deadly embrace during a DR
1691  * memory delete operation.  (E.g., DR blocks while waiting for an
1692  * exclusive lock on a page that is being used for kaio; the
1693  * thread that will complete the kaio and call segspt_softunlock
1694  * blocks on the anon map lock; another thread holding the anon
1695  * map lock blocks on another page lock via the segspt_shmfault
1696  * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1697  *
1698  * The appropriateness of the removal is based upon the following:
1699  * 1. If we are holding a segment's reader lock and the page is held
1700  * shared, then the corresponding element in anonmap which points to
1701  * anon struct cannot change and there is no need to acquire the
1702  * anonymous map lock.
1703  * 2. Threads in segspt_softunlock have a reader lock on the segment
1704  * and already have the shared page lock, so we are guaranteed that
1705  * the anon map slot cannot change and therefore can call anon_get_ptr()
1706  * without grabbing the anonymous map lock.
1707  * 3. Threads that softlock a shared page break copy-on-write, even if
1708  * it's a read.  Thus cow faults can be ignored with respect to soft
1709  * unlocking, since the breaking of cow means that the anon slot(s) will
1710  * not be shared.
1711  */
1712 static void
1713 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1714     size_t len, enum seg_rw rw)
1715 {
1716 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1717 	struct seg	*sptseg;
1718 	struct spt_data *sptd;
1719 	page_t *pp;
1720 	caddr_t adr;
1721 	struct vnode *vp;
1722 	u_offset_t offset;
1723 	ulong_t anon_index;
1724 	struct anon_map *amp;		/* XXX - for locknest */
1725 	struct anon *ap = NULL;
1726 	pgcnt_t npages;
1727 
1728 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1729 
1730 	sptseg = shmd->shm_sptseg;
1731 	sptd = sptseg->s_data;
1732 
1733 	/*
1734 	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1735 	 * and therefore their pages are SE_SHARED locked
1736 	 * for the entire life of the segment.
1737 	 */
1738 	if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1739 	    ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1740 		goto softlock_decrement;
1741 	}
1742 
1743 	/*
1744 	 * Any thread is free to do a page_find and
1745 	 * page_unlock() on the pages within this seg.
1746 	 *
1747 	 * We are already holding the as->a_lock on the user's
1748 	 * real segment, but we need to hold the a_lock on the
1749 	 * underlying dummy as. This is mostly to satisfy the
1750 	 * underlying HAT layer.
1751 	 */
1752 	AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1753 	hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1754 	AS_LOCK_EXIT(sptseg->s_as);
1755 
1756 	amp = sptd->spt_amp;
1757 	ASSERT(amp != NULL);
1758 	anon_index = seg_page(sptseg, sptseg_addr);
1759 
1760 	for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1761 		ap = anon_get_ptr(amp->ahp, anon_index++);
1762 		ASSERT(ap != NULL);
1763 		swap_xlate(ap, &vp, &offset);
1764 
1765 		/*
1766 		 * Use page_find() instead of page_lookup() to
1767 		 * find the page since we know that it has a
1768 		 * "shared" lock.
1769 		 */
1770 		pp = page_find(vp, offset);
1771 		ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1772 		if (pp == NULL) {
1773 			panic("segspt_softunlock: "
1774 			    "addr %p, ap %p, vp %p, off %llx",
1775 			    (void *)adr, (void *)ap, (void *)vp, offset);
1776 			/*NOTREACHED*/
1777 		}
1778 
1779 		if (rw == S_WRITE) {
1780 			hat_setrefmod(pp);
1781 		} else if (rw != S_OTHER) {
1782 			hat_setref(pp);
1783 		}
1784 		page_unlock(pp);
1785 	}
1786 
1787 softlock_decrement:
1788 	npages = btopr(len);
1789 	ASSERT(shmd->shm_softlockcnt >= npages);
1790 	atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1791 	if (shmd->shm_softlockcnt == 0) {
1792 		/*
1793 		 * All SOFTLOCKS are gone. Wake up any waiting
1794 		 * unmappers so they can try again to unmap.
1795 		 * Check for waiters first without the mutex
1796 		 * held so we don't always grab the mutex on
1797 		 * softunlocks.
1798 		 */
1799 		if (AS_ISUNMAPWAIT(seg->s_as)) {
1800 			mutex_enter(&seg->s_as->a_contents);
1801 			if (AS_ISUNMAPWAIT(seg->s_as)) {
1802 				AS_CLRUNMAPWAIT(seg->s_as);
1803 				cv_broadcast(&seg->s_as->a_cv);
1804 			}
1805 			mutex_exit(&seg->s_as->a_contents);
1806 		}
1807 	}
1808 }
1809 
1810 int
1811 segspt_shmattach(struct seg **segpp, void *argsp)
1812 {
1813 	struct seg *seg = *segpp;
1814 	struct shm_data *shmd_arg = (struct shm_data *)argsp;
1815 	struct shm_data *shmd;
1816 	struct anon_map *shm_amp = shmd_arg->shm_amp;
1817 	struct spt_data *sptd;
1818 	int error = 0;
1819 
1820 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1821 
1822 	shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1823 	if (shmd == NULL)
1824 		return (ENOMEM);
1825 
1826 	shmd->shm_sptas = shmd_arg->shm_sptas;
1827 	shmd->shm_amp = shm_amp;
1828 	shmd->shm_sptseg = shmd_arg->shm_sptseg;
1829 
1830 	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1831 	    NULL, 0, seg->s_size);
1832 
1833 	mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1834 
1835 	seg->s_data = (void *)shmd;
1836 	seg->s_ops = &segspt_shmops;
1837 	seg->s_szc = shmd->shm_sptseg->s_szc;
1838 	sptd = shmd->shm_sptseg->s_data;
1839 
1840 	if (sptd->spt_flags & SHM_PAGEABLE) {
1841 		if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1842 		    KM_NOSLEEP)) == NULL) {
1843 			seg->s_data = (void *)NULL;
1844 			kmem_free(shmd, (sizeof (*shmd)));
1845 			return (ENOMEM);
1846 		}
1847 		shmd->shm_lckpgs = 0;
1848 		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1849 			if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1850 			    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1851 			    seg->s_size, seg->s_szc)) != 0) {
1852 				kmem_free(shmd->shm_vpage,
1853 				    btopr(shm_amp->size));
1854 			}
1855 		}
1856 	} else {
1857 		error = hat_share(seg->s_as->a_hat, seg->s_base,
1858 		    shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1859 		    seg->s_size, seg->s_szc);
1860 	}
1861 	if (error) {
1862 		seg->s_szc = 0;
1863 		seg->s_data = (void *)NULL;
1864 		kmem_free(shmd, (sizeof (*shmd)));
1865 	} else {
1866 		ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1867 		shm_amp->refcnt++;
1868 		ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1869 	}
1870 	return (error);
1871 }
1872 
1873 int
1874 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1875 {
1876 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1877 	int reclaim = 1;
1878 
1879 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1880 retry:
1881 	if (shmd->shm_softlockcnt > 0) {
1882 		if (reclaim == 1) {
1883 			segspt_purge(seg);
1884 			reclaim = 0;
1885 			goto retry;
1886 		}
1887 		return (EAGAIN);
1888 	}
1889 
1890 	if (ssize != seg->s_size) {
1891 #ifdef DEBUG
1892 		cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1893 		    ssize, seg->s_size);
1894 #endif
1895 		return (EINVAL);
1896 	}
1897 
1898 	(void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1899 	    NULL, 0);
1900 	hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1901 
1902 	seg_free(seg);
1903 
1904 	return (0);
1905 }
1906 
1907 void
1908 segspt_shmfree(struct seg *seg)
1909 {
1910 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
1911 	struct anon_map *shm_amp = shmd->shm_amp;
1912 
1913 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1914 
1915 	(void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1916 	    MC_UNLOCK, NULL, 0);
1917 
1918 	/*
1919 	 * Need to increment refcnt when attaching
1920 	 * and decrement when detaching because of dup().
1921 	 */
1922 	ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1923 	shm_amp->refcnt--;
1924 	ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1925 
1926 	if (shmd->shm_vpage) {	/* only for DISM */
1927 		kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1928 		shmd->shm_vpage = NULL;
1929 	}
1930 
1931 	/*
1932 	 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1933 	 * still working with this segment without holding the as lock.
1934 	 */
1935 	ASSERT(shmd->shm_softlockcnt == 0);
1936 	mutex_enter(&shmd->shm_segfree_syncmtx);
1937 	mutex_destroy(&shmd->shm_segfree_syncmtx);
1938 
1939 	kmem_free(shmd, sizeof (*shmd));
1940 }
1941 
1942 /*ARGSUSED*/
1943 int
1944 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1945 {
1946 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1947 
1948 	/*
1949 	 * Shared page tables are more than a shared mapping.
1950 	 *  Individual processes sharing page tables can't change prot
1951 	 *  because there is only one set of page tables.
1952 	 *  This will be allowed once private page tables are
1953 	 *  supported.
1954 	 */
1955 /* XXX - should this return an error status instead? */
1956 	return (0);
1957 }
1958 
1959 
1960 faultcode_t
1961 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1962     size_t len, enum fault_type type, enum seg_rw rw)
1963 {
1964 	struct  shm_data	*shmd = (struct shm_data *)seg->s_data;
1965 	struct  seg		*sptseg = shmd->shm_sptseg;
1966 	struct  as		*curspt = shmd->shm_sptas;
1967 	struct  spt_data	*sptd = sptseg->s_data;
1968 	pgcnt_t npages;
1969 	size_t  size;
1970 	caddr_t segspt_addr, shm_addr;
1971 	page_t  **ppa;
1972 	int	i;
1973 	ulong_t an_idx = 0;
1974 	int	err = 0;
1975 	int	dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1976 	size_t	pgsz;
1977 	pgcnt_t	pgcnt;
1978 	caddr_t	a;
1979 	pgcnt_t	pidx;
1980 
1981 #ifdef lint
1982 	hat = hat;
1983 #endif
1984 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1985 
1986 	/*
1987 	 * Because of the way spt is implemented,
1988 	 * the realsize of the segment does not have to be
1989 	 * equal to the segment size itself. The segment size is
1990 	 * often in multiples of a page size larger than PAGESIZE.
1991 	 * The realsize is rounded up to the nearest PAGESIZE
1992 	 * based on what the user requested. This is a bit of
1993 	 * ugliness that is historical but not easily fixed
1994 	 * without re-designing the higher levels of ISM.
1995 	 */
1996 	ASSERT(addr >= seg->s_base);
1997 	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1998 		return (FC_NOMAP);
1999 	/*
2000 	 * For all of the following cases except F_PROT, we need to
2001 	 * make any necessary adjustments to addr and len
2002 	 * and get all of the necessary page_t's into an array called ppa[].
2003 	 *
2004 	 * The code in shmat() forces base addr and len of ISM segment
2005 	 * to be aligned to largest page size supported. Therefore,
2006 	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2007 	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2008 	 * in large pagesize chunks, or else we will screw up the HAT
2009 	 * layer by calling hat_memload_array() with differing page sizes
2010 	 * over a given virtual range.
2011 	 */
2012 	pgsz = page_get_pagesize(sptseg->s_szc);
2013 	pgcnt = page_get_pagecnt(sptseg->s_szc);
2014 	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2015 	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2016 	npages = btopr(size);
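	/*
	 * Purely illustrative example (hypothetical sizes): with a 4MB
	 * underlying page size and seg->s_base itself 4MB aligned (shmat()
	 * enforces such alignment), a fault at seg->s_base + 5MB for one
	 * PAGESIZE gives shm_addr = seg->s_base + 4MB and
	 * size = P2ROUNDUP(1MB + PAGESIZE, 4MB) = 4MB, so the whole large
	 * page containing the faulting address is handled as one chunk.
	 */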
2017 
2018 	/*
2019 	 * Now we need to convert from addr in segshm to addr in segspt.
2020 	 */
2021 	an_idx = seg_page(seg, shm_addr);
2022 	segspt_addr = sptseg->s_base + ptob(an_idx);
2023 
2024 	ASSERT((segspt_addr + ptob(npages)) <=
2025 	    (sptseg->s_base + sptd->spt_realsize));
2026 	ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
2027 
2028 	switch (type) {
2029 
2030 	case F_SOFTLOCK:
2031 
2032 		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2033 		/*
2034 		 * Fall through to the F_INVAL case to load up the hat layer
2035 		 * entries with the HAT_LOAD_LOCK flag.
2036 		 */
2037 		/* FALLTHRU */
2038 	case F_INVAL:
2039 
2040 		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2041 			return (FC_NOMAP);
2042 
2043 		ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
2044 
2045 		err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
2046 		if (err != 0) {
2047 			if (type == F_SOFTLOCK) {
2048 				atomic_add_long((ulong_t *)(
2049 				    &(shmd->shm_softlockcnt)), -npages);
2050 			}
2051 			goto dism_err;
2052 		}
2053 		AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2054 		a = segspt_addr;
2055 		pidx = 0;
2056 		if (type == F_SOFTLOCK) {
2057 
2058 			/*
2059 			 * Load up the translation keeping it
2060 			 * locked and don't unlock the page.
2061 			 */
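			/*
			 * Note that ppa[] holds one page_t pointer per
			 * PAGESIZE page, so each pgsz-sized
			 * hat_memload_array() call consumes pgcnt entries:
			 * a advances in pgsz strides while pidx advances in
			 * pgcnt strides.
			 */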
2062 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2063 				hat_memload_array(sptseg->s_as->a_hat,
2064 				    a, pgsz, &ppa[pidx], sptd->spt_prot,
2065 				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2066 			}
2067 		} else {
2068 			/*
2069 			 * Migrate pages marked for migration
2070 			 */
2071 			if (lgrp_optimizations())
2072 				page_migrate(seg, shm_addr, ppa, npages);
2073 
2074 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2075 				hat_memload_array(sptseg->s_as->a_hat,
2076 				    a, pgsz, &ppa[pidx],
2077 				    sptd->spt_prot,
2078 				    HAT_LOAD_SHARE);
2079 			}
2080 
2081 			/*
2082 			 * And now drop the SE_SHARED lock(s).
2083 			 */
2084 			if (dyn_ism_unmap) {
2085 				for (i = 0; i < npages; i++) {
2086 					page_unlock(ppa[i]);
2087 				}
2088 			}
2089 		}
2090 
2091 		if (!dyn_ism_unmap) {
2092 			if (hat_share(seg->s_as->a_hat, shm_addr,
2093 			    curspt->a_hat, segspt_addr, ptob(npages),
2094 			    seg->s_szc) != 0) {
2095 				panic("hat_share err in DISM fault");
2096 				/* NOTREACHED */
2097 			}
2098 			if (type == F_INVAL) {
2099 				for (i = 0; i < npages; i++) {
2100 					page_unlock(ppa[i]);
2101 				}
2102 			}
2103 		}
2104 		AS_LOCK_EXIT(sptseg->s_as);
2105 dism_err:
2106 		kmem_free(ppa, npages * sizeof (page_t *));
2107 		return (err);
2108 
2109 	case F_SOFTUNLOCK:
2110 
2111 		/*
2112 		 * This is a bit ugly: we pass in the real seg pointer,
2113 		 * but the segspt_addr is the virtual address within the
2114 		 * dummy seg.
2115 		 */
2116 		segspt_softunlock(seg, segspt_addr, size, rw);
2117 		return (0);
2118 
2119 	case F_PROT:
2120 
2121 		/*
2122 		 * This takes care of the unusual case where a user
2123 		 * allocates a stack in shared memory and a register
2124 		 * window overflow is written to that stack page before
2125 		 * it is otherwise modified.
2126 		 *
2127 		 * We can get away with this because ISM segments are
2128 		 * always rw. Other than this unusual case, there
2129 		 * should be no instances of protection violations.
2130 		 */
2131 		return (0);
2132 
2133 	default:
2134 #ifdef DEBUG
2135 		panic("segspt_dismfault default type?");
2136 #else
2137 		return (FC_NOMAP);
2138 #endif
2139 	}
2140 }
2141 
2142 
2143 faultcode_t
2144 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2145     size_t len, enum fault_type type, enum seg_rw rw)
2146 {
2147 	struct shm_data		*shmd = (struct shm_data *)seg->s_data;
2148 	struct seg		*sptseg = shmd->shm_sptseg;
2149 	struct as		*curspt = shmd->shm_sptas;
2150 	struct spt_data		*sptd = sptseg->s_data;
2151 	pgcnt_t npages;
2152 	size_t size;
2153 	caddr_t sptseg_addr, shm_addr;
2154 	page_t *pp, **ppa;
2155 	int	i;
2156 	u_offset_t offset;
2157 	ulong_t anon_index = 0;
2158 	struct vnode *vp;
2159 	struct anon_map *amp;		/* XXX - for locknest */
2160 	struct anon *ap = NULL;
2161 	size_t		pgsz;
2162 	pgcnt_t		pgcnt;
2163 	caddr_t		a;
2164 	pgcnt_t		pidx;
2165 	size_t		sz;
2166 
2167 #ifdef lint
2168 	hat = hat;
2169 #endif
2170 
2171 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2172 
2173 	if (sptd->spt_flags & SHM_PAGEABLE) {
2174 		return (segspt_dismfault(hat, seg, addr, len, type, rw));
2175 	}
2176 
2177 	/*
2178 	 * Because of the way spt is implemented,
2179 	 * the realsize of the segment does not have to be
2180 	 * equal to the segment size itself. The segment size is
2181 	 * often in multiples of a page size larger than PAGESIZE.
2182 	 * The realsize is rounded up to the nearest PAGESIZE
2183 	 * based on what the user requested. This is a bit of
2184 	 * ugliness that is historical but not easily fixed
2185 	 * without re-designing the higher levels of ISM.
2186 	 */
2187 	ASSERT(addr >= seg->s_base);
2188 	if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2189 		return (FC_NOMAP);
2190 	/*
2191 	 * For all of the following cases except F_PROT, we need to
2192 	 * make any necessary adjustments to addr and len
2193 	 * and get all of the necessary page_t's into an array called ppa[].
2194 	 *
2195 	 * The code in shmat() forces base addr and len of ISM segment
2196 	 * to be aligned to largest page size supported. Therefore,
2197 	 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2198 	 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2199 	 * in large pagesize chunks, or else we will screw up the HAT
2200 	 * layer by calling hat_memload_array() with differing page sizes
2201 	 * over a given virtual range.
2202 	 */
2203 	pgsz = page_get_pagesize(sptseg->s_szc);
2204 	pgcnt = page_get_pagecnt(sptseg->s_szc);
2205 	shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2206 	size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2207 	npages = btopr(size);
2208 
2209 	/*
2210 	 * Now we need to convert from addr in segshm to addr in segspt.
2211 	 */
2212 	anon_index = seg_page(seg, shm_addr);
2213 	sptseg_addr = sptseg->s_base + ptob(anon_index);
2214 
2215 	/*
2216 	 * And now we may have to adjust npages downward if we have
2217 	 * exceeded the realsize of the segment or initial anon
2218 	 * allocations.
2219 	 */
2220 	if ((sptseg_addr + ptob(npages)) >
2221 	    (sptseg->s_base + sptd->spt_realsize))
2222 		size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2223 
2224 	npages = btopr(size);
2225 
2226 	ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2227 	ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2228 
2229 	switch (type) {
2230 
2231 	case F_SOFTLOCK:
2232 
2233 		/*
2234 		 * availrmem is decremented once during anon_swap_adjust()
2235 		 * and is incremented during the anon_unresv(), which is
2236 		 * called from shm_rm_amp() when the segment is destroyed.
2237 		 */
2238 		atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2239 		/*
2240 		 * Some platforms assume that ISM pages are SE_SHARED
2241 		 * locked for the entire life of the segment.
2242 		 */
2243 		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2244 			return (0);
2245 		/*
2246 		 * Fall through to the F_INVAL case to load up the hat layer
2247 		 * entries with the HAT_LOAD_LOCK flag.
2248 		 */
2249 
2250 		/* FALLTHRU */
2251 	case F_INVAL:
2252 
2253 		if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2254 			return (FC_NOMAP);
2255 
2256 		/*
2257 		 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2258 		 * may still rely on this call to hat_share(). That
2259 		 * would imply that those hats can fault on a
2260 		 * HAT_LOAD_LOCK translation, which would seem
2261 		 * contradictory.
2262 		 */
2263 		if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2264 			if (hat_share(seg->s_as->a_hat, seg->s_base,
2265 			    curspt->a_hat, sptseg->s_base,
2266 			    sptseg->s_size, sptseg->s_szc) != 0) {
2267 				panic("hat_share error in ISM fault");
2268 				/*NOTREACHED*/
2269 			}
2270 			return (0);
2271 		}
2272 		ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2273 
2274 		/*
2275 		 * I see no need to lock the real seg here,
2276 		 * because all of our work will be on the underlying
2277 		 * dummy seg.
2278 		 *
2279 		 * sptseg_addr and npages now account for large pages.
2280 		 */
2281 		amp = sptd->spt_amp;
2282 		ASSERT(amp != NULL);
2283 		anon_index = seg_page(sptseg, sptseg_addr);
2284 
2285 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2286 		for (i = 0; i < npages; i++) {
2287 			ap = anon_get_ptr(amp->ahp, anon_index++);
2288 			ASSERT(ap != NULL);
2289 			swap_xlate(ap, &vp, &offset);
2290 			pp = page_lookup(vp, offset, SE_SHARED);
2291 			ASSERT(pp != NULL);
2292 			ppa[i] = pp;
2293 		}
2294 		ANON_LOCK_EXIT(&amp->a_rwlock);
2295 		ASSERT(i == npages);
2296 
2297 		/*
2298 		 * We are already holding the as->a_lock on the user's
2299 		 * real segment, but we need to hold the a_lock on the
2300 		 * underlying dummy as. This is mostly to satisfy the
2301 		 * underlying HAT layer.
2302 		 */
2303 		AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2304 		a = sptseg_addr;
2305 		pidx = 0;
2306 		if (type == F_SOFTLOCK) {
2307 			/*
2308 			 * Load up the translation keeping it
2309 			 * locked and don't unlock the page.
2310 			 */
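			/*
			 * Unlike the DISM path, npages may have been clipped
			 * to spt_realsize above, so the final chunk can be
			 * shorter than a full large page; sz accounts for
			 * that.
			 */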
2311 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2312 				sz = MIN(pgsz, ptob(npages - pidx));
2313 				hat_memload_array(sptseg->s_as->a_hat, a,
2314 				    sz, &ppa[pidx], sptd->spt_prot,
2315 				    HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2316 			}
2317 		} else {
2318 			/*
2319 			 * Migrate pages marked for migration.
2320 			 */
2321 			if (lgrp_optimizations())
2322 				page_migrate(seg, shm_addr, ppa, npages);
2323 
2324 			for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2325 				sz = MIN(pgsz, ptob(npages - pidx));
2326 				hat_memload_array(sptseg->s_as->a_hat,
2327 				    a, sz, &ppa[pidx],
2328 				    sptd->spt_prot, HAT_LOAD_SHARE);
2329 			}
2330 
2331 			/*
2332 			 * And now drop the SE_SHARED lock(s).
2333 			 */
2334 			for (i = 0; i < npages; i++)
2335 				page_unlock(ppa[i]);
2336 		}
2337 		AS_LOCK_EXIT(sptseg->s_as);
2338 
2339 		kmem_free(ppa, sizeof (page_t *) * npages);
2340 		return (0);
2341 	case F_SOFTUNLOCK:
2342 
2343 		/*
2344 		 * This is a bit ugly: we pass in the real seg pointer,
2345 		 * but the sptseg_addr is the virtual address within the
2346 		 * dummy seg.
2347 		 */
2348 		segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2349 		return (0);
2350 
2351 	case F_PROT:
2352 
2353 		/*
2354 		 * This takes care of the unusual case where a user
2355 		 * allocates a stack in shared memory and a register
2356 		 * window overflow is written to that stack page before
2357 		 * it is otherwise modified.
2358 		 *
2359 		 * We can get away with this because ISM segments are
2360 		 * always rw. Other than this unusual case, there
2361 		 * should be no instances of protection violations.
2362 		 */
2363 		return (0);
2364 
2365 	default:
2366 #ifdef DEBUG
2367 		cmn_err(CE_WARN, "segspt_shmfault default type?");
2368 #endif
2369 		return (FC_NOMAP);
2370 	}
2371 }
2372 
2373 /*ARGSUSED*/
2374 static faultcode_t
2375 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2376 {
2377 	return (0);
2378 }
2379 
2380 /*ARGSUSED*/
2381 static int
2382 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2383 {
2384 	return (0);
2385 }
2386 
2387 /*ARGSUSED*/
2388 static size_t
2389 segspt_shmswapout(struct seg *seg)
2390 {
2391 	return (0);
2392 }
2393 
2394 /*
2395  * duplicate the shared page tables
2396  */
2397 int
2398 segspt_shmdup(struct seg *seg, struct seg *newseg)
2399 {
2400 	struct shm_data		*shmd = (struct shm_data *)seg->s_data;
2401 	struct anon_map		*amp = shmd->shm_amp;
2402 	struct shm_data		*shmd_new;
2403 	struct seg		*spt_seg = shmd->shm_sptseg;
2404 	struct spt_data		*sptd = spt_seg->s_data;
2405 	int			error = 0;
2406 
2407 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2408 
2409 	shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2410 	newseg->s_data = (void *)shmd_new;
2411 	shmd_new->shm_sptas = shmd->shm_sptas;
2412 	shmd_new->shm_amp = amp;
2413 	shmd_new->shm_sptseg = shmd->shm_sptseg;
2414 	newseg->s_ops = &segspt_shmops;
2415 	newseg->s_szc = seg->s_szc;
2416 	ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2417 
2418 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2419 	amp->refcnt++;
2420 	ANON_LOCK_EXIT(&amp->a_rwlock);
2421 
2422 	if (sptd->spt_flags & SHM_PAGEABLE) {
2423 		shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2424 		shmd_new->shm_lckpgs = 0;
2425 		if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2426 			if ((error = hat_share(newseg->s_as->a_hat,
2427 			    newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2428 			    seg->s_size, seg->s_szc)) != 0) {
2429 				kmem_free(shmd_new->shm_vpage,
2430 				    btopr(amp->size));
2431 			}
2432 		}
2433 		return (error);
2434 	} else {
2435 		return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2436 		    shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2437 		    seg->s_szc));
2438 
2439 	}
2440 }
2441 
2442 /*ARGSUSED*/
2443 int
2444 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2445 {
2446 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2447 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2448 
2449 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2450 
2451 	/*
2452 	 * ISM segment is always rw.
2453 	 */
2454 	return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2455 }
2456 
2457 /*
2458  * Return an array of locked large pages; for empty slots, allocate
2459  * private zero-filled anon pages.
2460  */
2461 static int
2462 spt_anon_getpages(
2463 	struct seg *sptseg,
2464 	caddr_t sptaddr,
2465 	size_t len,
2466 	page_t *ppa[])
2467 {
2468 	struct  spt_data *sptd = sptseg->s_data;
2469 	struct  anon_map *amp = sptd->spt_amp;
2470 	enum	seg_rw rw = sptd->spt_prot;
2471 	uint_t	szc = sptseg->s_szc;
2472 	size_t	pg_sz, share_sz = page_get_pagesize(szc);
2473 	pgcnt_t	lp_npgs;
2474 	caddr_t	lp_addr, e_sptaddr;
2475 	uint_t	vpprot, ppa_szc = 0;
2476 	struct  vpage *vpage = NULL;
2477 	ulong_t	j, ppa_idx;
2478 	int	err, ierr = 0;
2479 	pgcnt_t	an_idx;
2480 	anon_sync_obj_t cookie;
2481 	int anon_locked = 0;
2482 	pgcnt_t amp_pgs;
2483 
2484 
2485 	ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2486 	ASSERT(len != 0);
2487 
2488 	pg_sz = share_sz;
2489 	lp_npgs = btop(pg_sz);
2490 	lp_addr = sptaddr;
2491 	e_sptaddr = sptaddr + len;
2492 	an_idx = seg_page(sptseg, sptaddr);
2493 	ppa_idx = 0;
2494 
2495 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2496 
2497 	amp_pgs = page_get_pagecnt(amp->a_szc);
2498 
2499 	/*CONSTCOND*/
2500 	while (1) {
2501 		for (; lp_addr < e_sptaddr;
2502 		    an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2503 
2504 			/*
2505 			 * If we're currently locked and we cross into a new
2506 			 * large-page chunk, unlock the current anon chunk.
2507 			 */
2508 			if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2509 				anon_array_exit(&cookie);
2510 				anon_locked = 0;
2511 			}
2512 			if (!anon_locked) {
2513 				anon_array_enter(amp, an_idx, &cookie);
2514 				anon_locked = 1;
2515 			}
2516 			ppa_szc = (uint_t)-1;
2517 			ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2518 			    lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2519 			    &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2520 
2521 			if (ierr != 0) {
2522 				if (ierr > 0) {
2523 					err = FC_MAKE_ERR(ierr);
2524 					goto lpgs_err;
2525 				}
2526 				break;
2527 			}
2528 		}
2529 		if (lp_addr == e_sptaddr) {
2530 			break;
2531 		}
2532 		ASSERT(lp_addr < e_sptaddr);
2533 
2534 		/*
2535 		 * ierr == -1 means we failed to allocate a large page,
2536 		 * so do a size down operation.
2537 		 *
2538 		 * ierr == -2 means some other process that privately shares
2539 		 * pages with this process has allocated a larger page and we
2540 		 * need to retry with larger pages. So do a size up
2541 		 * operation. This relies on the fact that large pages are
2542 		 * never partially shared i.e. if we share any constituent
2543 		 * page of a large page with another process we must share the
2544 		 * entire large page. Note this cannot happen for SOFTLOCK
2545 		 * case, unless the current address (lp_addr) is at the beginning
2546 		 * of the next page size boundary because the other process
2547 		 * couldn't have relocated locked pages.
2548 		 */
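		/*
		 * Illustrative example (hypothetical sizes): if
		 * sptseg->s_szc is 2 and segvn_anypgsz is set, an ierr of
		 * -1 retries this chunk at szc 1 and then 0, while an ierr
		 * of -2 steps szc back up toward sptseg->s_szc.
		 */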
2549 		ASSERT(ierr == -1 || ierr == -2);
2550 		if (segvn_anypgsz) {
2551 			ASSERT(ierr == -2 || szc != 0);
2552 			ASSERT(ierr == -1 || szc < sptseg->s_szc);
2553 			szc = (ierr == -1) ? szc - 1 : szc + 1;
2554 		} else {
2555 			/*
2556 			 * For faults and segvn_anypgsz == 0
2557 			 * we need to be careful not to loop forever
2558 			 * if existing page is found with szc other
2559 			 * than 0 or seg->s_szc. This could be due
2560 			 * to page relocations on behalf of DR or
2561 			 * more likely large page creation. For this
2562 			 * case simply re-size to existing page's szc
2563 			 * if returned by anon_map_getpages().
2564 			 */
2565 			if (ppa_szc == (uint_t)-1) {
2566 				szc = (ierr == -1) ? 0 : sptseg->s_szc;
2567 			} else {
2568 				ASSERT(ppa_szc <= sptseg->s_szc);
2569 				ASSERT(ierr == -2 || ppa_szc < szc);
2570 				ASSERT(ierr == -1 || ppa_szc > szc);
2571 				szc = ppa_szc;
2572 			}
2573 		}
2574 		pg_sz = page_get_pagesize(szc);
2575 		lp_npgs = btop(pg_sz);
2576 		ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2577 	}
2578 	if (anon_locked) {
2579 		anon_array_exit(&cookie);
2580 	}
2581 	ANON_LOCK_EXIT(&amp->a_rwlock);
2582 	return (0);
2583 
2584 lpgs_err:
2585 	if (anon_locked) {
2586 		anon_array_exit(&cookie);
2587 	}
2588 	ANON_LOCK_EXIT(&amp->a_rwlock);
2589 	for (j = 0; j < ppa_idx; j++)
2590 		page_unlock(ppa[j]);
2591 	return (err);
2592 }
2593 
2594 /*
2595  * count the number of bytes in a set of spt pages that are currently not
2596  * locked
2597  */
2598 static rctl_qty_t
2599 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2600 {
2601 	ulong_t	i;
2602 	rctl_qty_t unlocked = 0;
2603 
2604 	for (i = 0; i < npages; i++) {
2605 		if (ppa[i]->p_lckcnt == 0)
2606 			unlocked += PAGESIZE;
2607 	}
2608 	return (unlocked);
2609 }
2610 
2611 extern	u_longlong_t randtick(void);
2612 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2613 #define	NLCK	(NCPU_P2)
2614 /* Random number with a range [0, n-1], n must be power of two */
2615 #define	RAND_P2(n)	\
2616 	((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
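/*
 * For illustration: spt_lockpages() below reserves
 * nlck = NLCK + RAND_P2(NLCK) locks at a time, i.e. a value in
 * [NLCK, 2 * NLCK - 1] (hypothetically [8, 15] on a machine where
 * NCPU_P2 is 8), capped by the number of pages left to process;
 * spt_unlockpages() batches its availrmem updates with a similarly
 * randomized limit.
 */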
2617 
2618 int
2619 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2620     page_t **ppa, ulong_t *lockmap, size_t pos,
2621     rctl_qty_t *locked)
2622 {
2623 	struct	shm_data *shmd = seg->s_data;
2624 	struct	spt_data *sptd = shmd->shm_sptseg->s_data;
2625 	ulong_t	i;
2626 	int	kernel;
2627 	pgcnt_t	nlck = 0;
2628 	int	rv = 0;
2629 	int	use_reserved = 1;
2630 
2631 	/* return the number of bytes actually locked */
2632 	*locked = 0;
2633 
2634 	/*
2635 	 * To avoid contention on freemem_lock, availrmem and pages_locked
2636 	 * global counters are updated only every nlck locked pages instead of
2637 	 * every time.  Reserve nlck locks up front and deduct from this
2638 	 * reservation for each page that requires a lock.  When the reservation
2639 	 * is consumed, reserve again.  nlck is randomized, so the competing
2640 	 * threads do not fall into a cyclic lock contention pattern. When
2641 	 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2642 	 * is used to lock pages.
2643 	 */
2644 	for (i = 0; i < npages; anon_index++, pos++, i++) {
2645 		if (nlck == 0 && use_reserved == 1) {
2646 			nlck = NLCK + RAND_P2(NLCK);
2647 			/* if fewer loops left, decrease nlck */
2648 			nlck = MIN(nlck, npages - i);
2649 			/*
2650 			 * Reserve nlck locks up front and deduct from this
2651 			 * reservation for each page that requires a lock.  When
2652 			 * the reservation is consumed, reserve again.
2653 			 */
2654 			mutex_enter(&freemem_lock);
2655 			if ((availrmem - nlck) < pages_pp_maximum) {
2656 				/* Do not do advance memory reserves */
2657 				use_reserved = 0;
2658 			} else {
2659 				availrmem	-= nlck;
2660 				pages_locked	+= nlck;
2661 			}
2662 			mutex_exit(&freemem_lock);
2663 		}
2664 		if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2665 			if (sptd->spt_ppa_lckcnt[anon_index] <
2666 			    (ushort_t)DISM_LOCK_MAX) {
2667 				if (++sptd->spt_ppa_lckcnt[anon_index] ==
2668 				    (ushort_t)DISM_LOCK_MAX) {
2669 					cmn_err(CE_WARN,
2670 					    "DISM page lock limit "
2671 					    "reached on DISM offset 0x%lx\n",
2672 					    anon_index << PAGESHIFT);
2673 				}
2674 				kernel = (sptd->spt_ppa &&
2675 				    sptd->spt_ppa[anon_index]);
2676 				if (!page_pp_lock(ppa[i], 0, kernel ||
2677 				    use_reserved)) {
2678 					sptd->spt_ppa_lckcnt[anon_index]--;
2679 					rv = EAGAIN;
2680 					break;
2681 				}
2682 				/* if this is a newly locked page, count it */
2683 				if (ppa[i]->p_lckcnt == 1) {
2684 					if (kernel == 0 && use_reserved == 1)
2685 						nlck--;
2686 					*locked += PAGESIZE;
2687 				}
2688 				shmd->shm_lckpgs++;
2689 				shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2690 				if (lockmap != NULL)
2691 					BT_SET(lockmap, pos);
2692 			}
2693 		}
2694 	}
2695 	/* Return unused lock reservation */
2696 	if (nlck != 0 && use_reserved == 1) {
2697 		mutex_enter(&freemem_lock);
2698 		availrmem	+= nlck;
2699 		pages_locked	-= nlck;
2700 		mutex_exit(&freemem_lock);
2701 	}
2702 
2703 	return (rv);
2704 }
2705 
2706 int
2707 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2708     rctl_qty_t *unlocked)
2709 {
2710 	struct shm_data	*shmd = seg->s_data;
2711 	struct spt_data	*sptd = shmd->shm_sptseg->s_data;
2712 	struct anon_map	*amp = sptd->spt_amp;
2713 	struct anon	*ap;
2714 	struct vnode	*vp;
2715 	u_offset_t	off;
2716 	struct page	*pp;
2717 	int		kernel;
2718 	anon_sync_obj_t	cookie;
2719 	ulong_t		i;
2720 	pgcnt_t		nlck = 0;
2721 	pgcnt_t		nlck_limit = NLCK;
2722 
2723 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2724 	for (i = 0; i < npages; i++, anon_index++) {
2725 		if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2726 			anon_array_enter(amp, anon_index, &cookie);
2727 			ap = anon_get_ptr(amp->ahp, anon_index);
2728 			ASSERT(ap);
2729 
2730 			swap_xlate(ap, &vp, &off);
2731 			anon_array_exit(&cookie);
2732 			pp = page_lookup(vp, off, SE_SHARED);
2733 			ASSERT(pp);
2734 			/*
2735 			 * availrmem is decremented only for pages which are not
2736 			 * in seg pcache; for pages in seg pcache availrmem was
2737 			 * decremented in _dismpagelock().
2738 			 */
2739 			kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2740 			ASSERT(pp->p_lckcnt > 0);
2741 
2742 			/*
2743 			 * unlock the page but do not change availrmem; we do it
2744 			 * ourselves every nlck loops.
2745 			 */
2746 			page_pp_unlock(pp, 0, 1);
2747 			if (pp->p_lckcnt == 0) {
2748 				if (kernel == 0)
2749 					nlck++;
2750 				*unlocked += PAGESIZE;
2751 			}
2752 			page_unlock(pp);
2753 			shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2754 			sptd->spt_ppa_lckcnt[anon_index]--;
2755 			shmd->shm_lckpgs--;
2756 		}
2757 
2758 		/*
2759 		 * To reduce freemem_lock contention, do not update availrmem
2760 		 * until at least NLCK pages have been unlocked.
2761 		 * 1. No need to update if nlck is zero
2762 		 * 2. Always update on the last iteration
2763 		 */
2764 		if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2765 			mutex_enter(&freemem_lock);
2766 			availrmem	+= nlck;
2767 			pages_locked	-= nlck;
2768 			mutex_exit(&freemem_lock);
2769 			nlck = 0;
2770 			nlck_limit = NLCK + RAND_P2(NLCK);
2771 		}
2772 	}
2773 	ANON_LOCK_EXIT(&amp->a_rwlock);
2774 
2775 	return (0);
2776 }
2777 
2778 /*ARGSUSED*/
2779 static int
2780 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2781     int attr, int op, ulong_t *lockmap, size_t pos)
2782 {
2783 	struct shm_data *shmd = seg->s_data;
2784 	struct seg	*sptseg = shmd->shm_sptseg;
2785 	struct spt_data *sptd = sptseg->s_data;
2786 	struct kshmid	*sp = sptd->spt_amp->a_sp;
2787 	pgcnt_t		npages, a_npages;
2788 	page_t		**ppa;
2789 	pgcnt_t		an_idx, a_an_idx, ppa_idx;
2790 	caddr_t		spt_addr, a_addr;	/* spt and aligned address */
2791 	size_t		a_len;			/* aligned len */
2792 	size_t		share_sz;
2793 	ulong_t		i;
2794 	int		sts = 0;
2795 	rctl_qty_t	unlocked = 0;
2796 	rctl_qty_t	locked = 0;
2797 	struct proc	*p = curproc;
2798 	kproject_t	*proj;
2799 
2800 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2801 	ASSERT(sp != NULL);
2802 
2803 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2804 		return (0);
2805 	}
2806 
2807 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2808 	an_idx = seg_page(seg, addr);
2809 	npages = btopr(len);
2810 
2811 	if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2812 		return (ENOMEM);
2813 	}
2814 
2815 	/*
2816 	 * A shm's project never changes, so no lock needed.
2817 	 * The shm has a hold on the project, so it will not go away.
2818 	 * Since we have a mapping to shm within this zone, we know
2819 	 * that the zone will not go away.
2820 	 */
2821 	proj = sp->shm_perm.ipc_proj;
2822 
2823 	if (op == MC_LOCK) {
2824 
2825 		/*
2826 		 * Need to align addr and the size request if they are not
2827 		 * aligned, so we can always allocate large page(s); however,
2828 		 * we only lock what was requested in the initial request.
2829 		 */
2830 		share_sz = page_get_pagesize(sptseg->s_szc);
2831 		a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2832 		a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2833 		    share_sz);
2834 		a_npages = btop(a_len);
2835 		a_an_idx = seg_page(seg, a_addr);
2836 		spt_addr = sptseg->s_base + ptob(a_an_idx);
2837 		ppa_idx = an_idx - a_an_idx;
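		/*
		 * ppa[] will cover the entire aligned range [a_addr,
		 * a_addr + a_len); ppa_idx is the offset of the first page
		 * the caller actually asked to lock, so the rctl accounting
		 * and spt_lockpages() below operate on &ppa[ppa_idx] for
		 * npages pages.
		 */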
2838 
2839 		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2840 		    KM_NOSLEEP)) == NULL) {
2841 			return (ENOMEM);
2842 		}
2843 
2844 		/*
2845 		 * Don't cache any new pages for IO and
2846 		 * flush any cached pages.
2847 		 */
2848 		mutex_enter(&sptd->spt_lock);
2849 		if (sptd->spt_ppa != NULL)
2850 			sptd->spt_flags |= DISM_PPA_CHANGED;
2851 
2852 		sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2853 		if (sts != 0) {
2854 			mutex_exit(&sptd->spt_lock);
2855 			kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2856 			return (sts);
2857 		}
2858 
2859 		mutex_enter(&sp->shm_mlock);
2860 		/* enforce locked memory rctl */
2861 		unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2862 
2863 		mutex_enter(&p->p_lock);
2864 		if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2865 			mutex_exit(&p->p_lock);
2866 			sts = EAGAIN;
2867 		} else {
2868 			mutex_exit(&p->p_lock);
2869 			sts = spt_lockpages(seg, an_idx, npages,
2870 			    &ppa[ppa_idx], lockmap, pos, &locked);
2871 
2872 			/*
2873 			 * correct locked count if not all pages could be
2874 			 * locked
2875 			 */
2876 			if ((unlocked - locked) > 0) {
2877 				rctl_decr_locked_mem(NULL, proj,
2878 				    (unlocked - locked), 0);
2879 			}
2880 		}
2881 		/*
2882 		 * unlock pages
2883 		 */
2884 		for (i = 0; i < a_npages; i++)
2885 			page_unlock(ppa[i]);
2886 		if (sptd->spt_ppa != NULL)
2887 			sptd->spt_flags |= DISM_PPA_CHANGED;
2888 		mutex_exit(&sp->shm_mlock);
2889 		mutex_exit(&sptd->spt_lock);
2890 
2891 		kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2892 
2893 	} else if (op == MC_UNLOCK) { /* unlock */
2894 		page_t		**ppa;
2895 
2896 		mutex_enter(&sptd->spt_lock);
2897 		if (shmd->shm_lckpgs == 0) {
2898 			mutex_exit(&sptd->spt_lock);
2899 			return (0);
2900 		}
2901 		/*
2902 		 * Don't cache new IO pages.
2903 		 */
2904 		if (sptd->spt_ppa != NULL)
2905 			sptd->spt_flags |= DISM_PPA_CHANGED;
2906 
2907 		mutex_enter(&sp->shm_mlock);
2908 		sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2909 		if ((ppa = sptd->spt_ppa) != NULL)
2910 			sptd->spt_flags |= DISM_PPA_CHANGED;
2911 		mutex_exit(&sptd->spt_lock);
2912 
2913 		rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2914 		mutex_exit(&sp->shm_mlock);
2915 
2916 		if (ppa != NULL)
2917 			seg_ppurge_wiredpp(ppa);
2918 	}
2919 	return (sts);
2920 }
2921 
2922 /*ARGSUSED*/
2923 int
2924 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2925 {
2926 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2927 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2928 	spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2929 
2930 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2931 
2932 	/*
2933 	 * ISM segment is always rw.
2934 	 */
2935 	while (--pgno >= 0)
2936 		*protv++ = sptd->spt_prot;
2937 	return (0);
2938 }
2939 
2940 /*ARGSUSED*/
2941 u_offset_t
2942 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2943 {
2944 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2945 
2946 	/* Offset does not matter in ISM memory */
2947 
2948 	return ((u_offset_t)0);
2949 }
2950 
2951 /* ARGSUSED */
2952 int
2953 segspt_shmgettype(struct seg *seg, caddr_t addr)
2954 {
2955 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2956 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2957 
2958 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2959 
2960 	/*
2961 	 * The shared memory mapping is always MAP_SHARED; swap is only
2962 	 * reserved for DISM.
2963 	 */
2964 	return (MAP_SHARED |
2965 	    ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2966 }
2967 
2968 /*ARGSUSED*/
2969 int
2970 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2971 {
2972 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
2973 	struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2974 
2975 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2976 
2977 	*vpp = sptd->spt_vp;
2978 	return (0);
2979 }
2980 
2981 /*
2982  * We need to wait for pending IO to complete to a DISM segment in order for
2983  * pages to get kicked out of the seg_pcache.  120 seconds should be more
2984  * than enough time to wait.
2985  */
2986 static clock_t spt_pcache_wait = 120;
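/*
 * segspt_shmadvise() below uses this as follows: it bumps
 * shm_softlockcnt, drops the AS lock, and then waits on spt_cv until the
 * segment generation changes, DISM_PPA_CHANGED is cleared, the wait is
 * interrupted, or roughly spt_pcache_wait seconds (hz * spt_pcache_wait
 * ticks) have elapsed.
 */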
2987 
2988 /*ARGSUSED*/
2989 static int
2990 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2991 {
2992 	struct shm_data	*shmd = (struct shm_data *)seg->s_data;
2993 	struct spt_data	*sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2994 	struct anon_map	*amp;
2995 	pgcnt_t pg_idx;
2996 	ushort_t gen;
2997 	clock_t	end_lbolt;
2998 	int writer;
2999 	page_t **ppa;
3000 
3001 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
3002 
3003 	if (behav == MADV_FREE || behav == MADV_PURGE) {
3004 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
3005 			return (0);
3006 
3007 		amp = sptd->spt_amp;
3008 		pg_idx = seg_page(seg, addr);
3009 
3010 		mutex_enter(&sptd->spt_lock);
3011 		if ((ppa = sptd->spt_ppa) == NULL) {
3012 			mutex_exit(&sptd->spt_lock);
3013 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3014 			(void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3015 			ANON_LOCK_EXIT(&amp->a_rwlock);
3016 			return (0);
3017 		}
3018 
3019 		sptd->spt_flags |= DISM_PPA_CHANGED;
3020 		gen = sptd->spt_gen;
3021 
3022 		mutex_exit(&sptd->spt_lock);
3023 
3024 		/*
3025 		 * Purge all DISM cached pages
3026 		 */
3027 		seg_ppurge_wiredpp(ppa);
3028 
3029 		/*
3030 		 * Drop the AS_LOCK so that other threads can grab it
3031 		 * in the as_pageunlock path and hopefully get the segment
3032 		 * kicked out of the seg_pcache.  We bump the shm_softlockcnt
3033 		 * to keep this segment resident.
3034 		 */
3035 		writer = AS_WRITE_HELD(seg->s_as);
3036 		atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3037 		AS_LOCK_EXIT(seg->s_as);
3038 
3039 		mutex_enter(&sptd->spt_lock);
3040 
3041 		end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
3042 
3043 		/*
3044 		 * Try to wait for pages to get kicked out of the seg_pcache.
3045 		 */
3046 		while (sptd->spt_gen == gen &&
3047 		    (sptd->spt_flags & DISM_PPA_CHANGED) &&
3048 		    ddi_get_lbolt() < end_lbolt) {
3049 			if (!cv_timedwait_sig(&sptd->spt_cv,
3050 			    &sptd->spt_lock, end_lbolt)) {
3051 				break;
3052 			}
3053 		}
3054 
3055 		mutex_exit(&sptd->spt_lock);
3056 
3057 		/* Regrab the AS_LOCK and release our hold on the segment */
3058 		AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
3059 		atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3060 		if (shmd->shm_softlockcnt <= 0) {
3061 			if (AS_ISUNMAPWAIT(seg->s_as)) {
3062 				mutex_enter(&seg->s_as->a_contents);
3063 				if (AS_ISUNMAPWAIT(seg->s_as)) {
3064 					AS_CLRUNMAPWAIT(seg->s_as);
3065 					cv_broadcast(&seg->s_as->a_cv);
3066 				}
3067 				mutex_exit(&seg->s_as->a_contents);
3068 			}
3069 		}
3070 
3071 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3072 		(void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3073 		ANON_LOCK_EXIT(&amp->a_rwlock);
3074 	} else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
3075 	    behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
3076 		int			already_set;
3077 		ulong_t			anon_index;
3078 		lgrp_mem_policy_t	policy;
3079 		caddr_t			shm_addr;
3080 		size_t			share_size;
3081 		size_t			size;
3082 		struct seg		*sptseg = shmd->shm_sptseg;
3083 		caddr_t			sptseg_addr;
3084 
3085 		/*
3086 		 * Align address and length to page size of underlying segment
3087 		 */
3088 		share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
3089 		shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
3090 		size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
3091 		    share_size);
3092 
3093 		amp = shmd->shm_amp;
3094 		anon_index = seg_page(seg, shm_addr);
3095 
3096 		/*
3097 		 * And now we may have to adjust size downward if we have
3098 		 * exceeded the realsize of the segment or initial anon
3099 		 * allocations.
3100 		 */
3101 		sptseg_addr = sptseg->s_base + ptob(anon_index);
3102 		if ((sptseg_addr + size) >
3103 		    (sptseg->s_base + sptd->spt_realsize))
3104 			size = (sptseg->s_base + sptd->spt_realsize) -
3105 			    sptseg_addr;
3106 
3107 		/*
3108 		 * Set memory allocation policy for this segment
3109 		 */
3110 		policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
3111 		already_set = lgrp_shm_policy_set(policy, amp, anon_index,
3112 		    NULL, 0, len);
3113 
3114 		/*
3115 		 * If this memory allocation policy is already set and is
3116 		 * not one that must be reapplied, don't bother setting it again.
3117 		 */
3118 		if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
3119 			return (0);
3120 
3121 		/*
3122 		 * Mark any existing pages in the given range for
3123 		 * migration, flushing the I/O page cache and using the
3124 		 * underlying segment to calculate the anon index and to
3125 		 * get the anon map and vnode pointer.
3126 		 */
3127 		if (shmd->shm_softlockcnt > 0)
3128 			segspt_purge(seg);
3129 
3130 		page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
3131 	}
3132 
3133 	return (0);
3134 }
3135 
3136 /*ARGSUSED*/
3137 void
3138 segspt_shmdump(struct seg *seg)
3139 {
3140 	/* no-op for ISM segment */
3141 }
3142 
3143 /*ARGSUSED*/
3144 static int
3145 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3146 {
3147 	return (ENOTSUP);
3148 }
3149 
3150 /*
3151  * get a memory ID for an addr in a given segment
3152  */
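/*
 * The memid is built from the anon slot and the byte offset within the
 * page: val[0] is the struct anon pointer (allocating a zero-filled anon
 * page first if the slot is still empty) and val[1] is addr & PAGEOFFSET.
 */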
3153 static int
3154 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3155 {
3156 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
3157 	struct anon	*ap;
3158 	size_t		anon_index;
3159 	struct anon_map	*amp = shmd->shm_amp;
3160 	struct spt_data	*sptd = shmd->shm_sptseg->s_data;
3161 	struct seg	*sptseg = shmd->shm_sptseg;
3162 	anon_sync_obj_t	cookie;
3163 
3164 	anon_index = seg_page(seg, addr);
3165 
3166 	if (addr > (seg->s_base + sptd->spt_realsize)) {
3167 		return (EFAULT);
3168 	}
3169 
3170 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3171 	anon_array_enter(amp, anon_index, &cookie);
3172 	ap = anon_get_ptr(amp->ahp, anon_index);
3173 	if (ap == NULL) {
3174 		struct page *pp;
3175 		caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3176 
3177 		pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3178 		if (pp == NULL) {
3179 			anon_array_exit(&cookie);
3180 			ANON_LOCK_EXIT(&amp->a_rwlock);
3181 			return (ENOMEM);
3182 		}
3183 		(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3184 		page_unlock(pp);
3185 	}
3186 	anon_array_exit(&cookie);
3187 	ANON_LOCK_EXIT(&amp->a_rwlock);
3188 	memidp->val[0] = (uintptr_t)ap;
3189 	memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3190 	return (0);
3191 }
3192 
3193 /*
3194  * Get memory allocation policy info for specified address in given segment
3195  */
3196 static lgrp_mem_policy_info_t *
3197 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3198 {
3199 	struct anon_map		*amp;
3200 	ulong_t			anon_index;
3201 	lgrp_mem_policy_info_t	*policy_info;
3202 	struct shm_data		*shm_data;
3203 
3204 	ASSERT(seg != NULL);
3205 
3206 	/*
3207 	 * Get anon_map from segshm
3208 	 *
3209 	 * Assume that no lock needs to be held on anon_map, since
3210 	 * it should be protected by its reference count, which must be
3211 	 * nonzero for an existing segment.
3212 	 * Need to grab the readers lock on the policy tree, though.
3213 	 */
3214 	shm_data = (struct shm_data *)seg->s_data;
3215 	if (shm_data == NULL)
3216 		return (NULL);
3217 	amp = shm_data->shm_amp;
3218 	ASSERT(amp->refcnt != 0);
3219 
3220 	/*
3221 	 * Get policy info
3222 	 *
3223 	 * Assume starting anon index of 0
3224 	 */
3225 	anon_index = seg_page(seg, addr);
3226 	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3227 
3228 	return (policy_info);
3229 }
3230 
3231 /*ARGSUSED*/
3232 static int
3233 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3234 {
3235 	return (0);
3236 }
3237