xref: /illumos-gate/usr/src/uts/common/vm/seg_map.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 /*
38  * VM - generic vnode mapping segment.
39  *
40  * The segmap driver is used only by the kernel to get faster (than seg_vn)
41  * mappings [lower routine overhead; more persistent cache] to random
42  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/sysmacros.h>
49 #include <sys/buf.h>
50 #include <sys/systm.h>
51 #include <sys/vnode.h>
52 #include <sys/mman.h>
53 #include <sys/errno.h>
54 #include <sys/cred.h>
55 #include <sys/kmem.h>
56 #include <sys/vtrace.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/thread.h>
60 #include <sys/dumphdr.h>
61 #include <sys/bitmap.h>
62 #include <sys/lgrp.h>
63 
64 #include <vm/seg_kmem.h>
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/seg_kpm.h>
69 #include <vm/seg_map.h>
70 #include <vm/page.h>
71 #include <vm/pvn.h>
72 #include <vm/rm.h>
73 
74 /*
75  * Private seg op routines.
76  */
77 static void	segmap_free(struct seg *seg);
78 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
79 			size_t len, enum fault_type type, enum seg_rw rw);
80 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
81 static int	segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
82 			uint_t prot);
83 static int	segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
84 static int	segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
85 			uint_t *protv);
86 static u_offset_t	segmap_getoffset(struct seg *seg, caddr_t addr);
87 static int	segmap_gettype(struct seg *seg, caddr_t addr);
88 static int	segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
89 static void	segmap_dump(struct seg *seg);
90 static int	segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
91 			struct page ***ppp, enum lock_type type,
92 			enum seg_rw rw);
93 static void	segmap_badop(void);
94 static int	segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
95 static lgrp_mem_policy_info_t	*segmap_getpolicy(struct seg *seg,
96     caddr_t addr);
97 static int	segmap_capable(struct seg *seg, segcapability_t capability);
98 
99 /* segkpm support */
100 static caddr_t	segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
101 			struct smap *, enum seg_rw);
102 struct smap	*get_smap_kpm(caddr_t, page_t **);
103 
104 #define	SEGMAP_BADOP(t)	(t(*)())segmap_badop
105 
106 static struct seg_ops segmap_ops = {
107 	SEGMAP_BADOP(int),	/* dup */
108 	SEGMAP_BADOP(int),	/* unmap */
109 	segmap_free,
110 	segmap_fault,
111 	segmap_faulta,
112 	SEGMAP_BADOP(int),	/* setprot */
113 	segmap_checkprot,
114 	segmap_kluster,
115 	SEGMAP_BADOP(size_t),	/* swapout */
116 	SEGMAP_BADOP(int),	/* sync */
117 	SEGMAP_BADOP(size_t),	/* incore */
118 	SEGMAP_BADOP(int),	/* lockop */
119 	segmap_getprot,
120 	segmap_getoffset,
121 	segmap_gettype,
122 	segmap_getvp,
123 	SEGMAP_BADOP(int),	/* advise */
124 	segmap_dump,
125 	segmap_pagelock,	/* pagelock */
126 	SEGMAP_BADOP(int),	/* setpgsz */
127 	segmap_getmemid,	/* getmemid */
128 	segmap_getpolicy,	/* getpolicy */
129 	segmap_capable,		/* capable */
130 };
131 
132 /*
133  * Private segmap routines.
134  */
135 static void	segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
136 			size_t len, enum seg_rw rw, struct smap *smp);
137 static void	segmap_smapadd(struct smap *smp);
138 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
139 			u_offset_t off, int hashid);
140 static void	segmap_hashout(struct smap *smp);
141 
142 
143 /*
144  * Statistics for segmap operations.
145  *
146  * No explicit locking to protect these stats.
147  */
148 struct segmapcnt segmapcnt = {
149 	{ "fault",		KSTAT_DATA_ULONG },
150 	{ "faulta",		KSTAT_DATA_ULONG },
151 	{ "getmap",		KSTAT_DATA_ULONG },
152 	{ "get_use",		KSTAT_DATA_ULONG },
153 	{ "get_reclaim",	KSTAT_DATA_ULONG },
154 	{ "get_reuse",		KSTAT_DATA_ULONG },
155 	{ "get_unused",		KSTAT_DATA_ULONG },
156 	{ "get_nofree",		KSTAT_DATA_ULONG },
157 	{ "rel_async",		KSTAT_DATA_ULONG },
158 	{ "rel_write",		KSTAT_DATA_ULONG },
159 	{ "rel_free",		KSTAT_DATA_ULONG },
160 	{ "rel_abort",		KSTAT_DATA_ULONG },
161 	{ "rel_dontneed",	KSTAT_DATA_ULONG },
162 	{ "release",		KSTAT_DATA_ULONG },
163 	{ "pagecreate",		KSTAT_DATA_ULONG },
164 	{ "free_notfree",	KSTAT_DATA_ULONG },
165 	{ "free_dirty",		KSTAT_DATA_ULONG },
166 	{ "free",		KSTAT_DATA_ULONG },
167 	{ "stolen",		KSTAT_DATA_ULONG },
168 	{ "get_nomtx",		KSTAT_DATA_ULONG }
169 };
170 
171 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
172 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
173 
/*
 * Number of MAXBSIZE-sized map slots in the segment.
 */
#define	MAP_PAGES(seg)		((seg)->s_size >> MAXBSHIFT)

/*
 * Index of the map slot that contains addr, relative to the segment base.
 */
#define	MAP_PAGE(seg, addr)	(((addr) - (seg)->s_base) >> MAXBSHIFT)

/*
 * Pointer to the struct smap that describes addr within seg.
 *
 * The whole expansion is parenthesized so that the result can be used
 * directly as a postfix operand, e.g. GET_SMAP(seg, addr)->sm_vp.
 */
#define	GET_SMAP(seg, addr)	\
	(&(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)]))

/*
 * Mask for one bit of the 16 bit per-slot bitmap; only the low four
 * bits of bitindex are used.
 */
#define	SMAP_BIT_MASK(bitindex)	(1 << ((bitindex) & 0xf))
194 
195 static int smd_colormsk = 0;
196 static int smd_ncolor = 0;
197 static int smd_nfree = 0;
198 static int smd_freemsk = 0;
199 #ifdef DEBUG
200 static int *colors_used;
201 #endif
202 static struct smap *smd_smap;
203 static struct smaphash *smd_hash;
204 #ifdef SEGMAP_HASHSTATS
205 static unsigned int *smd_hash_len;
206 #endif
207 static struct smfree *smd_free;
208 static ulong_t smd_hashmsk = 0;
209 
210 #define	SEGMAP_MAXCOLOR		2
211 #define	SEGMAP_CACHE_PAD	64
212 
213 union segmap_cpu {
214 	struct {
215 		uint32_t	scpu_free_ndx[SEGMAP_MAXCOLOR];
216 		struct smap	*scpu_last_smap;
217 		ulong_t		scpu_getmap;
218 		ulong_t		scpu_release;
219 		ulong_t		scpu_get_reclaim;
220 		ulong_t		scpu_fault;
221 		ulong_t		scpu_pagecreate;
222 		ulong_t		scpu_get_reuse;
223 	} scpu;
224 	char	scpu_pad[SEGMAP_CACHE_PAD];
225 };
226 static union segmap_cpu *smd_cpu;
227 
228 /*
229  * There are three locks in seg_map:
230  *	- per freelist mutexes
231  *	- per hashchain mutexes
232  *	- per smap mutexes
233  *
234  * The lock ordering is to get the smap mutex to lock down the slot
235  * first then the hash lock (for hash in/out (vp, off) list) or the
236  * freelist lock to put the slot back on the free list.
237  *
238  * The hash search is done by only holding the hashchain lock, when a wanted
239  * slot is found, we drop the hashchain lock then lock the slot so there
240  * is no overlapping of hashchain and smap locks. After the slot is
241  * locked, we verify again if the slot is still what we are looking
242  * for.
243  *
244  * Allocation of a free slot is done by holding the freelist lock,
245  * then locking the smap slot at the head of the freelist. This is
246  * in reversed lock order so mutex_tryenter() is used.
247  *
248  * The smap lock protects all fields in smap structure except for
249  * the link fields for hash/free lists which are protected by
250  * hashchain and freelist locks.
251  */
252 
/*
 * Lock and list lookup helpers.  Every macro argument is parenthesized
 * so that callers may pass arbitrary expressions (pointer arithmetic,
 * conditional expressions, ...) without precedence surprises.
 */

/* Mutex protecting hash chain number hashid. */
#define	SHASHMTX(hashid)	(&smd_hash[(hashid)].sh_mtx)

/* Freelist header, and freelist index, that an smap slot belongs to. */
#define	SMP2SMF(smp)		(&smd_free[((smp) - smd_smap) & smd_freemsk])
#define	SMP2SMF_NDX(smp)	((ushort_t)(((smp) - smd_smap) & smd_freemsk))

/* Mutex protecting an individual smap slot. */
#define	SMAPMTX(smp)		(&(smp)->sm_mtx)

/*
 * Compute the hash chain index for (vp, off) and assign it to hashid.
 * This expands to a brace-enclosed block, so use it as a statement.
 */
#define	SMAP_HASHFUNC(vp, off, hashid) \
	{ \
	(hashid) = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
		((off) >> MAXBSHIFT)) & smd_hashmsk); \
	}
265 
266 /*
267  * The most frequently updated kstat counters are kept in the
268  * per cpu array to avoid hot cache blocks. The update function
269  * sums the cpu local counters to update the global counters.
270  */
271 
272 /* ARGSUSED */
273 int
274 segmap_kstat_update(kstat_t *ksp, int rw)
275 {
276 	int i;
277 	ulong_t	getmap, release, get_reclaim;
278 	ulong_t	fault, pagecreate, get_reuse;
279 
280 	if (rw == KSTAT_WRITE)
281 		return (EACCES);
282 	getmap = release = get_reclaim = (ulong_t)0;
283 	fault = pagecreate = get_reuse = (ulong_t)0;
284 	for (i = 0; i < max_ncpus; i++) {
285 		getmap += smd_cpu[i].scpu.scpu_getmap;
286 		release  += smd_cpu[i].scpu.scpu_release;
287 		get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
288 		fault  += smd_cpu[i].scpu.scpu_fault;
289 		pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
290 		get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
291 	}
292 	segmapcnt.smp_getmap.value.ul = getmap;
293 	segmapcnt.smp_release.value.ul = release;
294 	segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
295 	segmapcnt.smp_fault.value.ul = fault;
296 	segmapcnt.smp_pagecreate.value.ul = pagecreate;
297 	segmapcnt.smp_get_reuse.value.ul = get_reuse;
298 	return (0);
299 }
300 
301 int
302 segmap_create(struct seg *seg, void *argsp)
303 {
304 	struct segmap_data *smd;
305 	struct smap *smp;
306 	struct smfree *sm;
307 	struct segmap_crargs *a = (struct segmap_crargs *)argsp;
308 	struct smaphash *shashp;
309 	union segmap_cpu *scpu;
310 	long i, npages;
311 	size_t hashsz;
312 	uint_t nfreelist;
313 	extern void prefetch_smap_w(void *);
314 	extern int max_ncpus;
315 
316 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
317 
318 	if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
319 		panic("segkmap not MAXBSIZE aligned");
320 		/*NOTREACHED*/
321 	}
322 
323 	smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
324 
325 	seg->s_data = (void *)smd;
326 	seg->s_ops = &segmap_ops;
327 	smd->smd_prot = a->prot;
328 
329 	/*
330 	 * Scale the number of smap freelists to be
331 	 * proportional to max_ncpus * number of virtual colors.
332 	 * The caller can over-ride this scaling by providing
333 	 * a non-zero a->nfreelist argument.
334 	 */
335 	nfreelist = a->nfreelist;
336 	if (nfreelist == 0)
337 		nfreelist = max_ncpus;
338 	else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
339 		cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
340 		"%d, using %d", nfreelist, max_ncpus);
341 		nfreelist = max_ncpus;
342 	}
343 	if (nfreelist & (nfreelist - 1)) {
344 		/* round up nfreelist to the next power of two. */
345 		nfreelist = 1 << (highbit(nfreelist));
346 	}
347 
348 	/*
349 	 * Get the number of virtual colors - must be a power of 2.
350 	 */
351 	if (a->shmsize)
352 		smd_ncolor = a->shmsize >> MAXBSHIFT;
353 	else
354 		smd_ncolor = 1;
355 	ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
356 	ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
357 	smd_colormsk = smd_ncolor - 1;
358 	smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
359 	smd_freemsk = smd_nfree - 1;
360 
361 	/*
362 	 * Allocate and initialize the freelist headers.
363 	 * Note that sm_freeq[1] starts out as the release queue. This
364 	 * is known when the smap structures are initialized below.
365 	 */
366 	smd_free = smd->smd_free =
367 	    kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
368 	for (i = 0; i < smd_nfree; i++) {
369 		sm = &smd->smd_free[i];
370 		mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
371 		mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
372 		sm->sm_allocq = &sm->sm_freeq[0];
373 		sm->sm_releq = &sm->sm_freeq[1];
374 	}
375 
376 	/*
377 	 * Allocate and initialize the smap hash chain headers.
378 	 * Compute hash size rounding down to the next power of two.
379 	 */
380 	npages = MAP_PAGES(seg);
381 	smd->smd_npages = npages;
382 	hashsz = npages / SMAP_HASHAVELEN;
383 	hashsz = 1 << (highbit(hashsz)-1);
384 	smd_hashmsk = hashsz - 1;
385 	smd_hash = smd->smd_hash =
386 	    kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
387 #ifdef SEGMAP_HASHSTATS
388 	smd_hash_len =
389 	    kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
390 #endif
391 	for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
392 		shashp->sh_hash_list = NULL;
393 		mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
394 	}
395 
396 	/*
397 	 * Allocate and initialize the smap structures.
398 	 * Link all slots onto the appropriate freelist.
399 	 * The smap array is large enough to affect boot time
400 	 * on large systems, so use memory prefetching and only
401 	 * go through the array 1 time. Inline a optimized version
402 	 * of segmap_smapadd to add structures to freelists with
403 	 * knowledge that no locks are needed here.
404 	 */
405 	smd_smap = smd->smd_sm =
406 		kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
407 
408 	for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
409 	    smp >= smd->smd_sm; smp--) {
410 		struct smap *smpfreelist;
411 		struct sm_freeq *releq;
412 
413 		prefetch_smap_w((char *)smp);
414 
415 		smp->sm_vp = NULL;
416 		smp->sm_hash = NULL;
417 		smp->sm_off = 0;
418 		smp->sm_bitmap = 0;
419 		smp->sm_refcnt = 0;
420 		mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
421 		smp->sm_free_ndx = SMP2SMF_NDX(smp);
422 
423 		sm = SMP2SMF(smp);
424 		releq = sm->sm_releq;
425 
426 		smpfreelist = releq->smq_free;
427 		if (smpfreelist == 0) {
428 			releq->smq_free = smp->sm_next = smp->sm_prev = smp;
429 		} else {
430 			smp->sm_next = smpfreelist;
431 			smp->sm_prev = smpfreelist->sm_prev;
432 			smpfreelist->sm_prev = smp;
433 			smp->sm_prev->sm_next = smp;
434 			releq->smq_free = smp->sm_next;
435 		}
436 
437 		/*
438 		 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
439 		 */
440 		smp->sm_flags = 0;
441 
442 #ifdef	SEGKPM_SUPPORT
443 		/*
444 		 * Due to the fragile prefetch loop no
445 		 * separate function is used here.
446 		 */
447 		smp->sm_kpme_next = NULL;
448 		smp->sm_kpme_prev = NULL;
449 		smp->sm_kpme_page = NULL;
450 #endif
451 	}
452 
453 	/*
454 	 * Allocate the per color indices that distribute allocation
455 	 * requests over the free lists. Each cpu will have a private
456 	 * rotor index to spread the allocations even across the available
457 	 * smap freelists. Init the scpu_last_smap field to the first
458 	 * smap element so there is no need to check for NULL.
459 	 */
460 	smd_cpu =
461 		kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
462 	for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
463 		int j;
464 		for (j = 0; j < smd_ncolor; j++)
465 			scpu->scpu.scpu_free_ndx[j] = j;
466 		scpu->scpu.scpu_last_smap = smd_smap;
467 	}
468 
469 #ifdef DEBUG
470 	/*
471 	 * Keep track of which colors are used more often.
472 	 */
473 	colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
474 #endif /* DEBUG */
475 
476 	return (0);
477 }
478 
479 static void
480 segmap_free(seg)
481 	struct seg *seg;
482 {
483 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
484 }
485 
486 /*
487  * Do a F_SOFTUNLOCK call over the range requested.
488  * The range must have already been F_SOFTLOCK'ed.
489  */
490 static void
491 segmap_unlock(
492 	struct hat *hat,
493 	struct seg *seg,
494 	caddr_t addr,
495 	size_t len,
496 	enum seg_rw rw,
497 	struct smap *smp)
498 {
499 	page_t *pp;
500 	caddr_t adr;
501 	u_offset_t off;
502 	struct vnode *vp;
503 	kmutex_t *smtx;
504 
505 	ASSERT(smp->sm_refcnt > 0);
506 
507 #ifdef lint
508 	seg = seg;
509 #endif
510 
511 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
512 
513 		/*
514 		 * We're called only from segmap_fault and this was a
515 		 * NOP in case of a kpm based smap, so dangerous things
516 		 * must have happened in the meantime. Pages are prefaulted
517 		 * and locked in segmap_getmapflt and they will not be
518 		 * unlocked until segmap_release.
519 		 */
520 		panic("segmap_unlock: called with kpm addr %p", (void *)addr);
521 		/*NOTREACHED*/
522 	}
523 
524 	vp = smp->sm_vp;
525 	off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
526 
527 	hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
528 	for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
529 		ushort_t bitmask;
530 
531 		/*
532 		 * Use page_find() instead of page_lookup() to
533 		 * find the page since we know that it has
534 		 * "shared" lock.
535 		 */
536 		pp = page_find(vp, off);
537 		if (pp == NULL) {
538 			panic("segmap_unlock: page not found");
539 			/*NOTREACHED*/
540 		}
541 
542 		if (rw == S_WRITE) {
543 			hat_setrefmod(pp);
544 		} else if (rw != S_OTHER) {
545 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
546 				"segmap_fault:pp %p vp %p offset %llx",
547 				pp, vp, off);
548 			hat_setref(pp);
549 		}
550 
551 		/*
552 		 * Clear bitmap, if the bit corresponding to "off" is set,
553 		 * since the page and translation are being unlocked.
554 		 */
555 		bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
556 
557 		/*
558 		 * Large Files: Following assertion is to verify
559 		 * the correctness of the cast to (int) above.
560 		 */
561 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
562 		smtx = SMAPMTX(smp);
563 		mutex_enter(smtx);
564 		if (smp->sm_bitmap & bitmask) {
565 			smp->sm_bitmap &= ~bitmask;
566 		}
567 		mutex_exit(smtx);
568 
569 		page_unlock(pp);
570 	}
571 }
572 
573 #define	MAXPPB	(MAXBSIZE/4096)	/* assumes minimum page size of 4k */
574 
575 /*
576  * This routine is called via a machine specific fault handling
577  * routine.  It is also called by software routines wishing to
578  * lock or unlock a range of addresses.
579  *
580  * Note that this routine expects a page-aligned "addr".
581  */
582 faultcode_t
583 segmap_fault(
584 	struct hat *hat,
585 	struct seg *seg,
586 	caddr_t addr,
587 	size_t len,
588 	enum fault_type type,
589 	enum seg_rw rw)
590 {
591 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
592 	struct smap *smp;
593 	page_t *pp, **ppp;
594 	struct vnode *vp;
595 	u_offset_t off;
596 	page_t *pl[MAXPPB + 1];
597 	uint_t prot;
598 	u_offset_t addroff;
599 	caddr_t adr;
600 	int err;
601 	u_offset_t sm_off;
602 	int hat_flag;
603 
604 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
605 		int newpage;
606 		kmutex_t *smtx;
607 
608 		/*
609 		 * Pages are successfully prefaulted and locked in
610 		 * segmap_getmapflt and can't be unlocked until
611 		 * segmap_release. No hat mappings have to be locked
612 		 * and they also can't be unlocked as long as the
613 		 * caller owns an active kpm addr.
614 		 */
615 #ifndef DEBUG
616 		if (type != F_SOFTUNLOCK)
617 			return (0);
618 #endif
619 
620 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
621 			panic("segmap_fault: smap not found "
622 			    "for addr %p", (void *)addr);
623 			/*NOTREACHED*/
624 		}
625 
626 		smtx = SMAPMTX(smp);
627 #ifdef	DEBUG
628 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
629 		if (newpage) {
630 			cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
631 				(void *)smp);
632 		}
633 
634 		if (type != F_SOFTUNLOCK) {
635 			mutex_exit(smtx);
636 			return (0);
637 		}
638 #endif
639 		mutex_exit(smtx);
640 		vp = smp->sm_vp;
641 		sm_off = smp->sm_off;
642 
643 		if (vp == NULL)
644 			return (FC_MAKE_ERR(EIO));
645 
646 		ASSERT(smp->sm_refcnt > 0);
647 
648 		addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
649 		if (addroff + len > MAXBSIZE)
650 			panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
651 			    (void *)(addr + len));
652 
653 		off = sm_off + addroff;
654 
655 		pp = page_find(vp, off);
656 
657 		if (pp == NULL)
658 			panic("segmap_fault: softunlock page not found");
659 
660 		/*
661 		 * Set ref bit also here in case of S_OTHER to avoid the
662 		 * overhead of supporting other cases than F_SOFTUNLOCK
663 		 * with segkpm. We can do this because the underlying
664 		 * pages are locked anyway.
665 		 */
666 		if (rw == S_WRITE) {
667 			hat_setrefmod(pp);
668 		} else {
669 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
670 				"segmap_fault:pp %p vp %p offset %llx",
671 				pp, vp, off);
672 			hat_setref(pp);
673 		}
674 
675 		return (0);
676 	}
677 
678 	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
679 	smp = GET_SMAP(seg, addr);
680 	vp = smp->sm_vp;
681 	sm_off = smp->sm_off;
682 
683 	if (vp == NULL)
684 		return (FC_MAKE_ERR(EIO));
685 
686 	ASSERT(smp->sm_refcnt > 0);
687 
688 	addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
689 	if (addroff + len > MAXBSIZE) {
690 		panic("segmap_fault: endaddr %p "
691 		    "exceeds MAXBSIZE chunk", (void *)(addr + len));
692 		/*NOTREACHED*/
693 	}
694 	off = sm_off + addroff;
695 
696 	/*
697 	 * First handle the easy stuff
698 	 */
699 	if (type == F_SOFTUNLOCK) {
700 		segmap_unlock(hat, seg, addr, len, rw, smp);
701 		return (0);
702 	}
703 
704 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
705 		"segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
706 	err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
707 	    seg, addr, rw, CRED());
708 
709 	if (err)
710 		return (FC_MAKE_ERR(err));
711 
712 	prot &= smd->smd_prot;
713 
714 	/*
715 	 * Handle all pages returned in the pl[] array.
716 	 * This loop is coded on the assumption that if
717 	 * there was no error from the VOP_GETPAGE routine,
718 	 * that the page list returned will contain all the
719 	 * needed pages for the vp from [off..off + len].
720 	 */
721 	ppp = pl;
722 	while ((pp = *ppp++) != NULL) {
723 		u_offset_t poff;
724 		ASSERT(pp->p_vnode == vp);
725 		hat_flag = HAT_LOAD;
726 
727 		/*
728 		 * Verify that the pages returned are within the range
729 		 * of this segmap region.  Note that it is theoretically
730 		 * possible for pages outside this range to be returned,
731 		 * but it is not very likely.  If we cannot use the
732 		 * page here, just release it and go on to the next one.
733 		 */
734 		if (pp->p_offset < sm_off ||
735 		    pp->p_offset >= sm_off + MAXBSIZE) {
736 			(void) page_release(pp, 1);
737 			continue;
738 		}
739 
740 		ASSERT(hat == kas.a_hat);
741 		poff = pp->p_offset;
742 		adr = addr + (poff - off);
743 		if (adr >= addr && adr < addr + len) {
744 			hat_setref(pp);
745 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
746 			    "segmap_fault:pp %p vp %p offset %llx",
747 			    pp, vp, poff);
748 			if (type == F_SOFTLOCK)
749 				hat_flag = HAT_LOAD_LOCK;
750 		}
751 
752 		/*
753 		 * Deal with VMODSORT pages here. If we know this is a write
754 		 * do the setmod now and allow write protection.
755 		 * As long as it's modified or not S_OTHER, remove write
756 		 * protection. With S_OTHER it's up to the FS to deal with this.
757 		 */
758 		if (IS_VMODSORT(vp)) {
759 			if (rw == S_WRITE)
760 				hat_setmod(pp);
761 			else if (rw != S_OTHER && !hat_ismod(pp))
762 				prot &= ~PROT_WRITE;
763 		}
764 
765 		hat_memload(hat, adr, pp, prot, hat_flag);
766 		if (hat_flag != HAT_LOAD_LOCK)
767 			page_unlock(pp);
768 	}
769 	return (0);
770 }
771 
772 /*
773  * This routine is used to start I/O on pages asynchronously.
774  */
775 static faultcode_t
776 segmap_faulta(struct seg *seg, caddr_t addr)
777 {
778 	struct smap *smp;
779 	struct vnode *vp;
780 	u_offset_t off;
781 	int err;
782 
783 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
784 		int	newpage;
785 		kmutex_t *smtx;
786 
787 		/*
788 		 * Pages are successfully prefaulted and locked in
789 		 * segmap_getmapflt and can't be unlocked until
790 		 * segmap_release. No hat mappings have to be locked
791 		 * and they also can't be unlocked as long as the
792 		 * caller owns an active kpm addr.
793 		 */
794 #ifdef	DEBUG
795 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
796 			panic("segmap_faulta: smap not found "
797 			    "for addr %p", (void *)addr);
798 			/*NOTREACHED*/
799 		}
800 
801 		smtx = SMAPMTX(smp);
802 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
803 		mutex_exit(smtx);
804 		if (newpage)
805 			cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
806 			    (void *)smp);
807 #endif
808 		return (0);
809 	}
810 
811 	segmapcnt.smp_faulta.value.ul++;
812 	smp = GET_SMAP(seg, addr);
813 
814 	ASSERT(smp->sm_refcnt > 0);
815 
816 	vp = smp->sm_vp;
817 	off = smp->sm_off;
818 
819 	if (vp == NULL) {
820 		cmn_err(CE_WARN, "segmap_faulta - no vp");
821 		return (FC_MAKE_ERR(EIO));
822 	}
823 
824 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
825 		"segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
826 
827 	err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
828 	    & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
829 	    seg, addr, S_READ, CRED());
830 
831 	if (err)
832 		return (FC_MAKE_ERR(err));
833 	return (0);
834 }
835 
836 /*ARGSUSED*/
837 static int
838 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
839 {
840 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
841 
842 	ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
843 
844 	/*
845 	 * Need not acquire the segment lock since
846 	 * "smd_prot" is a read-only field.
847 	 */
848 	return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
849 }
850 
851 static int
852 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
853 {
854 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
855 	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
856 
857 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
858 
859 	if (pgno != 0) {
860 		do
861 			protv[--pgno] = smd->smd_prot;
862 		while (pgno != 0);
863 	}
864 	return (0);
865 }
866 
867 static u_offset_t
868 segmap_getoffset(struct seg *seg, caddr_t addr)
869 {
870 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
871 
872 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
873 
874 	return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
875 }
876 
877 /*ARGSUSED*/
878 static int
879 segmap_gettype(struct seg *seg, caddr_t addr)
880 {
881 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
882 
883 	return (MAP_SHARED);
884 }
885 
886 /*ARGSUSED*/
887 static int
888 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
889 {
890 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
891 
892 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
893 
894 	/* XXX - This doesn't make any sense */
895 	*vpp = smd->smd_sm->sm_vp;
896 	return (0);
897 }
898 
899 /*
900  * Check to see if it makes sense to do kluster/read ahead to
901  * addr + delta relative to the mapping at addr.  We assume here
902  * that delta is a signed PAGESIZE'd multiple (which can be negative).
903  *
904  * For segmap we always "approve" of this action from our standpoint.
905  */
906 /*ARGSUSED*/
907 static int
908 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
909 {
910 	return (0);
911 }
912 
/*
 * Handler for the segment operations that segmap does not support
 * (see SEGMAP_BADOP).  Calling any of them is fatal.
 */
static void
segmap_badop(void)
{
	panic("segmap_badop");
	/*NOTREACHED*/
}
919 
920 /*
921  * Special private segmap operations
922  */
923 
924 /*
925  * Add smap to the appropriate free list.
926  */
927 static void
928 segmap_smapadd(struct smap *smp)
929 {
930 	struct smfree *sm;
931 	struct smap *smpfreelist;
932 	struct sm_freeq *releq;
933 
934 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
935 
936 	if (smp->sm_refcnt != 0) {
937 		panic("segmap_smapadd");
938 		/*NOTREACHED*/
939 	}
940 
941 	sm = &smd_free[smp->sm_free_ndx];
942 	/*
943 	 * Add to the tail of the release queue
944 	 * Note that sm_releq and sm_allocq could toggle
945 	 * before we get the lock. This does not affect
946 	 * correctness as the 2 queues are only maintained
947 	 * to reduce lock pressure.
948 	 */
949 	releq = sm->sm_releq;
950 	if (releq == &sm->sm_freeq[0])
951 		smp->sm_flags |= SM_QNDX_ZERO;
952 	else
953 		smp->sm_flags &= ~SM_QNDX_ZERO;
954 	mutex_enter(&releq->smq_mtx);
955 	smpfreelist = releq->smq_free;
956 	if (smpfreelist == 0) {
957 		int want;
958 
959 		releq->smq_free = smp->sm_next = smp->sm_prev = smp;
960 		/*
961 		 * Both queue mutexes held to set sm_want;
962 		 * snapshot the value before dropping releq mutex.
963 		 * If sm_want appears after the releq mutex is dropped,
964 		 * then the smap just freed is already gone.
965 		 */
966 		want = sm->sm_want;
967 		mutex_exit(&releq->smq_mtx);
968 		/*
969 		 * See if there was a waiter before dropping the releq mutex
970 		 * then recheck after obtaining sm_freeq[0] mutex as
971 		 * the another thread may have already signaled.
972 		 */
973 		if (want) {
974 			mutex_enter(&sm->sm_freeq[0].smq_mtx);
975 			if (sm->sm_want)
976 				cv_signal(&sm->sm_free_cv);
977 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
978 		}
979 	} else {
980 		smp->sm_next = smpfreelist;
981 		smp->sm_prev = smpfreelist->sm_prev;
982 		smpfreelist->sm_prev = smp;
983 		smp->sm_prev->sm_next = smp;
984 		mutex_exit(&releq->smq_mtx);
985 	}
986 }
987 
988 
989 static struct smap *
990 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
991 {
992 	struct smap **hpp;
993 	struct smap *tmp;
994 	kmutex_t *hmtx;
995 
996 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
997 	ASSERT(smp->sm_vp == NULL);
998 	ASSERT(smp->sm_hash == NULL);
999 	ASSERT(smp->sm_prev == NULL);
1000 	ASSERT(smp->sm_next == NULL);
1001 	ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
1002 
1003 	hmtx = SHASHMTX(hashid);
1004 
1005 	mutex_enter(hmtx);
1006 	/*
1007 	 * First we need to verify that no one has created a smp
1008 	 * with (vp,off) as its tag before we us.
1009 	 */
1010 	for (tmp = smd_hash[hashid].sh_hash_list;
1011 	    tmp != NULL; tmp = tmp->sm_hash)
1012 		if (tmp->sm_vp == vp && tmp->sm_off == off)
1013 			break;
1014 
1015 	if (tmp == NULL) {
1016 		/*
1017 		 * No one created one yet.
1018 		 *
1019 		 * Funniness here - we don't increment the ref count on the
1020 		 * vnode * even though we have another pointer to it here.
1021 		 * The reason for this is that we don't want the fact that
1022 		 * a seg_map entry somewhere refers to a vnode to prevent the
1023 		 * vnode * itself from going away.  This is because this
1024 		 * reference to the vnode is a "soft one".  In the case where
1025 		 * a mapping is being used by a rdwr [or directory routine?]
1026 		 * there already has to be a non-zero ref count on the vnode.
1027 		 * In the case where the vp has been freed and the the smap
1028 		 * structure is on the free list, there are no pages in memory
1029 		 * that can refer to the vnode.  Thus even if we reuse the same
1030 		 * vnode/smap structure for a vnode which has the same
1031 		 * address but represents a different object, we are ok.
1032 		 */
1033 		smp->sm_vp = vp;
1034 		smp->sm_off = off;
1035 
1036 		hpp = &smd_hash[hashid].sh_hash_list;
1037 		smp->sm_hash = *hpp;
1038 		*hpp = smp;
1039 #ifdef SEGMAP_HASHSTATS
1040 		smd_hash_len[hashid]++;
1041 #endif
1042 	}
1043 	mutex_exit(hmtx);
1044 
1045 	return (tmp);
1046 }
1047 
1048 static void
1049 segmap_hashout(struct smap *smp)
1050 {
1051 	struct smap **hpp, *hp;
1052 	struct vnode *vp;
1053 	kmutex_t *mtx;
1054 	int hashid;
1055 	u_offset_t off;
1056 
1057 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1058 
1059 	vp = smp->sm_vp;
1060 	off = smp->sm_off;
1061 
1062 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
1063 	mtx = SHASHMTX(hashid);
1064 	mutex_enter(mtx);
1065 
1066 	hpp = &smd_hash[hashid].sh_hash_list;
1067 	for (;;) {
1068 		hp = *hpp;
1069 		if (hp == NULL) {
1070 			panic("segmap_hashout");
1071 			/*NOTREACHED*/
1072 		}
1073 		if (hp == smp)
1074 			break;
1075 		hpp = &hp->sm_hash;
1076 	}
1077 
1078 	*hpp = smp->sm_hash;
1079 	smp->sm_hash = NULL;
1080 #ifdef SEGMAP_HASHSTATS
1081 	smd_hash_len[hashid]--;
1082 #endif
1083 	mutex_exit(mtx);
1084 
1085 	smp->sm_vp = NULL;
1086 	smp->sm_off = (u_offset_t)0;
1087 
1088 }
1089 
1090 /*
1091  * Attempt to free unmodified, unmapped, and non locked segmap
1092  * pages.
1093  */
1094 void
1095 segmap_pagefree(struct vnode *vp, u_offset_t off)
1096 {
1097 	u_offset_t pgoff;
1098 	page_t  *pp;
1099 
1100 	for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1101 
1102 		if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1103 			continue;
1104 
1105 		switch (page_release(pp, 1)) {
1106 		case PGREL_NOTREL:
1107 			segmapcnt.smp_free_notfree.value.ul++;
1108 			break;
1109 		case PGREL_MOD:
1110 			segmapcnt.smp_free_dirty.value.ul++;
1111 			break;
1112 		case PGREL_CLEAN:
1113 			segmapcnt.smp_free.value.ul++;
1114 			break;
1115 		}
1116 	}
1117 }
1118 
1119 /*
1120  * Locks held on entry: smap lock
1121  * Locks held on exit : smap lock.
1122  */
1123 
1124 static void
1125 grab_smp(struct smap *smp, page_t *pp)
1126 {
1127 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1128 	ASSERT(smp->sm_refcnt == 0);
1129 
1130 	if (smp->sm_vp != (struct vnode *)NULL) {
1131 		struct vnode	*vp = smp->sm_vp;
1132 		u_offset_t 	off = smp->sm_off;
1133 		/*
1134 		 * Destroy old vnode association and
1135 		 * unload any hardware translations to
1136 		 * the old object.
1137 		 */
1138 		smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1139 		segmap_hashout(smp);
1140 
1141 		/*
1142 		 * This node is off freelist and hashlist,
1143 		 * so there is no reason to drop/reacquire sm_mtx
1144 		 * across calls to hat_unload.
1145 		 */
1146 		if (segmap_kpm) {
1147 			caddr_t vaddr;
1148 			int hat_unload_needed = 0;
1149 
1150 			/*
1151 			 * unload kpm mapping
1152 			 */
1153 			if (pp != NULL) {
1154 				vaddr = hat_kpm_page2va(pp, 1);
1155 				hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1156 				page_unlock(pp);
1157 			}
1158 
1159 			/*
1160 			 * Check if we have (also) the rare case of a
1161 			 * non kpm mapping.
1162 			 */
1163 			if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1164 				hat_unload_needed = 1;
1165 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1166 			}
1167 
1168 			if (hat_unload_needed) {
1169 				hat_unload(kas.a_hat, segkmap->s_base +
1170 				    ((smp - smd_smap) * MAXBSIZE),
1171 				    MAXBSIZE, HAT_UNLOAD);
1172 			}
1173 
1174 		} else {
1175 			ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1176 			smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1177 			hat_unload(kas.a_hat, segkmap->s_base +
1178 			    ((smp - smd_smap) * MAXBSIZE),
1179 			    MAXBSIZE, HAT_UNLOAD);
1180 		}
1181 		segmap_pagefree(vp, off);
1182 	}
1183 }
1184 
/*
 * Take an smap off the free lists and return it with its smap mutex
 * held (see the ASSERTs just before the return).
 *
 * free_ndx indexes smd_free[] and names the first freelist to try.  Each
 * freelist has an alloc queue and a release queue:
 *  - if the alloc queue is empty but the release queue is not, the two
 *    are swapped and the search restarts;
 *  - if both are empty, the next list (free_ndx + smd_ncolor, masked by
 *    smd_freemsk) is tried; once the index wraps back to the starting
 *    one, the thread sleeps on sm_free_cv until woken and then retries.
 *
 * The alloc queue is walked as a circular list through sm_next.  An smap
 * is skipped when its mutex cannot be taken with mutex_tryenter(), or,
 * with segmap_kpm and a tagged smap, when the page recorded in its kpme
 * cannot be locked SE_SHARED without waiting.  The latter sets
 * page_locked, which causes a delay(hz >> 2) after the next queue flip.
 *
 * The chosen smap is unlinked from the queue and passed to grab_smp(),
 * which also unlocks the page if one was locked here.
 */
1185 static struct smap *
1186 get_free_smp(int free_ndx)
1187 {
1188 	struct smfree *sm;
1189 	kmutex_t *smtx;
1190 	struct smap *smp, *first;
1191 	struct sm_freeq *allocq, *releq;
1192 	struct kpme *kpme;
1193 	page_t *pp = NULL;
1194 	int end_ndx, page_locked = 0;
1195 
1196 	end_ndx = free_ndx;
1197 	sm = &smd_free[free_ndx];
1198 
1199 retry_queue:
1200 	allocq = sm->sm_allocq;
1201 	mutex_enter(&allocq->smq_mtx);
1202 
1203 	if ((smp = allocq->smq_free) == NULL) {
1204 
1205 skip_queue:
1206 		/*
1207 		 * The alloc list is empty or this queue is being skipped;
1208 		 * first see if the allocq toggled.
1209 		 */
1210 		if (sm->sm_allocq != allocq) {
1211 			/* queue changed */
1212 			mutex_exit(&allocq->smq_mtx);
1213 			goto retry_queue;
1214 		}
1215 		releq = sm->sm_releq;
1216 		if (!mutex_tryenter(&releq->smq_mtx)) {
1217 			/* cannot get releq; a free smp may be there now */
1218 			mutex_exit(&allocq->smq_mtx);
1219 
1220 			/*
1221 			 * This loop could spin forever if this thread has
1222 			 * higher priority than the thread that is holding
1223 			 * releq->smq_mtx. In order to force the other thread
1224 			 * to run, we'll lock/unlock the mutex which is safe
1225 			 * since we just unlocked the allocq mutex.
1226 			 */
1227 			mutex_enter(&releq->smq_mtx);
1228 			mutex_exit(&releq->smq_mtx);
1229 			goto retry_queue;
1230 		}
1231 		if (releq->smq_free == NULL) {
1232 			/*
1233 			 * This freelist is empty.
1234 			 * This should not happen unless clients
1235 			 * are failing to release the segmap
1236 			 * window after accessing the data.
1237 			 * Before resorting to sleeping, try
1238 			 * the next list of the same color.
1239 			 */
1240 			free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1241 			if (free_ndx != end_ndx) {
1242 				mutex_exit(&releq->smq_mtx);
1243 				mutex_exit(&allocq->smq_mtx);
1244 				sm = &smd_free[free_ndx];
1245 				goto retry_queue;
1246 			}
1247 			/*
1248 			 * Tried all freelists of the same color once,
1249 			 * wait on this list and hope something gets freed.
1250 			 */
			/*
			 * Both queue mutexes are held at this point.
			 * Presumably allocq/releq are sm_freeq[0] and
			 * sm_freeq[1] in some order, so dropping [1] and
			 * sleeping on [0] releases both - TODO confirm.
			 */
1251 			segmapcnt.smp_get_nofree.value.ul++;
1252 			sm->sm_want++;
1253 			mutex_exit(&sm->sm_freeq[1].smq_mtx);
1254 			cv_wait(&sm->sm_free_cv,
1255 				&sm->sm_freeq[0].smq_mtx);
1256 			sm->sm_want--;
1257 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
			/* Restart the scan from the list we began with. */
1258 			sm = &smd_free[free_ndx];
1259 			goto retry_queue;
1260 		} else {
1261 			/*
1262 			 * Something on the rele queue; flip the alloc
1263 			 * and rele queues and retry.
1264 			 */
1265 			sm->sm_allocq = releq;
1266 			sm->sm_releq = allocq;
1267 			mutex_exit(&allocq->smq_mtx);
1268 			mutex_exit(&releq->smq_mtx);
			/*
			 * A candidate was skipped earlier because its page
			 * could not be locked; pause before retrying.
			 */
1269 			if (page_locked) {
1270 				delay(hz >> 2);
1271 				page_locked = 0;
1272 			}
1273 			goto retry_queue;
1274 		}
1275 	} else {
1276 		/*
1277 		 * Fastpath the case we get the smap mutex
1278 		 * on the first try.
1279 		 */
1280 		first = smp;
1281 next_smap:
1282 		smtx = SMAPMTX(smp);
1283 		if (!mutex_tryenter(smtx)) {
1284 			/*
1285 			 * Another thread is trying to reclaim this slot.
1286 			 * Skip to the next queue or smap.
1287 			 */
1288 			if ((smp = smp->sm_next) == first) {
1289 				goto skip_queue;
1290 			} else {
1291 				goto next_smap;
1292 			}
1293 		} else {
1294 			/*
1295 			 * if kpme exists, get shared lock on the page
1296 			 */
1297 			if (segmap_kpm && smp->sm_vp != NULL) {
1298 
1299 				kpme = GET_KPME(smp);
1300 				pp = kpme->kpe_page;
1301 
1302 				if (pp != NULL) {
1303 					if (!page_trylock(pp, SE_SHARED)) {
1304 						smp = smp->sm_next;
1305 						mutex_exit(smtx);
1306 						page_locked = 1;
1307 
1308 						pp = NULL;
1309 
1310 						if (smp == first) {
1311 							goto skip_queue;
1312 						} else {
1313 							goto next_smap;
1314 						}
1315 					} else {
						/*
						 * Re-check after locking: if
						 * the kpme no longer records
						 * a page, drop the lock.
						 */
1316 						if (kpme->kpe_page == NULL) {
1317 							page_unlock(pp);
1318 							pp = NULL;
1319 						}
1320 					}
1321 				}
1322 			}
1323 
1324 			/*
1325 			 * At this point, we've selected smp.  Remove smp
1326 			 * from its freelist.  If smp is the first one in
1327 			 * the freelist, update the head of the freelist.
1328 			 */
1329 			if (first == smp) {
1330 				ASSERT(first == allocq->smq_free);
1331 				allocq->smq_free = smp->sm_next;
1332 			}
1333 
1334 			/*
1335 			 * if the head of the freelist still points to smp,
1336 			 * then there are no more free smaps in that list.
1337 			 */
1338 			if (allocq->smq_free == smp)
1339 				/*
1340 				 * Took the last one
1341 				 */
1342 				allocq->smq_free = NULL;
1343 			else {
1344 				smp->sm_prev->sm_next = smp->sm_next;
1345 				smp->sm_next->sm_prev = smp->sm_prev;
1346 			}
1347 			mutex_exit(&allocq->smq_mtx);
1348 			smp->sm_prev = smp->sm_next = NULL;
1349 
1350 			/*
1351 			 * if pp != NULL, pp must have been locked;
1352 			 * grab_smp() unlocks pp.
1353 			 */
1354 			ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1355 			grab_smp(smp, pp);
1356 			/* return smp locked. */
1357 			ASSERT(SMAPMTX(smp) == smtx);
1358 			ASSERT(MUTEX_HELD(smtx));
1359 			return (smp);
1360 		}
1361 	}
1362 }
1363 
1364 /*
1365  * Special public segmap operations
1366  */
1367 
1368 /*
1369  * Create pages (without using VOP_GETPAGE) and load up tranlations to them.
1370  * If softlock is TRUE, then set things up so that it looks like a call
1371  * to segmap_fault with F_SOFTLOCK.
1372  *
1373  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1374  *
1375  * All fields in the generic segment (struct seg) are considered to be
1376  * read-only for "segmap" even though the kernel address space (kas) may
1377  * not be locked, hence no lock is needed to access them.
1378  */
1379 int
1380 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1381 {
1382 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1383 	page_t *pp;
1384 	u_offset_t off;
1385 	struct smap *smp;
1386 	struct vnode *vp;
1387 	caddr_t eaddr;
1388 	int newpage = 0;
1389 	uint_t prot;
1390 	kmutex_t *smtx;
1391 	int hat_flag;
1392 
1393 	ASSERT(seg->s_as == &kas);
1394 
1395 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1396 		/*
1397 		 * Pages are successfully prefaulted and locked in
1398 		 * segmap_getmapflt and can't be unlocked until
1399 		 * segmap_release. The SM_KPM_NEWPAGE flag is set
1400 		 * in segmap_pagecreate_kpm when new pages are created.
1401 		 * and it is returned as "newpage" indication here.
1402 		 */
1403 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1404 			panic("segmap_pagecreate: smap not found "
1405 			    "for addr %p", (void *)addr);
1406 			/*NOTREACHED*/
1407 		}
1408 
1409 		smtx = SMAPMTX(smp);
1410 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1411 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
1412 		mutex_exit(smtx);
1413 
1414 		return (newpage);
1415 	}
1416 
1417 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1418 
1419 	eaddr = addr + len;
1420 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1421 
1422 	smp = GET_SMAP(seg, addr);
1423 
1424 	/*
1425 	 * We don't grab smp mutex here since we assume the smp
1426 	 * has a refcnt set already which prevents the slot from
1427 	 * changing its id.
1428 	 */
1429 	ASSERT(smp->sm_refcnt > 0);
1430 
1431 	vp = smp->sm_vp;
1432 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1433 	prot = smd->smd_prot;
1434 
1435 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1436 		hat_flag = HAT_LOAD;
1437 		pp = page_lookup(vp, off, SE_SHARED);
1438 		if (pp == NULL) {
1439 			ushort_t bitindex;
1440 
1441 			if ((pp = page_create_va(vp, off,
1442 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1443 				panic("segmap_pagecreate: page_create failed");
1444 				/*NOTREACHED*/
1445 			}
1446 			newpage = 1;
1447 			page_io_unlock(pp);
1448 
1449 			/*
1450 			 * Since pages created here do not contain valid
1451 			 * data until the caller writes into them, the
1452 			 * "exclusive" lock will not be dropped to prevent
1453 			 * other users from accessing the page.  We also
1454 			 * have to lock the translation to prevent a fault
1455 			 * from occuring when the virtual address mapped by
1456 			 * this page is written into.  This is necessary to
1457 			 * avoid a deadlock since we haven't dropped the
1458 			 * "exclusive" lock.
1459 			 */
1460 			bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1461 
1462 			/*
1463 			 * Large Files: The following assertion is to
1464 			 * verify the cast above.
1465 			 */
1466 			ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1467 			smtx = SMAPMTX(smp);
1468 			mutex_enter(smtx);
1469 			smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1470 			mutex_exit(smtx);
1471 
1472 			hat_flag = HAT_LOAD_LOCK;
1473 		} else if (softlock) {
1474 			hat_flag = HAT_LOAD_LOCK;
1475 		}
1476 
1477 		if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1478 			hat_setmod(pp);
1479 
1480 		hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1481 
1482 		if (hat_flag != HAT_LOAD_LOCK)
1483 			page_unlock(pp);
1484 
1485 		TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1486 		    "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1487 		    seg, addr, pp, vp, off);
1488 	}
1489 
1490 	return (newpage);
1491 }
1492 
1493 void
1494 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1495 {
1496 	struct smap	*smp;
1497 	ushort_t	bitmask;
1498 	page_t		*pp;
1499 	struct	vnode	*vp;
1500 	u_offset_t	off;
1501 	caddr_t		eaddr;
1502 	kmutex_t	*smtx;
1503 
1504 	ASSERT(seg->s_as == &kas);
1505 
1506 	eaddr = addr + len;
1507 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1508 
1509 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1510 		/*
1511 		 * Pages are successfully prefaulted and locked in
1512 		 * segmap_getmapflt and can't be unlocked until
1513 		 * segmap_release, so no pages or hat mappings have
1514 		 * to be unlocked at this point.
1515 		 */
1516 #ifdef DEBUG
1517 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1518 			panic("segmap_pageunlock: smap not found "
1519 			    "for addr %p", (void *)addr);
1520 			/*NOTREACHED*/
1521 		}
1522 
1523 		ASSERT(smp->sm_refcnt > 0);
1524 		mutex_exit(SMAPMTX(smp));
1525 #endif
1526 		return;
1527 	}
1528 
1529 	smp = GET_SMAP(seg, addr);
1530 	smtx = SMAPMTX(smp);
1531 
1532 	ASSERT(smp->sm_refcnt > 0);
1533 
1534 	vp = smp->sm_vp;
1535 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1536 
1537 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1538 		bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1539 
1540 		/*
1541 		 * Large Files: Following assertion is to verify
1542 		 * the correctness of the cast to (int) above.
1543 		 */
1544 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1545 
1546 		/*
1547 		 * If the bit corresponding to "off" is set,
1548 		 * clear this bit in the bitmap, unlock translations,
1549 		 * and release the "exclusive" lock on the page.
1550 		 */
1551 		if (smp->sm_bitmap & bitmask) {
1552 			mutex_enter(smtx);
1553 			smp->sm_bitmap &= ~bitmask;
1554 			mutex_exit(smtx);
1555 
1556 			hat_unlock(kas.a_hat, addr, PAGESIZE);
1557 
1558 			/*
1559 			 * Use page_find() instead of page_lookup() to
1560 			 * find the page since we know that it has
1561 			 * "exclusive" lock.
1562 			 */
1563 			pp = page_find(vp, off);
1564 			if (pp == NULL) {
1565 				panic("segmap_pageunlock: page not found");
1566 				/*NOTREACHED*/
1567 			}
1568 			if (rw == S_WRITE) {
1569 				hat_setrefmod(pp);
1570 			} else if (rw != S_OTHER) {
1571 				hat_setref(pp);
1572 			}
1573 
1574 			page_unlock(pp);
1575 		}
1576 	}
1577 }
1578 
/*
 * Convenience form of segmap_getmapflt(): requests a full MAXBSIZE
 * window for (vp, off) with forcefault 0 and rw S_OTHER, and returns
 * whatever address segmap_getmapflt() returns.
 *
 * Because segmap_getmapflt() panics when off + len extends past the
 * MAXBSIZE window containing off, and len is MAXBSIZE here, off has to
 * be MAXBSIZE aligned when this entry point is used.
 */
1579 caddr_t
1580 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1581 {
1582 	return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1583 }
1584 
1585 /*
1586  * This is the magic virtual address that offset 0 of an ELF
1587  * file gets mapped to in user space. This is used to pick
1588  * the vac color on the freelist.
1589  */
/*
 * NOTE(review): no use of ELF_OFFZERO_VA is visible in the functions
 * around this definition; confirm it is still referenced elsewhere.
 */
1590 #define	ELF_OFFZERO_VA	(0x10000)
1591 /*
1592  * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp
1593  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
1594  * The return address is  always MAXBSIZE aligned.
1595  *
1596  * If forcefault is nonzero and the MMU translations haven't yet been created,
1597  * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
1598  */
/*
 * Arguments:
 *	seg		must be segkmap, in the kernel address space (ASSERTed).
 *	vp		vnode to map; a VBLK vnode is replaced by
 *			common_specvp(vp).
 *	off, len	range to map; it must not extend past the MAXBSIZE
 *			window containing off, otherwise the routine panics.
 *	forcefault	besides zero/nonzero, the values SM_PAGECREATE and
 *			SM_LOCKPROTO are treated specially below.
 *	rw		access type; in kpm mode S_READ/S_WRITE are recorded
 *			in sm_flags as SM_READ_DATA/SM_WRITE_DATA.
 *
 * An existing smap tagged (vp, baseoff) is reused (and pulled off its
 * freelist if unreferenced); otherwise a free one is obtained with
 * get_free_smp() and entered into the hash.  The smap reference count is
 * incremented in either case.
 *
 * Returns either a segkpm address for the page or the address of the
 * smap's window within seg.  There is no error return; on a VOP_GETPAGE
 * failure in the kpm path the segkmap window is used instead.
 */
1599 caddr_t
1600 segmap_getmapflt(
1601 	struct seg *seg,
1602 	struct vnode *vp,
1603 	u_offset_t off,
1604 	size_t len,
1605 	int forcefault,
1606 	enum seg_rw rw)
1607 {
1608 	struct smap *smp, *nsmp;
1609 	extern struct vnode *common_specvp();
1610 	caddr_t baseaddr;			/* MAXBSIZE aligned */
1611 	u_offset_t baseoff;
1612 	int newslot;
1613 	caddr_t vaddr;
1614 	int color, hashid;
1615 	kmutex_t *hashmtx, *smapmtx;
1616 	struct smfree *sm;
1617 	page_t	*pp;
1618 	struct kpme *kpme;
1619 	uint_t	prot;
1620 	caddr_t base;
1621 	page_t	*pl[MAXPPB + 1];
1622 	int	error;
1623 	int	is_kpm = 1;
1624 
1625 	ASSERT(seg->s_as == &kas);
1626 	ASSERT(seg == segkmap);
1627 
1628 	baseoff = off & (offset_t)MAXBMASK;
1629 	if (off + len > baseoff + MAXBSIZE) {
1630 		panic("segmap_getmap bad len");
1631 		/*NOTREACHED*/
1632 	}
1633 
1634 	/*
1635 	 * If this is a block device we have to be sure to use the
1636 	 * "common" block device vnode for the mapping.
1637 	 */
1638 	if (vp->v_type == VBLK)
1639 		vp = common_specvp(vp);
1640 
1641 	smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1642 
	/*
	 * The segkpm path is not used when segmap_kpm is off, nor for
	 * SM_PAGECREATE requests that are not writes.
	 */
1643 	if (segmap_kpm == 0 ||
1644 	    (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1645 		is_kpm = 0;
1646 	}
1647 
	/*
	 * NOTE(review): the hash is computed from off, while the chain is
	 * searched for baseoff and segmap_hashout() hashes sm_off.
	 * Presumably SMAP_HASHFUNC ignores the offset bits below MAXBSIZE
	 * so both give the same bucket - confirm against the macro.
	 */
1648 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
1649 	hashmtx = SHASHMTX(hashid);
1650 
1651 retry_hash:
1652 	mutex_enter(hashmtx);
1653 	for (smp = smd_hash[hashid].sh_hash_list;
1654 	    smp != NULL; smp = smp->sm_hash)
1655 		if (smp->sm_vp == vp && smp->sm_off == baseoff)
1656 			break;
1657 	mutex_exit(hashmtx);
1658 
1659 vrfy_smp:
1660 	if (smp != NULL) {
1661 
1662 		ASSERT(vp->v_count != 0);
1663 
1664 		/*
1665 		 * Get smap lock and recheck its tag. The hash lock
1666 		 * is dropped since the hash is based on (vp, off)
1667 		 * and (vp, off) won't change when we have smap mtx.
1668 		 */
1669 		smapmtx = SMAPMTX(smp);
1670 		mutex_enter(smapmtx);
1671 		if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1672 			mutex_exit(smapmtx);
1673 			goto retry_hash;
1674 		}
1675 
1676 		if (smp->sm_refcnt == 0) {
1677 
1678 			smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1679 
1680 			/*
1681 			 * Could still be on the free list. However, this
1682 			 * could also be an smp that is transitioning from
1683 			 * the free list when we have too much contention
1684 			 * for the smapmtx's. In this case, we have an
1685 			 * unlocked smp that is not on the free list any
1686 			 * longer, but still has a 0 refcnt.  The only way
1687 			 * to be sure is to check the freelist pointers.
1688 			 * Since we now have the smapmtx, we are guaranteed
1689 			 * that the (vp, off) won't change, so we are safe
1690 			 * to reclaim it.  get_free_smp() knows that this
1691 			 * can happen, and it will check the refcnt.
1692 			 */
			/*
			 * NOTE(review): the only refcnt check visible on
			 * the get_free_smp() path is the ASSERT in
			 * grab_smp(); confirm the statement above still
			 * holds.
			 */
1693 
1694 			if ((smp->sm_next != NULL)) {
1695 				struct sm_freeq *freeq;
1696 
1697 				ASSERT(smp->sm_prev != NULL);
1698 				sm = &smd_free[smp->sm_free_ndx];
1699 
1700 				if (smp->sm_flags & SM_QNDX_ZERO)
1701 					freeq = &sm->sm_freeq[0];
1702 				else
1703 					freeq = &sm->sm_freeq[1];
1704 
1705 				mutex_enter(&freeq->smq_mtx);
1706 				if (freeq->smq_free != smp) {
1707 					/*
1708 					 * fastpath normal case
1709 					 */
1710 					smp->sm_prev->sm_next = smp->sm_next;
1711 					smp->sm_next->sm_prev = smp->sm_prev;
1712 				} else if (smp == smp->sm_next) {
1713 					/*
1714 					 * Taking the last smap on freelist
1715 					 */
1716 					freeq->smq_free = NULL;
1717 				} else {
1718 					/*
1719 					 * Reclaiming 1st smap on list
1720 					 */
1721 					freeq->smq_free = smp->sm_next;
1722 					smp->sm_prev->sm_next = smp->sm_next;
1723 					smp->sm_next->sm_prev = smp->sm_prev;
1724 				}
1725 				mutex_exit(&freeq->smq_mtx);
1726 				smp->sm_prev = smp->sm_next = NULL;
1727 			} else {
1728 				ASSERT(smp->sm_prev == NULL);
1729 				segmapcnt.smp_stolen.value.ul++;
1730 			}
1731 
1732 		} else {
1733 			segmapcnt.smp_get_use.value.ul++;
1734 		}
1735 		smp->sm_refcnt++;		/* another user */
1736 
1737 		/*
1738 		 * We don't invoke segmap_fault via TLB miss, so we set ref
1739 		 * and mod bits in advance. For S_OTHER  we set them in
1740 		 * segmap_fault F_SOFTUNLOCK.
1741 		 */
1742 		if (is_kpm) {
1743 			if (rw == S_WRITE) {
1744 				smp->sm_flags |= SM_WRITE_DATA;
1745 			} else if (rw == S_READ) {
1746 				smp->sm_flags |= SM_READ_DATA;
1747 			}
1748 		}
1749 		mutex_exit(smapmtx);
1750 
1751 		newslot = 0;
1752 	} else {
1753 
1754 		uint32_t free_ndx, *free_ndxp;
1755 		union segmap_cpu *scpu;
1756 
1757 		/*
1758 		 * On a PAC machine or a machine with anti-alias
1759 		 * hardware, smd_colormsk will be zero.
1760 		 *
1761 		 * On a VAC machine- pick color by offset in the file
1762 		 * so we won't get VAC conflicts on elf files.
1763 		 * On data files, color does not matter but we
1764 		 * don't know what kind of file it is so we always
1765 		 * pick color by offset. This causes color
1766 		 * corresponding to file offset zero to be used more
1767 		 * heavily.
1768 		 */
1769 		color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1770 		scpu = smd_cpu+CPU->cpu_seqid;
1771 		free_ndxp = &scpu->scpu.scpu_free_ndx[color];
		/*
		 * Advance this CPU's per-color freelist index by
		 * smd_ncolor on every allocation.
		 */
1772 		free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1773 #ifdef DEBUG
1774 		colors_used[free_ndx]++;
1775 #endif /* DEBUG */
1776 
1777 		/*
1778 		 * Get a locked smp slot from the free list.
1779 		 */
1780 		smp = get_free_smp(free_ndx);
1781 		smapmtx = SMAPMTX(smp);
1782 
1783 		ASSERT(smp->sm_vp == NULL);
1784 
1785 		if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1786 			/*
1787 			 * Failed to hashin, there exists one now.
1788 			 * Return the smp we just allocated.
1789 			 */
1790 			segmap_smapadd(smp);
1791 			mutex_exit(smapmtx);
1792 
1793 			smp = nsmp;
1794 			goto vrfy_smp;
1795 		}
1796 		smp->sm_refcnt++;		/* another user */
1797 
1798 		/*
1799 		 * We don't invoke segmap_fault via TLB miss, so we set ref
1800 		 * and mod bits in advance. For S_OTHER  we set them in
1801 		 * segmap_fault F_SOFTUNLOCK.
1802 		 */
1803 		if (is_kpm) {
1804 			if (rw == S_WRITE) {
1805 				smp->sm_flags |= SM_WRITE_DATA;
1806 			} else if (rw == S_READ) {
1807 				smp->sm_flags |= SM_READ_DATA;
1808 			}
1809 		}
1810 		mutex_exit(smapmtx);
1811 
1812 		newslot = 1;
1813 	}
1814 
1815 	if (!is_kpm)
1816 		goto use_segmap_range;
1817 
1818 	/*
1819 	 * Use segkpm
1820 	 */
1821 	ASSERT(PAGESIZE == MAXBSIZE);
1822 
1823 	/*
1824 	 * remember the last smp faulted on this cpu.
1825 	 */
1826 	(smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1827 
1828 	if (forcefault == SM_PAGECREATE) {
1829 		baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1830 		return (baseaddr);
1831 	}
1832 
	/*
	 * Reused slot whose kpme already records a page: try to lock
	 * that page directly instead of going through VOP_GETPAGE.
	 */
1833 	if (newslot == 0 &&
1834 	    (pp = GET_KPME(smp)->kpe_page) != NULL) {
1835 
1836 		/* fastpath */
1837 		switch (rw) {
1838 		case S_READ:
1839 		case S_WRITE:
1840 			if (page_trylock(pp, SE_SHARED)) {
				/*
				 * The page may have been freed or given a
				 * new identity before we locked it; if so
				 * look it up again by (vp, baseoff).
				 */
1841 				if (PP_ISFREE(pp) ||
1842 				    !(pp->p_vnode == vp &&
1843 				    pp->p_offset == baseoff)) {
1844 					page_unlock(pp);
1845 					pp = page_lookup(vp, baseoff,
1846 						SE_SHARED);
1847 				}
1848 			} else {
1849 				pp = page_lookup(vp, baseoff, SE_SHARED);
1850 			}
1851 
1852 			if (pp == NULL) {
1853 				ASSERT(GET_KPME(smp)->kpe_page == NULL);
1854 				break;
1855 			}
1856 
			/*
			 * For writes the fastpath is only taken when the
			 * page already has both P_MOD and P_REF set;
			 * otherwise fall through to VOP_GETPAGE below.
			 */
1857 			if (rw == S_WRITE &&
1858 			    hat_page_getattr(pp, P_MOD | P_REF) !=
1859 			    (P_MOD | P_REF)) {
1860 				page_unlock(pp);
1861 				break;
1862 			}
1863 
1864 			/*
1865 			 * We have the p_selock as reader, grab_smp
1866 			 * can't hit us, we have bumped the smap
1867 			 * refcnt and hat_pageunload needs the
1868 			 * p_selock exclusive.
1869 			 */
1870 			kpme = GET_KPME(smp);
1871 			if (kpme->kpe_page == pp) {
1872 				baseaddr = hat_kpm_page2va(pp, 0);
1873 			} else if (kpme->kpe_page == NULL) {
1874 				baseaddr = hat_kpm_mapin(pp, kpme);
1875 			} else {
1876 				panic("segmap_getmapflt: stale "
1877 				    "kpme page, kpme %p", (void *)kpme);
1878 				/*NOTREACHED*/
1879 			}
1880 
1881 			/*
1882 			 * We don't invoke segmap_fault via TLB miss,
1883 			 * so we set ref and mod bits in advance.
1884 			 * For S_OTHER we set them in segmap_fault
1885 			 * F_SOFTUNLOCK.
1886 			 */
1887 			if (rw == S_READ && !hat_isref(pp))
1888 				hat_setref(pp);
1889 
1890 			return (baseaddr);
1891 		default:
1892 			break;
1893 		}
1894 	}
1895 
1896 	base = segkpm_create_va(baseoff);
1897 	error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1898 	    seg, base, rw, CRED());
1899 
	/*
	 * NOTE(review): pl[0] is read before error is tested; pl is not
	 * initialized in this function, so this relies on the value being
	 * ignored when error is set.
	 */
1900 	pp = pl[0];
1901 	if (error || pp == NULL) {
1902 		/*
1903 		 * Use segmap address slot and let segmap_fault deal
1904 		 * with the error cases. There is no error return
1905 		 * possible here.
1906 		 */
1907 		goto use_segmap_range;
1908 	}
1909 
1910 	ASSERT(pl[1] == NULL);
1911 
1912 	/*
1913 	 * When prot is not returned w/ PROT_ALL the returned pages
1914 	 * are not backed by fs blocks. For most of the segmap users
1915 	 * this is no problem, they don't write to the pages in the
1916 	 * same request and therefore don't rely on a following
1917 	 * trap driven segmap_fault. With SM_LOCKPROTO users it
1918 	 * is more secure to use segkmap addresses to allow
1919 	 * protection segmap_fault's.
1920 	 */
1921 	if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1922 		/*
1923 		 * Use segmap address slot and let segmap_fault
1924 		 * do the error return.
1925 		 */
1926 		ASSERT(rw != S_WRITE);
1927 		ASSERT(PAGE_LOCKED(pp));
1928 		page_unlock(pp);
1929 		forcefault = 0;
1930 		goto use_segmap_range;
1931 	}
1932 
1933 	/*
1934 	 * We have the p_selock as reader, grab_smp can't hit us, we
1935 	 * have bumped the smap refcnt and hat_pageunload needs the
1936 	 * p_selock exclusive.
1937 	 */
1938 	kpme = GET_KPME(smp);
1939 	if (kpme->kpe_page == pp) {
1940 		baseaddr = hat_kpm_page2va(pp, 0);
1941 	} else if (kpme->kpe_page == NULL) {
1942 		baseaddr = hat_kpm_mapin(pp, kpme);
1943 	} else {
1944 		panic("segmap_getmapflt: stale kpme page after "
1945 		    "VOP_GETPAGE, kpme %p", (void *)kpme);
1946 		/*NOTREACHED*/
1947 	}
1948 
1949 	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1950 
1951 	return (baseaddr);
1952 
1953 
1954 use_segmap_range:
	/* The window address follows from the smap's index in smd_smap. */
1955 	baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1956 	TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1957 	    "segmap_getmap:seg %p addr %p vp %p offset %llx",
1958 	    seg, baseaddr, vp, baseoff);
1959 
1960 	/*
1961 	 * Prefault the translations
1962 	 */
1963 	vaddr = baseaddr + (off - baseoff);
1964 	if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1965 
1966 		caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1967 		    (uintptr_t)PAGEMASK);
1968 
		/*
		 * Fault in whole pages covering <vaddr, vaddr + len);
		 * the result of segmap_fault() is deliberately ignored.
		 */
1969 		(void) segmap_fault(kas.a_hat, seg, pgaddr,
1970 		    (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1971 		    F_INVAL, rw);
1972 	}
1973 
1974 	return (baseaddr);
1975 }
1976 
1977 int
1978 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1979 {
1980 	struct smap	*smp;
1981 	int 		error;
1982 	int		bflags = 0;
1983 	struct vnode	*vp;
1984 	u_offset_t	offset;
1985 	kmutex_t	*smtx;
1986 	int		is_kpm = 0;
1987 	page_t		*pp;
1988 
1989 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1990 
1991 		if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1992 			panic("segmap_release: addr %p not "
1993 			    "MAXBSIZE aligned", (void *)addr);
1994 			/*NOTREACHED*/
1995 		}
1996 
1997 		if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1998 			panic("segmap_release: smap not found "
1999 			    "for addr %p", (void *)addr);
2000 			/*NOTREACHED*/
2001 		}
2002 
2003 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2004 			"segmap_relmap:seg %p addr %p smp %p",
2005 			seg, addr, smp);
2006 
2007 		smtx = SMAPMTX(smp);
2008 
2009 		/*
2010 		 * For compatibilty reasons segmap_pagecreate_kpm sets this
2011 		 * flag to allow a following segmap_pagecreate to return
2012 		 * this as "newpage" flag. When segmap_pagecreate is not
2013 		 * called at all we clear it now.
2014 		 */
2015 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
2016 		is_kpm = 1;
2017 		if (smp->sm_flags & SM_WRITE_DATA) {
2018 			hat_setrefmod(pp);
2019 		} else if (smp->sm_flags & SM_READ_DATA) {
2020 			hat_setref(pp);
2021 		}
2022 	} else {
2023 		if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2024 		    ((uintptr_t)addr & MAXBOFFSET) != 0) {
2025 			panic("segmap_release: bad addr %p", (void *)addr);
2026 			/*NOTREACHED*/
2027 		}
2028 		smp = GET_SMAP(seg, addr);
2029 
2030 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2031 			"segmap_relmap:seg %p addr %p smp %p",
2032 			seg, addr, smp);
2033 
2034 		smtx = SMAPMTX(smp);
2035 		mutex_enter(smtx);
2036 		smp->sm_flags |= SM_NOTKPM_RELEASED;
2037 	}
2038 
2039 	ASSERT(smp->sm_refcnt > 0);
2040 
2041 	/*
2042 	 * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2043 	 * are set.
2044 	 */
2045 	if ((flags & ~SM_DONTNEED) != 0) {
2046 		if (flags & SM_WRITE)
2047 			segmapcnt.smp_rel_write.value.ul++;
2048 		if (flags & SM_ASYNC) {
2049 			bflags |= B_ASYNC;
2050 			segmapcnt.smp_rel_async.value.ul++;
2051 		}
2052 		if (flags & SM_INVAL) {
2053 			bflags |= B_INVAL;
2054 			segmapcnt.smp_rel_abort.value.ul++;
2055 		}
2056 		if (flags & SM_DESTROY) {
2057 			bflags |= (B_INVAL|B_TRUNC);
2058 			segmapcnt.smp_rel_abort.value.ul++;
2059 		}
2060 		if (smp->sm_refcnt == 1) {
2061 			/*
2062 			 * We only bother doing the FREE and DONTNEED flags
2063 			 * if no one else is still referencing this mapping.
2064 			 */
2065 			if (flags & SM_FREE) {
2066 				bflags |= B_FREE;
2067 				segmapcnt.smp_rel_free.value.ul++;
2068 			}
2069 			if (flags & SM_DONTNEED) {
2070 				bflags |= B_DONTNEED;
2071 				segmapcnt.smp_rel_dontneed.value.ul++;
2072 			}
2073 		}
2074 	} else {
2075 		smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2076 	}
2077 
2078 	vp = smp->sm_vp;
2079 	offset = smp->sm_off;
2080 
2081 	if (--smp->sm_refcnt == 0) {
2082 
2083 		smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2084 
2085 		if (flags & (SM_INVAL|SM_DESTROY)) {
2086 			segmap_hashout(smp);	/* remove map info */
2087 			if (is_kpm) {
2088 				hat_kpm_mapout(pp, GET_KPME(smp), addr);
2089 				if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2090 					smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2091 					hat_unload(kas.a_hat, addr, MAXBSIZE,
2092 						HAT_UNLOAD);
2093 				}
2094 
2095 			} else {
2096 				if (segmap_kpm)
2097 					segkpm_mapout_validkpme(GET_KPME(smp));
2098 
2099 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2100 				hat_unload(kas.a_hat, addr, MAXBSIZE,
2101 					HAT_UNLOAD);
2102 			}
2103 		}
2104 		segmap_smapadd(smp);	/* add to free list */
2105 	}
2106 
2107 	mutex_exit(smtx);
2108 
2109 	if (is_kpm)
2110 		page_unlock(pp);
2111 	/*
2112 	 * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2113 	 * are set.
2114 	 */
2115 	if ((flags & ~SM_DONTNEED) != 0) {
2116 		error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2117 		    bflags, CRED());
2118 	} else {
2119 		error = 0;
2120 	}
2121 
2122 	return (error);
2123 }
2124 
2125 /*
2126  * Dump the pages belonging to this segmap segment.
2127  */
2128 static void
2129 segmap_dump(struct seg *seg)
2130 {
2131 	struct segmap_data *smd;
2132 	struct smap *smp, *smp_end;
2133 	page_t *pp;
2134 	pfn_t pfn;
2135 	u_offset_t off;
2136 	caddr_t addr;
2137 
2138 	smd = (struct segmap_data *)seg->s_data;
2139 	addr = seg->s_base;
2140 	for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2141 	    smp < smp_end; smp++) {
2142 
2143 		if (smp->sm_refcnt) {
2144 			for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2145 				int we_own_it = 0;
2146 
2147 				/*
2148 				 * If pp == NULL, the page either does
2149 				 * not exist or is exclusively locked.
2150 				 * So determine if it exists before
2151 				 * searching for it.
2152 				 */
2153 				if ((pp = page_lookup_nowait(smp->sm_vp,
2154 				    smp->sm_off + off, SE_SHARED)))
2155 					we_own_it = 1;
2156 				else
2157 					pp = page_exists(smp->sm_vp,
2158 					    smp->sm_off + off);
2159 
2160 				if (pp) {
2161 					pfn = page_pptonum(pp);
2162 					dump_addpage(seg->s_as,
2163 						addr + off, pfn);
2164 					if (we_own_it)
2165 						page_unlock(pp);
2166 				}
2167 				dump_timeleft = dump_timeout;
2168 			}
2169 		}
2170 		addr += MAXBSIZE;
2171 	}
2172 }
2173 
2174 /*ARGSUSED*/
2175 static int
2176 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2177     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2178 {
2179 	return (ENOTSUP);
2180 }
2181 
2182 static int
2183 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2184 {
2185 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2186 
2187 	memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2188 	memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2189 	return (0);
2190 }
2191 
2192 /*ARGSUSED*/
2193 static lgrp_mem_policy_info_t *
2194 segmap_getpolicy(struct seg *seg, caddr_t addr)
2195 {
2196 	return (NULL);
2197 }
2198 
2199 /*ARGSUSED*/
2200 static int
2201 segmap_capable(struct seg *seg, segcapability_t capability)
2202 {
2203 	return (0);
2204 }
2205 
2206 
2207 #ifdef	SEGKPM_SUPPORT
2208 
2209 /*
2210  * segkpm support routines
2211  */
2212 
2213 static caddr_t
2214 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2215 	struct smap *smp, enum seg_rw rw)
2216 {
2217 	caddr_t	base;
2218 	page_t	*pp;
2219 	int	newpage = 0;
2220 	struct kpme	*kpme;
2221 
2222 	ASSERT(smp->sm_refcnt > 0);
2223 
2224 	if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2225 		kmutex_t *smtx;
2226 
2227 		base = segkpm_create_va(off);
2228 
2229 		if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2230 		    seg, base)) == NULL) {
2231 			panic("segmap_pagecreate_kpm: "
2232 			    "page_create failed");
2233 			/*NOTREACHED*/
2234 		}
2235 
2236 		newpage = 1;
2237 		page_io_unlock(pp);
2238 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2239 
2240 		/*
2241 		 * Mark this here until the following segmap_pagecreate
2242 		 * or segmap_release.
2243 		 */
2244 		smtx = SMAPMTX(smp);
2245 		mutex_enter(smtx);
2246 		smp->sm_flags |= SM_KPM_NEWPAGE;
2247 		mutex_exit(smtx);
2248 	}
2249 
2250 	kpme = GET_KPME(smp);
2251 	if (!newpage && kpme->kpe_page == pp)
2252 		base = hat_kpm_page2va(pp, 0);
2253 	else
2254 		base = hat_kpm_mapin(pp, kpme);
2255 
2256 	/*
2257 	 * FS code may decide not to call segmap_pagecreate and we
2258 	 * don't invoke segmap_fault via TLB miss, so we have to set
2259 	 * ref and mod bits in advance.
2260 	 */
2261 	if (rw == S_WRITE) {
2262 		hat_setrefmod(pp);
2263 	} else {
2264 		ASSERT(rw == S_READ);
2265 		hat_setref(pp);
2266 	}
2267 
2268 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2269 
2270 	return (base);
2271 }
2272 
2273 /*
2274  * Find the smap structure corresponding to the
2275  * KPM addr and return it locked.
2276  */
2277 struct smap *
2278 get_smap_kpm(caddr_t addr, page_t **ppp)
2279 {
2280 	struct smap	*smp;
2281 	struct vnode	*vp;
2282 	u_offset_t	offset;
2283 	caddr_t		baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2284 	int		hashid;
2285 	kmutex_t	*hashmtx;
2286 	page_t		*pp;
2287 	union segmap_cpu *scpu;
2288 
2289 	pp = hat_kpm_vaddr2page(baseaddr);
2290 
2291 	ASSERT(pp && !PP_ISFREE(pp));
2292 	ASSERT(PAGE_LOCKED(pp));
2293 	ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2294 
2295 	vp = pp->p_vnode;
2296 	offset = pp->p_offset;
2297 	ASSERT(vp != NULL);
2298 
2299 	/*
2300 	 * Assume the last smap used on this cpu is the one needed.
2301 	 */
2302 	scpu = smd_cpu+CPU->cpu_seqid;
2303 	smp = scpu->scpu.scpu_last_smap;
2304 	mutex_enter(&smp->sm_mtx);
2305 	if (smp->sm_vp == vp && smp->sm_off == offset) {
2306 		ASSERT(smp->sm_refcnt > 0);
2307 	} else {
2308 		/*
2309 		 * Assumption wrong, find the smap on the hash chain.
2310 		 */
2311 		mutex_exit(&smp->sm_mtx);
2312 		SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2313 		hashmtx = SHASHMTX(hashid);
2314 
2315 		mutex_enter(hashmtx);
2316 		smp = smd_hash[hashid].sh_hash_list;
2317 		for (; smp != NULL; smp = smp->sm_hash) {
2318 			if (smp->sm_vp == vp && smp->sm_off == offset)
2319 				break;
2320 		}
2321 		mutex_exit(hashmtx);
2322 		if (smp) {
2323 			mutex_enter(&smp->sm_mtx);
2324 			ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2325 		}
2326 	}
2327 
2328 	if (ppp)
2329 		*ppp = smp ? pp : NULL;
2330 
2331 	return (smp);
2332 }
2333 
2334 #else	/* SEGKPM_SUPPORT */
2335 
2336 /* segkpm stubs */
2337 
2338 /*ARGSUSED*/
2339 static caddr_t
2340 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2341 	struct smap *smp, enum seg_rw rw)
2342 {
2343 	return (NULL);
2344 }
2345 
2346 /*ARGSUSED*/
2347 struct smap *
2348 get_smap_kpm(caddr_t addr, page_t **ppp)
2349 {
2350 	return (NULL);
2351 }
2352 
2353 #endif	/* SEGKPM_SUPPORT */
2354