xref: /titanic_52/usr/src/uts/common/vm/seg_map.c (revision f936286c99fb83153e4bfd870eb2830a990a82c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 /*
35  * VM - generic vnode mapping segment.
36  *
37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
38  * mappings [lower routine overhead; more persistent cache] to random
39  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
40  */
41 
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/sysmacros.h>
46 #include <sys/buf.h>
47 #include <sys/systm.h>
48 #include <sys/vnode.h>
49 #include <sys/mman.h>
50 #include <sys/errno.h>
51 #include <sys/cred.h>
52 #include <sys/kmem.h>
53 #include <sys/vtrace.h>
54 #include <sys/cmn_err.h>
55 #include <sys/debug.h>
56 #include <sys/thread.h>
57 #include <sys/dumphdr.h>
58 #include <sys/bitmap.h>
59 #include <sys/lgrp.h>
60 
61 #include <vm/seg_kmem.h>
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_kpm.h>
66 #include <vm/seg_map.h>
67 #include <vm/page.h>
68 #include <vm/pvn.h>
69 #include <vm/rm.h>
70 
71 /*
72  * Private seg op routines.
73  */
74 static void	segmap_free(struct seg *seg);
75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
76 			size_t len, enum fault_type type, enum seg_rw rw);
77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
78 static int	segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
79 			uint_t prot);
80 static int	segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
81 static int	segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
82 			uint_t *protv);
83 static u_offset_t	segmap_getoffset(struct seg *seg, caddr_t addr);
84 static int	segmap_gettype(struct seg *seg, caddr_t addr);
85 static int	segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
86 static void	segmap_dump(struct seg *seg);
87 static int	segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
88 			struct page ***ppp, enum lock_type type,
89 			enum seg_rw rw);
90 static void	segmap_badop(void);
91 static int	segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
92 static lgrp_mem_policy_info_t	*segmap_getpolicy(struct seg *seg,
93     caddr_t addr);
94 static int	segmap_capable(struct seg *seg, segcapability_t capability);
95 
96 /* segkpm support */
97 static caddr_t	segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
98 			struct smap *, enum seg_rw);
99 struct smap	*get_smap_kpm(caddr_t, page_t **);
100 
101 #define	SEGMAP_BADOP(t)	(t(*)())segmap_badop
102 
103 static struct seg_ops segmap_ops = {
104 	SEGMAP_BADOP(int),	/* dup */
105 	SEGMAP_BADOP(int),	/* unmap */
106 	segmap_free,
107 	segmap_fault,
108 	segmap_faulta,
109 	SEGMAP_BADOP(int),	/* setprot */
110 	segmap_checkprot,
111 	segmap_kluster,
112 	SEGMAP_BADOP(size_t),	/* swapout */
113 	SEGMAP_BADOP(int),	/* sync */
114 	SEGMAP_BADOP(size_t),	/* incore */
115 	SEGMAP_BADOP(int),	/* lockop */
116 	segmap_getprot,
117 	segmap_getoffset,
118 	segmap_gettype,
119 	segmap_getvp,
120 	SEGMAP_BADOP(int),	/* advise */
121 	segmap_dump,
122 	segmap_pagelock,	/* pagelock */
123 	SEGMAP_BADOP(int),	/* setpgsz */
124 	segmap_getmemid,	/* getmemid */
125 	segmap_getpolicy,	/* getpolicy */
126 	segmap_capable,		/* capable */
127 	seg_inherit_notsup	/* inherit */
128 };
129 
130 /*
131  * Private segmap routines.
132  */
133 static void	segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
134 			size_t len, enum seg_rw rw, struct smap *smp);
135 static void	segmap_smapadd(struct smap *smp);
136 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
137 			u_offset_t off, int hashid);
138 static void	segmap_hashout(struct smap *smp);
139 
140 
141 /*
142  * Statistics for segmap operations.
143  *
144  * No explicit locking to protect these stats.
145  */
146 struct segmapcnt segmapcnt = {
147 	{ "fault",		KSTAT_DATA_ULONG },
148 	{ "faulta",		KSTAT_DATA_ULONG },
149 	{ "getmap",		KSTAT_DATA_ULONG },
150 	{ "get_use",		KSTAT_DATA_ULONG },
151 	{ "get_reclaim",	KSTAT_DATA_ULONG },
152 	{ "get_reuse",		KSTAT_DATA_ULONG },
153 	{ "get_unused",		KSTAT_DATA_ULONG },
154 	{ "get_nofree",		KSTAT_DATA_ULONG },
155 	{ "rel_async",		KSTAT_DATA_ULONG },
156 	{ "rel_write",		KSTAT_DATA_ULONG },
157 	{ "rel_free",		KSTAT_DATA_ULONG },
158 	{ "rel_abort",		KSTAT_DATA_ULONG },
159 	{ "rel_dontneed",	KSTAT_DATA_ULONG },
160 	{ "release",		KSTAT_DATA_ULONG },
161 	{ "pagecreate",		KSTAT_DATA_ULONG },
162 	{ "free_notfree",	KSTAT_DATA_ULONG },
163 	{ "free_dirty",		KSTAT_DATA_ULONG },
164 	{ "free",		KSTAT_DATA_ULONG },
165 	{ "stolen",		KSTAT_DATA_ULONG },
166 	{ "get_nomtx",		KSTAT_DATA_ULONG }
167 };
168 
169 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
170 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
171 
/*
 * Number of MAXBSIZE-sized map slots ("map pages") in the segment.
 */
#define	MAP_PAGES(seg)		((seg)->s_size >> MAXBSHIFT)

/*
 * Index of the map slot that contains addr, counted from the segment
 * base.
 */
#define	MAP_PAGE(seg, addr)	(((addr) - (seg)->s_base) >> MAXBSHIFT)

/*
 * Pointer to the struct smap describing the slot that contains addr.
 * The whole expansion is parenthesized so the result can be used
 * directly in a larger expression, e.g. GET_SMAP(seg, addr)->sm_vp.
 */
#define	GET_SMAP(seg, addr)	\
	(&(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)]))

/*
 * Single-bit mask for a page index within a slot's 16 bit bitmap;
 * only the low four bits of bitindex are used.
 */
#define	SMAP_BIT_MASK(bitindex)	(1 << ((bitindex) & 0xf))
192 
193 static int smd_colormsk = 0;
194 static int smd_ncolor = 0;
195 static int smd_nfree = 0;
196 static int smd_freemsk = 0;
197 #ifdef DEBUG
198 static int *colors_used;
199 #endif
200 static struct smap *smd_smap;
201 static struct smaphash *smd_hash;
202 #ifdef SEGMAP_HASHSTATS
203 static unsigned int *smd_hash_len;
204 #endif
205 static struct smfree *smd_free;
206 static ulong_t smd_hashmsk = 0;
207 
208 #define	SEGMAP_MAXCOLOR		2
209 #define	SEGMAP_CACHE_PAD	64
210 
211 union segmap_cpu {
212 	struct {
213 		uint32_t	scpu_free_ndx[SEGMAP_MAXCOLOR];
214 		struct smap	*scpu_last_smap;
215 		ulong_t		scpu_getmap;
216 		ulong_t		scpu_release;
217 		ulong_t		scpu_get_reclaim;
218 		ulong_t		scpu_fault;
219 		ulong_t		scpu_pagecreate;
220 		ulong_t		scpu_get_reuse;
221 	} scpu;
222 	char	scpu_pad[SEGMAP_CACHE_PAD];
223 };
224 static union segmap_cpu *smd_cpu;
225 
226 /*
227  * There are three locks in seg_map:
228  *	- per freelist mutexes
229  *	- per hashchain mutexes
230  *	- per smap mutexes
231  *
232  * The lock ordering is to get the smap mutex to lock down the slot
233  * first then the hash lock (for hash in/out (vp, off) list) or the
234  * freelist lock to put the slot back on the free list.
235  *
236  * The hash search is done by only holding the hashchain lock, when a wanted
237  * slot is found, we drop the hashchain lock then lock the slot so there
238  * is no overlapping of hashchain and smap locks. After the slot is
239  * locked, we verify again if the slot is still what we are looking
240  * for.
241  *
242  * Allocation of a free slot is done by holding the freelist lock,
243  * then locking the smap slot at the head of the freelist. This is
244  * in reversed lock order so mutex_tryenter() is used.
245  *
246  * The smap lock protects all fields in smap structure except for
247  * the link fields for hash/free lists which are protected by
248  * hashchain and freelist locks.
249  */
250 
/*
 * Mutex protecting hash chain number hashid.
 */
#define	SHASHMTX(hashid)	(&smd_hash[(hashid)].sh_mtx)

/*
 * Free list header that an smap belongs to, and the index of that
 * header.  Both are derived from the slot's position in smd_smap
 * masked with smd_freemsk.
 */
#define	SMP2SMF(smp)		(&smd_free[((smp) - smd_smap) & smd_freemsk])
#define	SMP2SMF_NDX(smp)	((ushort_t)(((smp) - smd_smap) & smd_freemsk))

/*
 * Mutex embedded in an smap slot.
 */
#define	SMAPMTX(smp)		(&(smp)->sm_mtx)

/*
 * Hash a (vp, off) pair into a chain index and assign it to hashid.
 * The expansion is a braced block, so invoke it as a statement of its
 * own.
 */
#define	SMAP_HASHFUNC(vp, off, hashid) \
	{ \
	hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
		((off) >> MAXBSHIFT)) & smd_hashmsk); \
	}
263 
264 /*
265  * The most frequently updated kstat counters are kept in the
266  * per cpu array to avoid hot cache blocks. The update function
267  * sums the cpu local counters to update the global counters.
268  */
269 
270 /* ARGSUSED */
271 int
272 segmap_kstat_update(kstat_t *ksp, int rw)
273 {
274 	int i;
275 	ulong_t	getmap, release, get_reclaim;
276 	ulong_t	fault, pagecreate, get_reuse;
277 
278 	if (rw == KSTAT_WRITE)
279 		return (EACCES);
280 	getmap = release = get_reclaim = (ulong_t)0;
281 	fault = pagecreate = get_reuse = (ulong_t)0;
282 	for (i = 0; i < max_ncpus; i++) {
283 		getmap += smd_cpu[i].scpu.scpu_getmap;
284 		release  += smd_cpu[i].scpu.scpu_release;
285 		get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
286 		fault  += smd_cpu[i].scpu.scpu_fault;
287 		pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
288 		get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
289 	}
290 	segmapcnt.smp_getmap.value.ul = getmap;
291 	segmapcnt.smp_release.value.ul = release;
292 	segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
293 	segmapcnt.smp_fault.value.ul = fault;
294 	segmapcnt.smp_pagecreate.value.ul = pagecreate;
295 	segmapcnt.smp_get_reuse.value.ul = get_reuse;
296 	return (0);
297 }
298 
299 int
300 segmap_create(struct seg *seg, void *argsp)
301 {
302 	struct segmap_data *smd;
303 	struct smap *smp;
304 	struct smfree *sm;
305 	struct segmap_crargs *a = (struct segmap_crargs *)argsp;
306 	struct smaphash *shashp;
307 	union segmap_cpu *scpu;
308 	long i, npages;
309 	size_t hashsz;
310 	uint_t nfreelist;
311 	extern void prefetch_smap_w(void *);
312 	extern int max_ncpus;
313 
314 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
315 
316 	if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
317 		panic("segkmap not MAXBSIZE aligned");
318 		/*NOTREACHED*/
319 	}
320 
321 	smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
322 
323 	seg->s_data = (void *)smd;
324 	seg->s_ops = &segmap_ops;
325 	smd->smd_prot = a->prot;
326 
327 	/*
328 	 * Scale the number of smap freelists to be
329 	 * proportional to max_ncpus * number of virtual colors.
330 	 * The caller can over-ride this scaling by providing
331 	 * a non-zero a->nfreelist argument.
332 	 */
333 	nfreelist = a->nfreelist;
334 	if (nfreelist == 0)
335 		nfreelist = max_ncpus;
336 	else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
337 		cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
338 		"%d, using %d", nfreelist, max_ncpus);
339 		nfreelist = max_ncpus;
340 	}
341 	if (!ISP2(nfreelist)) {
342 		/* round up nfreelist to the next power of two. */
343 		nfreelist = 1 << (highbit(nfreelist));
344 	}
345 
346 	/*
347 	 * Get the number of virtual colors - must be a power of 2.
348 	 */
349 	if (a->shmsize)
350 		smd_ncolor = a->shmsize >> MAXBSHIFT;
351 	else
352 		smd_ncolor = 1;
353 	ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
354 	ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
355 	smd_colormsk = smd_ncolor - 1;
356 	smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
357 	smd_freemsk = smd_nfree - 1;
358 
359 	/*
360 	 * Allocate and initialize the freelist headers.
361 	 * Note that sm_freeq[1] starts out as the release queue. This
362 	 * is known when the smap structures are initialized below.
363 	 */
364 	smd_free = smd->smd_free =
365 	    kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
366 	for (i = 0; i < smd_nfree; i++) {
367 		sm = &smd->smd_free[i];
368 		mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
369 		mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
370 		sm->sm_allocq = &sm->sm_freeq[0];
371 		sm->sm_releq = &sm->sm_freeq[1];
372 	}
373 
374 	/*
375 	 * Allocate and initialize the smap hash chain headers.
376 	 * Compute hash size rounding down to the next power of two.
377 	 */
378 	npages = MAP_PAGES(seg);
379 	smd->smd_npages = npages;
380 	hashsz = npages / SMAP_HASHAVELEN;
381 	hashsz = 1 << (highbit(hashsz)-1);
382 	smd_hashmsk = hashsz - 1;
383 	smd_hash = smd->smd_hash =
384 	    kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
385 #ifdef SEGMAP_HASHSTATS
386 	smd_hash_len =
387 	    kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
388 #endif
389 	for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
390 		shashp->sh_hash_list = NULL;
391 		mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
392 	}
393 
394 	/*
395 	 * Allocate and initialize the smap structures.
396 	 * Link all slots onto the appropriate freelist.
397 	 * The smap array is large enough to affect boot time
398 	 * on large systems, so use memory prefetching and only
399 	 * go through the array 1 time. Inline a optimized version
400 	 * of segmap_smapadd to add structures to freelists with
401 	 * knowledge that no locks are needed here.
402 	 */
403 	smd_smap = smd->smd_sm =
404 	    kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
405 
406 	for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
407 	    smp >= smd->smd_sm; smp--) {
408 		struct smap *smpfreelist;
409 		struct sm_freeq *releq;
410 
411 		prefetch_smap_w((char *)smp);
412 
413 		smp->sm_vp = NULL;
414 		smp->sm_hash = NULL;
415 		smp->sm_off = 0;
416 		smp->sm_bitmap = 0;
417 		smp->sm_refcnt = 0;
418 		mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
419 		smp->sm_free_ndx = SMP2SMF_NDX(smp);
420 
421 		sm = SMP2SMF(smp);
422 		releq = sm->sm_releq;
423 
424 		smpfreelist = releq->smq_free;
425 		if (smpfreelist == 0) {
426 			releq->smq_free = smp->sm_next = smp->sm_prev = smp;
427 		} else {
428 			smp->sm_next = smpfreelist;
429 			smp->sm_prev = smpfreelist->sm_prev;
430 			smpfreelist->sm_prev = smp;
431 			smp->sm_prev->sm_next = smp;
432 			releq->smq_free = smp->sm_next;
433 		}
434 
435 		/*
436 		 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
437 		 */
438 		smp->sm_flags = 0;
439 
440 #ifdef	SEGKPM_SUPPORT
441 		/*
442 		 * Due to the fragile prefetch loop no
443 		 * separate function is used here.
444 		 */
445 		smp->sm_kpme_next = NULL;
446 		smp->sm_kpme_prev = NULL;
447 		smp->sm_kpme_page = NULL;
448 #endif
449 	}
450 
451 	/*
452 	 * Allocate the per color indices that distribute allocation
453 	 * requests over the free lists. Each cpu will have a private
454 	 * rotor index to spread the allocations even across the available
455 	 * smap freelists. Init the scpu_last_smap field to the first
456 	 * smap element so there is no need to check for NULL.
457 	 */
458 	smd_cpu =
459 	    kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
460 	for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
461 		int j;
462 		for (j = 0; j < smd_ncolor; j++)
463 			scpu->scpu.scpu_free_ndx[j] = j;
464 		scpu->scpu.scpu_last_smap = smd_smap;
465 	}
466 
467 	vpm_init();
468 
469 #ifdef DEBUG
470 	/*
471 	 * Keep track of which colors are used more often.
472 	 */
473 	colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
474 #endif /* DEBUG */
475 
476 	return (0);
477 }
478 
479 static void
480 segmap_free(seg)
481 	struct seg *seg;
482 {
483 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
484 }
485 
486 /*
487  * Do a F_SOFTUNLOCK call over the range requested.
488  * The range must have already been F_SOFTLOCK'ed.
489  */
490 static void
491 segmap_unlock(
492 	struct hat *hat,
493 	struct seg *seg,
494 	caddr_t addr,
495 	size_t len,
496 	enum seg_rw rw,
497 	struct smap *smp)
498 {
499 	page_t *pp;
500 	caddr_t adr;
501 	u_offset_t off;
502 	struct vnode *vp;
503 	kmutex_t *smtx;
504 
505 	ASSERT(smp->sm_refcnt > 0);
506 
507 #ifdef lint
508 	seg = seg;
509 #endif
510 
511 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
512 
513 		/*
514 		 * We're called only from segmap_fault and this was a
515 		 * NOP in case of a kpm based smap, so dangerous things
516 		 * must have happened in the meantime. Pages are prefaulted
517 		 * and locked in segmap_getmapflt and they will not be
518 		 * unlocked until segmap_release.
519 		 */
520 		panic("segmap_unlock: called with kpm addr %p", (void *)addr);
521 		/*NOTREACHED*/
522 	}
523 
524 	vp = smp->sm_vp;
525 	off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
526 
527 	hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
528 	for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
529 		ushort_t bitmask;
530 
531 		/*
532 		 * Use page_find() instead of page_lookup() to
533 		 * find the page since we know that it has
534 		 * "shared" lock.
535 		 */
536 		pp = page_find(vp, off);
537 		if (pp == NULL) {
538 			panic("segmap_unlock: page not found");
539 			/*NOTREACHED*/
540 		}
541 
542 		if (rw == S_WRITE) {
543 			hat_setrefmod(pp);
544 		} else if (rw != S_OTHER) {
545 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
546 			"segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
547 			hat_setref(pp);
548 		}
549 
550 		/*
551 		 * Clear bitmap, if the bit corresponding to "off" is set,
552 		 * since the page and translation are being unlocked.
553 		 */
554 		bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
555 
556 		/*
557 		 * Large Files: Following assertion is to verify
558 		 * the correctness of the cast to (int) above.
559 		 */
560 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
561 		smtx = SMAPMTX(smp);
562 		mutex_enter(smtx);
563 		if (smp->sm_bitmap & bitmask) {
564 			smp->sm_bitmap &= ~bitmask;
565 		}
566 		mutex_exit(smtx);
567 
568 		page_unlock(pp);
569 	}
570 }
571 
572 #define	MAXPPB	(MAXBSIZE/4096)	/* assumes minimum page size of 4k */
573 
574 /*
575  * This routine is called via a machine specific fault handling
576  * routine.  It is also called by software routines wishing to
577  * lock or unlock a range of addresses.
578  *
579  * Note that this routine expects a page-aligned "addr".
580  */
581 faultcode_t
582 segmap_fault(
583 	struct hat *hat,
584 	struct seg *seg,
585 	caddr_t addr,
586 	size_t len,
587 	enum fault_type type,
588 	enum seg_rw rw)
589 {
590 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
591 	struct smap *smp;
592 	page_t *pp, **ppp;
593 	struct vnode *vp;
594 	u_offset_t off;
595 	page_t *pl[MAXPPB + 1];
596 	uint_t prot;
597 	u_offset_t addroff;
598 	caddr_t adr;
599 	int err;
600 	u_offset_t sm_off;
601 	int hat_flag;
602 
603 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
604 		int newpage;
605 		kmutex_t *smtx;
606 
607 		/*
608 		 * Pages are successfully prefaulted and locked in
609 		 * segmap_getmapflt and can't be unlocked until
610 		 * segmap_release. No hat mappings have to be locked
611 		 * and they also can't be unlocked as long as the
612 		 * caller owns an active kpm addr.
613 		 */
614 #ifndef DEBUG
615 		if (type != F_SOFTUNLOCK)
616 			return (0);
617 #endif
618 
619 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
620 			panic("segmap_fault: smap not found "
621 			    "for addr %p", (void *)addr);
622 			/*NOTREACHED*/
623 		}
624 
625 		smtx = SMAPMTX(smp);
626 #ifdef	DEBUG
627 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
628 		if (newpage) {
629 			cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
630 			    (void *)smp);
631 		}
632 
633 		if (type != F_SOFTUNLOCK) {
634 			mutex_exit(smtx);
635 			return (0);
636 		}
637 #endif
638 		mutex_exit(smtx);
639 		vp = smp->sm_vp;
640 		sm_off = smp->sm_off;
641 
642 		if (vp == NULL)
643 			return (FC_MAKE_ERR(EIO));
644 
645 		ASSERT(smp->sm_refcnt > 0);
646 
647 		addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
648 		if (addroff + len > MAXBSIZE)
649 			panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
650 			    (void *)(addr + len));
651 
652 		off = sm_off + addroff;
653 
654 		pp = page_find(vp, off);
655 
656 		if (pp == NULL)
657 			panic("segmap_fault: softunlock page not found");
658 
659 		/*
660 		 * Set ref bit also here in case of S_OTHER to avoid the
661 		 * overhead of supporting other cases than F_SOFTUNLOCK
662 		 * with segkpm. We can do this because the underlying
663 		 * pages are locked anyway.
664 		 */
665 		if (rw == S_WRITE) {
666 			hat_setrefmod(pp);
667 		} else {
668 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
669 			    "segmap_fault:pp %p vp %p offset %llx",
670 			    pp, vp, off);
671 			hat_setref(pp);
672 		}
673 
674 		return (0);
675 	}
676 
677 	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
678 	smp = GET_SMAP(seg, addr);
679 	vp = smp->sm_vp;
680 	sm_off = smp->sm_off;
681 
682 	if (vp == NULL)
683 		return (FC_MAKE_ERR(EIO));
684 
685 	ASSERT(smp->sm_refcnt > 0);
686 
687 	addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
688 	if (addroff + len > MAXBSIZE) {
689 		panic("segmap_fault: endaddr %p "
690 		    "exceeds MAXBSIZE chunk", (void *)(addr + len));
691 		/*NOTREACHED*/
692 	}
693 	off = sm_off + addroff;
694 
695 	/*
696 	 * First handle the easy stuff
697 	 */
698 	if (type == F_SOFTUNLOCK) {
699 		segmap_unlock(hat, seg, addr, len, rw, smp);
700 		return (0);
701 	}
702 
703 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
704 	    "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
705 	err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
706 	    seg, addr, rw, CRED(), NULL);
707 
708 	if (err)
709 		return (FC_MAKE_ERR(err));
710 
711 	prot &= smd->smd_prot;
712 
713 	/*
714 	 * Handle all pages returned in the pl[] array.
715 	 * This loop is coded on the assumption that if
716 	 * there was no error from the VOP_GETPAGE routine,
717 	 * that the page list returned will contain all the
718 	 * needed pages for the vp from [off..off + len].
719 	 */
720 	ppp = pl;
721 	while ((pp = *ppp++) != NULL) {
722 		u_offset_t poff;
723 		ASSERT(pp->p_vnode == vp);
724 		hat_flag = HAT_LOAD;
725 
726 		/*
727 		 * Verify that the pages returned are within the range
728 		 * of this segmap region.  Note that it is theoretically
729 		 * possible for pages outside this range to be returned,
730 		 * but it is not very likely.  If we cannot use the
731 		 * page here, just release it and go on to the next one.
732 		 */
733 		if (pp->p_offset < sm_off ||
734 		    pp->p_offset >= sm_off + MAXBSIZE) {
735 			(void) page_release(pp, 1);
736 			continue;
737 		}
738 
739 		ASSERT(hat == kas.a_hat);
740 		poff = pp->p_offset;
741 		adr = addr + (poff - off);
742 		if (adr >= addr && adr < addr + len) {
743 			hat_setref(pp);
744 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
745 			    "segmap_fault:pp %p vp %p offset %llx",
746 			    pp, vp, poff);
747 			if (type == F_SOFTLOCK)
748 				hat_flag = HAT_LOAD_LOCK;
749 		}
750 
751 		/*
752 		 * Deal with VMODSORT pages here. If we know this is a write
753 		 * do the setmod now and allow write protection.
754 		 * As long as it's modified or not S_OTHER, remove write
755 		 * protection. With S_OTHER it's up to the FS to deal with this.
756 		 */
757 		if (IS_VMODSORT(vp)) {
758 			if (rw == S_WRITE)
759 				hat_setmod(pp);
760 			else if (rw != S_OTHER && !hat_ismod(pp))
761 				prot &= ~PROT_WRITE;
762 		}
763 
764 		hat_memload(hat, adr, pp, prot, hat_flag);
765 		if (hat_flag != HAT_LOAD_LOCK)
766 			page_unlock(pp);
767 	}
768 	return (0);
769 }
770 
771 /*
772  * This routine is used to start I/O on pages asynchronously.
773  */
774 static faultcode_t
775 segmap_faulta(struct seg *seg, caddr_t addr)
776 {
777 	struct smap *smp;
778 	struct vnode *vp;
779 	u_offset_t off;
780 	int err;
781 
782 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
783 		int	newpage;
784 		kmutex_t *smtx;
785 
786 		/*
787 		 * Pages are successfully prefaulted and locked in
788 		 * segmap_getmapflt and can't be unlocked until
789 		 * segmap_release. No hat mappings have to be locked
790 		 * and they also can't be unlocked as long as the
791 		 * caller owns an active kpm addr.
792 		 */
793 #ifdef	DEBUG
794 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
795 			panic("segmap_faulta: smap not found "
796 			    "for addr %p", (void *)addr);
797 			/*NOTREACHED*/
798 		}
799 
800 		smtx = SMAPMTX(smp);
801 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
802 		mutex_exit(smtx);
803 		if (newpage)
804 			cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
805 			    (void *)smp);
806 #endif
807 		return (0);
808 	}
809 
810 	segmapcnt.smp_faulta.value.ul++;
811 	smp = GET_SMAP(seg, addr);
812 
813 	ASSERT(smp->sm_refcnt > 0);
814 
815 	vp = smp->sm_vp;
816 	off = smp->sm_off;
817 
818 	if (vp == NULL) {
819 		cmn_err(CE_WARN, "segmap_faulta - no vp");
820 		return (FC_MAKE_ERR(EIO));
821 	}
822 
823 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
824 	    "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
825 
826 	err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
827 	    & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
828 	    seg, addr, S_READ, CRED(), NULL);
829 
830 	if (err)
831 		return (FC_MAKE_ERR(err));
832 	return (0);
833 }
834 
835 /*ARGSUSED*/
836 static int
837 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
838 {
839 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
840 
841 	ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
842 
843 	/*
844 	 * Need not acquire the segment lock since
845 	 * "smd_prot" is a read-only field.
846 	 */
847 	return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
848 }
849 
850 static int
851 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
852 {
853 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
854 	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
855 
856 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
857 
858 	if (pgno != 0) {
859 		do {
860 			protv[--pgno] = smd->smd_prot;
861 		} while (pgno != 0);
862 	}
863 	return (0);
864 }
865 
866 static u_offset_t
867 segmap_getoffset(struct seg *seg, caddr_t addr)
868 {
869 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
870 
871 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
872 
873 	return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
874 }
875 
876 /*ARGSUSED*/
877 static int
878 segmap_gettype(struct seg *seg, caddr_t addr)
879 {
880 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
881 
882 	return (MAP_SHARED);
883 }
884 
885 /*ARGSUSED*/
886 static int
887 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
888 {
889 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
890 
891 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
892 
893 	/* XXX - This doesn't make any sense */
894 	*vpp = smd->smd_sm->sm_vp;
895 	return (0);
896 }
897 
898 /*
899  * Check to see if it makes sense to do kluster/read ahead to
900  * addr + delta relative to the mapping at addr.  We assume here
901  * that delta is a signed PAGESIZE'd multiple (which can be negative).
902  *
903  * For segmap we always "approve" of this action from our standpoint.
904  */
905 /*ARGSUSED*/
906 static int
907 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
908 {
909 	return (0);
910 }
911 
/*
 * Target of every unsupported seg_ops slot (see SEGMAP_BADOP): calling
 * one of those operations on segmap is a fatal error.
 */
static void
segmap_badop(void)
{
	panic("segmap_badop");
	/*NOTREACHED*/
}
918 
919 /*
920  * Special private segmap operations
921  */
922 
923 /*
924  * Add smap to the appropriate free list.
925  */
926 static void
927 segmap_smapadd(struct smap *smp)
928 {
929 	struct smfree *sm;
930 	struct smap *smpfreelist;
931 	struct sm_freeq *releq;
932 
933 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
934 
935 	if (smp->sm_refcnt != 0) {
936 		panic("segmap_smapadd");
937 		/*NOTREACHED*/
938 	}
939 
940 	sm = &smd_free[smp->sm_free_ndx];
941 	/*
942 	 * Add to the tail of the release queue
943 	 * Note that sm_releq and sm_allocq could toggle
944 	 * before we get the lock. This does not affect
945 	 * correctness as the 2 queues are only maintained
946 	 * to reduce lock pressure.
947 	 */
948 	releq = sm->sm_releq;
949 	if (releq == &sm->sm_freeq[0])
950 		smp->sm_flags |= SM_QNDX_ZERO;
951 	else
952 		smp->sm_flags &= ~SM_QNDX_ZERO;
953 	mutex_enter(&releq->smq_mtx);
954 	smpfreelist = releq->smq_free;
955 	if (smpfreelist == 0) {
956 		int want;
957 
958 		releq->smq_free = smp->sm_next = smp->sm_prev = smp;
959 		/*
960 		 * Both queue mutexes held to set sm_want;
961 		 * snapshot the value before dropping releq mutex.
962 		 * If sm_want appears after the releq mutex is dropped,
963 		 * then the smap just freed is already gone.
964 		 */
965 		want = sm->sm_want;
966 		mutex_exit(&releq->smq_mtx);
967 		/*
968 		 * See if there was a waiter before dropping the releq mutex
969 		 * then recheck after obtaining sm_freeq[0] mutex as
970 		 * the another thread may have already signaled.
971 		 */
972 		if (want) {
973 			mutex_enter(&sm->sm_freeq[0].smq_mtx);
974 			if (sm->sm_want)
975 				cv_signal(&sm->sm_free_cv);
976 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
977 		}
978 	} else {
979 		smp->sm_next = smpfreelist;
980 		smp->sm_prev = smpfreelist->sm_prev;
981 		smpfreelist->sm_prev = smp;
982 		smp->sm_prev->sm_next = smp;
983 		mutex_exit(&releq->smq_mtx);
984 	}
985 }
986 
987 
988 static struct smap *
989 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
990 {
991 	struct smap **hpp;
992 	struct smap *tmp;
993 	kmutex_t *hmtx;
994 
995 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
996 	ASSERT(smp->sm_vp == NULL);
997 	ASSERT(smp->sm_hash == NULL);
998 	ASSERT(smp->sm_prev == NULL);
999 	ASSERT(smp->sm_next == NULL);
1000 	ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
1001 
1002 	hmtx = SHASHMTX(hashid);
1003 
1004 	mutex_enter(hmtx);
1005 	/*
1006 	 * First we need to verify that no one has created a smp
1007 	 * with (vp,off) as its tag before we us.
1008 	 */
1009 	for (tmp = smd_hash[hashid].sh_hash_list;
1010 	    tmp != NULL; tmp = tmp->sm_hash)
1011 		if (tmp->sm_vp == vp && tmp->sm_off == off)
1012 			break;
1013 
1014 	if (tmp == NULL) {
1015 		/*
1016 		 * No one created one yet.
1017 		 *
1018 		 * Funniness here - we don't increment the ref count on the
1019 		 * vnode * even though we have another pointer to it here.
1020 		 * The reason for this is that we don't want the fact that
1021 		 * a seg_map entry somewhere refers to a vnode to prevent the
1022 		 * vnode * itself from going away.  This is because this
1023 		 * reference to the vnode is a "soft one".  In the case where
1024 		 * a mapping is being used by a rdwr [or directory routine?]
1025 		 * there already has to be a non-zero ref count on the vnode.
1026 		 * In the case where the vp has been freed and the the smap
1027 		 * structure is on the free list, there are no pages in memory
1028 		 * that can refer to the vnode.  Thus even if we reuse the same
1029 		 * vnode/smap structure for a vnode which has the same
1030 		 * address but represents a different object, we are ok.
1031 		 */
1032 		smp->sm_vp = vp;
1033 		smp->sm_off = off;
1034 
1035 		hpp = &smd_hash[hashid].sh_hash_list;
1036 		smp->sm_hash = *hpp;
1037 		*hpp = smp;
1038 #ifdef SEGMAP_HASHSTATS
1039 		smd_hash_len[hashid]++;
1040 #endif
1041 	}
1042 	mutex_exit(hmtx);
1043 
1044 	return (tmp);
1045 }
1046 
1047 static void
1048 segmap_hashout(struct smap *smp)
1049 {
1050 	struct smap **hpp, *hp;
1051 	struct vnode *vp;
1052 	kmutex_t *mtx;
1053 	int hashid;
1054 	u_offset_t off;
1055 
1056 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1057 
1058 	vp = smp->sm_vp;
1059 	off = smp->sm_off;
1060 
1061 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
1062 	mtx = SHASHMTX(hashid);
1063 	mutex_enter(mtx);
1064 
1065 	hpp = &smd_hash[hashid].sh_hash_list;
1066 	for (;;) {
1067 		hp = *hpp;
1068 		if (hp == NULL) {
1069 			panic("segmap_hashout");
1070 			/*NOTREACHED*/
1071 		}
1072 		if (hp == smp)
1073 			break;
1074 		hpp = &hp->sm_hash;
1075 	}
1076 
1077 	*hpp = smp->sm_hash;
1078 	smp->sm_hash = NULL;
1079 #ifdef SEGMAP_HASHSTATS
1080 	smd_hash_len[hashid]--;
1081 #endif
1082 	mutex_exit(mtx);
1083 
1084 	smp->sm_vp = NULL;
1085 	smp->sm_off = (u_offset_t)0;
1086 
1087 }
1088 
1089 /*
1090  * Attempt to free unmodified, unmapped, and non locked segmap
1091  * pages.
1092  */
1093 void
1094 segmap_pagefree(struct vnode *vp, u_offset_t off)
1095 {
1096 	u_offset_t pgoff;
1097 	page_t  *pp;
1098 
1099 	for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1100 
1101 		if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1102 			continue;
1103 
1104 		switch (page_release(pp, 1)) {
1105 		case PGREL_NOTREL:
1106 			segmapcnt.smp_free_notfree.value.ul++;
1107 			break;
1108 		case PGREL_MOD:
1109 			segmapcnt.smp_free_dirty.value.ul++;
1110 			break;
1111 		case PGREL_CLEAN:
1112 			segmapcnt.smp_free.value.ul++;
1113 			break;
1114 		}
1115 	}
1116 }
1117 
1118 /*
1119  * Locks held on entry: smap lock
1120  * Locks held on exit : smap lock.
1121  */
1122 
1123 static void
1124 grab_smp(struct smap *smp, page_t *pp)
1125 {
1126 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1127 	ASSERT(smp->sm_refcnt == 0);
1128 
1129 	if (smp->sm_vp != (struct vnode *)NULL) {
1130 		struct vnode	*vp = smp->sm_vp;
1131 		u_offset_t 	off = smp->sm_off;
1132 		/*
1133 		 * Destroy old vnode association and
1134 		 * unload any hardware translations to
1135 		 * the old object.
1136 		 */
1137 		smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1138 		segmap_hashout(smp);
1139 
1140 		/*
1141 		 * This node is off freelist and hashlist,
1142 		 * so there is no reason to drop/reacquire sm_mtx
1143 		 * across calls to hat_unload.
1144 		 */
1145 		if (segmap_kpm) {
1146 			caddr_t vaddr;
1147 			int hat_unload_needed = 0;
1148 
1149 			/*
1150 			 * unload kpm mapping
1151 			 */
1152 			if (pp != NULL) {
1153 				vaddr = hat_kpm_page2va(pp, 1);
1154 				hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1155 				page_unlock(pp);
1156 			}
1157 
1158 			/*
1159 			 * Check if we have (also) the rare case of a
1160 			 * non kpm mapping.
1161 			 */
1162 			if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1163 				hat_unload_needed = 1;
1164 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1165 			}
1166 
1167 			if (hat_unload_needed) {
1168 				hat_unload(kas.a_hat, segkmap->s_base +
1169 				    ((smp - smd_smap) * MAXBSIZE),
1170 				    MAXBSIZE, HAT_UNLOAD);
1171 			}
1172 
1173 		} else {
1174 			ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1175 			smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1176 			hat_unload(kas.a_hat, segkmap->s_base +
1177 			    ((smp - smd_smap) * MAXBSIZE),
1178 			    MAXBSIZE, HAT_UNLOAD);
1179 		}
1180 		segmap_pagefree(vp, off);
1181 	}
1182 }
1183 
/*
 * Take an smap slot off a free list and return it with its smap mutex held.
 *
 * free_ndx selects the smd_free[] entry to start with.  Each entry has an
 * alloc queue and a release queue: when the alloc queue is empty (or every
 * slot on it had to be skipped) and the release queue is not, the two are
 * swapped and the search restarts.  When both are empty the next entry of
 * the same color ((free_ndx + smd_ncolor) & smd_freemsk) is tried, and once
 * all of them have been tried the thread waits on sm_free_cv.
 *
 * The selected slot is unlinked from its freelist and passed to grab_smp(),
 * which removes any previous (vp, off) identity.  The only return statement
 * returns a non-NULL slot; the function loops until it has one.
 */
1184 static struct smap *
1185 get_free_smp(int free_ndx)
1186 {
1187 	struct smfree *sm;
1188 	kmutex_t *smtx;
1189 	struct smap *smp, *first;
1190 	struct sm_freeq *allocq, *releq;
1191 	struct kpme *kpme;
1192 	page_t *pp = NULL;
1193 	int end_ndx, page_locked = 0;
1194 
1195 	end_ndx = free_ndx;
1196 	sm = &smd_free[free_ndx];
1197 
1198 retry_queue:
1199 	allocq = sm->sm_allocq;
1200 	mutex_enter(&allocq->smq_mtx);
1201 
1202 	if ((smp = allocq->smq_free) == NULL) {
1203 
1204 skip_queue:
1205 		/*
1206 		 * The alloc list is empty or this queue is being skipped;
1207 		 * first see if the allocq toggled.
1208 		 */
1209 		if (sm->sm_allocq != allocq) {
1210 			/* queue changed */
1211 			mutex_exit(&allocq->smq_mtx);
1212 			goto retry_queue;
1213 		}
1214 		releq = sm->sm_releq;
1215 		if (!mutex_tryenter(&releq->smq_mtx)) {
1216 			/* cannot get releq; a free smp may be there now */
1217 			mutex_exit(&allocq->smq_mtx);
1218 
1219 			/*
1220 			 * This loop could spin forever if this thread has
1221 			 * higher priority than the thread that is holding
1222 			 * releq->smq_mtx. In order to force the other thread
1223 			 * to run, we'll lock/unlock the mutex which is safe
1224 			 * since we just unlocked the allocq mutex.
1225 			 */
1226 			mutex_enter(&releq->smq_mtx);
1227 			mutex_exit(&releq->smq_mtx);
1228 			goto retry_queue;
1229 		}
1230 		if (releq->smq_free == NULL) {
1231 			/*
1232 			 * This freelist is empty.
1233 			 * This should not happen unless clients
1234 			 * are failing to release the segmap
1235 			 * window after accessing the data.
1236 			 * Before resorting to sleeping, try
1237 			 * the next list of the same color.
1238 			 */
1239 			free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1240 			if (free_ndx != end_ndx) {
1241 				mutex_exit(&releq->smq_mtx);
1242 				mutex_exit(&allocq->smq_mtx);
1243 				sm = &smd_free[free_ndx];
1244 				goto retry_queue;
1245 			}
1246 			/*
1247 			 * Tried all freelists of the same color once,
1248 			 * wait on this list and hope something gets freed.
1249 			 */
1250 			segmapcnt.smp_get_nofree.value.ul++;
1251 			sm->sm_want++;
			/*
			 * NOTE(review): allocq and releq are presumably the
			 * two sm_freeq[] entries of sm, so both mutexes are
			 * held at this point.  Drop [1] and let cv_wait()
			 * sleep on [0]'s mutex -- confirm against the
			 * struct smfree definition.
			 */
1252 			mutex_exit(&sm->sm_freeq[1].smq_mtx);
1253 			cv_wait(&sm->sm_free_cv,
1254 			    &sm->sm_freeq[0].smq_mtx);
1255 			sm->sm_want--;
1256 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
1257 			sm = &smd_free[free_ndx];
1258 			goto retry_queue;
1259 		} else {
1260 			/*
1261 			 * Something on the rele queue; flip the alloc
1262 			 * and rele queues and retry.
1263 			 */
1264 			sm->sm_allocq = releq;
1265 			sm->sm_releq = allocq;
1266 			mutex_exit(&allocq->smq_mtx);
1267 			mutex_exit(&releq->smq_mtx);
			/*
			 * page_locked is set below when a slot was passed
			 * over because its kpm page could not be
			 * share-locked; back off with delay(hz >> 2)
			 * before scanning again.
			 */
1268 			if (page_locked) {
1269 				delay(hz >> 2);
1270 				page_locked = 0;
1271 			}
1272 			goto retry_queue;
1273 		}
1274 	} else {
1275 		/*
1276 		 * Fastpath the case we get the smap mutex
1277 		 * on the first try.
1278 		 */
1279 		first = smp;
1280 next_smap:
1281 		smtx = SMAPMTX(smp);
1282 		if (!mutex_tryenter(smtx)) {
1283 			/*
1284 			 * Another thread is trying to reclaim this slot.
1285 			 * Skip to the next queue or smap.
1286 			 */
1287 			if ((smp = smp->sm_next) == first) {
1288 				goto skip_queue;
1289 			} else {
1290 				goto next_smap;
1291 			}
1292 		} else {
1293 			/*
1294 			 * if kpme exists, get shared lock on the page
1295 			 */
1296 			if (segmap_kpm && smp->sm_vp != NULL) {
1297 
1298 				kpme = GET_KPME(smp);
1299 				pp = kpme->kpe_page;
1300 
1301 				if (pp != NULL) {
1302 					if (!page_trylock(pp, SE_SHARED)) {
1303 						smp = smp->sm_next;
1304 						mutex_exit(smtx);
1305 						page_locked = 1;
1306 
1307 						pp = NULL;
1308 
1309 						if (smp == first) {
1310 							goto skip_queue;
1311 						} else {
1312 							goto next_smap;
1313 						}
1314 					} else {
						/*
						 * Recheck with the page lock
						 * held: if the kpme no longer
						 * points at a page, drop the
						 * lock and hand grab_smp()
						 * no page.
						 */
1315 						if (kpme->kpe_page == NULL) {
1316 							page_unlock(pp);
1317 							pp = NULL;
1318 						}
1319 					}
1320 				}
1321 			}
1322 
1323 			/*
1324 			 * At this point, we've selected smp.  Remove smp
1325 			 * from its freelist.  If smp is the first one in
1326 			 * the freelist, update the head of the freelist.
1327 			 */
1328 			if (first == smp) {
1329 				ASSERT(first == allocq->smq_free);
1330 				allocq->smq_free = smp->sm_next;
1331 			}
1332 
1333 			/*
1334 			 * if the head of the freelist still points to smp,
1335 			 * then there are no more free smaps in that list.
1336 			 */
1337 			if (allocq->smq_free == smp)
1338 				/*
1339 				 * Took the last one
1340 				 */
1341 				allocq->smq_free = NULL;
1342 			else {
1343 				smp->sm_prev->sm_next = smp->sm_next;
1344 				smp->sm_next->sm_prev = smp->sm_prev;
1345 			}
1346 			mutex_exit(&allocq->smq_mtx);
1347 			smp->sm_prev = smp->sm_next = NULL;
1348 
1349 			/*
1350 			 * if pp != NULL, pp must have been locked;
1351 			 * grab_smp() unlocks pp.
1352 			 */
1353 			ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1354 			grab_smp(smp, pp);
1355 			/* return smp locked. */
1356 			ASSERT(SMAPMTX(smp) == smtx);
1357 			ASSERT(MUTEX_HELD(smtx));
1358 			return (smp);
1359 		}
1360 	}
1361 }
1362 
1363 /*
1364  * Special public segmap operations
1365  */
1366 
1367 /*
1368  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1369  * If softlock is TRUE, then set things up so that it looks like a call
1370  * to segmap_fault with F_SOFTLOCK.
1371  *
1372  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1373  *
1374  * All fields in the generic segment (struct seg) are considered to be
1375  * read-only for "segmap" even though the kernel address space (kas) may
1376  * not be locked, hence no lock is needed to access them.
1377  */
1378 int
1379 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1380 {
1381 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1382 	page_t *pp;
1383 	u_offset_t off;
1384 	struct smap *smp;
1385 	struct vnode *vp;
1386 	caddr_t eaddr;
1387 	int newpage = 0;
1388 	uint_t prot;
1389 	kmutex_t *smtx;
1390 	int hat_flag;
1391 
1392 	ASSERT(seg->s_as == &kas);
1393 
1394 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1395 		/*
1396 		 * Pages are successfully prefaulted and locked in
1397 		 * segmap_getmapflt and can't be unlocked until
1398 		 * segmap_release. The SM_KPM_NEWPAGE flag is set
1399 		 * in segmap_pagecreate_kpm when new pages are created.
1400 		 * and it is returned as "newpage" indication here.
1401 		 */
1402 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1403 			panic("segmap_pagecreate: smap not found "
1404 			    "for addr %p", (void *)addr);
1405 			/*NOTREACHED*/
1406 		}
1407 
1408 		smtx = SMAPMTX(smp);
1409 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1410 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
1411 		mutex_exit(smtx);
1412 
1413 		return (newpage);
1414 	}
1415 
1416 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1417 
1418 	eaddr = addr + len;
1419 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1420 
1421 	smp = GET_SMAP(seg, addr);
1422 
1423 	/*
1424 	 * We don't grab smp mutex here since we assume the smp
1425 	 * has a refcnt set already which prevents the slot from
1426 	 * changing its id.
1427 	 */
1428 	ASSERT(smp->sm_refcnt > 0);
1429 
1430 	vp = smp->sm_vp;
1431 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1432 	prot = smd->smd_prot;
1433 
1434 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1435 		hat_flag = HAT_LOAD;
1436 		pp = page_lookup(vp, off, SE_SHARED);
1437 		if (pp == NULL) {
1438 			ushort_t bitindex;
1439 
1440 			if ((pp = page_create_va(vp, off,
1441 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1442 				panic("segmap_pagecreate: page_create failed");
1443 				/*NOTREACHED*/
1444 			}
1445 			newpage = 1;
1446 			page_io_unlock(pp);
1447 
1448 			/*
1449 			 * Since pages created here do not contain valid
1450 			 * data until the caller writes into them, the
1451 			 * "exclusive" lock will not be dropped to prevent
1452 			 * other users from accessing the page.  We also
1453 			 * have to lock the translation to prevent a fault
1454 			 * from occurring when the virtual address mapped by
1455 			 * this page is written into.  This is necessary to
1456 			 * avoid a deadlock since we haven't dropped the
1457 			 * "exclusive" lock.
1458 			 */
1459 			bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1460 
1461 			/*
1462 			 * Large Files: The following assertion is to
1463 			 * verify the cast above.
1464 			 */
1465 			ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1466 			smtx = SMAPMTX(smp);
1467 			mutex_enter(smtx);
1468 			smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1469 			mutex_exit(smtx);
1470 
1471 			hat_flag = HAT_LOAD_LOCK;
1472 		} else if (softlock) {
1473 			hat_flag = HAT_LOAD_LOCK;
1474 		}
1475 
1476 		if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1477 			hat_setmod(pp);
1478 
1479 		hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1480 
1481 		if (hat_flag != HAT_LOAD_LOCK)
1482 			page_unlock(pp);
1483 
1484 		TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1485 		    "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1486 		    seg, addr, pp, vp, off);
1487 	}
1488 
1489 	return (newpage);
1490 }
1491 
1492 void
1493 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1494 {
1495 	struct smap	*smp;
1496 	ushort_t	bitmask;
1497 	page_t		*pp;
1498 	struct	vnode	*vp;
1499 	u_offset_t	off;
1500 	caddr_t		eaddr;
1501 	kmutex_t	*smtx;
1502 
1503 	ASSERT(seg->s_as == &kas);
1504 
1505 	eaddr = addr + len;
1506 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1507 
1508 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1509 		/*
1510 		 * Pages are successfully prefaulted and locked in
1511 		 * segmap_getmapflt and can't be unlocked until
1512 		 * segmap_release, so no pages or hat mappings have
1513 		 * to be unlocked at this point.
1514 		 */
1515 #ifdef DEBUG
1516 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1517 			panic("segmap_pageunlock: smap not found "
1518 			    "for addr %p", (void *)addr);
1519 			/*NOTREACHED*/
1520 		}
1521 
1522 		ASSERT(smp->sm_refcnt > 0);
1523 		mutex_exit(SMAPMTX(smp));
1524 #endif
1525 		return;
1526 	}
1527 
1528 	smp = GET_SMAP(seg, addr);
1529 	smtx = SMAPMTX(smp);
1530 
1531 	ASSERT(smp->sm_refcnt > 0);
1532 
1533 	vp = smp->sm_vp;
1534 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1535 
1536 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1537 		bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1538 
1539 		/*
1540 		 * Large Files: Following assertion is to verify
1541 		 * the correctness of the cast to (int) above.
1542 		 */
1543 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1544 
1545 		/*
1546 		 * If the bit corresponding to "off" is set,
1547 		 * clear this bit in the bitmap, unlock translations,
1548 		 * and release the "exclusive" lock on the page.
1549 		 */
1550 		if (smp->sm_bitmap & bitmask) {
1551 			mutex_enter(smtx);
1552 			smp->sm_bitmap &= ~bitmask;
1553 			mutex_exit(smtx);
1554 
1555 			hat_unlock(kas.a_hat, addr, PAGESIZE);
1556 
1557 			/*
1558 			 * Use page_find() instead of page_lookup() to
1559 			 * find the page since we know that it has
1560 			 * "exclusive" lock.
1561 			 */
1562 			pp = page_find(vp, off);
1563 			if (pp == NULL) {
1564 				panic("segmap_pageunlock: page not found");
1565 				/*NOTREACHED*/
1566 			}
1567 			if (rw == S_WRITE) {
1568 				hat_setrefmod(pp);
1569 			} else if (rw != S_OTHER) {
1570 				hat_setref(pp);
1571 			}
1572 
1573 			page_unlock(pp);
1574 		}
1575 	}
1576 }
1577 
1578 caddr_t
1579 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1580 {
1581 	return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1582 }
1583 
1584 /*
1585  * This is the magic virtual address that offset 0 of an ELF
1586  * file gets mapped to in user space. This is used to pick
1587  * the vac color on the freelist.
 *
 * NOTE(review): no use of this macro is visible in the nearby code;
 * segmap_getmapflt() derives the color from the file offset alone.
 * Confirm whether the macro is still referenced elsewhere.
1588  */
1589 #define	ELF_OFFZERO_VA	(0x10000)
1590 /*
1591  * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp
1592  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
1593  * The return address is  always MAXBSIZE aligned.
1594  *
1595  * If forcefault is nonzero and the MMU translations haven't yet been created,
1596  * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
 *
 * seg must be segkmap (asserted).  The range <off, off + len) must not
 * extend past the MAXBSIZE window containing off, otherwise the function
 * panics.  For a VBLK vnode the vnode returned by common_specvp() is
 * used as the mapping tag.
 *
 * An existing slot tagged (vp, baseoff) is reused and its refcnt bumped;
 * otherwise a slot is taken from get_free_smp() and hashed in.  The
 * returned address is either a kpm address for the page (kpm path) or
 * the slot's address inside seg (use_segmap_range path).  There is no
 * error return.
1597  */
1598 caddr_t
1599 segmap_getmapflt(
1600 	struct seg *seg,
1601 	struct vnode *vp,
1602 	u_offset_t off,
1603 	size_t len,
1604 	int forcefault,
1605 	enum seg_rw rw)
1606 {
1607 	struct smap *smp, *nsmp;
1608 	extern struct vnode *common_specvp();
1609 	caddr_t baseaddr;			/* MAXBSIZE aligned */
1610 	u_offset_t baseoff;
1611 	int newslot;
1612 	caddr_t vaddr;
1613 	int color, hashid;
1614 	kmutex_t *hashmtx, *smapmtx;
1615 	struct smfree *sm;
1616 	page_t	*pp;
1617 	struct kpme *kpme;
1618 	uint_t	prot;
1619 	caddr_t base;
1620 	page_t	*pl[MAXPPB + 1];
1621 	int	error;
1622 	int	is_kpm = 1;
1623 
1624 	ASSERT(seg->s_as == &kas);
1625 	ASSERT(seg == segkmap);
1626 
1627 	baseoff = off & (offset_t)MAXBMASK;
1628 	if (off + len > baseoff + MAXBSIZE) {
1629 		panic("segmap_getmap bad len");
1630 		/*NOTREACHED*/
1631 	}
1632 
1633 	/*
1634 	 * If this is a block device we have to be sure to use the
1635 	 * "common" block device vnode for the mapping.
1636 	 */
1637 	if (vp->v_type == VBLK)
1638 		vp = common_specvp(vp);
1639 
1640 	smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1641 
	/*
	 * The kpm path is not used when kpm is disabled, nor for
	 * SM_PAGECREATE requests that are not writes.
	 */
1642 	if (segmap_kpm == 0 ||
1643 	    (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1644 		is_kpm = 0;
1645 	}
1646 
1647 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
1648 	hashmtx = SHASHMTX(hashid);
1649 
1650 retry_hash:
1651 	mutex_enter(hashmtx);
1652 	for (smp = smd_hash[hashid].sh_hash_list;
1653 	    smp != NULL; smp = smp->sm_hash)
1654 		if (smp->sm_vp == vp && smp->sm_off == baseoff)
1655 			break;
1656 	mutex_exit(hashmtx);
1657 
1658 vrfy_smp:
1659 	if (smp != NULL) {
1660 
1661 		ASSERT(vp->v_count != 0);
1662 
1663 		/*
1664 		 * Get smap lock and recheck its tag. The hash lock
1665 		 * is dropped since the hash is based on (vp, off)
1666 		 * and (vp, off) won't change when we have smap mtx.
1667 		 */
1668 		smapmtx = SMAPMTX(smp);
1669 		mutex_enter(smapmtx);
1670 		if (smp->sm_vp != vp || smp->sm_off != baseoff) {
			/*
			 * The slot's tag changed before we got its
			 * mutex; start over with a new hash lookup.
			 */
1671 			mutex_exit(smapmtx);
1672 			goto retry_hash;
1673 		}
1674 
1675 		if (smp->sm_refcnt == 0) {
1676 
1677 			smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1678 
1679 			/*
1680 			 * Could still be on the free list. However, this
1681 			 * could also be an smp that is transitioning from
1682 			 * the free list when we have too much contention
1683 			 * for the smapmtx's. In this case, we have an
1684 			 * unlocked smp that is not on the free list any
1685 			 * longer, but still has a 0 refcnt.  The only way
1686 			 * to be sure is to check the freelist pointers.
1687 			 * Since we now have the smapmtx, we are guaranteed
1688 			 * that the (vp, off) won't change, so we are safe
1689 			 * to reclaim it.  get_free_smp() knows that this
1690 			 * can happen, and it will check the refcnt.
1691 			 */
1692 
1693 			if ((smp->sm_next != NULL)) {
1694 				struct sm_freeq *freeq;
1695 
1696 				ASSERT(smp->sm_prev != NULL);
1697 				sm = &smd_free[smp->sm_free_ndx];
1698 
1699 				if (smp->sm_flags & SM_QNDX_ZERO)
1700 					freeq = &sm->sm_freeq[0];
1701 				else
1702 					freeq = &sm->sm_freeq[1];
1703 
1704 				mutex_enter(&freeq->smq_mtx);
1705 				if (freeq->smq_free != smp) {
1706 					/*
1707 					 * fastpath normal case
1708 					 */
1709 					smp->sm_prev->sm_next = smp->sm_next;
1710 					smp->sm_next->sm_prev = smp->sm_prev;
1711 				} else if (smp == smp->sm_next) {
1712 					/*
1713 					 * Taking the last smap on freelist
1714 					 */
1715 					freeq->smq_free = NULL;
1716 				} else {
1717 					/*
1718 					 * Reclaiming 1st smap on list
1719 					 */
1720 					freeq->smq_free = smp->sm_next;
1721 					smp->sm_prev->sm_next = smp->sm_next;
1722 					smp->sm_next->sm_prev = smp->sm_prev;
1723 				}
1724 				mutex_exit(&freeq->smq_mtx);
1725 				smp->sm_prev = smp->sm_next = NULL;
1726 			} else {
1727 				ASSERT(smp->sm_prev == NULL);
1728 				segmapcnt.smp_stolen.value.ul++;
1729 			}
1730 
1731 		} else {
1732 			segmapcnt.smp_get_use.value.ul++;
1733 		}
1734 		smp->sm_refcnt++;		/* another user */
1735 
1736 		/*
1737 		 * We don't invoke segmap_fault via TLB miss, so we set ref
1738 		 * and mod bits in advance. For S_OTHER  we set them in
1739 		 * segmap_fault F_SOFTUNLOCK.
1740 		 */
1741 		if (is_kpm) {
1742 			if (rw == S_WRITE) {
1743 				smp->sm_flags |= SM_WRITE_DATA;
1744 			} else if (rw == S_READ) {
1745 				smp->sm_flags |= SM_READ_DATA;
1746 			}
1747 		}
1748 		mutex_exit(smapmtx);
1749 
1750 		newslot = 0;
1751 	} else {
1752 
1753 		uint32_t free_ndx, *free_ndxp;
1754 		union segmap_cpu *scpu;
1755 
1756 		/*
1757 		 * On a PAC machine or a machine with anti-alias
1758 		 * hardware, smd_colormsk will be zero.
1759 		 *
1760 		 * On a VAC machine- pick color by offset in the file
1761 		 * so we won't get VAC conflicts on elf files.
1762 		 * On data files, color does not matter but we
1763 		 * don't know what kind of file it is so we always
1764 		 * pick color by offset. This causes color
1765 		 * corresponding to file offset zero to be used more
1766 		 * heavily.
1767 		 */
1768 		color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1769 		scpu = smd_cpu+CPU->cpu_seqid;
1770 		free_ndxp = &scpu->scpu.scpu_free_ndx[color];
		/*
		 * Advance this CPU's per-color index by smd_ncolor so
		 * that successive requests start on different free lists.
		 */
1771 		free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1772 #ifdef DEBUG
1773 		colors_used[free_ndx]++;
1774 #endif /* DEBUG */
1775 
1776 		/*
1777 		 * Get a locked smp slot from the free list.
1778 		 */
1779 		smp = get_free_smp(free_ndx);
1780 		smapmtx = SMAPMTX(smp);
1781 
1782 		ASSERT(smp->sm_vp == NULL);
1783 
1784 		if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1785 			/*
1786 			 * Failed to hashin, there exists one now.
1787 			 * Return the smp we just allocated.
1788 			 */
1789 			segmap_smapadd(smp);
1790 			mutex_exit(smapmtx);
1791 
1792 			smp = nsmp;
1793 			goto vrfy_smp;
1794 		}
1795 		smp->sm_refcnt++;		/* another user */
1796 
1797 		/*
1798 		 * We don't invoke segmap_fault via TLB miss, so we set ref
1799 		 * and mod bits in advance. For S_OTHER  we set them in
1800 		 * segmap_fault F_SOFTUNLOCK.
1801 		 */
1802 		if (is_kpm) {
1803 			if (rw == S_WRITE) {
1804 				smp->sm_flags |= SM_WRITE_DATA;
1805 			} else if (rw == S_READ) {
1806 				smp->sm_flags |= SM_READ_DATA;
1807 			}
1808 		}
1809 		mutex_exit(smapmtx);
1810 
1811 		newslot = 1;
1812 	}
1813 
1814 	if (!is_kpm)
1815 		goto use_segmap_range;
1816 
1817 	/*
1818 	 * Use segkpm
1819 	 */
1820 	/* Lint directive required until 6746211 is fixed */
1821 	/*CONSTCOND*/
1822 	ASSERT(PAGESIZE == MAXBSIZE);
1823 
1824 	/*
1825 	 * remember the last smp faulted on this cpu.
1826 	 */
1827 	(smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1828 
1829 	if (forcefault == SM_PAGECREATE) {
1830 		baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1831 		return (baseaddr);
1832 	}
1833 
1834 	if (newslot == 0 &&
1835 	    (pp = GET_KPME(smp)->kpe_page) != NULL) {
1836 
1837 		/* fastpath */
1838 		switch (rw) {
1839 		case S_READ:
1840 		case S_WRITE:
			/*
			 * kpe_page was read without a page lock, so after
			 * locking verify that the page is not free and
			 * still belongs to (vp, baseoff); otherwise fall
			 * back to a regular page_lookup().
			 */
1841 			if (page_trylock(pp, SE_SHARED)) {
1842 				if (PP_ISFREE(pp) ||
1843 				    !(pp->p_vnode == vp &&
1844 				    pp->p_offset == baseoff)) {
1845 					page_unlock(pp);
1846 					pp = page_lookup(vp, baseoff,
1847 					    SE_SHARED);
1848 				}
1849 			} else {
1850 				pp = page_lookup(vp, baseoff, SE_SHARED);
1851 			}
1852 
1853 			if (pp == NULL) {
1854 				ASSERT(GET_KPME(smp)->kpe_page == NULL);
1855 				break;
1856 			}
1857 
			/*
			 * For a write, take the fast path only when the
			 * page already has both P_MOD and P_REF set;
			 * otherwise go through VOP_GETPAGE below.
			 */
1858 			if (rw == S_WRITE &&
1859 			    hat_page_getattr(pp, P_MOD | P_REF) !=
1860 			    (P_MOD | P_REF)) {
1861 				page_unlock(pp);
1862 				break;
1863 			}
1864 
1865 			/*
1866 			 * We have the p_selock as reader, grab_smp
1867 			 * can't hit us, we have bumped the smap
1868 			 * refcnt and hat_pageunload needs the
1869 			 * p_selock exclusive.
1870 			 */
1871 			kpme = GET_KPME(smp);
1872 			if (kpme->kpe_page == pp) {
1873 				baseaddr = hat_kpm_page2va(pp, 0);
1874 			} else if (kpme->kpe_page == NULL) {
1875 				baseaddr = hat_kpm_mapin(pp, kpme);
1876 			} else {
1877 				panic("segmap_getmapflt: stale "
1878 				    "kpme page, kpme %p", (void *)kpme);
1879 				/*NOTREACHED*/
1880 			}
1881 
1882 			/*
1883 			 * We don't invoke segmap_fault via TLB miss,
1884 			 * so we set ref and mod bits in advance.
1885 			 * For S_OTHER we set them in segmap_fault
1886 			 * F_SOFTUNLOCK.
1887 			 */
1888 			if (rw == S_READ && !hat_isref(pp))
1889 				hat_setref(pp);
1890 
1891 			return (baseaddr);
1892 		default:
1893 			break;
1894 		}
1895 	}
1896 
1897 	base = segkpm_create_va(baseoff);
1898 	error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1899 	    seg, base, rw, CRED(), NULL);
1900 
	/*
	 * NOTE(review): pl[0] is read before the error check and pl[] is
	 * not initialized in this function; this relies on VOP_GETPAGE
	 * having stored pl[0] or on the value being ignored when error
	 * is set -- confirm.
	 */
1901 	pp = pl[0];
1902 	if (error || pp == NULL) {
1903 		/*
1904 		 * Use segmap address slot and let segmap_fault deal
1905 		 * with the error cases. There is no error return
1906 		 * possible here.
1907 		 */
1908 		goto use_segmap_range;
1909 	}
1910 
1911 	ASSERT(pl[1] == NULL);
1912 
1913 	/*
1914 	 * When prot is not returned w/ PROT_ALL the returned pages
1915 	 * are not backed by fs blocks. For most of the segmap users
1916 	 * this is no problem, they don't write to the pages in the
1917 	 * same request and therefore don't rely on a following
1918 	 * trap driven segmap_fault. With SM_LOCKPROTO users it
1919 	 * is more secure to use segkmap addresses to allow
1920 	 * protection segmap_fault's.
1921 	 */
1922 	if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1923 		/*
1924 		 * Use segmap address slot and let segmap_fault
1925 		 * do the error return.
1926 		 */
1927 		ASSERT(rw != S_WRITE);
1928 		ASSERT(PAGE_LOCKED(pp));
1929 		page_unlock(pp);
1930 		forcefault = 0;
1931 		goto use_segmap_range;
1932 	}
1933 
1934 	/*
1935 	 * We have the p_selock as reader, grab_smp can't hit us, we
1936 	 * have bumped the smap refcnt and hat_pageunload needs the
1937 	 * p_selock exclusive.
1938 	 */
1939 	kpme = GET_KPME(smp);
1940 	if (kpme->kpe_page == pp) {
1941 		baseaddr = hat_kpm_page2va(pp, 0);
1942 	} else if (kpme->kpe_page == NULL) {
1943 		baseaddr = hat_kpm_mapin(pp, kpme);
1944 	} else {
1945 		panic("segmap_getmapflt: stale kpme page after "
1946 		    "VOP_GETPAGE, kpme %p", (void *)kpme);
1947 		/*NOTREACHED*/
1948 	}
1949 
1950 	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1951 
1952 	return (baseaddr);
1953 
1954 
1955 use_segmap_range:
	/* Address of this slot's MAXBSIZE window within the segment. */
1956 	baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1957 	TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1958 	    "segmap_getmap:seg %p addr %p vp %p offset %llx",
1959 	    seg, baseaddr, vp, baseoff);
1960 
1961 	/*
1962 	 * Prefault the translations
1963 	 */
1964 	vaddr = baseaddr + (off - baseoff);
1965 	if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1966 
1967 		caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1968 		    (uintptr_t)PAGEMASK);
1969 
		/*
		 * Fault in whole pages: start at the page containing
		 * vaddr and round the length up so that the range
		 * covers vaddr + len.  The return value is ignored.
		 */
1970 		(void) segmap_fault(kas.a_hat, seg, pgaddr,
1971 		    (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1972 		    F_INVAL, rw);
1973 	}
1974 
1975 	return (baseaddr);
1976 }
1977 
1978 int
1979 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1980 {
1981 	struct smap	*smp;
1982 	int 		error;
1983 	int		bflags = 0;
1984 	struct vnode	*vp;
1985 	u_offset_t	offset;
1986 	kmutex_t	*smtx;
1987 	int		is_kpm = 0;
1988 	page_t		*pp;
1989 
1990 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1991 
1992 		if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1993 			panic("segmap_release: addr %p not "
1994 			    "MAXBSIZE aligned", (void *)addr);
1995 			/*NOTREACHED*/
1996 		}
1997 
1998 		if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1999 			panic("segmap_release: smap not found "
2000 			    "for addr %p", (void *)addr);
2001 			/*NOTREACHED*/
2002 		}
2003 
2004 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2005 		    "segmap_relmap:seg %p addr %p smp %p",
2006 		    seg, addr, smp);
2007 
2008 		smtx = SMAPMTX(smp);
2009 
2010 		/*
2011 		 * For compatibility reasons segmap_pagecreate_kpm sets this
2012 		 * flag to allow a following segmap_pagecreate to return
2013 		 * this as "newpage" flag. When segmap_pagecreate is not
2014 		 * called at all we clear it now.
2015 		 */
2016 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
2017 		is_kpm = 1;
2018 		if (smp->sm_flags & SM_WRITE_DATA) {
2019 			hat_setrefmod(pp);
2020 		} else if (smp->sm_flags & SM_READ_DATA) {
2021 			hat_setref(pp);
2022 		}
2023 	} else {
2024 		if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2025 		    ((uintptr_t)addr & MAXBOFFSET) != 0) {
2026 			panic("segmap_release: bad addr %p", (void *)addr);
2027 			/*NOTREACHED*/
2028 		}
2029 		smp = GET_SMAP(seg, addr);
2030 
2031 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2032 		    "segmap_relmap:seg %p addr %p smp %p",
2033 		    seg, addr, smp);
2034 
2035 		smtx = SMAPMTX(smp);
2036 		mutex_enter(smtx);
2037 		smp->sm_flags |= SM_NOTKPM_RELEASED;
2038 	}
2039 
2040 	ASSERT(smp->sm_refcnt > 0);
2041 
2042 	/*
2043 	 * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2044 	 * are set.
2045 	 */
2046 	if ((flags & ~SM_DONTNEED) != 0) {
2047 		if (flags & SM_WRITE)
2048 			segmapcnt.smp_rel_write.value.ul++;
2049 		if (flags & SM_ASYNC) {
2050 			bflags |= B_ASYNC;
2051 			segmapcnt.smp_rel_async.value.ul++;
2052 		}
2053 		if (flags & SM_INVAL) {
2054 			bflags |= B_INVAL;
2055 			segmapcnt.smp_rel_abort.value.ul++;
2056 		}
2057 		if (flags & SM_DESTROY) {
2058 			bflags |= (B_INVAL|B_TRUNC);
2059 			segmapcnt.smp_rel_abort.value.ul++;
2060 		}
2061 		if (smp->sm_refcnt == 1) {
2062 			/*
2063 			 * We only bother doing the FREE and DONTNEED flags
2064 			 * if no one else is still referencing this mapping.
2065 			 */
2066 			if (flags & SM_FREE) {
2067 				bflags |= B_FREE;
2068 				segmapcnt.smp_rel_free.value.ul++;
2069 			}
2070 			if (flags & SM_DONTNEED) {
2071 				bflags |= B_DONTNEED;
2072 				segmapcnt.smp_rel_dontneed.value.ul++;
2073 			}
2074 		}
2075 	} else {
2076 		smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2077 	}
2078 
2079 	vp = smp->sm_vp;
2080 	offset = smp->sm_off;
2081 
2082 	if (--smp->sm_refcnt == 0) {
2083 
2084 		smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2085 
2086 		if (flags & (SM_INVAL|SM_DESTROY)) {
2087 			segmap_hashout(smp);	/* remove map info */
2088 			if (is_kpm) {
2089 				hat_kpm_mapout(pp, GET_KPME(smp), addr);
2090 				if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2091 					smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2092 					hat_unload(kas.a_hat, segkmap->s_base +
2093 					    ((smp - smd_smap) * MAXBSIZE),
2094 					    MAXBSIZE, HAT_UNLOAD);
2095 				}
2096 
2097 			} else {
2098 				if (segmap_kpm)
2099 					segkpm_mapout_validkpme(GET_KPME(smp));
2100 
2101 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2102 				hat_unload(kas.a_hat, addr, MAXBSIZE,
2103 				    HAT_UNLOAD);
2104 			}
2105 		}
2106 		segmap_smapadd(smp);	/* add to free list */
2107 	}
2108 
2109 	mutex_exit(smtx);
2110 
2111 	if (is_kpm)
2112 		page_unlock(pp);
2113 	/*
2114 	 * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2115 	 * are set.
2116 	 */
2117 	if ((flags & ~SM_DONTNEED) != 0) {
2118 		error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2119 		    bflags, CRED(), NULL);
2120 	} else {
2121 		error = 0;
2122 	}
2123 
2124 	return (error);
2125 }
2126 
2127 /*
2128  * Dump the pages belonging to this segmap segment.
2129  */
2130 static void
2131 segmap_dump(struct seg *seg)
2132 {
2133 	struct segmap_data *smd;
2134 	struct smap *smp, *smp_end;
2135 	page_t *pp;
2136 	pfn_t pfn;
2137 	u_offset_t off;
2138 	caddr_t addr;
2139 
2140 	smd = (struct segmap_data *)seg->s_data;
2141 	addr = seg->s_base;
2142 	for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2143 	    smp < smp_end; smp++) {
2144 
2145 		if (smp->sm_refcnt) {
2146 			for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2147 				int we_own_it = 0;
2148 
2149 				/*
2150 				 * If pp == NULL, the page either does
2151 				 * not exist or is exclusively locked.
2152 				 * So determine if it exists before
2153 				 * searching for it.
2154 				 */
2155 				if ((pp = page_lookup_nowait(smp->sm_vp,
2156 				    smp->sm_off + off, SE_SHARED)))
2157 					we_own_it = 1;
2158 				else
2159 					pp = page_exists(smp->sm_vp,
2160 					    smp->sm_off + off);
2161 
2162 				if (pp) {
2163 					pfn = page_pptonum(pp);
2164 					dump_addpage(seg->s_as,
2165 					    addr + off, pfn);
2166 					if (we_own_it)
2167 						page_unlock(pp);
2168 				}
2169 				dump_timeleft = dump_timeout;
2170 			}
2171 		}
2172 		addr += MAXBSIZE;
2173 	}
2174 }
2175 
2176 /*ARGSUSED*/
2177 static int
2178 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2179     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2180 {
2181 	return (ENOTSUP);
2182 }
2183 
2184 static int
2185 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2186 {
2187 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2188 
2189 	memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2190 	memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2191 	return (0);
2192 }
2193 
2194 /*ARGSUSED*/
2195 static lgrp_mem_policy_info_t *
2196 segmap_getpolicy(struct seg *seg, caddr_t addr)
2197 {
2198 	return (NULL);
2199 }
2200 
2201 /*ARGSUSED*/
2202 static int
2203 segmap_capable(struct seg *seg, segcapability_t capability)
2204 {
2205 	return (0);
2206 }
2207 
2208 
2209 #ifdef	SEGKPM_SUPPORT
2210 
2211 /*
2212  * segkpm support routines
2213  */
2214 
2215 static caddr_t
2216 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2217 	struct smap *smp, enum seg_rw rw)
2218 {
2219 	caddr_t	base;
2220 	page_t	*pp;
2221 	int	newpage = 0;
2222 	struct kpme	*kpme;
2223 
2224 	ASSERT(smp->sm_refcnt > 0);
2225 
2226 	if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2227 		kmutex_t *smtx;
2228 
2229 		base = segkpm_create_va(off);
2230 
2231 		if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2232 		    seg, base)) == NULL) {
2233 			panic("segmap_pagecreate_kpm: "
2234 			    "page_create failed");
2235 			/*NOTREACHED*/
2236 		}
2237 
2238 		newpage = 1;
2239 		page_io_unlock(pp);
2240 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2241 
2242 		/*
2243 		 * Mark this here until the following segmap_pagecreate
2244 		 * or segmap_release.
2245 		 */
2246 		smtx = SMAPMTX(smp);
2247 		mutex_enter(smtx);
2248 		smp->sm_flags |= SM_KPM_NEWPAGE;
2249 		mutex_exit(smtx);
2250 	}
2251 
2252 	kpme = GET_KPME(smp);
2253 	if (!newpage && kpme->kpe_page == pp)
2254 		base = hat_kpm_page2va(pp, 0);
2255 	else
2256 		base = hat_kpm_mapin(pp, kpme);
2257 
2258 	/*
2259 	 * FS code may decide not to call segmap_pagecreate and we
2260 	 * don't invoke segmap_fault via TLB miss, so we have to set
2261 	 * ref and mod bits in advance.
2262 	 */
2263 	if (rw == S_WRITE) {
2264 		hat_setrefmod(pp);
2265 	} else {
2266 		ASSERT(rw == S_READ);
2267 		hat_setref(pp);
2268 	}
2269 
2270 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2271 
2272 	return (base);
2273 }
2274 
2275 /*
2276  * Find the smap structure corresponding to the
2277  * KPM addr and return it locked.
2278  */
2279 struct smap *
2280 get_smap_kpm(caddr_t addr, page_t **ppp)
2281 {
2282 	struct smap	*smp;
2283 	struct vnode	*vp;
2284 	u_offset_t	offset;
2285 	caddr_t		baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2286 	int		hashid;
2287 	kmutex_t	*hashmtx;
2288 	page_t		*pp;
2289 	union segmap_cpu *scpu;
2290 
2291 	pp = hat_kpm_vaddr2page(baseaddr);
2292 
2293 	ASSERT(pp && !PP_ISFREE(pp));
2294 	ASSERT(PAGE_LOCKED(pp));
2295 	ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2296 
2297 	vp = pp->p_vnode;
2298 	offset = pp->p_offset;
2299 	ASSERT(vp != NULL);
2300 
2301 	/*
2302 	 * Assume the last smap used on this cpu is the one needed.
2303 	 */
2304 	scpu = smd_cpu+CPU->cpu_seqid;
2305 	smp = scpu->scpu.scpu_last_smap;
2306 	mutex_enter(&smp->sm_mtx);
2307 	if (smp->sm_vp == vp && smp->sm_off == offset) {
2308 		ASSERT(smp->sm_refcnt > 0);
2309 	} else {
2310 		/*
2311 		 * Assumption wrong, find the smap on the hash chain.
2312 		 */
2313 		mutex_exit(&smp->sm_mtx);
2314 		SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2315 		hashmtx = SHASHMTX(hashid);
2316 
2317 		mutex_enter(hashmtx);
2318 		smp = smd_hash[hashid].sh_hash_list;
2319 		for (; smp != NULL; smp = smp->sm_hash) {
2320 			if (smp->sm_vp == vp && smp->sm_off == offset)
2321 				break;
2322 		}
2323 		mutex_exit(hashmtx);
2324 		if (smp) {
2325 			mutex_enter(&smp->sm_mtx);
2326 			ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2327 		}
2328 	}
2329 
2330 	if (ppp)
2331 		*ppp = smp ? pp : NULL;
2332 
2333 	return (smp);
2334 }
2335 
2336 #else	/* SEGKPM_SUPPORT */
2337 
2338 /* segkpm stubs */
2339 
2340 /*ARGSUSED*/
2341 static caddr_t
2342 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2343 	struct smap *smp, enum seg_rw rw)
2344 {
2345 	return (NULL);
2346 }
2347 
2348 /*ARGSUSED*/
2349 struct smap *
2350 get_smap_kpm(caddr_t addr, page_t **ppp)
2351 {
2352 	return (NULL);
2353 }
2354 
2355 #endif	/* SEGKPM_SUPPORT */
2356