xref: /titanic_51/usr/src/uts/common/vm/seg_map.c (revision f808c858fa61e7769218966759510a8b1190dfcf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * VM - generic vnode mapping segment.
38  *
39  * The segmap driver is used only by the kernel to get faster (than seg_vn)
40  * mappings [lower routine overhead; more persistent cache] to random
41  * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/sysmacros.h>
48 #include <sys/buf.h>
49 #include <sys/systm.h>
50 #include <sys/vnode.h>
51 #include <sys/mman.h>
52 #include <sys/errno.h>
53 #include <sys/cred.h>
54 #include <sys/kmem.h>
55 #include <sys/vtrace.h>
56 #include <sys/cmn_err.h>
57 #include <sys/debug.h>
58 #include <sys/thread.h>
59 #include <sys/dumphdr.h>
60 #include <sys/bitmap.h>
61 #include <sys/lgrp.h>
62 
63 #include <vm/seg_kmem.h>
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/seg_kpm.h>
68 #include <vm/seg_map.h>
69 #include <vm/page.h>
70 #include <vm/pvn.h>
71 #include <vm/rm.h>
72 
73 /*
74  * Private seg op routines.
75  */
76 static void	segmap_free(struct seg *seg);
77 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
78 			size_t len, enum fault_type type, enum seg_rw rw);
79 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
80 static int	segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
81 			uint_t prot);
82 static int	segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
83 static int	segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
84 			uint_t *protv);
85 static u_offset_t	segmap_getoffset(struct seg *seg, caddr_t addr);
86 static int	segmap_gettype(struct seg *seg, caddr_t addr);
87 static int	segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
88 static void	segmap_dump(struct seg *seg);
89 static int	segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
90 			struct page ***ppp, enum lock_type type,
91 			enum seg_rw rw);
92 static void	segmap_badop(void);
93 static int	segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
94 static lgrp_mem_policy_info_t	*segmap_getpolicy(struct seg *seg,
95     caddr_t addr);
96 static int	segmap_capable(struct seg *seg, segcapability_t capability);
97 
98 /* segkpm support */
99 static caddr_t	segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
100 			struct smap *, enum seg_rw);
101 struct smap	*get_smap_kpm(caddr_t, page_t **);
102 
/*
 * SEGMAP_BADOP(t) yields segmap_badop cast to a pointer to a function
 * returning t, so the one panic routine can fill any unsupported slot
 * of the ops vector regardless of that slot's return type.
 */
103 #define	SEGMAP_BADOP(t)	(t(*)())segmap_badop
104 
/*
 * Segment operations vector installed in seg->s_ops by segmap_create().
 * Slots filled with SEGMAP_BADOP() are not supported by segmap; calling
 * one of them ends up in segmap_badop(), which panics.
 */
105 static struct seg_ops segmap_ops = {
106 	SEGMAP_BADOP(int),	/* dup */
107 	SEGMAP_BADOP(int),	/* unmap */
108 	segmap_free,
109 	segmap_fault,
110 	segmap_faulta,
111 	SEGMAP_BADOP(int),	/* setprot */
112 	segmap_checkprot,
113 	segmap_kluster,
114 	SEGMAP_BADOP(size_t),	/* swapout */
115 	SEGMAP_BADOP(int),	/* sync */
116 	SEGMAP_BADOP(size_t),	/* incore */
117 	SEGMAP_BADOP(int),	/* lockop */
118 	segmap_getprot,
119 	segmap_getoffset,
120 	segmap_gettype,
121 	segmap_getvp,
122 	SEGMAP_BADOP(int),	/* advise */
123 	segmap_dump,
124 	segmap_pagelock,	/* pagelock */
125 	SEGMAP_BADOP(int),	/* setpgsz */
126 	segmap_getmemid,	/* getmemid */
127 	segmap_getpolicy,	/* getpolicy */
128 	segmap_capable,		/* capable */
129 };
130 
131 /*
132  * Private segmap routines.
133  */
134 static void	segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
135 			size_t len, enum seg_rw rw, struct smap *smp);
136 static void	segmap_smapadd(struct smap *smp);
137 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
138 			u_offset_t off, int hashid);
139 static void	segmap_hashout(struct smap *smp);
140 
141 
142 /*
143  * Statistics for segmap operations.
144  *
145  * No explicit locking to protect these stats.
146  */
147 struct segmapcnt segmapcnt = {
148 	{ "fault",		KSTAT_DATA_ULONG },
149 	{ "faulta",		KSTAT_DATA_ULONG },
150 	{ "getmap",		KSTAT_DATA_ULONG },
151 	{ "get_use",		KSTAT_DATA_ULONG },
152 	{ "get_reclaim",	KSTAT_DATA_ULONG },
153 	{ "get_reuse",		KSTAT_DATA_ULONG },
154 	{ "get_unused",		KSTAT_DATA_ULONG },
155 	{ "get_nofree",		KSTAT_DATA_ULONG },
156 	{ "rel_async",		KSTAT_DATA_ULONG },
157 	{ "rel_write",		KSTAT_DATA_ULONG },
158 	{ "rel_free",		KSTAT_DATA_ULONG },
159 	{ "rel_abort",		KSTAT_DATA_ULONG },
160 	{ "rel_dontneed",	KSTAT_DATA_ULONG },
161 	{ "release",		KSTAT_DATA_ULONG },
162 	{ "pagecreate",		KSTAT_DATA_ULONG },
163 	{ "free_notfree",	KSTAT_DATA_ULONG },
164 	{ "free_dirty",		KSTAT_DATA_ULONG },
165 	{ "free",		KSTAT_DATA_ULONG },
166 	{ "stolen",		KSTAT_DATA_ULONG },
167 	{ "get_nomtx",		KSTAT_DATA_ULONG }
168 };
169 
170 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
171 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
172 
/*
 * Return number of map pages (MAXBSIZE sized slots) in segment.
 */
#define	MAP_PAGES(seg)		((seg)->s_size >> MAXBSHIFT)

/*
 * Translate addr into smap number within segment.
 */
#define	MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)

/*
 * Translate addr in seg into struct smap pointer.
 *
 * The whole expansion is parenthesized so that the result can be used
 * directly as an operand, e.g. GET_SMAP(seg, addr)->sm_vp; without the
 * outer parentheses the postfix operator would bind before the '&'.
 */
#define	GET_SMAP(seg, addr)	\
	(&(((struct segmap_data *)((seg)->s_data))->	\
	    smd_sm[MAP_PAGE(seg, addr)]))

/*
 * Bit in map (16 bit bitmap); only the low four bits of bitindex are used.
 */
#define	SMAP_BIT_MASK(bitindex)	(1 << ((bitindex) & 0xf))
193 
194 static int smd_colormsk = 0;
195 static int smd_ncolor = 0;
196 static int smd_nfree = 0;
197 static int smd_freemsk = 0;
198 #ifdef DEBUG
199 static int *colors_used;
200 #endif
201 static struct smap *smd_smap;
202 static struct smaphash *smd_hash;
203 #ifdef SEGMAP_HASHSTATS
204 static unsigned int *smd_hash_len;
205 #endif
206 static struct smfree *smd_free;
207 static ulong_t smd_hashmsk = 0;
208 
/*
 * SEGMAP_MAXCOLOR bounds the number of virtual colors (segmap_create()
 * asserts smd_ncolor <= SEGMAP_MAXCOLOR) and sizes scpu_free_ndx[] below.
 * SEGMAP_CACHE_PAD is the size of the pad member of union segmap_cpu.
 */
209 #define	SEGMAP_MAXCOLOR		2
210 #define	SEGMAP_CACHE_PAD	64
211 
/*
 * Per-cpu segmap state; smd_cpu is an array of max_ncpus of these,
 * allocated in segmap_create().
 *
 * scpu_free_ndx[] holds the per color freelist rotor index and
 * scpu_last_smap is initialized to the first smap element.  The
 * remaining fields are the frequently updated counters that
 * segmap_kstat_update() sums into segmapcnt.  The union with scpu_pad
 * makes each element at least SEGMAP_CACHE_PAD bytes.
 */
212 union segmap_cpu {
213 	struct {
214 		uint32_t	scpu_free_ndx[SEGMAP_MAXCOLOR];
215 		struct smap	*scpu_last_smap;
216 		ulong_t		scpu_getmap;
217 		ulong_t		scpu_release;
218 		ulong_t		scpu_get_reclaim;
219 		ulong_t		scpu_fault;
220 		ulong_t		scpu_pagecreate;
221 		ulong_t		scpu_get_reuse;
222 	} scpu;
223 	char	scpu_pad[SEGMAP_CACHE_PAD];
224 };
225 static union segmap_cpu *smd_cpu;
226 
227 /*
228  * There are three locks in seg_map:
229  *	- per freelist mutexes
230  *	- per hashchain mutexes
231  *	- per smap mutexes
232  *
233  * The lock ordering is to get the smap mutex to lock down the slot
234  * first then the hash lock (for hash in/out (vp, off) list) or the
235  * freelist lock to put the slot back on the free list.
236  *
237  * The hash search is done by only holding the hashchain lock, when a wanted
238  * slot is found, we drop the hashchain lock then lock the slot so there
239  * is no overlapping of hashchain and smap locks. After the slot is
240  * locked, we verify again if the slot is still what we are looking
241  * for.
242  *
243  * Allocation of a free slot is done by holding the freelist lock,
244  * then locking the smap slot at the head of the freelist. This is
245  * in reversed lock order so mutex_tryenter() is used.
246  *
247  * The smap lock protects all fields in smap structure except for
248  * the link fields for hash/free lists which are protected by
249  * hashchain and freelist locks.
250  */
251 
/*
 * Address of the mutex protecting hash chain "hashid".
 */
#define	SHASHMTX(hashid)	(&smd_hash[(hashid)].sh_mtx)

/*
 * Map an smap pointer to its freelist header (SMP2SMF) or to the index
 * of that header (SMP2SMF_NDX): the slot number masked by smd_freemsk.
 * Arguments are parenthesized so any pointer expression may be passed.
 */
#define	SMP2SMF(smp)		(&smd_free[((smp) - smd_smap) & smd_freemsk])
#define	SMP2SMF_NDX(smp)	(ushort_t)(((smp) - smd_smap) & smd_freemsk)

/*
 * Address of the mutex embedded in the smap slot.
 */
#define	SMAPMTX(smp) (&(smp)->sm_mtx)

/*
 * Compute the hash chain index for (vp, off) and assign it to hashid.
 * This expands to a braced statement, not an expression.
 */
#define	SMAP_HASHFUNC(vp, off, hashid) \
	{ \
	(hashid) = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
		((off) >> MAXBSHIFT)) & smd_hashmsk); \
	}
264 
265 /*
266  * The most frequently updated kstat counters are kept in the
267  * per cpu array to avoid hot cache blocks. The update function
268  * sums the cpu local counters to update the global counters.
269  */
270 
271 /* ARGSUSED */
272 int
273 segmap_kstat_update(kstat_t *ksp, int rw)
274 {
275 	int i;
276 	ulong_t	getmap, release, get_reclaim;
277 	ulong_t	fault, pagecreate, get_reuse;
278 
279 	if (rw == KSTAT_WRITE)
280 		return (EACCES);
281 	getmap = release = get_reclaim = (ulong_t)0;
282 	fault = pagecreate = get_reuse = (ulong_t)0;
283 	for (i = 0; i < max_ncpus; i++) {
284 		getmap += smd_cpu[i].scpu.scpu_getmap;
285 		release  += smd_cpu[i].scpu.scpu_release;
286 		get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
287 		fault  += smd_cpu[i].scpu.scpu_fault;
288 		pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
289 		get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
290 	}
291 	segmapcnt.smp_getmap.value.ul = getmap;
292 	segmapcnt.smp_release.value.ul = release;
293 	segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
294 	segmapcnt.smp_fault.value.ul = fault;
295 	segmapcnt.smp_pagecreate.value.ul = pagecreate;
296 	segmapcnt.smp_get_reuse.value.ul = get_reuse;
297 	return (0);
298 }
299 
300 int
301 segmap_create(struct seg *seg, void *argsp)
302 {
303 	struct segmap_data *smd;
304 	struct smap *smp;
305 	struct smfree *sm;
306 	struct segmap_crargs *a = (struct segmap_crargs *)argsp;
307 	struct smaphash *shashp;
308 	union segmap_cpu *scpu;
309 	long i, npages;
310 	size_t hashsz;
311 	uint_t nfreelist;
312 	extern void prefetch_smap_w(void *);
313 	extern int max_ncpus;
314 
315 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
316 
317 	if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
318 		panic("segkmap not MAXBSIZE aligned");
319 		/*NOTREACHED*/
320 	}
321 
322 	smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
323 
324 	seg->s_data = (void *)smd;
325 	seg->s_ops = &segmap_ops;
326 	smd->smd_prot = a->prot;
327 
328 	/*
329 	 * Scale the number of smap freelists to be
330 	 * proportional to max_ncpus * number of virtual colors.
331 	 * The caller can over-ride this scaling by providing
332 	 * a non-zero a->nfreelist argument.
333 	 */
334 	nfreelist = a->nfreelist;
335 	if (nfreelist == 0)
336 		nfreelist = max_ncpus;
337 	else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
338 		cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
339 		"%d, using %d", nfreelist, max_ncpus);
340 		nfreelist = max_ncpus;
341 	}
342 	if (nfreelist & (nfreelist - 1)) {
343 		/* round up nfreelist to the next power of two. */
344 		nfreelist = 1 << (highbit(nfreelist));
345 	}
346 
347 	/*
348 	 * Get the number of virtual colors - must be a power of 2.
349 	 */
350 	if (a->shmsize)
351 		smd_ncolor = a->shmsize >> MAXBSHIFT;
352 	else
353 		smd_ncolor = 1;
354 	ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
355 	ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
356 	smd_colormsk = smd_ncolor - 1;
357 	smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
358 	smd_freemsk = smd_nfree - 1;
359 
360 	/*
361 	 * Allocate and initialize the freelist headers.
362 	 * Note that sm_freeq[1] starts out as the release queue. This
363 	 * is known when the smap structures are initialized below.
364 	 */
365 	smd_free = smd->smd_free =
366 	    kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
367 	for (i = 0; i < smd_nfree; i++) {
368 		sm = &smd->smd_free[i];
369 		mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
370 		mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
371 		sm->sm_allocq = &sm->sm_freeq[0];
372 		sm->sm_releq = &sm->sm_freeq[1];
373 	}
374 
375 	/*
376 	 * Allocate and initialize the smap hash chain headers.
377 	 * Compute hash size rounding down to the next power of two.
378 	 */
379 	npages = MAP_PAGES(seg);
380 	smd->smd_npages = npages;
381 	hashsz = npages / SMAP_HASHAVELEN;
382 	hashsz = 1 << (highbit(hashsz)-1);
383 	smd_hashmsk = hashsz - 1;
384 	smd_hash = smd->smd_hash =
385 	    kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
386 #ifdef SEGMAP_HASHSTATS
387 	smd_hash_len =
388 	    kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
389 #endif
390 	for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
391 		shashp->sh_hash_list = NULL;
392 		mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
393 	}
394 
395 	/*
396 	 * Allocate and initialize the smap structures.
397 	 * Link all slots onto the appropriate freelist.
398 	 * The smap array is large enough to affect boot time
399 	 * on large systems, so use memory prefetching and only
400 	 * go through the array 1 time. Inline a optimized version
401 	 * of segmap_smapadd to add structures to freelists with
402 	 * knowledge that no locks are needed here.
403 	 */
404 	smd_smap = smd->smd_sm =
405 		kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
406 
407 	for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
408 	    smp >= smd->smd_sm; smp--) {
409 		struct smap *smpfreelist;
410 		struct sm_freeq *releq;
411 
412 		prefetch_smap_w((char *)smp);
413 
414 		smp->sm_vp = NULL;
415 		smp->sm_hash = NULL;
416 		smp->sm_off = 0;
417 		smp->sm_bitmap = 0;
418 		smp->sm_refcnt = 0;
419 		mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
420 		smp->sm_free_ndx = SMP2SMF_NDX(smp);
421 
422 		sm = SMP2SMF(smp);
423 		releq = sm->sm_releq;
424 
425 		smpfreelist = releq->smq_free;
426 		if (smpfreelist == 0) {
427 			releq->smq_free = smp->sm_next = smp->sm_prev = smp;
428 		} else {
429 			smp->sm_next = smpfreelist;
430 			smp->sm_prev = smpfreelist->sm_prev;
431 			smpfreelist->sm_prev = smp;
432 			smp->sm_prev->sm_next = smp;
433 			releq->smq_free = smp->sm_next;
434 		}
435 
436 		/*
437 		 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
438 		 */
439 		smp->sm_flags = 0;
440 
441 #ifdef	SEGKPM_SUPPORT
442 		/*
443 		 * Due to the fragile prefetch loop no
444 		 * separate function is used here.
445 		 */
446 		smp->sm_kpme_next = NULL;
447 		smp->sm_kpme_prev = NULL;
448 		smp->sm_kpme_page = NULL;
449 #endif
450 	}
451 
452 	/*
453 	 * Allocate the per color indices that distribute allocation
454 	 * requests over the free lists. Each cpu will have a private
455 	 * rotor index to spread the allocations even across the available
456 	 * smap freelists. Init the scpu_last_smap field to the first
457 	 * smap element so there is no need to check for NULL.
458 	 */
459 	smd_cpu =
460 		kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
461 	for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
462 		int j;
463 		for (j = 0; j < smd_ncolor; j++)
464 			scpu->scpu.scpu_free_ndx[j] = j;
465 		scpu->scpu.scpu_last_smap = smd_smap;
466 	}
467 
468 	if (vpm_enable) {
469 		vpm_init();
470 	}
471 
472 #ifdef DEBUG
473 	/*
474 	 * Keep track of which colors are used more often.
475 	 */
476 	colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
477 #endif /* DEBUG */
478 
479 	return (0);
480 }
481 
482 static void
483 segmap_free(seg)
484 	struct seg *seg;
485 {
486 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
487 }
488 
489 /*
490  * Do a F_SOFTUNLOCK call over the range requested.
491  * The range must have already been F_SOFTLOCK'ed.
492  */
493 static void
494 segmap_unlock(
495 	struct hat *hat,
496 	struct seg *seg,
497 	caddr_t addr,
498 	size_t len,
499 	enum seg_rw rw,
500 	struct smap *smp)
501 {
502 	page_t *pp;
503 	caddr_t adr;
504 	u_offset_t off;
505 	struct vnode *vp;
506 	kmutex_t *smtx;
507 
508 	ASSERT(smp->sm_refcnt > 0);
509 
510 #ifdef lint
511 	seg = seg;
512 #endif
513 
514 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
515 
516 		/*
517 		 * We're called only from segmap_fault and this was a
518 		 * NOP in case of a kpm based smap, so dangerous things
519 		 * must have happened in the meantime. Pages are prefaulted
520 		 * and locked in segmap_getmapflt and they will not be
521 		 * unlocked until segmap_release.
522 		 */
523 		panic("segmap_unlock: called with kpm addr %p", (void *)addr);
524 		/*NOTREACHED*/
525 	}
526 
527 	vp = smp->sm_vp;
528 	off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
529 
530 	hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
531 	for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
532 		ushort_t bitmask;
533 
534 		/*
535 		 * Use page_find() instead of page_lookup() to
536 		 * find the page since we know that it has
537 		 * "shared" lock.
538 		 */
539 		pp = page_find(vp, off);
540 		if (pp == NULL) {
541 			panic("segmap_unlock: page not found");
542 			/*NOTREACHED*/
543 		}
544 
545 		if (rw == S_WRITE) {
546 			hat_setrefmod(pp);
547 		} else if (rw != S_OTHER) {
548 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
549 				"segmap_fault:pp %p vp %p offset %llx",
550 				pp, vp, off);
551 			hat_setref(pp);
552 		}
553 
554 		/*
555 		 * Clear bitmap, if the bit corresponding to "off" is set,
556 		 * since the page and translation are being unlocked.
557 		 */
558 		bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
559 
560 		/*
561 		 * Large Files: Following assertion is to verify
562 		 * the correctness of the cast to (int) above.
563 		 */
564 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
565 		smtx = SMAPMTX(smp);
566 		mutex_enter(smtx);
567 		if (smp->sm_bitmap & bitmask) {
568 			smp->sm_bitmap &= ~bitmask;
569 		}
570 		mutex_exit(smtx);
571 
572 		page_unlock(pp);
573 	}
574 }
575 
576 #define	MAXPPB	(MAXBSIZE/4096)	/* assumes minimum page size of 4k */
577 
578 /*
579  * This routine is called via a machine specific fault handling
580  * routine.  It is also called by software routines wishing to
581  * lock or unlock a range of addresses.
582  *
583  * Note that this routine expects a page-aligned "addr".
584  */
585 faultcode_t
586 segmap_fault(
587 	struct hat *hat,
588 	struct seg *seg,
589 	caddr_t addr,
590 	size_t len,
591 	enum fault_type type,
592 	enum seg_rw rw)
593 {
594 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
595 	struct smap *smp;
596 	page_t *pp, **ppp;
597 	struct vnode *vp;
598 	u_offset_t off;
599 	page_t *pl[MAXPPB + 1];
600 	uint_t prot;
601 	u_offset_t addroff;
602 	caddr_t adr;
603 	int err;
604 	u_offset_t sm_off;
605 	int hat_flag;
606 
607 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
608 		int newpage;
609 		kmutex_t *smtx;
610 
611 		/*
612 		 * Pages are successfully prefaulted and locked in
613 		 * segmap_getmapflt and can't be unlocked until
614 		 * segmap_release. No hat mappings have to be locked
615 		 * and they also can't be unlocked as long as the
616 		 * caller owns an active kpm addr.
617 		 */
618 #ifndef DEBUG
619 		if (type != F_SOFTUNLOCK)
620 			return (0);
621 #endif
622 
623 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
624 			panic("segmap_fault: smap not found "
625 			    "for addr %p", (void *)addr);
626 			/*NOTREACHED*/
627 		}
628 
629 		smtx = SMAPMTX(smp);
630 #ifdef	DEBUG
631 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
632 		if (newpage) {
633 			cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
634 				(void *)smp);
635 		}
636 
637 		if (type != F_SOFTUNLOCK) {
638 			mutex_exit(smtx);
639 			return (0);
640 		}
641 #endif
642 		mutex_exit(smtx);
643 		vp = smp->sm_vp;
644 		sm_off = smp->sm_off;
645 
646 		if (vp == NULL)
647 			return (FC_MAKE_ERR(EIO));
648 
649 		ASSERT(smp->sm_refcnt > 0);
650 
651 		addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
652 		if (addroff + len > MAXBSIZE)
653 			panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
654 			    (void *)(addr + len));
655 
656 		off = sm_off + addroff;
657 
658 		pp = page_find(vp, off);
659 
660 		if (pp == NULL)
661 			panic("segmap_fault: softunlock page not found");
662 
663 		/*
664 		 * Set ref bit also here in case of S_OTHER to avoid the
665 		 * overhead of supporting other cases than F_SOFTUNLOCK
666 		 * with segkpm. We can do this because the underlying
667 		 * pages are locked anyway.
668 		 */
669 		if (rw == S_WRITE) {
670 			hat_setrefmod(pp);
671 		} else {
672 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
673 				"segmap_fault:pp %p vp %p offset %llx",
674 				pp, vp, off);
675 			hat_setref(pp);
676 		}
677 
678 		return (0);
679 	}
680 
681 	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
682 	smp = GET_SMAP(seg, addr);
683 	vp = smp->sm_vp;
684 	sm_off = smp->sm_off;
685 
686 	if (vp == NULL)
687 		return (FC_MAKE_ERR(EIO));
688 
689 	ASSERT(smp->sm_refcnt > 0);
690 
691 	addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
692 	if (addroff + len > MAXBSIZE) {
693 		panic("segmap_fault: endaddr %p "
694 		    "exceeds MAXBSIZE chunk", (void *)(addr + len));
695 		/*NOTREACHED*/
696 	}
697 	off = sm_off + addroff;
698 
699 	/*
700 	 * First handle the easy stuff
701 	 */
702 	if (type == F_SOFTUNLOCK) {
703 		segmap_unlock(hat, seg, addr, len, rw, smp);
704 		return (0);
705 	}
706 
707 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
708 		"segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
709 	err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
710 	    seg, addr, rw, CRED());
711 
712 	if (err)
713 		return (FC_MAKE_ERR(err));
714 
715 	prot &= smd->smd_prot;
716 
717 	/*
718 	 * Handle all pages returned in the pl[] array.
719 	 * This loop is coded on the assumption that if
720 	 * there was no error from the VOP_GETPAGE routine,
721 	 * that the page list returned will contain all the
722 	 * needed pages for the vp from [off..off + len].
723 	 */
724 	ppp = pl;
725 	while ((pp = *ppp++) != NULL) {
726 		u_offset_t poff;
727 		ASSERT(pp->p_vnode == vp);
728 		hat_flag = HAT_LOAD;
729 
730 		/*
731 		 * Verify that the pages returned are within the range
732 		 * of this segmap region.  Note that it is theoretically
733 		 * possible for pages outside this range to be returned,
734 		 * but it is not very likely.  If we cannot use the
735 		 * page here, just release it and go on to the next one.
736 		 */
737 		if (pp->p_offset < sm_off ||
738 		    pp->p_offset >= sm_off + MAXBSIZE) {
739 			(void) page_release(pp, 1);
740 			continue;
741 		}
742 
743 		ASSERT(hat == kas.a_hat);
744 		poff = pp->p_offset;
745 		adr = addr + (poff - off);
746 		if (adr >= addr && adr < addr + len) {
747 			hat_setref(pp);
748 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
749 			    "segmap_fault:pp %p vp %p offset %llx",
750 			    pp, vp, poff);
751 			if (type == F_SOFTLOCK)
752 				hat_flag = HAT_LOAD_LOCK;
753 		}
754 
755 		/*
756 		 * Deal with VMODSORT pages here. If we know this is a write
757 		 * do the setmod now and allow write protection.
758 		 * As long as it's modified or not S_OTHER, remove write
759 		 * protection. With S_OTHER it's up to the FS to deal with this.
760 		 */
761 		if (IS_VMODSORT(vp)) {
762 			if (rw == S_WRITE)
763 				hat_setmod(pp);
764 			else if (rw != S_OTHER && !hat_ismod(pp))
765 				prot &= ~PROT_WRITE;
766 		}
767 
768 		hat_memload(hat, adr, pp, prot, hat_flag);
769 		if (hat_flag != HAT_LOAD_LOCK)
770 			page_unlock(pp);
771 	}
772 	return (0);
773 }
774 
775 /*
776  * This routine is used to start I/O on pages asynchronously.
777  */
778 static faultcode_t
779 segmap_faulta(struct seg *seg, caddr_t addr)
780 {
781 	struct smap *smp;
782 	struct vnode *vp;
783 	u_offset_t off;
784 	int err;
785 
786 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
787 		int	newpage;
788 		kmutex_t *smtx;
789 
790 		/*
791 		 * Pages are successfully prefaulted and locked in
792 		 * segmap_getmapflt and can't be unlocked until
793 		 * segmap_release. No hat mappings have to be locked
794 		 * and they also can't be unlocked as long as the
795 		 * caller owns an active kpm addr.
796 		 */
797 #ifdef	DEBUG
798 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
799 			panic("segmap_faulta: smap not found "
800 			    "for addr %p", (void *)addr);
801 			/*NOTREACHED*/
802 		}
803 
804 		smtx = SMAPMTX(smp);
805 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
806 		mutex_exit(smtx);
807 		if (newpage)
808 			cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
809 			    (void *)smp);
810 #endif
811 		return (0);
812 	}
813 
814 	segmapcnt.smp_faulta.value.ul++;
815 	smp = GET_SMAP(seg, addr);
816 
817 	ASSERT(smp->sm_refcnt > 0);
818 
819 	vp = smp->sm_vp;
820 	off = smp->sm_off;
821 
822 	if (vp == NULL) {
823 		cmn_err(CE_WARN, "segmap_faulta - no vp");
824 		return (FC_MAKE_ERR(EIO));
825 	}
826 
827 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
828 		"segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
829 
830 	err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
831 	    & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
832 	    seg, addr, S_READ, CRED());
833 
834 	if (err)
835 		return (FC_MAKE_ERR(err));
836 	return (0);
837 }
838 
839 /*ARGSUSED*/
840 static int
841 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
842 {
843 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
844 
845 	ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
846 
847 	/*
848 	 * Need not acquire the segment lock since
849 	 * "smd_prot" is a read-only field.
850 	 */
851 	return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
852 }
853 
854 static int
855 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
856 {
857 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
858 	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
859 
860 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
861 
862 	if (pgno != 0) {
863 		do
864 			protv[--pgno] = smd->smd_prot;
865 		while (pgno != 0);
866 	}
867 	return (0);
868 }
869 
870 static u_offset_t
871 segmap_getoffset(struct seg *seg, caddr_t addr)
872 {
873 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
874 
875 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
876 
877 	return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
878 }
879 
880 /*ARGSUSED*/
881 static int
882 segmap_gettype(struct seg *seg, caddr_t addr)
883 {
884 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
885 
886 	return (MAP_SHARED);
887 }
888 
889 /*ARGSUSED*/
890 static int
891 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
892 {
893 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
894 
895 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
896 
897 	/* XXX - This doesn't make any sense */
898 	*vpp = smd->smd_sm->sm_vp;
899 	return (0);
900 }
901 
902 /*
903  * Check to see if it makes sense to do kluster/read ahead to
904  * addr + delta relative to the mapping at addr.  We assume here
905  * that delta is a signed PAGESIZE'd multiple (which can be negative).
906  *
907  * For segmap we always "approve" of this action from our standpoint.
908  */
909 /*ARGSUSED*/
910 static int
911 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
912 {
913 	return (0);
914 }
915 
/*
 * Common target for the seg ops that segmap does not support
 * (see SEGMAP_BADOP); any call is fatal.
 */
static void
segmap_badop(void)
{
	panic("segmap_badop");
	/*NOTREACHED*/
}
922 
923 /*
924  * Special private segmap operations
925  */
926 
927 /*
928  * Add smap to the appropriate free list.
929  */
930 static void
931 segmap_smapadd(struct smap *smp)
932 {
933 	struct smfree *sm;
934 	struct smap *smpfreelist;
935 	struct sm_freeq *releq;
936 
937 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
938 
939 	if (smp->sm_refcnt != 0) {
940 		panic("segmap_smapadd");
941 		/*NOTREACHED*/
942 	}
943 
944 	sm = &smd_free[smp->sm_free_ndx];
945 	/*
946 	 * Add to the tail of the release queue
947 	 * Note that sm_releq and sm_allocq could toggle
948 	 * before we get the lock. This does not affect
949 	 * correctness as the 2 queues are only maintained
950 	 * to reduce lock pressure.
951 	 */
952 	releq = sm->sm_releq;
953 	if (releq == &sm->sm_freeq[0])
954 		smp->sm_flags |= SM_QNDX_ZERO;
955 	else
956 		smp->sm_flags &= ~SM_QNDX_ZERO;
957 	mutex_enter(&releq->smq_mtx);
958 	smpfreelist = releq->smq_free;
959 	if (smpfreelist == 0) {
960 		int want;
961 
962 		releq->smq_free = smp->sm_next = smp->sm_prev = smp;
963 		/*
964 		 * Both queue mutexes held to set sm_want;
965 		 * snapshot the value before dropping releq mutex.
966 		 * If sm_want appears after the releq mutex is dropped,
967 		 * then the smap just freed is already gone.
968 		 */
969 		want = sm->sm_want;
970 		mutex_exit(&releq->smq_mtx);
971 		/*
972 		 * See if there was a waiter before dropping the releq mutex
973 		 * then recheck after obtaining sm_freeq[0] mutex as
974 		 * the another thread may have already signaled.
975 		 */
976 		if (want) {
977 			mutex_enter(&sm->sm_freeq[0].smq_mtx);
978 			if (sm->sm_want)
979 				cv_signal(&sm->sm_free_cv);
980 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
981 		}
982 	} else {
983 		smp->sm_next = smpfreelist;
984 		smp->sm_prev = smpfreelist->sm_prev;
985 		smpfreelist->sm_prev = smp;
986 		smp->sm_prev->sm_next = smp;
987 		mutex_exit(&releq->smq_mtx);
988 	}
989 }
990 
991 
992 static struct smap *
993 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
994 {
995 	struct smap **hpp;
996 	struct smap *tmp;
997 	kmutex_t *hmtx;
998 
999 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1000 	ASSERT(smp->sm_vp == NULL);
1001 	ASSERT(smp->sm_hash == NULL);
1002 	ASSERT(smp->sm_prev == NULL);
1003 	ASSERT(smp->sm_next == NULL);
1004 	ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
1005 
1006 	hmtx = SHASHMTX(hashid);
1007 
1008 	mutex_enter(hmtx);
1009 	/*
1010 	 * First we need to verify that no one has created a smp
1011 	 * with (vp,off) as its tag before we us.
1012 	 */
1013 	for (tmp = smd_hash[hashid].sh_hash_list;
1014 	    tmp != NULL; tmp = tmp->sm_hash)
1015 		if (tmp->sm_vp == vp && tmp->sm_off == off)
1016 			break;
1017 
1018 	if (tmp == NULL) {
1019 		/*
1020 		 * No one created one yet.
1021 		 *
1022 		 * Funniness here - we don't increment the ref count on the
1023 		 * vnode * even though we have another pointer to it here.
1024 		 * The reason for this is that we don't want the fact that
1025 		 * a seg_map entry somewhere refers to a vnode to prevent the
1026 		 * vnode * itself from going away.  This is because this
1027 		 * reference to the vnode is a "soft one".  In the case where
1028 		 * a mapping is being used by a rdwr [or directory routine?]
1029 		 * there already has to be a non-zero ref count on the vnode.
1030 		 * In the case where the vp has been freed and the the smap
1031 		 * structure is on the free list, there are no pages in memory
1032 		 * that can refer to the vnode.  Thus even if we reuse the same
1033 		 * vnode/smap structure for a vnode which has the same
1034 		 * address but represents a different object, we are ok.
1035 		 */
1036 		smp->sm_vp = vp;
1037 		smp->sm_off = off;
1038 
1039 		hpp = &smd_hash[hashid].sh_hash_list;
1040 		smp->sm_hash = *hpp;
1041 		*hpp = smp;
1042 #ifdef SEGMAP_HASHSTATS
1043 		smd_hash_len[hashid]++;
1044 #endif
1045 	}
1046 	mutex_exit(hmtx);
1047 
1048 	return (tmp);
1049 }
1050 
1051 static void
1052 segmap_hashout(struct smap *smp)
1053 {
1054 	struct smap **hpp, *hp;
1055 	struct vnode *vp;
1056 	kmutex_t *mtx;
1057 	int hashid;
1058 	u_offset_t off;
1059 
1060 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1061 
1062 	vp = smp->sm_vp;
1063 	off = smp->sm_off;
1064 
1065 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
1066 	mtx = SHASHMTX(hashid);
1067 	mutex_enter(mtx);
1068 
1069 	hpp = &smd_hash[hashid].sh_hash_list;
1070 	for (;;) {
1071 		hp = *hpp;
1072 		if (hp == NULL) {
1073 			panic("segmap_hashout");
1074 			/*NOTREACHED*/
1075 		}
1076 		if (hp == smp)
1077 			break;
1078 		hpp = &hp->sm_hash;
1079 	}
1080 
1081 	*hpp = smp->sm_hash;
1082 	smp->sm_hash = NULL;
1083 #ifdef SEGMAP_HASHSTATS
1084 	smd_hash_len[hashid]--;
1085 #endif
1086 	mutex_exit(mtx);
1087 
1088 	smp->sm_vp = NULL;
1089 	smp->sm_off = (u_offset_t)0;
1090 
1091 }
1092 
1093 /*
1094  * Attempt to free unmodified, unmapped, and non locked segmap
1095  * pages.
1096  */
1097 void
1098 segmap_pagefree(struct vnode *vp, u_offset_t off)
1099 {
1100 	u_offset_t pgoff;
1101 	page_t  *pp;
1102 
1103 	for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1104 
1105 		if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1106 			continue;
1107 
1108 		switch (page_release(pp, 1)) {
1109 		case PGREL_NOTREL:
1110 			segmapcnt.smp_free_notfree.value.ul++;
1111 			break;
1112 		case PGREL_MOD:
1113 			segmapcnt.smp_free_dirty.value.ul++;
1114 			break;
1115 		case PGREL_CLEAN:
1116 			segmapcnt.smp_free.value.ul++;
1117 			break;
1118 		}
1119 	}
1120 }
1121 
1122 /*
1123  * Locks held on entry: smap lock
1124  * Locks held on exit : smap lock.
1125  */
1126 
1127 static void
1128 grab_smp(struct smap *smp, page_t *pp)
1129 {
1130 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1131 	ASSERT(smp->sm_refcnt == 0);
1132 
1133 	if (smp->sm_vp != (struct vnode *)NULL) {
1134 		struct vnode	*vp = smp->sm_vp;
1135 		u_offset_t 	off = smp->sm_off;
1136 		/*
1137 		 * Destroy old vnode association and
1138 		 * unload any hardware translations to
1139 		 * the old object.
1140 		 */
1141 		smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1142 		segmap_hashout(smp);
1143 
1144 		/*
1145 		 * This node is off freelist and hashlist,
1146 		 * so there is no reason to drop/reacquire sm_mtx
1147 		 * across calls to hat_unload.
1148 		 */
1149 		if (segmap_kpm) {
1150 			caddr_t vaddr;
1151 			int hat_unload_needed = 0;
1152 
1153 			/*
1154 			 * unload kpm mapping
1155 			 */
1156 			if (pp != NULL) {
1157 				vaddr = hat_kpm_page2va(pp, 1);
1158 				hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1159 				page_unlock(pp);
1160 			}
1161 
1162 			/*
1163 			 * Check if we have (also) the rare case of a
1164 			 * non kpm mapping.
1165 			 */
1166 			if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1167 				hat_unload_needed = 1;
1168 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1169 			}
1170 
1171 			if (hat_unload_needed) {
1172 				hat_unload(kas.a_hat, segkmap->s_base +
1173 				    ((smp - smd_smap) * MAXBSIZE),
1174 				    MAXBSIZE, HAT_UNLOAD);
1175 			}
1176 
1177 		} else {
1178 			ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1179 			smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1180 			hat_unload(kas.a_hat, segkmap->s_base +
1181 			    ((smp - smd_smap) * MAXBSIZE),
1182 			    MAXBSIZE, HAT_UNLOAD);
1183 		}
1184 		segmap_pagefree(vp, off);
1185 	}
1186 }
1187 
1188 static struct smap *
1189 get_free_smp(int free_ndx)
1190 {
1191 	struct smfree *sm;
1192 	kmutex_t *smtx;
1193 	struct smap *smp, *first;
1194 	struct sm_freeq *allocq, *releq;
1195 	struct kpme *kpme;
1196 	page_t *pp = NULL;
1197 	int end_ndx, page_locked = 0;
1198 
1199 	end_ndx = free_ndx;
1200 	sm = &smd_free[free_ndx];
1201 
1202 retry_queue:
1203 	allocq = sm->sm_allocq;
1204 	mutex_enter(&allocq->smq_mtx);
1205 
1206 	if ((smp = allocq->smq_free) == NULL) {
1207 
1208 skip_queue:
1209 		/*
1210 		 * The alloc list is empty or this queue is being skipped;
1211 		 * first see if the allocq toggled.
1212 		 */
1213 		if (sm->sm_allocq != allocq) {
1214 			/* queue changed */
1215 			mutex_exit(&allocq->smq_mtx);
1216 			goto retry_queue;
1217 		}
1218 		releq = sm->sm_releq;
1219 		if (!mutex_tryenter(&releq->smq_mtx)) {
1220 			/* cannot get releq; a free smp may be there now */
1221 			mutex_exit(&allocq->smq_mtx);
1222 
1223 			/*
1224 			 * This loop could spin forever if this thread has
1225 			 * higher priority than the thread that is holding
1226 			 * releq->smq_mtx. In order to force the other thread
1227 			 * to run, we'll lock/unlock the mutex which is safe
1228 			 * since we just unlocked the allocq mutex.
1229 			 */
1230 			mutex_enter(&releq->smq_mtx);
1231 			mutex_exit(&releq->smq_mtx);
1232 			goto retry_queue;
1233 		}
1234 		if (releq->smq_free == NULL) {
1235 			/*
1236 			 * This freelist is empty.
1237 			 * This should not happen unless clients
1238 			 * are failing to release the segmap
1239 			 * window after accessing the data.
1240 			 * Before resorting to sleeping, try
1241 			 * the next list of the same color.
1242 			 */
1243 			free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1244 			if (free_ndx != end_ndx) {
1245 				mutex_exit(&releq->smq_mtx);
1246 				mutex_exit(&allocq->smq_mtx);
1247 				sm = &smd_free[free_ndx];
1248 				goto retry_queue;
1249 			}
1250 			/*
1251 			 * Tried all freelists of the same color once,
1252 			 * wait on this list and hope something gets freed.
1253 			 */
1254 			segmapcnt.smp_get_nofree.value.ul++;
1255 			sm->sm_want++;
1256 			mutex_exit(&sm->sm_freeq[1].smq_mtx);
1257 			cv_wait(&sm->sm_free_cv,
1258 				&sm->sm_freeq[0].smq_mtx);
1259 			sm->sm_want--;
1260 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
1261 			sm = &smd_free[free_ndx];
1262 			goto retry_queue;
1263 		} else {
1264 			/*
1265 			 * Something on the rele queue; flip the alloc
1266 			 * and rele queues and retry.
1267 			 */
1268 			sm->sm_allocq = releq;
1269 			sm->sm_releq = allocq;
1270 			mutex_exit(&allocq->smq_mtx);
1271 			mutex_exit(&releq->smq_mtx);
1272 			if (page_locked) {
1273 				delay(hz >> 2);
1274 				page_locked = 0;
1275 			}
1276 			goto retry_queue;
1277 		}
1278 	} else {
1279 		/*
1280 		 * Fastpath the case we get the smap mutex
1281 		 * on the first try.
1282 		 */
1283 		first = smp;
1284 next_smap:
1285 		smtx = SMAPMTX(smp);
1286 		if (!mutex_tryenter(smtx)) {
1287 			/*
1288 			 * Another thread is trying to reclaim this slot.
1289 			 * Skip to the next queue or smap.
1290 			 */
1291 			if ((smp = smp->sm_next) == first) {
1292 				goto skip_queue;
1293 			} else {
1294 				goto next_smap;
1295 			}
1296 		} else {
1297 			/*
1298 			 * if kpme exists, get shared lock on the page
1299 			 */
1300 			if (segmap_kpm && smp->sm_vp != NULL) {
1301 
1302 				kpme = GET_KPME(smp);
1303 				pp = kpme->kpe_page;
1304 
1305 				if (pp != NULL) {
1306 					if (!page_trylock(pp, SE_SHARED)) {
1307 						smp = smp->sm_next;
1308 						mutex_exit(smtx);
1309 						page_locked = 1;
1310 
1311 						pp = NULL;
1312 
1313 						if (smp == first) {
1314 							goto skip_queue;
1315 						} else {
1316 							goto next_smap;
1317 						}
1318 					} else {
1319 						if (kpme->kpe_page == NULL) {
1320 							page_unlock(pp);
1321 							pp = NULL;
1322 						}
1323 					}
1324 				}
1325 			}
1326 
1327 			/*
1328 			 * At this point, we've selected smp.  Remove smp
1329 			 * from its freelist.  If smp is the first one in
1330 			 * the freelist, update the head of the freelist.
1331 			 */
1332 			if (first == smp) {
1333 				ASSERT(first == allocq->smq_free);
1334 				allocq->smq_free = smp->sm_next;
1335 			}
1336 
1337 			/*
1338 			 * if the head of the freelist still points to smp,
1339 			 * then there are no more free smaps in that list.
1340 			 */
1341 			if (allocq->smq_free == smp)
1342 				/*
1343 				 * Took the last one
1344 				 */
1345 				allocq->smq_free = NULL;
1346 			else {
1347 				smp->sm_prev->sm_next = smp->sm_next;
1348 				smp->sm_next->sm_prev = smp->sm_prev;
1349 			}
1350 			mutex_exit(&allocq->smq_mtx);
1351 			smp->sm_prev = smp->sm_next = NULL;
1352 
1353 			/*
1354 			 * if pp != NULL, pp must have been locked;
1355 			 * grab_smp() unlocks pp.
1356 			 */
1357 			ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1358 			grab_smp(smp, pp);
1359 			/* return smp locked. */
1360 			ASSERT(SMAPMTX(smp) == smtx);
1361 			ASSERT(MUTEX_HELD(smtx));
1362 			return (smp);
1363 		}
1364 	}
1365 }
1366 
1367 /*
1368  * Special public segmap operations
1369  */
1370 
1371 /*
1372  * Create pages (without using VOP_GETPAGE) and load up tranlations to them.
1373  * If softlock is TRUE, then set things up so that it looks like a call
1374  * to segmap_fault with F_SOFTLOCK.
1375  *
1376  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1377  *
1378  * All fields in the generic segment (struct seg) are considered to be
1379  * read-only for "segmap" even though the kernel address space (kas) may
1380  * not be locked, hence no lock is needed to access them.
1381  */
1382 int
1383 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1384 {
1385 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1386 	page_t *pp;
1387 	u_offset_t off;
1388 	struct smap *smp;
1389 	struct vnode *vp;
1390 	caddr_t eaddr;
1391 	int newpage = 0;
1392 	uint_t prot;
1393 	kmutex_t *smtx;
1394 	int hat_flag;
1395 
1396 	ASSERT(seg->s_as == &kas);
1397 
1398 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1399 		/*
1400 		 * Pages are successfully prefaulted and locked in
1401 		 * segmap_getmapflt and can't be unlocked until
1402 		 * segmap_release. The SM_KPM_NEWPAGE flag is set
1403 		 * in segmap_pagecreate_kpm when new pages are created.
1404 		 * and it is returned as "newpage" indication here.
1405 		 */
1406 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1407 			panic("segmap_pagecreate: smap not found "
1408 			    "for addr %p", (void *)addr);
1409 			/*NOTREACHED*/
1410 		}
1411 
1412 		smtx = SMAPMTX(smp);
1413 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1414 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
1415 		mutex_exit(smtx);
1416 
1417 		return (newpage);
1418 	}
1419 
1420 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1421 
1422 	eaddr = addr + len;
1423 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1424 
1425 	smp = GET_SMAP(seg, addr);
1426 
1427 	/*
1428 	 * We don't grab smp mutex here since we assume the smp
1429 	 * has a refcnt set already which prevents the slot from
1430 	 * changing its id.
1431 	 */
1432 	ASSERT(smp->sm_refcnt > 0);
1433 
1434 	vp = smp->sm_vp;
1435 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1436 	prot = smd->smd_prot;
1437 
1438 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1439 		hat_flag = HAT_LOAD;
1440 		pp = page_lookup(vp, off, SE_SHARED);
1441 		if (pp == NULL) {
1442 			ushort_t bitindex;
1443 
1444 			if ((pp = page_create_va(vp, off,
1445 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1446 				panic("segmap_pagecreate: page_create failed");
1447 				/*NOTREACHED*/
1448 			}
1449 			newpage = 1;
1450 			page_io_unlock(pp);
1451 
1452 			/*
1453 			 * Since pages created here do not contain valid
1454 			 * data until the caller writes into them, the
1455 			 * "exclusive" lock will not be dropped to prevent
1456 			 * other users from accessing the page.  We also
1457 			 * have to lock the translation to prevent a fault
1458 			 * from occuring when the virtual address mapped by
1459 			 * this page is written into.  This is necessary to
1460 			 * avoid a deadlock since we haven't dropped the
1461 			 * "exclusive" lock.
1462 			 */
1463 			bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1464 
1465 			/*
1466 			 * Large Files: The following assertion is to
1467 			 * verify the cast above.
1468 			 */
1469 			ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1470 			smtx = SMAPMTX(smp);
1471 			mutex_enter(smtx);
1472 			smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1473 			mutex_exit(smtx);
1474 
1475 			hat_flag = HAT_LOAD_LOCK;
1476 		} else if (softlock) {
1477 			hat_flag = HAT_LOAD_LOCK;
1478 		}
1479 
1480 		if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1481 			hat_setmod(pp);
1482 
1483 		hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1484 
1485 		if (hat_flag != HAT_LOAD_LOCK)
1486 			page_unlock(pp);
1487 
1488 		TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1489 		    "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1490 		    seg, addr, pp, vp, off);
1491 	}
1492 
1493 	return (newpage);
1494 }
1495 
1496 void
1497 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1498 {
1499 	struct smap	*smp;
1500 	ushort_t	bitmask;
1501 	page_t		*pp;
1502 	struct	vnode	*vp;
1503 	u_offset_t	off;
1504 	caddr_t		eaddr;
1505 	kmutex_t	*smtx;
1506 
1507 	ASSERT(seg->s_as == &kas);
1508 
1509 	eaddr = addr + len;
1510 	addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1511 
1512 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1513 		/*
1514 		 * Pages are successfully prefaulted and locked in
1515 		 * segmap_getmapflt and can't be unlocked until
1516 		 * segmap_release, so no pages or hat mappings have
1517 		 * to be unlocked at this point.
1518 		 */
1519 #ifdef DEBUG
1520 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1521 			panic("segmap_pageunlock: smap not found "
1522 			    "for addr %p", (void *)addr);
1523 			/*NOTREACHED*/
1524 		}
1525 
1526 		ASSERT(smp->sm_refcnt > 0);
1527 		mutex_exit(SMAPMTX(smp));
1528 #endif
1529 		return;
1530 	}
1531 
1532 	smp = GET_SMAP(seg, addr);
1533 	smtx = SMAPMTX(smp);
1534 
1535 	ASSERT(smp->sm_refcnt > 0);
1536 
1537 	vp = smp->sm_vp;
1538 	off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1539 
1540 	for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1541 		bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1542 
1543 		/*
1544 		 * Large Files: Following assertion is to verify
1545 		 * the correctness of the cast to (int) above.
1546 		 */
1547 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1548 
1549 		/*
1550 		 * If the bit corresponding to "off" is set,
1551 		 * clear this bit in the bitmap, unlock translations,
1552 		 * and release the "exclusive" lock on the page.
1553 		 */
1554 		if (smp->sm_bitmap & bitmask) {
1555 			mutex_enter(smtx);
1556 			smp->sm_bitmap &= ~bitmask;
1557 			mutex_exit(smtx);
1558 
1559 			hat_unlock(kas.a_hat, addr, PAGESIZE);
1560 
1561 			/*
1562 			 * Use page_find() instead of page_lookup() to
1563 			 * find the page since we know that it has
1564 			 * "exclusive" lock.
1565 			 */
1566 			pp = page_find(vp, off);
1567 			if (pp == NULL) {
1568 				panic("segmap_pageunlock: page not found");
1569 				/*NOTREACHED*/
1570 			}
1571 			if (rw == S_WRITE) {
1572 				hat_setrefmod(pp);
1573 			} else if (rw != S_OTHER) {
1574 				hat_setref(pp);
1575 			}
1576 
1577 			page_unlock(pp);
1578 		}
1579 	}
1580 }
1581 
1582 caddr_t
1583 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1584 {
1585 	return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1586 }
1587 
1588 /*
1589  * This is the magic virtual address that offset 0 of an ELF
1590  * file gets mapped to in user space. This is used to pick
1591  * the vac color on the freelist.
1592  */
1593 #define	ELF_OFFZERO_VA	(0x10000)
1594 /*
1595  * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp
1596  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
1597  * The return address is  always MAXBSIZE aligned.
1598  *
1599  * If forcefault is nonzero and the MMU translations haven't yet been created,
1600  * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
1601  */
1602 caddr_t
1603 segmap_getmapflt(
1604 	struct seg *seg,
1605 	struct vnode *vp,
1606 	u_offset_t off,
1607 	size_t len,
1608 	int forcefault,
1609 	enum seg_rw rw)
1610 {
1611 	struct smap *smp, *nsmp;
1612 	extern struct vnode *common_specvp();
1613 	caddr_t baseaddr;			/* MAXBSIZE aligned */
1614 	u_offset_t baseoff;
1615 	int newslot;
1616 	caddr_t vaddr;
1617 	int color, hashid;
1618 	kmutex_t *hashmtx, *smapmtx;
1619 	struct smfree *sm;
1620 	page_t	*pp;
1621 	struct kpme *kpme;
1622 	uint_t	prot;
1623 	caddr_t base;
1624 	page_t	*pl[MAXPPB + 1];
1625 	int	error;
1626 	int	is_kpm = 1;
1627 
1628 	ASSERT(seg->s_as == &kas);
1629 	ASSERT(seg == segkmap);
1630 
1631 	baseoff = off & (offset_t)MAXBMASK;
1632 	if (off + len > baseoff + MAXBSIZE) {
1633 		panic("segmap_getmap bad len");
1634 		/*NOTREACHED*/
1635 	}
1636 
1637 	/*
1638 	 * If this is a block device we have to be sure to use the
1639 	 * "common" block device vnode for the mapping.
1640 	 */
1641 	if (vp->v_type == VBLK)
1642 		vp = common_specvp(vp);
1643 
1644 	smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1645 
1646 	if (segmap_kpm == 0 ||
1647 	    (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1648 		is_kpm = 0;
1649 	}
1650 
1651 	SMAP_HASHFUNC(vp, off, hashid);	/* macro assigns hashid */
1652 	hashmtx = SHASHMTX(hashid);
1653 
1654 retry_hash:
1655 	mutex_enter(hashmtx);
1656 	for (smp = smd_hash[hashid].sh_hash_list;
1657 	    smp != NULL; smp = smp->sm_hash)
1658 		if (smp->sm_vp == vp && smp->sm_off == baseoff)
1659 			break;
1660 	mutex_exit(hashmtx);
1661 
1662 vrfy_smp:
1663 	if (smp != NULL) {
1664 
1665 		ASSERT(vp->v_count != 0);
1666 
1667 		/*
1668 		 * Get smap lock and recheck its tag. The hash lock
1669 		 * is dropped since the hash is based on (vp, off)
1670 		 * and (vp, off) won't change when we have smap mtx.
1671 		 */
1672 		smapmtx = SMAPMTX(smp);
1673 		mutex_enter(smapmtx);
1674 		if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1675 			mutex_exit(smapmtx);
1676 			goto retry_hash;
1677 		}
1678 
1679 		if (smp->sm_refcnt == 0) {
1680 
1681 			smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1682 
1683 			/*
1684 			 * Could still be on the free list. However, this
1685 			 * could also be an smp that is transitioning from
1686 			 * the free list when we have too much contention
1687 			 * for the smapmtx's. In this case, we have an
1688 			 * unlocked smp that is not on the free list any
1689 			 * longer, but still has a 0 refcnt.  The only way
1690 			 * to be sure is to check the freelist pointers.
1691 			 * Since we now have the smapmtx, we are guaranteed
1692 			 * that the (vp, off) won't change, so we are safe
1693 			 * to reclaim it.  get_free_smp() knows that this
1694 			 * can happen, and it will check the refcnt.
1695 			 */
1696 
1697 			if ((smp->sm_next != NULL)) {
1698 				struct sm_freeq *freeq;
1699 
1700 				ASSERT(smp->sm_prev != NULL);
1701 				sm = &smd_free[smp->sm_free_ndx];
1702 
1703 				if (smp->sm_flags & SM_QNDX_ZERO)
1704 					freeq = &sm->sm_freeq[0];
1705 				else
1706 					freeq = &sm->sm_freeq[1];
1707 
1708 				mutex_enter(&freeq->smq_mtx);
1709 				if (freeq->smq_free != smp) {
1710 					/*
1711 					 * fastpath normal case
1712 					 */
1713 					smp->sm_prev->sm_next = smp->sm_next;
1714 					smp->sm_next->sm_prev = smp->sm_prev;
1715 				} else if (smp == smp->sm_next) {
1716 					/*
1717 					 * Taking the last smap on freelist
1718 					 */
1719 					freeq->smq_free = NULL;
1720 				} else {
1721 					/*
1722 					 * Reclaiming 1st smap on list
1723 					 */
1724 					freeq->smq_free = smp->sm_next;
1725 					smp->sm_prev->sm_next = smp->sm_next;
1726 					smp->sm_next->sm_prev = smp->sm_prev;
1727 				}
1728 				mutex_exit(&freeq->smq_mtx);
1729 				smp->sm_prev = smp->sm_next = NULL;
1730 			} else {
1731 				ASSERT(smp->sm_prev == NULL);
1732 				segmapcnt.smp_stolen.value.ul++;
1733 			}
1734 
1735 		} else {
1736 			segmapcnt.smp_get_use.value.ul++;
1737 		}
1738 		smp->sm_refcnt++;		/* another user */
1739 
1740 		/*
1741 		 * We don't invoke segmap_fault via TLB miss, so we set ref
1742 		 * and mod bits in advance. For S_OTHER  we set them in
1743 		 * segmap_fault F_SOFTUNLOCK.
1744 		 */
1745 		if (is_kpm) {
1746 			if (rw == S_WRITE) {
1747 				smp->sm_flags |= SM_WRITE_DATA;
1748 			} else if (rw == S_READ) {
1749 				smp->sm_flags |= SM_READ_DATA;
1750 			}
1751 		}
1752 		mutex_exit(smapmtx);
1753 
1754 		newslot = 0;
1755 	} else {
1756 
1757 		uint32_t free_ndx, *free_ndxp;
1758 		union segmap_cpu *scpu;
1759 
1760 		/*
1761 		 * On a PAC machine or a machine with anti-alias
1762 		 * hardware, smd_colormsk will be zero.
1763 		 *
1764 		 * On a VAC machine- pick color by offset in the file
1765 		 * so we won't get VAC conflicts on elf files.
1766 		 * On data files, color does not matter but we
1767 		 * don't know what kind of file it is so we always
1768 		 * pick color by offset. This causes color
1769 		 * corresponding to file offset zero to be used more
1770 		 * heavily.
1771 		 */
1772 		color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1773 		scpu = smd_cpu+CPU->cpu_seqid;
1774 		free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1775 		free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1776 #ifdef DEBUG
1777 		colors_used[free_ndx]++;
1778 #endif /* DEBUG */
1779 
1780 		/*
1781 		 * Get a locked smp slot from the free list.
1782 		 */
1783 		smp = get_free_smp(free_ndx);
1784 		smapmtx = SMAPMTX(smp);
1785 
1786 		ASSERT(smp->sm_vp == NULL);
1787 
1788 		if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1789 			/*
1790 			 * Failed to hashin, there exists one now.
1791 			 * Return the smp we just allocated.
1792 			 */
1793 			segmap_smapadd(smp);
1794 			mutex_exit(smapmtx);
1795 
1796 			smp = nsmp;
1797 			goto vrfy_smp;
1798 		}
1799 		smp->sm_refcnt++;		/* another user */
1800 
1801 		/*
1802 		 * We don't invoke segmap_fault via TLB miss, so we set ref
1803 		 * and mod bits in advance. For S_OTHER  we set them in
1804 		 * segmap_fault F_SOFTUNLOCK.
1805 		 */
1806 		if (is_kpm) {
1807 			if (rw == S_WRITE) {
1808 				smp->sm_flags |= SM_WRITE_DATA;
1809 			} else if (rw == S_READ) {
1810 				smp->sm_flags |= SM_READ_DATA;
1811 			}
1812 		}
1813 		mutex_exit(smapmtx);
1814 
1815 		newslot = 1;
1816 	}
1817 
1818 	if (!is_kpm)
1819 		goto use_segmap_range;
1820 
1821 	/*
1822 	 * Use segkpm
1823 	 */
1824 	ASSERT(PAGESIZE == MAXBSIZE);
1825 
1826 	/*
1827 	 * remember the last smp faulted on this cpu.
1828 	 */
1829 	(smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1830 
1831 	if (forcefault == SM_PAGECREATE) {
1832 		baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1833 		return (baseaddr);
1834 	}
1835 
1836 	if (newslot == 0 &&
1837 	    (pp = GET_KPME(smp)->kpe_page) != NULL) {
1838 
1839 		/* fastpath */
1840 		switch (rw) {
1841 		case S_READ:
1842 		case S_WRITE:
1843 			if (page_trylock(pp, SE_SHARED)) {
1844 				if (PP_ISFREE(pp) ||
1845 				    !(pp->p_vnode == vp &&
1846 				    pp->p_offset == baseoff)) {
1847 					page_unlock(pp);
1848 					pp = page_lookup(vp, baseoff,
1849 						SE_SHARED);
1850 				}
1851 			} else {
1852 				pp = page_lookup(vp, baseoff, SE_SHARED);
1853 			}
1854 
1855 			if (pp == NULL) {
1856 				ASSERT(GET_KPME(smp)->kpe_page == NULL);
1857 				break;
1858 			}
1859 
1860 			if (rw == S_WRITE &&
1861 			    hat_page_getattr(pp, P_MOD | P_REF) !=
1862 			    (P_MOD | P_REF)) {
1863 				page_unlock(pp);
1864 				break;
1865 			}
1866 
1867 			/*
1868 			 * We have the p_selock as reader, grab_smp
1869 			 * can't hit us, we have bumped the smap
1870 			 * refcnt and hat_pageunload needs the
1871 			 * p_selock exclusive.
1872 			 */
1873 			kpme = GET_KPME(smp);
1874 			if (kpme->kpe_page == pp) {
1875 				baseaddr = hat_kpm_page2va(pp, 0);
1876 			} else if (kpme->kpe_page == NULL) {
1877 				baseaddr = hat_kpm_mapin(pp, kpme);
1878 			} else {
1879 				panic("segmap_getmapflt: stale "
1880 				    "kpme page, kpme %p", (void *)kpme);
1881 				/*NOTREACHED*/
1882 			}
1883 
1884 			/*
1885 			 * We don't invoke segmap_fault via TLB miss,
1886 			 * so we set ref and mod bits in advance.
1887 			 * For S_OTHER and we set them in segmap_fault
1888 			 * F_SOFTUNLOCK.
1889 			 */
1890 			if (rw == S_READ && !hat_isref(pp))
1891 				hat_setref(pp);
1892 
1893 			return (baseaddr);
1894 		default:
1895 			break;
1896 		}
1897 	}
1898 
1899 	base = segkpm_create_va(baseoff);
1900 	error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1901 	    seg, base, rw, CRED());
1902 
1903 	pp = pl[0];
1904 	if (error || pp == NULL) {
1905 		/*
1906 		 * Use segmap address slot and let segmap_fault deal
1907 		 * with the error cases. There is no error return
1908 		 * possible here.
1909 		 */
1910 		goto use_segmap_range;
1911 	}
1912 
1913 	ASSERT(pl[1] == NULL);
1914 
1915 	/*
1916 	 * When prot is not returned w/ PROT_ALL the returned pages
1917 	 * are not backed by fs blocks. For most of the segmap users
1918 	 * this is no problem, they don't write to the pages in the
1919 	 * same request and therefore don't rely on a following
1920 	 * trap driven segmap_fault. With SM_LOCKPROTO users it
1921 	 * is more secure to use segkmap adresses to allow
1922 	 * protection segmap_fault's.
1923 	 */
1924 	if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1925 		/*
1926 		 * Use segmap address slot and let segmap_fault
1927 		 * do the error return.
1928 		 */
1929 		ASSERT(rw != S_WRITE);
1930 		ASSERT(PAGE_LOCKED(pp));
1931 		page_unlock(pp);
1932 		forcefault = 0;
1933 		goto use_segmap_range;
1934 	}
1935 
1936 	/*
1937 	 * We have the p_selock as reader, grab_smp can't hit us, we
1938 	 * have bumped the smap refcnt and hat_pageunload needs the
1939 	 * p_selock exclusive.
1940 	 */
1941 	kpme = GET_KPME(smp);
1942 	if (kpme->kpe_page == pp) {
1943 		baseaddr = hat_kpm_page2va(pp, 0);
1944 	} else if (kpme->kpe_page == NULL) {
1945 		baseaddr = hat_kpm_mapin(pp, kpme);
1946 	} else {
1947 		panic("segmap_getmapflt: stale kpme page after "
1948 		    "VOP_GETPAGE, kpme %p", (void *)kpme);
1949 		/*NOTREACHED*/
1950 	}
1951 
1952 	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1953 
1954 	return (baseaddr);
1955 
1956 
1957 use_segmap_range:
1958 	baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1959 	TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1960 	    "segmap_getmap:seg %p addr %p vp %p offset %llx",
1961 	    seg, baseaddr, vp, baseoff);
1962 
1963 	/*
1964 	 * Prefault the translations
1965 	 */
1966 	vaddr = baseaddr + (off - baseoff);
1967 	if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1968 
1969 		caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1970 		    (uintptr_t)PAGEMASK);
1971 
1972 		(void) segmap_fault(kas.a_hat, seg, pgaddr,
1973 		    (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1974 		    F_INVAL, rw);
1975 	}
1976 
1977 	return (baseaddr);
1978 }
1979 
1980 int
1981 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1982 {
1983 	struct smap	*smp;
1984 	int 		error;
1985 	int		bflags = 0;
1986 	struct vnode	*vp;
1987 	u_offset_t	offset;
1988 	kmutex_t	*smtx;
1989 	int		is_kpm = 0;
1990 	page_t		*pp;
1991 
1992 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
1993 
1994 		if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1995 			panic("segmap_release: addr %p not "
1996 			    "MAXBSIZE aligned", (void *)addr);
1997 			/*NOTREACHED*/
1998 		}
1999 
2000 		if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
2001 			panic("segmap_release: smap not found "
2002 			    "for addr %p", (void *)addr);
2003 			/*NOTREACHED*/
2004 		}
2005 
2006 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2007 			"segmap_relmap:seg %p addr %p smp %p",
2008 			seg, addr, smp);
2009 
2010 		smtx = SMAPMTX(smp);
2011 
2012 		/*
2013 		 * For compatibilty reasons segmap_pagecreate_kpm sets this
2014 		 * flag to allow a following segmap_pagecreate to return
2015 		 * this as "newpage" flag. When segmap_pagecreate is not
2016 		 * called at all we clear it now.
2017 		 */
2018 		smp->sm_flags &= ~SM_KPM_NEWPAGE;
2019 		is_kpm = 1;
2020 		if (smp->sm_flags & SM_WRITE_DATA) {
2021 			hat_setrefmod(pp);
2022 		} else if (smp->sm_flags & SM_READ_DATA) {
2023 			hat_setref(pp);
2024 		}
2025 	} else {
2026 		if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2027 		    ((uintptr_t)addr & MAXBOFFSET) != 0) {
2028 			panic("segmap_release: bad addr %p", (void *)addr);
2029 			/*NOTREACHED*/
2030 		}
2031 		smp = GET_SMAP(seg, addr);
2032 
2033 		TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2034 			"segmap_relmap:seg %p addr %p smp %p",
2035 			seg, addr, smp);
2036 
2037 		smtx = SMAPMTX(smp);
2038 		mutex_enter(smtx);
2039 		smp->sm_flags |= SM_NOTKPM_RELEASED;
2040 	}
2041 
2042 	ASSERT(smp->sm_refcnt > 0);
2043 
2044 	/*
2045 	 * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2046 	 * are set.
2047 	 */
2048 	if ((flags & ~SM_DONTNEED) != 0) {
2049 		if (flags & SM_WRITE)
2050 			segmapcnt.smp_rel_write.value.ul++;
2051 		if (flags & SM_ASYNC) {
2052 			bflags |= B_ASYNC;
2053 			segmapcnt.smp_rel_async.value.ul++;
2054 		}
2055 		if (flags & SM_INVAL) {
2056 			bflags |= B_INVAL;
2057 			segmapcnt.smp_rel_abort.value.ul++;
2058 		}
2059 		if (flags & SM_DESTROY) {
2060 			bflags |= (B_INVAL|B_TRUNC);
2061 			segmapcnt.smp_rel_abort.value.ul++;
2062 		}
2063 		if (smp->sm_refcnt == 1) {
2064 			/*
2065 			 * We only bother doing the FREE and DONTNEED flags
2066 			 * if no one else is still referencing this mapping.
2067 			 */
2068 			if (flags & SM_FREE) {
2069 				bflags |= B_FREE;
2070 				segmapcnt.smp_rel_free.value.ul++;
2071 			}
2072 			if (flags & SM_DONTNEED) {
2073 				bflags |= B_DONTNEED;
2074 				segmapcnt.smp_rel_dontneed.value.ul++;
2075 			}
2076 		}
2077 	} else {
2078 		smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2079 	}
2080 
2081 	vp = smp->sm_vp;
2082 	offset = smp->sm_off;
2083 
2084 	if (--smp->sm_refcnt == 0) {
2085 
2086 		smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2087 
2088 		if (flags & (SM_INVAL|SM_DESTROY)) {
2089 			segmap_hashout(smp);	/* remove map info */
2090 			if (is_kpm) {
2091 				hat_kpm_mapout(pp, GET_KPME(smp), addr);
2092 				if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2093 					smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2094 					hat_unload(kas.a_hat, addr, MAXBSIZE,
2095 						HAT_UNLOAD);
2096 				}
2097 
2098 			} else {
2099 				if (segmap_kpm)
2100 					segkpm_mapout_validkpme(GET_KPME(smp));
2101 
2102 				smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2103 				hat_unload(kas.a_hat, addr, MAXBSIZE,
2104 					HAT_UNLOAD);
2105 			}
2106 		}
2107 		segmap_smapadd(smp);	/* add to free list */
2108 	}
2109 
2110 	mutex_exit(smtx);
2111 
2112 	if (is_kpm)
2113 		page_unlock(pp);
2114 	/*
2115 	 * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2116 	 * are set.
2117 	 */
2118 	if ((flags & ~SM_DONTNEED) != 0) {
2119 		error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2120 		    bflags, CRED());
2121 	} else {
2122 		error = 0;
2123 	}
2124 
2125 	return (error);
2126 }
2127 
2128 /*
2129  * Dump the pages belonging to this segmap segment.
2130  */
2131 static void
2132 segmap_dump(struct seg *seg)
2133 {
2134 	struct segmap_data *smd;
2135 	struct smap *smp, *smp_end;
2136 	page_t *pp;
2137 	pfn_t pfn;
2138 	u_offset_t off;
2139 	caddr_t addr;
2140 
2141 	smd = (struct segmap_data *)seg->s_data;
2142 	addr = seg->s_base;
2143 	for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2144 	    smp < smp_end; smp++) {
2145 
2146 		if (smp->sm_refcnt) {
2147 			for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2148 				int we_own_it = 0;
2149 
2150 				/*
2151 				 * If pp == NULL, the page either does
2152 				 * not exist or is exclusively locked.
2153 				 * So determine if it exists before
2154 				 * searching for it.
2155 				 */
2156 				if ((pp = page_lookup_nowait(smp->sm_vp,
2157 				    smp->sm_off + off, SE_SHARED)))
2158 					we_own_it = 1;
2159 				else
2160 					pp = page_exists(smp->sm_vp,
2161 					    smp->sm_off + off);
2162 
2163 				if (pp) {
2164 					pfn = page_pptonum(pp);
2165 					dump_addpage(seg->s_as,
2166 						addr + off, pfn);
2167 					if (we_own_it)
2168 						page_unlock(pp);
2169 				}
2170 				dump_timeleft = dump_timeout;
2171 			}
2172 		}
2173 		addr += MAXBSIZE;
2174 	}
2175 }
2176 
2177 /*ARGSUSED*/
2178 static int
2179 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2180     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2181 {
2182 	return (ENOTSUP);
2183 }
2184 
2185 static int
2186 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2187 {
2188 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2189 
2190 	memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2191 	memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2192 	return (0);
2193 }
2194 
2195 /*ARGSUSED*/
2196 static lgrp_mem_policy_info_t *
2197 segmap_getpolicy(struct seg *seg, caddr_t addr)
2198 {
2199 	return (NULL);
2200 }
2201 
2202 /*ARGSUSED*/
2203 static int
2204 segmap_capable(struct seg *seg, segcapability_t capability)
2205 {
2206 	return (0);
2207 }
2208 
2209 
2210 #ifdef	SEGKPM_SUPPORT
2211 
2212 /*
2213  * segkpm support routines
2214  */
2215 
2216 static caddr_t
2217 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2218 	struct smap *smp, enum seg_rw rw)
2219 {
2220 	caddr_t	base;
2221 	page_t	*pp;
2222 	int	newpage = 0;
2223 	struct kpme	*kpme;
2224 
2225 	ASSERT(smp->sm_refcnt > 0);
2226 
2227 	if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2228 		kmutex_t *smtx;
2229 
2230 		base = segkpm_create_va(off);
2231 
2232 		if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2233 		    seg, base)) == NULL) {
2234 			panic("segmap_pagecreate_kpm: "
2235 			    "page_create failed");
2236 			/*NOTREACHED*/
2237 		}
2238 
2239 		newpage = 1;
2240 		page_io_unlock(pp);
2241 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2242 
2243 		/*
2244 		 * Mark this here until the following segmap_pagecreate
2245 		 * or segmap_release.
2246 		 */
2247 		smtx = SMAPMTX(smp);
2248 		mutex_enter(smtx);
2249 		smp->sm_flags |= SM_KPM_NEWPAGE;
2250 		mutex_exit(smtx);
2251 	}
2252 
2253 	kpme = GET_KPME(smp);
2254 	if (!newpage && kpme->kpe_page == pp)
2255 		base = hat_kpm_page2va(pp, 0);
2256 	else
2257 		base = hat_kpm_mapin(pp, kpme);
2258 
2259 	/*
2260 	 * FS code may decide not to call segmap_pagecreate and we
2261 	 * don't invoke segmap_fault via TLB miss, so we have to set
2262 	 * ref and mod bits in advance.
2263 	 */
2264 	if (rw == S_WRITE) {
2265 		hat_setrefmod(pp);
2266 	} else {
2267 		ASSERT(rw == S_READ);
2268 		hat_setref(pp);
2269 	}
2270 
2271 	smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2272 
2273 	return (base);
2274 }
2275 
2276 /*
2277  * Find the smap structure corresponding to the
2278  * KPM addr and return it locked.
2279  */
2280 struct smap *
2281 get_smap_kpm(caddr_t addr, page_t **ppp)
2282 {
2283 	struct smap	*smp;
2284 	struct vnode	*vp;
2285 	u_offset_t	offset;
2286 	caddr_t		baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2287 	int		hashid;
2288 	kmutex_t	*hashmtx;
2289 	page_t		*pp;
2290 	union segmap_cpu *scpu;
2291 
2292 	pp = hat_kpm_vaddr2page(baseaddr);
2293 
2294 	ASSERT(pp && !PP_ISFREE(pp));
2295 	ASSERT(PAGE_LOCKED(pp));
2296 	ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2297 
2298 	vp = pp->p_vnode;
2299 	offset = pp->p_offset;
2300 	ASSERT(vp != NULL);
2301 
2302 	/*
2303 	 * Assume the last smap used on this cpu is the one needed.
2304 	 */
2305 	scpu = smd_cpu+CPU->cpu_seqid;
2306 	smp = scpu->scpu.scpu_last_smap;
2307 	mutex_enter(&smp->sm_mtx);
2308 	if (smp->sm_vp == vp && smp->sm_off == offset) {
2309 		ASSERT(smp->sm_refcnt > 0);
2310 	} else {
2311 		/*
2312 		 * Assumption wrong, find the smap on the hash chain.
2313 		 */
2314 		mutex_exit(&smp->sm_mtx);
2315 		SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2316 		hashmtx = SHASHMTX(hashid);
2317 
2318 		mutex_enter(hashmtx);
2319 		smp = smd_hash[hashid].sh_hash_list;
2320 		for (; smp != NULL; smp = smp->sm_hash) {
2321 			if (smp->sm_vp == vp && smp->sm_off == offset)
2322 				break;
2323 		}
2324 		mutex_exit(hashmtx);
2325 		if (smp) {
2326 			mutex_enter(&smp->sm_mtx);
2327 			ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2328 		}
2329 	}
2330 
2331 	if (ppp)
2332 		*ppp = smp ? pp : NULL;
2333 
2334 	return (smp);
2335 }
2336 
2337 #else	/* SEGKPM_SUPPORT */
2338 
2339 /* segkpm stubs */
2340 
2341 /*ARGSUSED*/
2342 static caddr_t
2343 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2344 	struct smap *smp, enum seg_rw rw)
2345 {
2346 	return (NULL);
2347 }
2348 
2349 /*ARGSUSED*/
2350 struct smap *
2351 get_smap_kpm(caddr_t addr, page_t **ppp)
2352 {
2353 	return (NULL);
2354 }
2355 
2356 #endif	/* SEGKPM_SUPPORT */
2357