xref: /titanic_53/usr/src/uts/common/vm/vpm.c (revision a5652762e5f7bf683d19f18542e5e39df63bad79)
1*a5652762Spraks /*
2*a5652762Spraks  * CDDL HEADER START
3*a5652762Spraks  *
4*a5652762Spraks  * The contents of this file are subject to the terms of the
5*a5652762Spraks  * Common Development and Distribution License (the "License").
6*a5652762Spraks  * You may not use this file except in compliance with the License.
7*a5652762Spraks  *
8*a5652762Spraks  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*a5652762Spraks  * or http://www.opensolaris.org/os/licensing.
10*a5652762Spraks  * See the License for the specific language governing permissions
11*a5652762Spraks  * and limitations under the License.
12*a5652762Spraks  *
13*a5652762Spraks  * When distributing Covered Code, include this CDDL HEADER in each
14*a5652762Spraks  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*a5652762Spraks  * If applicable, add the following below this CDDL HEADER, with the
16*a5652762Spraks  * fields enclosed by brackets "[]" replaced with your own identifying
17*a5652762Spraks  * information: Portions Copyright [yyyy] [name of copyright owner]
18*a5652762Spraks  *
19*a5652762Spraks  * CDDL HEADER END
20*a5652762Spraks  */
21*a5652762Spraks /*
22*a5652762Spraks  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23*a5652762Spraks  * Use is subject to license terms.
24*a5652762Spraks  */
25*a5652762Spraks 
26*a5652762Spraks #pragma ident	"%Z%%M%	%I%	%E% SMI"
27*a5652762Spraks 
28*a5652762Spraks /*
29*a5652762Spraks  * VM - generic vnode page mapping interfaces.
30*a5652762Spraks  *
31*a5652762Spraks  * Mechanism to provide temporary mappings to vnode pages.
32*a5652762Spraks  * The typical use would be to copy/access file data.
33*a5652762Spraks  */
34*a5652762Spraks 
35*a5652762Spraks #include <sys/types.h>
36*a5652762Spraks #include <sys/t_lock.h>
37*a5652762Spraks #include <sys/param.h>
38*a5652762Spraks #include <sys/sysmacros.h>
39*a5652762Spraks #include <sys/buf.h>
40*a5652762Spraks #include <sys/systm.h>
41*a5652762Spraks #include <sys/vnode.h>
42*a5652762Spraks #include <sys/mman.h>
43*a5652762Spraks #include <sys/errno.h>
44*a5652762Spraks #include <sys/cred.h>
45*a5652762Spraks #include <sys/kmem.h>
46*a5652762Spraks #include <sys/vtrace.h>
47*a5652762Spraks #include <sys/cmn_err.h>
48*a5652762Spraks #include <sys/debug.h>
49*a5652762Spraks #include <sys/thread.h>
50*a5652762Spraks #include <sys/dumphdr.h>
51*a5652762Spraks #include <sys/bitmap.h>
52*a5652762Spraks #include <sys/lgrp.h>
53*a5652762Spraks 
54*a5652762Spraks #include <vm/seg_kmem.h>
55*a5652762Spraks #include <vm/hat.h>
56*a5652762Spraks #include <vm/as.h>
57*a5652762Spraks #include <vm/seg.h>
58*a5652762Spraks #include <vm/seg_kpm.h>
59*a5652762Spraks #include <vm/seg_map.h>
60*a5652762Spraks #include <vm/page.h>
61*a5652762Spraks #include <vm/pvn.h>
62*a5652762Spraks #include <vm/rm.h>
63*a5652762Spraks #include <vm/vpm.h>
64*a5652762Spraks 
65*a5652762Spraks /*
66*a5652762Spraks  * Needs to be enabled by each platform.
67*a5652762Spraks  */
68*a5652762Spraks int vpm_enable = 0;
69*a5652762Spraks 
70*a5652762Spraks #ifdef	SEGKPM_SUPPORT
71*a5652762Spraks 
72*a5652762Spraks 
73*a5652762Spraks int	vpm_cache_enable = 1;
74*a5652762Spraks long	vpm_cache_percent = 12;
75*a5652762Spraks long	vpm_cache_size;
76*a5652762Spraks int	vpm_nfreelist = 0;
77*a5652762Spraks int	vpmd_freemsk = 0;
78*a5652762Spraks 
79*a5652762Spraks #define	VPM_S_PAD	64
80*a5652762Spraks union vpm_cpu {
81*a5652762Spraks 	struct {
82*a5652762Spraks 		int	vcpu_free_ndx;
83*a5652762Spraks 		ulong_t	vcpu_hits;
84*a5652762Spraks 		ulong_t vcpu_misses;
85*a5652762Spraks 	} vcpu;
86*a5652762Spraks 	char vpm_pad[VPM_S_PAD];
87*a5652762Spraks };
88*a5652762Spraks static union vpm_cpu	*vpmd_cpu;
89*a5652762Spraks 
90*a5652762Spraks #define	vfree_ndx	vcpu.vcpu_free_ndx
91*a5652762Spraks 
92*a5652762Spraks int	vpm_cachemode = VPMCACHE_LRU;
93*a5652762Spraks 
94*a5652762Spraks #define	PPMTX(pp) (&(pp)->p_ilock)
95*a5652762Spraks 
96*a5652762Spraks static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
97*a5652762Spraks static struct vpmfree *vpmd_free;
98*a5652762Spraks #define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
99*a5652762Spraks #define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
100*a5652762Spraks #define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
101*a5652762Spraks #define	VPMP(id)	(&vpmd_vpmap[id - 1])
102*a5652762Spraks #define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)
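/*
 * Note: VPMID()/VPMP() use a 1-based encoding so that a p_vpmref of 0 can
 * mean "no vpmap assigned". For example, vpmd_vpmap[0] is encoded as id 1,
 * and VPMP(1) maps back to &vpmd_vpmap[0].
 */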
103*a5652762Spraks 
104*a5652762Spraks 
105*a5652762Spraks #ifdef	DEBUG
106*a5652762Spraks 
107*a5652762Spraks struct	vpm_debug {
108*a5652762Spraks 	int vpmd_steals;
109*a5652762Spraks 	int vpmd_contend;
110*a5652762Spraks 	int vpmd_prevpagelocked;
111*a5652762Spraks 	int vpmd_getpagefailed;
112*a5652762Spraks 	int vpmd_zerostart;
113*a5652762Spraks 	int vpmd_emptyfreelist;
114*a5652762Spraks 	int vpmd_nofreevpms;
115*a5652762Spraks } vpm_debug;
116*a5652762Spraks 
117*a5652762Spraks #define	VPM_DEBUG(x)	((vpm_debug.x)++)
118*a5652762Spraks 
119*a5652762Spraks int	steals;
120*a5652762Spraks int	steals_mtbf = 7;
121*a5652762Spraks int	contend;
122*a5652762Spraks int	contend_mtbf = 127;
123*a5652762Spraks 
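/*
 * Debug-only fault injection: VPM_MTBF(v, f) evaluates to false once in
 * every (f + 1) calls (e.g. steals_mtbf = 7 forces the "steal" path roughly
 * once out of every 8 attempts), so the rarely-taken paths get exercised.
 */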
124*a5652762Spraks #define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))
125*a5652762Spraks 
126*a5652762Spraks #else	/* DEBUG */
127*a5652762Spraks 
128*a5652762Spraks #define	VPM_MTBF(v, f)	(1)
129*a5652762Spraks #define	VPM_DEBUG(x)	/* nothing */
130*a5652762Spraks 
131*a5652762Spraks #endif
132*a5652762Spraks 
133*a5652762Spraks /*
134*a5652762Spraks  * The vpm cache.
135*a5652762Spraks  *
136*a5652762Spraks  * The main purpose of having a cache here is to speed up page_lookup()
137*a5652762Spraks  * operations and also provide LRU (the default) behaviour for file pages. The
138*a5652762Spraks  * page_lookup() operation tends to be expensive if a page has to be
139*a5652762Spraks  * reclaimed from the system page cache ("cachelist"). Once we speed up the
140*a5652762Spraks  * page_lookup()->page_reclaim() path, there should be no need for
141*a5652762Spraks  * this cache. The system page cache (cachelist) should effectively serve the
142*a5652762Spraks  * purpose of caching file pages.
143*a5652762Spraks  *
144*a5652762Spraks  * This cache is very similar to segmap's smap cache. Each page in the
145*a5652762Spraks  * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
146*a5652762Spraks  * hash table. The page_t has a reference to the vpmap_t when cached. For a
147*a5652762Spraks  * given vnode and offset, the page is found by means of a page_lookup()
148*a5652762Spraks  * operation. Any page which has a mapping (i.e., when cached) will not be
149*a5652762Spraks  * in the system 'cachelist'. Hence the page_lookup() will not have to do a
150*a5652762Spraks  * page_reclaim(). That is how the cache serves to speed up page_lookup()
151*a5652762Spraks  * operations.
152*a5652762Spraks  *
153*a5652762Spraks  * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
154*a5652762Spraks  */
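/*
 * For example, to disable the cache, add the following line to /etc/system
 * and reboot:
 *
 *	set vpm_cache_enable = 0
 */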
155*a5652762Spraks 
156*a5652762Spraks void
157*a5652762Spraks vpm_init()
158*a5652762Spraks {
159*a5652762Spraks 	long  npages;
160*a5652762Spraks 	struct vpmap *vpm;
161*a5652762Spraks 	struct vpmfree *vpmflp;
162*a5652762Spraks 	int i, ndx;
163*a5652762Spraks 	extern void prefetch_smap_w(void *);
164*a5652762Spraks 
165*a5652762Spraks 	if (!vpm_cache_enable) {
166*a5652762Spraks 		return;
167*a5652762Spraks 	}
168*a5652762Spraks 
169*a5652762Spraks 	/*
170*a5652762Spraks 	 * Set the size of the cache.
171*a5652762Spraks 	 */
172*a5652762Spraks 	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
173*a5652762Spraks 	if (vpm_cache_size < VPMAP_MINCACHE) {
174*a5652762Spraks 		vpm_cache_size = VPMAP_MINCACHE;
175*a5652762Spraks 	}
176*a5652762Spraks 
177*a5652762Spraks 	/*
178*a5652762Spraks 	 * Number of freelists.
179*a5652762Spraks 	 */
180*a5652762Spraks 	if (vpm_nfreelist == 0) {
181*a5652762Spraks 		vpm_nfreelist = max_ncpus;
182*a5652762Spraks 	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
183*a5652762Spraks 		cmn_err(CE_WARN, "vpm_init: invalid vpm_nfreelist %d, "
184*a5652762Spraks 		    "using %d", vpm_nfreelist, 2 * max_ncpus);
185*a5652762Spraks 		vpm_nfreelist = 2 * max_ncpus;
186*a5652762Spraks 	}
187*a5652762Spraks 
188*a5652762Spraks 	/*
189*a5652762Spraks 	 * Round it up to the next power of 2
190*a5652762Spraks 	 */
191*a5652762Spraks 	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
192*a5652762Spraks 		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
193*a5652762Spraks 	}
194*a5652762Spraks 	vpmd_freemsk = vpm_nfreelist - 1;
195*a5652762Spraks 
196*a5652762Spraks 	/*
197*a5652762Spraks 	 * Use a per cpu rotor index to spread the allocations evenly
198*a5652762Spraks 	 * across the available vpm freelists.
199*a5652762Spraks 	 */
200*a5652762Spraks 	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
201*a5652762Spraks 	ndx = 0;
202*a5652762Spraks 	for (i = 0; i < max_ncpus; i++) {
203*a5652762Spraks 
204*a5652762Spraks 		vpmd_cpu[i].vfree_ndx = ndx;
205*a5652762Spraks 		ndx = (ndx + 1) & vpmd_freemsk;
206*a5652762Spraks 	}
207*a5652762Spraks 
208*a5652762Spraks 	/*
209*a5652762Spraks 	 * Allocate and initialize the freelist.
210*a5652762Spraks 	 */
211*a5652762Spraks 	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
212*a5652762Spraks 				KM_SLEEP);
213*a5652762Spraks 	for (i = 0; i < vpm_nfreelist; i++) {
214*a5652762Spraks 
215*a5652762Spraks 		vpmflp = &vpmd_free[i];
216*a5652762Spraks 		/*
217*a5652762Spraks 		 * Set up initial queue pointers. They will get flipped
218*a5652762Spraks 		 * back and forth.
219*a5652762Spraks 		 */
220*a5652762Spraks 		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
221*a5652762Spraks 		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
222*a5652762Spraks 	}
223*a5652762Spraks 
224*a5652762Spraks 	npages = mmu_btop(vpm_cache_size);
225*a5652762Spraks 
226*a5652762Spraks 
227*a5652762Spraks 	/*
228*a5652762Spraks 	 * Allocate and initialize the vpmap structs.
229*a5652762Spraks 	 */
230*a5652762Spraks 	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
231*a5652762Spraks 	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
232*a5652762Spraks 		struct vpmfree *vpmflp;
233*a5652762Spraks 		union vpm_freeq *releq;
234*a5652762Spraks 		struct vpmap *vpmapf;
235*a5652762Spraks 
236*a5652762Spraks 		/*
237*a5652762Spraks 		 * Use prefetch as we have to walk through a large number of
238*a5652762Spraks 		 * these data structures. We just use the smap's prefetch
239*a5652762Spraks 		 * routine as it does the same. This should work fine
240*a5652762Spraks 		 * for x64 (this needs to be modified when enabled on sparc).
241*a5652762Spraks 		 */
242*a5652762Spraks 		prefetch_smap_w((void *)vpm);
243*a5652762Spraks 
244*a5652762Spraks 		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
245*a5652762Spraks 
246*a5652762Spraks 		vpmflp = VPMAP2VMF(vpm);
247*a5652762Spraks 		releq = vpmflp->vpm_releq;
248*a5652762Spraks 
249*a5652762Spraks 		vpmapf = releq->vpmq_free;
250*a5652762Spraks 		if (vpmapf == NULL) {
251*a5652762Spraks 			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
252*a5652762Spraks 		} else {
253*a5652762Spraks 			vpm->vpm_next = vpmapf;
254*a5652762Spraks 			vpm->vpm_prev = vpmapf->vpm_prev;
255*a5652762Spraks 			vpmapf->vpm_prev = vpm;
256*a5652762Spraks 			vpm->vpm_prev->vpm_next = vpm;
257*a5652762Spraks 			releq->vpmq_free = vpm->vpm_next;
258*a5652762Spraks 		}
259*a5652762Spraks 
260*a5652762Spraks 		/*
261*a5652762Spraks 		 * Indicate that the vpmap is on the releq at start
262*a5652762Spraks 		 */
263*a5652762Spraks 		vpm->vpm_ndxflg = VPMRELEQ;
264*a5652762Spraks 	}
265*a5652762Spraks }
266*a5652762Spraks 
267*a5652762Spraks 
268*a5652762Spraks /*
269*a5652762Spraks  * Unhook vpm from its freelist, if it is still on one.
270*a5652762Spraks  */
271*a5652762Spraks #define	VPMAP_RMFREELIST(vpm) \
272*a5652762Spraks 	{ \
273*a5652762Spraks 		if (vpm->vpm_next != NULL) { \
274*a5652762Spraks 			union vpm_freeq *freeq; \
275*a5652762Spraks 			struct vpmfree *vpmflp; \
276*a5652762Spraks 			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
277*a5652762Spraks 			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
278*a5652762Spraks 			mutex_enter(&freeq->vpmq_mtx); \
279*a5652762Spraks 			if (freeq->vpmq_free != vpm) { \
280*a5652762Spraks 				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
281*a5652762Spraks 				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
282*a5652762Spraks 			} else if (vpm == vpm->vpm_next) { \
283*a5652762Spraks 				freeq->vpmq_free = NULL; \
284*a5652762Spraks 			} else { \
285*a5652762Spraks 				freeq->vpmq_free = vpm->vpm_next; \
286*a5652762Spraks 				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
287*a5652762Spraks 				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
288*a5652762Spraks 			} \
289*a5652762Spraks 			mutex_exit(&freeq->vpmq_mtx); \
290*a5652762Spraks 			vpm->vpm_next = vpm->vpm_prev = NULL; \
291*a5652762Spraks 		} \
292*a5652762Spraks 	}
293*a5652762Spraks 
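/*
 * Pick a freelist index for the calling CPU. In the default VPMCACHE_LRU
 * mode the per-CPU rotor is simply advanced, spreading allocations across
 * all of the freelists over time.
 */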
294*a5652762Spraks static int
295*a5652762Spraks get_freelndx(int mode)
296*a5652762Spraks {
297*a5652762Spraks 	int ndx;
298*a5652762Spraks 
299*a5652762Spraks 	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
300*a5652762Spraks 	switch (mode) {
301*a5652762Spraks 
302*a5652762Spraks 	case	VPMCACHE_LRU:
303*a5652762Spraks 	default:
304*a5652762Spraks 			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
305*a5652762Spraks 			break;
306*a5652762Spraks 	}
307*a5652762Spraks 	return (ndx);
308*a5652762Spraks }
309*a5652762Spraks 
310*a5652762Spraks 
311*a5652762Spraks /*
312*a5652762Spraks  * Find one vpmap structure from the free lists and use it for the newpage.
313*a5652762Spraks  * The previous page it cached is dissociated and released. The page_t's
314*a5652762Spraks  * p_vpmref is cleared only when the vpm it points to is locked (or,
315*a5652762Spraks  * on AMD64, when the page is exclusively locked in page_unload(), since
316*a5652762Spraks  * the p_vpmref is treated as a mapping).
317*a5652762Spraks  *
318*a5652762Spraks  * The page's p_vpmref is set while the page is
319*a5652762Spraks  * locked (at least SE_SHARED locked).
320*a5652762Spraks  */
321*a5652762Spraks static struct vpmap *
322*a5652762Spraks get_free_vpmap(page_t *newpage)
323*a5652762Spraks {
324*a5652762Spraks 	struct vpmfree *vpmflp;
325*a5652762Spraks 	kmutex_t *vmtx;
326*a5652762Spraks 	struct vpmap *vpm, *first;
327*a5652762Spraks 	union vpm_freeq *allocq, *releq;
328*a5652762Spraks 	page_t *pp = NULL;
329*a5652762Spraks 	int end_ndx, page_locked = 0;
330*a5652762Spraks 	int free_ndx;
331*a5652762Spraks 
332*a5652762Spraks 	/*
333*a5652762Spraks 	 * get the freelist bin index.
334*a5652762Spraks 	 */
335*a5652762Spraks 	free_ndx = get_freelndx(vpm_cachemode);
336*a5652762Spraks 
337*a5652762Spraks 	end_ndx = free_ndx;
338*a5652762Spraks 	vpmflp = &vpmd_free[free_ndx];
339*a5652762Spraks 
340*a5652762Spraks retry_queue:
341*a5652762Spraks 	allocq = vpmflp->vpm_allocq;
342*a5652762Spraks 	mutex_enter(&allocq->vpmq_mtx);
343*a5652762Spraks 
344*a5652762Spraks 	if ((vpm = allocq->vpmq_free) == NULL) {
345*a5652762Spraks 
346*a5652762Spraks skip_queue:
347*a5652762Spraks 		/*
348*a5652762Spraks 		 * The alloc list is empty or this queue is being skipped;
349*a5652762Spraks 		 * first see if the allocq toggled.
350*a5652762Spraks 		 */
351*a5652762Spraks 		if (vpmflp->vpm_allocq != allocq) {
352*a5652762Spraks 			/* queue changed */
353*a5652762Spraks 			mutex_exit(&allocq->vpmq_mtx);
354*a5652762Spraks 			goto retry_queue;
355*a5652762Spraks 		}
356*a5652762Spraks 		releq = vpmflp->vpm_releq;
357*a5652762Spraks 		if (!mutex_tryenter(&releq->vpmq_mtx)) {
358*a5652762Spraks 			/* cannot get releq; a free vpmap may be there now */
359*a5652762Spraks 			mutex_exit(&allocq->vpmq_mtx);
360*a5652762Spraks 
361*a5652762Spraks 			/*
362*a5652762Spraks 			 * This loop could spin forever if this thread has
363*a5652762Spraks 			 * higher priority than the thread that is holding
364*a5652762Spraks 			 * releq->vpmq_mtx. In order to force the other thread
365*a5652762Spraks 			 * to run, we'll lock/unlock the mutex which is safe
366*a5652762Spraks 			 * since we just unlocked the allocq mutex.
367*a5652762Spraks 			 */
368*a5652762Spraks 			mutex_enter(&releq->vpmq_mtx);
369*a5652762Spraks 			mutex_exit(&releq->vpmq_mtx);
370*a5652762Spraks 			goto retry_queue;
371*a5652762Spraks 		}
372*a5652762Spraks 		if (releq->vpmq_free == NULL) {
373*a5652762Spraks 			VPM_DEBUG(vpmd_emptyfreelist);
374*a5652762Spraks 			/*
375*a5652762Spraks 			 * This freelist is empty.
376*a5652762Spraks 			 * This should not happen unless clients
377*a5652762Spraks 			 * are failing to release the vpmap after
378*a5652762Spraks 			 * accessing the data. Before resorting
379*a5652762Spraks 			 * to sleeping, try the next freelist.
380*a5652762Spraks 			 */
381*a5652762Spraks 			free_ndx = (free_ndx + 1) & vpmd_freemsk;
382*a5652762Spraks 			if (free_ndx != end_ndx) {
383*a5652762Spraks 				mutex_exit(&releq->vpmq_mtx);
384*a5652762Spraks 				mutex_exit(&allocq->vpmq_mtx);
385*a5652762Spraks 				vpmflp = &vpmd_free[free_ndx];
386*a5652762Spraks 				goto retry_queue;
387*a5652762Spraks 			}
388*a5652762Spraks 			/*
389*a5652762Spraks 			 * Tried all freelists.
390*a5652762Spraks 			 * wait on this list and hope something gets freed.
391*a5652762Spraks 			 */
392*a5652762Spraks 			vpmflp->vpm_want++;
393*a5652762Spraks 			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
394*a5652762Spraks 			cv_wait(&vpmflp->vpm_free_cv,
395*a5652762Spraks 				&vpmflp->vpm_freeq[0].vpmq_mtx);
396*a5652762Spraks 			vpmflp->vpm_want--;
397*a5652762Spraks 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
398*a5652762Spraks 			vpmflp = &vpmd_free[free_ndx];
399*a5652762Spraks 			VPM_DEBUG(vpmd_nofreevpms);
400*a5652762Spraks 			goto retry_queue;
401*a5652762Spraks 		} else {
402*a5652762Spraks 			/*
403*a5652762Spraks 			 * Something on the rele queue; flip the alloc
404*a5652762Spraks 			 * and rele queues and retry.
405*a5652762Spraks 			 */
406*a5652762Spraks 			vpmflp->vpm_allocq = releq;
407*a5652762Spraks 			vpmflp->vpm_releq = allocq;
408*a5652762Spraks 			mutex_exit(&allocq->vpmq_mtx);
409*a5652762Spraks 			mutex_exit(&releq->vpmq_mtx);
410*a5652762Spraks 			if (page_locked) {
411*a5652762Spraks 				delay(hz >> 2);
412*a5652762Spraks 				page_locked = 0;
413*a5652762Spraks 			}
414*a5652762Spraks 			goto retry_queue;
415*a5652762Spraks 		}
416*a5652762Spraks 	} else {
417*a5652762Spraks 		int gotnewvpm;
418*a5652762Spraks 		kmutex_t *pmtx;
419*a5652762Spraks 		uint_t vpmref;
420*a5652762Spraks 
421*a5652762Spraks 		/*
422*a5652762Spraks 		 * Fastpath the case where we get the vpmap mutex
423*a5652762Spraks 		 * on the first try.
424*a5652762Spraks 		 */
425*a5652762Spraks 		first = vpm;
426*a5652762Spraks next_vpmap:
427*a5652762Spraks 		vmtx = VPMAPMTX(vpm);
428*a5652762Spraks 		if (!mutex_tryenter(vmtx)) {
429*a5652762Spraks 			/*
430*a5652762Spraks 			 * Another thread is trying to reclaim this slot.
431*a5652762Spraks 			 * Skip to the next queue or vpmap.
432*a5652762Spraks 			 */
433*a5652762Spraks 			if ((vpm = vpm->vpm_next) == first) {
434*a5652762Spraks 				goto skip_queue;
435*a5652762Spraks 			} else {
436*a5652762Spraks 				goto next_vpmap;
437*a5652762Spraks 			}
438*a5652762Spraks 		}
439*a5652762Spraks 
440*a5652762Spraks 		/*
441*a5652762Spraks 		 * Assign this vpm to the newpage.
442*a5652762Spraks 		 */
443*a5652762Spraks 		pmtx = PPMTX(newpage);
444*a5652762Spraks 		gotnewvpm = 0;
445*a5652762Spraks 		mutex_enter(pmtx);
446*a5652762Spraks 
447*a5652762Spraks 		/*
448*a5652762Spraks 		 * Check if some other thread already assigned a vpm to
449*a5652762Spraks 		 * this page.
450*a5652762Spraks 		 */
451*a5652762Spraks 		if ((vpmref = newpage->p_vpmref) == 0) {
452*a5652762Spraks 			newpage->p_vpmref = VPMID(vpm);
453*a5652762Spraks 			gotnewvpm = 1;
454*a5652762Spraks 		} else {
455*a5652762Spraks 			VPM_DEBUG(vpmd_contend);
456*a5652762Spraks 			mutex_exit(vmtx);
457*a5652762Spraks 		}
458*a5652762Spraks 		mutex_exit(pmtx);
459*a5652762Spraks 
460*a5652762Spraks 		if (gotnewvpm) {
461*a5652762Spraks 
462*a5652762Spraks 			/*
463*a5652762Spraks 			 * At this point, we've selected the vpm. Remove vpm
464*a5652762Spraks 			 * from its freelist. If vpm is the first one in
465*a5652762Spraks 			 * the freelist, update the head of the freelist.
466*a5652762Spraks 			 */
467*a5652762Spraks 			if (first == vpm) {
468*a5652762Spraks 				ASSERT(first == allocq->vpmq_free);
469*a5652762Spraks 				allocq->vpmq_free = vpm->vpm_next;
470*a5652762Spraks 			}
471*a5652762Spraks 
472*a5652762Spraks 			/*
473*a5652762Spraks 			 * If the head of the freelist still points to vpm,
474*a5652762Spraks 			 * then there are no more free vpmaps in that list.
475*a5652762Spraks 			 */
476*a5652762Spraks 			if (allocq->vpmq_free == vpm)
477*a5652762Spraks 				/*
478*a5652762Spraks 				 * Took the last one
479*a5652762Spraks 				 */
480*a5652762Spraks 				allocq->vpmq_free = NULL;
481*a5652762Spraks 			else {
482*a5652762Spraks 				vpm->vpm_prev->vpm_next = vpm->vpm_next;
483*a5652762Spraks 				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
484*a5652762Spraks 			}
485*a5652762Spraks 			mutex_exit(&allocq->vpmq_mtx);
486*a5652762Spraks 			vpm->vpm_prev = vpm->vpm_next = NULL;
487*a5652762Spraks 
488*a5652762Spraks 			/*
489*a5652762Spraks 			 * Disassociate the previous page. On x64 systems
490*a5652762Spraks 			 * p_vpmref is used as a mapping reference to the page.
491*a5652762Spraks 			 */
492*a5652762Spraks 			if ((pp = vpm->vpm_pp) != NULL &&
493*a5652762Spraks 				vpm->vpm_vp == pp->p_vnode &&
494*a5652762Spraks 				vpm->vpm_off == pp->p_offset) {
495*a5652762Spraks 
496*a5652762Spraks 				pmtx = PPMTX(pp);
497*a5652762Spraks 				if (page_trylock(pp, SE_SHARED)) {
498*a5652762Spraks 					/*
499*a5652762Spraks 					 * Now verify that it is the correct
500*a5652762Spraks 					 * page. If not, someone else stole it,
501*a5652762Spraks 					 * so just unlock it and leave.
502*a5652762Spraks 					 */
503*a5652762Spraks 					mutex_enter(pmtx);
504*a5652762Spraks 					if (PP_ISFREE(pp) ||
505*a5652762Spraks 						vpm->vpm_vp != pp->p_vnode ||
506*a5652762Spraks 						vpm->vpm_off != pp->p_offset ||
507*a5652762Spraks 						pp->p_vpmref != VPMID(vpm)) {
508*a5652762Spraks 						mutex_exit(pmtx);
509*a5652762Spraks 
510*a5652762Spraks 						page_unlock(pp);
511*a5652762Spraks 					} else {
512*a5652762Spraks 						/*
513*a5652762Spraks 						 * Release the page.
514*a5652762Spraks 						 */
515*a5652762Spraks 						pp->p_vpmref = 0;
516*a5652762Spraks 						mutex_exit(pmtx);
517*a5652762Spraks 						hat_kpm_mapout(pp, 0,
518*a5652762Spraks 							hat_kpm_page2va(pp, 1));
519*a5652762Spraks 						(void) page_release(pp, 1);
520*a5652762Spraks 					}
521*a5652762Spraks 				} else {
522*a5652762Spraks 					/*
523*a5652762Spraks 					 * If the page cannot be locked, just
524*a5652762Spraks 					 * clear the p_vpmref and go.
525*a5652762Spraks 					 */
526*a5652762Spraks 					mutex_enter(pmtx);
527*a5652762Spraks 					if (pp->p_vpmref == VPMID(vpm)) {
528*a5652762Spraks 						pp->p_vpmref = 0;
529*a5652762Spraks 					}
530*a5652762Spraks 					mutex_exit(pmtx);
531*a5652762Spraks 					VPM_DEBUG(vpmd_prevpagelocked);
532*a5652762Spraks 				}
533*a5652762Spraks 			}
534*a5652762Spraks 
535*a5652762Spraks 			/*
536*a5652762Spraks 			 * Set up vpm to point to the new page.
537*a5652762Spraks 			 */
538*a5652762Spraks 			vpm->vpm_pp = newpage;
539*a5652762Spraks 			vpm->vpm_vp = newpage->p_vnode;
540*a5652762Spraks 			vpm->vpm_off = newpage->p_offset;
541*a5652762Spraks 
542*a5652762Spraks 		} else {
543*a5652762Spraks 			int steal = !VPM_MTBF(steals, steals_mtbf);
544*a5652762Spraks 			/*
545*a5652762Spraks 			 * Page already has a vpm assigned; just use that.
546*a5652762Spraks 			 * Grab the vpm mutex and verify that it is still
547*a5652762Spraks 			 * the correct one. The pp->p_vpmref should not change
548*a5652762Spraks 			 * once we have the vpm mutex and the page lock.
549*a5652762Spraks 			 */
550*a5652762Spraks 			mutex_exit(&allocq->vpmq_mtx);
551*a5652762Spraks 			vpm = VPMP(vpmref);
552*a5652762Spraks 			vmtx = VPMAPMTX(vpm);
553*a5652762Spraks 			mutex_enter(vmtx);
554*a5652762Spraks 			if ((steal && vpm->vpm_refcnt == 0) ||
555*a5652762Spraks 			    vpm->vpm_pp != newpage) {
556*a5652762Spraks 				/*
557*a5652762Spraks 				 * The vpm got stolen; clear the
558*a5652762Spraks 				 * p_vpmref and retry.
559*a5652762Spraks 				 */
560*a5652762Spraks 				pmtx = PPMTX(newpage);
561*a5652762Spraks 				mutex_enter(pmtx);
562*a5652762Spraks 				if (newpage->p_vpmref == vpmref) {
563*a5652762Spraks 					newpage->p_vpmref = 0;
564*a5652762Spraks 				}
565*a5652762Spraks 				mutex_exit(pmtx);
566*a5652762Spraks 
567*a5652762Spraks 				mutex_exit(vmtx);
568*a5652762Spraks 				VPM_DEBUG(vpmd_steals);
569*a5652762Spraks 				goto retry_queue;
570*a5652762Spraks 			} else if (vpm->vpm_refcnt == 0) {
571*a5652762Spraks 				/*
572*a5652762Spraks 				 * Remove it from the free list if it
573*a5652762Spraks 				 * exists there.
574*a5652762Spraks 				 */
575*a5652762Spraks 				VPMAP_RMFREELIST(vpm);
576*a5652762Spraks 			}
577*a5652762Spraks 		}
578*a5652762Spraks 		return (vpm);
579*a5652762Spraks 	}
580*a5652762Spraks }
581*a5652762Spraks 
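/*
 * Return a vpmap with a zero reference count to its freelist, adding it to
 * the tail of the release queue and signalling any waiter.
 */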
582*a5652762Spraks static void
583*a5652762Spraks free_vpmap(struct vpmap *vpm)
584*a5652762Spraks {
585*a5652762Spraks 	struct vpmfree *vpmflp;
586*a5652762Spraks 	struct vpmap *vpmfreelist;
587*a5652762Spraks 	union vpm_freeq *releq;
588*a5652762Spraks 
589*a5652762Spraks 	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
590*a5652762Spraks 
591*a5652762Spraks 	if (vpm->vpm_refcnt != 0) {
592*a5652762Spraks 		panic("free_vpmap");
593*a5652762Spraks 		/*NOTREACHED*/
594*a5652762Spraks 	}
595*a5652762Spraks 
596*a5652762Spraks 	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
597*a5652762Spraks 	/*
598*a5652762Spraks 	 * Add to the tail of the release queue.
599*a5652762Spraks 	 * Note that vpm_releq and vpm_allocq could toggle
600*a5652762Spraks 	 * before we get the lock. This does not affect
601*a5652762Spraks 	 * correctness as the 2 queues are only maintained
602*a5652762Spraks 	 * to reduce lock pressure.
603*a5652762Spraks 	 */
604*a5652762Spraks 	releq = vpmflp->vpm_releq;
605*a5652762Spraks 	if (releq == &vpmflp->vpm_freeq[0]) {
606*a5652762Spraks 		vpm->vpm_ndxflg = 0;
607*a5652762Spraks 	} else {
608*a5652762Spraks 		vpm->vpm_ndxflg = 1;
609*a5652762Spraks 	}
610*a5652762Spraks 	mutex_enter(&releq->vpmq_mtx);
611*a5652762Spraks 	vpmfreelist = releq->vpmq_free;
612*a5652762Spraks 	if (vpmfreelist == 0) {
613*a5652762Spraks 		int want;
614*a5652762Spraks 
615*a5652762Spraks 		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
616*a5652762Spraks 		/*
617*a5652762Spraks 		 * Both queue mutexes are held to set vpm_want;
618*a5652762Spraks 		 * snapshot the value before dropping releq mutex.
619*a5652762Spraks 		 * If vpm_want appears after the releq mutex is dropped,
620*a5652762Spraks 		 * then the vpmap just freed is already gone.
621*a5652762Spraks 		 */
622*a5652762Spraks 		want = vpmflp->vpm_want;
623*a5652762Spraks 		mutex_exit(&releq->vpmq_mtx);
624*a5652762Spraks 		/*
625*a5652762Spraks 		 * See if there was a waiter before dropping the releq mutex,
626*a5652762Spraks 		 * then recheck after obtaining the vpm_freeq[0] mutex, as
627*a5652762Spraks 		 * another thread may have already signaled.
628*a5652762Spraks 		 */
629*a5652762Spraks 		if (want) {
630*a5652762Spraks 			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
631*a5652762Spraks 			if (vpmflp->vpm_want)
632*a5652762Spraks 				cv_signal(&vpmflp->vpm_free_cv);
633*a5652762Spraks 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
634*a5652762Spraks 		}
635*a5652762Spraks 	} else {
636*a5652762Spraks 		vpm->vpm_next = vpmfreelist;
637*a5652762Spraks 		vpm->vpm_prev = vpmfreelist->vpm_prev;
638*a5652762Spraks 		vpmfreelist->vpm_prev = vpm;
639*a5652762Spraks 		vpm->vpm_prev->vpm_next = vpm;
640*a5652762Spraks 		mutex_exit(&releq->vpmq_mtx);
641*a5652762Spraks 	}
642*a5652762Spraks }
643*a5652762Spraks 
644*a5652762Spraks /*
645*a5652762Spraks  * Get the vpmap for the page.
646*a5652762Spraks  * The refcnt of this vpm is incremented.
647*a5652762Spraks  */
648*a5652762Spraks static struct vpmap *
649*a5652762Spraks get_vpmap(page_t *pp)
650*a5652762Spraks {
651*a5652762Spraks 	struct vpmap *vpm = NULL;
652*a5652762Spraks 	kmutex_t *vmtx;
653*a5652762Spraks 	kmutex_t *pmtx;
654*a5652762Spraks 	unsigned int refid;
655*a5652762Spraks 
656*a5652762Spraks 	ASSERT((pp != NULL) && PAGE_LOCKED(pp));
657*a5652762Spraks 
658*a5652762Spraks 	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
659*a5652762Spraks 		vpm = VPMP(refid);
660*a5652762Spraks 		vmtx = VPMAPMTX(vpm);
661*a5652762Spraks 		mutex_enter(vmtx);
662*a5652762Spraks 		/*
663*a5652762Spraks 		 * Since we have the page lock and the vpm mutex, the
664*a5652762Spraks 		 * pp->p_vpmref cannot change.
665*a5652762Spraks 		 */
666*a5652762Spraks 		if (vpm->vpm_pp != pp) {
667*a5652762Spraks 			pmtx = PPMTX(pp);
668*a5652762Spraks 
669*a5652762Spraks 			/*
670*a5652762Spraks 			 * Clear the p_vpmref as it is incorrect.
671*a5652762Spraks 			 * This can happen if the page was stolen.
672*a5652762Spraks 			 * On x64 this should not happen as p_vpmref
673*a5652762Spraks 			 * is treated as a mapping on the page. So
674*a5652762Spraks 			 * if the page is stolen, the mapping would have
675*a5652762Spraks 			 * been cleared in page_unload().
676*a5652762Spraks 			 */
677*a5652762Spraks 			mutex_enter(pmtx);
678*a5652762Spraks 			if (pp->p_vpmref == refid)
679*a5652762Spraks 				pp->p_vpmref = 0;
680*a5652762Spraks 			mutex_exit(pmtx);
681*a5652762Spraks 
682*a5652762Spraks 			mutex_exit(vmtx);
683*a5652762Spraks 			vpm = NULL;
684*a5652762Spraks 		} else if (vpm->vpm_refcnt == 0) {
685*a5652762Spraks 			/*
686*a5652762Spraks 			 * Got the vpm, remove it from the free
687*a5652762Spraks 			 * list if it exists there.
688*a5652762Spraks 			 */
689*a5652762Spraks 			VPMAP_RMFREELIST(vpm);
690*a5652762Spraks 		}
691*a5652762Spraks 	}
692*a5652762Spraks 	if (vpm == NULL) {
693*a5652762Spraks 		/*
694*a5652762Spraks 		 * get_free_vpmap() returns with the vpmap mutex held.
695*a5652762Spraks 		 */
696*a5652762Spraks 		vpm = get_free_vpmap(pp);
697*a5652762Spraks 		vmtx = VPMAPMTX(vpm);
698*a5652762Spraks 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
699*a5652762Spraks 	} else {
700*a5652762Spraks 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
701*a5652762Spraks 	}
702*a5652762Spraks 
703*a5652762Spraks 	vpm->vpm_refcnt++;
704*a5652762Spraks 	mutex_exit(vmtx);
705*a5652762Spraks 
706*a5652762Spraks 	return (vpm);
707*a5652762Spraks }
708*a5652762Spraks 
709*a5652762Spraks /* END --- vpm cache ---- */
710*a5652762Spraks 
711*a5652762Spraks /*
712*a5652762Spraks  * The vnode page mapping(vpm) interface routines.
713*a5652762Spraks  */
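/*
 * A minimal sketch of how a client of these interfaces is expected to use
 * them (the vnode, offset and length here are hypothetical; real callers
 * are filesystems such as UFS):
 *
 *	vmap_t vml[MINVMAPS];
 *	int error, newpage;
 *
 *	error = vpm_map_pages(vp, off, len, 1, vml, MINVMAPS, &newpage,
 *	    S_READ);
 *	if (error == 0) {
 *		... access the data through vml[i].vs_addr / vs_len ...
 *		vpm_unmap_pages(vml, S_READ);
 *	}
 */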
714*a5652762Spraks 
715*a5652762Spraks /*
716*a5652762Spraks  * Find or create the pages starting from baseoff for the specified
717*a5652762Spraks  * length 'len'.
718*a5652762Spraks  */
719*a5652762Spraks static int
720*a5652762Spraks vpm_pagecreate(
721*a5652762Spraks 	struct vnode *vp,
722*a5652762Spraks 	u_offset_t baseoff,
723*a5652762Spraks 	size_t len,
724*a5652762Spraks 	vmap_t vml[],
725*a5652762Spraks 	int nseg,
726*a5652762Spraks 	int *newpage)
727*a5652762Spraks {
728*a5652762Spraks 
729*a5652762Spraks 	page_t *pp = NULL;
730*a5652762Spraks 	caddr_t base;
731*a5652762Spraks 	u_offset_t off = baseoff;
732*a5652762Spraks 	int i;
733*a5652762Spraks 	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
734*a5652762Spraks 
735*a5652762Spraks 	for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++) {
736*a5652762Spraks 		struct vpmap *vpm;
737*a5652762Spraks 
738*a5652762Spraks 
739*a5652762Spraks 		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
740*a5652762Spraks 
741*a5652762Spraks 			base = segkpm_create_va(off);
742*a5652762Spraks 
743*a5652762Spraks 			/*
744*a5652762Spraks 			 * The seg pointer passed in is just advisory. Just
745*a5652762Spraks 			 * pass segkmap for now like segmap does with
746*a5652762Spraks 			 * segmap_kpm enabled.
747*a5652762Spraks 			 */
748*a5652762Spraks 			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
749*a5652762Spraks 			    segkmap, base)) == NULL) {
750*a5652762Spraks 				panic("segmap_pagecreate_vpm: "
751*a5652762Spraks 				    "page_create failed");
752*a5652762Spraks 				/*NOTREACHED*/
753*a5652762Spraks 			}
754*a5652762Spraks 			if (newpage != NULL)
755*a5652762Spraks 				*newpage = 1;
756*a5652762Spraks 
757*a5652762Spraks 			page_io_unlock(pp);
758*a5652762Spraks 		}
759*a5652762Spraks 
760*a5652762Spraks 		/*
761*a5652762Spraks 		 * Get the vpm for this page_t.
762*a5652762Spraks 		 */
763*a5652762Spraks 		if (vpm_cache_enable) {
764*a5652762Spraks 			vpm = get_vpmap(pp);
765*a5652762Spraks 			vml[i].vs_data = (void *)&vpm->vpm_pp;
766*a5652762Spraks 		} else {
767*a5652762Spraks 			vml[i].vs_data = (void *)pp;
768*a5652762Spraks 			pp->p_vpmref = 0;
769*a5652762Spraks 		}
770*a5652762Spraks 
771*a5652762Spraks 		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
772*a5652762Spraks 		vml[i].vs_len = PAGESIZE;
773*a5652762Spraks 
774*a5652762Spraks 		off += PAGESIZE;
775*a5652762Spraks 	}
776*a5652762Spraks 	vml[i].vs_data = NULL;
777*a5652762Spraks 	vml[i].vs_addr = (caddr_t)NULL;
778*a5652762Spraks 	return (0);
779*a5652762Spraks }
780*a5652762Spraks 
781*a5652762Spraks 
782*a5652762Spraks /*
783*a5652762Spraks  * Returns vpm mappings of pages in the range [off, off+len], where
784*a5652762Spraks  * len is rounded up to the PAGESIZE boundary. The list of pages and
785*a5652762Spraks  * the page addresses are returned in the SGL vml (vmap_t) array passed in.
786*a5652762Spraks  * The nseg is the number of vmap_t entries in the array.
787*a5652762Spraks  *
788*a5652762Spraks  * The maximum len currently allowed is MAXBSIZE; therefore, it will either
789*a5652762Spraks  * fetch or create one or two pages, depending on the PAGESIZE.
790*a5652762Spraks  *
791*a5652762Spraks  * segmap's SM_LOCKPROTO usage is not supported by these interfaces.
792*a5652762Spraks  * For such cases, use the seg_map interfaces.
793*a5652762Spraks  */
794*a5652762Spraks int
795*a5652762Spraks vpm_map_pages(
796*a5652762Spraks 	struct vnode *vp,
797*a5652762Spraks 	u_offset_t off,
798*a5652762Spraks 	size_t len,
799*a5652762Spraks 	int fetchpage,
800*a5652762Spraks 	vmap_t *vml,
801*a5652762Spraks 	int nseg,
802*a5652762Spraks 	int  *newpage,
803*a5652762Spraks 	enum seg_rw rw)
804*a5652762Spraks {
805*a5652762Spraks 	extern struct vnode *common_specvp();
806*a5652762Spraks 	u_offset_t baseoff;
807*a5652762Spraks 	uint_t prot;
808*a5652762Spraks 	caddr_t base;
809*a5652762Spraks 	page_t *pp, *pplist[MAXVMAPS];
810*a5652762Spraks 	struct vpmap *vpm;
811*a5652762Spraks 	int i, error = 0;
812*a5652762Spraks 
813*a5652762Spraks 	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
814*a5652762Spraks 	baseoff = off & (offset_t)PAGEMASK;
815*a5652762Spraks 	vml[0].vs_data = NULL;
816*a5652762Spraks 	vml[0].vs_addr = (caddr_t)NULL;
817*a5652762Spraks 	/*
818*a5652762Spraks 	 * For now, let's restrict it to MAXBSIZE. XXX - We can allow
819*a5652762Spraks 	 * len longer than MAXBSIZE, but there should be a limit
820*a5652762Spraks 	 * which should be determined by how many pages the VOP_GETPAGE()
821*a5652762Spraks 	 * can fetch.
822*a5652762Spraks 	 */
823*a5652762Spraks 	if (off + len > baseoff + MAXBSIZE) {
824*a5652762Spraks 		panic("vpm_map_pages bad len");
825*a5652762Spraks 		/*NOTREACHED*/
826*a5652762Spraks 	}
827*a5652762Spraks 
828*a5652762Spraks 	/*
829*a5652762Spraks 	 * If this is a block device we have to be sure to use the
830*a5652762Spraks 	 * "common" block device vnode for the mapping.
831*a5652762Spraks 	 */
832*a5652762Spraks 	if (vp->v_type == VBLK)
833*a5652762Spraks 		vp = common_specvp(vp);
834*a5652762Spraks 
835*a5652762Spraks 
836*a5652762Spraks 	if (!fetchpage)
837*a5652762Spraks 		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
838*a5652762Spraks 
839*a5652762Spraks 	for (i = 0; len > 0; len -= MIN(len, PAGESIZE), i++,
840*a5652762Spraks 						pplist[i] = NULL) {
841*a5652762Spraks 
842*a5652762Spraks 		pp = page_lookup(vp, baseoff, SE_SHARED);
843*a5652762Spraks 
844*a5652762Spraks 		/*
845*a5652762Spraks 		 * If we did not find the page or if this page was not
846*a5652762Spraks 		 * in our cache, then let VOP_GETPAGE get all the pages.
847*a5652762Spraks 		 * We need to call VOP_GETPAGE so that filesystems can do some
848*a5652762Spraks 		 * (un)necessary tracking for sequential access.
849*a5652762Spraks 		 */
850*a5652762Spraks 
851*a5652762Spraks 		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
852*a5652762Spraks 			(rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
853*a5652762Spraks 							!= (P_MOD | P_REF))) {
854*a5652762Spraks 			if (pp != NULL) {
855*a5652762Spraks 				page_unlock(pp);
856*a5652762Spraks 			}
857*a5652762Spraks 
858*a5652762Spraks 			/*
859*a5652762Spraks 			 * Pass a dummy address as it will be required
860*a5652762Spraks 			 * by page_create_va(). We pass segkmap as the seg
861*a5652762Spraks 			 * as some file systems (UFS) check it.
862*a5652762Spraks 			 */
863*a5652762Spraks 			base = segkpm_create_va(baseoff);
864*a5652762Spraks 
865*a5652762Spraks 			error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
866*a5652762Spraks 			    roundup(len, PAGESIZE), segkmap, base, rw, CRED());
867*a5652762Spraks 			if (error) {
868*a5652762Spraks 				VPM_DEBUG(vpmd_getpagefailed);
869*a5652762Spraks 				pplist[i] = NULL;
870*a5652762Spraks 			}
871*a5652762Spraks 			break;
872*a5652762Spraks 		} else {
873*a5652762Spraks 			pplist[i] = pp;
874*a5652762Spraks 			baseoff += PAGESIZE;
875*a5652762Spraks 		}
876*a5652762Spraks 	}
877*a5652762Spraks 
878*a5652762Spraks 	if (error) {
879*a5652762Spraks 		for (i = 0; pplist[i] != NULL; i++) {
880*a5652762Spraks 			page_unlock(pplist[i]);
881*a5652762Spraks 			pplist[i] = NULL;
882*a5652762Spraks 		}
883*a5652762Spraks 		vml[0].vs_addr = NULL;
884*a5652762Spraks 		vml[0].vs_data = NULL;
885*a5652762Spraks 		return (FC_MAKE_ERR(error));
886*a5652762Spraks 	}
887*a5652762Spraks 
888*a5652762Spraks 	/*
889*a5652762Spraks 	 * Get the vpms for the pages.
890*a5652762Spraks 	 */
891*a5652762Spraks 	for (i = 0; pplist[i] != NULL; i++) {
892*a5652762Spraks 		if (vpm_cache_enable) {
893*a5652762Spraks 			vpm = get_vpmap(pplist[i]);
894*a5652762Spraks 			vml[i].vs_data = (void *)&(vpm->vpm_pp);
895*a5652762Spraks 		} else {
896*a5652762Spraks 			vml[i].vs_data = (void *)pplist[i];
897*a5652762Spraks 			pplist[i]->p_vpmref = 0;
898*a5652762Spraks 		}
899*a5652762Spraks 
900*a5652762Spraks 		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
901*a5652762Spraks 		vml[i].vs_len = PAGESIZE;
902*a5652762Spraks 	}
903*a5652762Spraks 
904*a5652762Spraks 	vml[i].vs_data = NULL;
905*a5652762Spraks 	vml[i].vs_addr = (caddr_t)NULL;
906*a5652762Spraks 
907*a5652762Spraks 	return (0);
908*a5652762Spraks }
909*a5652762Spraks 
910*a5652762Spraks /*
911*a5652762Spraks  * Release the vpm mappings on the pages and unlock them.
912*a5652762Spraks  */
913*a5652762Spraks void
914*a5652762Spraks vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
915*a5652762Spraks {
916*a5652762Spraks 	int i;
917*a5652762Spraks 	struct vpmap *vpm;
918*a5652762Spraks 	kmutex_t *mtx;
919*a5652762Spraks 	page_t *pp;
920*a5652762Spraks 
921*a5652762Spraks 	for (i = 0; vml[i].vs_data != NULL; i++) {
922*a5652762Spraks 		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
923*a5652762Spraks 
924*a5652762Spraks 		if (vpm_cache_enable) {
925*a5652762Spraks 			pp = *(((page_t **)vml[i].vs_data));
926*a5652762Spraks 		} else {
927*a5652762Spraks 			pp = (page_t *)vml[i].vs_data;
928*a5652762Spraks 		}
929*a5652762Spraks 
930*a5652762Spraks 		/*
931*a5652762Spraks 		 * Mark the page as modified or referenced; accesses through vpm
932*a5652762Spraks 		 * mappings do not take the faults that would normally set these bits.
933*a5652762Spraks 		 */
934*a5652762Spraks 		if (rw == S_WRITE) {
935*a5652762Spraks 			hat_setrefmod(pp);
936*a5652762Spraks 		} else {
937*a5652762Spraks 			ASSERT(rw == S_READ);
938*a5652762Spraks 			hat_setref(pp);
939*a5652762Spraks 		}
940*a5652762Spraks 
941*a5652762Spraks 		if (vpm_cache_enable) {
942*a5652762Spraks 			page_unlock(pp);
943*a5652762Spraks 			vpm = (struct vpmap *)((char *)vml[i].vs_data
944*a5652762Spraks 					- offsetof(struct vpmap, vpm_pp));
945*a5652762Spraks 			mtx = VPMAPMTX(vpm);
946*a5652762Spraks 			mutex_enter(mtx);
947*a5652762Spraks 
948*a5652762Spraks 			if (--vpm->vpm_refcnt == 0) {
949*a5652762Spraks 				free_vpmap(vpm);
950*a5652762Spraks 			}
951*a5652762Spraks 			mutex_exit(mtx);
952*a5652762Spraks 		} else {
953*a5652762Spraks 			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
954*a5652762Spraks 			(void) page_release(pp, 1);
955*a5652762Spraks 		}
956*a5652762Spraks 		vml[i].vs_data = NULL;
957*a5652762Spraks 		vml[i].vs_addr = NULL;
958*a5652762Spraks 	}
959*a5652762Spraks }
960*a5652762Spraks 
961*a5652762Spraks /*
962*a5652762Spraks  * Given the vp, off and the uio structure, this routine will do
963*a5652762Spraks  * the copy (uiomove). If the last page created is partially written,
964*a5652762Spraks  * the rest of the page is zeroed out. It also zeros the beginning of
965*a5652762Spraks  * the first page up to the start offset if requested (zerostart).
966*a5652762Spraks  * If pages are to be fetched, it will call the filesystem's getpage
967*a5652762Spraks  * function (VOP_GETPAGE) to get them, otherwise they will be created if
968*a5652762Spraks  * not already present in the page cache.
969*a5652762Spraks  */
970*a5652762Spraks int
971*a5652762Spraks vpm_data_copy(struct vnode *vp,
972*a5652762Spraks 	u_offset_t off,
973*a5652762Spraks 	size_t len,
974*a5652762Spraks 	struct uio *uio,
975*a5652762Spraks 	int fetchpage,
976*a5652762Spraks 	int *newpage,
977*a5652762Spraks 	int zerostart,
978*a5652762Spraks 	enum seg_rw rw)
979*a5652762Spraks {
980*a5652762Spraks 	int error;
981*a5652762Spraks 	struct vmap vml[MINVMAPS];
982*a5652762Spraks 	enum uio_rw uiorw;
983*a5652762Spraks 	int npages = 0;
984*a5652762Spraks 
985*a5652762Spraks 	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
986*a5652762Spraks 	/*
987*a5652762Spraks 	 * 'off' will be the offset where the I/O starts.
988*a5652762Spraks 	 * We get the pages starting at the (off & PAGEMASK)
989*a5652762Spraks 	 * page boundary.
990*a5652762Spraks 	 */
991*a5652762Spraks 	error = vpm_map_pages(vp, off, (uint_t)len,
992*a5652762Spraks 	    fetchpage, vml, MINVMAPS, &npages, rw);
993*a5652762Spraks 
994*a5652762Spraks 	if (newpage != NULL)
995*a5652762Spraks 		*newpage = npages;
996*a5652762Spraks 	if (!error) {
997*a5652762Spraks 		int i, pn, slen = len;
998*a5652762Spraks 		int pon = off & PAGEOFFSET;
999*a5652762Spraks 
1000*a5652762Spraks 		/*
1001*a5652762Spraks 		 * Clear from the beginning of the page to the start offset
1002*a5652762Spraks 		 * if requested.
1003*a5652762Spraks 		 */
1004*a5652762Spraks 		if (!fetchpage && zerostart) {
1005*a5652762Spraks 			(void) kzero(vml[0].vs_addr,  (uint_t)pon);
1006*a5652762Spraks 			VPM_DEBUG(vpmd_zerostart);
1007*a5652762Spraks 		}
1008*a5652762Spraks 
1009*a5652762Spraks 		for (i = 0; !error && slen > 0 &&
1010*a5652762Spraks 				vml[i].vs_addr != NULL; i++) {
1011*a5652762Spraks 			pn = (int)MIN(slen, (PAGESIZE - pon));
1012*a5652762Spraks 			error = uiomove(vml[i].vs_addr + pon,
1013*a5652762Spraks 				    (long)pn, uiorw, uio);
1014*a5652762Spraks 			slen -= pn;
1015*a5652762Spraks 			pon = 0;
1016*a5652762Spraks 		}
1017*a5652762Spraks 
1018*a5652762Spraks 		/*
1019*a5652762Spraks 		 * When new pages are created, zero out part of the
1020*a5652762Spraks 		 * page we did not copy to.
1021*a5652762Spraks 		 */
1022*a5652762Spraks 		if (!fetchpage && npages &&
1023*a5652762Spraks 			uio->uio_loffset < roundup(off + len, PAGESIZE)) {
1024*a5652762Spraks 			int nzero;
1025*a5652762Spraks 
1026*a5652762Spraks 			pon = (uio->uio_loffset & PAGEOFFSET);
1027*a5652762Spraks 			nzero = PAGESIZE  - pon;
1028*a5652762Spraks 			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
1029*a5652762Spraks 			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
1030*a5652762Spraks 		}
1031*a5652762Spraks 		vpm_unmap_pages(vml, rw);
1032*a5652762Spraks 	}
1033*a5652762Spraks 	return (error);
1034*a5652762Spraks }
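/*
 * A typical write-path use of vpm_data_copy() above, sketched with
 * hypothetical variables (a real filesystem would compute 'n' from the
 * uio and decide 'fetchpage' based on whether whole pages are overwritten):
 *
 *	error = vpm_data_copy(vp, uio->uio_loffset, n, uio,
 *	    fetchpage, NULL, 0, S_WRITE);
 */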
1035*a5652762Spraks 
1036*a5652762Spraks /*
1037*a5652762Spraks  * Called to flush the pages of the given vnode covering the
1038*a5652762Spraks  * [off, off+len] range.
1039*a5652762Spraks  */
1040*a5652762Spraks int
1041*a5652762Spraks vpm_sync_pages(struct vnode *vp,
1042*a5652762Spraks 		u_offset_t off,
1043*a5652762Spraks 		size_t len,
1044*a5652762Spraks 		uint_t flags)
1045*a5652762Spraks {
1046*a5652762Spraks 	extern struct vnode *common_specvp();
1047*a5652762Spraks 	int bflags = 0;
1048*a5652762Spraks 	int error = 0;
1049*a5652762Spraks 	size_t psize = roundup(len, PAGESIZE);
1050*a5652762Spraks 
1051*a5652762Spraks 	/*
1052*a5652762Spraks 	 * If this is a block device we have to be sure to use the
1053*a5652762Spraks 	 * "common" block device vnode for the mapping.
1054*a5652762Spraks 	 */
1055*a5652762Spraks 	if (vp->v_type == VBLK)
1056*a5652762Spraks 		vp = common_specvp(vp);
1057*a5652762Spraks 
1058*a5652762Spraks 	if ((flags & ~SM_DONTNEED) != 0) {
1059*a5652762Spraks 		if (flags & SM_ASYNC)
1060*a5652762Spraks 			bflags |= B_ASYNC;
1061*a5652762Spraks 		if (flags & SM_INVAL)
1062*a5652762Spraks 			bflags |= B_INVAL;
1063*a5652762Spraks 		if (flags & SM_DESTROY)
1064*a5652762Spraks 			bflags |= (B_INVAL|B_TRUNC);
1065*a5652762Spraks 		if (flags & SM_FREE)
1066*a5652762Spraks 			bflags |= B_FREE;
1067*a5652762Spraks 		if (flags & SM_DONTNEED)
1068*a5652762Spraks 			bflags |= B_DONTNEED;
1069*a5652762Spraks 
1070*a5652762Spraks 		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED());
1071*a5652762Spraks 	}
1072*a5652762Spraks 
1073*a5652762Spraks 	return (error);
1074*a5652762Spraks }
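/*
 * For example, passing SM_ASYNC to vpm_sync_pages() above results in an
 * asynchronous (B_ASYNC) VOP_PUTPAGE of the rounded-up range, while a flags
 * value of 0 or SM_DONTNEED alone skips the VOP_PUTPAGE entirely.
 */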
1075*a5652762Spraks 
1076*a5652762Spraks 
1077*a5652762Spraks #else	/* SEGKPM_SUPPORT */
1078*a5652762Spraks 
1079*a5652762Spraks /* vpm stubs */
1080*a5652762Spraks void
1081*a5652762Spraks vpm_init()
1082*a5652762Spraks {
1083*a5652762Spraks }
1084*a5652762Spraks 
1085*a5652762Spraks /*ARGSUSED*/
1086*a5652762Spraks int
1087*a5652762Spraks vpm_pagecreate(
1088*a5652762Spraks 	struct vnode *vp,
1089*a5652762Spraks 	u_offset_t baseoff,
1090*a5652762Spraks 	size_t len,
1091*a5652762Spraks 	vmap_t vml[],
1092*a5652762Spraks 	int nseg,
1093*a5652762Spraks 	int *newpage)
1094*a5652762Spraks {
1095*a5652762Spraks 	return (0);
1096*a5652762Spraks }
1097*a5652762Spraks 
1098*a5652762Spraks /*ARGSUSED*/
1099*a5652762Spraks int
1100*a5652762Spraks vpm_map_pages(
1101*a5652762Spraks 	struct vnode *vp,
1102*a5652762Spraks 	u_offset_t off,
1103*a5652762Spraks 	size_t len,
1104*a5652762Spraks 	int fetchpage,
1105*a5652762Spraks 	vmap_t vml[],
1106*a5652762Spraks 	int nseg,
1107*a5652762Spraks 	int *newpage,
1108*a5652762Spraks 	enum seg_rw rw)
1109*a5652762Spraks {
1110*a5652762Spraks 	return (0);
1111*a5652762Spraks }
1112*a5652762Spraks 
1113*a5652762Spraks /*ARGSUSED*/
1114*a5652762Spraks int
1115*a5652762Spraks vpm_data_copy(struct vnode *vp,
1116*a5652762Spraks 	u_offset_t off,
1117*a5652762Spraks 	size_t len,
1118*a5652762Spraks 	struct uio *uio,
1119*a5652762Spraks 	int fetchpage,
1120*a5652762Spraks 	int *newpage,
1121*a5652762Spraks 	int zerostart,
1122*a5652762Spraks 	enum seg_rw rw)
1123*a5652762Spraks {
1124*a5652762Spraks 	return (0);
1125*a5652762Spraks }
1126*a5652762Spraks 
1127*a5652762Spraks /*ARGSUSED*/
1128*a5652762Spraks void
1129*a5652762Spraks vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
1130*a5652762Spraks {
1131*a5652762Spraks }
1132*a5652762Spraks /*ARGSUSED*/
1133*a5652762Spraks int
1134*a5652762Spraks vpm_sync_pages(struct vnode *vp,
1135*a5652762Spraks 		u_offset_t off,
1136*a5652762Spraks 		size_t len,
1137*a5652762Spraks 		uint_t flags)
1138*a5652762Spraks {
1139*a5652762Spraks 	return (0);
1140*a5652762Spraks }
1141*a5652762Spraks #endif	/* SEGKPM_SUPPORT */
1142