/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


/*
 * VM - generic vnode page mapping interfaces.
 *
 * Mechanism to provide temporary mappings to vnode pages.
 * The typical use would be to copy/access file data.
 */
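
/*
 * A minimal sketch of the intended use, assuming a file system read
 * path (the variable names here are illustrative, not from this file):
 *
 *	error = vpm_data_copy(vp, uio->uio_loffset, n, uio,
 *	    1, NULL, 0, S_READ);
 *
 * vpm_data_copy() maps the pages, does the uiomove() and then releases
 * the mappings; see the interface routines below.
 */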

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mman.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>
#include <sys/lgrp.h>

#include <vm/seg_kmem.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/rm.h>
#include <vm/vpm.h>


#ifdef	SEGKPM_SUPPORT
/*
 * VPM can be disabled by setting vpm_enable = 0 in
 * /etc/system.
 */
int vpm_enable = 1;

#else

int vpm_enable = 0;

#endif

#ifdef	SEGKPM_SUPPORT


int	vpm_cache_enable = 1;
long	vpm_cache_percent = 12;
long	vpm_cache_size;
int	vpm_nfreelist = 0;
int	vpmd_freemsk = 0;

#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;
		ulong_t	vcpu_hits;
		ulong_t vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

#define	vfree_ndx	vcpu.vcpu_free_ndx

int	vpm_cachemode = VPMCACHE_LRU;

#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;
#define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
#define	VPMP(id)	(&vpmd_vpmap[id - 1])
#define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)
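
/*
 * A sketch of the id encoding used above (an illustration, not new
 * code): vpmap ids are 1-based so that a p_vpmref of 0 on a page can
 * mean "no vpm assigned":
 *
 *	uint_t id = VPMID(vpm);		id == (vpm - vpmd_vpmap) + 1
 *	struct vpmap *v = VPMP(id);	v == vpm again
 */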


#ifdef	DEBUG

struct	vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif
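
/*
 * A worked example of VPM_MTBF (illustrative): with f a power-of-two
 * minus one mask, the expression is false exactly once every f + 1
 * increments of v. With steals_mtbf == 7,
 *
 *	VPM_MTBF(steals, steals_mtbf)
 *
 * is false on every 8th call, so DEBUG kernels periodically force the
 * steal and contention paths below to be exercised.
 */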

/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also provide an LRU (default) behaviour of file pages.
 * The page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache ("cachelist"). Once we speed up
 * the page_lookup()->page_reclaim() path then there should be no need
 * for this cache. The system page cache (cachelist) should effectively
 * serve the purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is
 * no hash table. The page_t has a reference to the vpmap_t when cached.
 * For a given vnode and offset, the page is found by means of a
 * page_lookup() operation. Any page which has a mapping (i.e. when
 * cached) will not be in the system 'cachelist'. Hence the page_lookup()
 * will not have to do a page_reclaim(). That is how the cache serves to
 * speed up page_lookup() operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in
 * /etc/system.
 */

void
vpm_init()
{
	long  npages;
	struct vpmap *vpm;
	struct vpmfree *vpmflp;
	int i, ndx;
	extern void prefetch_smap_w(void *);

	if (!kpm_enable) {
		vpm_enable = 0;
	}

	if (!vpm_enable || !vpm_cache_enable) {
		return;
	}

	/*
	 * Set the size of the cache.
	 */
	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
	if (vpm_cache_size < VPMAP_MINCACHE) {
		vpm_cache_size = VPMAP_MINCACHE;
	}

	if (vpm_cache_size > VPMAP_MAXCACHE) {
		vpm_cache_size = VPMAP_MAXCACHE;
	}

	/*
	 * Number of freelists.
	 */
	if (vpm_nfreelist == 0) {
		vpm_nfreelist = max_ncpus;
	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
		cmn_err(CE_WARN, "vpm_init: invalid number of freelists "
		    "vpm_nfreelist %d, using %d", vpm_nfreelist,
		    2 * max_ncpus);
		vpm_nfreelist = 2 * max_ncpus;
	}

	/*
	 * Round it up to the next power of 2
	 */
	if (!ISP2(vpm_nfreelist)) {
		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
	}
	vpmd_freemsk = vpm_nfreelist - 1;

	/*
	 * Use a per cpu rotor index to spread the allocations evenly
	 * across the available vpm freelists.
	 */
	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
	ndx = 0;
	for (i = 0; i < max_ncpus; i++) {

		vpmd_cpu[i].vfree_ndx = ndx;
		ndx = (ndx + 1) & vpmd_freemsk;
	}

	/*
	 * Allocate and initialize the freelist.
	 */
	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
	    KM_SLEEP);
	for (i = 0; i < vpm_nfreelist; i++) {

		vpmflp = &vpmd_free[i];
		/*
		 * Set up initial queue pointers. They will get flipped
		 * back and forth.
		 */
		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
	}

	npages = mmu_btop(vpm_cache_size);


	/*
	 * Allocate and initialize the vpmap structs. We need to
	 * walk the array backwards as the prefetch happens in reverse
	 * order.
	 */
	vpmd_vpmap = kmem_alloc(sizeof (struct vpmap) * npages, KM_SLEEP);
	for (vpm = &vpmd_vpmap[npages - 1]; vpm >= vpmd_vpmap; vpm--) {
		struct vpmfree *vpmflp;
		union vpm_freeq *releq;
		struct vpmap *vpmapf;

		/*
		 * Use prefetch as we have to walk thru a large number of
		 * these data structures. We just use the smap's prefetch
		 * routine as it does the same.
		 */
		prefetch_smap_w((void *)vpm);

		vpm->vpm_vp = NULL;
		vpm->vpm_off = 0;
		vpm->vpm_pp = NULL;
		vpm->vpm_refcnt = 0;
		mutex_init(&vpm->vpm_mtx, NULL, MUTEX_DEFAULT, NULL);
		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);

		vpmflp = VPMAP2VMF(vpm);
		releq = vpmflp->vpm_releq;

		vpmapf = releq->vpmq_free;
		if (vpmapf == NULL) {
			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		} else {
			vpm->vpm_next = vpmapf;
			vpm->vpm_prev = vpmapf->vpm_prev;
			vpmapf->vpm_prev = vpm;
			vpm->vpm_prev->vpm_next = vpm;
			releq->vpmq_free = vpm->vpm_next;
		}

		/*
		 * Indicate that the vpmap is on the releq at start
		 */
		vpm->vpm_ndxflg = VPMRELEQ;
	}
}
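
/*
 * A worked example of the sizing above (illustrative numbers): with
 * vpm_cache_percent == 12 and physmem == 1048576 pages (4GB at 4K
 * pages),
 *
 *	vpm_cache_size = mmu_ptob(1048576 * 12 / 100)	about 490MB
 *
 * which is then clamped to [VPMAP_MINCACHE, VPMAP_MAXCACHE] and backed
 * by mmu_btop(vpm_cache_size) preallocated vpmap structs.
 */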


/*
 * unhooks vpm from the freelist if it is still on the freelist.
 */
#define	VPMAP_RMFREELIST(vpm) \
	{ \
		if (vpm->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != vpm) { \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} else if (vpm == vpm->vpm_next) { \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = vpm->vpm_next; \
				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			vpm->vpm_next = vpm->vpm_prev = NULL; \
		} \
	}

static int
get_freelndx(int mode)
{
	int ndx;

	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
	switch (mode) {

	case	VPMCACHE_LRU:
	default:
			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
			break;
	}
	return (ndx);
}


/*
 * Find one vpmap structure from the free lists and use it for the newpage.
 * The previous page it cached is dissociated and released. The page_t's
 * p_vpmref is cleared only when the vpm it is pointing to is locked (or,
 * for AMD64, when the page is exclusively locked in page_unload; that is
 * because the p_vpmref is treated as a mapping).
 *
 * The page's p_vpmref is set when the page is
 * locked (at least SHARED locked).
 */
static struct vpmap *
get_free_vpmap(page_t *newpage)
{
	struct vpmfree *vpmflp;
	kmutex_t *vmtx;
	struct vpmap *vpm, *first;
	union vpm_freeq *allocq, *releq;
	page_t *pp = NULL;
	int end_ndx, page_locked = 0;
	int free_ndx;

	/*
	 * get the freelist bin index.
	 */
	free_ndx = get_freelndx(vpm_cachemode);

	end_ndx = free_ndx;
	vpmflp = &vpmd_free[free_ndx];

retry_queue:
	allocq = vpmflp->vpm_allocq;
	mutex_enter(&allocq->vpmq_mtx);

	if ((vpm = allocq->vpmq_free) == NULL) {

skip_queue:
		/*
		 * The alloc list is empty or this queue is being skipped;
		 * first see if the allocq toggled.
		 */
		if (vpmflp->vpm_allocq != allocq) {
			/* queue changed */
			mutex_exit(&allocq->vpmq_mtx);
			goto retry_queue;
		}
		releq = vpmflp->vpm_releq;
		if (!mutex_tryenter(&releq->vpmq_mtx)) {
			/* cannot get releq; a free vpmap may be there now */
			mutex_exit(&allocq->vpmq_mtx);

			/*
			 * This loop could spin forever if this thread has
			 * higher priority than the thread that is holding
			 * releq->vpmq_mtx. In order to force the other thread
			 * to run, we'll lock/unlock the mutex which is safe
			 * since we just unlocked the allocq mutex.
			 */
			mutex_enter(&releq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			goto retry_queue;
		}
		if (releq->vpmq_free == NULL) {
			VPM_DEBUG(vpmd_emptyfreelist);
			/*
			 * This freelist is empty.
			 * This should not happen unless clients
			 * are failing to release the vpmap after
			 * accessing the data. Before resorting
			 * to sleeping, try the next list of the same color.
			 */
			free_ndx = (free_ndx + 1) & vpmd_freemsk;
			if (free_ndx != end_ndx) {
				mutex_exit(&releq->vpmq_mtx);
				mutex_exit(&allocq->vpmq_mtx);
				vpmflp = &vpmd_free[free_ndx];
				goto retry_queue;
			}
			/*
			 * Tried all freelists.
			 * wait on this list and hope something gets freed.
			 */
			vpmflp->vpm_want++;
			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
			cv_wait(&vpmflp->vpm_free_cv,
			    &vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp->vpm_want--;
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
			vpmflp = &vpmd_free[free_ndx];
			VPM_DEBUG(vpmd_nofreevpms);
			goto retry_queue;
		} else {
			/*
			 * Something on the rele queue; flip the alloc
			 * and rele queues and retry.
			 */
			vpmflp->vpm_allocq = releq;
			vpmflp->vpm_releq = allocq;
			mutex_exit(&allocq->vpmq_mtx);
			mutex_exit(&releq->vpmq_mtx);
			if (page_locked) {
				delay(hz >> 2);
				page_locked = 0;
			}
			goto retry_queue;
		}
	} else {
		int gotnewvpm;
		kmutex_t *pmtx;
		uint_t vpmref;

		/*
		 * Fastpath the case we get the vpmap mutex
		 * on the first try.
		 */
		first = vpm;
next_vpmap:
		vmtx = VPMAPMTX(vpm);
		if (!mutex_tryenter(vmtx)) {
			/*
			 * Another thread is trying to reclaim this slot.
			 * Skip to the next queue or vpmap.
			 */
			if ((vpm = vpm->vpm_next) == first) {
				goto skip_queue;
			} else {
				goto next_vpmap;
			}
		}

		/*
		 * Assign this vpm to the newpage.
		 */
		pmtx = PPMTX(newpage);
		gotnewvpm = 0;
		mutex_enter(pmtx);

		/*
		 * Check if some other thread already assigned a vpm to
		 * this page.
		 */
		if ((vpmref = newpage->p_vpmref) == 0) {
			newpage->p_vpmref = VPMID(vpm);
			gotnewvpm = 1;
		} else {
			VPM_DEBUG(vpmd_contend);
			mutex_exit(vmtx);
		}
		mutex_exit(pmtx);

		if (gotnewvpm) {

			/*
			 * At this point, we've selected the vpm. Remove vpm
			 * from its freelist. If vpm is the first one in
			 * the freelist, update the head of the freelist.
			 */
			if (first == vpm) {
				ASSERT(first == allocq->vpmq_free);
				allocq->vpmq_free = vpm->vpm_next;
			}

			/*
			 * If the head of the freelist still points to vpm,
			 * then there are no more free vpmaps in that list.
			 */
			if (allocq->vpmq_free == vpm)
				/*
				 * Took the last one
				 */
				allocq->vpmq_free = NULL;
			else {
				vpm->vpm_prev->vpm_next = vpm->vpm_next;
				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
			}
			mutex_exit(&allocq->vpmq_mtx);
			vpm->vpm_prev = vpm->vpm_next = NULL;

			/*
			 * Disassociate the previous page.
			 * p_vpmref is used as a mapping reference to the page.
			 */
			if ((pp = vpm->vpm_pp) != NULL &&
			    vpm->vpm_vp == pp->p_vnode &&
			    vpm->vpm_off == pp->p_offset) {

				pmtx = PPMTX(pp);
				if (page_trylock(pp, SE_SHARED)) {
					/*
					 * Now verify that it is the correct
					 * page. If not, someone else stole it,
					 * so just unlock it and leave.
					 */
					mutex_enter(pmtx);
					if (PP_ISFREE(pp) ||
					    vpm->vpm_vp != pp->p_vnode ||
					    vpm->vpm_off != pp->p_offset ||
					    pp->p_vpmref != VPMID(vpm)) {
						mutex_exit(pmtx);

						page_unlock(pp);
					} else {
						/*
						 * Release the page.
						 */
						pp->p_vpmref = 0;
						mutex_exit(pmtx);
						(void) page_release(pp, 1);
					}
				} else {
					/*
					 * If the page cannot be locked, just
					 * clear the p_vpmref and go.
					 */
					mutex_enter(pmtx);
					if (pp->p_vpmref == VPMID(vpm)) {
						pp->p_vpmref = 0;
					}
					mutex_exit(pmtx);
					VPM_DEBUG(vpmd_prevpagelocked);
				}
			}

			/*
			 * Setup vpm to point to the new page.
			 */
			vpm->vpm_pp = newpage;
			vpm->vpm_vp = newpage->p_vnode;
			vpm->vpm_off = newpage->p_offset;

		} else {
			int steal = !VPM_MTBF(steals, steals_mtbf);
			/*
			 * Page already has a vpm assigned, just use that.
			 * Grab the vpm mutex and verify that it is still
			 * the correct one. The pp->p_vpmref should not change
			 * once we have the vpm mutex and the page lock.
			 */
			mutex_exit(&allocq->vpmq_mtx);
			vpm = VPMP(vpmref);
			vmtx = VPMAPMTX(vpm);
			mutex_enter(vmtx);
			if ((steal && vpm->vpm_refcnt == 0) ||
			    vpm->vpm_pp != newpage) {
				/*
				 * The vpm got stolen, retry.
				 * clear the p_vpmref.
				 */
				pmtx = PPMTX(newpage);
				mutex_enter(pmtx);
				if (newpage->p_vpmref == vpmref) {
					newpage->p_vpmref = 0;
				}
				mutex_exit(pmtx);

				mutex_exit(vmtx);
				VPM_DEBUG(vpmd_steals);
				goto retry_queue;
			} else if (vpm->vpm_refcnt == 0) {
				/*
				 * Remove it from the free list if it
				 * exists there.
				 */
				VPMAP_RMFREELIST(vpm);
			}
		}
		return (vpm);
	}
}

static void
free_vpmap(struct vpmap *vpm)
{
	struct vpmfree *vpmflp;
	struct vpmap *vpmfreelist;
	union vpm_freeq *releq;

	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));

	if (vpm->vpm_refcnt != 0) {
		panic("free_vpmap");
		/*NOTREACHED*/
	}

	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
	/*
	 * Add to the tail of the release queue
	 * Note that vpm_releq and vpm_allocq could toggle
	 * before we get the lock. This does not affect
	 * correctness as the 2 queues are only maintained
	 * to reduce lock pressure.
	 */
	releq = vpmflp->vpm_releq;
	if (releq == &vpmflp->vpm_freeq[0]) {
		vpm->vpm_ndxflg = 0;
	} else {
		vpm->vpm_ndxflg = 1;
	}
	mutex_enter(&releq->vpmq_mtx);
	vpmfreelist = releq->vpmq_free;
	if (vpmfreelist == 0) {
		int want;

		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
		/*
		 * Both queue mutexes are held to set vpm_want;
		 * snapshot the value before dropping releq mutex.
		 * If vpm_want appears after the releq mutex is dropped,
		 * then the vpmap just freed is already gone.
		 */
		want = vpmflp->vpm_want;
		mutex_exit(&releq->vpmq_mtx);
		/*
		 * See if there was a waiter before dropping the releq mutex
		 * then recheck after obtaining vpm_freeq[0] mutex as
		 * another thread may have already signaled.
		 */
		if (want) {
			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
			if (vpmflp->vpm_want)
				cv_signal(&vpmflp->vpm_free_cv);
			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
		}
	} else {
		vpm->vpm_next = vpmfreelist;
		vpm->vpm_prev = vpmfreelist->vpm_prev;
		vpmfreelist->vpm_prev = vpm;
		vpm->vpm_prev->vpm_next = vpm;
		mutex_exit(&releq->vpmq_mtx);
	}
}

/*
 * Get the vpmap for the page.
 * The refcnt of this vpm is incremented.
 */
static struct vpmap *
get_vpmap(page_t *pp)
{
	struct vpmap *vpm = NULL;
	kmutex_t *vmtx;
	kmutex_t *pmtx;
	unsigned int refid;

	ASSERT((pp != NULL) && PAGE_LOCKED(pp));

	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
		vpm = VPMP(refid);
		vmtx = VPMAPMTX(vpm);
		mutex_enter(vmtx);
		/*
		 * Since we have the page lock and the vpm mutex, the
		 * pp->p_vpmref cannot change.
		 */
		if (vpm->vpm_pp != pp) {
			pmtx = PPMTX(pp);

			/*
			 * Clear the p_vpmref as it is incorrect.
			 * This can happen if the page was stolen.
			 * On x64 this should not happen as p_vpmref
			 * is treated as a mapping on the page. So
			 * if the page is stolen, the mapping would have
			 * been cleared in page_unload().
			 */
			mutex_enter(pmtx);
			if (pp->p_vpmref == refid)
				pp->p_vpmref = 0;
			mutex_exit(pmtx);

			mutex_exit(vmtx);
			vpm = NULL;
		} else if (vpm->vpm_refcnt == 0) {
			/*
			 * Got the vpm, remove it from the free
			 * list if it exists there.
			 */
			VPMAP_RMFREELIST(vpm);
		}
	}
	if (vpm == NULL) {
		/*
		 * get_free_vpmap() returns with the vpmap mutex held.
		 */
		vpm = get_free_vpmap(pp);
		vmtx = VPMAPMTX(vpm);
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
	} else {
		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
	}

	vpm->vpm_refcnt++;
	mutex_exit(vmtx);

	return (vpm);
}

/* END --- vpm cache ---- */

/*
 * The vnode page mapping (vpm) interface routines.
 */
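
/*
 * A hedged sketch of how a caller might drive these routines directly
 * (the local variables are illustrative; vpm_data_copy() below wraps
 * essentially this sequence):
 *
 *	vmap_t vml[MINVMAPS];
 *
 *	error = vpm_map_pages(vp, off, len, 1, vml, MINVMAPS,
 *	    NULL, S_READ);
 *	if (error == 0) {
 *		error = uiomove(vml[0].vs_addr + (off & PAGEOFFSET),
 *		    n, UIO_READ, uio);
 *		vpm_unmap_pages(vml, S_READ);
 *	}
 */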

/*
 * Find or create the pages starting from baseoff for the specified
 * length 'len'.
 */
static int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{

	page_t *pp = NULL;
	caddr_t base;
	u_offset_t off = baseoff;
	int i;
	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);

	for (i = 0; len > 0; len -= PAGESIZE, i++) {
		struct vpmap *vpm;


		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {

			base = segkpm_create_va(off);

			/*
			 * the seg pointer passed in is just advisory. Just
			 * pass segkmap for now like segmap does with
			 * segmap_kpm enabled.
			 */
			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
			    segkmap, base)) == NULL) {
				panic("segmap_pagecreate_vpm: "
				    "page_create failed");
				/*NOTREACHED*/
			}
			if (newpage != NULL)
				*newpage = 1;

			page_io_unlock(pp);
		}

		/*
		 * Get the vpm for this page_t.
		 */
		if (vpm_cache_enable) {
			vpm = get_vpmap(pp);
			vml[i].vs_data = (void *)&vpm->vpm_pp;
		} else {
			vml[i].vs_data = (void *)pp;
			pp->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
		vml[i].vs_len = PAGESIZE;

		off += PAGESIZE;
	}
	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;
	return (0);
}


/*
 * Returns vpm mappings of pages in the range [off, off+len], where
 * len is rounded up to the PAGESIZE boundary. The list of pages and
 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
 * The nseg is the number of vmap_t entries in the array.
 *
 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
 * For such cases, use the seg_map interfaces.
 */
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t *vml,
	int nseg,
	int  *newpage,
	enum seg_rw rw)
{
	extern struct vnode *common_specvp();
	u_offset_t baseoff;
	uint_t prot;
	caddr_t base;
	page_t *pp, *pplist[MAXVMAPS];
	struct vpmap *vpm;
	int i, error = 0;
	size_t tlen;

	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
	baseoff = off & (offset_t)PAGEMASK;
	vml[0].vs_data = NULL;
	vml[0].vs_addr = (caddr_t)NULL;

	tlen = P2ROUNDUP(off + len, PAGESIZE) - baseoff;
	/*
	 * Restrict it to VPMMAXLEN.
	 */
	if (tlen > (VPMMAXPGS * PAGESIZE)) {
		tlen = VPMMAXPGS * PAGESIZE;
	}
	/*
	 * Ensure length fits within the vml[] array. One element of
	 * the array is used to mark the end of the scatter/gather list
	 * of valid mappings by setting its vs_addr = NULL. Leave space
	 * for this element.
	 */
	if (tlen > ((nseg - 1) * PAGESIZE)) {
		tlen = ((nseg - 1) * PAGESIZE);
	}
	len = tlen;

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);


	if (!fetchpage)
		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));

	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {

		pp = page_lookup(vp, baseoff, SE_SHARED);

		/*
		 * If we did not find the page or if this page was not
		 * in the vpm cache (p_vpmref == 0), then let VOP_GETPAGE get
		 * all the pages.
		 * We need to call VOP_GETPAGE so that filesystems can do some
		 * (un)necessary tracking for sequential access.
		 */

		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
		    (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
		    != (P_MOD | P_REF))) {
			int j;
			if (pp != NULL) {
				page_unlock(pp);
			}
			/*
			 * If we did not find the desired set of pages
			 * in the page cache, just call VOP_GETPAGE to get
			 * all the pages.
			 */
			for (j = 0; j < i; j++) {
				page_unlock(pplist[j]);
			}


			baseoff = off & (offset_t)PAGEMASK;
			/*
			 * Pass a dummy address as it will be required
			 * by page_create_va(). We pass segkmap as the seg
			 * as some file systems (UFS) check it.
			 */
			base = segkpm_create_va(baseoff);

			error = VOP_GETPAGE(vp, baseoff, tlen, &prot, pplist,
			    tlen, segkmap, base, rw, CRED(), NULL);
			if (error) {
				VPM_DEBUG(vpmd_getpagefailed);
				pplist[0] = NULL;
			}
			break;
		} else {
			pplist[i] = pp;
			baseoff += PAGESIZE;
		}
	}

	if (error) {
		for (i = 0; pplist[i] != NULL; i++) {
			page_unlock(pplist[i]);
			pplist[i] = NULL;
		}
		vml[0].vs_addr = NULL;
		vml[0].vs_data = NULL;
		return (error);
	}

	/*
	 * Get the vpm's for pages.
	 */
	for (i = 0; pplist[i] != NULL; i++) {
		if (vpm_cache_enable) {
			vpm = get_vpmap(pplist[i]);
			vml[i].vs_data = (void *)&(vpm->vpm_pp);
		} else {
			vml[i].vs_data = (void *)pplist[i];
			pplist[i]->p_vpmref = 0;
		}

		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
		vml[i].vs_len = PAGESIZE;
	}

	vml[i].vs_data = NULL;
	vml[i].vs_addr = (caddr_t)NULL;

	return (0);
}

/*
 * Release the vpm mappings on the pages and unlock them.
 */
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
	int i;
	struct vpmap *vpm;
	kmutex_t *mtx;
	page_t *pp;

	for (i = 0; vml[i].vs_data != NULL; i++) {
		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));

		if (vpm_cache_enable) {
			pp = *(((page_t **)vml[i].vs_data));
		} else {
			pp = (page_t *)vml[i].vs_data;
		}

		/*
		 * Mark page as being modified or referenced, because vpm pages
		 * would not cause faults where it would be set normally.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			ASSERT(rw == S_READ);
			hat_setref(pp);
		}

		if (vpm_cache_enable) {
			vpm = (struct vpmap *)((char *)vml[i].vs_data
			    - offsetof(struct vpmap, vpm_pp));
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			page_unlock(pp);
			mtx = VPMAPMTX(vpm);
			mutex_enter(mtx);

			if (--vpm->vpm_refcnt == 0) {
				free_vpmap(vpm);
			}
			mutex_exit(mtx);
		} else {
			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
			(void) page_release(pp, 1);
		}
		vml[i].vs_data = NULL;
		vml[i].vs_addr = NULL;
	}
}
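
/*
 * A note on the container_of-style recovery above (an illustration):
 * when the cache is enabled, vs_data points at the vpm_pp field inside
 * a struct vpmap, so
 *
 *	vpm = (struct vpmap *)((char *)vs_data -
 *	    offsetof(struct vpmap, vpm_pp));
 *
 * recovers the enclosing vpmap without any hash lookup.
 */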

/*
 * Given the vp, off and the uio structure, this routine will do the
 * copy (uiomove). If the last page created is partially written,
 * the rest of the page is zeroed out. It also zeros the beginning of
 * the first page till the start offset if requested (zerostart).
 * If pages are to be fetched, it will call the filesystem's getpage
 * function (VOP_GETPAGE) to get them, otherwise they will be created if
 * not already present in the page cache.
 */
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	int error;
	struct vmap vml[MINVMAPS];
	enum uio_rw uiorw;
	int npages = 0;

	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
	/*
	 * 'off' will be the offset where the I/O starts.
	 * We get the pages starting at the (off & PAGEMASK)
	 * page boundary.
	 */
	error = vpm_map_pages(vp, off, (uint_t)len,
	    fetchpage, vml, MINVMAPS, &npages, rw);

	if (newpage != NULL)
		*newpage = npages;
	if (!error) {
		int i, pn, slen = len;
		int pon = off & PAGEOFFSET;

		/*
		 * Clear from the beginning of the page to start offset
		 * if requested.
		 */
		if (!fetchpage && zerostart) {
			(void) kzero(vml[0].vs_addr, (uint_t)pon);
			VPM_DEBUG(vpmd_zerostart);
		}

		for (i = 0; !error && slen > 0 &&
		    vml[i].vs_addr != NULL; i++) {
			pn = (int)MIN(slen, (PAGESIZE - pon));
			error = uiomove(vml[i].vs_addr + pon,
			    (long)pn, uiorw, uio);
			slen -= pn;
			pon = 0;
		}

		/*
		 * When new pages are created, zero out part of the
		 * page we did not copy to.
		 */
		if (!fetchpage && npages &&
		    uio->uio_loffset < roundup(off + len, PAGESIZE)) {
			int nzero;

			pon = (uio->uio_loffset & PAGEOFFSET);
			nzero = PAGESIZE - pon;
			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
		}
		vpm_unmap_pages(vml, rw);
	}
	return (error);
}

/*
 * Called to flush pages of the given vnode covering the
 * [off, off+len] range.
 */
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	extern struct vnode *common_specvp();
	int bflags = 0;
	int error = 0;
	size_t psize = roundup(len, PAGESIZE);

	/*
	 * If this is a block device we have to be sure to use the
	 * "common" block device vnode for the mapping.
	 */
	if (vp->v_type == VBLK)
		vp = common_specvp(vp);

	if ((flags & ~SM_DONTNEED) != 0) {
		if (flags & SM_ASYNC)
			bflags |= B_ASYNC;
		if (flags & SM_INVAL)
			bflags |= B_INVAL;
		if (flags & SM_DESTROY)
			bflags |= (B_INVAL|B_TRUNC);
		if (flags & SM_FREE)
			bflags |= B_FREE;
		if (flags & SM_DONTNEED)
			bflags |= B_DONTNEED;

		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
	}

	return (error);
}
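
/*
 * For example (a sketch, not a call made in this file), flushing and
 * freeing the cached pages of a range would be:
 *
 *	(void) vpm_sync_pages(vp, off, len, SM_FREE);
 *
 * which turns into VOP_PUTPAGE(vp, off, roundup(len, PAGESIZE),
 * B_FREE, CRED(), NULL) per the flag translation above.
 */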


#else	/* SEGKPM_SUPPORT */

/* vpm stubs */
void
vpm_init()
{
}

/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}

/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	return (0);
}
#endif	/* SEGKPM_SUPPORT */