xref: /titanic_50/usr/src/uts/common/vm/vpm.c (revision 986fd29a0dc13f7608ef7f508f6e700bd7bc2720)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * VM - generic vnode page mapping interfaces.
30  *
31  * Mechanism to provide temporary mappings to vnode pages.
32  * The typical use would be to copy/access file data.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/t_lock.h>
37 #include <sys/param.h>
38 #include <sys/sysmacros.h>
39 #include <sys/buf.h>
40 #include <sys/systm.h>
41 #include <sys/vnode.h>
42 #include <sys/mman.h>
43 #include <sys/errno.h>
44 #include <sys/cred.h>
45 #include <sys/kmem.h>
46 #include <sys/vtrace.h>
47 #include <sys/cmn_err.h>
48 #include <sys/debug.h>
49 #include <sys/thread.h>
50 #include <sys/dumphdr.h>
51 #include <sys/bitmap.h>
52 #include <sys/lgrp.h>
53 
54 #include <vm/seg_kmem.h>
55 #include <vm/hat.h>
56 #include <vm/as.h>
57 #include <vm/seg.h>
58 #include <vm/seg_kpm.h>
59 #include <vm/seg_map.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <vm/rm.h>
63 #include <vm/vpm.h>
64 
/*
 * Global switch for the vpm interfaces; it defaults to off (0).
 * Needs to be enabled by each platform.
 */
int vpm_enable = 0;
69 
70 #ifdef	SEGKPM_SUPPORT
71 
72 
/*
 * Cache tunables. All of them are consumed by vpm_init().
 */
int	vpm_cache_enable = 1;	/* zero makes vpm_init() skip building the cache */
long	vpm_cache_percent = 12;	/* percentage of physmem used to size the cache */
long	vpm_cache_size;		/* cache size in bytes, computed by vpm_init() */
int	vpm_nfreelist = 0;	/* number of freelists; 0 selects max_ncpus */
int	vpmd_freemsk = 0;	/* vpm_nfreelist - 1, used as a freelist index mask */
78 
/*
 * Per-CPU state: the freelist rotor used by get_freelndx() and the cache
 * hit/miss counters maintained by get_vpmap(). Each entry is padded out to
 * VPM_S_PAD bytes (presumably so that every CPU's entry sits on its own
 * cache line -- TODO confirm against the platform's line size).
 */
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		int	vcpu_free_ndx;
		ulong_t	vcpu_hits;
		ulong_t vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
/* Array of max_ncpus entries, allocated in vpm_init(); indexed by cpu_seqid. */
static union vpm_cpu	*vpmd_cpu;

/* Shorthand for the rotor field inside union vpm_cpu. */
#define	vfree_ndx	vcpu.vcpu_free_ndx

/* Cache replacement mode handed to get_freelndx(). */
int	vpm_cachemode = VPMCACHE_LRU;
93 
/* Per-page mutex protecting p_vpmref. */
#define	PPMTX(pp) (&(pp)->p_ilock)

static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
static struct vpmfree *vpmd_free;	/* array of vpm_nfreelist freelists */

/*
 * Helpers for moving between a vpmap, its mutex, its freelist and its id.
 * A vpmap id is the 1-based index into vpmd_vpmap[]; id 0 means "no vpmap".
 * Every macro argument is parenthesized so that an arbitrary expression
 * (pointer arithmetic, conditional, ...) can be passed safely.
 */
#define	VPMAPMTX(vpm)	(&(vpm)->vpm_mtx)
#define	VPMAP2VMF(vpm)	(&vpmd_free[((vpm) - vpmd_vpmap) & vpmd_freemsk])
#define	VPMAP2VMF_NDX(vpm)	\
	((ushort_t)(((vpm) - vpmd_vpmap) & vpmd_freemsk))
#define	VPMP(id)	(&vpmd_vpmap[(id) - 1])
#define	VPMID(vpm)	((uint_t)(((vpm) - vpmd_vpmap) + 1))
103 
104 
#ifdef	DEBUG

/*
 * Event counters, bumped through VPM_DEBUG(). They exist only in DEBUG
 * builds; in non-DEBUG builds VPM_DEBUG() expands to nothing.
 */
struct	vpm_debug {
	int vpmd_steals;		/* get_free_vpmap: existing vpm unusable */
	int vpmd_contend;		/* get_free_vpmap: page already had a vpm */
	int vpmd_prevpagelocked;	/* get_free_vpmap: old page not lockable */
	int vpmd_getpagefailed;		/* vpm_map_pages: VOP_GETPAGE error */
	int vpmd_zerostart;		/* vpm_data_copy: zeroed head of page */
	int vpmd_emptyfreelist;		/* get_free_vpmap: a freelist was empty */
	int vpmd_nofreevpms;		/* get_free_vpmap: slept for a free vpm */
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

/*
 * Counters and masks for VPM_MTBF(). They are used by get_free_vpmap()
 * (steals) and get_vpmap() (contend).
 */
int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

/*
 * Increments 'v' and evaluates to 0 whenever all bits of mask 'f' are set
 * in the new value, otherwise to 1. The non-DEBUG variant is always 1 and
 * does not evaluate its arguments.
 */
#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif
132 
133 /*
134  * The vpm cache.
135  *
136  * The main purpose of having a cache here is to speed up page_lookup()
137  * operations and also provide an LRU(default) behaviour of file pages. The
138  * page_lookup() operation tends to be expensive if a page has to be
139  * reclaimed from the system page cache("cachelist"). Once we speed up the
 * page_lookup()->page_reclaim() path then there should be no need for
141  * this cache. The system page cache(cachelist) should effectively serve the
142  * purpose of caching file pages.
143  *
144  * This cache is very similar to segmap's smap cache. Each page in the
145  * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
146  * hash table. The page_t has a reference to the vpmap_t when cached. For a
147  * given vnode, offset the page is found by means of a page_lookup() operation.
148  * Any page which has a mapping(i.e when cached) will not be in the
149  * system 'cachelist'. Hence the page_lookup() will not have to do a
150  * page_reclaim(). That is how the cache serves to speed up page_lookup()
151  * operations.
152  *
153  * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
154  */
155 
156 void
157 vpm_init()
158 {
159 	long  npages;
160 	struct vpmap *vpm;
161 	struct vpmfree *vpmflp;
162 	int i, ndx;
163 	extern void prefetch_smap_w(void *);
164 
165 	if (!vpm_cache_enable) {
166 		return;
167 	}
168 
169 	/*
170 	 * Set the size of the cache.
171 	 */
172 	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
173 	if (vpm_cache_size < VPMAP_MINCACHE) {
174 		vpm_cache_size = VPMAP_MINCACHE;
175 	}
176 
177 	/*
178 	 * Number of freelists.
179 	 */
180 	if (vpm_nfreelist == 0) {
181 		vpm_nfreelist = max_ncpus;
182 	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
183 		cmn_err(CE_WARN, "vpmap create : number of freelist "
184 		"vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus);
185 		vpm_nfreelist = 2 * max_ncpus;
186 	}
187 
188 	/*
189 	 * Round it up to the next power of 2
190 	 */
191 	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
192 		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
193 	}
194 	vpmd_freemsk = vpm_nfreelist - 1;
195 
196 	/*
197 	 * Use a per cpu rotor index to spread the allocations evenly
198 	 * across the available vpm freelists.
199 	 */
200 	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
201 	ndx = 0;
202 	for (i = 0; i < max_ncpus; i++) {
203 
204 		vpmd_cpu[i].vfree_ndx = ndx;
205 		ndx = (ndx + 1) & vpmd_freemsk;
206 	}
207 
208 	/*
209 	 * Allocate and initialize the freelist.
210 	 */
211 	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
212 				KM_SLEEP);
213 	for (i = 0; i < vpm_nfreelist; i++) {
214 
215 		vpmflp = &vpmd_free[i];
216 		/*
217 		 * Set up initial queue pointers. They will get flipped
218 		 * back and forth.
219 		 */
220 		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
221 		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
222 	}
223 
224 	npages = mmu_btop(vpm_cache_size);
225 
226 
227 	/*
228 	 * Allocate and initialize the vpmap structs.
229 	 */
230 	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
231 	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
232 		struct vpmfree *vpmflp;
233 		union vpm_freeq *releq;
234 		struct vpmap *vpmapf;
235 
236 		/*
237 		 * Use prefetch as we have to walk thru a large number of
238 		 * these data structures. We just use the smap's prefetch
239 		 * routine as it does the same. This should work fine
240 		 * for x64(this needs to be modified when enabled on sparc).
241 		 */
242 		prefetch_smap_w((void *)vpm);
243 
244 		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
245 
246 		vpmflp = VPMAP2VMF(vpm);
247 		releq = vpmflp->vpm_releq;
248 
249 		vpmapf = releq->vpmq_free;
250 		if (vpmapf == NULL) {
251 			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
252 		} else {
253 			vpm->vpm_next = vpmapf;
254 			vpm->vpm_prev = vpmapf->vpm_prev;
255 			vpmapf->vpm_prev = vpm;
256 			vpm->vpm_prev->vpm_next = vpm;
257 			releq->vpmq_free = vpm->vpm_next;
258 		}
259 
260 		/*
261 		 * Indicate that the vpmap is on the releq at start
262 		 */
263 		vpm->vpm_ndxflg = VPMRELEQ;
264 	}
265 }
266 
267 
/*
 * Unhooks vpm from the freelist if it is still on the freelist.
 *
 * A non-NULL vpm_next means the vpmap is linked on queue vpm_ndxflg of
 * freelist vpm_free_ndx. The unlink is done under that queue's mutex and the
 * links are cleared afterwards. The argument is evaluated exactly once, and
 * the body is wrapped in do/while so the macro behaves as one statement.
 */
#define	VPMAP_RMFREELIST(vpm) \
	do { \
		struct vpmap *rm_vpm = (vpm); \
		if (rm_vpm->vpm_next != NULL) { \
			union vpm_freeq *rm_q; \
			struct vpmfree *rm_flp; \
			rm_flp = &vpmd_free[rm_vpm->vpm_free_ndx]; \
			rm_q = &rm_flp->vpm_freeq[rm_vpm->vpm_ndxflg]; \
			mutex_enter(&rm_q->vpmq_mtx); \
			if (rm_q->vpmq_free != rm_vpm) { \
				/* not the head: plain unlink */ \
				rm_vpm->vpm_prev->vpm_next = rm_vpm->vpm_next; \
				rm_vpm->vpm_next->vpm_prev = rm_vpm->vpm_prev; \
			} else if (rm_vpm == rm_vpm->vpm_next) { \
				/* head and only element: queue is now empty */ \
				rm_q->vpmq_free = NULL; \
			} else { \
				/* head with successors: advance the head */ \
				rm_q->vpmq_free = rm_vpm->vpm_next; \
				rm_vpm->vpm_prev->vpm_next = rm_vpm->vpm_next; \
				rm_vpm->vpm_next->vpm_prev = rm_vpm->vpm_prev; \
			} \
			mutex_exit(&rm_q->vpmq_mtx); \
			rm_vpm->vpm_next = rm_vpm->vpm_prev = NULL; \
		} \
	} while (0)
293 
294 static int
295 get_freelndx(int mode)
296 {
297 	int ndx;
298 
299 	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
300 	switch (mode) {
301 
302 	case	VPMCACHE_LRU:
303 	default:
304 			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
305 			break;
306 	}
307 	return (ndx);
308 }
309 
310 
311 /*
312  * Find one vpmap structure from the free lists and use it for the newpage.
313  * The previous page it cached is dissociated and released. The page_t's
314  * p_vpmref is cleared only when the vpm it is pointing to is locked(or
315  * for AMD64 when the page is exclusively locked in page_unload. That is
316  * because the p_vpmref is treated as mapping).
317  *
318  * The page's p_vpmref is set when the page is
319  * locked(at least SHARED locked).
320  */
321 static struct vpmap *
322 get_free_vpmap(page_t *newpage)
323 {
324 	struct vpmfree *vpmflp;
325 	kmutex_t *vmtx;
326 	struct vpmap *vpm, *first;
327 	union vpm_freeq *allocq, *releq;
328 	page_t *pp = NULL;
329 	int end_ndx, page_locked = 0;
330 	int free_ndx;
331 
332 	/*
333 	 * get the freelist bin index.
334 	 */
335 	free_ndx = get_freelndx(vpm_cachemode);
336 
337 	end_ndx = free_ndx;
338 	vpmflp = &vpmd_free[free_ndx];
339 
340 retry_queue:
341 	allocq = vpmflp->vpm_allocq;
342 	mutex_enter(&allocq->vpmq_mtx);
343 
344 	if ((vpm = allocq->vpmq_free) == NULL) {
345 
346 skip_queue:
347 		/*
348 		 * The alloc list is empty or this queue is being skipped;
349 		 * first see if the allocq toggled.
350 		 */
351 		if (vpmflp->vpm_allocq != allocq) {
352 			/* queue changed */
353 			mutex_exit(&allocq->vpmq_mtx);
354 			goto retry_queue;
355 		}
356 		releq = vpmflp->vpm_releq;
357 		if (!mutex_tryenter(&releq->vpmq_mtx)) {
358 			/* cannot get releq; a free vpmap may be there now */
359 			mutex_exit(&allocq->vpmq_mtx);
360 
361 			/*
362 			 * This loop could spin forever if this thread has
363 			 * higher priority than the thread that is holding
364 			 * releq->vpmq_mtx. In order to force the other thread
365 			 * to run, we'll lock/unlock the mutex which is safe
366 			 * since we just unlocked the allocq mutex.
367 			 */
368 			mutex_enter(&releq->vpmq_mtx);
369 			mutex_exit(&releq->vpmq_mtx);
370 			goto retry_queue;
371 		}
372 		if (releq->vpmq_free == NULL) {
373 			VPM_DEBUG(vpmd_emptyfreelist);
374 			/*
375 			 * This freelist is empty.
376 			 * This should not happen unless clients
377 			 * are failing to release the vpmap after
378 			 * accessing the data. Before resorting
379 			 * to sleeping, try the next list of the same color.
380 			 */
381 			free_ndx = (free_ndx + 1) & vpmd_freemsk;
382 			if (free_ndx != end_ndx) {
383 				mutex_exit(&releq->vpmq_mtx);
384 				mutex_exit(&allocq->vpmq_mtx);
385 				vpmflp = &vpmd_free[free_ndx];
386 				goto retry_queue;
387 			}
388 			/*
389 			 * Tried all freelists.
390 			 * wait on this list and hope something gets freed.
391 			 */
392 			vpmflp->vpm_want++;
393 			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
394 			cv_wait(&vpmflp->vpm_free_cv,
395 				&vpmflp->vpm_freeq[0].vpmq_mtx);
396 			vpmflp->vpm_want--;
397 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
398 			vpmflp = &vpmd_free[free_ndx];
399 			VPM_DEBUG(vpmd_nofreevpms);
400 			goto retry_queue;
401 		} else {
402 			/*
403 			 * Something on the rele queue; flip the alloc
404 			 * and rele queues and retry.
405 			 */
406 			vpmflp->vpm_allocq = releq;
407 			vpmflp->vpm_releq = allocq;
408 			mutex_exit(&allocq->vpmq_mtx);
409 			mutex_exit(&releq->vpmq_mtx);
410 			if (page_locked) {
411 				delay(hz >> 2);
412 				page_locked = 0;
413 			}
414 			goto retry_queue;
415 		}
416 	} else {
417 		int gotnewvpm;
418 		kmutex_t *pmtx;
419 		uint_t vpmref;
420 
421 		/*
422 		 * Fastpath the case we get the vpmap mutex
423 		 * on the first try.
424 		 */
425 		first = vpm;
426 next_vpmap:
427 		vmtx = VPMAPMTX(vpm);
428 		if (!mutex_tryenter(vmtx)) {
429 			/*
430 			 * Another thread is trying to reclaim this slot.
431 			 * Skip to the next queue or vpmap.
432 			 */
433 			if ((vpm = vpm->vpm_next) == first) {
434 				goto skip_queue;
435 			} else {
436 				goto next_vpmap;
437 			}
438 		}
439 
440 		/*
441 		 * Assign this vpm to the newpage.
442 		 */
443 		pmtx = PPMTX(newpage);
444 		gotnewvpm = 0;
445 		mutex_enter(pmtx);
446 
447 		/*
448 		 * Check if some other thread already assigned a vpm to
449 		 * this page.
450 		 */
451 		if ((vpmref = newpage->p_vpmref) == 0) {
452 			newpage->p_vpmref = VPMID(vpm);
453 			gotnewvpm = 1;
454 		} else {
455 			VPM_DEBUG(vpmd_contend);
456 			mutex_exit(vmtx);
457 		}
458 		mutex_exit(pmtx);
459 
460 		if (gotnewvpm) {
461 
462 			/*
463 			 * At this point, we've selected the vpm. Remove vpm
464 			 * from its freelist. If vpm is the first one in
465 			 * the freelist, update the head of the freelist.
466 			 */
467 			if (first == vpm) {
468 				ASSERT(first == allocq->vpmq_free);
469 				allocq->vpmq_free = vpm->vpm_next;
470 			}
471 
472 			/*
473 			 * If the head of the freelist still points to vpm,
474 			 * then there are no more free vpmaps in that list.
475 			 */
476 			if (allocq->vpmq_free == vpm)
477 				/*
478 				 * Took the last one
479 				 */
480 				allocq->vpmq_free = NULL;
481 			else {
482 				vpm->vpm_prev->vpm_next = vpm->vpm_next;
483 				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
484 			}
485 			mutex_exit(&allocq->vpmq_mtx);
486 			vpm->vpm_prev = vpm->vpm_next = NULL;
487 
488 			/*
489 			 * Disassociate the previous page. On x64 systems
490 			 * p_vpmref is used as a mapping reference to the page.
491 			 */
492 			if ((pp = vpm->vpm_pp) != NULL &&
493 				vpm->vpm_vp == pp->p_vnode &&
494 				vpm->vpm_off == pp->p_offset) {
495 
496 				pmtx = PPMTX(pp);
497 				if (page_trylock(pp, SE_SHARED)) {
498 					/*
499 					 * Now verify that it is the correct
500 					 * page. If not someone else stole it,
501 					 * so just unlock it and leave.
502 					 */
503 					mutex_enter(pmtx);
504 					if (PP_ISFREE(pp) ||
505 						vpm->vpm_vp != pp->p_vnode ||
506 						vpm->vpm_off != pp->p_offset ||
507 						pp->p_vpmref != VPMID(vpm)) {
508 						mutex_exit(pmtx);
509 
510 						page_unlock(pp);
511 					} else {
512 						/*
513 						 * Release the page.
514 						 */
515 						pp->p_vpmref = 0;
516 						mutex_exit(pmtx);
517 						hat_kpm_mapout(pp, 0,
518 							hat_kpm_page2va(pp, 1));
519 						(void) page_release(pp, 1);
520 					}
521 				} else {
522 					/*
523 					 * If the page cannot be locked, just
524 					 * clear the p_vpmref and go.
525 					 */
526 					mutex_enter(pmtx);
527 					if (pp->p_vpmref == VPMID(vpm)) {
528 						pp->p_vpmref = 0;
529 					}
530 					mutex_exit(pmtx);
531 					VPM_DEBUG(vpmd_prevpagelocked);
532 				}
533 			}
534 
535 			/*
536 			 * Setup vpm to point to the new page.
537 			 */
538 			vpm->vpm_pp = newpage;
539 			vpm->vpm_vp = newpage->p_vnode;
540 			vpm->vpm_off = newpage->p_offset;
541 
542 		} else {
543 			int steal = !VPM_MTBF(steals, steals_mtbf);
544 			/*
545 			 * Page already has a vpm assigned just use that.
546 			 * Grab the vpm mutex and verify that it is still
547 			 * the correct one. The pp->p_vpmref should not change
548 			 * once we have the vpm mutex and the page lock.
549 			 */
550 			mutex_exit(&allocq->vpmq_mtx);
551 			vpm = VPMP(vpmref);
552 			vmtx = VPMAPMTX(vpm);
553 			mutex_enter(vmtx);
554 			if ((steal && vpm->vpm_refcnt == 0) ||
555 			    vpm->vpm_pp != newpage) {
556 				/*
557 				 * The vpm got stolen, retry.
558 				 * clear the p_vpmref.
559 				 */
560 				pmtx = PPMTX(newpage);
561 				mutex_enter(pmtx);
562 				if (newpage->p_vpmref == vpmref) {
563 					newpage->p_vpmref = 0;
564 				}
565 				mutex_exit(pmtx);
566 
567 				mutex_exit(vmtx);
568 				VPM_DEBUG(vpmd_steals);
569 				goto retry_queue;
570 			} else if (vpm->vpm_refcnt == 0) {
571 				/*
572 				 * Remove it from the free list if it
573 				 * exists there.
574 				 */
575 				VPMAP_RMFREELIST(vpm);
576 			}
577 		}
578 		return (vpm);
579 	}
580 }
581 
582 static void
583 free_vpmap(struct vpmap *vpm)
584 {
585 	struct vpmfree *vpmflp;
586 	struct vpmap *vpmfreelist;
587 	union vpm_freeq *releq;
588 
589 	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
590 
591 	if (vpm->vpm_refcnt != 0) {
592 		panic("free_vpmap");
593 		/*NOTREACHED*/
594 	}
595 
596 	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
597 	/*
598 	 * Add to the tail of the release queue
599 	 * Note that vpm_releq and vpm_allocq could toggle
600 	 * before we get the lock. This does not affect
601 	 * correctness as the 2 queues are only maintained
602 	 * to reduce lock pressure.
603 	 */
604 	releq = vpmflp->vpm_releq;
605 	if (releq == &vpmflp->vpm_freeq[0]) {
606 		vpm->vpm_ndxflg = 0;
607 	} else {
608 		vpm->vpm_ndxflg = 1;
609 	}
610 	mutex_enter(&releq->vpmq_mtx);
611 	vpmfreelist = releq->vpmq_free;
612 	if (vpmfreelist == 0) {
613 		int want;
614 
615 		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
616 		/*
617 		 * Both queue mutexes are held to set vpm_want;
618 		 * snapshot the value before dropping releq mutex.
619 		 * If vpm_want appears after the releq mutex is dropped,
620 		 * then the vpmap just freed is already gone.
621 		 */
622 		want = vpmflp->vpm_want;
623 		mutex_exit(&releq->vpmq_mtx);
624 		/*
625 		 * See if there was a waiter before dropping the releq mutex
626 		 * then recheck after obtaining vpm_freeq[0] mutex as
627 		 * the another thread may have already signaled.
628 		 */
629 		if (want) {
630 			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
631 			if (vpmflp->vpm_want)
632 				cv_signal(&vpmflp->vpm_free_cv);
633 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
634 		}
635 	} else {
636 		vpm->vpm_next = vpmfreelist;
637 		vpm->vpm_prev = vpmfreelist->vpm_prev;
638 		vpmfreelist->vpm_prev = vpm;
639 		vpm->vpm_prev->vpm_next = vpm;
640 		mutex_exit(&releq->vpmq_mtx);
641 	}
642 }
643 
644 /*
645  * Get the vpmap for the page.
646  * The refcnt of this vpm is incremented.
647  */
648 static struct vpmap *
649 get_vpmap(page_t *pp)
650 {
651 	struct vpmap *vpm = NULL;
652 	kmutex_t *vmtx;
653 	kmutex_t *pmtx;
654 	unsigned int refid;
655 
656 	ASSERT((pp != NULL) && PAGE_LOCKED(pp));
657 
658 	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
659 		vpm = VPMP(refid);
660 		vmtx = VPMAPMTX(vpm);
661 		mutex_enter(vmtx);
662 		/*
663 		 * Since we have the page lock and the vpm mutex, the
664 		 * pp->p_vpmref cannot change.
665 		 */
666 		if (vpm->vpm_pp != pp) {
667 			pmtx = PPMTX(pp);
668 
669 			/*
670 			 * Clear the p_vpmref as it is incorrect.
671 			 * This can happen if the page was stolen.
672 			 * On x64 this should not happen as p_vpmref
673 			 * is treated as a mapping on the page. So
674 			 * if the page is stolen, the mapping would have
675 			 * been cleared in page_unload().
676 			 */
677 			mutex_enter(pmtx);
678 			if (pp->p_vpmref == refid)
679 				pp->p_vpmref = 0;
680 			mutex_exit(pmtx);
681 
682 			mutex_exit(vmtx);
683 			vpm = NULL;
684 		} else if (vpm->vpm_refcnt == 0) {
685 			/*
686 			 * Got the vpm, remove it from the free
687 			 * list if it exists there.
688 			 */
689 			VPMAP_RMFREELIST(vpm);
690 		}
691 	}
692 	if (vpm == NULL) {
693 		/*
694 		 * get_free_vpmap() returns with the vpmap mutex held.
695 		 */
696 		vpm = get_free_vpmap(pp);
697 		vmtx = VPMAPMTX(vpm);
698 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
699 	} else {
700 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
701 	}
702 
703 	vpm->vpm_refcnt++;
704 	mutex_exit(vmtx);
705 
706 	return (vpm);
707 }
708 
709 /* END --- vpm cache ---- */
710 
711 /*
712  * The vnode page mapping(vpm) interface routines.
713  */
714 
715 /*
716  * Find or create the pages starting form baseoff for specified
717  * length 'len'.
718  */
719 static int
720 vpm_pagecreate(
721 	struct vnode *vp,
722 	u_offset_t baseoff,
723 	size_t len,
724 	vmap_t vml[],
725 	int nseg,
726 	int *newpage)
727 {
728 
729 	page_t *pp = NULL;
730 	caddr_t base;
731 	u_offset_t off = baseoff;
732 	int i;
733 	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
734 
735 	for (i = 0; len > 0; len -= PAGESIZE, i++) {
736 		struct vpmap *vpm;
737 
738 
739 		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
740 
741 			base = segkpm_create_va(off);
742 
743 			/*
744 			 * the seg pointer passed in is just advisor. Just
745 			 * pass segkmap for now like segmap does with
746 			 * segmap_kpm enabled.
747 			 */
748 			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
749 			    segkmap, base)) == NULL) {
750 				panic("segmap_pagecreate_vpm: "
751 				    "page_create failed");
752 				/*NOTREACHED*/
753 			}
754 			if (newpage != NULL)
755 				*newpage = 1;
756 
757 			page_io_unlock(pp);
758 		}
759 
760 		/*
761 		 * Get the vpm for this page_t.
762 		 */
763 		if (vpm_cache_enable) {
764 			vpm = get_vpmap(pp);
765 			vml[i].vs_data = (void *)&vpm->vpm_pp;
766 		} else {
767 			vml[i].vs_data = (void *)pp;
768 			pp->p_vpmref = 0;
769 		}
770 
771 		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
772 		vml[i].vs_len = PAGESIZE;
773 
774 		off += PAGESIZE;
775 	}
776 	vml[i].vs_data = NULL;
777 	vml[i].vs_addr = (caddr_t)NULL;
778 	return (0);
779 }
780 
781 
782 /*
783  * Returns vpm mappings of pages in the range [off, off+len], where
784  * len is rounded up to the PAGESIZE boundary. The list of pages and
785  * the page addresses are returned in the SGL vml (vmap_t) array passed in.
786  * The nseg is the number of vmap_t entries in the array.
787  *
788  * Currently max len allowed is MAXBSIZE therefore, it will either
789  * fetch/create one or two pages depending on what is the PAGESIZE.
790  *
791  * The segmap's SM_LOCKPROTO  usage is not supported by these interfaces.
792  * For such cases, use the seg_map interfaces.
793  */
794 int
795 vpm_map_pages(
796 	struct vnode *vp,
797 	u_offset_t off,
798 	size_t len,
799 	int fetchpage,
800 	vmap_t *vml,
801 	int nseg,
802 	int  *newpage,
803 	enum seg_rw rw)
804 {
805 	extern struct vnode *common_specvp();
806 	u_offset_t baseoff;
807 	uint_t prot;
808 	caddr_t base;
809 	page_t *pp, *pplist[MAXVMAPS];
810 	struct vpmap *vpm;
811 	int i, error = 0;
812 
813 	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
814 	baseoff = off & (offset_t)PAGEMASK;
815 	vml[0].vs_data = NULL;
816 	vml[0].vs_addr = (caddr_t)NULL;
817 	/*
818 	 * For now, lets restrict it to MAXBSIZE. XXX - We can allow
819 	 * len longer then MAXBSIZE, but there should be a limit
820 	 * which should be determined by how many pages the VOP_GETPAGE()
821 	 * can fetch.
822 	 */
823 	if (off + len > baseoff + MAXBSIZE) {
824 		panic("vpm_map_pages bad len");
825 		/*NOTREACHED*/
826 	}
827 
828 	/*
829 	 * If this is a block device we have to be sure to use the
830 	 * "common" block device vnode for the mapping.
831 	 */
832 	if (vp->v_type == VBLK)
833 		vp = common_specvp(vp);
834 
835 	/*
836 	 * round up len to a multiple of PAGESIZE.
837 	 */
838 	len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK);
839 
840 	if (!fetchpage)
841 		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
842 
843 	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {
844 
845 		pp = page_lookup(vp, baseoff, SE_SHARED);
846 
847 		/*
848 		 * If we did not find the page or if this page was not
849 		 * in our cache, then let VOP_GETPAGE get all the pages.
850 		 * We need to call VOP_GETPAGE so that filesytems can do some
851 		 * (un)necessary tracking for sequential access.
852 		 */
853 
854 		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
855 			(rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
856 							!= (P_MOD | P_REF))) {
857 			if (pp != NULL) {
858 				page_unlock(pp);
859 			}
860 
861 			/*
862 			 * Pass a dummy address as it will be required
863 			 * by page_create_va(). We pass segkmap as the seg
864 			 * as some file systems(UFS) check it.
865 			 */
866 			base = segkpm_create_va(baseoff);
867 
868 			error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
869 			len, segkmap, base, rw, CRED(), NULL);
870 			if (error) {
871 				VPM_DEBUG(vpmd_getpagefailed);
872 				pplist[i] = NULL;
873 			}
874 			break;
875 		} else {
876 			pplist[i] = pp;
877 			baseoff += PAGESIZE;
878 		}
879 	}
880 
881 	if (error) {
882 		for (i = 0; pplist[i] != NULL; i++) {
883 			page_unlock(pplist[i]);
884 			pplist[i] = NULL;
885 		}
886 		vml[0].vs_addr = NULL;
887 		vml[0].vs_data = NULL;
888 		return (error);
889 	}
890 
891 	/*
892 	 * Get the vpm's for pages.
893 	 */
894 	for (i = 0; pplist[i] != NULL; i++) {
895 		if (vpm_cache_enable) {
896 			vpm = get_vpmap(pplist[i]);
897 			vml[i].vs_data = (void *)&(vpm->vpm_pp);
898 		} else {
899 			vml[i].vs_data = (void *)pplist[i];
900 			pplist[i]->p_vpmref = 0;
901 		}
902 
903 		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
904 		vml[i].vs_len = PAGESIZE;
905 	}
906 
907 	vml[i].vs_data = NULL;
908 	vml[i].vs_addr = (caddr_t)NULL;
909 
910 	return (0);
911 }
912 
913 /*
914  * Release the vpm mappings on the pages and unlock them.
915  */
916 void
917 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
918 {
919 	int i;
920 	struct vpmap *vpm;
921 	kmutex_t *mtx;
922 	page_t *pp;
923 
924 	for (i = 0; vml[i].vs_data != NULL; i++) {
925 		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
926 
927 		if (vpm_cache_enable) {
928 			pp = *(((page_t **)vml[i].vs_data));
929 		} else {
930 			pp = (page_t *)vml[i].vs_data;
931 		}
932 
933 		/*
934 		 * Mark page as being modified or referenced, bacause vpm pages
935 		 * would not cause faults where it would be set normally.
936 		 */
937 		if (rw == S_WRITE) {
938 			hat_setrefmod(pp);
939 		} else {
940 			ASSERT(rw == S_READ);
941 			hat_setref(pp);
942 		}
943 
944 		if (vpm_cache_enable) {
945 			page_unlock(pp);
946 			vpm = (struct vpmap *)((char *)vml[i].vs_data
947 					- offsetof(struct vpmap, vpm_pp));
948 			mtx = VPMAPMTX(vpm);
949 			mutex_enter(mtx);
950 
951 			if (--vpm->vpm_refcnt == 0) {
952 				free_vpmap(vpm);
953 			}
954 			mutex_exit(mtx);
955 		} else {
956 			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
957 			(void) page_release(pp, 1);
958 		}
959 		vml[i].vs_data = NULL;
960 		vml[i].vs_addr = NULL;
961 	}
962 }
963 
964 /*
965  * Given the vp, off and the uio structure, this routine will do the
966  * the copy (uiomove). If the last page created is partially written,
967  * the rest of the page is zeroed out. It also zeros the beginning of
968  * the first page till the start offset if requested(zerostart).
969  * If pages are to be fetched, it will call the filesystem's getpage
970  * function (VOP_GETPAGE) to get them, otherwise they will be created if
971  * not already present in the page cache.
972  */
973 int
974 vpm_data_copy(struct vnode *vp,
975 	u_offset_t off,
976 	size_t len,
977 	struct uio *uio,
978 	int fetchpage,
979 	int *newpage,
980 	int zerostart,
981 	enum seg_rw rw)
982 {
983 	int error;
984 	struct vmap vml[MINVMAPS];
985 	enum uio_rw uiorw;
986 	int npages = 0;
987 
988 	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
989 	/*
990 	 * 'off' will be the offset where the I/O starts.
991 	 * We get the pages starting at the (off & PAGEMASK)
992 	 * page boundary.
993 	 */
994 	error = vpm_map_pages(vp, off, (uint_t)len,
995 		fetchpage, vml, MINVMAPS, &npages,  rw);
996 
997 	if (newpage != NULL)
998 		*newpage = npages;
999 	if (!error) {
1000 		int i, pn, slen = len;
1001 		int pon = off & PAGEOFFSET;
1002 
1003 		/*
1004 		 * Clear from the beginning of the page to start offset
1005 		 * if requested.
1006 		 */
1007 		if (!fetchpage && zerostart) {
1008 			(void) kzero(vml[0].vs_addr,  (uint_t)pon);
1009 			VPM_DEBUG(vpmd_zerostart);
1010 		}
1011 
1012 		for (i = 0; !error && slen > 0 &&
1013 				vml[i].vs_addr != NULL; i++) {
1014 			pn = (int)MIN(slen, (PAGESIZE - pon));
1015 			error = uiomove(vml[i].vs_addr + pon,
1016 				    (long)pn, uiorw, uio);
1017 			slen -= pn;
1018 			pon = 0;
1019 		}
1020 
1021 		/*
1022 		 * When new pages are created, zero out part of the
1023 		 * page we did not copy to.
1024 		 */
1025 		if (!fetchpage && npages &&
1026 			uio->uio_loffset < roundup(off + len, PAGESIZE)) {
1027 			int nzero;
1028 
1029 			pon = (uio->uio_loffset & PAGEOFFSET);
1030 			nzero = PAGESIZE  - pon;
1031 			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
1032 			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
1033 		}
1034 		vpm_unmap_pages(vml, rw);
1035 	}
1036 	return (error);
1037 }
1038 
1039 /*
1040  * called to flush pages for the given vnode covering
1041  * [off, off+len] range.
1042  */
1043 int
1044 vpm_sync_pages(struct vnode *vp,
1045 		u_offset_t off,
1046 		size_t len,
1047 		uint_t flags)
1048 {
1049 	extern struct vnode *common_specvp();
1050 	int bflags = 0;
1051 	int error = 0;
1052 	size_t psize = roundup(len, PAGESIZE);
1053 
1054 	/*
1055 	 * If this is a block device we have to be sure to use the
1056 	 * "common" block device vnode for the mapping.
1057 	 */
1058 	if (vp->v_type == VBLK)
1059 		vp = common_specvp(vp);
1060 
1061 	if ((flags & ~SM_DONTNEED) != 0) {
1062 		if (flags & SM_ASYNC)
1063 			bflags |= B_ASYNC;
1064 		if (flags & SM_INVAL)
1065 			bflags |= B_INVAL;
1066 		if (flags & SM_DESTROY)
1067 			bflags |= (B_INVAL|B_TRUNC);
1068 		if (flags & SM_FREE)
1069 			bflags |= B_FREE;
1070 		if (flags & SM_DONTNEED)
1071 			bflags |= B_DONTNEED;
1072 
1073 		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
1074 	}
1075 
1076 	return (error);
1077 }
1078 
1079 
1080 #else	/* SEGKPM_SUPPORT */
1081 
1082 /* vpm stubs */
/*
 * Stub used when SEGKPM_SUPPORT is not defined: there is nothing to set up.
 */
void
vpm_init()
{
}
1087 
/*
 * Stub used when SEGKPM_SUPPORT is not defined: ignores its arguments and
 * returns 0.
 * NOTE(review): the SEGKPM_SUPPORT version of this function is static while
 * this stub has external linkage -- confirm whether that is intended.
 */
/*ARGSUSED*/
int
vpm_pagecreate(
	struct vnode *vp,
	u_offset_t baseoff,
	size_t len,
	vmap_t vml[],
	int nseg,
	int *newpage)
{
	return (0);
}
1100 
/*
 * Stub used when SEGKPM_SUPPORT is not defined: maps nothing, leaves vml[]
 * untouched and returns 0.
 */
/*ARGSUSED*/
int
vpm_map_pages(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	int fetchpage,
	vmap_t vml[],
	int nseg,
	int *newpage,
	enum seg_rw rw)
{
	return (0);
}
1115 
/*
 * Stub used when SEGKPM_SUPPORT is not defined: copies nothing and
 * returns 0.
 */
/*ARGSUSED*/
int
vpm_data_copy(struct vnode *vp,
	u_offset_t off,
	size_t len,
	struct uio *uio,
	int fetchpage,
	int *newpage,
	int zerostart,
	enum seg_rw rw)
{
	return (0);
}
1129 
/*
 * Stub used when SEGKPM_SUPPORT is not defined: there is nothing to unmap.
 */
/*ARGSUSED*/
void
vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
{
}
/*
 * Stub used when SEGKPM_SUPPORT is not defined: flushes nothing and
 * returns 0.
 */
/*ARGSUSED*/
int
vpm_sync_pages(struct vnode *vp,
		u_offset_t off,
		size_t len,
		uint_t flags)
{
	return (0);
}
1144 #endif	/* SEGKPM_SUPPORT */
1145