xref: /titanic_50/usr/src/uts/common/vm/vpm.c (revision f38cb554a534c6df738be3f4d23327e69888e634)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 /*
28  * VM - generic vnode page mapping interfaces.
29  *
30  * Mechanism to provide temporary mappings to vnode pages.
31  * The typical use would be to copy/access file data.
32  */
33 
34 #include <sys/types.h>
35 #include <sys/t_lock.h>
36 #include <sys/param.h>
37 #include <sys/sysmacros.h>
38 #include <sys/buf.h>
39 #include <sys/systm.h>
40 #include <sys/vnode.h>
41 #include <sys/mman.h>
42 #include <sys/errno.h>
43 #include <sys/cred.h>
44 #include <sys/kmem.h>
45 #include <sys/vtrace.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/thread.h>
49 #include <sys/dumphdr.h>
50 #include <sys/bitmap.h>
51 #include <sys/lgrp.h>
52 
53 #include <vm/seg_kmem.h>
54 #include <vm/hat.h>
55 #include <vm/as.h>
56 #include <vm/seg.h>
57 #include <vm/seg_kpm.h>
58 #include <vm/seg_map.h>
59 #include <vm/page.h>
60 #include <vm/pvn.h>
61 #include <vm/rm.h>
62 #include <vm/vpm.h>
63 
64 
/*
 * vpm_enable is the master switch for the vnode page mapping interfaces.
 * Where segkpm is supported it defaults to on and can be disabled by
 * setting vpm_enable = 0 in /etc/system; otherwise it defaults to off.
 */
#ifdef	SEGKPM_SUPPORT
int vpm_enable = 1;
#else
int vpm_enable = 0;
#endif
78 
79 #ifdef	SEGKPM_SUPPORT
80 
81 
82 int	vpm_cache_enable = 1;
83 long	vpm_cache_percent = 12;
84 long	vpm_cache_size;
85 int	vpm_nfreelist = 0;
86 int	vpmd_freemsk = 0;
87 
88 #define	VPM_S_PAD	64
89 union vpm_cpu {
90 	struct {
91 		int	vcpu_free_ndx;
92 		ulong_t	vcpu_hits;
93 		ulong_t vcpu_misses;
94 	} vcpu;
95 	char vpm_pad[VPM_S_PAD];
96 };
97 static union vpm_cpu	*vpmd_cpu;
98 
99 #define	vfree_ndx	vcpu.vcpu_free_ndx
100 
101 int	vpm_cachemode = VPMCACHE_LRU;
102 
103 #define	PPMTX(pp) (&(pp)->p_ilock)
104 
105 static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
106 static struct vpmfree *vpmd_free;
107 #define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
108 #define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
109 #define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
110 #define	VPMP(id)	(&vpmd_vpmap[id - 1])
111 #define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)
112 
113 
#ifdef	DEBUG

/*
 * Event counters kept only on DEBUG kernels; each is bumped through
 * VPM_DEBUG() at the corresponding point in this file.
 */
struct	vpm_debug {
	int vpmd_steals;		/* assigned vpm was stolen, retried */
	int vpmd_contend;		/* page already had a vpm assigned */
	int vpmd_prevpagelocked;	/* previous page could not be locked */
	int vpmd_getpagefailed;		/* VOP_GETPAGE returned an error */
	int vpmd_zerostart;		/* start of first page was zeroed */
	int vpmd_emptyfreelist;		/* a freelist had no free vpmaps */
	int vpmd_nofreevpms;		/* had to sleep for a free vpmap */
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

/*
 * Fault-injection counters and masks for VPM_MTBF().
 */
int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

/*
 * Increments v and evaluates to 0 whenever the incremented value has all
 * of the bits in f set; otherwise evaluates to 1.
 */
#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

/* Non-DEBUG kernels: no fault injection and no event counting. */
#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif
141 
/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also provide an LRU (default) behaviour of file pages. The
 * page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache ("cachelist"). Once we speed up the
 * page_lookup()->page_reclaim() path there should be no need for
 * this cache. The system page cache (cachelist) should effectively serve the
 * purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
 * hash table. The page_t has a reference to the vpmap_t when cached. For a
 * given vnode and offset, the page is found by means of a page_lookup()
 * operation. Any page which has a mapping (i.e. when cached) will not be in
 * the system 'cachelist'. Hence the page_lookup() will not have to do a
 * page_reclaim(). That is how the cache serves to speed up page_lookup()
 * operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
 */
164 
165 void
166 vpm_init()
167 {
168 	long  npages;
169 	struct vpmap *vpm;
170 	struct vpmfree *vpmflp;
171 	int i, ndx;
172 	extern void prefetch_smap_w(void *);
173 
174 	if (!kpm_enable) {
175 		vpm_enable = 0;
176 	}
177 
178 	if (!vpm_enable || !vpm_cache_enable) {
179 		return;
180 	}
181 
182 	/*
183 	 * Set the size of the cache.
184 	 */
185 	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
186 	if (vpm_cache_size < VPMAP_MINCACHE) {
187 		vpm_cache_size = VPMAP_MINCACHE;
188 	}
189 
190 	if (vpm_cache_size > VPMAP_MAXCACHE) {
191 		vpm_cache_size = VPMAP_MAXCACHE;
192 	}
193 
194 	/*
195 	 * Number of freelists.
196 	 */
197 	if (vpm_nfreelist == 0) {
198 		vpm_nfreelist = max_ncpus;
199 	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
200 		cmn_err(CE_WARN, "vpmap create : number of freelist "
201 		"vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus);
202 		vpm_nfreelist = 2 * max_ncpus;
203 	}
204 
205 	/*
206 	 * Round it up to the next power of 2
207 	 */
208 	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
209 		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
210 	}
211 	vpmd_freemsk = vpm_nfreelist - 1;
212 
213 	/*
214 	 * Use a per cpu rotor index to spread the allocations evenly
215 	 * across the available vpm freelists.
216 	 */
217 	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
218 	ndx = 0;
219 	for (i = 0; i < max_ncpus; i++) {
220 
221 		vpmd_cpu[i].vfree_ndx = ndx;
222 		ndx = (ndx + 1) & vpmd_freemsk;
223 	}
224 
225 	/*
226 	 * Allocate and initialize the freelist.
227 	 */
228 	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
229 	    KM_SLEEP);
230 	for (i = 0; i < vpm_nfreelist; i++) {
231 
232 		vpmflp = &vpmd_free[i];
233 		/*
234 		 * Set up initial queue pointers. They will get flipped
235 		 * back and forth.
236 		 */
237 		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
238 		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
239 	}
240 
241 	npages = mmu_btop(vpm_cache_size);
242 
243 
244 	/*
245 	 * Allocate and initialize the vpmap structs. We need to
246 	 * walk the array backwards as the prefetch happens in reverse
247 	 * order.
248 	 */
249 	vpmd_vpmap = kmem_alloc(sizeof (struct vpmap) * npages, KM_SLEEP);
250 	for (vpm = &vpmd_vpmap[npages - 1]; vpm >= vpmd_vpmap; vpm--) {
251 		struct vpmfree *vpmflp;
252 		union vpm_freeq *releq;
253 		struct vpmap *vpmapf;
254 
255 		/*
256 		 * Use prefetch as we have to walk thru a large number of
257 		 * these data structures. We just use the smap's prefetch
258 		 * routine as it does the same.
259 		 */
260 		prefetch_smap_w((void *)vpm);
261 
262 		vpm->vpm_vp = NULL;
263 		vpm->vpm_off = 0;
264 		vpm->vpm_pp = NULL;
265 		vpm->vpm_refcnt = 0;
266 		mutex_init(&vpm->vpm_mtx, NULL, MUTEX_DEFAULT, NULL);
267 		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
268 
269 		vpmflp = VPMAP2VMF(vpm);
270 		releq = vpmflp->vpm_releq;
271 
272 		vpmapf = releq->vpmq_free;
273 		if (vpmapf == NULL) {
274 			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
275 		} else {
276 			vpm->vpm_next = vpmapf;
277 			vpm->vpm_prev = vpmapf->vpm_prev;
278 			vpmapf->vpm_prev = vpm;
279 			vpm->vpm_prev->vpm_next = vpm;
280 			releq->vpmq_free = vpm->vpm_next;
281 		}
282 
283 		/*
284 		 * Indicate that the vpmap is on the releq at start
285 		 */
286 		vpm->vpm_ndxflg = VPMRELEQ;
287 	}
288 }
289 
290 
/*
 * Unhooks vpm from the freelist if it is still on the freelist.
 *
 * A vpmap that is on a freelist has non-NULL vpm_next/vpm_prev; its
 * vpm_free_ndx and vpm_ndxflg identify the freelist and queue it sits on.
 * The queue mutex is taken for the unlink, after which the links are
 * cleared. Wrapped in do { } while (0) so it behaves as one statement.
 */
#define	VPMAP_RMFREELIST(vpm) \
	do { \
		if ((vpm)->vpm_next != NULL) { \
			union vpm_freeq *rm_freeq; \
			struct vpmfree *rm_vpmflp; \
			rm_vpmflp = &vpmd_free[(vpm)->vpm_free_ndx]; \
			rm_freeq = &rm_vpmflp->vpm_freeq[(vpm)->vpm_ndxflg]; \
			mutex_enter(&rm_freeq->vpmq_mtx); \
			if (rm_freeq->vpmq_free != (vpm)) { \
				(vpm)->vpm_prev->vpm_next = (vpm)->vpm_next; \
				(vpm)->vpm_next->vpm_prev = (vpm)->vpm_prev; \
			} else if ((vpm) == (vpm)->vpm_next) { \
				rm_freeq->vpmq_free = NULL; \
			} else { \
				rm_freeq->vpmq_free = (vpm)->vpm_next; \
				(vpm)->vpm_prev->vpm_next = (vpm)->vpm_next; \
				(vpm)->vpm_next->vpm_prev = (vpm)->vpm_prev; \
			} \
			mutex_exit(&rm_freeq->vpmq_mtx); \
			(vpm)->vpm_next = (vpm)->vpm_prev = NULL; \
		} \
	} while (0)
316 
317 static int
318 get_freelndx(int mode)
319 {
320 	int ndx;
321 
322 	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
323 	switch (mode) {
324 
325 	case	VPMCACHE_LRU:
326 	default:
327 			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
328 			break;
329 	}
330 	return (ndx);
331 }
332 
333 
334 /*
335  * Find one vpmap structure from the free lists and use it for the newpage.
336  * The previous page it cached is dissociated and released. The page_t's
337  * p_vpmref is cleared only when the vpm it is pointing to is locked(or
338  * for AMD64 when the page is exclusively locked in page_unload. That is
339  * because the p_vpmref is treated as mapping).
340  *
341  * The page's p_vpmref is set when the page is
342  * locked(at least SHARED locked).
343  */
344 static struct vpmap *
345 get_free_vpmap(page_t *newpage)
346 {
347 	struct vpmfree *vpmflp;
348 	kmutex_t *vmtx;
349 	struct vpmap *vpm, *first;
350 	union vpm_freeq *allocq, *releq;
351 	page_t *pp = NULL;
352 	int end_ndx, page_locked = 0;
353 	int free_ndx;
354 
355 	/*
356 	 * get the freelist bin index.
357 	 */
358 	free_ndx = get_freelndx(vpm_cachemode);
359 
360 	end_ndx = free_ndx;
361 	vpmflp = &vpmd_free[free_ndx];
362 
363 retry_queue:
364 	allocq = vpmflp->vpm_allocq;
365 	mutex_enter(&allocq->vpmq_mtx);
366 
367 	if ((vpm = allocq->vpmq_free) == NULL) {
368 
369 skip_queue:
370 		/*
371 		 * The alloc list is empty or this queue is being skipped;
372 		 * first see if the allocq toggled.
373 		 */
374 		if (vpmflp->vpm_allocq != allocq) {
375 			/* queue changed */
376 			mutex_exit(&allocq->vpmq_mtx);
377 			goto retry_queue;
378 		}
379 		releq = vpmflp->vpm_releq;
380 		if (!mutex_tryenter(&releq->vpmq_mtx)) {
381 			/* cannot get releq; a free vpmap may be there now */
382 			mutex_exit(&allocq->vpmq_mtx);
383 
384 			/*
385 			 * This loop could spin forever if this thread has
386 			 * higher priority than the thread that is holding
387 			 * releq->vpmq_mtx. In order to force the other thread
388 			 * to run, we'll lock/unlock the mutex which is safe
389 			 * since we just unlocked the allocq mutex.
390 			 */
391 			mutex_enter(&releq->vpmq_mtx);
392 			mutex_exit(&releq->vpmq_mtx);
393 			goto retry_queue;
394 		}
395 		if (releq->vpmq_free == NULL) {
396 			VPM_DEBUG(vpmd_emptyfreelist);
397 			/*
398 			 * This freelist is empty.
399 			 * This should not happen unless clients
400 			 * are failing to release the vpmap after
401 			 * accessing the data. Before resorting
402 			 * to sleeping, try the next list of the same color.
403 			 */
404 			free_ndx = (free_ndx + 1) & vpmd_freemsk;
405 			if (free_ndx != end_ndx) {
406 				mutex_exit(&releq->vpmq_mtx);
407 				mutex_exit(&allocq->vpmq_mtx);
408 				vpmflp = &vpmd_free[free_ndx];
409 				goto retry_queue;
410 			}
411 			/*
412 			 * Tried all freelists.
413 			 * wait on this list and hope something gets freed.
414 			 */
415 			vpmflp->vpm_want++;
416 			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
417 			cv_wait(&vpmflp->vpm_free_cv,
418 			    &vpmflp->vpm_freeq[0].vpmq_mtx);
419 			vpmflp->vpm_want--;
420 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
421 			vpmflp = &vpmd_free[free_ndx];
422 			VPM_DEBUG(vpmd_nofreevpms);
423 			goto retry_queue;
424 		} else {
425 			/*
426 			 * Something on the rele queue; flip the alloc
427 			 * and rele queues and retry.
428 			 */
429 			vpmflp->vpm_allocq = releq;
430 			vpmflp->vpm_releq = allocq;
431 			mutex_exit(&allocq->vpmq_mtx);
432 			mutex_exit(&releq->vpmq_mtx);
433 			if (page_locked) {
434 				delay(hz >> 2);
435 				page_locked = 0;
436 			}
437 			goto retry_queue;
438 		}
439 	} else {
440 		int gotnewvpm;
441 		kmutex_t *pmtx;
442 		uint_t vpmref;
443 
444 		/*
445 		 * Fastpath the case we get the vpmap mutex
446 		 * on the first try.
447 		 */
448 		first = vpm;
449 next_vpmap:
450 		vmtx = VPMAPMTX(vpm);
451 		if (!mutex_tryenter(vmtx)) {
452 			/*
453 			 * Another thread is trying to reclaim this slot.
454 			 * Skip to the next queue or vpmap.
455 			 */
456 			if ((vpm = vpm->vpm_next) == first) {
457 				goto skip_queue;
458 			} else {
459 				goto next_vpmap;
460 			}
461 		}
462 
463 		/*
464 		 * Assign this vpm to the newpage.
465 		 */
466 		pmtx = PPMTX(newpage);
467 		gotnewvpm = 0;
468 		mutex_enter(pmtx);
469 
470 		/*
471 		 * Check if some other thread already assigned a vpm to
472 		 * this page.
473 		 */
474 		if ((vpmref = newpage->p_vpmref) == 0) {
475 			newpage->p_vpmref = VPMID(vpm);
476 			gotnewvpm = 1;
477 		} else {
478 			VPM_DEBUG(vpmd_contend);
479 			mutex_exit(vmtx);
480 		}
481 		mutex_exit(pmtx);
482 
483 		if (gotnewvpm) {
484 
485 			/*
486 			 * At this point, we've selected the vpm. Remove vpm
487 			 * from its freelist. If vpm is the first one in
488 			 * the freelist, update the head of the freelist.
489 			 */
490 			if (first == vpm) {
491 				ASSERT(first == allocq->vpmq_free);
492 				allocq->vpmq_free = vpm->vpm_next;
493 			}
494 
495 			/*
496 			 * If the head of the freelist still points to vpm,
497 			 * then there are no more free vpmaps in that list.
498 			 */
499 			if (allocq->vpmq_free == vpm)
500 				/*
501 				 * Took the last one
502 				 */
503 				allocq->vpmq_free = NULL;
504 			else {
505 				vpm->vpm_prev->vpm_next = vpm->vpm_next;
506 				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
507 			}
508 			mutex_exit(&allocq->vpmq_mtx);
509 			vpm->vpm_prev = vpm->vpm_next = NULL;
510 
511 			/*
512 			 * Disassociate the previous page.
513 			 * p_vpmref is used as a mapping reference to the page.
514 			 */
515 			if ((pp = vpm->vpm_pp) != NULL &&
516 			    vpm->vpm_vp == pp->p_vnode &&
517 			    vpm->vpm_off == pp->p_offset) {
518 
519 				pmtx = PPMTX(pp);
520 				if (page_trylock(pp, SE_SHARED)) {
521 					/*
522 					 * Now verify that it is the correct
523 					 * page. If not someone else stole it,
524 					 * so just unlock it and leave.
525 					 */
526 					mutex_enter(pmtx);
527 					if (PP_ISFREE(pp) ||
528 					    vpm->vpm_vp != pp->p_vnode ||
529 					    vpm->vpm_off != pp->p_offset ||
530 					    pp->p_vpmref != VPMID(vpm)) {
531 						mutex_exit(pmtx);
532 
533 						page_unlock(pp);
534 					} else {
535 						/*
536 						 * Release the page.
537 						 */
538 						pp->p_vpmref = 0;
539 						mutex_exit(pmtx);
540 						(void) page_release(pp, 1);
541 					}
542 				} else {
543 					/*
544 					 * If the page cannot be locked, just
545 					 * clear the p_vpmref and go.
546 					 */
547 					mutex_enter(pmtx);
548 					if (pp->p_vpmref == VPMID(vpm)) {
549 						pp->p_vpmref = 0;
550 					}
551 					mutex_exit(pmtx);
552 					VPM_DEBUG(vpmd_prevpagelocked);
553 				}
554 			}
555 
556 			/*
557 			 * Setup vpm to point to the new page.
558 			 */
559 			vpm->vpm_pp = newpage;
560 			vpm->vpm_vp = newpage->p_vnode;
561 			vpm->vpm_off = newpage->p_offset;
562 
563 		} else {
564 			int steal = !VPM_MTBF(steals, steals_mtbf);
565 			/*
566 			 * Page already has a vpm assigned just use that.
567 			 * Grab the vpm mutex and verify that it is still
568 			 * the correct one. The pp->p_vpmref should not change
569 			 * once we have the vpm mutex and the page lock.
570 			 */
571 			mutex_exit(&allocq->vpmq_mtx);
572 			vpm = VPMP(vpmref);
573 			vmtx = VPMAPMTX(vpm);
574 			mutex_enter(vmtx);
575 			if ((steal && vpm->vpm_refcnt == 0) ||
576 			    vpm->vpm_pp != newpage) {
577 				/*
578 				 * The vpm got stolen, retry.
579 				 * clear the p_vpmref.
580 				 */
581 				pmtx = PPMTX(newpage);
582 				mutex_enter(pmtx);
583 				if (newpage->p_vpmref == vpmref) {
584 					newpage->p_vpmref = 0;
585 				}
586 				mutex_exit(pmtx);
587 
588 				mutex_exit(vmtx);
589 				VPM_DEBUG(vpmd_steals);
590 				goto retry_queue;
591 			} else if (vpm->vpm_refcnt == 0) {
592 				/*
593 				 * Remove it from the free list if it
594 				 * exists there.
595 				 */
596 				VPMAP_RMFREELIST(vpm);
597 			}
598 		}
599 		return (vpm);
600 	}
601 }
602 
603 static void
604 free_vpmap(struct vpmap *vpm)
605 {
606 	struct vpmfree *vpmflp;
607 	struct vpmap *vpmfreelist;
608 	union vpm_freeq *releq;
609 
610 	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
611 
612 	if (vpm->vpm_refcnt != 0) {
613 		panic("free_vpmap");
614 		/*NOTREACHED*/
615 	}
616 
617 	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
618 	/*
619 	 * Add to the tail of the release queue
620 	 * Note that vpm_releq and vpm_allocq could toggle
621 	 * before we get the lock. This does not affect
622 	 * correctness as the 2 queues are only maintained
623 	 * to reduce lock pressure.
624 	 */
625 	releq = vpmflp->vpm_releq;
626 	if (releq == &vpmflp->vpm_freeq[0]) {
627 		vpm->vpm_ndxflg = 0;
628 	} else {
629 		vpm->vpm_ndxflg = 1;
630 	}
631 	mutex_enter(&releq->vpmq_mtx);
632 	vpmfreelist = releq->vpmq_free;
633 	if (vpmfreelist == 0) {
634 		int want;
635 
636 		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
637 		/*
638 		 * Both queue mutexes are held to set vpm_want;
639 		 * snapshot the value before dropping releq mutex.
640 		 * If vpm_want appears after the releq mutex is dropped,
641 		 * then the vpmap just freed is already gone.
642 		 */
643 		want = vpmflp->vpm_want;
644 		mutex_exit(&releq->vpmq_mtx);
645 		/*
646 		 * See if there was a waiter before dropping the releq mutex
647 		 * then recheck after obtaining vpm_freeq[0] mutex as
648 		 * the another thread may have already signaled.
649 		 */
650 		if (want) {
651 			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
652 			if (vpmflp->vpm_want)
653 				cv_signal(&vpmflp->vpm_free_cv);
654 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
655 		}
656 	} else {
657 		vpm->vpm_next = vpmfreelist;
658 		vpm->vpm_prev = vpmfreelist->vpm_prev;
659 		vpmfreelist->vpm_prev = vpm;
660 		vpm->vpm_prev->vpm_next = vpm;
661 		mutex_exit(&releq->vpmq_mtx);
662 	}
663 }
664 
665 /*
666  * Get the vpmap for the page.
667  * The refcnt of this vpm is incremented.
668  */
669 static struct vpmap *
670 get_vpmap(page_t *pp)
671 {
672 	struct vpmap *vpm = NULL;
673 	kmutex_t *vmtx;
674 	kmutex_t *pmtx;
675 	unsigned int refid;
676 
677 	ASSERT((pp != NULL) && PAGE_LOCKED(pp));
678 
679 	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
680 		vpm = VPMP(refid);
681 		vmtx = VPMAPMTX(vpm);
682 		mutex_enter(vmtx);
683 		/*
684 		 * Since we have the page lock and the vpm mutex, the
685 		 * pp->p_vpmref cannot change.
686 		 */
687 		if (vpm->vpm_pp != pp) {
688 			pmtx = PPMTX(pp);
689 
690 			/*
691 			 * Clear the p_vpmref as it is incorrect.
692 			 * This can happen if the page was stolen.
693 			 * On x64 this should not happen as p_vpmref
694 			 * is treated as a mapping on the page. So
695 			 * if the page is stolen, the mapping would have
696 			 * been cleared in page_unload().
697 			 */
698 			mutex_enter(pmtx);
699 			if (pp->p_vpmref == refid)
700 				pp->p_vpmref = 0;
701 			mutex_exit(pmtx);
702 
703 			mutex_exit(vmtx);
704 			vpm = NULL;
705 		} else if (vpm->vpm_refcnt == 0) {
706 			/*
707 			 * Got the vpm, remove it from the free
708 			 * list if it exists there.
709 			 */
710 			VPMAP_RMFREELIST(vpm);
711 		}
712 	}
713 	if (vpm == NULL) {
714 		/*
715 		 * get_free_vpmap() returns with the vpmap mutex held.
716 		 */
717 		vpm = get_free_vpmap(pp);
718 		vmtx = VPMAPMTX(vpm);
719 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
720 	} else {
721 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
722 	}
723 
724 	vpm->vpm_refcnt++;
725 	mutex_exit(vmtx);
726 
727 	return (vpm);
728 }
729 
730 /* END --- vpm cache ---- */
731 
732 /*
733  * The vnode page mapping(vpm) interface routines.
734  */
735 
736 /*
737  * Find or create the pages starting form baseoff for specified
738  * length 'len'.
739  */
740 static int
741 vpm_pagecreate(
742 	struct vnode *vp,
743 	u_offset_t baseoff,
744 	size_t len,
745 	vmap_t vml[],
746 	int nseg,
747 	int *newpage)
748 {
749 
750 	page_t *pp = NULL;
751 	caddr_t base;
752 	u_offset_t off = baseoff;
753 	int i;
754 	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
755 
756 	for (i = 0; len > 0; len -= PAGESIZE, i++) {
757 		struct vpmap *vpm;
758 
759 
760 		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
761 
762 			base = segkpm_create_va(off);
763 
764 			/*
765 			 * the seg pointer passed in is just advisor. Just
766 			 * pass segkmap for now like segmap does with
767 			 * segmap_kpm enabled.
768 			 */
769 			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
770 			    segkmap, base)) == NULL) {
771 				panic("segmap_pagecreate_vpm: "
772 				    "page_create failed");
773 				/*NOTREACHED*/
774 			}
775 			if (newpage != NULL)
776 				*newpage = 1;
777 
778 			page_io_unlock(pp);
779 		}
780 
781 		/*
782 		 * Get the vpm for this page_t.
783 		 */
784 		if (vpm_cache_enable) {
785 			vpm = get_vpmap(pp);
786 			vml[i].vs_data = (void *)&vpm->vpm_pp;
787 		} else {
788 			vml[i].vs_data = (void *)pp;
789 			pp->p_vpmref = 0;
790 		}
791 
792 		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
793 		vml[i].vs_len = PAGESIZE;
794 
795 		off += PAGESIZE;
796 	}
797 	vml[i].vs_data = NULL;
798 	vml[i].vs_addr = (caddr_t)NULL;
799 	return (0);
800 }
801 
802 
803 /*
804  * Returns vpm mappings of pages in the range [off, off+len], where
805  * len is rounded up to the PAGESIZE boundary. The list of pages and
806  * the page addresses are returned in the SGL vml (vmap_t) array passed in.
807  * The nseg is the number of vmap_t entries in the array.
808  *
809  * The segmap's SM_LOCKPROTO  usage is not supported by these interfaces.
810  * For such cases, use the seg_map interfaces.
811  */
812 int
813 vpm_map_pages(
814 	struct vnode *vp,
815 	u_offset_t off,
816 	size_t len,
817 	int fetchpage,
818 	vmap_t *vml,
819 	int nseg,
820 	int  *newpage,
821 	enum seg_rw rw)
822 {
823 	extern struct vnode *common_specvp();
824 	u_offset_t baseoff;
825 	uint_t prot;
826 	caddr_t base;
827 	page_t *pp, *pplist[MAXVMAPS];
828 	struct vpmap *vpm;
829 	int i, error = 0;
830 	size_t tlen;
831 
832 	ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
833 	baseoff = off & (offset_t)PAGEMASK;
834 	vml[0].vs_data = NULL;
835 	vml[0].vs_addr = (caddr_t)NULL;
836 
837 	tlen = P2ROUNDUP(off + len, PAGESIZE) - baseoff;
838 	/*
839 	 * Restrict it to VPMMAXLEN.
840 	 */
841 	if (tlen > (VPMMAXPGS * PAGESIZE)) {
842 		tlen = VPMMAXPGS * PAGESIZE;
843 	}
844 	/*
845 	 * Ensure length fits within the vml[] array. One element of
846 	 * the array is used to mark the end of the scatter/gather list
847 	 * of valid mappings by setting its vs_addr = NULL. Leave space
848 	 * for this element.
849 	 */
850 	if (tlen > ((nseg - 1) * PAGESIZE)) {
851 		tlen = ((nseg - 1) * PAGESIZE);
852 	}
853 	len = tlen;
854 
855 	/*
856 	 * If this is a block device we have to be sure to use the
857 	 * "common" block device vnode for the mapping.
858 	 */
859 	if (vp->v_type == VBLK)
860 		vp = common_specvp(vp);
861 
862 
863 	if (!fetchpage)
864 		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
865 
866 	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {
867 
868 		pp = page_lookup(vp, baseoff, SE_SHARED);
869 
870 		/*
871 		 * If we did not find the page or if this page was not
872 		 * in vpm cache(p_vpmref == 0), then let VOP_GETPAGE get
873 		 * all the pages.
874 		 * We need to call VOP_GETPAGE so that filesytems can do some
875 		 * (un)necessary tracking for sequential access.
876 		 */
877 
878 		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
879 		    (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
880 		    != (P_MOD | P_REF))) {
881 			int j;
882 			if (pp != NULL) {
883 				page_unlock(pp);
884 			}
885 			/*
886 			 * If we did not find the desired set of pages,
887 			 * from the page cache, just call VOP_GETPAGE to get
888 			 * all the pages.
889 			 */
890 			for (j = 0; j < i; j++) {
891 				page_unlock(pplist[j]);
892 			}
893 
894 
895 			baseoff = off & (offset_t)PAGEMASK;
896 			/*
897 			 * Pass a dummy address as it will be required
898 			 * by page_create_va(). We pass segkmap as the seg
899 			 * as some file systems(UFS) check it.
900 			 */
901 			base = segkpm_create_va(baseoff);
902 
903 			error = VOP_GETPAGE(vp, baseoff, tlen, &prot, pplist,
904 			    tlen, segkmap, base, rw, CRED(), NULL);
905 			if (error) {
906 				VPM_DEBUG(vpmd_getpagefailed);
907 				pplist[0] = NULL;
908 			}
909 			break;
910 		} else {
911 			pplist[i] = pp;
912 			baseoff += PAGESIZE;
913 		}
914 	}
915 
916 	if (error) {
917 		for (i = 0; pplist[i] != NULL; i++) {
918 			page_unlock(pplist[i]);
919 			pplist[i] = NULL;
920 		}
921 		vml[0].vs_addr = NULL;
922 		vml[0].vs_data = NULL;
923 		return (error);
924 	}
925 
926 	/*
927 	 * Get the vpm's for pages.
928 	 */
929 	for (i = 0; pplist[i] != NULL; i++) {
930 		if (vpm_cache_enable) {
931 			vpm = get_vpmap(pplist[i]);
932 			vml[i].vs_data = (void *)&(vpm->vpm_pp);
933 		} else {
934 			vml[i].vs_data = (void *)pplist[i];
935 			pplist[i]->p_vpmref = 0;
936 		}
937 
938 		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
939 		vml[i].vs_len = PAGESIZE;
940 	}
941 
942 	vml[i].vs_data = NULL;
943 	vml[i].vs_addr = (caddr_t)NULL;
944 
945 	return (0);
946 }
947 
948 /*
949  * Release the vpm mappings on the pages and unlock them.
950  */
951 void
952 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
953 {
954 	int i;
955 	struct vpmap *vpm;
956 	kmutex_t *mtx;
957 	page_t *pp;
958 
959 	for (i = 0; vml[i].vs_data != NULL; i++) {
960 		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
961 
962 		if (vpm_cache_enable) {
963 			pp = *(((page_t **)vml[i].vs_data));
964 		} else {
965 			pp = (page_t *)vml[i].vs_data;
966 		}
967 
968 		/*
969 		 * Mark page as being modified or referenced, bacause vpm pages
970 		 * would not cause faults where it would be set normally.
971 		 */
972 		if (rw == S_WRITE) {
973 			hat_setrefmod(pp);
974 		} else {
975 			ASSERT(rw == S_READ);
976 			hat_setref(pp);
977 		}
978 
979 		if (vpm_cache_enable) {
980 			vpm = (struct vpmap *)((char *)vml[i].vs_data
981 			    - offsetof(struct vpmap, vpm_pp));
982 			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
983 			page_unlock(pp);
984 			mtx = VPMAPMTX(vpm);
985 			mutex_enter(mtx);
986 
987 			if (--vpm->vpm_refcnt == 0) {
988 				free_vpmap(vpm);
989 			}
990 			mutex_exit(mtx);
991 		} else {
992 			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
993 			(void) page_release(pp, 1);
994 		}
995 		vml[i].vs_data = NULL;
996 		vml[i].vs_addr = NULL;
997 	}
998 }
999 
1000 /*
1001  * Given the vp, off and the uio structure, this routine will do the
1002  * the copy (uiomove). If the last page created is partially written,
1003  * the rest of the page is zeroed out. It also zeros the beginning of
1004  * the first page till the start offset if requested(zerostart).
1005  * If pages are to be fetched, it will call the filesystem's getpage
1006  * function (VOP_GETPAGE) to get them, otherwise they will be created if
1007  * not already present in the page cache.
1008  */
1009 int
1010 vpm_data_copy(struct vnode *vp,
1011 	u_offset_t off,
1012 	size_t len,
1013 	struct uio *uio,
1014 	int fetchpage,
1015 	int *newpage,
1016 	int zerostart,
1017 	enum seg_rw rw)
1018 {
1019 	int error;
1020 	struct vmap vml[MINVMAPS];
1021 	enum uio_rw uiorw;
1022 	int npages = 0;
1023 
1024 	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
1025 	/*
1026 	 * 'off' will be the offset where the I/O starts.
1027 	 * We get the pages starting at the (off & PAGEMASK)
1028 	 * page boundary.
1029 	 */
1030 	error = vpm_map_pages(vp, off, (uint_t)len,
1031 	    fetchpage, vml, MINVMAPS, &npages,  rw);
1032 
1033 	if (newpage != NULL)
1034 		*newpage = npages;
1035 	if (!error) {
1036 		int i, pn, slen = len;
1037 		int pon = off & PAGEOFFSET;
1038 
1039 		/*
1040 		 * Clear from the beginning of the page to start offset
1041 		 * if requested.
1042 		 */
1043 		if (!fetchpage && zerostart) {
1044 			(void) kzero(vml[0].vs_addr,  (uint_t)pon);
1045 			VPM_DEBUG(vpmd_zerostart);
1046 		}
1047 
1048 		for (i = 0; !error && slen > 0 &&
1049 		    vml[i].vs_addr != NULL; i++) {
1050 			pn = (int)MIN(slen, (PAGESIZE - pon));
1051 			error = uiomove(vml[i].vs_addr + pon,
1052 			    (long)pn, uiorw, uio);
1053 			slen -= pn;
1054 			pon = 0;
1055 		}
1056 
1057 		/*
1058 		 * When new pages are created, zero out part of the
1059 		 * page we did not copy to.
1060 		 */
1061 		if (!fetchpage && npages &&
1062 		    uio->uio_loffset < roundup(off + len, PAGESIZE)) {
1063 			int nzero;
1064 
1065 			pon = (uio->uio_loffset & PAGEOFFSET);
1066 			nzero = PAGESIZE  - pon;
1067 			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
1068 			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
1069 		}
1070 		vpm_unmap_pages(vml, rw);
1071 	}
1072 	return (error);
1073 }
1074 
1075 /*
1076  * called to flush pages for the given vnode covering
1077  * [off, off+len] range.
1078  */
1079 int
1080 vpm_sync_pages(struct vnode *vp,
1081 		u_offset_t off,
1082 		size_t len,
1083 		uint_t flags)
1084 {
1085 	extern struct vnode *common_specvp();
1086 	int bflags = 0;
1087 	int error = 0;
1088 	size_t psize = roundup(len, PAGESIZE);
1089 
1090 	/*
1091 	 * If this is a block device we have to be sure to use the
1092 	 * "common" block device vnode for the mapping.
1093 	 */
1094 	if (vp->v_type == VBLK)
1095 		vp = common_specvp(vp);
1096 
1097 	if ((flags & ~SM_DONTNEED) != 0) {
1098 		if (flags & SM_ASYNC)
1099 			bflags |= B_ASYNC;
1100 		if (flags & SM_INVAL)
1101 			bflags |= B_INVAL;
1102 		if (flags & SM_DESTROY)
1103 			bflags |= (B_INVAL|B_TRUNC);
1104 		if (flags & SM_FREE)
1105 			bflags |= B_FREE;
1106 		if (flags & SM_DONTNEED)
1107 			bflags |= B_DONTNEED;
1108 
1109 		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
1110 	}
1111 
1112 	return (error);
1113 }
1114 
1115 
1116 #else	/* SEGKPM_SUPPORT */
1117 
1118 /* vpm stubs */
1119 void
1120 vpm_init()
1121 {
1122 }
1123 
1124 /*ARGSUSED*/
1125 int
1126 vpm_pagecreate(
1127 	struct vnode *vp,
1128 	u_offset_t baseoff,
1129 	size_t len,
1130 	vmap_t vml[],
1131 	int nseg,
1132 	int *newpage)
1133 {
1134 	return (0);
1135 }
1136 
1137 /*ARGSUSED*/
1138 int
1139 vpm_map_pages(
1140 	struct vnode *vp,
1141 	u_offset_t off,
1142 	size_t len,
1143 	int fetchpage,
1144 	vmap_t vml[],
1145 	int nseg,
1146 	int *newpage,
1147 	enum seg_rw rw)
1148 {
1149 	return (0);
1150 }
1151 
1152 /*ARGSUSED*/
1153 int
1154 vpm_data_copy(struct vnode *vp,
1155 	u_offset_t off,
1156 	size_t len,
1157 	struct uio *uio,
1158 	int fetchpage,
1159 	int *newpage,
1160 	int zerostart,
1161 	enum seg_rw rw)
1162 {
1163 	return (0);
1164 }
1165 
1166 /*ARGSUSED*/
1167 void
1168 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
1169 {
1170 }
1171 /*ARGSUSED*/
1172 int
1173 vpm_sync_pages(struct vnode *vp,
1174 		u_offset_t off,
1175 		size_t len,
1176 		uint_t flags)
1177 {
1178 	return (0);
1179 }
1180 #endif	/* SEGKPM_SUPPORT */
1181