xref: /titanic_50/usr/src/uts/common/vm/vm_pvn.c (revision 5819f75e225cf93d9c11f52e04ee71c2dcd0eca9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 /*
40  * VM - paged vnode.
41  *
42  * This file supplies vm support for the vnode operations that deal with pages.
43  */
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/sysmacros.h>
48 #include <sys/systm.h>
49 #include <sys/time.h>
50 #include <sys/buf.h>
51 #include <sys/vnode.h>
52 #include <sys/uio.h>
53 #include <sys/vmmeter.h>
54 #include <sys/vmsystm.h>
55 #include <sys/mman.h>
56 #include <sys/vfs.h>
57 #include <sys/cred.h>
58 #include <sys/user.h>
59 #include <sys/kmem.h>
60 #include <sys/cmn_err.h>
61 #include <sys/debug.h>
62 #include <sys/cpuvar.h>
63 #include <sys/vtrace.h>
64 #include <sys/tnf_probe.h>
65 
66 #include <vm/hat.h>
67 #include <vm/as.h>
68 #include <vm/seg.h>
69 #include <vm/rm.h>
70 #include <vm/pvn.h>
71 #include <vm/page.h>
72 #include <vm/seg_map.h>
73 #include <vm/seg_kmem.h>
74 #include <sys/fs/swapnode.h>
75 
76 int pvn_nofodklust = 0;		/* nonzero: pvn_read_kluster() never klusters */
77 int pvn_write_noklust = 0;	/* nonzero: pvn_write_kluster() never klusters */
78 
79 uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
80 uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
81 					/* support for vmodsort for testing */
82 
83 static struct kmem_cache *marker_cache = NULL;	/* pvn_vplist_dirty() markers */
84 
85 /*
86  * Find the largest contiguous block which contains `addr' for file offset
87  * `offset' in it while living within the file system block sizes (`vp_off'
88  * and `vp_len') and the address space limits for which no pages currently
89  * exist and which map to consecutive file offsets.
90  */
91 page_t *
92 pvn_read_kluster(
93 	struct vnode *vp,
94 	u_offset_t off,
95 	struct seg *seg,
96 	caddr_t addr,
97 	u_offset_t *offp,			/* return values */
98 	size_t *lenp,				/* return values */
99 	u_offset_t vp_off,
100 	size_t vp_len,
101 	int isra)
102 {
103 	ssize_t deltaf, deltab;
104 	page_t *pp;
105 	page_t *plist = NULL;
106 	spgcnt_t pagesavail;
107 	u_offset_t vp_end;
108 
109 	ASSERT(off >= vp_off && off < vp_off + vp_len);
110 
111 	/*
112 	 * We only want to do klustering/read ahead if there
113 	 * is more than minfree pages currently available.
114 	 */
115 	pagesavail = freemem - minfree;
116 
117 	if (pagesavail <= 0)
118 		if (isra)
119 			return ((page_t *)NULL);    /* ra case - give up */
120 		else
121 			pagesavail = 1;		    /* must return a page */
122 
123 	/* We calculate in pages instead of bytes due to 32-bit overflows */
124 	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
125 		/*
126 		 * Don't have enough free memory for the
127 		 * max request, try sizing down vp request.
128 		 */
129 		deltab = (ssize_t)(off - vp_off);
130 		vp_len -= deltab;
131 		vp_off += deltab;
132 		if (pagesavail < btopr(vp_len)) {
133 			/*
134 			 * Still not enough memory, just settle for
135 			 * pagesavail which is at least 1.
136 			 */
137 			vp_len = ptob(pagesavail);
138 		}
139 	}
140 
141 	vp_end = vp_off + vp_len;
142 	ASSERT(off >= vp_off && off < vp_end);
143 
144 	if (isra && SEGOP_KLUSTER(seg, addr, 0))
145 		return ((page_t *)NULL);	/* segment driver says no */
146 
147 	if ((plist = page_create_va(vp, off,
148 	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
149 		return ((page_t *)NULL);
150 
151 	if (vp_len <= PAGESIZE || pvn_nofodklust) {
152 		*offp = off;
153 		*lenp = MIN(vp_len, PAGESIZE);
154 	} else {
155 		/*
156 		 * Scan back from front by incrementing "deltab" and
157 		 * comparing "off" with "vp_off + deltab" to avoid
158 		 * "signed" versus "unsigned" conversion problems.
159 		 */
160 		for (deltab = PAGESIZE; off >= vp_off + deltab;
161 		    deltab += PAGESIZE) {
162 			/*
163 			 * Call back to the segment driver to verify that
164 			 * the klustering/read ahead operation makes sense.
165 			 */
166 			if (SEGOP_KLUSTER(seg, addr, -deltab))
167 				break;		/* page not eligible */
168 			if ((pp = page_create_va(vp, off - deltab,
169 			    PAGESIZE, PG_EXCL, seg, addr - deltab))
170 			    == NULL)
171 				break;		/* already have the page */
172 			/*
173 			 * Add page to front of page list.
174 			 */
175 			page_add(&plist, pp);
176 		}
177 		deltab -= PAGESIZE;
178 
179 		/* scan forward from front */
180 		for (deltaf = PAGESIZE; off + deltaf < vp_end;
181 		    deltaf += PAGESIZE) {
182 			/*
183 			 * Call back to the segment driver to verify that
184 			 * the klustering/read ahead operation makes sense.
185 			 */
186 			if (SEGOP_KLUSTER(seg, addr, deltaf))
187 				break;		/* page not file extension */
188 			if ((pp = page_create_va(vp, off + deltaf,
189 			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
190 			    == NULL)
191 				break;		/* already have page */
192 
193 			/*
194 			 * Add page to end of page list.
195 			 */
196 			page_add(&plist, pp);
197 			plist = plist->p_next;
198 		}
199 		*offp = off = off - deltab;
200 		*lenp = deltab + deltaf;
201 		ASSERT(off >= vp_off);
202 
203 		/*
204 		 * If we ended up getting more than was actually
205 		 * requested, retract the returned length to only
206 		 * reflect what was requested.  This might happen
207 		 * if we were allowed to kluster pages across a
208 		 * span of (say) 5 frags, and frag size is less
209 		 * than PAGESIZE.  We need a whole number of
210 		 * pages to contain those frags, but the returned
211 		 * size should only allow the returned range to
212 		 * extend as far as the end of the frags.
213 		 */
214 		if ((vp_off + vp_len) < (off + *lenp)) {
215 			ASSERT(vp_end > off);
216 			*lenp = vp_end - off;
217 		}
218 	}
219 	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
220 	    "pvn_read_kluster:seg %p addr %x isra %x",
221 	    seg, addr, isra);
222 	return (plist);
223 }
224 
225 /*
226  * Handle pages for this vnode on either side of the page "pp"
227  * which has been locked by the caller.  This routine will also
228  * do klustering in the range [vp_off, vp_off + vp_len] up
229  * until a page which is not found.  The offset and length
230  * of pages included is returned in "*offp" and "*lenp".
231  *
232  * Returns a list of dirty locked pages all ready to be
233  * written back.
234  */
235 page_t *
236 pvn_write_kluster(
237 	struct vnode *vp,
238 	page_t *pp,
239 	u_offset_t *offp,		/* return values */
240 	size_t *lenp,			/* return values */
241 	u_offset_t vp_off,
242 	size_t vp_len,
243 	int flags)
244 {
245 	u_offset_t off;
246 	page_t *dirty;
247 	size_t deltab, deltaf;
248 	se_t se;
249 	u_offset_t vp_end;
250 
251 	off = pp->p_offset;
252 
253 	/*
254 	 * Kustering should not be done if we are invalidating
255 	 * pages since we could destroy pages that belong to
256 	 * some other process if this is a swap vnode.
257 	 */
258 	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
259 		*offp = off;
260 		*lenp = PAGESIZE;
261 		return (pp);
262 	}
263 
264 	if (flags & (B_FREE | B_INVAL))
265 		se = SE_EXCL;
266 	else
267 		se = SE_SHARED;
268 
269 	dirty = pp;
270 	/*
271 	 * Scan backwards looking for pages to kluster by incrementing
272 	 * "deltab" and comparing "off" with "vp_off + deltab" to
273 	 * avoid "signed" versus "unsigned" conversion problems.
274 	 */
275 	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
276 		pp = page_lookup_nowait(vp, off - deltab, se);
277 		if (pp == NULL)
278 			break;		/* page not found */
279 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
280 			break;
281 		page_add(&dirty, pp);
282 	}
283 	deltab -= PAGESIZE;
284 
285 	vp_end = vp_off + vp_len;
286 	/* now scan forwards looking for pages to kluster */
287 	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
288 		pp = page_lookup_nowait(vp, off + deltaf, se);
289 		if (pp == NULL)
290 			break;		/* page not found */
291 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
292 			break;
293 		page_add(&dirty, pp);
294 		dirty = dirty->p_next;
295 	}
296 
297 	*offp = off - deltab;
298 	*lenp = deltab + deltaf;
299 	return (dirty);
300 }
301 
302 /*
303  * Generic entry point used to release the "shared/exclusive" lock
304  * and the "p_iolock" on pages after i/o is complete.
305  */
306 void
307 pvn_io_done(page_t *plist)
308 {
309 	page_t *pp;
310 
311 	while (plist != NULL) {
312 		pp = plist;
313 		page_sub(&plist, pp);
314 		page_io_unlock(pp);
315 		page_unlock(pp);
316 	}
317 }
318 
319 /*
320  * Entry point to be used by file system getpage subr's and
321  * other such routines which either want to unlock pages (B_ASYNC
322  * request) or destroy a list of pages if an error occurred.
323  */
324 void
325 pvn_read_done(page_t *plist, int flags)
326 {
327 	page_t *pp;
328 
329 	while (plist != NULL) {
330 		pp = plist;
331 		page_sub(&plist, pp);
332 		page_io_unlock(pp);
333 		if (flags & B_ERROR) {
334 			/*LINTED: constant in conditional context*/
335 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
336 		} else {
337 			(void) page_release(pp, 0);
338 		}
339 	}
340 }
341 
342 /*
343  * Automagic pageout.
344  * When memory gets tight (freemem < lotsfree + pages_before_pager),
345  * pvn_write_done() starts freeing pages popping out of the write queue.
346  */
347 int	write_free = 1;			/* zero turns the above off */
348 pgcnt_t	pages_before_pager = 200;	/* LMXXX */
349 
350 /*
351  * Routine to be called when page-out's complete.
352  * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
353  * after waiting for i/o to complete (biowait) to free the list of
354  * pages associated with the buffer.  These pages must be locked
355  * before i/o is initiated.
356  *
357  * If a write error occurs, the pages are marked as modified so the write
358  * is re-tried later, unless B_INVAL|B_FORCE asks for them to be destroyed.
359  */
360 
361 void
362 pvn_write_done(page_t *plist, int flags)
363 {
364 	int dfree = 0;		/* pages disposed of with B_FREE */
365 	int pgrec = 0;		/* pages kept: found referenced or modified */
366 	int pgout = 0;		/* set to 1 once any page is paged out */
367 	int pgpgout = 0;	/* pages paged out */
368 	int anonpgout = 0;	/* paged out, swapfs vnode */
369 	int anonfree = 0;	/* freed, swapfs vnode */
370 	int fspgout = 0;	/* paged out, other vnode without VVMEXEC */
371 	int fsfree = 0;		/* freed, other vnode without VVMEXEC */
372 	int execpgout = 0;	/* paged out, vnode with VVMEXEC */
373 	int execfree = 0;	/* freed, vnode with VVMEXEC */
374 	page_t *pp;
375 	struct cpu *cpup;
376 	struct vnode *vp = NULL;	/* first page's vnode: probe, VMODSORT */
377 	uint_t ppattr;
378 	kmutex_t *vphm = NULL;
379 
380 	ASSERT((flags & B_READ) == 0);
381 
382 	/*
383 	 * If we are about to start paging anyway, start freeing pages.
384 	 */
385 	if (write_free && freemem < lotsfree + pages_before_pager &&
386 	    (flags & B_ERROR) == 0) {
387 		flags |= B_FREE;
388 	}
389 
390 	/*
391 	 * Handle each page involved in the i/o operation.
392 	 */
393 	while (plist != NULL) {
394 		pp = plist;
395 		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
396 		page_sub(&plist, pp);
397 
398 		/* Kernel probe support; vp is taken from the first page only */
399 		if (vp == NULL)
400 			vp = pp->p_vnode;
401 
402 		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
403 			/*
404 			 * Move page to the top of the v_page list.
405 			 * Skip pages modified during IO.
406 			 */
407 			vphm = page_vnode_mutex(vp);
408 			mutex_enter(vphm);
409 			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
410 				page_vpsub(&vp->v_pages, pp);
411 				page_vpadd(&vp->v_pages, pp);
412 			}
413 			mutex_exit(vphm);
414 		}
415 
416 		if (flags & B_ERROR) {
417 			/*
418 			 * Write operation failed.  We don't want
419 			 * to destroy (or free) the page unless B_FORCE
420 			 * is set. We set the mod bit again and release
421 			 * all locks on the page so that it will get written
422 			 * back again later when things are hopefully
423 			 * better again.
424 			 * If B_INVAL and B_FORCE is set we really have
425 			 * to destroy the page.
426 			 */
427 			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
428 				page_io_unlock(pp);
429 				/*LINTED: constant in conditional context*/
430 				VN_DISPOSE(pp, B_INVAL, 0, kcred);
431 			} else {
432 				hat_setmod_only(pp);
433 				page_io_unlock(pp);
434 				page_unlock(pp);
435 			}
436 		} else if (flags & B_INVAL) {
437 			/*
438 			 * XXX - Failed writes with B_INVAL set are
439 			 * not handled appropriately.
440 			 */
441 			page_io_unlock(pp);
442 			/*LINTED: constant in conditional context*/
443 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
444 		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
445 			/*
446 			 * Update statistics for pages being paged out
447 			 */
448 			if (pp->p_vnode) {
449 				if (IS_SWAPFSVP(pp->p_vnode)) {
450 					anonpgout++;
451 				} else {
452 					if (pp->p_vnode->v_flag & VVMEXEC) {
453 						execpgout++;
454 					} else {
455 						fspgout++;
456 					}
457 				}
458 			}
459 			page_io_unlock(pp);
460 			pgout = 1;
461 			pgpgout++;
462 			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
463 			    "page_ws_out:pp %p", pp);
464 
465 			/*
466 			 * The page_struct_lock need not be acquired to
467 			 * examine "p_lckcnt" and "p_cowcnt" since we'll
468 			 * have an "exclusive" lock if the upgrade succeeds.
469 			 */
470 			if (page_tryupgrade(pp) &&
471 			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
472 				/*
473 				 * Check if someone has reclaimed the
474 				 * page.  If ref and mod are not set, no
475 				 * one is using it so we can free it.
476 				 * The rest of the system is careful
477 				 * to use the NOSYNC flag to unload
478 				 * translations set up for i/o w/o
479 				 * affecting ref and mod bits.
480 				 *
481 				 * Obtain a copy of the real hardware mod bit
482 				 * using hat_pagesync(pp, HAT_SYNC_DONTZERO)
483 				 * to avoid having to flush the cache.
484 				 */
485 				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
486 				    HAT_SYNC_STOPON_MOD);
487 			ck_refmod:
488 				if (!(ppattr & (P_REF | P_MOD))) {
489 					if (hat_page_is_mapped(pp)) {
490 						/*
491 						 * Doesn't look like the page
492 						 * was modified so now we
493 						 * really have to unload the
494 						 * translations.  Meanwhile
495 						 * another CPU could've
496 						 * modified it so we have to
497 						 * check again.  We don't loop
498 						 * forever here because now
499 						 * the translations are gone
500 						 * and no one can get a new one
501 						 * since we have the "exclusive"
502 						 * lock on the page.
503 						 */
504 						(void) hat_pageunload(pp,
505 						    HAT_FORCE_PGUNLOAD);
506 						ppattr = hat_page_getattr(pp,
507 						    P_REF | P_MOD);
508 						goto ck_refmod;
509 					}
510 					/*
511 					 * Update statistics for pages being
512 					 * freed
513 					 */
514 					if (pp->p_vnode) {
515 						if (IS_SWAPFSVP(pp->p_vnode)) {
516 							anonfree++;
517 						} else {
518 							if (pp->p_vnode->v_flag
519 							    & VVMEXEC) {
520 								execfree++;
521 							} else {
522 								fsfree++;
523 							}
524 						}
525 					}
526 					/*LINTED: constant in conditional ctx*/
527 					VN_DISPOSE(pp, B_FREE,
528 					    (flags & B_DONTNEED), kcred);
529 					dfree++;
530 				} else {
531 					page_unlock(pp);
532 					pgrec++;
533 					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
534 					    "page_ws_free:pp %p", pp);
535 				}
536 			} else {
537 				/*
538 				 * Page is either `locked' in memory
539 				 * or was reclaimed and now has a
540 				 * "shared" lock, so release it.
541 				 */
542 				page_unlock(pp);
543 			}
544 		} else {
545 			/*
546 			 * No B_ERROR, B_INVAL or B_FREE and the page is still
547 			 * mapped.  Just release locks.
548 			 */
549 			page_io_unlock(pp);
550 			page_unlock(pp);
551 		}
552 	}
553 
554 	CPU_STATS_ENTER_K();
555 	cpup = CPU;		/* get cpup now that CPU cannot change */
556 	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
557 	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
558 	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
559 	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
560 	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
561 	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
562 	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
563 	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
564 	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
565 	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
566 	CPU_STATS_EXIT_K();
567 
568 	/* Kernel probe */
569 	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
570 	    tnf_opaque,	vnode,			vp,
571 	    tnf_ulong,	pages_pageout,		pgpgout,
572 	    tnf_ulong,	pages_freed,		dfree,
573 	    tnf_ulong,	pages_reclaimed,	pgrec);
574 }
575 
576 /*
577  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
578  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
579  * operation and is only to be considered if it doesn't involve any
580  * waiting here.  B_TRUNC indicates that the file is being truncated
581  * and so no i/o needs to be done. B_FORCE indicates that the page
582  * must be destroyed so don't try wrting it out.
583  *
584  * The caller must ensure that the page is locked.  Returns 1, if
585  * the page should be written back (the "iolock" is held in this
586  * case), or 0 if the page has been dealt with or has been
587  * unlocked.
588  */
589 int
590 pvn_getdirty(page_t *pp, int flags)
591 {
592 	ASSERT((flags & (B_INVAL | B_FREE)) ?
593 	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
594 	ASSERT(PP_ISFREE(pp) == 0);
595 
596 	/*
597 	 * If trying to invalidate or free a logically `locked' page,
598 	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
599 	 * p_cowcnt as the page is exclusively locked.
600 	 */
601 	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
602 	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
603 		page_unlock(pp);
604 		return (0);
605 	}
606 
607 	/*
608 	 * Now acquire the i/o lock so we can add it to the dirty
609 	 * list (if necessary).  We avoid blocking on the i/o lock
610 	 * in the following cases:
611 	 *
612 	 *	If B_DELWRI is set, which implies that this request is
613 	 *	due to a klustering operartion.
614 	 *
615 	 *	If this is an async (B_ASYNC) operation and we are not doing
616 	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
617 	 *	that the the page is written out].
618 	 */
619 	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
620 		if (!page_io_trylock(pp)) {
621 			page_unlock(pp);
622 			return (0);
623 		}
624 	} else {
625 		page_io_lock(pp);
626 	}
627 
628 	/*
629 	 * If we want to free or invalidate the page then
630 	 * we need to unload it so that anyone who wants
631 	 * it will have to take a minor fault to get it.
632 	 * Otherwise, we're just writing the page back so we
633 	 * need to sync up the hardwre and software mod bit to
634 	 * detect any future modifications.  We clear the
635 	 * software mod bit when we put the page on the dirty
636 	 * list.
637 	 */
638 	if (flags & (B_INVAL | B_FREE)) {
639 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
640 	} else {
641 		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
642 	}
643 
644 	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
645 		/*
646 		 * Don't need to add it to the
647 		 * list after all.
648 		 */
649 		page_io_unlock(pp);
650 		if (flags & B_INVAL) {
651 			/*LINTED: constant in conditional context*/
652 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
653 		} else if (flags & B_FREE) {
654 			/*LINTED: constant in conditional context*/
655 			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
656 		} else {
657 			/*
658 			 * This is advisory path for the callers
659 			 * of VOP_PUTPAGE() who prefer freeing the
660 			 * page _only_ if no one else is accessing it.
661 			 * E.g. segmap_release()
662 			 *
663 			 * The above hat_ismod() check is useless because:
664 			 * (1) we may not be holding SE_EXCL lock;
665 			 * (2) we've not unloaded _all_ translations
666 			 *
667 			 * Let page_release() do the heavy-lifting.
668 			 */
669 			(void) page_release(pp, 1);
670 		}
671 		return (0);
672 	}
673 
674 	/*
675 	 * Page is dirty, get it ready for the write back
676 	 * and add page to the dirty list.
677 	 */
678 	hat_clrrefmod(pp);
679 
680 	/*
681 	 * If we're going to free the page when we're done
682 	 * then we can let others try to use it starting now.
683 	 * We'll detect the fact that they used it when the
684 	 * i/o is done and avoid freeing the page.
685 	 */
686 	if (flags & B_FREE)
687 		page_downgrade(pp);
688 
689 
690 	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
691 
692 	return (1);
693 }
694 
695 
696 /*ARGSUSED*/
697 static int
698 marker_constructor(void *buf, void *cdrarg, int kmflags)
699 {
700 	page_t *mark = buf;
701 	bzero(mark, sizeof (page_t));
702 	mark->p_hash = PVN_VPLIST_HASH_TAG;
703 	return (0);
704 }
705 
706 void
707 pvn_init()
708 {
709 	if (pvn_vmodsort_disable == 0)
710 		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
711 	marker_cache = kmem_cache_create("marker_cache",
712 	    sizeof (page_t), 0, marker_constructor,
713 	    NULL, NULL, NULL, NULL, 0);
714 }
715 
716 
717 /*
718  * Process a vnode's page list for all pages whose offset is >= off.
719  * Pages are to either be free'd, invalidated, or written back to disk.
720  *
721  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
722  * is specified, otherwise they are "shared" locked.  Returns 0, the first
723  * error from putapage(), or EAGAIN if B_ASYNC is set and VVMLOCK is held.
724  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
725  *
726  * Special marker page_t's are inserted in the list in order
727  * to keep track of where we are in the list when locks are dropped.
728  *
729  * Note the list is circular and insertions can happen only at the
730  * head and tail of the list. The algorithm ensures visiting all pages
731  * on the list in the following way:
732  *
733  *    Drop two marker pages at the end of the list.
734  *
735  *    Move one marker page backwards towards the start of the list until
736  *    it is at the list head, processing the pages passed along the way.
737  *
738  *    Due to race conditions when the vphm mutex is dropped, additional pages
739  *    can be added to either end of the list, so we'll continue to move
740  *    the marker and process pages until it is up against the end marker.
741  *
742  * There is one special exit condition. If we are processing a VMODSORT
743  * vnode and only writing back modified pages, we can stop as soon as
744  * we run into an unmodified page.  This makes fsync(3) operations fast.
745  */
746 int
747 pvn_vplist_dirty(
748 	vnode_t		*vp,
749 	u_offset_t	off,
750 	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
751 			size_t *, int, cred_t *),
752 	int		flags,
753 	cred_t		*cred)
754 {
755 	page_t		*pp;
756 	page_t		*mark;		/* marker page that moves toward head */
757 	page_t		*end;		/* marker page at end of list */
758 	int		err = 0;	/* first putapage() error; returned */
759 	int		error;
760 	kmutex_t	*vphm;
761 	se_t		se;
762 	page_t		**where_to_move; /* slot where mark is re-inserted */
763 
764 	ASSERT(vp->v_type != VCHR);
765 
766 	if (vp->v_pages == NULL)
767 		return (0);
768 
769 
770 	/*
771 	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
772 	 *
773 	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
774 	 * from getting blocked while flushing pages to a dead NFS server.
775 	 */
776 	mutex_enter(&vp->v_lock);
777 	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
778 		mutex_exit(&vp->v_lock);
779 		return (EAGAIN);
780 	}
781 
782 	while (vp->v_flag & VVMLOCK)
783 		cv_wait(&vp->v_cv, &vp->v_lock);
784 
785 	if (vp->v_pages == NULL) {
786 		mutex_exit(&vp->v_lock);
787 		return (0);
788 	}
789 
790 	vp->v_flag |= VVMLOCK;
791 	mutex_exit(&vp->v_lock);
792 
793 
794 	/*
795 	 * Set up the marker pages used to walk the list
796 	 */
797 	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
798 	end->p_vnode = vp;
799 	end->p_offset = (u_offset_t)-2;
800 	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
801 	mark->p_vnode = vp;
802 	mark->p_offset = (u_offset_t)-1;
803 
804 	/*
805 	 * Grab the lock protecting the vnode's page list
806 	 * note that this lock is dropped at times in the loop.
807 	 */
808 	vphm = page_vnode_mutex(vp);
809 	mutex_enter(vphm);
810 	if (vp->v_pages == NULL)
811 		goto leave;
812 
813 	/*
814 	 * insert the markers and loop through the list of pages
815 	 */
816 	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
817 	page_vpadd(&mark->p_vpnext, end);
818 	for (;;) {
819 
820 		/*
821 		 * If only doing an async write back, then we can
822 		 * stop as soon as we get to start of the list.
823 		 */
824 		if (flags == B_ASYNC && vp->v_pages == mark)
825 			break;
826 
827 		/*
828 		 * otherwise stop when we've gone through all the pages
829 		 */
830 		if (mark->p_vpprev == end)
831 			break;
832 
833 		pp = mark->p_vpprev;
834 		if (vp->v_pages == pp)
835 			where_to_move = &vp->v_pages;
836 		else
837 			where_to_move = &pp->p_vpprev->p_vpnext;
838 
839 		ASSERT(pp->p_vnode == vp);
840 
841 		/*
842 		 * If just flushing dirty pages to disk and this vnode
843 		 * is using a sorted list of pages, we can stop processing
844 		 * as soon as we find an unmodified page, since all the
845 		 * modified pages are visited first.
846 		 */
847 		if (IS_VMODSORT(vp) &&
848 		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
849 			if (!hat_ismod(pp) && !page_io_locked(pp)) {
850 #ifdef  DEBUG
851 				/*
852 				 * For debug kernels examine what should be
853 				 * all the remaining clean pages, asserting
854 				 * that they are not modified.
855 				 */
856 				page_t	*chk = pp;
857 				int	attr;
858 
859 				page_vpsub(&vp->v_pages, mark);
860 				page_vpadd(where_to_move, mark);
861 				do {
862 					chk = chk->p_vpprev;
863 					ASSERT(chk != end);
864 					if (chk == mark)
865 						continue;
866 					attr = hat_page_getattr(chk, P_MOD |
867 					    P_REF);
868 					if ((attr & P_MOD) == 0)
869 						continue;
870 					panic("v_pages list not all clean: "
871 					    "page_t*=%p vnode=%p off=%lx "
872 					    "attr=0x%x last clean page_t*=%p\n",
873 					    (void *)chk, (void *)chk->p_vnode,
874 					    (long)chk->p_offset, attr,
875 					    (void *)pp);
876 				} while (chk != vp->v_pages);
877 #endif
878 				break;
879 			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
880 				/*
881 				 * Unmodified but io locked: wait until IO is
882 				 * done, then retry.  Block only for sync IO
883 				 * since we don't want to block async IO.
884 				 */
885 				mutex_exit(vphm);
886 				page_io_wait(pp);
887 				mutex_enter(vphm);
888 				continue;
889 			}
890 		}
891 
892 		/*
893 		 * Skip this page if the offset is out of the desired range.
894 		 * Just move the marker and continue.
895 		 */
896 		if (pp->p_offset < off) {
897 			page_vpsub(&vp->v_pages, mark);
898 			page_vpadd(where_to_move, mark);
899 			continue;
900 		}
901 
902 		/*
903 		 * If we are supposed to invalidate or free this
904 		 * page, then we need an exclusive lock.
905 		 */
906 		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
907 
908 		/*
909 		 * We must acquire the page lock for all synchronous
910 		 * operations (invalidate, free and write).
911 		 */
912 		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
913 			/*
914 			 * If the page_lock() drops the mutex
915 			 * we must retry the loop.
916 			 */
917 			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
918 				continue;
919 
920 			/*
921 			 * It's ok to move the marker page now.
922 			 */
923 			page_vpsub(&vp->v_pages, mark);
924 			page_vpadd(where_to_move, mark);
925 		} else {
926 
927 			/*
928 			 * update the marker page for all remaining cases
929 			 */
930 			page_vpsub(&vp->v_pages, mark);
931 			page_vpadd(where_to_move, mark);
932 
933 			/*
934 			 * For write backs, If we can't lock the page, it's
935 			 * invalid or in the process of being destroyed.  Skip
936 			 * it, assuming someone else is writing it.
937 			 */
938 			if (!page_trylock(pp, se))
939 				continue;
940 		}
941 
942 		ASSERT(pp->p_vnode == vp);
943 
944 		/*
945 		 * Successfully locked the page, now figure out what to
946 		 * do with it. Free pages are easily dealt with, invalidate
947 		 * if desired or just go on to the next page.
948 		 */
949 		if (PP_ISFREE(pp)) {
950 			if ((flags & B_INVAL) == 0) {
951 				page_unlock(pp);
952 				continue;
953 			}
954 
955 			/*
956 			 * Invalidate (destroy) the page.
957 			 */
958 			mutex_exit(vphm);
959 			page_destroy_free(pp);
960 			mutex_enter(vphm);
961 			continue;
962 		}
963 
964 		/*
965 		 * pvn_getdirty() figures out what to do with a dirty page.
966 		 * If the page is dirty, the putapage() routine will write it
967 		 * and will kluster any other adjacent dirty pages it can.
968 		 *
969 		 * pvn_getdirty() and `(*putapage)' unlock the page.
970 		 */
971 		mutex_exit(vphm);
972 		if (pvn_getdirty(pp, flags)) {
973 			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
974 			if (!err)
975 				err = error;
976 		}
977 		mutex_enter(vphm);
978 	}
979 	page_vpsub(&vp->v_pages, mark);
980 	page_vpsub(&vp->v_pages, end);
981 
982 leave:
983 	/*
984 	 * Release v_pages mutex, free the markers, then clear VVMLOCK
985 	 * and wake up any threads blocked on it.
986 	 */
987 	mutex_exit(vphm);
988 	kmem_cache_free(marker_cache, mark);
989 	kmem_cache_free(marker_cache, end);
990 	mutex_enter(&vp->v_lock);
991 	vp->v_flag &= ~VVMLOCK;
992 	cv_broadcast(&vp->v_cv);
993 	mutex_exit(&vp->v_lock);
994 	return (err);
995 }
995 
996 /*
997  * Walk the vp->v_pages list, for every page call the callback function
998  * pointed by *page_check. If page_check returns non-zero, then mark the
999  * page as modified and if VMODSORT is set, move it to the end of v_pages
1000  * list. Moving makes sense only if we have at least two pages - this also
1001  * avoids having v_pages temporarily being NULL after calling page_vpsub()
1002  * if there was just one page.
1003  */
1004 void
1005 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1006 {
1007 	page_t	*pp, *next, *end;
1008 	kmutex_t	*vphm;
1009 	int	shuffle;
1010 
1011 	vphm = page_vnode_mutex(vp);
1012 	mutex_enter(vphm);
1013 
1014 	if (vp->v_pages == NULL) {
1015 		mutex_exit(vphm);
1016 		return;
1017 	}
1018 
1019 	end = vp->v_pages->p_vpprev;
1020 	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1021 	pp = vp->v_pages;
1022 
1023 	for (;;) {
1024 		next = pp->p_vpnext;
1025 		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1026 			/*
1027 			 * hat_setmod_only() in contrast to hat_setmod() does
1028 			 * not shuffle the pages and does not grab the mutex
1029 			 * page_vnode_mutex. Exactly what we need.
1030 			 */
1031 			hat_setmod_only(pp);
1032 			if (shuffle) {
1033 				page_vpsub(&vp->v_pages, pp);
1034 				ASSERT(vp->v_pages != NULL);
1035 				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1036 				    pp);
1037 			}
1038 		}
1039 		/* Stop if we have just processed the last page. */
1040 		if (pp == end)
1041 			break;
1042 		pp = next;
1043 	}
1044 
1045 	mutex_exit(vphm);
1046 }
1047 
1048 /*
1049  * Zero out zbytes worth of data. Caller should be aware that this
1050  * routine may enter back into the fs layer (xxx_getpage). Locks
1051  * that the xxx_getpage routine may need should not be held while
1052  * calling this.
1053  */
1054 void
1055 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1056 {
1057 	caddr_t addr;
1058 
1059 	ASSERT(vp->v_type != VCHR);
1060 
1061 	if (vp->v_pages == NULL)
1062 		return;
1063 
1064 	/*
1065 	 * zbytes may be zero but there still may be some portion of
1066 	 * a page which needs clearing (since zbytes is a function
1067 	 * of filesystem block size, not pagesize.)
1068 	 */
1069 	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1070 		return;
1071 
1072 	/*
1073 	 * We get the last page and handle the partial
1074 	 * zeroing via kernel mappings.  This will make the page
1075 	 * dirty so that we know that when this page is written
1076 	 * back, the zeroed information will go out with it.  If
1077 	 * the page is not currently in memory, then the kzero
1078 	 * operation will cause it to be brought it.  We use kzero
1079 	 * instead of bzero so that if the page cannot be read in
1080 	 * for any reason, the system will not panic.  We need
1081 	 * to zero out a minimum of the fs given zbytes, but we
1082 	 * might also have to do more to get the entire last page.
1083 	 */
1084 
1085 	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1086 		panic("pvn_vptrunc zbytes");
1087 	addr = segmap_getmapflt(segkmap, vp, vplen,
1088 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1089 	(void) kzero(addr + (vplen & MAXBOFFSET),
1090 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1091 	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1092 }
1093 
1094 /*
1095  * Handles common work of the VOP_GETPAGE routines when more than
1096  * one page must be returned by calling a file system specific operation
1097  * to do most of the work.  Must be called with the vp already locked
1098  * by the VOP_GETPAGE routine.
1099  */
1100 int
1101 pvn_getpages(
1102 	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1103 		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1104 	struct vnode *vp,
1105 	u_offset_t off,
1106 	size_t len,
1107 	uint_t *protp,
1108 	page_t *pl[],
1109 	size_t plsz,
1110 	struct seg *seg,
1111 	caddr_t addr,
1112 	enum seg_rw rw,
1113 	struct cred *cred)
1114 {
1115 	page_t **ppp;
1116 	u_offset_t o, eoff;
1117 	size_t sz, xlen;
1118 	int err;
1119 
1120 	ASSERT(plsz >= len);		/* insure that we have enough space */
1121 
1122 	/*
1123 	 * Loop one page at a time and let getapage function fill
1124 	 * in the next page in array.  We only allow one page to be
1125 	 * returned at a time (except for the last page) so that we
1126 	 * don't have any problems with duplicates and other such
1127 	 * painful problems.  This is a very simple minded algorithm,
1128 	 * but it does the job correctly.  We hope that the cost of a
1129 	 * getapage call for a resident page that we might have been
1130 	 * able to get from an earlier call doesn't cost too much.
1131 	 */
1132 	ppp = pl;
1133 	sz = PAGESIZE;
1134 	eoff = off + len;
1135 	xlen = len;
1136 	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1137 	    xlen -= PAGESIZE) {
1138 		if (o + PAGESIZE >= eoff) {
1139 			/*
1140 			 * Last time through - allow the all of
1141 			 * what's left of the pl[] array to be used.
1142 			 */
1143 			sz = plsz - (o - off);
1144 		}
1145 		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1146 		    rw, cred);
1147 		if (err) {
1148 			/*
1149 			 * Release any pages we already got.
1150 			 */
1151 			if (o > off && pl != NULL) {
1152 				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1153 					(void) page_release(*ppp, 1);
1154 			}
1155 			break;
1156 		}
1157 		if (pl != NULL)
1158 			ppp++;
1159 	}
1160 	return (err);
1161 }
1162 
1163 /*
1164  * Initialize the page list array.
1165  */
1166 /*ARGSUSED*/
1167 void
1168 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1169     u_offset_t off, size_t io_len, enum seg_rw rw)
1170 {
1171 	ssize_t sz;
1172 	page_t *ppcur, **ppp;
1173 
1174 	/*
1175 	 * Set up to load plsz worth
1176 	 * starting at the needed page.
1177 	 */
1178 	while (pp != NULL && pp->p_offset != off) {
1179 		/*
1180 		 * Remove page from the i/o list,
1181 		 * release the i/o and the page lock.
1182 		 */
1183 		ppcur = pp;
1184 		page_sub(&pp, ppcur);
1185 		page_io_unlock(ppcur);
1186 		(void) page_release(ppcur, 1);
1187 	}
1188 
1189 	if (pp == NULL) {
1190 		pl[0] = NULL;
1191 		return;
1192 	}
1193 
1194 	sz = plsz;
1195 
1196 	/*
1197 	 * Initialize the page list array.
1198 	 */
1199 	ppp = pl;
1200 	do {
1201 		ppcur = pp;
1202 		*ppp++ = ppcur;
1203 		page_sub(&pp, ppcur);
1204 		page_io_unlock(ppcur);
1205 		if (rw != S_CREATE)
1206 			page_downgrade(ppcur);
1207 		sz -= PAGESIZE;
1208 	} while (sz > 0 && pp != NULL);
1209 	*ppp = NULL;		/* terminate list */
1210 
1211 	/*
1212 	 * Now free the remaining pages that weren't
1213 	 * loaded in the page list.
1214 	 */
1215 	while (pp != NULL) {
1216 		ppcur = pp;
1217 		page_sub(&pp, ppcur);
1218 		page_io_unlock(ppcur);
1219 		(void) page_release(ppcur, 1);
1220 	}
1221 }
1222