xref: /illumos-gate/usr/src/uts/common/vm/vm_pvn.c (revision ddb365bfc9e868ad24ccdcb0dc91af18b10df082)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 /*
40  * VM - paged vnode.
41  *
42  * This file supplies vm support for the vnode operations that deal with pages.
43  */
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/sysmacros.h>
48 #include <sys/systm.h>
49 #include <sys/time.h>
50 #include <sys/buf.h>
51 #include <sys/vnode.h>
52 #include <sys/uio.h>
53 #include <sys/vmsystm.h>
54 #include <sys/mman.h>
55 #include <sys/vfs.h>
56 #include <sys/cred.h>
57 #include <sys/user.h>
58 #include <sys/kmem.h>
59 #include <sys/cmn_err.h>
60 #include <sys/debug.h>
61 #include <sys/cpuvar.h>
62 #include <sys/vtrace.h>
63 
64 #include <vm/hat.h>
65 #include <vm/as.h>
66 #include <vm/seg.h>
67 #include <vm/rm.h>
68 #include <vm/pvn.h>
69 #include <vm/page.h>
70 #include <vm/seg_map.h>
71 #include <vm/seg_kmem.h>
72 #include <sys/fs/swapnode.h>
73 
74 int pvn_nofodklust = 0;
75 int pvn_write_noklust = 0;
76 
77 uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
78 uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
79 					/* support for vmodsort for testing */
80 
81 static struct kmem_cache *marker_cache = NULL;
82 
83 /*
84  * Find the largest contiguous block which contains `addr' for file offset
85  * `offset' in it while living within the file system block sizes (`vp_off'
86  * and `vp_len') and the address space limits for which no pages currently
87  * exist and which map to consecutive file offsets.
88  */
89 page_t *
90 pvn_read_kluster(
91 	struct vnode *vp,
92 	u_offset_t off,
93 	struct seg *seg,
94 	caddr_t addr,
95 	u_offset_t *offp,			/* return values */
96 	size_t *lenp,				/* return values */
97 	u_offset_t vp_off,
98 	size_t vp_len,
99 	int isra)
100 {
101 	ssize_t deltaf, deltab;
102 	page_t *pp;
103 	page_t *plist = NULL;
104 	spgcnt_t pagesavail;
105 	u_offset_t vp_end;
106 
107 	ASSERT(off >= vp_off && off < vp_off + vp_len);
108 
109 	/*
110 	 * We only want to do klustering/read ahead if there
111 	 * is more than minfree pages currently available.
112 	 */
113 	pagesavail = freemem - minfree;
114 
115 	if (pagesavail <= 0)
116 		if (isra)
117 			return ((page_t *)NULL);    /* ra case - give up */
118 		else
119 			pagesavail = 1;		    /* must return a page */
120 
121 	/* We calculate in pages instead of bytes due to 32-bit overflows */
122 	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
123 		/*
124 		 * Don't have enough free memory for the
125 		 * max request, try sizing down vp request.
126 		 */
127 		deltab = (ssize_t)(off - vp_off);
128 		vp_len -= deltab;
129 		vp_off += deltab;
130 		if (pagesavail < btopr(vp_len)) {
131 			/*
132 			 * Still not enough memory, just settle for
133 			 * pagesavail which is at least 1.
134 			 */
135 			vp_len = ptob(pagesavail);
136 		}
137 	}
138 
139 	vp_end = vp_off + vp_len;
140 	ASSERT(off >= vp_off && off < vp_end);
141 
142 	if (isra && SEGOP_KLUSTER(seg, addr, 0))
143 		return ((page_t *)NULL);	/* segment driver says no */
144 
145 	if ((plist = page_create_va(vp, off,
146 	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
147 		return ((page_t *)NULL);
148 
149 	if (vp_len <= PAGESIZE || pvn_nofodklust) {
150 		*offp = off;
151 		*lenp = MIN(vp_len, PAGESIZE);
152 	} else {
153 		/*
154 		 * Scan back from front by incrementing "deltab" and
155 		 * comparing "off" with "vp_off + deltab" to avoid
156 		 * "signed" versus "unsigned" conversion problems.
157 		 */
158 		for (deltab = PAGESIZE; off >= vp_off + deltab;
159 		    deltab += PAGESIZE) {
160 			/*
161 			 * Call back to the segment driver to verify that
162 			 * the klustering/read ahead operation makes sense.
163 			 */
164 			if (SEGOP_KLUSTER(seg, addr, -deltab))
165 				break;		/* page not eligible */
166 			if ((pp = page_create_va(vp, off - deltab,
167 			    PAGESIZE, PG_EXCL, seg, addr - deltab))
168 			    == NULL)
169 				break;		/* already have the page */
170 			/*
171 			 * Add page to front of page list.
172 			 */
173 			page_add(&plist, pp);
174 		}
175 		deltab -= PAGESIZE;
176 
177 		/* scan forward from front */
178 		for (deltaf = PAGESIZE; off + deltaf < vp_end;
179 		    deltaf += PAGESIZE) {
180 			/*
181 			 * Call back to the segment driver to verify that
182 			 * the klustering/read ahead operation makes sense.
183 			 */
184 			if (SEGOP_KLUSTER(seg, addr, deltaf))
185 				break;		/* page not file extension */
186 			if ((pp = page_create_va(vp, off + deltaf,
187 			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
188 			    == NULL)
189 				break;		/* already have page */
190 
191 			/*
192 			 * Add page to end of page list.
193 			 */
194 			page_add(&plist, pp);
195 			plist = plist->p_next;
196 		}
197 		*offp = off = off - deltab;
198 		*lenp = deltab + deltaf;
199 		ASSERT(off >= vp_off);
200 
201 		/*
202 		 * If we ended up getting more than was actually
203 		 * requested, retract the returned length to only
204 		 * reflect what was requested.  This might happen
205 		 * if we were allowed to kluster pages across a
206 		 * span of (say) 5 frags, and frag size is less
207 		 * than PAGESIZE.  We need a whole number of
208 		 * pages to contain those frags, but the returned
209 		 * size should only allow the returned range to
210 		 * extend as far as the end of the frags.
211 		 */
212 		if ((vp_off + vp_len) < (off + *lenp)) {
213 			ASSERT(vp_end > off);
214 			*lenp = vp_end - off;
215 		}
216 	}
217 	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
218 	    "pvn_read_kluster:seg %p addr %x isra %x",
219 	    seg, addr, isra);
220 	return (plist);
221 }
222 
223 /*
224  * Handle pages for this vnode on either side of the page "pp"
225  * which has been locked by the caller.  This routine will also
226  * do klustering in the range [vp_off, vp_off + vp_len] up
227  * until a page which is not found.  The offset and length
228  * of pages included is returned in "*offp" and "*lenp".
229  *
230  * Returns a list of dirty locked pages all ready to be
231  * written back.
232  */
233 page_t *
234 pvn_write_kluster(
235 	struct vnode *vp,
236 	page_t *pp,
237 	u_offset_t *offp,		/* return values */
238 	size_t *lenp,			/* return values */
239 	u_offset_t vp_off,
240 	size_t vp_len,
241 	int flags)
242 {
243 	u_offset_t off;
244 	page_t *dirty;
245 	size_t deltab, deltaf;
246 	se_t se;
247 	u_offset_t vp_end;
248 
249 	off = pp->p_offset;
250 
251 	/*
252 	 * Kustering should not be done if we are invalidating
253 	 * pages since we could destroy pages that belong to
254 	 * some other process if this is a swap vnode.
255 	 */
256 	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
257 		*offp = off;
258 		*lenp = PAGESIZE;
259 		return (pp);
260 	}
261 
262 	if (flags & (B_FREE | B_INVAL))
263 		se = SE_EXCL;
264 	else
265 		se = SE_SHARED;
266 
267 	dirty = pp;
268 	/*
269 	 * Scan backwards looking for pages to kluster by incrementing
270 	 * "deltab" and comparing "off" with "vp_off + deltab" to
271 	 * avoid "signed" versus "unsigned" conversion problems.
272 	 */
273 	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
274 		pp = page_lookup_nowait(vp, off - deltab, se);
275 		if (pp == NULL)
276 			break;		/* page not found */
277 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
278 			break;
279 		page_add(&dirty, pp);
280 	}
281 	deltab -= PAGESIZE;
282 
283 	vp_end = vp_off + vp_len;
284 	/* now scan forwards looking for pages to kluster */
285 	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
286 		pp = page_lookup_nowait(vp, off + deltaf, se);
287 		if (pp == NULL)
288 			break;		/* page not found */
289 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
290 			break;
291 		page_add(&dirty, pp);
292 		dirty = dirty->p_next;
293 	}
294 
295 	*offp = off - deltab;
296 	*lenp = deltab + deltaf;
297 	return (dirty);
298 }
299 
300 /*
301  * Generic entry point used to release the "shared/exclusive" lock
302  * and the "p_iolock" on pages after i/o is complete.
303  */
304 void
305 pvn_io_done(page_t *plist)
306 {
307 	page_t *pp;
308 
309 	while (plist != NULL) {
310 		pp = plist;
311 		page_sub(&plist, pp);
312 		page_io_unlock(pp);
313 		page_unlock(pp);
314 	}
315 }
316 
317 /*
318  * Entry point to be used by file system getpage subr's and
319  * other such routines which either want to unlock pages (B_ASYNC
320  * request) or destroy a list of pages if an error occurred.
321  */
322 void
323 pvn_read_done(page_t *plist, int flags)
324 {
325 	page_t *pp;
326 
327 	while (plist != NULL) {
328 		pp = plist;
329 		page_sub(&plist, pp);
330 		page_io_unlock(pp);
331 		if (flags & B_ERROR) {
332 			/*LINTED: constant in conditional context*/
333 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
334 		} else {
335 			(void) page_release(pp, 0);
336 		}
337 	}
338 }
339 
340 /*
341  * Automagic pageout.
342  * When memory gets tight, start freeing pages popping out of the
343  * write queue.
344  */
345 int	write_free = 1;
346 pgcnt_t	pages_before_pager = 200;	/* LMXXX */
347 
348 /*
349  * Routine to be called when page-out's complete.
350  * The caller, typically VOP_PUTPAGE, has to explicity call this routine
351  * after waiting for i/o to complete (biowait) to free the list of
352  * pages associated with the buffer.  These pages must be locked
353  * before i/o is initiated.
354  *
355  * If a write error occurs, the pages are marked as modified
356  * so the write will be re-tried later.
357  */
358 
359 void
360 pvn_write_done(page_t *plist, int flags)
361 {
362 	int dfree = 0;
363 	int pgrec = 0;
364 	int pgout = 0;
365 	int pgpgout = 0;
366 	int anonpgout = 0;
367 	int anonfree = 0;
368 	int fspgout = 0;
369 	int fsfree = 0;
370 	int execpgout = 0;
371 	int execfree = 0;
372 	page_t *pp;
373 	struct cpu *cpup;
374 	struct vnode *vp = NULL;	/* for probe */
375 	uint_t ppattr;
376 	kmutex_t *vphm = NULL;
377 
378 	ASSERT((flags & B_READ) == 0);
379 
380 	/*
381 	 * If we are about to start paging anyway, start freeing pages.
382 	 */
383 	if (write_free && freemem < lotsfree + pages_before_pager &&
384 	    (flags & B_ERROR) == 0) {
385 		flags |= B_FREE;
386 	}
387 
388 	/*
389 	 * Handle each page involved in the i/o operation.
390 	 */
391 	while (plist != NULL) {
392 		pp = plist;
393 		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
394 		page_sub(&plist, pp);
395 
396 		/* Kernel probe support */
397 		if (vp == NULL)
398 			vp = pp->p_vnode;
399 
400 		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
401 			/*
402 			 * Move page to the top of the v_page list.
403 			 * Skip pages modified during IO.
404 			 */
405 			vphm = page_vnode_mutex(vp);
406 			mutex_enter(vphm);
407 			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
408 				page_vpsub(&vp->v_pages, pp);
409 				page_vpadd(&vp->v_pages, pp);
410 			}
411 			mutex_exit(vphm);
412 		}
413 
414 		if (flags & B_ERROR) {
415 			/*
416 			 * Write operation failed.  We don't want
417 			 * to destroy (or free) the page unless B_FORCE
418 			 * is set. We set the mod bit again and release
419 			 * all locks on the page so that it will get written
420 			 * back again later when things are hopefully
421 			 * better again.
422 			 * If B_INVAL and B_FORCE is set we really have
423 			 * to destroy the page.
424 			 */
425 			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
426 				page_io_unlock(pp);
427 				/*LINTED: constant in conditional context*/
428 				VN_DISPOSE(pp, B_INVAL, 0, kcred);
429 			} else {
430 				hat_setmod_only(pp);
431 				page_io_unlock(pp);
432 				page_unlock(pp);
433 			}
434 		} else if (flags & B_INVAL) {
435 			/*
436 			 * XXX - Failed writes with B_INVAL set are
437 			 * not handled appropriately.
438 			 */
439 			page_io_unlock(pp);
440 			/*LINTED: constant in conditional context*/
441 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
442 		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
443 			/*
444 			 * Update statistics for pages being paged out
445 			 */
446 			if (pp->p_vnode) {
447 				if (IS_SWAPFSVP(pp->p_vnode)) {
448 					anonpgout++;
449 				} else {
450 					if (pp->p_vnode->v_flag & VVMEXEC) {
451 						execpgout++;
452 					} else {
453 						fspgout++;
454 					}
455 				}
456 			}
457 			page_io_unlock(pp);
458 			pgout = 1;
459 			pgpgout++;
460 			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
461 			    "page_ws_out:pp %p", pp);
462 
463 			/*
464 			 * The page_struct_lock need not be acquired to
465 			 * examine "p_lckcnt" and "p_cowcnt" since we'll
466 			 * have an "exclusive" lock if the upgrade succeeds.
467 			 */
468 			if (page_tryupgrade(pp) &&
469 			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
470 				/*
471 				 * Check if someone has reclaimed the
472 				 * page.  If ref and mod are not set, no
473 				 * one is using it so we can free it.
474 				 * The rest of the system is careful
475 				 * to use the NOSYNC flag to unload
476 				 * translations set up for i/o w/o
477 				 * affecting ref and mod bits.
478 				 *
479 				 * Obtain a copy of the real hardware
480 				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
481 				 * to avoid having to flush the cache.
482 				 */
483 				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
484 				    HAT_SYNC_STOPON_MOD);
485 			ck_refmod:
486 				if (!(ppattr & (P_REF | P_MOD))) {
487 					if (hat_page_is_mapped(pp)) {
488 						/*
489 						 * Doesn't look like the page
490 						 * was modified so now we
491 						 * really have to unload the
492 						 * translations.  Meanwhile
493 						 * another CPU could've
494 						 * modified it so we have to
495 						 * check again.  We don't loop
496 						 * forever here because now
497 						 * the translations are gone
498 						 * and no one can get a new one
499 						 * since we have the "exclusive"
500 						 * lock on the page.
501 						 */
502 						(void) hat_pageunload(pp,
503 						    HAT_FORCE_PGUNLOAD);
504 						ppattr = hat_page_getattr(pp,
505 						    P_REF | P_MOD);
506 						goto ck_refmod;
507 					}
508 					/*
509 					 * Update statistics for pages being
510 					 * freed
511 					 */
512 					if (pp->p_vnode) {
513 						if (IS_SWAPFSVP(pp->p_vnode)) {
514 							anonfree++;
515 						} else {
516 							if (pp->p_vnode->v_flag
517 							    & VVMEXEC) {
518 								execfree++;
519 							} else {
520 								fsfree++;
521 							}
522 						}
523 					}
524 					/*LINTED: constant in conditional ctx*/
525 					VN_DISPOSE(pp, B_FREE,
526 					    (flags & B_DONTNEED), kcred);
527 					dfree++;
528 				} else {
529 					page_unlock(pp);
530 					pgrec++;
531 					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
532 					    "page_ws_free:pp %p", pp);
533 				}
534 			} else {
535 				/*
536 				 * Page is either `locked' in memory
537 				 * or was reclaimed and now has a
538 				 * "shared" lock, so release it.
539 				 */
540 				page_unlock(pp);
541 			}
542 		} else {
543 			/*
544 			 * Neither B_FREE nor B_INVAL nor B_ERROR.
545 			 * Just release locks.
546 			 */
547 			page_io_unlock(pp);
548 			page_unlock(pp);
549 		}
550 	}
551 
552 	CPU_STATS_ENTER_K();
553 	cpup = CPU;		/* get cpup now that CPU cannot change */
554 	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
555 	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
556 	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
557 	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
558 	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
559 	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
560 	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
561 	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
562 	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
563 	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
564 	CPU_STATS_EXIT_K();
565 }
566 
567 /*
568  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
569  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
570  * operation and is only to be considered if it doesn't involve any
571  * waiting here.  B_TRUNC indicates that the file is being truncated
572  * and so no i/o needs to be done. B_FORCE indicates that the page
573  * must be destroyed so don't try wrting it out.
574  *
575  * The caller must ensure that the page is locked.  Returns 1, if
576  * the page should be written back (the "iolock" is held in this
577  * case), or 0 if the page has been dealt with or has been
578  * unlocked.
579  */
580 int
581 pvn_getdirty(page_t *pp, int flags)
582 {
583 	ASSERT((flags & (B_INVAL | B_FREE)) ?
584 	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
585 	ASSERT(PP_ISFREE(pp) == 0);
586 
587 	/*
588 	 * If trying to invalidate or free a logically `locked' page,
589 	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
590 	 * p_cowcnt as the page is exclusively locked.
591 	 */
592 	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
593 	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
594 		page_unlock(pp);
595 		return (0);
596 	}
597 
598 	/*
599 	 * Now acquire the i/o lock so we can add it to the dirty
600 	 * list (if necessary).  We avoid blocking on the i/o lock
601 	 * in the following cases:
602 	 *
603 	 *	If B_DELWRI is set, which implies that this request is
604 	 *	due to a klustering operartion.
605 	 *
606 	 *	If this is an async (B_ASYNC) operation and we are not doing
607 	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
608 	 *	that the the page is written out].
609 	 */
610 	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
611 		if (!page_io_trylock(pp)) {
612 			page_unlock(pp);
613 			return (0);
614 		}
615 	} else {
616 		page_io_lock(pp);
617 	}
618 
619 	/*
620 	 * If we want to free or invalidate the page then
621 	 * we need to unload it so that anyone who wants
622 	 * it will have to take a minor fault to get it.
623 	 * Otherwise, we're just writing the page back so we
624 	 * need to sync up the hardwre and software mod bit to
625 	 * detect any future modifications.  We clear the
626 	 * software mod bit when we put the page on the dirty
627 	 * list.
628 	 */
629 	if (flags & (B_INVAL | B_FREE)) {
630 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
631 	} else {
632 		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
633 	}
634 
635 	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
636 		/*
637 		 * Don't need to add it to the
638 		 * list after all.
639 		 */
640 		page_io_unlock(pp);
641 		if (flags & B_INVAL) {
642 			/*LINTED: constant in conditional context*/
643 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
644 		} else if (flags & B_FREE) {
645 			/*LINTED: constant in conditional context*/
646 			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
647 		} else {
648 			/*
649 			 * This is advisory path for the callers
650 			 * of VOP_PUTPAGE() who prefer freeing the
651 			 * page _only_ if no one else is accessing it.
652 			 * E.g. segmap_release()
653 			 *
654 			 * The above hat_ismod() check is useless because:
655 			 * (1) we may not be holding SE_EXCL lock;
656 			 * (2) we've not unloaded _all_ translations
657 			 *
658 			 * Let page_release() do the heavy-lifting.
659 			 */
660 			(void) page_release(pp, 1);
661 		}
662 		return (0);
663 	}
664 
665 	/*
666 	 * Page is dirty, get it ready for the write back
667 	 * and add page to the dirty list.
668 	 */
669 	hat_clrrefmod(pp);
670 
671 	/*
672 	 * If we're going to free the page when we're done
673 	 * then we can let others try to use it starting now.
674 	 * We'll detect the fact that they used it when the
675 	 * i/o is done and avoid freeing the page.
676 	 */
677 	if (flags & B_FREE)
678 		page_downgrade(pp);
679 
680 
681 	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
682 
683 	return (1);
684 }
685 
686 
687 /*ARGSUSED*/
688 static int
689 marker_constructor(void *buf, void *cdrarg, int kmflags)
690 {
691 	page_t *mark = buf;
692 	bzero(mark, sizeof (page_t));
693 	mark->p_hash = PVN_VPLIST_HASH_TAG;
694 	return (0);
695 }
696 
697 void
698 pvn_init()
699 {
700 	if (pvn_vmodsort_disable == 0)
701 		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
702 	marker_cache = kmem_cache_create("marker_cache",
703 	    sizeof (page_t), 0, marker_constructor,
704 	    NULL, NULL, NULL, NULL, 0);
705 }
706 
707 
708 /*
709  * Process a vnode's page list for all pages whose offset is >= off.
710  * Pages are to either be free'd, invalidated, or written back to disk.
711  *
712  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
713  * is specified, otherwise they are "shared" locked.
714  *
715  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
716  *
717  * Special marker page_t's are inserted in the list in order
718  * to keep track of where we are in the list when locks are dropped.
719  *
720  * Note the list is circular and insertions can happen only at the
721  * head and tail of the list. The algorithm ensures visiting all pages
722  * on the list in the following way:
723  *
724  *    Drop two marker pages at the end of the list.
725  *
726  *    Move one marker page backwards towards the start of the list until
727  *    it is at the list head, processing the pages passed along the way.
728  *
729  *    Due to race conditions when the vphm mutex is dropped, additional pages
730  *    can be added to either end of the list, so we'll continue to move
731  *    the marker and process pages until it is up against the end marker.
732  *
733  * There is one special exit condition. If we are processing a VMODSORT
734  * vnode and only writing back modified pages, we can stop as soon as
735  * we run into an unmodified page.  This makes fsync(3) operations fast.
736  */
737 int
738 pvn_vplist_dirty(
739 	vnode_t		*vp,
740 	u_offset_t	off,
741 	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
742 			size_t *, int, cred_t *),
743 	int		flags,
744 	cred_t		*cred)
745 {
746 	page_t		*pp;
747 	page_t		*mark;		/* marker page that moves toward head */
748 	page_t		*end;		/* marker page at end of list */
749 	int		err = 0;
750 	int		error;
751 	kmutex_t	*vphm;
752 	se_t		se;
753 	page_t		**where_to_move;
754 
755 	ASSERT(vp->v_type != VCHR);
756 
757 	if (vp->v_pages == NULL)
758 		return (0);
759 
760 
761 	/*
762 	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
763 	 *
764 	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
765 	 * from getting blocked while flushing pages to a dead NFS server.
766 	 */
767 	mutex_enter(&vp->v_lock);
768 	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
769 		mutex_exit(&vp->v_lock);
770 		return (EAGAIN);
771 	}
772 
773 	while (vp->v_flag & VVMLOCK)
774 		cv_wait(&vp->v_cv, &vp->v_lock);
775 
776 	if (vp->v_pages == NULL) {
777 		mutex_exit(&vp->v_lock);
778 		return (0);
779 	}
780 
781 	vp->v_flag |= VVMLOCK;
782 	mutex_exit(&vp->v_lock);
783 
784 
785 	/*
786 	 * Set up the marker pages used to walk the list
787 	 */
788 	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
789 	end->p_vnode = vp;
790 	end->p_offset = (u_offset_t)-2;
791 	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
792 	mark->p_vnode = vp;
793 	mark->p_offset = (u_offset_t)-1;
794 
795 	/*
796 	 * Grab the lock protecting the vnode's page list
797 	 * note that this lock is dropped at times in the loop.
798 	 */
799 	vphm = page_vnode_mutex(vp);
800 	mutex_enter(vphm);
801 	if (vp->v_pages == NULL)
802 		goto leave;
803 
804 	/*
805 	 * insert the markers and loop through the list of pages
806 	 */
807 	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
808 	page_vpadd(&mark->p_vpnext, end);
809 	for (;;) {
810 
811 		/*
812 		 * If only doing an async write back, then we can
813 		 * stop as soon as we get to start of the list.
814 		 */
815 		if (flags == B_ASYNC && vp->v_pages == mark)
816 			break;
817 
818 		/*
819 		 * otherwise stop when we've gone through all the pages
820 		 */
821 		if (mark->p_vpprev == end)
822 			break;
823 
824 		pp = mark->p_vpprev;
825 		if (vp->v_pages == pp)
826 			where_to_move = &vp->v_pages;
827 		else
828 			where_to_move = &pp->p_vpprev->p_vpnext;
829 
830 		ASSERT(pp->p_vnode == vp);
831 
832 		/*
833 		 * If just flushing dirty pages to disk and this vnode
834 		 * is using a sorted list of pages, we can stop processing
835 		 * as soon as we find an unmodified page. Since all the
836 		 * modified pages are visited first.
837 		 */
838 		if (IS_VMODSORT(vp) &&
839 		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
840 			if (!hat_ismod(pp) && !page_io_locked(pp)) {
841 #ifdef  DEBUG
842 				/*
843 				 * For debug kernels examine what should be
844 				 * all the remaining clean pages, asserting
845 				 * that they are not modified.
846 				 */
847 				page_t	*chk = pp;
848 				int	attr;
849 
850 				page_vpsub(&vp->v_pages, mark);
851 				page_vpadd(where_to_move, mark);
852 				do {
853 					chk = chk->p_vpprev;
854 					ASSERT(chk != end);
855 					if (chk == mark)
856 						continue;
857 					attr = hat_page_getattr(chk, P_MOD |
858 					    P_REF);
859 					if ((attr & P_MOD) == 0)
860 						continue;
861 					panic("v_pages list not all clean: "
862 					    "page_t*=%p vnode=%p off=%lx "
863 					    "attr=0x%x last clean page_t*=%p\n",
864 					    (void *)chk, (void *)chk->p_vnode,
865 					    (long)chk->p_offset, attr,
866 					    (void *)pp);
867 				} while (chk != vp->v_pages);
868 #endif
869 				break;
870 			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
871 				/*
872 				 * Couldn't get io lock, wait until IO is done.
873 				 * Block only for sync IO since we don't want
874 				 * to block async IO.
875 				 */
876 				mutex_exit(vphm);
877 				page_io_wait(pp);
878 				mutex_enter(vphm);
879 				continue;
880 			}
881 		}
882 
883 		/*
884 		 * Skip this page if the offset is out of the desired range.
885 		 * Just move the marker and continue.
886 		 */
887 		if (pp->p_offset < off) {
888 			page_vpsub(&vp->v_pages, mark);
889 			page_vpadd(where_to_move, mark);
890 			continue;
891 		}
892 
893 		/*
894 		 * If we are supposed to invalidate or free this
895 		 * page, then we need an exclusive lock.
896 		 */
897 		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
898 
899 		/*
900 		 * We must acquire the page lock for all synchronous
901 		 * operations (invalidate, free and write).
902 		 */
903 		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
904 			/*
905 			 * If the page_lock() drops the mutex
906 			 * we must retry the loop.
907 			 */
908 			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
909 				continue;
910 
911 			/*
912 			 * It's ok to move the marker page now.
913 			 */
914 			page_vpsub(&vp->v_pages, mark);
915 			page_vpadd(where_to_move, mark);
916 		} else {
917 
918 			/*
919 			 * update the marker page for all remaining cases
920 			 */
921 			page_vpsub(&vp->v_pages, mark);
922 			page_vpadd(where_to_move, mark);
923 
924 			/*
925 			 * For write backs, If we can't lock the page, it's
926 			 * invalid or in the process of being destroyed.  Skip
927 			 * it, assuming someone else is writing it.
928 			 */
929 			if (!page_trylock(pp, se))
930 				continue;
931 		}
932 
933 		ASSERT(pp->p_vnode == vp);
934 
935 		/*
936 		 * Successfully locked the page, now figure out what to
937 		 * do with it. Free pages are easily dealt with, invalidate
938 		 * if desired or just go on to the next page.
939 		 */
940 		if (PP_ISFREE(pp)) {
941 			if ((flags & B_INVAL) == 0) {
942 				page_unlock(pp);
943 				continue;
944 			}
945 
946 			/*
947 			 * Invalidate (destroy) the page.
948 			 */
949 			mutex_exit(vphm);
950 			page_destroy_free(pp);
951 			mutex_enter(vphm);
952 			continue;
953 		}
954 
955 		/*
956 		 * pvn_getdirty() figures out what do do with a dirty page.
957 		 * If the page is dirty, the putapage() routine will write it
958 		 * and will kluster any other adjacent dirty pages it can.
959 		 *
960 		 * pvn_getdirty() and `(*putapage)' unlock the page.
961 		 */
962 		mutex_exit(vphm);
963 		if (pvn_getdirty(pp, flags)) {
964 			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
965 			if (!err)
966 				err = error;
967 		}
968 		mutex_enter(vphm);
969 	}
970 	page_vpsub(&vp->v_pages, mark);
971 	page_vpsub(&vp->v_pages, end);
972 
973 leave:
974 	/*
975 	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
976 	 */
977 	mutex_exit(vphm);
978 	kmem_cache_free(marker_cache, mark);
979 	kmem_cache_free(marker_cache, end);
980 	mutex_enter(&vp->v_lock);
981 	vp->v_flag &= ~VVMLOCK;
982 	cv_broadcast(&vp->v_cv);
983 	mutex_exit(&vp->v_lock);
984 	return (err);
985 }
986 
987 /*
988  * Walk the vp->v_pages list, for every page call the callback function
989  * pointed by *page_check. If page_check returns non-zero, then mark the
990  * page as modified and if VMODSORT is set, move it to the end of v_pages
991  * list. Moving makes sense only if we have at least two pages - this also
992  * avoids having v_pages temporarily being NULL after calling page_vpsub()
993  * if there was just one page.
994  */
995 void
996 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
997 {
998 	page_t	*pp, *next, *end;
999 	kmutex_t	*vphm;
1000 	int	shuffle;
1001 
1002 	vphm = page_vnode_mutex(vp);
1003 	mutex_enter(vphm);
1004 
1005 	if (vp->v_pages == NULL) {
1006 		mutex_exit(vphm);
1007 		return;
1008 	}
1009 
1010 	end = vp->v_pages->p_vpprev;
1011 	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1012 	pp = vp->v_pages;
1013 
1014 	for (;;) {
1015 		next = pp->p_vpnext;
1016 		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1017 			/*
1018 			 * hat_setmod_only() in contrast to hat_setmod() does
1019 			 * not shuffle the pages and does not grab the mutex
1020 			 * page_vnode_mutex. Exactly what we need.
1021 			 */
1022 			hat_setmod_only(pp);
1023 			if (shuffle) {
1024 				page_vpsub(&vp->v_pages, pp);
1025 				ASSERT(vp->v_pages != NULL);
1026 				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1027 				    pp);
1028 			}
1029 		}
1030 		/* Stop if we have just processed the last page. */
1031 		if (pp == end)
1032 			break;
1033 		pp = next;
1034 	}
1035 
1036 	mutex_exit(vphm);
1037 }
1038 
1039 /*
1040  * Zero out zbytes worth of data. Caller should be aware that this
1041  * routine may enter back into the fs layer (xxx_getpage). Locks
1042  * that the xxx_getpage routine may need should not be held while
1043  * calling this.
1044  */
1045 void
1046 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1047 {
1048 	caddr_t addr;
1049 
1050 	ASSERT(vp->v_type != VCHR);
1051 
1052 	if (vp->v_pages == NULL)
1053 		return;
1054 
1055 	/*
1056 	 * zbytes may be zero but there still may be some portion of
1057 	 * a page which needs clearing (since zbytes is a function
1058 	 * of filesystem block size, not pagesize.)
1059 	 */
1060 	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1061 		return;
1062 
1063 	/*
1064 	 * We get the last page and handle the partial
1065 	 * zeroing via kernel mappings.  This will make the page
1066 	 * dirty so that we know that when this page is written
1067 	 * back, the zeroed information will go out with it.  If
1068 	 * the page is not currently in memory, then the kzero
1069 	 * operation will cause it to be brought it.  We use kzero
1070 	 * instead of bzero so that if the page cannot be read in
1071 	 * for any reason, the system will not panic.  We need
1072 	 * to zero out a minimum of the fs given zbytes, but we
1073 	 * might also have to do more to get the entire last page.
1074 	 */
1075 
1076 	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1077 		panic("pvn_vptrunc zbytes");
1078 	addr = segmap_getmapflt(segkmap, vp, vplen,
1079 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1080 	(void) kzero(addr + (vplen & MAXBOFFSET),
1081 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1082 	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1083 }
1084 
1085 /*
1086  * Handles common work of the VOP_GETPAGE routines by iterating page by page
1087  * calling the getpage helper for each.
1088  */
1089 int
1090 pvn_getpages(
1091 	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1092 		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1093 	struct vnode *vp,
1094 	u_offset_t off,
1095 	size_t len,
1096 	uint_t *protp,
1097 	page_t *pl[],
1098 	size_t plsz,
1099 	struct seg *seg,
1100 	caddr_t addr,
1101 	enum seg_rw rw,
1102 	struct cred *cred)
1103 {
1104 	page_t **ppp;
1105 	u_offset_t o, eoff;
1106 	size_t sz, xlen;
1107 	int err;
1108 
1109 	/* ensure that we have enough space */
1110 	ASSERT(pl == NULL || plsz >= len);
1111 
1112 	/*
1113 	 * Loop one page at a time and let getapage function fill
1114 	 * in the next page in array.  We only allow one page to be
1115 	 * returned at a time (except for the last page) so that we
1116 	 * don't have any problems with duplicates and other such
1117 	 * painful problems.  This is a very simple minded algorithm,
1118 	 * but it does the job correctly.  We hope that the cost of a
1119 	 * getapage call for a resident page that we might have been
1120 	 * able to get from an earlier call doesn't cost too much.
1121 	 */
1122 	ppp = pl;
1123 	sz = (pl != NULL) ? PAGESIZE : 0;
1124 	eoff = off + len;
1125 	xlen = len;
1126 	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1127 	    xlen -= PAGESIZE) {
1128 		if (o + PAGESIZE >= eoff && pl != NULL) {
1129 			/*
1130 			 * Last time through - allow the all of
1131 			 * what's left of the pl[] array to be used.
1132 			 */
1133 			sz = plsz - (o - off);
1134 		}
1135 		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1136 		    rw, cred);
1137 		if (err) {
1138 			/*
1139 			 * Release any pages we already got.
1140 			 */
1141 			if (o > off && pl != NULL) {
1142 				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1143 					(void) page_release(*ppp, 1);
1144 			}
1145 			break;
1146 		}
1147 		if (pl != NULL)
1148 			ppp++;
1149 	}
1150 	return (err);
1151 }
1152 
1153 /*
1154  * Initialize the page list array.
1155  */
1156 /*ARGSUSED*/
1157 void
1158 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1159     u_offset_t off, size_t io_len, enum seg_rw rw)
1160 {
1161 	ssize_t sz;
1162 	page_t *ppcur, **ppp;
1163 
1164 	/*
1165 	 * Set up to load plsz worth
1166 	 * starting at the needed page.
1167 	 */
1168 	while (pp != NULL && pp->p_offset != off) {
1169 		/*
1170 		 * Remove page from the i/o list,
1171 		 * release the i/o and the page lock.
1172 		 */
1173 		ppcur = pp;
1174 		page_sub(&pp, ppcur);
1175 		page_io_unlock(ppcur);
1176 		(void) page_release(ppcur, 1);
1177 	}
1178 
1179 	if (pp == NULL) {
1180 		pl[0] = NULL;
1181 		return;
1182 	}
1183 
1184 	sz = plsz;
1185 
1186 	/*
1187 	 * Initialize the page list array.
1188 	 */
1189 	ppp = pl;
1190 	do {
1191 		ppcur = pp;
1192 		*ppp++ = ppcur;
1193 		page_sub(&pp, ppcur);
1194 		page_io_unlock(ppcur);
1195 		if (rw != S_CREATE)
1196 			page_downgrade(ppcur);
1197 		sz -= PAGESIZE;
1198 	} while (sz > 0 && pp != NULL);
1199 	*ppp = NULL;		/* terminate list */
1200 
1201 	/*
1202 	 * Now free the remaining pages that weren't
1203 	 * loaded in the page list.
1204 	 */
1205 	while (pp != NULL) {
1206 		ppcur = pp;
1207 		page_sub(&pp, ppcur);
1208 		page_io_unlock(ppcur);
1209 		(void) page_release(ppcur, 1);
1210 	}
1211 }
1212