xref: /titanic_44/usr/src/uts/common/fs/swapfs/swap_vnops.c (revision 2b24ab6b3865caeede9eeb9db6b83e1d89dcd1ea)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/buf.h>
30 #include <sys/cred.h>
31 #include <sys/errno.h>
32 #include <sys/vnode.h>
33 #include <sys/vfs_opreg.h>
34 #include <sys/cmn_err.h>
35 #include <sys/swap.h>
36 #include <sys/mman.h>
37 #include <sys/vmsystm.h>
38 #include <sys/vtrace.h>
39 #include <sys/debug.h>
40 #include <sys/sysmacros.h>
41 #include <sys/vm.h>
42 
43 #include <sys/fs/swapnode.h>
44 
45 #include <vm/seg.h>
46 #include <vm/page.h>
47 #include <vm/pvn.h>
48 #include <fs/fs_subr.h>
49 
50 #include <vm/seg_kp.h>
51 
/*
 * Forward declarations for the routines defined in this file.
 *
 * swap_getpage(), swap_putpage(), swap_inactive() and swap_dispose() are
 * the vnode operations listed in swap_vnodeops_template[].
 * swap_getapage() and swap_putapage() are the per-page workers used by
 * swap_getpage() and swap_putpage() respectively.
 * swap_getconpage() is the only non-static routine here; it is a direct
 * entry point for the large page anon code rather than a vnode operation.
 */
static int	swap_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
    caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
static int	swap_putpage(struct vnode *vp, offset_t off, size_t len,
    int flags, struct cred *cr, caller_context_t *ct);
static void	swap_inactive(struct vnode *vp, struct cred *cr,
    caller_context_t *ct);
static void	swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
    cred_t *cr, caller_context_t *ct);

static int	swap_getapage(struct vnode *vp, u_offset_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr);

int	swap_getconpage(struct vnode *vp, u_offset_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
    uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr);

static int 	swap_putapage(struct vnode *vp, page_t *pp, u_offset_t *off,
    size_t *lenp, int flags, struct cred *cr);
76 
/*
 * Vnode operations template for swapfs.
 *
 * Only inactive, getpage, putpage and dispose have swapfs specific
 * implementations.  setfl, poll, pathconf, getsecattr and shrlock are
 * explicitly routed to fs_error.  The table is terminated by a
 * NULL/NULL pair.
 */
const fs_operation_def_t swap_vnodeops_template[] = {
	VOPNAME_INACTIVE,	{ .vop_inactive = swap_inactive },
	VOPNAME_GETPAGE,	{ .vop_getpage = swap_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = swap_putpage },
	VOPNAME_DISPOSE,	{ .vop_dispose = swap_dispose },
	VOPNAME_SETFL,		{ .error = fs_error },
	VOPNAME_POLL,		{ .error = fs_error },
	VOPNAME_PATHCONF,	{ .error = fs_error },
	VOPNAME_GETSECATTR,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	NULL,			NULL
};

/*
 * Operations vector for swapfs vnodes.  It is not initialized in this
 * file (presumably it is built from swap_vnodeops_template[] by the
 * swapfs init code -- confirm there).  swap_putapage() uses it to assert
 * that queued async requests refer to swapfs vnodes.
 */
vnodeops_t *swap_vnodeops;
91 
92 /* ARGSUSED */
93 static void
94 swap_inactive(
95 	struct vnode *vp,
96 	struct cred *cr,
97 	caller_context_t *ct)
98 {
99 	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
100 }
101 
102 /*
103  * Return all the pages from [off..off+len] in given file
104  */
105 /*ARGSUSED*/
106 static int
107 swap_getpage(
108 	struct vnode *vp,
109 	offset_t off,
110 	size_t len,
111 	uint_t *protp,
112 	page_t *pl[],
113 	size_t plsz,
114 	struct seg *seg,
115 	caddr_t addr,
116 	enum seg_rw rw,
117 	struct cred *cr,
118 	caller_context_t *ct)
119 {
120 	int err;
121 
122 	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
123 	    (void *)vp, off, len, 0, 0);
124 
125 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
126 	    "swapfs getpage:vp %p off %llx len %ld",
127 	    (void *)vp, off, len);
128 
129 	if (len <= PAGESIZE) {
130 		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
131 		    seg, addr, rw, cr);
132 	} else {
133 		err = pvn_getpages(swap_getapage, vp, (u_offset_t)off, len,
134 		    protp, pl, plsz, seg, addr, rw, cr);
135 	}
136 
137 	return (err);
138 }
139 
140 /*
141  * Called from pvn_getpages or swap_getpage to get a particular page.
142  */
143 /*ARGSUSED*/
144 static int
145 swap_getapage(
146 	struct vnode *vp,
147 	u_offset_t off,
148 	size_t len,
149 	uint_t *protp,
150 	page_t *pl[],
151 	size_t plsz,
152 	struct seg *seg,
153 	caddr_t addr,
154 	enum seg_rw rw,
155 	struct cred *cr)
156 {
157 	struct page *pp, *rpp;
158 	int flags;
159 	int err = 0;
160 	struct vnode *pvp = NULL;
161 	u_offset_t poff;
162 	int flag_noreloc;
163 	se_t lock;
164 	extern int kcage_on;
165 	int upgrade = 0;
166 
167 	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
168 	    vp, off, len, 0, 0);
169 
170 	/*
171 	 * Until there is a call-back mechanism to cause SEGKP
172 	 * pages to be unlocked, make them non-relocatable.
173 	 */
174 	if (SEG_IS_SEGKP(seg))
175 		flag_noreloc = PG_NORELOC;
176 	else
177 		flag_noreloc = 0;
178 
179 	if (protp != NULL)
180 		*protp = PROT_ALL;
181 
182 	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
183 
184 again:
185 	if (pp = page_lookup(vp, off, lock)) {
186 		/*
187 		 * In very rare instances, a segkp page may have been
188 		 * relocated outside of the kernel by the kernel cage
189 		 * due to the window between page_unlock() and
190 		 * VOP_PUTPAGE() in segkp_unlock().  Due to the
191 		 * rareness of these occurances, the solution is to
192 		 * relocate the page to a P_NORELOC page.
193 		 */
194 		if (flag_noreloc != 0) {
195 			if (!PP_ISNORELOC(pp) && kcage_on) {
196 				if (lock != SE_EXCL) {
197 					upgrade = 1;
198 					if (!page_tryupgrade(pp)) {
199 						page_unlock(pp);
200 						lock = SE_EXCL;
201 						goto again;
202 					}
203 				}
204 
205 				if (page_relocate_cage(&pp, &rpp) != 0)
206 					panic("swap_getapage: "
207 					    "page_relocate_cage failed");
208 
209 				pp = rpp;
210 			}
211 		}
212 
213 		if (pl) {
214 			if (upgrade)
215 				page_downgrade(pp);
216 
217 			pl[0] = pp;
218 			pl[1] = NULL;
219 		} else {
220 			page_unlock(pp);
221 		}
222 	} else {
223 		pp = page_create_va(vp, off, PAGESIZE,
224 		    PG_WAIT | PG_EXCL | flag_noreloc,
225 		    seg, addr);
226 		/*
227 		 * Someone raced in and created the page after we did the
228 		 * lookup but before we did the create, so go back and
229 		 * try to look it up again.
230 		 */
231 		if (pp == NULL)
232 			goto again;
233 		if (rw != S_CREATE) {
234 			err = swap_getphysname(vp, off, &pvp, &poff);
235 			if (pvp) {
236 				struct anon *ap;
237 				kmutex_t *ahm;
238 
239 				flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
240 				err = VOP_PAGEIO(pvp, pp, poff,
241 				    PAGESIZE, flags, cr, NULL);
242 
243 				if (!err) {
244 					ahm = &anonhash_lock[AH_LOCK(vp, off)];
245 					mutex_enter(ahm);
246 
247 					ap = swap_anon(vp, off);
248 					if (ap == NULL) {
249 						panic("swap_getapage:"
250 						    " null anon");
251 					}
252 
253 					if (ap->an_pvp == pvp &&
254 					    ap->an_poff == poff) {
255 						swap_phys_free(pvp, poff,
256 						    PAGESIZE);
257 						ap->an_pvp = NULL;
258 						ap->an_poff = NULL;
259 						hat_setmod(pp);
260 					}
261 
262 					mutex_exit(ahm);
263 				}
264 			} else {
265 				if (!err)
266 					pagezero(pp, 0, PAGESIZE);
267 
268 				/*
269 				 * If it's a fault ahead, release page_io_lock
270 				 * and SE_EXCL we grabbed in page_create_va
271 				 *
272 				 * If we are here, we haven't called VOP_PAGEIO
273 				 * and thus calling pvn_read_done(pp, B_READ)
274 				 * below may mislead that we tried i/o. Besides,
275 				 * in case of async, pvn_read_done() should
276 				 * not be called by *getpage()
277 				 */
278 				if (pl == NULL) {
279 					/*
280 					 * swap_getphysname can return error
281 					 * only when we are getting called from
282 					 * swapslot_free which passes non-NULL
283 					 * pl to VOP_GETPAGE.
284 					 */
285 					ASSERT(err == 0);
286 					page_io_unlock(pp);
287 					page_unlock(pp);
288 				}
289 			}
290 		}
291 
292 		ASSERT(pp != NULL);
293 
294 		if (err && pl)
295 			pvn_read_done(pp, B_ERROR);
296 
297 		if (!err && pl)
298 			pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
299 	}
300 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
301 	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
302 	return (err);
303 }
304 
305 /*
306  * Called from large page anon routines only! This is an ugly hack where
307  * the anon layer directly calls into swapfs with a preallocated large page.
308  * Another method would have been to change to VOP and add an extra arg for
309  * the preallocated large page. This all could be cleaned up later when we
310  * solve the anonymous naming problem and no longer need to loop across of
311  * the VOP in PAGESIZE increments to fill in or initialize a large page as
312  * is done today. I think the latter is better since it avoid a change to
313  * the VOP interface that could later be avoided.
314  */
315 int
316 swap_getconpage(
317 	struct vnode *vp,
318 	u_offset_t off,
319 	size_t len,
320 	uint_t *protp,
321 	page_t *pl[],
322 	size_t plsz,
323 	page_t	*conpp,
324 	uint_t	*pszc,
325 	spgcnt_t *nreloc,
326 	struct seg *seg,
327 	caddr_t addr,
328 	enum seg_rw rw,
329 	struct cred *cr)
330 {
331 	struct page	*pp;
332 	int 		err = 0;
333 	struct vnode	*pvp = NULL;
334 	u_offset_t	poff;
335 
336 	ASSERT(len == PAGESIZE);
337 	ASSERT(pl != NULL);
338 	ASSERT(plsz == PAGESIZE);
339 	ASSERT(protp == NULL);
340 	ASSERT(nreloc != NULL);
341 	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
342 	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
343 	    vp, off, len, 0, 0);
344 
345 	/*
346 	 * If we are not using a preallocated page then we know one already
347 	 * exists. So just let the old code handle it.
348 	 */
349 	if (conpp == NULL) {
350 		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
351 		    seg, addr, rw, cr);
352 		return (err);
353 	}
354 	ASSERT(conpp->p_szc != 0);
355 	ASSERT(PAGE_EXCL(conpp));
356 
357 
358 	ASSERT(conpp->p_next == conpp);
359 	ASSERT(conpp->p_prev == conpp);
360 	ASSERT(!PP_ISAGED(conpp));
361 	ASSERT(!PP_ISFREE(conpp));
362 
363 	*nreloc = 0;
364 	pp = page_lookup_create(vp, off, SE_SHARED, conpp, nreloc, 0);
365 
366 	/*
367 	 * If existing page is found we may need to relocate.
368 	 */
369 	if (pp != conpp) {
370 		ASSERT(rw != S_CREATE);
371 		ASSERT(pszc != NULL);
372 		ASSERT(PAGE_SHARED(pp));
373 		if (pp->p_szc < conpp->p_szc) {
374 			*pszc = pp->p_szc;
375 			page_unlock(pp);
376 			err = -1;
377 		} else if (pp->p_szc > conpp->p_szc &&
378 		    seg->s_szc > conpp->p_szc) {
379 			*pszc = MIN(pp->p_szc, seg->s_szc);
380 			page_unlock(pp);
381 			err = -2;
382 		} else {
383 			pl[0] = pp;
384 			pl[1] = NULL;
385 			if (page_pptonum(pp) &
386 			    (page_get_pagecnt(conpp->p_szc) - 1))
387 				cmn_err(CE_PANIC, "swap_getconpage: no root");
388 		}
389 		return (err);
390 	}
391 
392 	ASSERT(PAGE_EXCL(pp));
393 
394 	if (*nreloc != 0) {
395 		ASSERT(rw != S_CREATE);
396 		pl[0] = pp;
397 		pl[1] = NULL;
398 		return (0);
399 	}
400 
401 	*nreloc = 1;
402 
403 	/*
404 	 * If necessary do the page io.
405 	 */
406 	if (rw != S_CREATE) {
407 		/*
408 		 * Since we are only called now on behalf of an
409 		 * address space operation it's impossible for
410 		 * us to fail unlike swap_getapge() which
411 		 * also gets called from swapslot_free().
412 		 */
413 		if (swap_getphysname(vp, off, &pvp, &poff)) {
414 			cmn_err(CE_PANIC,
415 			    "swap_getconpage: swap_getphysname failed!");
416 		}
417 
418 		if (pvp != NULL) {
419 			err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
420 			    cr, NULL);
421 			if (err == 0) {
422 				struct anon *ap;
423 				kmutex_t *ahm;
424 
425 				ahm = &anonhash_lock[AH_LOCK(vp, off)];
426 				mutex_enter(ahm);
427 				ap = swap_anon(vp, off);
428 				if (ap == NULL)
429 					panic("swap_getconpage: null anon");
430 				if (ap->an_pvp != pvp || ap->an_poff != poff)
431 					panic("swap_getconpage: bad anon");
432 
433 				swap_phys_free(pvp, poff, PAGESIZE);
434 				ap->an_pvp = NULL;
435 				ap->an_poff = NULL;
436 				hat_setmod(pp);
437 				mutex_exit(ahm);
438 			}
439 		} else {
440 			pagezero(pp, 0, PAGESIZE);
441 		}
442 	}
443 
444 	/*
445 	 * Normally we would let pvn_read_done() destroy
446 	 * the page on IO error. But since this is a preallocated
447 	 * page we'll let the anon layer handle it.
448 	 */
449 	page_io_unlock(pp);
450 	if (err != 0)
451 		page_hashout(pp, NULL);
452 	ASSERT(pp->p_next == pp);
453 	ASSERT(pp->p_prev == pp);
454 
455 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
456 	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
457 
458 	pl[0] = pp;
459 	pl[1] = NULL;
460 	return (err);
461 }
462 
/*
 * Async putpage klustering stuff.
 *
 * (B_ASYNC | B_FREE) putpage requests are queued as struct async_reqs
 * while sw_pending_size is below klustsize and a free request slot is
 * available.  The sw_* routines below manage that queue and its free
 * list; they and klustsize are defined elsewhere.
 */
int sw_pending_size;
extern int klustsize;
extern struct async_reqs *sw_getreq();
extern void sw_putreq(struct async_reqs *);
extern void sw_putbackreq(struct async_reqs *);
extern struct async_reqs *sw_getfree();
extern void sw_putfree(struct async_reqs *);

/*
 * Statistics maintained by swap_putapage().  Each "fail" counter is
 * paired with a counter that accumulates the kluster size (in pages)
 * reached at the moment klustering stopped for that reason.
 *
 * swap_putpagecnt/swap_pagespushed: successful VOP_PAGEIO() writes and
 *	the number of pages they covered.
 * swap_otherfail/swap_otherpages: a queued page could not be locked, or
 *	no new physical slot could be obtained for it.
 * swap_klustfail/swap_klustpages: the new physical slot was not adjacent
 *	to the kluster built so far.
 * swap_getiofail/swap_getiopages: the async request queue ran empty.
 */
static size_t swap_putpagecnt, swap_pagespushed;
static size_t swap_otherfail, swap_otherpages;
static size_t swap_klustfail, swap_klustpages;
static size_t swap_getiofail, swap_getiopages;
476 
/*
 * VOP_PUTPAGE for swapfs: push out the pages of vp in [off...off + len].
 *
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}.
 * B_FORCE and B_PAGE_NOWAIT are stripped before the flags are passed on;
 * B_PAGE_NOWAIT is remembered and makes every page lookup non-blocking.
 * If len == 0, do from off to EOF.
 *
 * Returns 0 on success (or when there is nothing to do), ENOSYS if the
 * vnode is marked VNOMAP, otherwise the error from pvn_vplist_dirty()
 * or swap_putapage().
 */
static int swap_nopage = 0;	/* Don't do swap_putpage's if set */

/* ARGSUSED */
static int
swap_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	page_t *pp;
	u_offset_t io_off;
	size_t io_len = 0;
	int err = 0;
	int nowait;
	struct async_reqs *arg;

	if (swap_nopage)
		return (0);

	ASSERT(vp->v_count != 0);

	nowait = flags & B_PAGE_NOWAIT;

	/*
	 * Clear force flag so that p_lckcnt pages are not invalidated.
	 */
	flags &= ~(B_FORCE | B_PAGE_NOWAIT);

	SWAPFS_PRINT(SWAP_VOPS,
	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
	    (void *)vp, off, len, flags, 0);
	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/* Nothing cached on this vnode, so nothing to push. */
	if (!vn_has_cached_data(vp))
		return (0);

	if (len == 0) {
		/* Walking the whole vnode may block; pageout must not. */
		if (curproc == proc_pageout)
			cmn_err(CE_PANIC, "swapfs: pageout can't block");

		/* Search the entire vp list for pages >= off. */
		err = pvn_vplist_dirty(vp, (u_offset_t)off, swap_putapage,
		    flags, cr);
	} else {
		u_offset_t eoff;

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.  Each pass is expected
		 * to leave in io_len the amount by which io_off advances.
		 */
		eoff = off + len;
		for (io_off = (u_offset_t)off; io_off < eoff;
		    io_off += io_len) {
			/*
			 * If we run out of the async req slot, put the page
			 * now instead of queuing.
			 */
			if (flags == (B_ASYNC | B_FREE) &&
			    sw_pending_size < klustsize &&
			    (arg = sw_getfree())) {
				/*
				 * If we are clustering, we should allow
				 * pageout to feed us more pages because # of
				 * pushes is limited by # of I/Os, and one
				 * cluster is considered to be one I/O.
				 */
				if (pushes)
					pushes--;

				/*
				 * Queue the request by <vp, offset>; the
				 * page itself is not looked up here.
				 */
				arg->a_vp = vp;
				arg->a_off = io_off;
				arg->a_len = PAGESIZE;
				arg->a_flags = B_ASYNC | B_FREE;
				arg->a_cred = kcred;
				sw_putreq(arg);
				io_len = PAGESIZE;
				continue;
			}
			/*
			 * If we are not invalidating pages, use the
			 * routine page_lookup_nowait() to prevent
			 * reclaiming them from the free list.
			 */
			if (!nowait && ((flags & B_INVAL) ||
			    (flags & (B_ASYNC | B_FREE)) == B_FREE))
				pp = page_lookup(vp, io_off, SE_EXCL);
			else
				pp = page_lookup_nowait(vp, io_off,
				    (flags & (B_FREE | B_INVAL)) ?
				    SE_EXCL : SE_SHARED);

			/*
			 * No page, or pvn_getdirty() found nothing to
			 * write: just step to the next page.
			 */
			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
				io_len = PAGESIZE;
			else {
				err = swap_putapage(vp, pp, &io_off, &io_len,
				    flags, cr);
				if (err != 0)
					break;
			}
		}
	}
	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		cmn_err(CE_WARN,
		    "swap_putpage: B_INVAL, pages not gone");
	}
	return (err);
}
597 
598 /*
599  * Write out a single page.
600  * For swapfs this means choose a physical swap slot and write the page
601  * out using VOP_PAGEIO.
602  * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
603  * swapfs pages, a bunch of contiguous swap slots and then write them
604  * all out in one clustered i/o.
605  */
606 /*ARGSUSED*/
607 static int
608 swap_putapage(
609 	struct vnode *vp,
610 	page_t *pp,
611 	u_offset_t *offp,
612 	size_t *lenp,
613 	int flags,
614 	struct cred *cr)
615 {
616 	int err;
617 	struct vnode *pvp;
618 	u_offset_t poff, off;
619 	u_offset_t doff;
620 	size_t dlen;
621 	size_t klsz = 0;
622 	u_offset_t klstart = 0;
623 	struct vnode *klvp = NULL;
624 	page_t *pplist;
625 	se_t se;
626 	struct async_reqs *arg;
627 	size_t swap_klustsize;
628 
629 	/*
630 	 * This check is added for callers who access swap_putpage with len = 0.
631 	 * swap_putpage calls swap_putapage page-by-page via pvn_vplist_dirty.
632 	 * And it's necessary to do the same queuing if users have the same
633 	 * B_ASYNC|B_FREE flags on.
634 	 */
635 	if (flags == (B_ASYNC | B_FREE) &&
636 	    sw_pending_size < klustsize && (arg = sw_getfree())) {
637 
638 		hat_setmod(pp);
639 		page_io_unlock(pp);
640 		page_unlock(pp);
641 
642 		arg->a_vp = vp;
643 		arg->a_off = pp->p_offset;
644 		arg->a_len = PAGESIZE;
645 		arg->a_flags = B_ASYNC | B_FREE;
646 		arg->a_cred = kcred;
647 		sw_putreq(arg);
648 
649 		return (0);
650 	}
651 
652 	SWAPFS_PRINT(SWAP_PUTP,
653 	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
654 	    pp, vp, pp->p_offset, flags, 0);
655 
656 	ASSERT(PAGE_LOCKED(pp));
657 
658 	off = pp->p_offset;
659 
660 	doff = off;
661 	dlen = PAGESIZE;
662 
663 	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
664 		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);
665 		hat_setmod(pp);
666 		page_io_unlock(pp);
667 		page_unlock(pp);
668 		goto out;
669 	}
670 
671 	klvp = pvp;
672 	klstart = poff;
673 	pplist = pp;
674 	/*
675 	 * If this is ASYNC | FREE and we've accumulated a bunch of such
676 	 * pending requests, kluster.
677 	 */
678 	if (flags == (B_ASYNC | B_FREE))
679 		swap_klustsize = klustsize;
680 	else
681 		swap_klustsize = PAGESIZE;
682 	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);
683 	klsz = PAGESIZE;
684 	while (klsz < swap_klustsize) {
685 		if ((arg = sw_getreq()) == NULL) {
686 			swap_getiofail++;
687 			swap_getiopages += btop(klsz);
688 			break;
689 		}
690 		ASSERT(vn_matchops(arg->a_vp, swap_vnodeops));
691 		vp = arg->a_vp;
692 		off = arg->a_off;
693 
694 		if ((pp = page_lookup_nowait(vp, off, se)) == NULL) {
695 			swap_otherfail++;
696 			swap_otherpages += btop(klsz);
697 			sw_putfree(arg);
698 			break;
699 		}
700 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {
701 			sw_putfree(arg);
702 			continue;
703 		}
704 		/* Get new physical backing store for the page */
705 		doff = off;
706 		dlen = PAGESIZE;
707 		if (err = swap_newphysname(vp, off, &doff, &dlen,
708 		    &pvp, &poff)) {
709 			swap_otherfail++;
710 			swap_otherpages += btop(klsz);
711 			hat_setmod(pp);
712 			page_io_unlock(pp);
713 			page_unlock(pp);
714 			sw_putbackreq(arg);
715 			break;
716 		}
717 		/* Try to cluster new physical name with previous ones */
718 		if (klvp == pvp && poff == klstart + klsz) {
719 			klsz += PAGESIZE;
720 			page_add(&pplist, pp);
721 			pplist = pplist->p_next;
722 			sw_putfree(arg);
723 		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
724 			klsz += PAGESIZE;
725 			klstart -= PAGESIZE;
726 			page_add(&pplist, pp);
727 			sw_putfree(arg);
728 		} else {
729 			swap_klustfail++;
730 			swap_klustpages += btop(klsz);
731 			hat_setmod(pp);
732 			page_io_unlock(pp);
733 			page_unlock(pp);
734 			sw_putbackreq(arg);
735 			break;
736 		}
737 	}
738 
739 	err = VOP_PAGEIO(klvp, pplist, klstart, klsz,
740 	    B_WRITE | flags, cr, NULL);
741 
742 	if ((flags & B_ASYNC) == 0)
743 		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
744 
745 	/* Statistics */
746 	if (!err) {
747 		swap_putpagecnt++;
748 		swap_pagespushed += btop(klsz);
749 	}
750 out:
751 	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
752 	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
753 	    vp, klvp, klstart, klsz);
754 	if (err && err != ENOMEM)
755 		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
756 	if (lenp)
757 		*lenp = PAGESIZE;
758 	return (err);
759 }
760 
761 static void
762 swap_dispose(
763 	vnode_t *vp,
764 	page_t *pp,
765 	int fl,
766 	int dn,
767 	cred_t *cr,
768 	caller_context_t *ct)
769 {
770 	int err;
771 	u_offset_t off = pp->p_offset;
772 	vnode_t *pvp;
773 	u_offset_t poff;
774 
775 	ASSERT(PAGE_EXCL(pp));
776 
777 	/*
778 	 * The caller will free/invalidate large page in one shot instead of
779 	 * one small page at a time.
780 	 */
781 	if (pp->p_szc != 0) {
782 		page_unlock(pp);
783 		return;
784 	}
785 
786 	err = swap_getphysname(vp, off, &pvp, &poff);
787 	if (!err && pvp != NULL)
788 		VOP_DISPOSE(pvp, pp, fl, dn, cr, ct);
789 	else
790 		fs_dispose(vp, pp, fl, dn, cr, ct);
791 }
792