xref: /titanic_50/usr/src/uts/common/fs/swapfs/swap_vnops.c (revision fdf4286765e129590dce97b37d12188bf7000b58)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/cred.h>
33 #include <sys/errno.h>
34 #include <sys/vnode.h>
35 #include <sys/vfs_opreg.h>
36 #include <sys/cmn_err.h>
37 #include <sys/swap.h>
38 #include <sys/mman.h>
39 #include <sys/vmsystm.h>
40 #include <sys/vtrace.h>
41 #include <sys/debug.h>
42 #include <sys/sysmacros.h>
43 #include <sys/vm.h>
44 
45 #include <sys/fs/swapnode.h>
46 
47 #include <vm/seg.h>
48 #include <vm/page.h>
49 #include <vm/pvn.h>
50 #include <fs/fs_subr.h>
51 
52 #include <vm/seg_kp.h>
53 
54 /*
55  * Define the routines within this file.
56  */
57 static int	swap_getpage(struct vnode *vp, offset_t off, size_t len,
58     uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
59     caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
60 static int	swap_putpage(struct vnode *vp, offset_t off, size_t len,
61     int flags, struct cred *cr, caller_context_t *ct);
62 static void	swap_inactive(struct vnode *vp, struct cred *cr,
63     caller_context_t *ct);
64 static void	swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
65     cred_t *cr, caller_context_t *ct);
66 
67 static int	swap_getapage(struct vnode *vp, u_offset_t off, size_t len,
68     uint_t *protp, page_t **plarr, size_t plsz,
69     struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr);
70 
71 int	swap_getconpage(struct vnode *vp, u_offset_t off, size_t len,
72     uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
73     uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
74     enum seg_rw rw, struct cred *cr);
75 
76 static int 	swap_putapage(struct vnode *vp, page_t *pp, u_offset_t *off,
77     size_t *lenp, int flags, struct cred *cr);
78 
79 const fs_operation_def_t swap_vnodeops_template[] = {
80 	VOPNAME_INACTIVE,	{ .vop_inactive = swap_inactive },
81 	VOPNAME_GETPAGE,	{ .vop_getpage = swap_getpage },
82 	VOPNAME_PUTPAGE,	{ .vop_putpage = swap_putpage },
83 	VOPNAME_DISPOSE,	{ .vop_dispose = swap_dispose },
84 	VOPNAME_SETFL,		{ .error = fs_error },
85 	VOPNAME_POLL,		{ .error = fs_error },
86 	VOPNAME_PATHCONF,	{ .error = fs_error },
87 	VOPNAME_GETSECATTR,	{ .error = fs_error },
88 	VOPNAME_SHRLOCK,	{ .error = fs_error },
89 	NULL,			NULL
90 };
91 
92 vnodeops_t *swap_vnodeops;
93 
94 /* ARGSUSED */
95 static void
96 swap_inactive(
97 	struct vnode *vp,
98 	struct cred *cr,
99 	caller_context_t *ct)
100 {
101 	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
102 }
103 
104 /*
105  * Return all the pages from [off..off+len] in given file
106  */
107 /*ARGSUSED*/
108 static int
109 swap_getpage(
110 	struct vnode *vp,
111 	offset_t off,
112 	size_t len,
113 	uint_t *protp,
114 	page_t *pl[],
115 	size_t plsz,
116 	struct seg *seg,
117 	caddr_t addr,
118 	enum seg_rw rw,
119 	struct cred *cr,
120 	caller_context_t *ct)
121 {
122 	int err;
123 
124 	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
125 	    (void *)vp, off, len, 0, 0);
126 
127 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
128 	    "swapfs getpage:vp %p off %llx len %ld",
129 	    (void *)vp, off, len);
130 
131 	if (len <= PAGESIZE) {
132 		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
133 		    seg, addr, rw, cr);
134 	} else {
135 		err = pvn_getpages(swap_getapage, vp, (u_offset_t)off, len,
136 		    protp, pl, plsz, seg, addr, rw, cr);
137 	}
138 
139 	return (err);
140 }
141 
142 /*
143  * Called from pvn_getpages or swap_getpage to get a particular page.
144  */
145 /*ARGSUSED*/
146 static int
147 swap_getapage(
148 	struct vnode *vp,
149 	u_offset_t off,
150 	size_t len,
151 	uint_t *protp,
152 	page_t *pl[],
153 	size_t plsz,
154 	struct seg *seg,
155 	caddr_t addr,
156 	enum seg_rw rw,
157 	struct cred *cr)
158 {
159 	struct page *pp, *rpp;
160 	int flags;
161 	int err = 0;
162 	struct vnode *pvp = NULL;
163 	u_offset_t poff;
164 	int flag_noreloc;
165 	se_t lock;
166 	extern int kcage_on;
167 	int upgrade = 0;
168 
169 	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
170 	    vp, off, len, 0, 0);
171 
172 	/*
173 	 * Until there is a call-back mechanism to cause SEGKP
174 	 * pages to be unlocked, make them non-relocatable.
175 	 */
176 	if (SEG_IS_SEGKP(seg))
177 		flag_noreloc = PG_NORELOC;
178 	else
179 		flag_noreloc = 0;
180 
181 	if (protp != NULL)
182 		*protp = PROT_ALL;
183 
184 	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
185 
186 again:
187 	if (pp = page_lookup(vp, off, lock)) {
188 		/*
189 		 * In very rare instances, a segkp page may have been
190 		 * relocated outside of the kernel by the kernel cage
191 		 * due to the window between page_unlock() and
192 		 * VOP_PUTPAGE() in segkp_unlock().  Due to the
193 		 * rareness of these occurances, the solution is to
194 		 * relocate the page to a P_NORELOC page.
195 		 */
196 		if (flag_noreloc != 0) {
197 			if (!PP_ISNORELOC(pp) && kcage_on) {
198 				if (lock != SE_EXCL) {
199 					upgrade = 1;
200 					if (!page_tryupgrade(pp)) {
201 						page_unlock(pp);
202 						lock = SE_EXCL;
203 						goto again;
204 					}
205 				}
206 
207 				if (page_relocate_cage(&pp, &rpp) != 0)
208 					panic("swap_getapage: "
209 					    "page_relocate_cage failed");
210 
211 				pp = rpp;
212 			}
213 		}
214 
215 		if (pl) {
216 			if (upgrade)
217 				page_downgrade(pp);
218 
219 			pl[0] = pp;
220 			pl[1] = NULL;
221 		} else {
222 			page_unlock(pp);
223 		}
224 	} else {
225 		pp = page_create_va(vp, off, PAGESIZE,
226 		    PG_WAIT | PG_EXCL | flag_noreloc,
227 		    seg, addr);
228 		/*
229 		 * Someone raced in and created the page after we did the
230 		 * lookup but before we did the create, so go back and
231 		 * try to look it up again.
232 		 */
233 		if (pp == NULL)
234 			goto again;
235 		if (rw != S_CREATE) {
236 			err = swap_getphysname(vp, off, &pvp, &poff);
237 			if (pvp) {
238 				struct anon *ap;
239 				kmutex_t *ahm;
240 
241 				flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
242 				err = VOP_PAGEIO(pvp, pp, poff,
243 				    PAGESIZE, flags, cr, NULL);
244 
245 				if (!err) {
246 					ahm = &anonhash_lock[AH_LOCK(vp, off)];
247 					mutex_enter(ahm);
248 
249 					ap = swap_anon(vp, off);
250 					if (ap == NULL) {
251 						panic("swap_getapage:"
252 						    " null anon");
253 					}
254 
255 					if (ap->an_pvp == pvp &&
256 					    ap->an_poff == poff) {
257 						swap_phys_free(pvp, poff,
258 						    PAGESIZE);
259 						ap->an_pvp = NULL;
260 						ap->an_poff = NULL;
261 						hat_setmod(pp);
262 					}
263 
264 					mutex_exit(ahm);
265 				}
266 			} else {
267 				if (!err)
268 					pagezero(pp, 0, PAGESIZE);
269 
270 				/*
271 				 * If it's a fault ahead, release page_io_lock
272 				 * and SE_EXCL we grabbed in page_create_va
273 				 *
274 				 * If we are here, we haven't called VOP_PAGEIO
275 				 * and thus calling pvn_read_done(pp, B_READ)
276 				 * below may mislead that we tried i/o. Besides,
277 				 * in case of async, pvn_read_done() should
278 				 * not be called by *getpage()
279 				 */
280 				if (pl == NULL) {
281 					/*
282 					 * swap_getphysname can return error
283 					 * only when we are getting called from
284 					 * swapslot_free which passes non-NULL
285 					 * pl to VOP_GETPAGE.
286 					 */
287 					ASSERT(err == 0);
288 					page_io_unlock(pp);
289 					page_unlock(pp);
290 				}
291 			}
292 		}
293 
294 		ASSERT(pp != NULL);
295 
296 		if (err && pl)
297 			pvn_read_done(pp, B_ERROR);
298 
299 		if (!err && pl)
300 			pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
301 	}
302 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
303 	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
304 	return (err);
305 }
306 
307 /*
308  * Called from large page anon routines only! This is an ugly hack where
309  * the anon layer directly calls into swapfs with a preallocated large page.
310  * Another method would have been to change to VOP and add an extra arg for
311  * the preallocated large page. This all could be cleaned up later when we
312  * solve the anonymous naming problem and no longer need to loop across of
313  * the VOP in PAGESIZE increments to fill in or initialize a large page as
314  * is done today. I think the latter is better since it avoid a change to
315  * the VOP interface that could later be avoided.
316  */
317 int
318 swap_getconpage(
319 	struct vnode *vp,
320 	u_offset_t off,
321 	size_t len,
322 	uint_t *protp,
323 	page_t *pl[],
324 	size_t plsz,
325 	page_t	*conpp,
326 	uint_t	*pszc,
327 	spgcnt_t *nreloc,
328 	struct seg *seg,
329 	caddr_t addr,
330 	enum seg_rw rw,
331 	struct cred *cr)
332 {
333 	struct page	*pp;
334 	int 		err = 0;
335 	struct vnode	*pvp = NULL;
336 	u_offset_t	poff;
337 
338 	ASSERT(len == PAGESIZE);
339 	ASSERT(pl != NULL);
340 	ASSERT(plsz == PAGESIZE);
341 	ASSERT(protp == NULL);
342 	ASSERT(nreloc != NULL);
343 	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
344 	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
345 	    vp, off, len, 0, 0);
346 
347 	/*
348 	 * If we are not using a preallocated page then we know one already
349 	 * exists. So just let the old code handle it.
350 	 */
351 	if (conpp == NULL) {
352 		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
353 		    seg, addr, rw, cr);
354 		return (err);
355 	}
356 	ASSERT(conpp->p_szc != 0);
357 	ASSERT(PAGE_EXCL(conpp));
358 
359 
360 	ASSERT(conpp->p_next == conpp);
361 	ASSERT(conpp->p_prev == conpp);
362 	ASSERT(!PP_ISAGED(conpp));
363 	ASSERT(!PP_ISFREE(conpp));
364 
365 	*nreloc = 0;
366 	pp = page_lookup_create(vp, off, SE_SHARED, conpp, nreloc, 0);
367 
368 	/*
369 	 * If existing page is found we may need to relocate.
370 	 */
371 	if (pp != conpp) {
372 		ASSERT(rw != S_CREATE);
373 		ASSERT(pszc != NULL);
374 		ASSERT(PAGE_SHARED(pp));
375 		if (pp->p_szc < conpp->p_szc) {
376 			*pszc = pp->p_szc;
377 			page_unlock(pp);
378 			err = -1;
379 		} else if (pp->p_szc > conpp->p_szc &&
380 		    seg->s_szc > conpp->p_szc) {
381 			*pszc = MIN(pp->p_szc, seg->s_szc);
382 			page_unlock(pp);
383 			err = -2;
384 		} else {
385 			pl[0] = pp;
386 			pl[1] = NULL;
387 			if (page_pptonum(pp) &
388 			    (page_get_pagecnt(conpp->p_szc) - 1))
389 				cmn_err(CE_PANIC, "swap_getconpage: no root");
390 		}
391 		return (err);
392 	}
393 
394 	ASSERT(PAGE_EXCL(pp));
395 
396 	if (*nreloc != 0) {
397 		ASSERT(rw != S_CREATE);
398 		pl[0] = pp;
399 		pl[1] = NULL;
400 		return (0);
401 	}
402 
403 	*nreloc = 1;
404 
405 	/*
406 	 * If necessary do the page io.
407 	 */
408 	if (rw != S_CREATE) {
409 		/*
410 		 * Since we are only called now on behalf of an
411 		 * address space operation it's impossible for
412 		 * us to fail unlike swap_getapge() which
413 		 * also gets called from swapslot_free().
414 		 */
415 		if (swap_getphysname(vp, off, &pvp, &poff)) {
416 			cmn_err(CE_PANIC,
417 			    "swap_getconpage: swap_getphysname failed!");
418 		}
419 
420 		if (pvp != NULL) {
421 			err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
422 			    cr, NULL);
423 			if (err == 0) {
424 				struct anon *ap;
425 				kmutex_t *ahm;
426 
427 				ahm = &anonhash_lock[AH_LOCK(vp, off)];
428 				mutex_enter(ahm);
429 				ap = swap_anon(vp, off);
430 				if (ap == NULL)
431 					panic("swap_getconpage: null anon");
432 				if (ap->an_pvp != pvp || ap->an_poff != poff)
433 					panic("swap_getconpage: bad anon");
434 
435 				swap_phys_free(pvp, poff, PAGESIZE);
436 				ap->an_pvp = NULL;
437 				ap->an_poff = NULL;
438 				hat_setmod(pp);
439 				mutex_exit(ahm);
440 			}
441 		} else {
442 			pagezero(pp, 0, PAGESIZE);
443 		}
444 	}
445 
446 	/*
447 	 * Normally we would let pvn_read_done() destroy
448 	 * the page on IO error. But since this is a preallocated
449 	 * page we'll let the anon layer handle it.
450 	 */
451 	page_io_unlock(pp);
452 	if (err != 0)
453 		page_hashout(pp, NULL);
454 	ASSERT(pp->p_next == pp);
455 	ASSERT(pp->p_prev == pp);
456 
457 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
458 	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
459 
460 	pl[0] = pp;
461 	pl[1] = NULL;
462 	return (err);
463 }
464 
465 /* Async putpage klustering stuff */
466 int sw_pending_size;
467 extern int klustsize;
468 extern struct async_reqs *sw_getreq();
469 extern void sw_putreq(struct async_reqs *);
470 extern void sw_putbackreq(struct async_reqs *);
471 extern struct async_reqs *sw_getfree();
472 extern void sw_putfree(struct async_reqs *);
473 
474 static size_t swap_putpagecnt, swap_pagespushed;
475 static size_t swap_otherfail, swap_otherpages;
476 static size_t swap_klustfail, swap_klustpages;
477 static size_t swap_getiofail, swap_getiopages;
478 
479 /*
480  * Flags are composed of {B_INVAL, B_DIRTY B_FREE, B_DONTNEED}.
481  * If len == 0, do from off to EOF.
482  */
483 static int swap_nopage = 0;	/* Don't do swap_putpage's if set */
484 
485 /* ARGSUSED */
486 static int
487 swap_putpage(
488 	struct vnode *vp,
489 	offset_t off,
490 	size_t len,
491 	int flags,
492 	struct cred *cr,
493 	caller_context_t *ct)
494 {
495 	page_t *pp;
496 	u_offset_t io_off;
497 	size_t io_len = 0;
498 	int err = 0;
499 	struct async_reqs *arg;
500 
501 	if (swap_nopage)
502 		return (0);
503 
504 	ASSERT(vp->v_count != 0);
505 
506 	/*
507 	 * Clear force flag so that p_lckcnt pages are not invalidated.
508 	 */
509 	flags &= ~B_FORCE;
510 
511 	SWAPFS_PRINT(SWAP_VOPS,
512 	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
513 	    (void *)vp, off, len, flags, 0);
514 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
515 	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);
516 
517 	if (vp->v_flag & VNOMAP)
518 		return (ENOSYS);
519 
520 	if (!vn_has_cached_data(vp))
521 		return (0);
522 
523 	if (len == 0) {
524 		if (curproc == proc_pageout)
525 			cmn_err(CE_PANIC, "swapfs: pageout can't block");
526 
527 		/* Search the entire vp list for pages >= off. */
528 		err = pvn_vplist_dirty(vp, (u_offset_t)off, swap_putapage,
529 		    flags, cr);
530 	} else {
531 		u_offset_t eoff;
532 
533 		/*
534 		 * Loop over all offsets in the range [off...off + len]
535 		 * looking for pages to deal with.
536 		 */
537 		eoff = off + len;
538 		for (io_off = (u_offset_t)off; io_off < eoff;
539 		    io_off += io_len) {
540 			/*
541 			 * If we run out of the async req slot, put the page
542 			 * now instead of queuing.
543 			 */
544 			if (flags == (B_ASYNC | B_FREE) &&
545 			    sw_pending_size < klustsize &&
546 			    (arg = sw_getfree())) {
547 				/*
548 				 * If we are clustering, we should allow
549 				 * pageout to feed us more pages because # of
550 				 * pushes is limited by # of I/Os, and one
551 				 * cluster is considered to be one I/O.
552 				 */
553 				if (pushes)
554 					pushes--;
555 
556 				arg->a_vp = vp;
557 				arg->a_off = io_off;
558 				arg->a_len = PAGESIZE;
559 				arg->a_flags = B_ASYNC | B_FREE;
560 				arg->a_cred = kcred;
561 				sw_putreq(arg);
562 				io_len = PAGESIZE;
563 				continue;
564 			}
565 			/*
566 			 * If we are not invalidating pages, use the
567 			 * routine page_lookup_nowait() to prevent
568 			 * reclaiming them from the free list.
569 			 */
570 			if ((flags & B_INVAL) ||
571 			    (flags & (B_ASYNC | B_FREE)) == B_FREE)
572 				pp = page_lookup(vp, io_off, SE_EXCL);
573 			else
574 				pp = page_lookup_nowait(vp, io_off,
575 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
576 
577 			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
578 				io_len = PAGESIZE;
579 			else {
580 				err = swap_putapage(vp, pp, &io_off, &io_len,
581 				    flags, cr);
582 				if (err != 0)
583 					break;
584 			}
585 		}
586 	}
587 	/* If invalidating, verify all pages on vnode list are gone. */
588 	if (err == 0 && off == 0 && len == 0 &&
589 	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
590 		cmn_err(CE_WARN,
591 		    "swap_putpage: B_INVAL, pages not gone");
592 	}
593 	return (err);
594 }
595 
596 /*
597  * Write out a single page.
598  * For swapfs this means choose a physical swap slot and write the page
599  * out using VOP_PAGEIO.
600  * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
601  * swapfs pages, a bunch of contiguous swap slots and then write them
602  * all out in one clustered i/o.
603  */
604 /*ARGSUSED*/
605 static int
606 swap_putapage(
607 	struct vnode *vp,
608 	page_t *pp,
609 	u_offset_t *offp,
610 	size_t *lenp,
611 	int flags,
612 	struct cred *cr)
613 {
614 	int err;
615 	struct vnode *pvp;
616 	u_offset_t poff, off;
617 	u_offset_t doff;
618 	size_t dlen;
619 	size_t klsz = 0;
620 	u_offset_t klstart = 0;
621 	struct vnode *klvp = NULL;
622 	page_t *pplist;
623 	se_t se;
624 	struct async_reqs *arg;
625 	size_t swap_klustsize;
626 
627 	/*
628 	 * This check is added for callers who access swap_putpage with len = 0.
629 	 * swap_putpage calls swap_putapage page-by-page via pvn_vplist_dirty.
630 	 * And it's necessary to do the same queuing if users have the same
631 	 * B_ASYNC|B_FREE flags on.
632 	 */
633 	if (flags == (B_ASYNC | B_FREE) &&
634 	    sw_pending_size < klustsize && (arg = sw_getfree())) {
635 
636 		hat_setmod(pp);
637 		page_io_unlock(pp);
638 		page_unlock(pp);
639 
640 		arg->a_vp = vp;
641 		arg->a_off = pp->p_offset;
642 		arg->a_len = PAGESIZE;
643 		arg->a_flags = B_ASYNC | B_FREE;
644 		arg->a_cred = kcred;
645 		sw_putreq(arg);
646 
647 		return (0);
648 	}
649 
650 	SWAPFS_PRINT(SWAP_PUTP,
651 	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
652 	    pp, vp, pp->p_offset, flags, 0);
653 
654 	ASSERT(PAGE_LOCKED(pp));
655 
656 	off = pp->p_offset;
657 
658 	doff = off;
659 	dlen = PAGESIZE;
660 
661 	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
662 		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);
663 		hat_setmod(pp);
664 		page_io_unlock(pp);
665 		page_unlock(pp);
666 		goto out;
667 	}
668 
669 	klvp = pvp;
670 	klstart = poff;
671 	pplist = pp;
672 	/*
673 	 * If this is ASYNC | FREE and we've accumulated a bunch of such
674 	 * pending requests, kluster.
675 	 */
676 	if (flags == (B_ASYNC | B_FREE))
677 		swap_klustsize = klustsize;
678 	else
679 		swap_klustsize = PAGESIZE;
680 	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);
681 	klsz = PAGESIZE;
682 	while (klsz < swap_klustsize) {
683 		if ((arg = sw_getreq()) == NULL) {
684 			swap_getiofail++;
685 			swap_getiopages += btop(klsz);
686 			break;
687 		}
688 		ASSERT(vn_matchops(arg->a_vp, swap_vnodeops));
689 		vp = arg->a_vp;
690 		off = arg->a_off;
691 
692 		if ((pp = page_lookup_nowait(vp, off, se)) == NULL) {
693 			swap_otherfail++;
694 			swap_otherpages += btop(klsz);
695 			sw_putfree(arg);
696 			break;
697 		}
698 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {
699 			sw_putfree(arg);
700 			continue;
701 		}
702 		/* Get new physical backing store for the page */
703 		doff = off;
704 		dlen = PAGESIZE;
705 		if (err = swap_newphysname(vp, off, &doff, &dlen,
706 		    &pvp, &poff)) {
707 			swap_otherfail++;
708 			swap_otherpages += btop(klsz);
709 			hat_setmod(pp);
710 			page_io_unlock(pp);
711 			page_unlock(pp);
712 			sw_putbackreq(arg);
713 			break;
714 		}
715 		/* Try to cluster new physical name with previous ones */
716 		if (klvp == pvp && poff == klstart + klsz) {
717 			klsz += PAGESIZE;
718 			page_add(&pplist, pp);
719 			pplist = pplist->p_next;
720 			sw_putfree(arg);
721 		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
722 			klsz += PAGESIZE;
723 			klstart -= PAGESIZE;
724 			page_add(&pplist, pp);
725 			sw_putfree(arg);
726 		} else {
727 			swap_klustfail++;
728 			swap_klustpages += btop(klsz);
729 			hat_setmod(pp);
730 			page_io_unlock(pp);
731 			page_unlock(pp);
732 			sw_putbackreq(arg);
733 			break;
734 		}
735 	}
736 
737 	err = VOP_PAGEIO(klvp, pplist, klstart, klsz,
738 	    B_WRITE | flags, cr, NULL);
739 
740 	if ((flags & B_ASYNC) == 0)
741 		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
742 
743 	/* Statistics */
744 	if (!err) {
745 		swap_putpagecnt++;
746 		swap_pagespushed += btop(klsz);
747 	}
748 out:
749 	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
750 	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
751 	    vp, klvp, klstart, klsz);
752 	if (err && err != ENOMEM)
753 		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
754 	if (lenp)
755 		*lenp = PAGESIZE;
756 	return (err);
757 }
758 
759 static void
760 swap_dispose(
761 	vnode_t *vp,
762 	page_t *pp,
763 	int fl,
764 	int dn,
765 	cred_t *cr,
766 	caller_context_t *ct)
767 {
768 	int err;
769 	u_offset_t off = pp->p_offset;
770 	vnode_t *pvp;
771 	u_offset_t poff;
772 
773 	ASSERT(PAGE_EXCL(pp));
774 
775 	/*
776 	 * The caller will free/invalidate large page in one shot instead of
777 	 * one small page at a time.
778 	 */
779 	if (pp->p_szc != 0) {
780 		page_unlock(pp);
781 		return;
782 	}
783 
784 	err = swap_getphysname(vp, off, &pvp, &poff);
785 	if (!err && pvp != NULL)
786 		VOP_DISPOSE(pvp, pp, fl, dn, cr, ct);
787 	else
788 		fs_dispose(vp, pp, fl, dn, cr, ct);
789 }
790