xref: /titanic_51/usr/src/uts/common/fs/swapfs/swap_vnops.c (revision 6a72db4a7fa12c3e0d1c1cf91a07390739fa0fbf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/buf.h>
30 #include <sys/cred.h>
31 #include <sys/errno.h>
32 #include <sys/vnode.h>
33 #include <sys/vfs_opreg.h>
34 #include <sys/cmn_err.h>
35 #include <sys/swap.h>
36 #include <sys/mman.h>
37 #include <sys/vmsystm.h>
38 #include <sys/vtrace.h>
39 #include <sys/debug.h>
40 #include <sys/sysmacros.h>
41 #include <sys/vm.h>
42 
43 #include <sys/fs/swapnode.h>
44 
45 #include <vm/seg.h>
46 #include <vm/page.h>
47 #include <vm/pvn.h>
48 #include <fs/fs_subr.h>
49 
50 #include <vm/seg_kp.h>
51 
/*
 * Define the routines within this file.
 */

/* Vnode operation entry points installed in swap_vnodeops_template. */
static int	swap_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
    caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
static int	swap_putpage(struct vnode *vp, offset_t off, size_t len,
    int flags, struct cred *cr, caller_context_t *ct);
static void	swap_inactive(struct vnode *vp, struct cred *cr,
    caller_context_t *ct);
static void	swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
    cred_t *cr, caller_context_t *ct);

/* Per-page worker handed to pvn_getpages() by swap_getpage(). */
static int	swap_getapage(struct vnode *vp, u_offset_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr);

/*
 * Not static: per the comment on its definition, it is called directly
 * by the large page anon routines with a preallocated page (conpp).
 */
int	swap_getconpage(struct vnode *vp, u_offset_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
    uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr);

/* Per-page worker handed to pvn_vplist_dirty() and used by swap_putpage(). */
static int 	swap_putapage(struct vnode *vp, page_t *pp, u_offset_t *off,
    size_t *lenp, int flags, struct cred *cr);
76 
/*
 * Vnode operations template for swapfs.
 *
 * swapfs supplies its own inactive, getpage, putpage and dispose
 * routines.  setfl, poll, pathconf, getsecattr and shrlock are routed
 * to fs_error (defined elsewhere; presumably it just fails the
 * operation -- confirm against fs_subr).  The table is terminated by
 * a NULL, NULL pair.
 */
const fs_operation_def_t swap_vnodeops_template[] = {
	VOPNAME_INACTIVE,	{ .vop_inactive = swap_inactive },
	VOPNAME_GETPAGE,	{ .vop_getpage = swap_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = swap_putpage },
	VOPNAME_DISPOSE,	{ .vop_dispose = swap_dispose },
	VOPNAME_SETFL,		{ .error = fs_error },
	VOPNAME_POLL,		{ .error = fs_error },
	VOPNAME_PATHCONF,	{ .error = fs_error },
	VOPNAME_GETSECATTR,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	NULL,			NULL
};

/*
 * The swapfs vnode operations vector.  It is not assigned anywhere in
 * this file; swap_putapage() uses it with vn_matchops() to assert that
 * queued requests refer to swapfs vnodes.
 */
vnodeops_t *swap_vnodeops;
91 
92 /* ARGSUSED */
93 static void
94 swap_inactive(
95 	struct vnode *vp,
96 	struct cred *cr,
97 	caller_context_t *ct)
98 {
99 	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
100 }
101 
102 /*
103  * Return all the pages from [off..off+len] in given file
104  */
105 /*ARGSUSED*/
106 static int
107 swap_getpage(
108 	struct vnode *vp,
109 	offset_t off,
110 	size_t len,
111 	uint_t *protp,
112 	page_t *pl[],
113 	size_t plsz,
114 	struct seg *seg,
115 	caddr_t addr,
116 	enum seg_rw rw,
117 	struct cred *cr,
118 	caller_context_t *ct)
119 {
120 	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
121 	    (void *)vp, off, len, 0, 0);
122 
123 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
124 	    "swapfs getpage:vp %p off %llx len %ld",
125 	    (void *)vp, off, len);
126 
127 	return (pvn_getpages(swap_getapage, vp, (u_offset_t)off, len, protp,
128 	    pl, plsz, seg, addr, rw, cr));
129 }
130 
131 /*
132  * Called from pvn_getpages to get a particular page.
133  */
134 /*ARGSUSED*/
135 static int
136 swap_getapage(
137 	struct vnode *vp,
138 	u_offset_t off,
139 	size_t len,
140 	uint_t *protp,
141 	page_t *pl[],
142 	size_t plsz,
143 	struct seg *seg,
144 	caddr_t addr,
145 	enum seg_rw rw,
146 	struct cred *cr)
147 {
148 	struct page *pp, *rpp;
149 	int flags;
150 	int err = 0;
151 	struct vnode *pvp = NULL;
152 	u_offset_t poff;
153 	int flag_noreloc;
154 	se_t lock;
155 	extern int kcage_on;
156 	int upgrade = 0;
157 
158 	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
159 	    vp, off, len, 0, 0);
160 
161 	/*
162 	 * Until there is a call-back mechanism to cause SEGKP
163 	 * pages to be unlocked, make them non-relocatable.
164 	 */
165 	if (SEG_IS_SEGKP(seg))
166 		flag_noreloc = PG_NORELOC;
167 	else
168 		flag_noreloc = 0;
169 
170 	if (protp != NULL)
171 		*protp = PROT_ALL;
172 
173 	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
174 
175 again:
176 	if (pp = page_lookup(vp, off, lock)) {
177 		/*
178 		 * In very rare instances, a segkp page may have been
179 		 * relocated outside of the kernel by the kernel cage
180 		 * due to the window between page_unlock() and
181 		 * VOP_PUTPAGE() in segkp_unlock().  Due to the
182 		 * rareness of these occurances, the solution is to
183 		 * relocate the page to a P_NORELOC page.
184 		 */
185 		if (flag_noreloc != 0) {
186 			if (!PP_ISNORELOC(pp) && kcage_on) {
187 				if (lock != SE_EXCL) {
188 					upgrade = 1;
189 					if (!page_tryupgrade(pp)) {
190 						page_unlock(pp);
191 						lock = SE_EXCL;
192 						goto again;
193 					}
194 				}
195 
196 				if (page_relocate_cage(&pp, &rpp) != 0)
197 					panic("swap_getapage: "
198 					    "page_relocate_cage failed");
199 
200 				pp = rpp;
201 			}
202 		}
203 
204 		if (pl) {
205 			if (upgrade)
206 				page_downgrade(pp);
207 
208 			pl[0] = pp;
209 			pl[1] = NULL;
210 		} else {
211 			page_unlock(pp);
212 		}
213 	} else {
214 		pp = page_create_va(vp, off, PAGESIZE,
215 		    PG_WAIT | PG_EXCL | flag_noreloc,
216 		    seg, addr);
217 		/*
218 		 * Someone raced in and created the page after we did the
219 		 * lookup but before we did the create, so go back and
220 		 * try to look it up again.
221 		 */
222 		if (pp == NULL)
223 			goto again;
224 		if (rw != S_CREATE) {
225 			err = swap_getphysname(vp, off, &pvp, &poff);
226 			if (pvp) {
227 				struct anon *ap;
228 				kmutex_t *ahm;
229 
230 				flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
231 				err = VOP_PAGEIO(pvp, pp, poff,
232 				    PAGESIZE, flags, cr, NULL);
233 
234 				if (!err) {
235 					ahm = AH_MUTEX(vp, off);
236 					mutex_enter(ahm);
237 
238 					ap = swap_anon(vp, off);
239 					if (ap == NULL) {
240 						panic("swap_getapage:"
241 						    " null anon");
242 					}
243 
244 					if (ap->an_pvp == pvp &&
245 					    ap->an_poff == poff) {
246 						swap_phys_free(pvp, poff,
247 						    PAGESIZE);
248 						ap->an_pvp = NULL;
249 						ap->an_poff = NULL;
250 						hat_setmod(pp);
251 					}
252 
253 					mutex_exit(ahm);
254 				}
255 			} else {
256 				if (!err)
257 					pagezero(pp, 0, PAGESIZE);
258 
259 				/*
260 				 * If it's a fault ahead, release page_io_lock
261 				 * and SE_EXCL we grabbed in page_create_va
262 				 *
263 				 * If we are here, we haven't called VOP_PAGEIO
264 				 * and thus calling pvn_read_done(pp, B_READ)
265 				 * below may mislead that we tried i/o. Besides,
266 				 * in case of async, pvn_read_done() should
267 				 * not be called by *getpage()
268 				 */
269 				if (pl == NULL) {
270 					/*
271 					 * swap_getphysname can return error
272 					 * only when we are getting called from
273 					 * swapslot_free which passes non-NULL
274 					 * pl to VOP_GETPAGE.
275 					 */
276 					ASSERT(err == 0);
277 					page_io_unlock(pp);
278 					page_unlock(pp);
279 				}
280 			}
281 		}
282 
283 		ASSERT(pp != NULL);
284 
285 		if (err && pl)
286 			pvn_read_done(pp, B_ERROR);
287 
288 		if (!err && pl)
289 			pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
290 	}
291 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
292 	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
293 	return (err);
294 }
295 
296 /*
297  * Called from large page anon routines only! This is an ugly hack where
298  * the anon layer directly calls into swapfs with a preallocated large page.
299  * Another method would have been to change to VOP and add an extra arg for
300  * the preallocated large page. This all could be cleaned up later when we
301  * solve the anonymous naming problem and no longer need to loop across of
302  * the VOP in PAGESIZE increments to fill in or initialize a large page as
303  * is done today. I think the latter is better since it avoid a change to
304  * the VOP interface that could later be avoided.
305  */
306 int
307 swap_getconpage(
308 	struct vnode *vp,
309 	u_offset_t off,
310 	size_t len,
311 	uint_t *protp,
312 	page_t *pl[],
313 	size_t plsz,
314 	page_t	*conpp,
315 	uint_t	*pszc,
316 	spgcnt_t *nreloc,
317 	struct seg *seg,
318 	caddr_t addr,
319 	enum seg_rw rw,
320 	struct cred *cr)
321 {
322 	struct page	*pp;
323 	int 		err = 0;
324 	struct vnode	*pvp = NULL;
325 	u_offset_t	poff;
326 
327 	ASSERT(len == PAGESIZE);
328 	ASSERT(pl != NULL);
329 	ASSERT(plsz == PAGESIZE);
330 	ASSERT(protp == NULL);
331 	ASSERT(nreloc != NULL);
332 	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
333 	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
334 	    vp, off, len, 0, 0);
335 
336 	/*
337 	 * If we are not using a preallocated page then we know one already
338 	 * exists. So just let the old code handle it.
339 	 */
340 	if (conpp == NULL) {
341 		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
342 		    seg, addr, rw, cr);
343 		return (err);
344 	}
345 	ASSERT(conpp->p_szc != 0);
346 	ASSERT(PAGE_EXCL(conpp));
347 
348 
349 	ASSERT(conpp->p_next == conpp);
350 	ASSERT(conpp->p_prev == conpp);
351 	ASSERT(!PP_ISAGED(conpp));
352 	ASSERT(!PP_ISFREE(conpp));
353 
354 	*nreloc = 0;
355 	pp = page_lookup_create(vp, off, SE_SHARED, conpp, nreloc, 0);
356 
357 	/*
358 	 * If existing page is found we may need to relocate.
359 	 */
360 	if (pp != conpp) {
361 		ASSERT(rw != S_CREATE);
362 		ASSERT(pszc != NULL);
363 		ASSERT(PAGE_SHARED(pp));
364 		if (pp->p_szc < conpp->p_szc) {
365 			*pszc = pp->p_szc;
366 			page_unlock(pp);
367 			err = -1;
368 		} else if (pp->p_szc > conpp->p_szc &&
369 		    seg->s_szc > conpp->p_szc) {
370 			*pszc = MIN(pp->p_szc, seg->s_szc);
371 			page_unlock(pp);
372 			err = -2;
373 		} else {
374 			pl[0] = pp;
375 			pl[1] = NULL;
376 			if (page_pptonum(pp) &
377 			    (page_get_pagecnt(conpp->p_szc) - 1))
378 				cmn_err(CE_PANIC, "swap_getconpage: no root");
379 		}
380 		return (err);
381 	}
382 
383 	ASSERT(PAGE_EXCL(pp));
384 
385 	if (*nreloc != 0) {
386 		ASSERT(rw != S_CREATE);
387 		pl[0] = pp;
388 		pl[1] = NULL;
389 		return (0);
390 	}
391 
392 	*nreloc = 1;
393 
394 	/*
395 	 * If necessary do the page io.
396 	 */
397 	if (rw != S_CREATE) {
398 		/*
399 		 * Since we are only called now on behalf of an
400 		 * address space operation it's impossible for
401 		 * us to fail unlike swap_getapge() which
402 		 * also gets called from swapslot_free().
403 		 */
404 		if (swap_getphysname(vp, off, &pvp, &poff)) {
405 			cmn_err(CE_PANIC,
406 			    "swap_getconpage: swap_getphysname failed!");
407 		}
408 
409 		if (pvp != NULL) {
410 			err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
411 			    cr, NULL);
412 			if (err == 0) {
413 				struct anon *ap;
414 				kmutex_t *ahm;
415 
416 				ahm = AH_MUTEX(vp, off);
417 				mutex_enter(ahm);
418 				ap = swap_anon(vp, off);
419 				if (ap == NULL)
420 					panic("swap_getconpage: null anon");
421 				if (ap->an_pvp != pvp || ap->an_poff != poff)
422 					panic("swap_getconpage: bad anon");
423 
424 				swap_phys_free(pvp, poff, PAGESIZE);
425 				ap->an_pvp = NULL;
426 				ap->an_poff = NULL;
427 				hat_setmod(pp);
428 				mutex_exit(ahm);
429 			}
430 		} else {
431 			pagezero(pp, 0, PAGESIZE);
432 		}
433 	}
434 
435 	/*
436 	 * Normally we would let pvn_read_done() destroy
437 	 * the page on IO error. But since this is a preallocated
438 	 * page we'll let the anon layer handle it.
439 	 */
440 	page_io_unlock(pp);
441 	if (err != 0)
442 		page_hashout(pp, NULL);
443 	ASSERT(pp->p_next == pp);
444 	ASSERT(pp->p_prev == pp);
445 
446 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
447 	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
448 
449 	pl[0] = pp;
450 	pl[1] = NULL;
451 	return (err);
452 }
453 
/*
 * Async putpage klustering stuff.
 *
 * sw_pending_size is compared against klustsize by swap_putpage() and
 * swap_putapage() to decide whether another (B_ASYNC | B_FREE) request
 * may be queued; it is not modified anywhere in this file.  The sw_*()
 * routines manage the async request structures and are defined
 * elsewhere.
 */
int sw_pending_size;
extern int klustsize;
extern struct async_reqs *sw_getreq();
extern void sw_putreq(struct async_reqs *);
extern void sw_putbackreq(struct async_reqs *);
extern struct async_reqs *sw_getfree();
extern void sw_putfree(struct async_reqs *);

/*
 * Statistics maintained by swap_putapage():
 *	swap_putpagecnt / swap_pagespushed - successful VOP_PAGEIO calls and
 *	    the pages they covered.
 *	swap_otherfail / swap_otherpages - klustering stopped because the
 *	    next page could not be locked or given a new physical name.
 *	swap_klustfail / swap_klustpages - klustering stopped because the
 *	    new swap slot was not adjacent to the current kluster.
 *	swap_getiofail / swap_getiopages - klustering stopped because no
 *	    queued request was available.
 * The *pages counters accumulate the kluster size, in pages, at that
 * point.
 */
static size_t swap_putpagecnt, swap_pagespushed;
static size_t swap_otherfail, swap_otherpages;
static size_t swap_klustfail, swap_klustpages;
static size_t swap_getiofail, swap_getiopages;

/*
 * Notes on the arguments of swap_putpage():
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int swap_nopage = 0;	/* Don't do swap_putpage's if set */
473 
474 /* ARGSUSED */
475 static int
476 swap_putpage(
477 	struct vnode *vp,
478 	offset_t off,
479 	size_t len,
480 	int flags,
481 	struct cred *cr,
482 	caller_context_t *ct)
483 {
484 	page_t *pp;
485 	u_offset_t io_off;
486 	size_t io_len = 0;
487 	int err = 0;
488 	int nowait;
489 	struct async_reqs *arg;
490 
491 	if (swap_nopage)
492 		return (0);
493 
494 	ASSERT(vp->v_count != 0);
495 
496 	nowait = flags & B_PAGE_NOWAIT;
497 
498 	/*
499 	 * Clear force flag so that p_lckcnt pages are not invalidated.
500 	 */
501 	flags &= ~(B_FORCE | B_PAGE_NOWAIT);
502 
503 	SWAPFS_PRINT(SWAP_VOPS,
504 	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
505 	    (void *)vp, off, len, flags, 0);
506 	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
507 	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);
508 
509 	if (vp->v_flag & VNOMAP)
510 		return (ENOSYS);
511 
512 	if (!vn_has_cached_data(vp))
513 		return (0);
514 
515 	if (len == 0) {
516 		if (curproc == proc_pageout)
517 			cmn_err(CE_PANIC, "swapfs: pageout can't block");
518 
519 		/* Search the entire vp list for pages >= off. */
520 		err = pvn_vplist_dirty(vp, (u_offset_t)off, swap_putapage,
521 		    flags, cr);
522 	} else {
523 		u_offset_t eoff;
524 
525 		/*
526 		 * Loop over all offsets in the range [off...off + len]
527 		 * looking for pages to deal with.
528 		 */
529 		eoff = off + len;
530 		for (io_off = (u_offset_t)off; io_off < eoff;
531 		    io_off += io_len) {
532 			/*
533 			 * If we run out of the async req slot, put the page
534 			 * now instead of queuing.
535 			 */
536 			if (flags == (B_ASYNC | B_FREE) &&
537 			    sw_pending_size < klustsize &&
538 			    (arg = sw_getfree())) {
539 				/*
540 				 * If we are clustering, we should allow
541 				 * pageout to feed us more pages because # of
542 				 * pushes is limited by # of I/Os, and one
543 				 * cluster is considered to be one I/O.
544 				 */
545 				if (pushes)
546 					pushes--;
547 
548 				arg->a_vp = vp;
549 				arg->a_off = io_off;
550 				arg->a_len = PAGESIZE;
551 				arg->a_flags = B_ASYNC | B_FREE;
552 				arg->a_cred = kcred;
553 				sw_putreq(arg);
554 				io_len = PAGESIZE;
555 				continue;
556 			}
557 			/*
558 			 * If we are not invalidating pages, use the
559 			 * routine page_lookup_nowait() to prevent
560 			 * reclaiming them from the free list.
561 			 */
562 			if (!nowait && ((flags & B_INVAL) ||
563 			    (flags & (B_ASYNC | B_FREE)) == B_FREE))
564 				pp = page_lookup(vp, io_off, SE_EXCL);
565 			else
566 				pp = page_lookup_nowait(vp, io_off,
567 				    (flags & (B_FREE | B_INVAL)) ?
568 				    SE_EXCL : SE_SHARED);
569 
570 			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
571 				io_len = PAGESIZE;
572 			else {
573 				err = swap_putapage(vp, pp, &io_off, &io_len,
574 				    flags, cr);
575 				if (err != 0)
576 					break;
577 			}
578 		}
579 	}
580 	/* If invalidating, verify all pages on vnode list are gone. */
581 	if (err == 0 && off == 0 && len == 0 &&
582 	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
583 		cmn_err(CE_WARN,
584 		    "swap_putpage: B_INVAL, pages not gone");
585 	}
586 	return (err);
587 }
588 
589 /*
590  * Write out a single page.
591  * For swapfs this means choose a physical swap slot and write the page
592  * out using VOP_PAGEIO.
593  * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
594  * swapfs pages, a bunch of contiguous swap slots and then write them
595  * all out in one clustered i/o.
596  */
597 /*ARGSUSED*/
598 static int
599 swap_putapage(
600 	struct vnode *vp,
601 	page_t *pp,
602 	u_offset_t *offp,
603 	size_t *lenp,
604 	int flags,
605 	struct cred *cr)
606 {
607 	int err;
608 	struct vnode *pvp;
609 	u_offset_t poff, off;
610 	u_offset_t doff;
611 	size_t dlen;
612 	size_t klsz = 0;
613 	u_offset_t klstart = 0;
614 	struct vnode *klvp = NULL;
615 	page_t *pplist;
616 	se_t se;
617 	struct async_reqs *arg;
618 	size_t swap_klustsize;
619 
620 	/*
621 	 * This check is added for callers who access swap_putpage with len = 0.
622 	 * swap_putpage calls swap_putapage page-by-page via pvn_vplist_dirty.
623 	 * And it's necessary to do the same queuing if users have the same
624 	 * B_ASYNC|B_FREE flags on.
625 	 */
626 	if (flags == (B_ASYNC | B_FREE) &&
627 	    sw_pending_size < klustsize && (arg = sw_getfree())) {
628 
629 		hat_setmod(pp);
630 		page_io_unlock(pp);
631 		page_unlock(pp);
632 
633 		arg->a_vp = vp;
634 		arg->a_off = pp->p_offset;
635 		arg->a_len = PAGESIZE;
636 		arg->a_flags = B_ASYNC | B_FREE;
637 		arg->a_cred = kcred;
638 		sw_putreq(arg);
639 
640 		return (0);
641 	}
642 
643 	SWAPFS_PRINT(SWAP_PUTP,
644 	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
645 	    pp, vp, pp->p_offset, flags, 0);
646 
647 	ASSERT(PAGE_LOCKED(pp));
648 
649 	off = pp->p_offset;
650 
651 	doff = off;
652 	dlen = PAGESIZE;
653 
654 	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
655 		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);
656 		hat_setmod(pp);
657 		page_io_unlock(pp);
658 		page_unlock(pp);
659 		goto out;
660 	}
661 
662 	klvp = pvp;
663 	klstart = poff;
664 	pplist = pp;
665 	/*
666 	 * If this is ASYNC | FREE and we've accumulated a bunch of such
667 	 * pending requests, kluster.
668 	 */
669 	if (flags == (B_ASYNC | B_FREE))
670 		swap_klustsize = klustsize;
671 	else
672 		swap_klustsize = PAGESIZE;
673 	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);
674 	klsz = PAGESIZE;
675 	while (klsz < swap_klustsize) {
676 		if ((arg = sw_getreq()) == NULL) {
677 			swap_getiofail++;
678 			swap_getiopages += btop(klsz);
679 			break;
680 		}
681 		ASSERT(vn_matchops(arg->a_vp, swap_vnodeops));
682 		vp = arg->a_vp;
683 		off = arg->a_off;
684 
685 		if ((pp = page_lookup_nowait(vp, off, se)) == NULL) {
686 			swap_otherfail++;
687 			swap_otherpages += btop(klsz);
688 			sw_putfree(arg);
689 			break;
690 		}
691 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {
692 			sw_putfree(arg);
693 			continue;
694 		}
695 		/* Get new physical backing store for the page */
696 		doff = off;
697 		dlen = PAGESIZE;
698 		if (err = swap_newphysname(vp, off, &doff, &dlen,
699 		    &pvp, &poff)) {
700 			swap_otherfail++;
701 			swap_otherpages += btop(klsz);
702 			hat_setmod(pp);
703 			page_io_unlock(pp);
704 			page_unlock(pp);
705 			sw_putbackreq(arg);
706 			break;
707 		}
708 		/* Try to cluster new physical name with previous ones */
709 		if (klvp == pvp && poff == klstart + klsz) {
710 			klsz += PAGESIZE;
711 			page_add(&pplist, pp);
712 			pplist = pplist->p_next;
713 			sw_putfree(arg);
714 		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
715 			klsz += PAGESIZE;
716 			klstart -= PAGESIZE;
717 			page_add(&pplist, pp);
718 			sw_putfree(arg);
719 		} else {
720 			swap_klustfail++;
721 			swap_klustpages += btop(klsz);
722 			hat_setmod(pp);
723 			page_io_unlock(pp);
724 			page_unlock(pp);
725 			sw_putbackreq(arg);
726 			break;
727 		}
728 	}
729 
730 	err = VOP_PAGEIO(klvp, pplist, klstart, klsz,
731 	    B_WRITE | flags, cr, NULL);
732 
733 	if ((flags & B_ASYNC) == 0)
734 		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
735 
736 	/* Statistics */
737 	if (!err) {
738 		swap_putpagecnt++;
739 		swap_pagespushed += btop(klsz);
740 	}
741 out:
742 	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
743 	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
744 	    vp, klvp, klstart, klsz);
745 	if (err && err != ENOMEM)
746 		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
747 	if (lenp)
748 		*lenp = PAGESIZE;
749 	return (err);
750 }
751 
752 static void
753 swap_dispose(
754 	vnode_t *vp,
755 	page_t *pp,
756 	int fl,
757 	int dn,
758 	cred_t *cr,
759 	caller_context_t *ct)
760 {
761 	int err;
762 	u_offset_t off = pp->p_offset;
763 	vnode_t *pvp;
764 	u_offset_t poff;
765 
766 	ASSERT(PAGE_EXCL(pp));
767 
768 	/*
769 	 * The caller will free/invalidate large page in one shot instead of
770 	 * one small page at a time.
771 	 */
772 	if (pp->p_szc != 0) {
773 		page_unlock(pp);
774 		return;
775 	}
776 
777 	err = swap_getphysname(vp, off, &pvp, &poff);
778 	if (!err && pvp != NULL)
779 		VOP_DISPOSE(pvp, pp, fl, dn, cr, ct);
780 	else
781 		fs_dispose(vp, pp, fl, dn, cr, ct);
782 }
783