xref: /freebsd/sys/ufs/ffs/ffs_vnops.c (revision f9fd7337f63698f33239c58c07bf430198235a22)
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
3  *
4  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
5  * All rights reserved.
6  *
7  * This software was developed for the FreeBSD Project by Marshall
8  * Kirk McKusick and Network Associates Laboratories, the Security
9  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
11  * research program
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1982, 1986, 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
62  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
63  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
64  */
65 
66 #include <sys/cdefs.h>
67 __FBSDID("$FreeBSD$");
68 
69 #include <sys/param.h>
70 #include <sys/bio.h>
71 #include <sys/systm.h>
72 #include <sys/buf.h>
73 #include <sys/conf.h>
74 #include <sys/extattr.h>
75 #include <sys/kernel.h>
76 #include <sys/limits.h>
77 #include <sys/malloc.h>
78 #include <sys/mount.h>
79 #include <sys/priv.h>
80 #include <sys/rwlock.h>
81 #include <sys/stat.h>
82 #include <sys/sysctl.h>
83 #include <sys/vmmeter.h>
84 #include <sys/vnode.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_extern.h>
89 #include <vm/vm_object.h>
90 #include <vm/vm_page.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vnode_pager.h>
93 
94 #include <ufs/ufs/extattr.h>
95 #include <ufs/ufs/quota.h>
96 #include <ufs/ufs/inode.h>
97 #include <ufs/ufs/ufs_extern.h>
98 #include <ufs/ufs/ufsmount.h>
99 
100 #include <ufs/ffs/fs.h>
101 #include <ufs/ffs/ffs_extern.h>
102 #include "opt_directio.h"
103 #include "opt_ffs.h"
104 
/*
 * Evaluates to true when ptr meets the alignment requirement of s.  s is
 * handed straight to _Alignof; the mask test works because alignment values
 * are powers of two.
 */
#define	ALIGNED_TO(ptr, s)	\
	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
107 
/*
 * Forward declarations.  ffs_rawread() is defined outside this file and is
 * only declared (and called) when the kernel is built with DIRECTIO.  The
 * static entries are the vnode operations and helpers private to this file;
 * ffs_unlock_debug exists only in INVARIANTS kernels.
 */
#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t	ffs_fdatasync;
static vop_fsync_t	ffs_fsync;
static vop_getpages_t	ffs_getpages;
static vop_getpages_async_t	ffs_getpages_async;
static vop_lock1_t	ffs_lock;
#ifdef INVARIANTS
static vop_unlock_t	ffs_unlock_debug;
#endif
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;
132 
/*
 * Global vfs data structures for ufs.
 *
 * ffs_vnodeops1: vnode operations vector whose FFS-specific entries are
 * listed here, with ufs_vnodeops named as the default vector for everything
 * else.  It has no extended attribute entries, unlike ffs_vnodeops2.
 * NOTE(review): presumably the vector for UFS1-format vnodes -- confirm
 * against the code that assigns it.
 */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	/* Unlock is only overridden to add the debugging check. */
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
150 
/*
 * ffs_fifoops1: operations vector that names ufs_fifoops as its default and
 * overrides only the sync, lock and file-handle entries (plus the debugging
 * unlock in INVARIANTS kernels).  It has no extended attribute entries.
 */
struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
162 
/*
 * Global vfs data structures for ufs.
 *
 * ffs_vnodeops2: same entries as ffs_vnodeops1 plus the extended attribute
 * operations (open/close/get/set/list/delete); ufs_vnodeops is the default
 * vector for everything not listed.
 * NOTE(review): presumably the vector for UFS2-format vnodes -- confirm
 * against the code that assigns it.
 */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	/* Unlock is only overridden to add the debugging check. */
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
186 
/*
 * ffs_fifoops2: operations vector that names ufs_fifoops as its default and
 * adds the extended attribute operations.  It is the only vector in this
 * group that installs ffsext_strategy as its strategy routine.
 */
struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
206 
207 /*
208  * Synch an open file.
209  */
210 /* ARGSUSED */
211 static int
212 ffs_fsync(struct vop_fsync_args *ap)
213 {
214 	struct vnode *vp;
215 	struct bufobj *bo;
216 	int error;
217 
218 	vp = ap->a_vp;
219 	bo = &vp->v_bufobj;
220 retry:
221 	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
222 	if (error)
223 		return (error);
224 	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
225 		error = softdep_fsync(vp);
226 		if (error)
227 			return (error);
228 
229 		/*
230 		 * The softdep_fsync() function may drop vp lock,
231 		 * allowing for dirty buffers to reappear on the
232 		 * bo_dirty list. Recheck and resync as needed.
233 		 */
234 		BO_LOCK(bo);
235 		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
236 		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
237 			BO_UNLOCK(bo);
238 			goto retry;
239 		}
240 		BO_UNLOCK(bo);
241 	}
242 	if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), 0))
243 		return (ENXIO);
244 	return (0);
245 }
246 
247 int
248 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
249 {
250 	struct inode *ip;
251 	struct bufobj *bo;
252 	struct ufsmount *ump;
253 	struct buf *bp, *nbp;
254 	ufs_lbn_t lbn;
255 	int error, passes;
256 	bool still_dirty, wait;
257 
258 	ip = VTOI(vp);
259 	ip->i_flag &= ~IN_NEEDSYNC;
260 	bo = &vp->v_bufobj;
261 	ump = VFSTOUFS(vp->v_mount);
262 
263 	/*
264 	 * When doing MNT_WAIT we must first flush all dependencies
265 	 * on the inode.
266 	 */
267 	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
268 	    (error = softdep_sync_metadata(vp)) != 0) {
269 		if (ffs_fsfail_cleanup(ump, error))
270 			error = 0;
271 		return (error);
272 	}
273 
274 	/*
275 	 * Flush all dirty buffers associated with a vnode.
276 	 */
277 	error = 0;
278 	passes = 0;
279 	wait = false;	/* Always do an async pass first. */
280 	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
281 	BO_LOCK(bo);
282 loop:
283 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
284 		bp->b_vflags &= ~BV_SCANNED;
285 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
286 		/*
287 		 * Reasons to skip this buffer: it has already been considered
288 		 * on this pass, the buffer has dependencies that will cause
289 		 * it to be redirtied and it has not already been deferred,
290 		 * or it is already being written.
291 		 */
292 		if ((bp->b_vflags & BV_SCANNED) != 0)
293 			continue;
294 		bp->b_vflags |= BV_SCANNED;
295 		/*
296 		 * Flush indirects in order, if requested.
297 		 *
298 		 * Note that if only datasync is requested, we can
299 		 * skip indirect blocks when softupdates are not
300 		 * active.  Otherwise we must flush them with data,
301 		 * since dependencies prevent data block writes.
302 		 */
303 		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
304 		    (lbn_level(bp->b_lblkno) >= passes ||
305 		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
306 			continue;
307 		if (bp->b_lblkno > lbn)
308 			panic("ffs_syncvnode: syncing truncated data.");
309 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
310 			BO_UNLOCK(bo);
311 		} else if (wait) {
312 			if (BUF_LOCK(bp,
313 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
314 			    BO_LOCKPTR(bo)) != 0) {
315 				bp->b_vflags &= ~BV_SCANNED;
316 				goto next;
317 			}
318 		} else
319 			continue;
320 		if ((bp->b_flags & B_DELWRI) == 0)
321 			panic("ffs_fsync: not dirty");
322 		/*
323 		 * Check for dependencies and potentially complete them.
324 		 */
325 		if (!LIST_EMPTY(&bp->b_dep) &&
326 		    (error = softdep_sync_buf(vp, bp,
327 		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
328 			/* I/O error. */
329 			if (error != EBUSY) {
330 				BUF_UNLOCK(bp);
331 				return (error);
332 			}
333 			/* If we deferred once, don't defer again. */
334 		    	if ((bp->b_flags & B_DEFERRED) == 0) {
335 				bp->b_flags |= B_DEFERRED;
336 				BUF_UNLOCK(bp);
337 				goto next;
338 			}
339 		}
340 		if (wait) {
341 			bremfree(bp);
342 			error = bwrite(bp);
343 			if (ffs_fsfail_cleanup(ump, error))
344 				error = 0;
345 			if (error != 0)
346 				return (error);
347 		} else if ((bp->b_flags & B_CLUSTEROK)) {
348 			(void) vfs_bio_awrite(bp);
349 		} else {
350 			bremfree(bp);
351 			(void) bawrite(bp);
352 		}
353 next:
354 		/*
355 		 * Since we may have slept during the I/O, we need
356 		 * to start from a known point.
357 		 */
358 		BO_LOCK(bo);
359 		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
360 	}
361 	if (waitfor != MNT_WAIT) {
362 		BO_UNLOCK(bo);
363 		if ((flags & NO_INO_UPDT) != 0)
364 			return (0);
365 		else
366 			return (ffs_update(vp, 0));
367 	}
368 	/* Drain IO to see if we're done. */
369 	bufobj_wwait(bo, 0, 0);
370 	/*
371 	 * Block devices associated with filesystems may have new I/O
372 	 * requests posted for them even if the vnode is locked, so no
373 	 * amount of trying will get them clean.  We make several passes
374 	 * as a best effort.
375 	 *
376 	 * Regular files may need multiple passes to flush all dependency
377 	 * work as it is possible that we must write once per indirect
378 	 * level, once for the leaf, and once for the inode and each of
379 	 * these will be done with one sync and one async pass.
380 	 */
381 	if (bo->bo_dirty.bv_cnt > 0) {
382 		if ((flags & DATA_ONLY) == 0) {
383 			still_dirty = true;
384 		} else {
385 			/*
386 			 * For data-only sync, dirty indirect buffers
387 			 * are ignored.
388 			 */
389 			still_dirty = false;
390 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
391 				if (bp->b_lblkno > -UFS_NDADDR) {
392 					still_dirty = true;
393 					break;
394 				}
395 			}
396 		}
397 
398 		if (still_dirty) {
399 			/* Write the inode after sync passes to flush deps. */
400 			if (wait && DOINGSOFTDEP(vp) &&
401 			    (flags & NO_INO_UPDT) == 0) {
402 				BO_UNLOCK(bo);
403 				ffs_update(vp, 1);
404 				BO_LOCK(bo);
405 			}
406 			/* switch between sync/async. */
407 			wait = !wait;
408 			if (wait || ++passes < UFS_NIADDR + 2)
409 				goto loop;
410 		}
411 	}
412 	BO_UNLOCK(bo);
413 	error = 0;
414 	if ((flags & DATA_ONLY) == 0) {
415 		if ((flags & NO_INO_UPDT) == 0)
416 			error = ffs_update(vp, 1);
417 		if (DOINGSUJ(vp))
418 			softdep_journal_fsync(VTOI(vp));
419 	} else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
420 		error = ffs_update(vp, 1);
421 	}
422 	return (error);
423 }
424 
425 static int
426 ffs_fdatasync(struct vop_fdatasync_args *ap)
427 {
428 
429 	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
430 }
431 
432 static int
433 ffs_lock(ap)
434 	struct vop_lock1_args /* {
435 		struct vnode *a_vp;
436 		int a_flags;
437 		struct thread *a_td;
438 		char *file;
439 		int line;
440 	} */ *ap;
441 {
442 #ifndef NO_FFS_SNAPSHOT
443 	struct vnode *vp;
444 	int flags;
445 	struct lock *lkp;
446 	int result;
447 
448 	/*
449 	 * Adaptive spinning mixed with SU leads to trouble. use a giant hammer
450 	 * and only use it when LK_NODDLKTREAT is set. Currently this means it
451 	 * is only used during path lookup.
452 	 */
453 	if ((ap->a_flags & LK_NODDLKTREAT) != 0)
454 		ap->a_flags |= LK_ADAPTIVE;
455 	switch (ap->a_flags & LK_TYPE_MASK) {
456 	case LK_SHARED:
457 	case LK_UPGRADE:
458 	case LK_EXCLUSIVE:
459 		vp = ap->a_vp;
460 		flags = ap->a_flags;
461 		for (;;) {
462 #ifdef DEBUG_VFS_LOCKS
463 			VNPASS(vp->v_holdcnt != 0, vp);
464 #endif
465 			lkp = vp->v_vnlock;
466 			result = lockmgr_lock_flags(lkp, flags,
467 			    &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line);
468 			if (lkp == vp->v_vnlock || result != 0)
469 				break;
470 			/*
471 			 * Apparent success, except that the vnode
472 			 * mutated between snapshot file vnode and
473 			 * regular file vnode while this process
474 			 * slept.  The lock currently held is not the
475 			 * right lock.  Release it, and try to get the
476 			 * new lock.
477 			 */
478 			lockmgr_unlock(lkp);
479 			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
480 			    (LK_INTERLOCK | LK_NOWAIT))
481 				return (EBUSY);
482 			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
483 				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
484 			flags &= ~LK_INTERLOCK;
485 		}
486 		break;
487 	default:
488 		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
489 	}
490 	return (result);
491 #else
492 	/*
493 	 * See above for an explanation.
494 	 */
495 	if ((ap->a_flags & LK_NODDLKTREAT) != 0)
496 		ap->a_flags |= LK_ADAPTIVE;
497 	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
498 #endif
499 }
500 
#ifdef INVARIANTS
/*
 * Debugging wrapper for the UFS unlock operation (INVARIANTS kernels only).
 *
 * Before unlocking, asserts that an inode carrying any of the
 * UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE bits has its vnode marked
 * VMP_LAZYLIST.  The unlocked test is repeated under the vnode interlock
 * before the assertion can fire.
 */
static int
ffs_unlock_debug(struct vop_unlock_args *ap)
{
	struct vnode *vp;
	struct inode *ip;

	vp = ap->a_vp;
	ip = VTOI(vp);
	if ((ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) != 0 &&
	    (vp->v_mflag & VMP_LAZYLIST) == 0) {
		VI_LOCK(vp);
		VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
		    ("%s: modified vnode (%x) not on lazy list",
		    __func__, ip->i_flag));
		VI_UNLOCK(vp);
	}
	return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
}
#endif
520 
521 static int
522 ffs_read_hole(struct uio *uio, long xfersize, long *size)
523 {
524 	ssize_t saved_resid, tlen;
525 	int error;
526 
527 	while (xfersize > 0) {
528 		tlen = min(xfersize, ZERO_REGION_SIZE);
529 		saved_resid = uio->uio_resid;
530 		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
531 		    tlen, uio);
532 		if (error != 0)
533 			return (error);
534 		tlen = saved_resid - uio->uio_resid;
535 		xfersize -= tlen;
536 		*size -= tlen;
537 	}
538 	return (0);
539 }
540 
/*
 * Vnode op for reading.
 *
 * Copies data from the file behind ap->a_vp into ap->a_uio, at most one
 * file system block per loop iteration, until the request is satisfied, the
 * end of the file is reached, or an error occurs.  ap->a_ioflag carries the
 * IO_* flags with the sequential-access count in its upper bits.
 *
 * Returns 0 or an errno value (EOVERFLOW when the starting offset is inside
 * the file but at or beyond fs_maxfilesize).  IN_ACCESS is set on the inode
 * when the read made progress or ended without error, unless the mount is
 * MNT_NOATIME or MNT_RDONLY.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int bflag, error, ioflag, seqcount;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	/*
	 * Extended attribute reads are not routed through here yet: the
	 * statement controlled by this "if" is chosen by the preprocessor
	 * and is a panic unless "notyet" is defined.
	 */
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	/*
	 * Give the raw read path the first try for IO_DIRECT requests; we
	 * are finished if it failed or reports that it did the work.
	 */
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return error;
	}
#endif

	/* The sequential-access hint lives above IO_SEQSHIFT in the flags. */
	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d",  vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	/*
	 * Always ask for unmapped buffers; GB_NOSPARSE is added for every
	 * request except UIO_NOCOPY ones.
	 */
	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, bflag, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = blksize(fs, ip, nextlbn);
			error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		}
		/*
		 * EJUSTRETURN from the read routines is handled as a hole:
		 * zero-fill this part of the request and move on to the next
		 * block.  NOTE(review): presumably this is how GB_NOSPARSE
		 * reports an unallocated block -- confirm in the buffer
		 * cache code.
		 */
		if (error == EJUSTRETURN) {
			error = ffs_read_hole(uio, xfersize, &size);
			if (error == 0)
				continue;
		}
		if (error != 0) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		/* Copy via the mapping when there is one, else page by page. */
		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		if (error)
			break;

		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * This can only happen in the case of an error
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it.
	 * so it must have come from a 'break' statement
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
		UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
	return (error);
}
727 
/*
 * Vnode op for writing.
 *
 * Copies data from ap->a_uio into the file behind ap->a_vp, at most one
 * file system block per loop iteration, allocating blocks through
 * UFS_BALLOC() and growing i_size as needed.  ap->a_ioflag carries the IO_*
 * flags with the sequential-access count in its upper bits; ap->a_cred is
 * used for allocation and for the setuid/setgid privilege check.
 *
 * Returns 0 or an errno value: EPERM for a non-appending write to an inode
 * whose file flags include APPEND, EFBIG when the write would exceed
 * fs_maxfilesize or the file size resource limit, otherwise the first error
 * from allocation, copying or the final inode update.  With IO_UNIT a
 * failed write is undone by truncating back to the original size and
 * restoring the uio.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	/*
	 * Extended attribute writes are not routed through here yet: the
	 * statement controlled by this "if" is chosen by the preprocessor
	 * and is a panic unless "notyet" is defined.
	 */
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	/* The sequential-access hint lives above IO_SEQSHIFT in the flags. */
	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		/*
		 * IO_APPEND moves the write to the current end of file; an
		 * inode flagged APPEND accepts writes only at that offset.
		 */
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
			(int)uio->uio_offset,
			(int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	/* Remember the starting state so a failed write can be undone. */
	resid = uio->uio_resid;
	osize = ip->i_size;
	/* Clamp the sequential hint before packing it into the flags. */
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;
	flags |= BA_UNMAPPED;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		/* Tell the pager about the new size before allocating. */
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			/* Put the pager size back to the inode's size. */
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		/* Grow the in-core and on-disk inode size for an extension. */
		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
			UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
		}

		/* Never copy more than the buffer actually holds. */
		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		/* Copy via the mapping when there is one, else page by page. */
		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			/*
			 * NOTE(review): the bwrite() result is discarded
			 * here; confirm that is intended.
			 */
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount,
				    GB_UNMAPPED);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		/* The buffer has been handed off; now act on a copy error. */
		if (error || xfersize == 0)
			break;
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
			vn_seqc_write_begin(vp);
			UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
			DIP_SET(ip, i_mode, ip->i_mode);
			vn_seqc_write_end(vp);
		}
	}
	if (error) {
		/*
		 * IO_UNIT: make the failed write look as if it never
		 * started by truncating back to the original size and
		 * restoring the uio offset and residual.
		 */
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
		/* Synchronous write that moved data: push the inode too. */
		error = ffs_update(vp, 1);
		if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error))
			error = ENXIO;
	}
	return (error);
}
938 
939 /*
940  * Extended attribute area reading.
941  */
942 static int
943 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
944 {
945 	struct inode *ip;
946 	struct ufs2_dinode *dp;
947 	struct fs *fs;
948 	struct buf *bp;
949 	ufs_lbn_t lbn, nextlbn;
950 	off_t bytesinfile;
951 	long size, xfersize, blkoffset;
952 	ssize_t orig_resid;
953 	int error;
954 
955 	ip = VTOI(vp);
956 	fs = ITOFS(ip);
957 	dp = ip->i_din2;
958 
959 #ifdef INVARIANTS
960 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
961 		panic("ffs_extread: mode");
962 
963 #endif
964 	orig_resid = uio->uio_resid;
965 	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
966 	if (orig_resid == 0)
967 		return (0);
968 	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
969 
970 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
971 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
972 			break;
973 		lbn = lblkno(fs, uio->uio_offset);
974 		nextlbn = lbn + 1;
975 
976 		/*
977 		 * size of buffer.  The buffer representing the
978 		 * end of the file is rounded up to the size of
979 		 * the block type ( fragment or full block,
980 		 * depending ).
981 		 */
982 		size = sblksize(fs, dp->di_extsize, lbn);
983 		blkoffset = blkoff(fs, uio->uio_offset);
984 
985 		/*
986 		 * The amount we want to transfer in this iteration is
987 		 * one FS block less the amount of the data before
988 		 * our startpoint (duh!)
989 		 */
990 		xfersize = fs->fs_bsize - blkoffset;
991 
992 		/*
993 		 * But if we actually want less than the block,
994 		 * or the file doesn't have a whole block more of data,
995 		 * then use the lesser number.
996 		 */
997 		if (uio->uio_resid < xfersize)
998 			xfersize = uio->uio_resid;
999 		if (bytesinfile < xfersize)
1000 			xfersize = bytesinfile;
1001 
1002 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1003 			/*
1004 			 * Don't do readahead if this is the end of the info.
1005 			 */
1006 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1007 		} else {
1008 			/*
1009 			 * If we have a second block, then
1010 			 * fire off a request for a readahead
1011 			 * as well as a read. Note that the 4th and 5th
1012 			 * arguments point to arrays of the size specified in
1013 			 * the 6th argument.
1014 			 */
1015 			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1016 
1017 			nextlbn = -1 - nextlbn;
1018 			error = breadn(vp, -1 - lbn,
1019 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1020 		}
1021 		if (error) {
1022 			brelse(bp);
1023 			bp = NULL;
1024 			break;
1025 		}
1026 
1027 		/*
1028 		 * We should only get non-zero b_resid when an I/O error
1029 		 * has occurred, which should cause us to break above.
1030 		 * However, if the short read did not cause an error,
1031 		 * then we want to ensure that we do not uiomove bad
1032 		 * or uninitialized data.
1033 		 */
1034 		size -= bp->b_resid;
1035 		if (size < xfersize) {
1036 			if (size == 0)
1037 				break;
1038 			xfersize = size;
1039 		}
1040 
1041 		error = uiomove((char *)bp->b_data + blkoffset,
1042 					(int)xfersize, uio);
1043 		if (error)
1044 			break;
1045 		vfs_bio_brelse(bp, ioflag);
1046 	}
1047 
1048 	/*
1049 	 * This can only happen in the case of an error
1050 	 * because the loop above resets bp to NULL on each iteration
1051 	 * and on normal completion has not set a new value into it.
1052 	 * so it must have come from a 'break' statement
1053 	 */
1054 	if (bp != NULL)
1055 		vfs_bio_brelse(bp, ioflag);
1056 	return (error);
1057 }
1058 
1059 /*
1060  * Extended attribute area writing.
1061  */
1062 static int
1063 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1064 {
1065 	struct inode *ip;
1066 	struct ufs2_dinode *dp;
1067 	struct fs *fs;
1068 	struct buf *bp;
1069 	ufs_lbn_t lbn;
1070 	off_t osize;
1071 	ssize_t resid;
1072 	int blkoffset, error, flags, size, xfersize;
1073 
1074 	ip = VTOI(vp);
1075 	fs = ITOFS(ip);
1076 	dp = ip->i_din2;
1077 
1078 #ifdef INVARIANTS
1079 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1080 		panic("ffs_extwrite: mode");
1081 #endif
1082 
1083 	if (ioflag & IO_APPEND)
1084 		uio->uio_offset = dp->di_extsize;
1085 	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1086 	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1087 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
1088 	    UFS_NXADDR * fs->fs_bsize)
1089 		return (EFBIG);
1090 
1091 	resid = uio->uio_resid;
1092 	osize = dp->di_extsize;
1093 	flags = IO_EXT;
1094 	if (ioflag & IO_SYNC)
1095 		flags |= IO_SYNC;
1096 
1097 	for (error = 0; uio->uio_resid > 0;) {
1098 		lbn = lblkno(fs, uio->uio_offset);
1099 		blkoffset = blkoff(fs, uio->uio_offset);
1100 		xfersize = fs->fs_bsize - blkoffset;
1101 		if (uio->uio_resid < xfersize)
1102 			xfersize = uio->uio_resid;
1103 
1104 		/*
1105 		 * We must perform a read-before-write if the transfer size
1106 		 * does not cover the entire buffer.
1107 		 */
1108 		if (fs->fs_bsize > xfersize)
1109 			flags |= BA_CLRBUF;
1110 		else
1111 			flags &= ~BA_CLRBUF;
1112 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1113 		    ucred, flags, &bp);
1114 		if (error != 0)
1115 			break;
1116 		/*
1117 		 * If the buffer is not valid we have to clear out any
1118 		 * garbage data from the pages instantiated for the buffer.
1119 		 * If we do not, a failed uiomove() during a write can leave
1120 		 * the prior contents of the pages exposed to a userland
1121 		 * mmap().  XXX deal with uiomove() errors a better way.
1122 		 */
1123 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1124 			vfs_bio_clrbuf(bp);
1125 
1126 		if (uio->uio_offset + xfersize > dp->di_extsize) {
1127 			dp->di_extsize = uio->uio_offset + xfersize;
1128 			UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
1129 		}
1130 
1131 		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1132 		if (size < xfersize)
1133 			xfersize = size;
1134 
1135 		error =
1136 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1137 
1138 		vfs_bio_set_flags(bp, ioflag);
1139 
1140 		/*
1141 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
1142 		 * if we have a severe page deficiency write the buffer
1143 		 * asynchronously.  Otherwise try to cluster, and if that
1144 		 * doesn't do it then either do an async write (if O_DIRECT),
1145 		 * or a delayed write (if not).
1146 		 */
1147 		if (ioflag & IO_SYNC) {
1148 			(void)bwrite(bp);
1149 		} else if (vm_page_count_severe() ||
1150 			    buf_dirty_count_severe() ||
1151 			    xfersize + blkoffset == fs->fs_bsize ||
1152 			    (ioflag & (IO_ASYNC | IO_DIRECT)))
1153 			bawrite(bp);
1154 		else
1155 			bdwrite(bp);
1156 		if (error || xfersize == 0)
1157 			break;
1158 		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1159 	}
1160 	/*
1161 	 * If we successfully wrote any data, and we are not the superuser
1162 	 * we clear the setuid and setgid bits as a precaution against
1163 	 * tampering.
1164 	 */
1165 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1166 		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1167 			vn_seqc_write_begin(vp);
1168 			UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1169 			dp->di_mode = ip->i_mode;
1170 			vn_seqc_write_end(vp);
1171 		}
1172 	}
1173 	if (error) {
1174 		if (ioflag & IO_UNIT) {
1175 			(void)ffs_truncate(vp, osize,
1176 			    IO_EXT | (ioflag&IO_SYNC), ucred);
1177 			uio->uio_offset -= resid - uio->uio_resid;
1178 			uio->uio_resid = resid;
1179 		}
1180 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1181 		error = ffs_update(vp, 1);
1182 	return (error);
1183 }
1184 
1185 /*
1186  * Vnode operating to retrieve a named extended attribute.
1187  *
1188  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1189  * the length of the EA, and possibly the pointer to the entry and to the data.
1190  */
1191 static int
1192 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
1193     struct extattr **eapp, u_char **eac)
1194 {
1195 	struct extattr *eap, *eaend;
1196 	size_t nlen;
1197 
1198 	nlen = strlen(name);
1199 	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1200 	eap = (struct extattr *)ptr;
1201 	eaend = (struct extattr *)(ptr + length);
1202 	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1203 		KASSERT(EXTATTR_NEXT(eap) <= eaend,
1204 		    ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1205 		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1206 		    || memcmp(eap->ea_name, name, nlen) != 0)
1207 			continue;
1208 		if (eapp != NULL)
1209 			*eapp = eap;
1210 		if (eac != NULL)
1211 			*eac = EXTATTR_CONTENT(eap);
1212 		return (EXTATTR_CONTENT_SIZE(eap));
1213 	}
1214 	return (-1);
1215 }
1216 
1217 static int
1218 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td)
1219 {
1220 	const struct extattr *eap, *eaend, *eapnext;
1221 	struct inode *ip;
1222 	struct ufs2_dinode *dp;
1223 	struct fs *fs;
1224 	struct uio luio;
1225 	struct iovec liovec;
1226 	u_int easize;
1227 	int error;
1228 	u_char *eae;
1229 
1230 	ip = VTOI(vp);
1231 	fs = ITOFS(ip);
1232 	dp = ip->i_din2;
1233 	easize = dp->di_extsize;
1234 	if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize)
1235 		return (EFBIG);
1236 
1237 	eae = malloc(easize, M_TEMP, M_WAITOK);
1238 
1239 	liovec.iov_base = eae;
1240 	liovec.iov_len = easize;
1241 	luio.uio_iov = &liovec;
1242 	luio.uio_iovcnt = 1;
1243 	luio.uio_offset = 0;
1244 	luio.uio_resid = easize;
1245 	luio.uio_segflg = UIO_SYSSPACE;
1246 	luio.uio_rw = UIO_READ;
1247 	luio.uio_td = td;
1248 
1249 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1250 	if (error) {
1251 		free(eae, M_TEMP);
1252 		return (error);
1253 	}
1254 	/* Validate disk xattrfile contents. */
1255 	for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend;
1256 	    eap = eapnext) {
1257 		eapnext = EXTATTR_NEXT(eap);
1258 		/* Bogusly short entry or bogusly long entry. */
1259 		if (eap->ea_length < sizeof(*eap) || eapnext > eaend) {
1260 			free(eae, M_TEMP);
1261 			return (EINTEGRITY);
1262 		}
1263 	}
1264 	*p = eae;
1265 	return (0);
1266 }
1267 
1268 static void
1269 ffs_lock_ea(struct vnode *vp)
1270 {
1271 	struct inode *ip;
1272 
1273 	ip = VTOI(vp);
1274 	VI_LOCK(vp);
1275 	while (ip->i_flag & IN_EA_LOCKED) {
1276 		UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1277 		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1278 		    0);
1279 	}
1280 	UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
1281 	VI_UNLOCK(vp);
1282 }
1283 
1284 static void
1285 ffs_unlock_ea(struct vnode *vp)
1286 {
1287 	struct inode *ip;
1288 
1289 	ip = VTOI(vp);
1290 	VI_LOCK(vp);
1291 	if (ip->i_flag & IN_EA_LOCKWAIT)
1292 		wakeup(&ip->i_ea_refs);
1293 	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1294 	VI_UNLOCK(vp);
1295 }
1296 
1297 static int
1298 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1299 {
1300 	struct inode *ip;
1301 	struct ufs2_dinode *dp;
1302 	int error;
1303 
1304 	ip = VTOI(vp);
1305 
1306 	ffs_lock_ea(vp);
1307 	if (ip->i_ea_area != NULL) {
1308 		ip->i_ea_refs++;
1309 		ffs_unlock_ea(vp);
1310 		return (0);
1311 	}
1312 	dp = ip->i_din2;
1313 	error = ffs_rdextattr(&ip->i_ea_area, vp, td);
1314 	if (error) {
1315 		ffs_unlock_ea(vp);
1316 		return (error);
1317 	}
1318 	ip->i_ea_len = dp->di_extsize;
1319 	ip->i_ea_error = 0;
1320 	ip->i_ea_refs++;
1321 	ffs_unlock_ea(vp);
1322 	return (0);
1323 }
1324 
1325 /*
1326  * Vnode extattr transaction commit/abort
1327  */
1328 static int
1329 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1330 {
1331 	struct inode *ip;
1332 	struct uio luio;
1333 	struct iovec liovec;
1334 	int error;
1335 	struct ufs2_dinode *dp;
1336 
1337 	ip = VTOI(vp);
1338 
1339 	ffs_lock_ea(vp);
1340 	if (ip->i_ea_area == NULL) {
1341 		ffs_unlock_ea(vp);
1342 		return (EINVAL);
1343 	}
1344 	dp = ip->i_din2;
1345 	error = ip->i_ea_error;
1346 	if (commit && error == 0) {
1347 		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1348 		if (cred == NOCRED)
1349 			cred =  vp->v_mount->mnt_cred;
1350 		liovec.iov_base = ip->i_ea_area;
1351 		liovec.iov_len = ip->i_ea_len;
1352 		luio.uio_iov = &liovec;
1353 		luio.uio_iovcnt = 1;
1354 		luio.uio_offset = 0;
1355 		luio.uio_resid = ip->i_ea_len;
1356 		luio.uio_segflg = UIO_SYSSPACE;
1357 		luio.uio_rw = UIO_WRITE;
1358 		luio.uio_td = td;
1359 		/* XXX: I'm not happy about truncating to zero size */
1360 		if (ip->i_ea_len < dp->di_extsize)
1361 			error = ffs_truncate(vp, 0, IO_EXT, cred);
1362 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1363 	}
1364 	if (--ip->i_ea_refs == 0) {
1365 		free(ip->i_ea_area, M_TEMP);
1366 		ip->i_ea_area = NULL;
1367 		ip->i_ea_len = 0;
1368 		ip->i_ea_error = 0;
1369 	}
1370 	ffs_unlock_ea(vp);
1371 	return (error);
1372 }
1373 
1374 /*
1375  * Vnode extattr strategy routine for fifos.
1376  *
1377  * We need to check for a read or write of the external attributes.
1378  * Otherwise we just fall through and do the usual thing.
1379  */
1380 static int
1381 ffsext_strategy(struct vop_strategy_args *ap)
1382 /*
1383 struct vop_strategy_args {
1384 	struct vnodeop_desc *a_desc;
1385 	struct vnode *a_vp;
1386 	struct buf *a_bp;
1387 };
1388 */
1389 {
1390 	struct vnode *vp;
1391 	daddr_t lbn;
1392 
1393 	vp = ap->a_vp;
1394 	lbn = ap->a_bp->b_lblkno;
1395 	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1396 		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1397 	if (vp->v_type == VFIFO)
1398 		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1399 	panic("spec nodes went here");
1400 }
1401 
1402 /*
1403  * Vnode extattr transaction commit/abort
1404  */
1405 static int
1406 ffs_openextattr(struct vop_openextattr_args *ap)
1407 /*
1408 struct vop_openextattr_args {
1409 	struct vnodeop_desc *a_desc;
1410 	struct vnode *a_vp;
1411 	IN struct ucred *a_cred;
1412 	IN struct thread *a_td;
1413 };
1414 */
1415 {
1416 
1417 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1418 		return (EOPNOTSUPP);
1419 
1420 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1421 }
1422 
1423 /*
1424  * Vnode extattr transaction commit/abort
1425  */
1426 static int
1427 ffs_closeextattr(struct vop_closeextattr_args *ap)
1428 /*
1429 struct vop_closeextattr_args {
1430 	struct vnodeop_desc *a_desc;
1431 	struct vnode *a_vp;
1432 	int a_commit;
1433 	IN struct ucred *a_cred;
1434 	IN struct thread *a_td;
1435 };
1436 */
1437 {
1438 
1439 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1440 		return (EOPNOTSUPP);
1441 
1442 	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
1443 		return (EROFS);
1444 
1445 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1446 }
1447 
1448 /*
1449  * Vnode operation to remove a named attribute.
1450  */
1451 static int
1452 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1453 /*
1454 vop_deleteextattr {
1455 	IN struct vnode *a_vp;
1456 	IN int a_attrnamespace;
1457 	IN const char *a_name;
1458 	IN struct ucred *a_cred;
1459 	IN struct thread *a_td;
1460 };
1461 */
1462 {
1463 	struct inode *ip;
1464 	struct extattr *eap;
1465 	uint32_t ul;
1466 	int olen, error, i, easize;
1467 	u_char *eae;
1468 	void *tmp;
1469 
1470 	ip = VTOI(ap->a_vp);
1471 
1472 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1473 		return (EOPNOTSUPP);
1474 
1475 	if (strlen(ap->a_name) == 0)
1476 		return (EINVAL);
1477 
1478 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1479 		return (EROFS);
1480 
1481 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1482 	    ap->a_cred, ap->a_td, VWRITE);
1483 	if (error) {
1484 		/*
1485 		 * ffs_lock_ea is not needed there, because the vnode
1486 		 * must be exclusively locked.
1487 		 */
1488 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1489 			ip->i_ea_error = error;
1490 		return (error);
1491 	}
1492 
1493 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1494 	if (error)
1495 		return (error);
1496 
1497 	/* CEM: delete could be done in-place instead */
1498 	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1499 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1500 	easize = ip->i_ea_len;
1501 
1502 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1503 	    &eap, NULL);
1504 	if (olen == -1) {
1505 		/* delete but nonexistent */
1506 		free(eae, M_TEMP);
1507 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1508 		return (ENOATTR);
1509 	}
1510 	ul = eap->ea_length;
1511 	i = (u_char *)EXTATTR_NEXT(eap) - eae;
1512 	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1513 	easize -= ul;
1514 
1515 	tmp = ip->i_ea_area;
1516 	ip->i_ea_area = eae;
1517 	ip->i_ea_len = easize;
1518 	free(tmp, M_TEMP);
1519 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1520 	return (error);
1521 }
1522 
1523 /*
1524  * Vnode operation to retrieve a named extended attribute.
1525  */
1526 static int
1527 ffs_getextattr(struct vop_getextattr_args *ap)
1528 /*
1529 vop_getextattr {
1530 	IN struct vnode *a_vp;
1531 	IN int a_attrnamespace;
1532 	IN const char *a_name;
1533 	INOUT struct uio *a_uio;
1534 	OUT size_t *a_size;
1535 	IN struct ucred *a_cred;
1536 	IN struct thread *a_td;
1537 };
1538 */
1539 {
1540 	struct inode *ip;
1541 	u_char *eae, *p;
1542 	unsigned easize;
1543 	int error, ealen;
1544 
1545 	ip = VTOI(ap->a_vp);
1546 
1547 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1548 		return (EOPNOTSUPP);
1549 
1550 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1551 	    ap->a_cred, ap->a_td, VREAD);
1552 	if (error)
1553 		return (error);
1554 
1555 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1556 	if (error)
1557 		return (error);
1558 
1559 	eae = ip->i_ea_area;
1560 	easize = ip->i_ea_len;
1561 
1562 	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1563 	    NULL, &p);
1564 	if (ealen >= 0) {
1565 		error = 0;
1566 		if (ap->a_size != NULL)
1567 			*ap->a_size = ealen;
1568 		else if (ap->a_uio != NULL)
1569 			error = uiomove(p, ealen, ap->a_uio);
1570 	} else
1571 		error = ENOATTR;
1572 
1573 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1574 	return (error);
1575 }
1576 
1577 /*
1578  * Vnode operation to retrieve extended attributes on a vnode.
1579  */
1580 static int
1581 ffs_listextattr(struct vop_listextattr_args *ap)
1582 /*
1583 vop_listextattr {
1584 	IN struct vnode *a_vp;
1585 	IN int a_attrnamespace;
1586 	INOUT struct uio *a_uio;
1587 	OUT size_t *a_size;
1588 	IN struct ucred *a_cred;
1589 	IN struct thread *a_td;
1590 };
1591 */
1592 {
1593 	struct inode *ip;
1594 	struct extattr *eap, *eaend;
1595 	int error, ealen;
1596 
1597 	ip = VTOI(ap->a_vp);
1598 
1599 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1600 		return (EOPNOTSUPP);
1601 
1602 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1603 	    ap->a_cred, ap->a_td, VREAD);
1604 	if (error)
1605 		return (error);
1606 
1607 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1608 	if (error)
1609 		return (error);
1610 
1611 	error = 0;
1612 	if (ap->a_size != NULL)
1613 		*ap->a_size = 0;
1614 
1615 	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1616 	eap = (struct extattr *)ip->i_ea_area;
1617 	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1618 	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1619 		KASSERT(EXTATTR_NEXT(eap) <= eaend,
1620 		    ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1621 		if (eap->ea_namespace != ap->a_attrnamespace)
1622 			continue;
1623 
1624 		ealen = eap->ea_namelength;
1625 		if (ap->a_size != NULL)
1626 			*ap->a_size += ealen + 1;
1627 		else if (ap->a_uio != NULL)
1628 			error = uiomove(&eap->ea_namelength, ealen + 1,
1629 			    ap->a_uio);
1630 	}
1631 
1632 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1633 	return (error);
1634 }
1635 
1636 /*
1637  * Vnode operation to set a named attribute.
1638  */
1639 static int
1640 ffs_setextattr(struct vop_setextattr_args *ap)
1641 /*
1642 vop_setextattr {
1643 	IN struct vnode *a_vp;
1644 	IN int a_attrnamespace;
1645 	IN const char *a_name;
1646 	INOUT struct uio *a_uio;
1647 	IN struct ucred *a_cred;
1648 	IN struct thread *a_td;
1649 };
1650 */
1651 {
1652 	struct inode *ip;
1653 	struct fs *fs;
1654 	struct extattr *eap;
1655 	uint32_t ealength, ul;
1656 	ssize_t ealen;
1657 	int olen, eapad1, eapad2, error, i, easize;
1658 	u_char *eae;
1659 	void *tmp;
1660 
1661 	ip = VTOI(ap->a_vp);
1662 	fs = ITOFS(ip);
1663 
1664 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1665 		return (EOPNOTSUPP);
1666 
1667 	if (strlen(ap->a_name) == 0)
1668 		return (EINVAL);
1669 
1670 	/* XXX Now unsupported API to delete EAs using NULL uio. */
1671 	if (ap->a_uio == NULL)
1672 		return (EOPNOTSUPP);
1673 
1674 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1675 		return (EROFS);
1676 
1677 	ealen = ap->a_uio->uio_resid;
1678 	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1679 		return (EINVAL);
1680 
1681 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1682 	    ap->a_cred, ap->a_td, VWRITE);
1683 	if (error) {
1684 		/*
1685 		 * ffs_lock_ea is not needed there, because the vnode
1686 		 * must be exclusively locked.
1687 		 */
1688 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1689 			ip->i_ea_error = error;
1690 		return (error);
1691 	}
1692 
1693 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1694 	if (error)
1695 		return (error);
1696 
1697 	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1698 	eapad1 = roundup2(ealength, 8) - ealength;
1699 	eapad2 = roundup2(ealen, 8) - ealen;
1700 	ealength += eapad1 + ealen + eapad2;
1701 
1702 	/*
1703 	 * CEM: rewrites of the same size or smaller could be done in-place
1704 	 * instead.  (We don't acquire any fine-grained locks in here either,
1705 	 * so we could also do bigger writes in-place.)
1706 	 */
1707 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1708 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1709 	easize = ip->i_ea_len;
1710 
1711 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1712 	    &eap, NULL);
1713         if (olen == -1) {
1714 		/* new, append at end */
1715 		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1716 		    ("unaligned"));
1717 		eap = (struct extattr *)(eae + easize);
1718 		easize += ealength;
1719 	} else {
1720 		ul = eap->ea_length;
1721 		i = (u_char *)EXTATTR_NEXT(eap) - eae;
1722 		if (ul != ealength) {
1723 			bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
1724 			    easize - i);
1725 			easize += (ealength - ul);
1726 		}
1727 	}
1728 	if (easize > lblktosize(fs, UFS_NXADDR)) {
1729 		free(eae, M_TEMP);
1730 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1731 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1732 			ip->i_ea_error = ENOSPC;
1733 		return (ENOSPC);
1734 	}
1735 	eap->ea_length = ealength;
1736 	eap->ea_namespace = ap->a_attrnamespace;
1737 	eap->ea_contentpadlen = eapad2;
1738 	eap->ea_namelength = strlen(ap->a_name);
1739 	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1740 	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1741 	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1742 	if (error) {
1743 		free(eae, M_TEMP);
1744 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1745 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1746 			ip->i_ea_error = error;
1747 		return (error);
1748 	}
1749 	bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1750 
1751 	tmp = ip->i_ea_area;
1752 	ip->i_ea_area = eae;
1753 	ip->i_ea_len = easize;
1754 	free(tmp, M_TEMP);
1755 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1756 	return (error);
1757 }
1758 
1759 /*
1760  * Vnode pointer to File handle
1761  */
1762 static int
1763 ffs_vptofh(struct vop_vptofh_args *ap)
1764 /*
1765 vop_vptofh {
1766 	IN struct vnode *a_vp;
1767 	IN struct fid *a_fhp;
1768 };
1769 */
1770 {
1771 	struct inode *ip;
1772 	struct ufid *ufhp;
1773 
1774 	ip = VTOI(ap->a_vp);
1775 	ufhp = (struct ufid *)ap->a_fhp;
1776 	ufhp->ufid_len = sizeof(struct ufid);
1777 	ufhp->ufid_ino = ip->i_number;
1778 	ufhp->ufid_gen = ip->i_gen;
1779 	return (0);
1780 }
1781 
1782 SYSCTL_DECL(_vfs_ffs);
1783 static int use_buf_pager = 1;
1784 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1785     "Always use buffer pager instead of bmap");
1786 
1787 static daddr_t
1788 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1789 {
1790 
1791 	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1792 }
1793 
1794 static int
1795 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
1796 {
1797 
1798 	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
1799 }
1800 
1801 static int
1802 ffs_getpages(struct vop_getpages_args *ap)
1803 {
1804 	struct vnode *vp;
1805 	struct ufsmount *um;
1806 
1807 	vp = ap->a_vp;
1808 	um = VFSTOUFS(vp->v_mount);
1809 
1810 	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1811 		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1812 		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
1813 	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1814 	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1815 }
1816 
1817 static int
1818 ffs_getpages_async(struct vop_getpages_async_args *ap)
1819 {
1820 	struct vnode *vp;
1821 	struct ufsmount *um;
1822 	bool do_iodone;
1823 	int error;
1824 
1825 	vp = ap->a_vp;
1826 	um = VFSTOUFS(vp->v_mount);
1827 	do_iodone = true;
1828 
1829 	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
1830 		error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1831 		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
1832 		if (error == 0)
1833 			do_iodone = false;
1834 	} else {
1835 		error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
1836 		    ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
1837 		    ffs_gbp_getblksz);
1838 	}
1839 	if (do_iodone && ap->a_iodone != NULL)
1840 		ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1841 
1842 	return (error);
1843 }
1844