xref: /freebsd/sys/ufs/ffs/ffs_vnops.c (revision f5b7695d2d5abd735064870ad43f4b9c723940c1)
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
3  *
4  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
5  * All rights reserved.
6  *
7  * This software was developed for the FreeBSD Project by Marshall
8  * Kirk McKusick and Network Associates Laboratories, the Security
9  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
11  * research program.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1982, 1986, 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
62  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
63  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
64  */
65 
66 #include <sys/cdefs.h>
67 __FBSDID("$FreeBSD$");
68 
69 #include <sys/param.h>
70 #include <sys/bio.h>
71 #include <sys/systm.h>
72 #include <sys/buf.h>
73 #include <sys/conf.h>
74 #include <sys/extattr.h>
75 #include <sys/kernel.h>
76 #include <sys/limits.h>
77 #include <sys/malloc.h>
78 #include <sys/mount.h>
79 #include <sys/priv.h>
80 #include <sys/rwlock.h>
81 #include <sys/stat.h>
82 #include <sys/sysctl.h>
83 #include <sys/vmmeter.h>
84 #include <sys/vnode.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_extern.h>
89 #include <vm/vm_object.h>
90 #include <vm/vm_page.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vnode_pager.h>
93 
94 #include <ufs/ufs/extattr.h>
95 #include <ufs/ufs/quota.h>
96 #include <ufs/ufs/inode.h>
97 #include <ufs/ufs/ufs_extern.h>
98 #include <ufs/ufs/ufsmount.h>
99 
100 #include <ufs/ffs/fs.h>
101 #include <ufs/ffs/ffs_extern.h>
102 #include "opt_directio.h"
103 #include "opt_ffs.h"
104 
105 #define	ALIGNED_TO(ptr, s)	\
106 	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
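/*
 * For example, ALIGNED_TO(p, struct extattr) is nonzero iff p is
 * suitably aligned to be cast to a struct extattr pointer; because
 * _Alignof() is a power of two, the mask test above is equivalent to
 * ((uintptr_t)p % _Alignof(struct extattr)) == 0.
 */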
107 
108 #ifdef DIRECTIO
109 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
110 #endif
111 static vop_fdatasync_t	ffs_fdatasync;
112 static vop_fsync_t	ffs_fsync;
113 static vop_getpages_t	ffs_getpages;
114 static vop_getpages_async_t	ffs_getpages_async;
115 static vop_lock1_t	ffs_lock;
116 #ifdef INVARIANTS
117 static vop_unlock_t	ffs_unlock_debug;
118 #endif
119 static vop_read_t	ffs_read;
120 static vop_write_t	ffs_write;
121 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
122 static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
123 		    struct ucred *cred);
124 static vop_strategy_t	ffsext_strategy;
125 static vop_closeextattr_t	ffs_closeextattr;
126 static vop_deleteextattr_t	ffs_deleteextattr;
127 static vop_getextattr_t	ffs_getextattr;
128 static vop_listextattr_t	ffs_listextattr;
129 static vop_openextattr_t	ffs_openextattr;
130 static vop_setextattr_t	ffs_setextattr;
131 static vop_vptofh_t	ffs_vptofh;
132 
133 /* Global vfs data structures for ufs. */
134 struct vop_vector ffs_vnodeops1 = {
135 	.vop_default =		&ufs_vnodeops,
136 	.vop_fsync =		ffs_fsync,
137 	.vop_fdatasync =	ffs_fdatasync,
138 	.vop_getpages =		ffs_getpages,
139 	.vop_getpages_async =	ffs_getpages_async,
140 	.vop_lock1 =		ffs_lock,
141 #ifdef INVARIANTS
142 	.vop_unlock =		ffs_unlock_debug,
143 #endif
144 	.vop_read =		ffs_read,
145 	.vop_reallocblks =	ffs_reallocblks,
146 	.vop_write =		ffs_write,
147 	.vop_vptofh =		ffs_vptofh,
148 };
149 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
150 
151 struct vop_vector ffs_fifoops1 = {
152 	.vop_default =		&ufs_fifoops,
153 	.vop_fsync =		ffs_fsync,
154 	.vop_fdatasync =	ffs_fdatasync,
155 	.vop_lock1 =		ffs_lock,
156 #ifdef INVARIANTS
157 	.vop_unlock =		ffs_unlock_debug,
158 #endif
159 	.vop_vptofh =		ffs_vptofh,
160 };
161 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
162 
163 /* Global vfs data structures for ufs. */
164 struct vop_vector ffs_vnodeops2 = {
165 	.vop_default =		&ufs_vnodeops,
166 	.vop_fsync =		ffs_fsync,
167 	.vop_fdatasync =	ffs_fdatasync,
168 	.vop_getpages =		ffs_getpages,
169 	.vop_getpages_async =	ffs_getpages_async,
170 	.vop_lock1 =		ffs_lock,
171 #ifdef INVARIANTS
172 	.vop_unlock =		ffs_unlock_debug,
173 #endif
174 	.vop_read =		ffs_read,
175 	.vop_reallocblks =	ffs_reallocblks,
176 	.vop_write =		ffs_write,
177 	.vop_closeextattr =	ffs_closeextattr,
178 	.vop_deleteextattr =	ffs_deleteextattr,
179 	.vop_getextattr =	ffs_getextattr,
180 	.vop_listextattr =	ffs_listextattr,
181 	.vop_openextattr =	ffs_openextattr,
182 	.vop_setextattr =	ffs_setextattr,
183 	.vop_vptofh =		ffs_vptofh,
184 };
185 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
186 
187 struct vop_vector ffs_fifoops2 = {
188 	.vop_default =		&ufs_fifoops,
189 	.vop_fsync =		ffs_fsync,
190 	.vop_fdatasync =	ffs_fdatasync,
191 	.vop_lock1 =		ffs_lock,
192 #ifdef INVARIANTS
193 	.vop_unlock =		ffs_unlock_debug,
194 #endif
195 	.vop_reallocblks =	ffs_reallocblks,
196 	.vop_strategy =		ffsext_strategy,
197 	.vop_closeextattr =	ffs_closeextattr,
198 	.vop_deleteextattr =	ffs_deleteextattr,
199 	.vop_getextattr =	ffs_getextattr,
200 	.vop_listextattr =	ffs_listextattr,
201 	.vop_openextattr =	ffs_openextattr,
202 	.vop_setextattr =	ffs_setextattr,
203 	.vop_vptofh =		ffs_vptofh,
204 };
205 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
206 
207 /*
208  * Synch an open file.
209  */
210 /* ARGSUSED */
211 static int
212 ffs_fsync(struct vop_fsync_args *ap)
213 {
214 	struct vnode *vp;
215 	struct bufobj *bo;
216 	int error;
217 
218 	vp = ap->a_vp;
219 	bo = &vp->v_bufobj;
220 retry:
221 	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
222 	if (error)
223 		return (error);
224 	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
225 		error = softdep_fsync(vp);
226 		if (error)
227 			return (error);
228 
229 		/*
230 		 * The softdep_fsync() function may drop vp lock,
231 		 * allowing for dirty buffers to reappear on the
232 		 * bo_dirty list. Recheck and resync as needed.
233 		 */
234 		BO_LOCK(bo);
235 		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
236 		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
237 			BO_UNLOCK(bo);
238 			goto retry;
239 		}
240 		BO_UNLOCK(bo);
241 	}
242 	return (0);
243 }
244 
245 int
246 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
247 {
248 	struct inode *ip;
249 	struct bufobj *bo;
250 	struct buf *bp, *nbp;
251 	ufs_lbn_t lbn;
252 	int error, passes;
253 	bool still_dirty, wait;
254 
255 	ip = VTOI(vp);
256 	ip->i_flag &= ~IN_NEEDSYNC;
257 	bo = &vp->v_bufobj;
258 
259 	/*
260 	 * When doing MNT_WAIT we must first flush all dependencies
261 	 * on the inode.
262 	 */
263 	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
264 	    (error = softdep_sync_metadata(vp)) != 0)
265 		return (error);
266 
267 	/*
268 	 * Flush all dirty buffers associated with a vnode.
269 	 */
270 	error = 0;
271 	passes = 0;
272 	wait = false;	/* Always do an async pass first. */
273 	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
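	/*
	 * lbn names the first logical block at or beyond EOF once i_size
	 * is rounded up to a full block; e.g. with a 32768-byte block
	 * size, i_size = 100000 gives lbn = 4, so a dirty buffer with
	 * b_lblkno > 4 would indicate data beyond a truncation point.
	 */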
274 	BO_LOCK(bo);
275 loop:
276 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
277 		bp->b_vflags &= ~BV_SCANNED;
278 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
279 		/*
280 		 * Reasons to skip this buffer: it has already been considered
281 		 * on this pass, the buffer has dependencies that will cause
282 		 * it to be redirtied and it has not already been deferred,
283 		 * or it is already being written.
284 		 */
285 		if ((bp->b_vflags & BV_SCANNED) != 0)
286 			continue;
287 		bp->b_vflags |= BV_SCANNED;
288 		/*
289 		 * Flush indirects in order, if requested.
290 		 *
291 		 * Note that if only datasync is requested, we can
292 		 * skip indirect blocks when softupdates are not
293 		 * active.  Otherwise we must flush them with data,
294 		 * since dependencies prevent data block writes.
295 		 */
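		/*
		 * lbn_level() maps an indirect block's negative lbn to
		 * its depth of indirection, so the test below defers
		 * indirect blocks whose level is at or above the current
		 * pass number: data blocks go out first, and deeper
		 * indirect levels follow on later passes.
		 */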
296 		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
297 		    (lbn_level(bp->b_lblkno) >= passes ||
298 		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
299 			continue;
300 		if (bp->b_lblkno > lbn)
301 			panic("ffs_syncvnode: syncing truncated data.");
302 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
303 			BO_UNLOCK(bo);
304 		} else if (wait) {
305 			if (BUF_LOCK(bp,
306 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
307 			    BO_LOCKPTR(bo)) != 0) {
308 				bp->b_vflags &= ~BV_SCANNED;
309 				goto next;
310 			}
311 		} else
312 			continue;
313 		if ((bp->b_flags & B_DELWRI) == 0)
314 			panic("ffs_fsync: not dirty");
315 		/*
316 		 * Check for dependencies and potentially complete them.
317 		 */
318 		if (!LIST_EMPTY(&bp->b_dep) &&
319 		    (error = softdep_sync_buf(vp, bp,
320 		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
321 			/* I/O error. */
322 			if (error != EBUSY) {
323 				BUF_UNLOCK(bp);
324 				return (error);
325 			}
326 			/* If we deferred once, don't defer again. */
327 			if ((bp->b_flags & B_DEFERRED) == 0) {
328 				bp->b_flags |= B_DEFERRED;
329 				BUF_UNLOCK(bp);
330 				goto next;
331 			}
332 		}
333 		if (wait) {
334 			bremfree(bp);
335 			if ((error = bwrite(bp)) != 0)
336 				return (error);
337 		} else if ((bp->b_flags & B_CLUSTEROK)) {
338 			(void) vfs_bio_awrite(bp);
339 		} else {
340 			bremfree(bp);
341 			(void) bawrite(bp);
342 		}
343 next:
344 		/*
345 		 * Since we may have slept during the I/O, we need
346 		 * to start from a known point.
347 		 */
348 		BO_LOCK(bo);
349 		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
350 	}
351 	if (waitfor != MNT_WAIT) {
352 		BO_UNLOCK(bo);
353 		if ((flags & NO_INO_UPDT) != 0)
354 			return (0);
355 		else
356 			return (ffs_update(vp, 0));
357 	}
358 	/* Drain IO to see if we're done. */
359 	bufobj_wwait(bo, 0, 0);
360 	/*
361 	 * Block devices associated with filesystems may have new I/O
362 	 * requests posted for them even if the vnode is locked, so no
363 	 * amount of trying will get them clean.  We make several passes
364 	 * as a best effort.
365 	 *
366 	 * Regular files may need multiple passes to flush all dependency
367 	 * work as it is possible that we must write once per indirect
368 	 * level, once for the leaf, and once for the inode and each of
369 	 * these will be done with one sync and one async pass.
370 	 */
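	/*
	 * The retry below alternates async and sync passes ('wait'
	 * toggles each round) and is bounded by UFS_NIADDR + 2 passes,
	 * enough for one pass per indirect level plus the leaf data
	 * and the inode itself.
	 */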
371 	if (bo->bo_dirty.bv_cnt > 0) {
372 		if ((flags & DATA_ONLY) == 0) {
373 			still_dirty = true;
374 		} else {
375 			/*
376 			 * For data-only sync, dirty indirect buffers
377 			 * are ignored.
378 			 */
379 			still_dirty = false;
380 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
381 				if (bp->b_lblkno > -UFS_NDADDR) {
382 					still_dirty = true;
383 					break;
384 				}
385 			}
386 		}
387 
388 		if (still_dirty) {
389 			/* Write the inode after sync passes to flush deps. */
390 			if (wait && DOINGSOFTDEP(vp) &&
391 			    (flags & NO_INO_UPDT) == 0) {
392 				BO_UNLOCK(bo);
393 				ffs_update(vp, 1);
394 				BO_LOCK(bo);
395 			}
396 			/* Switch between sync and async passes. */
397 			wait = !wait;
398 			if (wait || ++passes < UFS_NIADDR + 2)
399 				goto loop;
400 		}
401 	}
402 	BO_UNLOCK(bo);
403 	error = 0;
404 	if ((flags & DATA_ONLY) == 0) {
405 		if ((flags & NO_INO_UPDT) == 0)
406 			error = ffs_update(vp, 1);
407 		if (DOINGSUJ(vp))
408 			softdep_journal_fsync(VTOI(vp));
409 	}
410 	return (error);
411 }
412 
413 static int
414 ffs_fdatasync(struct vop_fdatasync_args *ap)
415 {
416 
417 	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
418 }
419 
420 static int
421 ffs_lock(ap)
422 	struct vop_lock1_args /* {
423 		struct vnode *a_vp;
424 		int a_flags;
425 		struct thread *a_td;
426 		char *file;
427 		int line;
428 	} */ *ap;
429 {
430 #ifndef NO_FFS_SNAPSHOT
431 	struct vnode *vp;
432 	int flags;
433 	struct lock *lkp;
434 	int result;
435 
436 	switch (ap->a_flags & LK_TYPE_MASK) {
437 	case LK_SHARED:
438 	case LK_UPGRADE:
439 	case LK_EXCLUSIVE:
440 		vp = ap->a_vp;
441 		flags = ap->a_flags;
442 		for (;;) {
443 #ifdef DEBUG_VFS_LOCKS
444 			VNPASS(vp->v_holdcnt != 0, vp);
445 #endif
446 			lkp = vp->v_vnlock;
447 			result = lockmgr_lock_flags(lkp, flags,
448 			    &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line);
449 			if (lkp == vp->v_vnlock || result != 0)
450 				break;
451 			/*
452 			 * Apparent success, except that the vnode
453 			 * mutated between snapshot file vnode and
454 			 * regular file vnode while this process
455 			 * slept.  The lock currently held is not the
456 			 * right lock.  Release it, and try to get the
457 			 * new lock.
458 			 */
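			/*
			 * A sketch of the race: ffs_snapshot() and
			 * ffs_snapremove() switch v_vnlock between the
			 * per-vnode lock and the snapshot lock shared by
			 * all snapshot vnodes, so the pointer read above
			 * may be stale by the time the lock is acquired.
			 */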
459 			lockmgr_unlock(lkp);
460 			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
461 			    (LK_INTERLOCK | LK_NOWAIT))
462 				return (EBUSY);
463 			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
464 				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
465 			flags &= ~LK_INTERLOCK;
466 		}
467 		break;
468 	default:
469 		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
470 	}
471 	return (result);
472 #else
473 	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
474 #endif
475 }
476 
477 #ifdef INVARIANTS
478 static int
479 ffs_unlock_debug(struct vop_unlock_args *ap)
480 {
481 	struct vnode *vp = ap->a_vp;
482 	struct inode *ip = VTOI(vp);
483 
484 	if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
485 		if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
486 			VI_LOCK(vp);
487 			VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
488 			    ("%s: modified vnode (%x) not on lazy list",
489 			    __func__, ip->i_flag));
490 			VI_UNLOCK(vp);
491 		}
492 	}
493 	return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
494 }
495 #endif
496 
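/*
 * Helper to read a hole in a sparse file: when GB_NOSPARSE is passed,
 * bread_gb() fails with EJUSTRETURN for an unallocated block, and the
 * caller zero-fills the transfer here from the kernel's preallocated
 * zero_region, ZERO_REGION_SIZE bytes at a time.
 */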
497 static int
498 ffs_read_hole(struct uio *uio, long xfersize, long *size)
499 {
500 	ssize_t saved_resid, tlen;
501 	int error;
502 
503 	while (xfersize > 0) {
504 		tlen = min(xfersize, ZERO_REGION_SIZE);
505 		saved_resid = uio->uio_resid;
506 		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
507 		    tlen, uio);
508 		if (error != 0)
509 			return (error);
510 		tlen = saved_resid - uio->uio_resid;
511 		xfersize -= tlen;
512 		*size -= tlen;
513 	}
514 	return (0);
515 }
516 
517 /*
518  * Vnode op for reading.
519  */
520 static int
521 ffs_read(ap)
522 	struct vop_read_args /* {
523 		struct vnode *a_vp;
524 		struct uio *a_uio;
525 		int a_ioflag;
526 		struct ucred *a_cred;
527 	} */ *ap;
528 {
529 	struct vnode *vp;
530 	struct inode *ip;
531 	struct uio *uio;
532 	struct fs *fs;
533 	struct buf *bp;
534 	ufs_lbn_t lbn, nextlbn;
535 	off_t bytesinfile;
536 	long size, xfersize, blkoffset;
537 	ssize_t orig_resid;
538 	int bflag, error, ioflag, seqcount;
539 
540 	vp = ap->a_vp;
541 	uio = ap->a_uio;
542 	ioflag = ap->a_ioflag;
543 	if (ap->a_ioflag & IO_EXT)
544 #ifdef notyet
545 		return (ffs_extread(vp, uio, ioflag));
546 #else
547 		panic("ffs_read+IO_EXT");
548 #endif
549 #ifdef DIRECTIO
550 	if ((ioflag & IO_DIRECT) != 0) {
551 		int workdone;
552 
553 		error = ffs_rawread(vp, uio, &workdone);
554 		if (error != 0 || workdone != 0)
555 			return error;
556 	}
557 #endif
558 
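	/*
	 * The upper bits of a_ioflag carry the sequential-access hint
	 * maintained in the file descriptor (f_seqcount); it is used
	 * below to scale readahead and clustering.
	 */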
559 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
560 	ip = VTOI(vp);
561 
562 #ifdef INVARIANTS
563 	if (uio->uio_rw != UIO_READ)
564 		panic("ffs_read: mode");
565 
566 	if (vp->v_type == VLNK) {
567 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
568 			panic("ffs_read: short symlink");
569 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
570 		panic("ffs_read: type %d",  vp->v_type);
571 #endif
572 	orig_resid = uio->uio_resid;
573 	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
574 	if (orig_resid == 0)
575 		return (0);
576 	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
577 	fs = ITOFS(ip);
578 	if (uio->uio_offset < ip->i_size &&
579 	    uio->uio_offset >= fs->fs_maxfilesize)
580 		return (EOVERFLOW);
581 
582 	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
583 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
584 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
585 			break;
586 		lbn = lblkno(fs, uio->uio_offset);
587 		nextlbn = lbn + 1;
588 
589 		/*
590 		 * Size of buffer.  The buffer representing the
591 		 * end of the file is rounded up to the size of
592 		 * the block type (fragment or full block, as
593 		 * appropriate).
594 		 */
595 		size = blksize(fs, ip, lbn);
596 		blkoffset = blkoff(fs, uio->uio_offset);
597 
598 		/*
599 		 * The amount we want to transfer in this iteration is
600 		 * one FS block, less the amount of data that precedes
601 		 * our starting point within that block.
602 		 */
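		/*
		 * For example, with fs_bsize = 32768 and uio_offset =
		 * 5000, blkoffset is 5000 and xfersize starts at 27768;
		 * the clamps below then bound it by the caller's resid
		 * and the bytes remaining in the file.
		 */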
603 		xfersize = fs->fs_bsize - blkoffset;
604 
605 		/*
606 		 * But if we actually want less than the block,
607 		 * or the file doesn't have a whole block more of data,
608 		 * then use the lesser number.
609 		 */
610 		if (uio->uio_resid < xfersize)
611 			xfersize = uio->uio_resid;
612 		if (bytesinfile < xfersize)
613 			xfersize = bytesinfile;
614 
615 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
616 			/*
617 			 * Don't do readahead if this is the end of the file.
618 			 */
619 			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
620 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
621 			/*
622 			 * Otherwise if we are allowed to cluster,
623 			 * grab as much as we can.
624 			 *
625 			 * XXX  This may not be a win if we are not
626 			 * doing sequential access.
627 			 */
628 			error = cluster_read(vp, ip->i_size, lbn,
629 			    size, NOCRED, blkoffset + uio->uio_resid,
630 			    seqcount, bflag, &bp);
631 		} else if (seqcount > 1) {
632 			/*
633 			 * If we are NOT allowed to cluster, then
634 			 * if we appear to be acting sequentially,
635 			 * fire off a request for a readahead
636 			 * as well as a read. Note that the 4th and 5th
637 			 * arguments point to arrays of the size specified in
638 			 * the 6th argument.
639 			 */
640 			u_int nextsize = blksize(fs, ip, nextlbn);
641 			error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
642 			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
643 		} else {
644 			/*
645 			 * Failing all of the above, just read what the
646 			 * user asked for. Interestingly, the same as
647 			 * the first option above.
648 			 */
649 			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
650 		}
651 		if (error == EJUSTRETURN) {
652 			error = ffs_read_hole(uio, xfersize, &size);
653 			if (error == 0)
654 				continue;
655 		}
656 		if (error != 0) {
657 			brelse(bp);
658 			bp = NULL;
659 			break;
660 		}
661 
662 		/*
663 		 * We should only get non-zero b_resid when an I/O error
664 		 * has occurred, which should cause us to break above.
665 		 * However, if the short read did not cause an error,
666 		 * then we want to ensure that we do not uiomove bad
667 		 * or uninitialized data.
668 		 */
669 		size -= bp->b_resid;
670 		if (size < xfersize) {
671 			if (size == 0)
672 				break;
673 			xfersize = size;
674 		}
675 
676 		if (buf_mapped(bp)) {
677 			error = vn_io_fault_uiomove((char *)bp->b_data +
678 			    blkoffset, (int)xfersize, uio);
679 		} else {
680 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
681 			    (int)xfersize, uio);
682 		}
683 		if (error)
684 			break;
685 
686 		vfs_bio_brelse(bp, ioflag);
687 	}
688 
689 	/*
690 	 * A non-NULL bp here can only happen in the case of an error:
691 	 * the loop above resets bp to NULL on each iteration and on
692 	 * normal completion does not set a new value into it, so it
693 	 * must have come from a 'break' statement.
694 	 */
695 	if (bp != NULL)
696 		vfs_bio_brelse(bp, ioflag);
697 
698 	if ((error == 0 || uio->uio_resid != orig_resid) &&
699 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
700 		UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
701 	return (error);
702 }
703 
704 /*
705  * Vnode op for writing.
706  */
707 static int
708 ffs_write(ap)
709 	struct vop_write_args /* {
710 		struct vnode *a_vp;
711 		struct uio *a_uio;
712 		int a_ioflag;
713 		struct ucred *a_cred;
714 	} */ *ap;
715 {
716 	struct vnode *vp;
717 	struct uio *uio;
718 	struct inode *ip;
719 	struct fs *fs;
720 	struct buf *bp;
721 	ufs_lbn_t lbn;
722 	off_t osize;
723 	ssize_t resid;
724 	int seqcount;
725 	int blkoffset, error, flags, ioflag, size, xfersize;
726 
727 	vp = ap->a_vp;
728 	uio = ap->a_uio;
729 	ioflag = ap->a_ioflag;
730 	if (ap->a_ioflag & IO_EXT)
731 #ifdef notyet
732 		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
733 #else
734 		panic("ffs_write+IO_EXT");
735 #endif
736 
737 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
738 	ip = VTOI(vp);
739 
740 #ifdef INVARIANTS
741 	if (uio->uio_rw != UIO_WRITE)
742 		panic("ffs_write: mode");
743 #endif
744 
745 	switch (vp->v_type) {
746 	case VREG:
747 		if (ioflag & IO_APPEND)
748 			uio->uio_offset = ip->i_size;
749 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
750 			return (EPERM);
751 		/* FALLTHROUGH */
752 	case VLNK:
753 		break;
754 	case VDIR:
755 		panic("ffs_write: dir write");
756 		break;
757 	default:
758 		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
759 			(int)uio->uio_offset,
760 			(int)uio->uio_resid
761 		);
762 	}
763 
764 	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
765 	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
766 	fs = ITOFS(ip);
767 	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
768 		return (EFBIG);
769 	/*
770 	 * Maybe this should be above the vnode op call, but so long as
771 	 * file servers have no limits, I don't think it matters.
772 	 */
773 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
774 		return (EFBIG);
775 
776 	resid = uio->uio_resid;
777 	osize = ip->i_size;
778 	if (seqcount > BA_SEQMAX)
779 		flags = BA_SEQMAX << BA_SEQSHIFT;
780 	else
781 		flags = seqcount << BA_SEQSHIFT;
782 	if (ioflag & IO_SYNC)
783 		flags |= IO_SYNC;
784 	flags |= BA_UNMAPPED;
785 
786 	for (error = 0; uio->uio_resid > 0;) {
787 		lbn = lblkno(fs, uio->uio_offset);
788 		blkoffset = blkoff(fs, uio->uio_offset);
789 		xfersize = fs->fs_bsize - blkoffset;
790 		if (uio->uio_resid < xfersize)
791 			xfersize = uio->uio_resid;
792 		if (uio->uio_offset + xfersize > ip->i_size)
793 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
794 
795 		/*
796 		 * We must perform a read-before-write if the transfer size
797 		 * does not cover the entire buffer.
798 		 */
799 		if (fs->fs_bsize > xfersize)
800 			flags |= BA_CLRBUF;
801 		else
802 			flags &= ~BA_CLRBUF;
803 /* XXX is uio->uio_offset the right thing here? */
804 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
805 		    ap->a_cred, flags, &bp);
806 		if (error != 0) {
807 			vnode_pager_setsize(vp, ip->i_size);
808 			break;
809 		}
810 		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
811 			bp->b_flags |= B_NOCACHE;
812 
813 		if (uio->uio_offset + xfersize > ip->i_size) {
814 			ip->i_size = uio->uio_offset + xfersize;
815 			DIP_SET(ip, i_size, ip->i_size);
816 		}
817 
818 		size = blksize(fs, ip, lbn) - bp->b_resid;
819 		if (size < xfersize)
820 			xfersize = size;
821 
822 		if (buf_mapped(bp)) {
823 			error = vn_io_fault_uiomove((char *)bp->b_data +
824 			    blkoffset, (int)xfersize, uio);
825 		} else {
826 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
827 			    (int)xfersize, uio);
828 		}
829 		/*
830 		 * If the buffer is not already filled and we encounter an
831 		 * error while trying to fill it, we have to clear out any
832 		 * garbage data from the pages instantiated for the buffer.
833 		 * If we do not, a failed uiomove() during a write can leave
834 		 * the prior contents of the pages exposed to a userland mmap.
835 		 *
836 		 * Note that we need only clear buffers with a transfer size
837 		 * equal to the block size because buffers with a shorter
838 		 * transfer size were cleared above by the call to UFS_BALLOC()
839 		 * with the BA_CLRBUF flag set.
840 		 *
841 		 * If the source region for uiomove identically mmaps the
842 		 * buffer, uiomove() performed the NOP copy, and the buffer
843 		 * content remains valid because the page fault handler
844 		 * validated the pages.
845 		 */
846 		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
847 		    fs->fs_bsize == xfersize)
848 			vfs_bio_clrbuf(bp);
849 
850 		vfs_bio_set_flags(bp, ioflag);
851 
852 		/*
853 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
854 		 * if we have a severe page deficiency write the buffer
855 		 * asynchronously.  Otherwise try to cluster, and if that
856 		 * doesn't do it then either do an async write (if O_DIRECT),
857 		 * or a delayed write (if not).
858 		 */
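		/*
		 * In short: IO_SYNC -> bwrite(); severe page or buffer
		 * shortage or IO_ASYNC -> bawrite(); a full block with
		 * clustering permitted -> cluster_write(); IO_DIRECT ->
		 * bawrite(); otherwise -> bdwrite().
		 */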
859 		if (ioflag & IO_SYNC) {
860 			(void)bwrite(bp);
861 		} else if (vm_page_count_severe() ||
862 			    buf_dirty_count_severe() ||
863 			    (ioflag & IO_ASYNC)) {
864 			bp->b_flags |= B_CLUSTEROK;
865 			bawrite(bp);
866 		} else if (xfersize + blkoffset == fs->fs_bsize) {
867 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
868 				bp->b_flags |= B_CLUSTEROK;
869 				cluster_write(vp, bp, ip->i_size, seqcount,
870 				    GB_UNMAPPED);
871 			} else {
872 				bawrite(bp);
873 			}
874 		} else if (ioflag & IO_DIRECT) {
875 			bp->b_flags |= B_CLUSTEROK;
876 			bawrite(bp);
877 		} else {
878 			bp->b_flags |= B_CLUSTEROK;
879 			bdwrite(bp);
880 		}
881 		if (error || xfersize == 0)
882 			break;
883 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
884 	}
885 	/*
886 	 * If we successfully wrote any data, and we are not the superuser
887 	 * we clear the setuid and setgid bits as a precaution against
888 	 * tampering.
889 	 */
890 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
891 	    ap->a_cred) {
892 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
893 			ip->i_mode &= ~(ISUID | ISGID);
894 			DIP_SET(ip, i_mode, ip->i_mode);
895 		}
896 	}
897 	if (error) {
898 		if (ioflag & IO_UNIT) {
899 			(void)ffs_truncate(vp, osize,
900 			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
901 			uio->uio_offset -= resid - uio->uio_resid;
902 			uio->uio_resid = resid;
903 		}
904 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
905 		error = ffs_update(vp, 1);
906 	return (error);
907 }
908 
909 /*
910  * Extended attribute area reading.
911  */
912 static int
913 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
914 {
915 	struct inode *ip;
916 	struct ufs2_dinode *dp;
917 	struct fs *fs;
918 	struct buf *bp;
919 	ufs_lbn_t lbn, nextlbn;
920 	off_t bytesinfile;
921 	long size, xfersize, blkoffset;
922 	ssize_t orig_resid;
923 	int error;
924 
925 	ip = VTOI(vp);
926 	fs = ITOFS(ip);
927 	dp = ip->i_din2;
928 
929 #ifdef INVARIANTS
930 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
931 		panic("ffs_extread: mode");
932 
933 #endif
934 	orig_resid = uio->uio_resid;
935 	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
936 	if (orig_resid == 0)
937 		return (0);
938 	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
939 
940 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
941 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
942 			break;
943 		lbn = lblkno(fs, uio->uio_offset);
944 		nextlbn = lbn + 1;
945 
946 		/*
947 		 * Size of buffer.  The buffer representing the
948 		 * end of the file is rounded up to the size of
949 		 * the block type (fragment or full block, as
950 		 * appropriate).
951 		 */
952 		size = sblksize(fs, dp->di_extsize, lbn);
953 		blkoffset = blkoff(fs, uio->uio_offset);
954 
955 		/*
956 		 * The amount we want to transfer in this iteration is
957 		 * one FS block, less the amount of data that precedes
958 		 * our starting point within that block.
959 		 */
960 		xfersize = fs->fs_bsize - blkoffset;
961 
962 		/*
963 		 * But if we actually want less than the block,
964 		 * or the file doesn't have a whole block more of data,
965 		 * then use the lesser number.
966 		 */
967 		if (uio->uio_resid < xfersize)
968 			xfersize = uio->uio_resid;
969 		if (bytesinfile < xfersize)
970 			xfersize = bytesinfile;
971 
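		/*
		 * The ext attr area is addressed with negative logical
		 * blocks: ext block N is read at vnode lbn -1 - N (block
		 * 0 at -1, block 1 at -2, ...), keeping these buffers
		 * disjoint from the regular data blocks at lbn >= 0.
		 */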
972 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
973 			/*
974 			 * Don't do readahead if this is the end of the ext attr area.
975 			 */
976 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
977 		} else {
978 			/*
979 			 * If we have a second block, then
980 			 * fire off a request for a readahead
981 			 * as well as a read. Note that the 4th and 5th
982 			 * arguments point to arrays of the size specified in
983 			 * the 6th argument.
984 			 */
985 			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
986 
987 			nextlbn = -1 - nextlbn;
988 			error = breadn(vp, -1 - lbn,
989 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
990 		}
991 		if (error) {
992 			brelse(bp);
993 			bp = NULL;
994 			break;
995 		}
996 
997 		/*
998 		 * We should only get non-zero b_resid when an I/O error
999 		 * has occurred, which should cause us to break above.
1000 		 * However, if the short read did not cause an error,
1001 		 * then we want to ensure that we do not uiomove bad
1002 		 * or uninitialized data.
1003 		 */
1004 		size -= bp->b_resid;
1005 		if (size < xfersize) {
1006 			if (size == 0)
1007 				break;
1008 			xfersize = size;
1009 		}
1010 
1011 		error = uiomove((char *)bp->b_data + blkoffset,
1012 					(int)xfersize, uio);
1013 		if (error)
1014 			break;
1015 		vfs_bio_brelse(bp, ioflag);
1016 	}
1017 
1018 	/*
1019 	 * A non-NULL bp here can only happen in the case of an error:
1020 	 * the loop above resets bp to NULL on each iteration and on
1021 	 * normal completion does not set a new value into it, so it
1022 	 * must have come from a 'break' statement.
1023 	 */
1024 	if (bp != NULL)
1025 		vfs_bio_brelse(bp, ioflag);
1026 	return (error);
1027 }
1028 
1029 /*
1030  * Extended attribute area writing.
1031  */
1032 static int
1033 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1034 {
1035 	struct inode *ip;
1036 	struct ufs2_dinode *dp;
1037 	struct fs *fs;
1038 	struct buf *bp;
1039 	ufs_lbn_t lbn;
1040 	off_t osize;
1041 	ssize_t resid;
1042 	int blkoffset, error, flags, size, xfersize;
1043 
1044 	ip = VTOI(vp);
1045 	fs = ITOFS(ip);
1046 	dp = ip->i_din2;
1047 
1048 #ifdef INVARIANTS
1049 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1050 		panic("ffs_extwrite: mode");
1051 #endif
1052 
1053 	if (ioflag & IO_APPEND)
1054 		uio->uio_offset = dp->di_extsize;
1055 	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1056 	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1057 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
1058 	    UFS_NXADDR * fs->fs_bsize)
1059 		return (EFBIG);
1060 
1061 	resid = uio->uio_resid;
1062 	osize = dp->di_extsize;
1063 	flags = IO_EXT;
1064 	if (ioflag & IO_SYNC)
1065 		flags |= IO_SYNC;
1066 
1067 	for (error = 0; uio->uio_resid > 0;) {
1068 		lbn = lblkno(fs, uio->uio_offset);
1069 		blkoffset = blkoff(fs, uio->uio_offset);
1070 		xfersize = fs->fs_bsize - blkoffset;
1071 		if (uio->uio_resid < xfersize)
1072 			xfersize = uio->uio_resid;
1073 
1074 		/*
1075 		 * We must perform a read-before-write if the transfer size
1076 		 * does not cover the entire buffer.
1077 		 */
1078 		if (fs->fs_bsize > xfersize)
1079 			flags |= BA_CLRBUF;
1080 		else
1081 			flags &= ~BA_CLRBUF;
1082 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1083 		    ucred, flags, &bp);
1084 		if (error != 0)
1085 			break;
1086 		/*
1087 		 * If the buffer is not valid we have to clear out any
1088 		 * garbage data from the pages instantiated for the buffer.
1089 		 * If we do not, a failed uiomove() during a write can leave
1090 		 * the prior contents of the pages exposed to a userland
1091 		 * mmap().  XXX deal with uiomove() errors in a better way.
1092 		 */
1093 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1094 			vfs_bio_clrbuf(bp);
1095 
1096 		if (uio->uio_offset + xfersize > dp->di_extsize)
1097 			dp->di_extsize = uio->uio_offset + xfersize;
1098 
1099 		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1100 		if (size < xfersize)
1101 			xfersize = size;
1102 
1103 		error =
1104 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1105 
1106 		vfs_bio_set_flags(bp, ioflag);
1107 
1108 		/*
1109 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
1110 		 * if we have a severe page deficiency write the buffer
1111 		 * asynchronously.  Otherwise try to cluster, and if that
1112 		 * doesn't do it then either do an async write (if O_DIRECT),
1113 		 * or a delayed write (if not).
1114 		 */
1115 		if (ioflag & IO_SYNC) {
1116 			(void)bwrite(bp);
1117 		} else if (vm_page_count_severe() ||
1118 			    buf_dirty_count_severe() ||
1119 			    xfersize + blkoffset == fs->fs_bsize ||
1120 			    (ioflag & (IO_ASYNC | IO_DIRECT)))
1121 			bawrite(bp);
1122 		else
1123 			bdwrite(bp);
1124 		if (error || xfersize == 0)
1125 			break;
1126 		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1127 	}
1128 	/*
1129 	 * If we successfully wrote any data, and we are not the superuser
1130 	 * we clear the setuid and setgid bits as a precaution against
1131 	 * tampering.
1132 	 */
1133 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1134 		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1135 			ip->i_mode &= ~(ISUID | ISGID);
1136 			dp->di_mode = ip->i_mode;
1137 		}
1138 	}
1139 	if (error) {
1140 		if (ioflag & IO_UNIT) {
1141 			(void)ffs_truncate(vp, osize,
1142 			    IO_EXT | (ioflag&IO_SYNC), ucred);
1143 			uio->uio_offset -= resid - uio->uio_resid;
1144 			uio->uio_resid = resid;
1145 		}
1146 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1147 		error = ffs_update(vp, 1);
1148 	return (error);
1149 }
1150 
1152 /*
1153  * Extended attribute lookup helper.
1154  *
1155  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1156  * the length of the EA, and possibly the pointer to the entry and to the data.
1157  */
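/*
 * Each EA record is laid out as a 32-bit total length, one byte each of
 * namespace, content pad length, and name length, then the name and the
 * content, with name and content padded so records stay 8-byte aligned;
 * EXTATTR_NEXT() advances by ea_length to the next record.
 */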
1158 static int
1159 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
1160     struct extattr **eapp, u_char **eac)
1161 {
1162 	struct extattr *eap, *eaend;
1163 	size_t nlen;
1164 
1165 	nlen = strlen(name);
1166 	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1167 	eap = (struct extattr *)ptr;
1168 	eaend = (struct extattr *)(ptr + length);
1169 	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1170 		/* make sure this entry is complete */
1171 		if (EXTATTR_NEXT(eap) > eaend)
1172 			break;
1173 		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1174 		    || memcmp(eap->ea_name, name, nlen) != 0)
1175 			continue;
1176 		if (eapp != NULL)
1177 			*eapp = eap;
1178 		if (eac != NULL)
1179 			*eac = EXTATTR_CONTENT(eap);
1180 		return (EXTATTR_CONTENT_SIZE(eap));
1181 	}
1182 	return (-1);
1183 }
1184 
1185 static int
1186 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1187 {
1188 	struct inode *ip;
1189 	struct ufs2_dinode *dp;
1190 	struct fs *fs;
1191 	struct uio luio;
1192 	struct iovec liovec;
1193 	u_int easize;
1194 	int error;
1195 	u_char *eae;
1196 
1197 	ip = VTOI(vp);
1198 	fs = ITOFS(ip);
1199 	dp = ip->i_din2;
1200 	easize = dp->di_extsize;
1201 	if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
1202 		return (EFBIG);
1203 
1204 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1205 
1206 	liovec.iov_base = eae;
1207 	liovec.iov_len = easize;
1208 	luio.uio_iov = &liovec;
1209 	luio.uio_iovcnt = 1;
1210 	luio.uio_offset = 0;
1211 	luio.uio_resid = easize;
1212 	luio.uio_segflg = UIO_SYSSPACE;
1213 	luio.uio_rw = UIO_READ;
1214 	luio.uio_td = td;
1215 
1216 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1217 	if (error) {
1218 		free(eae, M_TEMP);
1219 		return (error);
1220 	}
1221 	*p = eae;
1222 	return (0);
1223 }
1224 
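/*
 * ffs_lock_ea/ffs_unlock_ea implement a small sleep lock over the
 * cached EA area (i_ea_area, i_ea_len, i_ea_refs) using the
 * IN_EA_LOCKED/IN_EA_LOCKWAIT inode flags and the vnode interlock.
 */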
1225 static void
1226 ffs_lock_ea(struct vnode *vp)
1227 {
1228 	struct inode *ip;
1229 
1230 	ip = VTOI(vp);
1231 	VI_LOCK(vp);
1232 	while (ip->i_flag & IN_EA_LOCKED) {
1233 		UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1234 		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1235 		    0);
1236 	}
1237 	UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
1238 	VI_UNLOCK(vp);
1239 }
1240 
1241 static void
1242 ffs_unlock_ea(struct vnode *vp)
1243 {
1244 	struct inode *ip;
1245 
1246 	ip = VTOI(vp);
1247 	VI_LOCK(vp);
1248 	if (ip->i_flag & IN_EA_LOCKWAIT)
1249 		wakeup(&ip->i_ea_refs);
1250 	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1251 	VI_UNLOCK(vp);
1252 }
1253 
1254 static int
1255 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1256 {
1257 	struct inode *ip;
1258 	struct ufs2_dinode *dp;
1259 	int error;
1260 
1261 	ip = VTOI(vp);
1262 
1263 	ffs_lock_ea(vp);
1264 	if (ip->i_ea_area != NULL) {
1265 		ip->i_ea_refs++;
1266 		ffs_unlock_ea(vp);
1267 		return (0);
1268 	}
1269 	dp = ip->i_din2;
1270 	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1271 	if (error) {
1272 		ffs_unlock_ea(vp);
1273 		return (error);
1274 	}
1275 	ip->i_ea_len = dp->di_extsize;
1276 	ip->i_ea_error = 0;
1277 	ip->i_ea_refs++;
1278 	ffs_unlock_ea(vp);
1279 	return (0);
1280 }
1281 
1282 /*
1283  * Vnode extattr transaction commit/abort
1284  */
1285 static int
1286 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1287 {
1288 	struct inode *ip;
1289 	struct uio luio;
1290 	struct iovec liovec;
1291 	int error;
1292 	struct ufs2_dinode *dp;
1293 
1294 	ip = VTOI(vp);
1295 
1296 	ffs_lock_ea(vp);
1297 	if (ip->i_ea_area == NULL) {
1298 		ffs_unlock_ea(vp);
1299 		return (EINVAL);
1300 	}
1301 	dp = ip->i_din2;
1302 	error = ip->i_ea_error;
1303 	if (commit && error == 0) {
1304 		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1305 		if (cred == NOCRED)
1306 			cred = vp->v_mount->mnt_cred;
1307 		liovec.iov_base = ip->i_ea_area;
1308 		liovec.iov_len = ip->i_ea_len;
1309 		luio.uio_iov = &liovec;
1310 		luio.uio_iovcnt = 1;
1311 		luio.uio_offset = 0;
1312 		luio.uio_resid = ip->i_ea_len;
1313 		luio.uio_segflg = UIO_SYSSPACE;
1314 		luio.uio_rw = UIO_WRITE;
1315 		luio.uio_td = td;
1316 		/* XXX: I'm not happy about truncating to zero size */
1317 		if (ip->i_ea_len < dp->di_extsize)
1318 			error = ffs_truncate(vp, 0, IO_EXT, cred);
1319 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1320 	}
1321 	if (--ip->i_ea_refs == 0) {
1322 		free(ip->i_ea_area, M_TEMP);
1323 		ip->i_ea_area = NULL;
1324 		ip->i_ea_len = 0;
1325 		ip->i_ea_error = 0;
1326 	}
1327 	ffs_unlock_ea(vp);
1328 	return (error);
1329 }
1330 
1331 /*
1332  * Vnode extattr strategy routine for fifos.
1333  *
1334  * We need to check for a read or write of the external attributes.
1335  * Otherwise we just fall through and do the usual thing.
1336  */
1337 static int
1338 ffsext_strategy(struct vop_strategy_args *ap)
1339 /*
1340 struct vop_strategy_args {
1341 	struct vnodeop_desc *a_desc;
1342 	struct vnode *a_vp;
1343 	struct buf *a_bp;
1344 };
1345 */
1346 {
1347 	struct vnode *vp;
1348 	daddr_t lbn;
1349 
1350 	vp = ap->a_vp;
1351 	lbn = ap->a_bp->b_lblkno;
1352 	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1353 		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1354 	if (vp->v_type == VFIFO)
1355 		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1356 	panic("spec nodes went here");
1357 }
1358 
1359 /*
1360  * Vnode extattr transaction start.
1361  */
1362 static int
1363 ffs_openextattr(struct vop_openextattr_args *ap)
1364 /*
1365 struct vop_openextattr_args {
1366 	struct vnodeop_desc *a_desc;
1367 	struct vnode *a_vp;
1368 	IN struct ucred *a_cred;
1369 	IN struct thread *a_td;
1370 };
1371 */
1372 {
1373 
1374 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1375 		return (EOPNOTSUPP);
1376 
1377 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1378 }
1379 
1380 
1382  * Vnode extattr transaction commit/abort
1383  */
1384 static int
1385 ffs_closeextattr(struct vop_closeextattr_args *ap)
1386 /*
1387 struct vop_closeextattr_args {
1388 	struct vnodeop_desc *a_desc;
1389 	struct vnode *a_vp;
1390 	int a_commit;
1391 	IN struct ucred *a_cred;
1392 	IN struct thread *a_td;
1393 };
1394 */
1395 {
1396 
1397 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1398 		return (EOPNOTSUPP);
1399 
1400 	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
1401 		return (EROFS);
1402 
1403 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1404 }
1405 
1406 /*
1407  * Vnode operation to remove a named attribute.
1408  */
1409 static int
1410 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1411 /*
1412 vop_deleteextattr {
1413 	IN struct vnode *a_vp;
1414 	IN int a_attrnamespace;
1415 	IN const char *a_name;
1416 	IN struct ucred *a_cred;
1417 	IN struct thread *a_td;
1418 };
1419 */
1420 {
1421 	struct inode *ip;
1422 	struct extattr *eap;
1423 	uint32_t ul;
1424 	int olen, error, i, easize;
1425 	u_char *eae;
1426 	void *tmp;
1427 
1428 	ip = VTOI(ap->a_vp);
1429 
1430 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1431 		return (EOPNOTSUPP);
1432 
1433 	if (strlen(ap->a_name) == 0)
1434 		return (EINVAL);
1435 
1436 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1437 		return (EROFS);
1438 
1439 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1440 	    ap->a_cred, ap->a_td, VWRITE);
1441 	if (error) {
1443 		/*
1444 		 * ffs_lock_ea is not needed there, because the vnode
1445 		 * must be exclusively locked.
1446 		 */
1447 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1448 			ip->i_ea_error = error;
1449 		return (error);
1450 	}
1451 
1452 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1453 	if (error)
1454 		return (error);
1455 
1456 	/* CEM: delete could be done in-place instead */
1457 	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1458 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1459 	easize = ip->i_ea_len;
1460 
1461 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1462 	    &eap, NULL);
1463 	if (olen == -1) {
1464 		/* Requested delete of a nonexistent attribute. */
1465 		free(eae, M_TEMP);
1466 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1467 		return (ENOATTR);
1468 	}
1469 	ul = eap->ea_length;
1470 	i = (u_char *)EXTATTR_NEXT(eap) - eae;
1471 	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1472 	easize -= ul;
1473 
1474 	tmp = ip->i_ea_area;
1475 	ip->i_ea_area = eae;
1476 	ip->i_ea_len = easize;
1477 	free(tmp, M_TEMP);
1478 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1479 	return (error);
1480 }
1481 
1482 /*
1483  * Vnode operation to retrieve a named extended attribute.
1484  */
1485 static int
1486 ffs_getextattr(struct vop_getextattr_args *ap)
1487 /*
1488 vop_getextattr {
1489 	IN struct vnode *a_vp;
1490 	IN int a_attrnamespace;
1491 	IN const char *a_name;
1492 	INOUT struct uio *a_uio;
1493 	OUT size_t *a_size;
1494 	IN struct ucred *a_cred;
1495 	IN struct thread *a_td;
1496 };
1497 */
1498 {
1499 	struct inode *ip;
1500 	u_char *eae, *p;
1501 	unsigned easize;
1502 	int error, ealen;
1503 
1504 	ip = VTOI(ap->a_vp);
1505 
1506 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1507 		return (EOPNOTSUPP);
1508 
1509 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1510 	    ap->a_cred, ap->a_td, VREAD);
1511 	if (error)
1512 		return (error);
1513 
1514 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1515 	if (error)
1516 		return (error);
1517 
1518 	eae = ip->i_ea_area;
1519 	easize = ip->i_ea_len;
1520 
1521 	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1522 	    NULL, &p);
1523 	if (ealen >= 0) {
1524 		error = 0;
1525 		if (ap->a_size != NULL)
1526 			*ap->a_size = ealen;
1527 		else if (ap->a_uio != NULL)
1528 			error = uiomove(p, ealen, ap->a_uio);
1529 	} else
1530 		error = ENOATTR;
1531 
1532 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1533 	return (error);
1534 }
1535 
1536 /*
1537  * Vnode operation to retrieve extended attributes on a vnode.
1538  */
1539 static int
1540 ffs_listextattr(struct vop_listextattr_args *ap)
1541 /*
1542 vop_listextattr {
1543 	IN struct vnode *a_vp;
1544 	IN int a_attrnamespace;
1545 	INOUT struct uio *a_uio;
1546 	OUT size_t *a_size;
1547 	IN struct ucred *a_cred;
1548 	IN struct thread *a_td;
1549 };
1550 */
1551 {
1552 	struct inode *ip;
1553 	struct extattr *eap, *eaend;
1554 	int error, ealen;
1555 
1556 	ip = VTOI(ap->a_vp);
1557 
1558 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1559 		return (EOPNOTSUPP);
1560 
1561 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1562 	    ap->a_cred, ap->a_td, VREAD);
1563 	if (error)
1564 		return (error);
1565 
1566 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1567 	if (error)
1568 		return (error);
1569 
1570 	error = 0;
1571 	if (ap->a_size != NULL)
1572 		*ap->a_size = 0;
1573 
1574 	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1575 	eap = (struct extattr *)ip->i_ea_area;
1576 	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1577 	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1578 		/* make sure this entry is complete */
1579 		if (EXTATTR_NEXT(eap) > eaend)
1580 			break;
1581 		if (eap->ea_namespace != ap->a_attrnamespace)
1582 			continue;
1583 
1584 		ealen = eap->ea_namelength;
1585 		if (ap->a_size != NULL)
1586 			*ap->a_size += ealen + 1;
1587 		else if (ap->a_uio != NULL)
1588 			error = uiomove(&eap->ea_namelength, ealen + 1,
1589 			    ap->a_uio);
1590 	}
1591 
1592 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1593 	return (error);
1594 }
1595 
1596 /*
1597  * Vnode operation to set a named attribute.
1598  */
1599 static int
1600 ffs_setextattr(struct vop_setextattr_args *ap)
1601 /*
1602 vop_setextattr {
1603 	IN struct vnode *a_vp;
1604 	IN int a_attrnamespace;
1605 	IN const char *a_name;
1606 	INOUT struct uio *a_uio;
1607 	IN struct ucred *a_cred;
1608 	IN struct thread *a_td;
1609 };
1610 */
1611 {
1612 	struct inode *ip;
1613 	struct fs *fs;
1614 	struct extattr *eap;
1615 	uint32_t ealength, ul;
1616 	ssize_t ealen;
1617 	int olen, eapad1, eapad2, error, i, easize;
1618 	u_char *eae;
1619 	void *tmp;
1620 
1621 	ip = VTOI(ap->a_vp);
1622 	fs = ITOFS(ip);
1623 
1624 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1625 		return (EOPNOTSUPP);
1626 
1627 	if (strlen(ap->a_name) == 0)
1628 		return (EINVAL);
1629 
1630 	/* XXX Deleting EAs by passing a NULL uio is no longer supported. */
1631 	if (ap->a_uio == NULL)
1632 		return (EOPNOTSUPP);
1633 
1634 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1635 		return (EROFS);
1636 
1637 	ealen = ap->a_uio->uio_resid;
1638 	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1639 		return (EINVAL);
1640 
1641 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1642 	    ap->a_cred, ap->a_td, VWRITE);
1643 	if (error) {
1645 		/*
1646 		 * ffs_lock_ea is not needed there, because the vnode
1647 		 * must be exclusively locked.
1648 		 */
1649 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1650 			ip->i_ea_error = error;
1651 		return (error);
1652 	}
1653 
1654 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1655 	if (error)
1656 		return (error);
1657 
1658 	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1659 	eapad1 = roundup2(ealength, 8) - ealength;
1660 	eapad2 = roundup2(ealen, 8) - ealen;
1661 	ealength += eapad1 + ealen + eapad2;
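	/*
	 * Worked example (hypothetical attribute): a 3-byte name with a
	 * 16-byte value gives a 4 + 3 + 3 = 10 byte header+name, padded
	 * by eapad1 = 6 up to 16; the value needs eapad2 = 0, so the
	 * record's ealength is 10 + 6 + 16 + 0 = 32 bytes.
	 */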
1662 
1663 	/*
1664 	 * CEM: rewrites of the same size or smaller could be done in-place
1665 	 * instead.  (We don't acquire any fine-grained locks in here either,
1666 	 * so we could also do bigger writes in-place.)
1667 	 */
1668 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1669 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1670 	easize = ip->i_ea_len;
1671 
1672 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1673 	    &eap, NULL);
1674 	if (olen == -1) {
1675 		/* new, append at end */
1676 		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1677 		    ("unaligned"));
1678 		eap = (struct extattr *)(eae + easize);
1679 		easize += ealength;
1680 	} else {
1681 		ul = eap->ea_length;
1682 		i = (u_char *)EXTATTR_NEXT(eap) - eae;
1683 		if (ul != ealength) {
1684 			bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
1685 			    easize - i);
1686 			easize += (ealength - ul);
1687 		}
1688 	}
1689 	if (easize > lblktosize(fs, UFS_NXADDR)) {
1690 		free(eae, M_TEMP);
1691 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1692 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1693 			ip->i_ea_error = ENOSPC;
1694 		return (ENOSPC);
1695 	}
1696 	eap->ea_length = ealength;
1697 	eap->ea_namespace = ap->a_attrnamespace;
1698 	eap->ea_contentpadlen = eapad2;
1699 	eap->ea_namelength = strlen(ap->a_name);
1700 	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1701 	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1702 	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1703 	if (error) {
1704 		free(eae, M_TEMP);
1705 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1706 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1707 			ip->i_ea_error = error;
1708 		return (error);
1709 	}
1710 	bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1711 
1712 	tmp = ip->i_ea_area;
1713 	ip->i_ea_area = eae;
1714 	ip->i_ea_len = easize;
1715 	free(tmp, M_TEMP);
1716 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1717 	return (error);
1718 }
1719 
1720 /*
1721  * Vnode pointer to File handle
1722  */
1723 static int
1724 ffs_vptofh(struct vop_vptofh_args *ap)
1725 /*
1726 vop_vptofh {
1727 	IN struct vnode *a_vp;
1728 	IN struct fid *a_fhp;
1729 };
1730 */
1731 {
1732 	struct inode *ip;
1733 	struct ufid *ufhp;
1734 
1735 	ip = VTOI(ap->a_vp);
1736 	ufhp = (struct ufid *)ap->a_fhp;
1737 	ufhp->ufid_len = sizeof(struct ufid);
1738 	ufhp->ufid_ino = ip->i_number;
1739 	ufhp->ufid_gen = ip->i_gen;
1740 	return (0);
1741 }
1742 
1743 SYSCTL_DECL(_vfs_ffs);
1744 static int use_buf_pager = 1;
1745 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1746     "Always use buffer pager instead of bmap");
1747 
1748 static daddr_t
1749 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1750 {
1751 
1752 	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1753 }
1754 
1755 static int
1756 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
1757 {
1758 
1759 	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
1760 }
1761 
1762 static int
1763 ffs_getpages(struct vop_getpages_args *ap)
1764 {
1765 	struct vnode *vp;
1766 	struct ufsmount *um;
1767 
1768 	vp = ap->a_vp;
1769 	um = VFSTOUFS(vp->v_mount);
1770 
1771 	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1772 		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1773 		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
1774 	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1775 	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1776 }
1777 
1778 static int
1779 ffs_getpages_async(struct vop_getpages_async_args *ap)
1780 {
1781 	struct vnode *vp;
1782 	struct ufsmount *um;
1783 	bool do_iodone;
1784 	int error;
1785 
1786 	vp = ap->a_vp;
1787 	um = VFSTOUFS(vp->v_mount);
1788 	do_iodone = true;
1789 
1790 	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
1791 		error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1792 		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
1793 		if (error == 0)
1794 			do_iodone = false;
1795 	} else {
1796 		error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
1797 		    ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
1798 		    ffs_gbp_getblksz);
1799 	}
1800 	if (do_iodone && ap->a_iodone != NULL)
1801 		ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1802 
1803 	return (error);
1804 }
1805 
1806