/*-
 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
 *
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#define	ALIGNED_TO(ptr, s)	\
	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)

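/*
 * Example (informational): ALIGNED_TO(ptr, struct extattr) is nonzero when
 * ptr lies on an _Alignof(struct extattr) boundary; the extended attribute
 * code below relies on this before casting raw byte buffers to
 * struct extattr.
 */
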
#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t	ffs_fdatasync;
static vop_fsync_t	ffs_fsync;
static vop_getpages_t	ffs_getpages;
static vop_getpages_async_t	ffs_getpages_async;
static vop_lock1_t	ffs_lock;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
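
/*
 * Sketch (informational): these tables are consulted by the VOP dispatch
 * macros, so a call such as VOP_READ(vp, uio, ioflag, cred) on a UFS2
 * regular file resolves through ffs_vnodeops2 to ffs_read() below, and
 * anything not listed here falls through to .vop_default (ufs_vnodeops).
 */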

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop the vp lock,
		 * allowing dirty buffers to reappear on the bo_dirty
		 * list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}
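
/*
 * Usage sketch (not part of this file's API): callers normally reach
 * ffs_fsync() through the VOP layer, e.g. the fsync(2) path does roughly:
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_FSYNC(vp, MNT_WAIT, td);
 *	VOP_UNLOCK(vp, 0);
 */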

int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp, *nbp;
	ufs_lbn_t lbn;
	int error, passes;
	bool still_dirty, wait;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = false;	/* Always do an async pass first. */
	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/*
		 * Flush indirects in order, if requested.
		 *
		 * Note that if only datasync is requested, we can
		 * skip indirect blocks when softupdates are not
		 * active.  Otherwise we must flush them with data,
		 * since dependencies prevent data block writes.
		 */
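		/*
		 * Worked example (informational): indirect blocks live at
		 * negative logical block numbers at or below -UFS_NDADDR,
		 * and lbn_level() gives an indirection depth starting at 0
		 * for single indirects, so deeper indirects are deferred to
		 * later passes as "passes" is incremented below.
		 */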
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
		    (lbn_level(bp->b_lblkno) >= passes ||
		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
			BO_UNLOCK(bo);
		} else if (wait) {
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) != 0) {
				bp->b_vflags &= ~BV_SCANNED;
				goto next;
			}
		} else
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
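	/*
	 * Concretely (a sketch of the bound): wait flips between the async
	 * and sync flavor on every trip back through the loop below, and
	 * passes is only advanced when flipping back to the async side, so
	 * at most UFS_NIADDR + 2 async/sync pass pairs are attempted before
	 * giving up.
	 */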
	if (bo->bo_dirty.bv_cnt > 0) {
		if ((flags & DATA_ONLY) == 0) {
			still_dirty = true;
		} else {
			/*
			 * For data-only sync, dirty indirect buffers
			 * are ignored.
			 */
			still_dirty = false;
			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
				if (bp->b_lblkno > -UFS_NDADDR) {
					still_dirty = true;
					break;
				}
			}
		}

		if (still_dirty) {
			/* Write the inode after sync passes to flush deps. */
			if (wait && DOINGSOFTDEP(vp) &&
			    (flags & NO_INO_UPDT) == 0) {
				BO_UNLOCK(bo);
				ffs_update(vp, 1);
				BO_LOCK(bo);
			}
			/* switch between sync/async. */
			wait = !wait;
			if (wait || ++passes < UFS_NIADDR + 2)
				goto loop;
		}
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & DATA_ONLY) == 0) {
		if ((flags & NO_INO_UPDT) == 0)
			error = ffs_update(vp, 1);
		if (DOINGSUJ(vp))
			softdep_journal_fsync(VTOI(vp));
	}
	return (error);
}

static int
ffs_fdatasync(struct vop_fdatasync_args *ap)
{

	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
}

static int
ffs_lock(
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap)
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

static int
ffs_read_hole(struct uio *uio, long xfersize, long *size)
{
	ssize_t saved_resid, tlen;
	int error;

	while (xfersize > 0) {
		tlen = min(xfersize, ZERO_REGION_SIZE);
		saved_resid = uio->uio_resid;
		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
		    tlen, uio);
		if (error != 0)
			return (error);
		tlen = saved_resid - uio->uio_resid;
		xfersize -= tlen;
		*size -= tlen;
	}
	return (0);
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap)
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int bflag, error, ioflag, seqcount;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * Size of the buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, bflag, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = blksize(fs, ip, nextlbn);

			error = breadn_flags(vp, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		}
		if (error == EJUSTRETURN) {
			error = ffs_read_hole(uio, xfersize, &size);
			if (error == 0)
				continue;
		}
		if (error != 0) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		if (error)
			break;

		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * This can only happen in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}
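
/*
 * Usage sketch (not FFS-specific): in-kernel consumers typically reach
 * this read path through a helper such as vn_rdwr(), e.g.:
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, len, offset, UIO_SYSSPACE,
 *	    IO_NODELOCKED, cred, NOCRED, NULL, td);
 *
 * which builds the struct uio that ffs_read() consumes above.
 */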

/*
 * Vnode op for writing.
 */
static int
ffs_write(
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap)
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset, (int)uio->uio_resid);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;
	flags |= BA_UNMAPPED;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount,
				    GB_UNMAPPED);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * Size of the buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;
		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * This can only happen in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid >
	    UFS_NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    struct extattr **eapp, u_char **eac)
{
	struct extattr *eap, *eaend;
	size_t nlen;

	nlen = strlen(name);
	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
	eap = (struct extattr *)ptr;
	eaend = (struct extattr *)(ptr + length);
	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
		/* make sure this entry is complete */
		if (EXTATTR_NEXT(eap) > eaend)
			break;
		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
		    || memcmp(eap->ea_name, name, nlen) != 0)
			continue;
		if (eapp != NULL)
			*eapp = eap;
		if (eac != NULL)
			*eac = EXTATTR_CONTENT(eap);
		return (EXTATTR_CONTENT_SIZE(eap));
	}
	return (-1);
}
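
/*
 * On-disk layout reminder (see <ufs/ufs/extattr.h>): each record starts
 * with a 32-bit ea_length covering the whole entry, followed by the
 * namespace, content pad length and name length bytes, the name itself,
 * and the (padded) content; EXTATTR_NEXT() uses ea_length to hop to the
 * next 8-byte-aligned record.
 */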

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	u_int easize;
	int error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		if (error == 0)
			error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

1329  * Vnode extattr transaction commit/abort
1330  */
1331 static int
1332 ffs_openextattr(struct vop_openextattr_args *ap)
1333 /*
1334 struct vop_openextattr_args {
1335 	struct vnodeop_desc *a_desc;
1336 	struct vnode *a_vp;
1337 	IN struct ucred *a_cred;
1338 	IN struct thread *a_td;
1339 };
1340 */
1341 {
1342 
1343 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1344 		return (EOPNOTSUPP);
1345 
1346 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1347 }
1348 
1349 
/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct extattr *eap;
	uint32_t ul;
	int olen, error, i, easize;
	u_char *eae;
	void *tmp;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	/* CEM: delete could be done in-place instead */
	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &eap, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	ul = eap->ea_length;
	i = (u_char *)EXTATTR_NEXT(eap) - eae;
	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
	easize -= ul;

	tmp = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(tmp, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct extattr *eap, *eaend;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;

	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
	eap = (struct extattr *)ip->i_ea_area;
	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
		/* make sure this entry is complete */
		if (EXTATTR_NEXT(eap) > eaend)
			break;
		if (eap->ea_namespace != ap->a_attrnamespace)
			continue;

		ealen = eap->ea_namelength;
		if (ap->a_size != NULL)
			*ap->a_size += ealen + 1;
		else if (ap->a_uio != NULL)
			error = uiomove(&eap->ea_namelength, ealen + 1,
			    ap->a_uio);
	}

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	struct extattr *eap;
	uint32_t ealength, ul;
	ssize_t ealen;
	int olen, eapad1, eapad2, error, i, easize;
	u_char *eae;
	void *tmp;

	ip = VTOI(ap->a_vp);
	fs = ITOFS(ip);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	ealen = ap->a_uio->uio_resid;
	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = roundup2(ealength, 8) - ealength;
	eapad2 = roundup2(ealen, 8) - ealen;
	ealength += eapad1 + ealen + eapad2;
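	/*
	 * Worked example (hypothetical values): for a 4-byte name and a
	 * 10-byte value, the header is 4 + 3 + 4 = 11 bytes, so eapad1 = 5
	 * pads the name out to 16; the value needs eapad2 = 6 to reach the
	 * next 8-byte boundary, giving ealength = 11 + 5 + 10 + 6 = 32.
	 */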

	/*
	 * CEM: rewrites of the same size or smaller could be done in-place
	 * instead.  (We don't acquire any fine-grained locks in here either,
	 * so we could also do bigger writes in-place.)
	 */
	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &eap, NULL);
	if (olen == -1) {
		/* new, append at end */
		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
		    ("unaligned"));
		eap = (struct extattr *)(eae + easize);
		easize += ealength;
	} else {
		ul = eap->ea_length;
		i = (u_char *)EXTATTR_NEXT(eap) - eae;
		if (ul != ealength) {
			bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
			    easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > lblktosize(fs, UFS_NXADDR)) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	eap->ea_length = ealength;
	eap->ea_namespace = ap->a_attrnamespace;
	eap->ea_contentpadlen = eapad2;
	eap->ea_namelength = strlen(ap->a_name);
	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);

	tmp = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(tmp, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}

SYSCTL_DECL(_vfs_ffs);
static int use_buf_pager = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
    "Always use buffer pager instead of bmap");
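
/*
 * Example (sketch): the knob above is a CTLFLAG_RWTUN, so it can be set
 * as a loader tunable (vfs.ffs.use_buf_pager=0 in loader.conf) or at
 * runtime with "sysctl vfs.ffs.use_buf_pager=0" to fall back to the
 * bmap-based generic pager where the device block size permits.
 */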

static daddr_t
ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{

	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
}

static int
ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{

	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
}

static int
ffs_getpages(struct vop_getpages_args *ap)
{
	struct vnode *vp;
	struct ufsmount *um;

	vp = ap->a_vp;
	um = VFSTOUFS(vp->v_mount);

	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
}

static int
ffs_getpages_async(struct vop_getpages_async_args *ap)
{
	struct vnode *vp;
	struct ufsmount *um;
	int error;

	vp = ap->a_vp;
	um = VFSTOUFS(vp->v_mount);

	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));

	error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz);
	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);

	return (error);
}
1768