/*-
 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
 *
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#define	ALIGNED_TO(ptr, s)	\
	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t	ffs_fdatasync;
static vop_fsync_t	ffs_fsync;
static vop_getpages_t	ffs_getpages;
static vop_getpages_async_t	ffs_getpages_async;
static vop_lock1_t	ffs_lock;
#ifdef INVARIANTS
static vop_unlock_t	ffs_unlock_debug;
#endif
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs (UFS1 filesystems). */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);

/* Global vfs data structures for ufs (UFS2 filesystems). */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};
VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);

/*
 * Sync an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop the vp lock,
		 * allowing dirty buffers to reappear on the bo_dirty
		 * list. Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}

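/*
 * Write out all dirty buffers associated with a vnode.  The waitfor
 * argument selects MNT_WAIT or MNT_NOWAIT semantics; flags may include
 * NO_INO_UPDT to suppress the inode update and DATA_ONLY to skip
 * metadata where the dependency rules allow it.
 */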
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp, *nbp;
	ufs_lbn_t lbn;
	int error, passes;
	bool still_dirty, wait;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = false;	/* Always do an async pass first. */
	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/*
		 * Flush indirects in order, if requested.
		 *
		 * Note that if only datasync is requested, we can
		 * skip indirect blocks when softupdates are not
		 * active.  Otherwise we must flush them with data,
		 * since dependencies prevent data block writes.
		 */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
		    (lbn_level(bp->b_lblkno) >= passes ||
		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
			BO_UNLOCK(bo);
		} else if (wait) {
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) != 0) {
				bp->b_vflags &= ~BV_SCANNED;
				goto next;
			}
		} else
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain I/O to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		if ((flags & DATA_ONLY) == 0) {
			still_dirty = true;
		} else {
			/*
			 * For data-only sync, dirty indirect buffers
			 * are ignored.
			 */
			still_dirty = false;
			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
				if (bp->b_lblkno > -UFS_NDADDR) {
					still_dirty = true;
					break;
				}
			}
		}

		if (still_dirty) {
			/* Write the inode after sync passes to flush deps. */
			if (wait && DOINGSOFTDEP(vp) &&
			    (flags & NO_INO_UPDT) == 0) {
				BO_UNLOCK(bo);
				ffs_update(vp, 1);
				BO_LOCK(bo);
			}
			/* Switch between sync/async. */
			wait = !wait;
			if (wait || ++passes < UFS_NIADDR + 2)
				goto loop;
		}
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & DATA_ONLY) == 0) {
		if ((flags & NO_INO_UPDT) == 0)
			error = ffs_update(vp, 1);
		if (DOINGSUJ(vp))
			softdep_journal_fsync(VTOI(vp));
	}
	return (error);
}

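/*
 * fdatasync(2) implementation: sync the file data and only as much
 * metadata as is needed to retrieve it, by calling ffs_syncvnode()
 * with DATA_ONLY.
 */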
static int
ffs_fdatasync(struct vop_fdatasync_args *ap)
{

	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
}

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			VNPASS(vp->v_holdcnt != 0, vp);
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

#ifdef INVARIANTS
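/*
 * INVARIANTS-only unlock path: before unlocking, assert that a vnode
 * whose inode carries assertable lazy-mask flags is actually present
 * on the mount's lazy vnode list.
 */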
static int
ffs_unlock_debug(struct vop_unlock_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);

	if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
		if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
			VI_LOCK(vp);
			VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
			    ("%s: modified vnode (%x) not on lazy list",
			    __func__, ip->i_flag));
			VI_UNLOCK(vp);
		}
	}
	return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
}
#endif

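/*
 * Satisfy a read from a hole in the file by copying zeros from the
 * shared kernel zero_region instead of performing any I/O.
 */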
static int
ffs_read_hole(struct uio *uio, long xfersize, long *size)
{
	ssize_t saved_resid, tlen;
	int error;

	while (xfersize > 0) {
		tlen = min(xfersize, ZERO_REGION_SIZE);
		saved_resid = uio->uio_resid;
		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
		    tlen, uio);
		if (error != 0)
			return (error);
		tlen = saved_resid - uio->uio_resid;
		xfersize -= tlen;
		*size -= tlen;
	}
	return (0);
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int bflag, error, ioflag, seqcount;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * Size of the buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one filesystem block minus the amount of data that
		 * precedes our start point within that block.
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, bflag, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = blksize(fs, ip, nextlbn);
			error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		}
		if (error == EJUSTRETURN) {
			error = ffs_read_hole(uio, xfersize, &size);
			if (error == 0)
				continue;
		}
		if (error != 0) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		if (error)
			break;

		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * bp can be non-NULL here only in the error case, because the
	 * loop above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
		UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset, (int)uio->uio_resid);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;
	flags |= BA_UNMAPPED;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC, each buffer is written synchronously.
		 * Otherwise, if we have a severe page deficiency, write
		 * the buffer asynchronously.  Otherwise, try to cluster;
		 * if that doesn't do it, then either do an async write
		 * (if O_DIRECT) or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount,
				    GB_UNMAPPED);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
	}
	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * Size of the buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one filesystem block minus the amount of data that
		 * precedes our start point within that block.
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;
		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * bp can be non-NULL here only in the error case, because the
	 * loop above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid >
	    UFS_NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors in a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC, each buffer is written synchronously.
		 * Otherwise, if we have a severe page deficiency, write
		 * the buffer asynchronously.  Otherwise, try to cluster;
		 * if that doesn't do it, then either do an async write
		 * (if O_DIRECT) or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
	}
	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and optionally pointers to the entry and to the data.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    struct extattr **eapp, u_char **eac)
{
	struct extattr *eap, *eaend;
	size_t nlen;

	nlen = strlen(name);
	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
	eap = (struct extattr *)ptr;
	eaend = (struct extattr *)(ptr + length);
	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
		/* Make sure this entry is complete. */
		if (EXTATTR_NEXT(eap) > eaend)
			break;
		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
		    || memcmp(eap->ea_name, name, nlen) != 0)
			continue;
		if (eapp != NULL)
			*eapp = eap;
		if (eac != NULL)
			*eac = EXTATTR_CONTENT(eap);
		return (EXTATTR_CONTENT_SIZE(eap));
	}
	return (-1);
}

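/*
 * Read the extended attribute area of a UFS2 inode into a freshly
 * allocated buffer, leaving room for "extra" bytes of growth.
 */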
static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	u_int easize;
	int error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

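/*
 * Serialize access to the in-memory extended attribute area with the
 * IN_EA_LOCKED inode flag, sleeping on i_ea_refs while another thread
 * holds the area locked.
 */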
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

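/*
 * Vnode extattr transaction start: read the EA area into memory, or
 * just bump the reference count if it is already cached.
 */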
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction start.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct extattr *eap;
	uint32_t ul;
	int olen, error, i, easize;
	u_char *eae;
	void *tmp;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	/* CEM: delete could be done in-place instead */
	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &eap, NULL);
	if (olen == -1) {
		/* Delete requested, but the attribute does not exist. */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	ul = eap->ea_length;
	i = (u_char *)EXTATTR_NEXT(eap) - eae;
	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
	easize -= ul;

	tmp = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(tmp, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct extattr *eap, *eaend;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;

	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
	eap = (struct extattr *)ip->i_ea_area;
	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
		/* Make sure this entry is complete. */
		if (EXTATTR_NEXT(eap) > eaend)
			break;
		if (eap->ea_namespace != ap->a_attrnamespace)
			continue;

		ealen = eap->ea_namelength;
		if (ap->a_size != NULL)
			*ap->a_size += ealen + 1;
		else if (ap->a_uio != NULL)
			error = uiomove(&eap->ea_namelength, ealen + 1,
			    ap->a_uio);
	}

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	struct extattr *eap;
	uint32_t ealength, ul;
	ssize_t ealen;
	int olen, eapad1, eapad2, error, i, easize;
	u_char *eae;
	void *tmp;

	ip = VTOI(ap->a_vp);
	fs = ITOFS(ip);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX The API to delete EAs using a NULL uio is now unsupported. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	ealen = ap->a_uio->uio_resid;
	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = roundup2(ealength, 8) - ealength;
	eapad2 = roundup2(ealen, 8) - ealen;
	ealength += eapad1 + ealen + eapad2;

	/*
	 * CEM: rewrites of the same size or smaller could be done in-place
	 * instead.  (We don't acquire any fine-grained locks in here either,
	 * so we could also do bigger writes in-place.)
	 */
	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &eap, NULL);
	if (olen == -1) {
		/* New attribute: append at the end. */
		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
		    ("unaligned"));
		eap = (struct extattr *)(eae + easize);
		easize += ealength;
	} else {
		ul = eap->ea_length;
		i = (u_char *)EXTATTR_NEXT(eap) - eae;
		if (ul != ealength) {
			bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
			    easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > lblktosize(fs, UFS_NXADDR)) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	eap->ea_length = ealength;
	eap->ea_namespace = ap->a_attrnamespace;
	eap->ea_contentpadlen = eapad2;
	eap->ea_namelength = strlen(ap->a_name);
	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);

	tmp = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(tmp, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to file handle.
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}

SYSCTL_DECL(_vfs_ffs);
static int use_buf_pager = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
    "Always use buffer pager instead of bmap");

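/*
 * Buffer-pager callbacks: map a file offset to its logical block
 * number, and report the size of a given logical block.
 */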
static daddr_t
ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{

	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
}

static int
ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
{

	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
}

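/*
 * Use the buffer pager for getpages unless it has been disabled via
 * the vfs.ffs.use_buf_pager tunable and the underlying device block
 * size does not exceed the page size, in which case fall back to the
 * generic vnode pager.
 */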
static int
ffs_getpages(struct vop_getpages_args *ap)
{
	struct vnode *vp;
	struct ufsmount *um;

	vp = ap->a_vp;
	um = VFSTOUFS(vp->v_mount);

	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
}

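/*
 * Asynchronous variant of ffs_getpages().  When the buffer pager is
 * used, vfs_bio_getpages() completes synchronously, so invoke the
 * caller's iodone callback directly with the result.
 */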
static int
ffs_getpages_async(struct vop_getpages_async_args *ap)
{
	struct vnode *vp;
	struct ufsmount *um;
	int error;

	vp = ap->a_vp;
	um = VFSTOUFS(vp->v_mount);

	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));

	error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz);
	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);

	return (error);
}
1802