xref: /freebsd/sys/ufs/ffs/ffs_vnops.c (revision 480093f4440d54b30b3025afeac24b48f2ba7a2e)
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
3  *
4  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
5  * All rights reserved.
6  *
7  * This software was developed for the FreeBSD Project by Marshall
8  * Kirk McKusick and Network Associates Laboratories, the Security
9  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
11  * research program
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1982, 1986, 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
62  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
63  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
64  */
65 
66 #include <sys/cdefs.h>
67 __FBSDID("$FreeBSD$");
68 
69 #include <sys/param.h>
70 #include <sys/bio.h>
71 #include <sys/systm.h>
72 #include <sys/buf.h>
73 #include <sys/conf.h>
74 #include <sys/extattr.h>
75 #include <sys/kernel.h>
76 #include <sys/limits.h>
77 #include <sys/malloc.h>
78 #include <sys/mount.h>
79 #include <sys/priv.h>
80 #include <sys/rwlock.h>
81 #include <sys/stat.h>
82 #include <sys/sysctl.h>
83 #include <sys/vmmeter.h>
84 #include <sys/vnode.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_extern.h>
89 #include <vm/vm_object.h>
90 #include <vm/vm_page.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vnode_pager.h>
93 
94 #include <ufs/ufs/extattr.h>
95 #include <ufs/ufs/quota.h>
96 #include <ufs/ufs/inode.h>
97 #include <ufs/ufs/ufs_extern.h>
98 #include <ufs/ufs/ufsmount.h>
99 
100 #include <ufs/ffs/fs.h>
101 #include <ufs/ffs/ffs_extern.h>
102 #include "opt_directio.h"
103 #include "opt_ffs.h"
104 
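/*
 * ALIGNED_TO(ptr, s) evaluates to true when ptr satisfies the alignment
 * requirement of type s, e.g. ALIGNED_TO(ptr, struct extattr) below.
 */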
105 #define	ALIGNED_TO(ptr, s)	\
106 	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
107 
108 #ifdef DIRECTIO
109 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
110 #endif
111 static vop_fdatasync_t	ffs_fdatasync;
112 static vop_fsync_t	ffs_fsync;
113 static vop_getpages_t	ffs_getpages;
114 static vop_getpages_async_t	ffs_getpages_async;
115 static vop_lock1_t	ffs_lock;
116 #ifdef INVARIANTS
117 static vop_unlock_t	ffs_unlock_debug;
118 #endif
119 static vop_read_t	ffs_read;
120 static vop_write_t	ffs_write;
121 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
122 static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
123 		    struct ucred *cred);
124 static vop_strategy_t	ffsext_strategy;
125 static vop_closeextattr_t	ffs_closeextattr;
126 static vop_deleteextattr_t	ffs_deleteextattr;
127 static vop_getextattr_t	ffs_getextattr;
128 static vop_listextattr_t	ffs_listextattr;
129 static vop_openextattr_t	ffs_openextattr;
130 static vop_setextattr_t	ffs_setextattr;
131 static vop_vptofh_t	ffs_vptofh;
132 
133 /* Global vfs data structures for ufs (UFS1 vectors, without extattr ops). */
134 struct vop_vector ffs_vnodeops1 = {
135 	.vop_default =		&ufs_vnodeops,
136 	.vop_fsync =		ffs_fsync,
137 	.vop_fdatasync =	ffs_fdatasync,
138 	.vop_getpages =		ffs_getpages,
139 	.vop_getpages_async =	ffs_getpages_async,
140 	.vop_lock1 =		ffs_lock,
141 #ifdef INVARIANTS
142 	.vop_unlock =		ffs_unlock_debug,
143 #endif
144 	.vop_read =		ffs_read,
145 	.vop_reallocblks =	ffs_reallocblks,
146 	.vop_write =		ffs_write,
147 	.vop_vptofh =		ffs_vptofh,
148 };
149 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
150 
151 struct vop_vector ffs_fifoops1 = {
152 	.vop_default =		&ufs_fifoops,
153 	.vop_fsync =		ffs_fsync,
154 	.vop_fdatasync =	ffs_fdatasync,
155 	.vop_lock1 =		ffs_lock,
156 #ifdef INVARIANTS
157 	.vop_unlock =		ffs_unlock_debug,
158 #endif
159 	.vop_vptofh =		ffs_vptofh,
160 };
161 VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
162 
163 /* Global vfs data structures for ufs (UFS2 vectors, with extattr ops). */
164 struct vop_vector ffs_vnodeops2 = {
165 	.vop_default =		&ufs_vnodeops,
166 	.vop_fsync =		ffs_fsync,
167 	.vop_fdatasync =	ffs_fdatasync,
168 	.vop_getpages =		ffs_getpages,
169 	.vop_getpages_async =	ffs_getpages_async,
170 	.vop_lock1 =		ffs_lock,
171 #ifdef INVARIANTS
172 	.vop_unlock =		ffs_unlock_debug,
173 #endif
174 	.vop_read =		ffs_read,
175 	.vop_reallocblks =	ffs_reallocblks,
176 	.vop_write =		ffs_write,
177 	.vop_closeextattr =	ffs_closeextattr,
178 	.vop_deleteextattr =	ffs_deleteextattr,
179 	.vop_getextattr =	ffs_getextattr,
180 	.vop_listextattr =	ffs_listextattr,
181 	.vop_openextattr =	ffs_openextattr,
182 	.vop_setextattr =	ffs_setextattr,
183 	.vop_vptofh =		ffs_vptofh,
184 };
185 VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
186 
187 struct vop_vector ffs_fifoops2 = {
188 	.vop_default =		&ufs_fifoops,
189 	.vop_fsync =		ffs_fsync,
190 	.vop_fdatasync =	ffs_fdatasync,
191 	.vop_lock1 =		ffs_lock,
192 #ifdef INVARIANTS
193 	.vop_unlock =		ffs_unlock_debug,
194 #endif
195 	.vop_reallocblks =	ffs_reallocblks,
196 	.vop_strategy =		ffsext_strategy,
197 	.vop_closeextattr =	ffs_closeextattr,
198 	.vop_deleteextattr =	ffs_deleteextattr,
199 	.vop_getextattr =	ffs_getextattr,
200 	.vop_listextattr =	ffs_listextattr,
201 	.vop_openextattr =	ffs_openextattr,
202 	.vop_setextattr =	ffs_setextattr,
203 	.vop_vptofh =		ffs_vptofh,
204 };
205 VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
206 
207 /*
208  * Synch an open file.
209  */
210 /* ARGSUSED */
211 static int
212 ffs_fsync(struct vop_fsync_args *ap)
213 {
214 	struct vnode *vp;
215 	struct bufobj *bo;
216 	int error;
217 
218 	vp = ap->a_vp;
219 	bo = &vp->v_bufobj;
220 retry:
221 	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
222 	if (error)
223 		return (error);
224 	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
225 		error = softdep_fsync(vp);
226 		if (error)
227 			return (error);
228 
229 		/*
230 		 * The softdep_fsync() function may drop vp lock,
231 		 * allowing for dirty buffers to reappear on the
232 		 * bo_dirty list. Recheck and resync as needed.
233 		 */
234 		BO_LOCK(bo);
235 		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
236 		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
237 			BO_UNLOCK(bo);
238 			goto retry;
239 		}
240 		BO_UNLOCK(bo);
241 	}
242 	return (0);
243 }
244 
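/*
 * Write out all of a vnode's dirty buffers and, unless the NO_INO_UPDT or
 * DATA_ONLY flags restrict it, the inode itself.  Both ffs_fsync() and
 * ffs_fdatasync() funnel through this routine.
 */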
245 int
246 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
247 {
248 	struct inode *ip;
249 	struct bufobj *bo;
250 	struct buf *bp, *nbp;
251 	ufs_lbn_t lbn;
252 	int error, passes;
253 	bool still_dirty, wait;
254 
255 	ip = VTOI(vp);
256 	ip->i_flag &= ~IN_NEEDSYNC;
257 	bo = &vp->v_bufobj;
258 
259 	/*
260 	 * When doing MNT_WAIT we must first flush all dependencies
261 	 * on the inode.
262 	 */
263 	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
264 	    (error = softdep_sync_metadata(vp)) != 0)
265 		return (error);
266 
267 	/*
268 	 * Flush all dirty buffers associated with a vnode.
269 	 */
270 	error = 0;
271 	passes = 0;
272 	wait = false;	/* Always do an async pass first. */
273 	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
274 	BO_LOCK(bo);
275 loop:
276 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
277 		bp->b_vflags &= ~BV_SCANNED;
278 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
279 		/*
280 		 * Reasons to skip this buffer: it has already been considered
281 		 * on this pass, the buffer has dependencies that will cause
282 		 * it to be redirtied and it has not already been deferred,
283 		 * or it is already being written.
284 		 */
285 		if ((bp->b_vflags & BV_SCANNED) != 0)
286 			continue;
287 		bp->b_vflags |= BV_SCANNED;
288 		/*
289 		 * Flush indirects in order, if requested.
290 		 *
291 		 * Note that if only datasync is requested, we can
292 		 * skip indirect blocks when softupdates are not
293 		 * active.  Otherwise we must flush them with data,
294 		 * since dependencies prevent data block writes.
295 		 */
296 		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
297 		    (lbn_level(bp->b_lblkno) >= passes ||
298 		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
299 			continue;
300 		if (bp->b_lblkno > lbn)
301 			panic("ffs_syncvnode: syncing truncated data.");
302 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
303 			BO_UNLOCK(bo);
304 		} else if (wait) {
305 			if (BUF_LOCK(bp,
306 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
307 			    BO_LOCKPTR(bo)) != 0) {
308 				bp->b_vflags &= ~BV_SCANNED;
309 				goto next;
310 			}
311 		} else
312 			continue;
313 		if ((bp->b_flags & B_DELWRI) == 0)
314 			panic("ffs_fsync: not dirty");
315 		/*
316 		 * Check for dependencies and potentially complete them.
317 		 */
318 		if (!LIST_EMPTY(&bp->b_dep) &&
319 		    (error = softdep_sync_buf(vp, bp,
320 		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
321 			/* I/O error. */
322 			if (error != EBUSY) {
323 				BUF_UNLOCK(bp);
324 				return (error);
325 			}
326 			/* If we deferred once, don't defer again. */
327 		    	if ((bp->b_flags & B_DEFERRED) == 0) {
328 				bp->b_flags |= B_DEFERRED;
329 				BUF_UNLOCK(bp);
330 				goto next;
331 			}
332 		}
333 		if (wait) {
334 			bremfree(bp);
335 			if ((error = bwrite(bp)) != 0)
336 				return (error);
337 		} else if ((bp->b_flags & B_CLUSTEROK)) {
338 			(void) vfs_bio_awrite(bp);
339 		} else {
340 			bremfree(bp);
341 			(void) bawrite(bp);
342 		}
343 next:
344 		/*
345 		 * Since we may have slept during the I/O, we need
346 		 * to start from a known point.
347 		 */
348 		BO_LOCK(bo);
349 		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
350 	}
351 	if (waitfor != MNT_WAIT) {
352 		BO_UNLOCK(bo);
353 		if ((flags & NO_INO_UPDT) != 0)
354 			return (0);
355 		else
356 			return (ffs_update(vp, 0));
357 	}
358 	/* Drain IO to see if we're done. */
359 	bufobj_wwait(bo, 0, 0);
360 	/*
361 	 * Block devices associated with filesystems may have new I/O
362 	 * requests posted for them even if the vnode is locked, so no
363 	 * amount of trying will get them clean.  We make several passes
364 	 * as a best effort.
365 	 *
366 	 * Regular files may need multiple passes to flush all dependency
367 	 * work as it is possible that we must write once per indirect
368 	 * level, once for the leaf, and once for the inode and each of
369 	 * these will be done with one sync and one async pass.
370 	 */
371 	if (bo->bo_dirty.bv_cnt > 0) {
372 		if ((flags & DATA_ONLY) == 0) {
373 			still_dirty = true;
374 		} else {
375 			/*
376 			 * For data-only sync, dirty indirect buffers
377 			 * are ignored.
378 			 */
379 			still_dirty = false;
380 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
381 				if (bp->b_lblkno > -UFS_NDADDR) {
382 					still_dirty = true;
383 					break;
384 				}
385 			}
386 		}
387 
388 		if (still_dirty) {
389 			/* Write the inode after sync passes to flush deps. */
390 			if (wait && DOINGSOFTDEP(vp) &&
391 			    (flags & NO_INO_UPDT) == 0) {
392 				BO_UNLOCK(bo);
393 				ffs_update(vp, 1);
394 				BO_LOCK(bo);
395 			}
396 			/* switch between sync/async. */
397 			wait = !wait;
398 			if (wait || ++passes < UFS_NIADDR + 2)
399 				goto loop;
400 		}
401 	}
402 	BO_UNLOCK(bo);
403 	error = 0;
404 	if ((flags & DATA_ONLY) == 0) {
405 		if ((flags & NO_INO_UPDT) == 0)
406 			error = ffs_update(vp, 1);
407 		if (DOINGSUJ(vp))
408 			softdep_journal_fsync(VTOI(vp));
409 	}
410 	return (error);
411 }
412 
413 static int
414 ffs_fdatasync(struct vop_fdatasync_args *ap)
415 {
416 
417 	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
418 }
419 
420 static int
421 ffs_lock(ap)
422 	struct vop_lock1_args /* {
423 		struct vnode *a_vp;
424 		int a_flags;
425 		struct thread *a_td;
426 		char *file;
427 		int line;
428 	} */ *ap;
429 {
430 #ifndef NO_FFS_SNAPSHOT
431 	struct vnode *vp;
432 	int flags;
433 	struct lock *lkp;
434 	int result;
435 
436 	switch (ap->a_flags & LK_TYPE_MASK) {
437 	case LK_SHARED:
438 	case LK_UPGRADE:
439 	case LK_EXCLUSIVE:
440 		vp = ap->a_vp;
441 		flags = ap->a_flags;
442 		for (;;) {
443 #ifdef DEBUG_VFS_LOCKS
444 			KASSERT(vp->v_holdcnt != 0,
445 			    ("ffs_lock %p: zero hold count", vp));
446 #endif
447 			lkp = vp->v_vnlock;
448 			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
449 			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
450 			    ap->a_file, ap->a_line);
451 			if (lkp == vp->v_vnlock || result != 0)
452 				break;
453 			/*
454 			 * Apparent success, except that the vnode
455 			 * mutated between snapshot file vnode and
456 			 * regular file vnode while this process
457 			 * slept.  The lock currently held is not the
458 			 * right lock.  Release it, and try to get the
459 			 * new lock.
460 			 */
461 			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
462 			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
463 			    ap->a_file, ap->a_line);
464 			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
465 			    (LK_INTERLOCK | LK_NOWAIT))
466 				return (EBUSY);
467 			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
468 				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
469 			flags &= ~LK_INTERLOCK;
470 		}
471 		break;
472 	default:
473 		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
474 	}
475 	return (result);
476 #else
477 	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
478 #endif
479 }
480 
481 #ifdef INVARIANTS
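/*
 * INVARIANTS-only unlock wrapper: assert that a vnode whose inode carries
 * lazily-propagated flag bits (UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) is on
 * the lazy vnode list before its lock is released.
 */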
482 static int
483 ffs_unlock_debug(struct vop_unlock_args *ap)
484 {
485 	struct vnode *vp = ap->a_vp;
486 	struct inode *ip = VTOI(vp);
487 
488 	if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
489 		if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
490 			VI_LOCK(vp);
491 			VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
492 			    ("%s: modified vnode (%x) not on lazy list",
493 			    __func__, ip->i_flag));
494 			VI_UNLOCK(vp);
495 		}
496 	}
497 	return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
498 }
499 #endif
500 
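/*
 * Zero-fill the portion of a read that falls in a hole (an unallocated
 * block), copying from the kernel zero_region in ZERO_REGION_SIZE chunks
 * and updating the remaining transfer size through *size.
 */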
501 static int
502 ffs_read_hole(struct uio *uio, long xfersize, long *size)
503 {
504 	ssize_t saved_resid, tlen;
505 	int error;
506 
507 	while (xfersize > 0) {
508 		tlen = min(xfersize, ZERO_REGION_SIZE);
509 		saved_resid = uio->uio_resid;
510 		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
511 		    tlen, uio);
512 		if (error != 0)
513 			return (error);
514 		tlen = saved_resid - uio->uio_resid;
515 		xfersize -= tlen;
516 		*size -= tlen;
517 	}
518 	return (0);
519 }
520 
521 /*
522  * Vnode op for reading.
523  */
524 static int
525 ffs_read(ap)
526 	struct vop_read_args /* {
527 		struct vnode *a_vp;
528 		struct uio *a_uio;
529 		int a_ioflag;
530 		struct ucred *a_cred;
531 	} */ *ap;
532 {
533 	struct vnode *vp;
534 	struct inode *ip;
535 	struct uio *uio;
536 	struct fs *fs;
537 	struct buf *bp;
538 	ufs_lbn_t lbn, nextlbn;
539 	off_t bytesinfile;
540 	long size, xfersize, blkoffset;
541 	ssize_t orig_resid;
542 	int bflag, error, ioflag, seqcount;
543 
544 	vp = ap->a_vp;
545 	uio = ap->a_uio;
546 	ioflag = ap->a_ioflag;
547 	if (ap->a_ioflag & IO_EXT)
548 #ifdef notyet
549 		return (ffs_extread(vp, uio, ioflag));
550 #else
551 		panic("ffs_read+IO_EXT");
552 #endif
553 #ifdef DIRECTIO
554 	if ((ioflag & IO_DIRECT) != 0) {
555 		int workdone;
556 
557 		error = ffs_rawread(vp, uio, &workdone);
558 		if (error != 0 || workdone != 0)
559 			return error;
560 	}
561 #endif
562 
563 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
564 	ip = VTOI(vp);
565 
566 #ifdef INVARIANTS
567 	if (uio->uio_rw != UIO_READ)
568 		panic("ffs_read: mode");
569 
570 	if (vp->v_type == VLNK) {
571 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
572 			panic("ffs_read: short symlink");
573 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
574 		panic("ffs_read: type %d",  vp->v_type);
575 #endif
576 	orig_resid = uio->uio_resid;
577 	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
578 	if (orig_resid == 0)
579 		return (0);
580 	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
581 	fs = ITOFS(ip);
582 	if (uio->uio_offset < ip->i_size &&
583 	    uio->uio_offset >= fs->fs_maxfilesize)
584 		return (EOVERFLOW);
585 
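	/*
	 * GB_UNMAPPED permits unmapped buffers.  Except for UIO_NOCOPY
	 * requests, GB_NOSPARSE makes a read of an unallocated block come
	 * back as EJUSTRETURN, which ffs_read_hole() turns into zeroes
	 * below.
	 */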
586 	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
587 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
588 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
589 			break;
590 		lbn = lblkno(fs, uio->uio_offset);
591 		nextlbn = lbn + 1;
592 
593 		/*
594 		 * size of buffer.  The buffer representing the
595 		 * end of the file is rounded up to the size of
596 		 * the block type ( fragment or full block,
597 		 * depending ).
598 		 */
599 		size = blksize(fs, ip, lbn);
600 		blkoffset = blkoff(fs, uio->uio_offset);
601 
602 		/*
603 		 * The amount we want to transfer in this iteration is
604 		 * one FS block less the amount of the data before
605 		 * our startpoint (duh!)
606 		 */
607 		xfersize = fs->fs_bsize - blkoffset;
608 
609 		/*
610 		 * But if we actually want less than the block,
611 		 * or the file doesn't have a whole block more of data,
612 		 * then use the lesser number.
613 		 */
614 		if (uio->uio_resid < xfersize)
615 			xfersize = uio->uio_resid;
616 		if (bytesinfile < xfersize)
617 			xfersize = bytesinfile;
618 
619 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
620 			/*
621 			 * Don't do readahead if this is the end of the file.
622 			 */
623 			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
624 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
625 			/*
626 			 * Otherwise if we are allowed to cluster,
627 			 * grab as much as we can.
628 			 *
629 			 * XXX  This may not be a win if we are not
630 			 * doing sequential access.
631 			 */
632 			error = cluster_read(vp, ip->i_size, lbn,
633 			    size, NOCRED, blkoffset + uio->uio_resid,
634 			    seqcount, bflag, &bp);
635 		} else if (seqcount > 1) {
636 			/*
637 			 * If we are NOT allowed to cluster, then
638 			 * if we appear to be acting sequentially,
639 			 * fire off a request for a readahead
640 			 * as well as a read. Note that the 4th and 5th
641 			 * arguments point to arrays of the size specified in
642 			 * the 6th argument.
643 			 */
644 			u_int nextsize = blksize(fs, ip, nextlbn);
645 			error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
646 			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
647 		} else {
648 			/*
649 			 * Failing all of the above, just read what the
650 			 * user asked for. Interestingly, the same as
651 			 * the first option above.
652 			 */
653 			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
654 		}
655 		if (error == EJUSTRETURN) {
656 			error = ffs_read_hole(uio, xfersize, &size);
657 			if (error == 0)
658 				continue;
659 		}
660 		if (error != 0) {
661 			brelse(bp);
662 			bp = NULL;
663 			break;
664 		}
665 
666 		/*
667 		 * We should only get non-zero b_resid when an I/O error
668 		 * has occurred, which should cause us to break above.
669 		 * However, if the short read did not cause an error,
670 		 * then we want to ensure that we do not uiomove bad
671 		 * or uninitialized data.
672 		 */
673 		size -= bp->b_resid;
674 		if (size < xfersize) {
675 			if (size == 0)
676 				break;
677 			xfersize = size;
678 		}
679 
680 		if (buf_mapped(bp)) {
681 			error = vn_io_fault_uiomove((char *)bp->b_data +
682 			    blkoffset, (int)xfersize, uio);
683 		} else {
684 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
685 			    (int)xfersize, uio);
686 		}
687 		if (error)
688 			break;
689 
690 		vfs_bio_brelse(bp, ioflag);
691 	}
692 
693 	/*
694 	 * This can only happen in the case of an error
695 	 * because the loop above resets bp to NULL on each iteration
696 	 * and on normal completion has not set a new value into it,
697 	 * so it must have come from a 'break' statement.
698 	 */
699 	if (bp != NULL)
700 		vfs_bio_brelse(bp, ioflag);
701 
702 	if ((error == 0 || uio->uio_resid != orig_resid) &&
703 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
704 		UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
705 	return (error);
706 }
707 
708 /*
709  * Vnode op for writing.
710  */
711 static int
712 ffs_write(ap)
713 	struct vop_write_args /* {
714 		struct vnode *a_vp;
715 		struct uio *a_uio;
716 		int a_ioflag;
717 		struct ucred *a_cred;
718 	} */ *ap;
719 {
720 	struct vnode *vp;
721 	struct uio *uio;
722 	struct inode *ip;
723 	struct fs *fs;
724 	struct buf *bp;
725 	ufs_lbn_t lbn;
726 	off_t osize;
727 	ssize_t resid;
728 	int seqcount;
729 	int blkoffset, error, flags, ioflag, size, xfersize;
730 
731 	vp = ap->a_vp;
732 	uio = ap->a_uio;
733 	ioflag = ap->a_ioflag;
734 	if (ap->a_ioflag & IO_EXT)
735 #ifdef notyet
736 		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
737 #else
738 		panic("ffs_write+IO_EXT");
739 #endif
740 
741 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
742 	ip = VTOI(vp);
743 
744 #ifdef INVARIANTS
745 	if (uio->uio_rw != UIO_WRITE)
746 		panic("ffs_write: mode");
747 #endif
748 
749 	switch (vp->v_type) {
750 	case VREG:
751 		if (ioflag & IO_APPEND)
752 			uio->uio_offset = ip->i_size;
753 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
754 			return (EPERM);
755 		/* FALLTHROUGH */
756 	case VLNK:
757 		break;
758 	case VDIR:
759 		panic("ffs_write: dir write");
760 		break;
761 	default:
762 		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
763 			(int)uio->uio_offset,
764 			(int)uio->uio_resid
765 		);
766 	}
767 
768 	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
769 	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
770 	fs = ITOFS(ip);
771 	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
772 		return (EFBIG);
773 	/*
774 	 * Maybe this should be above the vnode op call, but so long as
775 	 * file servers have no limits, I don't think it matters.
776 	 */
777 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
778 		return (EFBIG);
779 
780 	resid = uio->uio_resid;
781 	osize = ip->i_size;
782 	if (seqcount > BA_SEQMAX)
783 		flags = BA_SEQMAX << BA_SEQSHIFT;
784 	else
785 		flags = seqcount << BA_SEQSHIFT;
786 	if (ioflag & IO_SYNC)
787 		flags |= IO_SYNC;
788 	flags |= BA_UNMAPPED;
789 
790 	for (error = 0; uio->uio_resid > 0;) {
791 		lbn = lblkno(fs, uio->uio_offset);
792 		blkoffset = blkoff(fs, uio->uio_offset);
793 		xfersize = fs->fs_bsize - blkoffset;
794 		if (uio->uio_resid < xfersize)
795 			xfersize = uio->uio_resid;
796 		if (uio->uio_offset + xfersize > ip->i_size)
797 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
798 
799 		/*
800 		 * We must perform a read-before-write if the transfer size
801 		 * does not cover the entire buffer.
802 		 */
803 		if (fs->fs_bsize > xfersize)
804 			flags |= BA_CLRBUF;
805 		else
806 			flags &= ~BA_CLRBUF;
807 /* XXX is uio->uio_offset the right thing here? */
808 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
809 		    ap->a_cred, flags, &bp);
810 		if (error != 0) {
811 			vnode_pager_setsize(vp, ip->i_size);
812 			break;
813 		}
814 		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
815 			bp->b_flags |= B_NOCACHE;
816 
817 		if (uio->uio_offset + xfersize > ip->i_size) {
818 			ip->i_size = uio->uio_offset + xfersize;
819 			DIP_SET(ip, i_size, ip->i_size);
820 		}
821 
822 		size = blksize(fs, ip, lbn) - bp->b_resid;
823 		if (size < xfersize)
824 			xfersize = size;
825 
826 		if (buf_mapped(bp)) {
827 			error = vn_io_fault_uiomove((char *)bp->b_data +
828 			    blkoffset, (int)xfersize, uio);
829 		} else {
830 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
831 			    (int)xfersize, uio);
832 		}
833 		/*
834 		 * If the buffer is not already filled and we encounter an
835 		 * error while trying to fill it, we have to clear out any
836 		 * garbage data from the pages instantiated for the buffer.
837 		 * If we do not, a failed uiomove() during a write can leave
838 		 * the prior contents of the pages exposed to a userland mmap.
839 		 *
840 		 * Note that we need only clear buffers with a transfer size
841 		 * equal to the block size because buffers with a shorter
842 		 * transfer size were cleared above by the call to UFS_BALLOC()
843 		 * with the BA_CLRBUF flag set.
844 		 *
845 		 * If the source region for uiomove identically mmaps the
846 		 * buffer, uiomove() performed the NOP copy, and the buffer
847 		 * content remains valid because the page fault handler
848 		 * validated the pages.
849 		 */
850 		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
851 		    fs->fs_bsize == xfersize)
852 			vfs_bio_clrbuf(bp);
853 
854 		vfs_bio_set_flags(bp, ioflag);
855 
856 		/*
857 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
858 		 * if we have a severe page deficiency write the buffer
859 		 * asynchronously.  Otherwise try to cluster, and if that
860 		 * doesn't do it then either do an async write (if O_DIRECT),
861 		 * or a delayed write (if not).
862 		 */
863 		if (ioflag & IO_SYNC) {
864 			(void)bwrite(bp);
865 		} else if (vm_page_count_severe() ||
866 			    buf_dirty_count_severe() ||
867 			    (ioflag & IO_ASYNC)) {
868 			bp->b_flags |= B_CLUSTEROK;
869 			bawrite(bp);
870 		} else if (xfersize + blkoffset == fs->fs_bsize) {
871 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
872 				bp->b_flags |= B_CLUSTEROK;
873 				cluster_write(vp, bp, ip->i_size, seqcount,
874 				    GB_UNMAPPED);
875 			} else {
876 				bawrite(bp);
877 			}
878 		} else if (ioflag & IO_DIRECT) {
879 			bp->b_flags |= B_CLUSTEROK;
880 			bawrite(bp);
881 		} else {
882 			bp->b_flags |= B_CLUSTEROK;
883 			bdwrite(bp);
884 		}
885 		if (error || xfersize == 0)
886 			break;
887 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
888 	}
889 	/*
890 	 * If we successfully wrote any data, and we are not the superuser,
891 	 * we clear the setuid and setgid bits as a precaution against
892 	 * tampering.
893 	 */
894 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
895 	    ap->a_cred) {
896 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
897 			ip->i_mode &= ~(ISUID | ISGID);
898 			DIP_SET(ip, i_mode, ip->i_mode);
899 		}
900 	}
901 	if (error) {
902 		if (ioflag & IO_UNIT) {
903 			(void)ffs_truncate(vp, osize,
904 			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
905 			uio->uio_offset -= resid - uio->uio_resid;
906 			uio->uio_resid = resid;
907 		}
908 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
909 		error = ffs_update(vp, 1);
910 	return (error);
911 }
912 
913 /*
914  * Extended attribute area reading.
915  */
916 static int
917 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
918 {
919 	struct inode *ip;
920 	struct ufs2_dinode *dp;
921 	struct fs *fs;
922 	struct buf *bp;
923 	ufs_lbn_t lbn, nextlbn;
924 	off_t bytesinfile;
925 	long size, xfersize, blkoffset;
926 	ssize_t orig_resid;
927 	int error;
928 
929 	ip = VTOI(vp);
930 	fs = ITOFS(ip);
931 	dp = ip->i_din2;
932 
933 #ifdef INVARIANTS
934 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
935 		panic("ffs_extread: mode");
936 
937 #endif
938 	orig_resid = uio->uio_resid;
939 	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
940 	if (orig_resid == 0)
941 		return (0);
942 	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
943 
944 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
945 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
946 			break;
947 		lbn = lblkno(fs, uio->uio_offset);
948 		nextlbn = lbn + 1;
949 
950 		/*
951 		 * size of buffer.  The buffer representing the
952 		 * end of the file is rounded up to the size of
953 		 * the block type ( fragment or full block,
954 		 * depending ).
955 		 */
956 		size = sblksize(fs, dp->di_extsize, lbn);
957 		blkoffset = blkoff(fs, uio->uio_offset);
958 
959 		/*
960 		 * The amount we want to transfer in this iteration is
961 		 * one FS block less the amount of the data before
962 		 * our startpoint (duh!)
963 		 */
964 		xfersize = fs->fs_bsize - blkoffset;
965 
966 		/*
967 		 * But if we actually want less than the block,
968 		 * or the file doesn't have a whole block more of data,
969 		 * then use the lesser number.
970 		 */
971 		if (uio->uio_resid < xfersize)
972 			xfersize = uio->uio_resid;
973 		if (bytesinfile < xfersize)
974 			xfersize = bytesinfile;
975 
976 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
977 			/*
978 			 * Don't do readahead if this is the end of the info.
979 			 */
980 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
981 		} else {
982 			/*
983 			 * If we have a second block, then
984 			 * fire off a request for a readahead
985 			 * as well as a read. Note that the 4th and 5th
986 			 * arguments point to arrays of the size specified in
987 			 * the 6th argument.
988 			 */
989 			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
990 
991 			nextlbn = -1 - nextlbn;
992 			error = breadn(vp, -1 - lbn,
993 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
994 		}
995 		if (error) {
996 			brelse(bp);
997 			bp = NULL;
998 			break;
999 		}
1000 
1001 		/*
1002 		 * We should only get non-zero b_resid when an I/O error
1003 		 * has occurred, which should cause us to break above.
1004 		 * However, if the short read did not cause an error,
1005 		 * then we want to ensure that we do not uiomove bad
1006 		 * or uninitialized data.
1007 		 */
1008 		size -= bp->b_resid;
1009 		if (size < xfersize) {
1010 			if (size == 0)
1011 				break;
1012 			xfersize = size;
1013 		}
1014 
1015 		error = uiomove((char *)bp->b_data + blkoffset,
1016 					(int)xfersize, uio);
1017 		if (error)
1018 			break;
1019 		vfs_bio_brelse(bp, ioflag);
1020 	}
1021 
1022 	/*
1023 	 * This can only happen in the case of an error
1024 	 * because the loop above resets bp to NULL on each iteration
1025 	 * and on normal completion has not set a new value into it,
1026 	 * so it must have come from a 'break' statement.
1027 	 */
1028 	if (bp != NULL)
1029 		vfs_bio_brelse(bp, ioflag);
1030 	return (error);
1031 }
1032 
1033 /*
1034  * Extended attribute area writing.
1035  */
1036 static int
1037 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1038 {
1039 	struct inode *ip;
1040 	struct ufs2_dinode *dp;
1041 	struct fs *fs;
1042 	struct buf *bp;
1043 	ufs_lbn_t lbn;
1044 	off_t osize;
1045 	ssize_t resid;
1046 	int blkoffset, error, flags, size, xfersize;
1047 
1048 	ip = VTOI(vp);
1049 	fs = ITOFS(ip);
1050 	dp = ip->i_din2;
1051 
1052 #ifdef INVARIANTS
1053 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1054 		panic("ffs_extwrite: mode");
1055 #endif
1056 
1057 	if (ioflag & IO_APPEND)
1058 		uio->uio_offset = dp->di_extsize;
1059 	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1060 	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1061 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
1062 	    UFS_NXADDR * fs->fs_bsize)
1063 		return (EFBIG);
1064 
1065 	resid = uio->uio_resid;
1066 	osize = dp->di_extsize;
1067 	flags = IO_EXT;
1068 	if (ioflag & IO_SYNC)
1069 		flags |= IO_SYNC;
1070 
1071 	for (error = 0; uio->uio_resid > 0;) {
1072 		lbn = lblkno(fs, uio->uio_offset);
1073 		blkoffset = blkoff(fs, uio->uio_offset);
1074 		xfersize = fs->fs_bsize - blkoffset;
1075 		if (uio->uio_resid < xfersize)
1076 			xfersize = uio->uio_resid;
1077 
1078 		/*
1079 		 * We must perform a read-before-write if the transfer size
1080 		 * does not cover the entire buffer.
1081 		 */
1082 		if (fs->fs_bsize > xfersize)
1083 			flags |= BA_CLRBUF;
1084 		else
1085 			flags &= ~BA_CLRBUF;
1086 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1087 		    ucred, flags, &bp);
1088 		if (error != 0)
1089 			break;
1090 		/*
1091 		 * If the buffer is not valid we have to clear out any
1092 		 * garbage data from the pages instantiated for the buffer.
1093 		 * If we do not, a failed uiomove() during a write can leave
1094 		 * the prior contents of the pages exposed to a userland
1095 		 * mmap().  XXX deal with uiomove() errors a better way.
1096 		 */
1097 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1098 			vfs_bio_clrbuf(bp);
1099 
1100 		if (uio->uio_offset + xfersize > dp->di_extsize)
1101 			dp->di_extsize = uio->uio_offset + xfersize;
1102 
1103 		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1104 		if (size < xfersize)
1105 			xfersize = size;
1106 
1107 		error =
1108 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1109 
1110 		vfs_bio_set_flags(bp, ioflag);
1111 
1112 		/*
1113 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
1114 		 * if we have a severe page deficiency write the buffer
1115 		 * asynchronously.  Otherwise try to cluster, and if that
1116 		 * doesn't do it then either do an async write (if O_DIRECT),
1117 		 * or a delayed write (if not).
1118 		 */
1119 		if (ioflag & IO_SYNC) {
1120 			(void)bwrite(bp);
1121 		} else if (vm_page_count_severe() ||
1122 			    buf_dirty_count_severe() ||
1123 			    xfersize + blkoffset == fs->fs_bsize ||
1124 			    (ioflag & (IO_ASYNC | IO_DIRECT)))
1125 			bawrite(bp);
1126 		else
1127 			bdwrite(bp);
1128 		if (error || xfersize == 0)
1129 			break;
1130 		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1131 	}
1132 	/*
1133 	 * If we successfully wrote any data, and we are not the superuser,
1134 	 * we clear the setuid and setgid bits as a precaution against
1135 	 * tampering.
1136 	 */
1137 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1138 		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1139 			ip->i_mode &= ~(ISUID | ISGID);
1140 			dp->di_mode = ip->i_mode;
1141 		}
1142 	}
1143 	if (error) {
1144 		if (ioflag & IO_UNIT) {
1145 			(void)ffs_truncate(vp, osize,
1146 			    IO_EXT | (ioflag&IO_SYNC), ucred);
1147 			uio->uio_offset -= resid - uio->uio_resid;
1148 			uio->uio_resid = resid;
1149 		}
1150 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1151 		error = ffs_update(vp, 1);
1152 	return (error);
1153 }
1154 
1155 
1156 /*
1157  * Helper used by the extended attribute vnode operations below.
1158  *
1159  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1160  * the length of the EA, and possibly the pointer to the entry and to the data.
1161  */
1162 static int
1163 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
1164     struct extattr **eapp, u_char **eac)
1165 {
1166 	struct extattr *eap, *eaend;
1167 	size_t nlen;
1168 
1169 	nlen = strlen(name);
1170 	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1171 	eap = (struct extattr *)ptr;
1172 	eaend = (struct extattr *)(ptr + length);
1173 	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1174 		/* make sure this entry is complete */
1175 		if (EXTATTR_NEXT(eap) > eaend)
1176 			break;
1177 		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1178 		    || memcmp(eap->ea_name, name, nlen) != 0)
1179 			continue;
1180 		if (eapp != NULL)
1181 			*eapp = eap;
1182 		if (eac != NULL)
1183 			*eac = EXTATTR_CONTENT(eap);
1184 		return (EXTATTR_CONTENT_SIZE(eap));
1185 	}
1186 	return (-1);
1187 }
1188 
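/*
 * Read the inode's extended attribute area into a freshly allocated
 * buffer of di_extsize + extra bytes and return it through *p.  The
 * caller is responsible for freeing the M_TEMP allocation.
 */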
1189 static int
1190 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1191 {
1192 	struct inode *ip;
1193 	struct ufs2_dinode *dp;
1194 	struct fs *fs;
1195 	struct uio luio;
1196 	struct iovec liovec;
1197 	u_int easize;
1198 	int error;
1199 	u_char *eae;
1200 
1201 	ip = VTOI(vp);
1202 	fs = ITOFS(ip);
1203 	dp = ip->i_din2;
1204 	easize = dp->di_extsize;
1205 	if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
1206 		return (EFBIG);
1207 
1208 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1209 
1210 	liovec.iov_base = eae;
1211 	liovec.iov_len = easize;
1212 	luio.uio_iov = &liovec;
1213 	luio.uio_iovcnt = 1;
1214 	luio.uio_offset = 0;
1215 	luio.uio_resid = easize;
1216 	luio.uio_segflg = UIO_SYSSPACE;
1217 	luio.uio_rw = UIO_READ;
1218 	luio.uio_td = td;
1219 
1220 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1221 	if (error) {
1222 		free(eae, M_TEMP);
1223 		return(error);
1224 	}
1225 	*p = eae;
1226 	return (0);
1227 }
1228 
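/*
 * Serialize access to the in-memory extended attribute area.  The lock is
 * the IN_EA_LOCKED flag protected by the vnode interlock; waiters set
 * IN_EA_LOCKWAIT and sleep on i_ea_refs until the lock is released.
 */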
1229 static void
1230 ffs_lock_ea(struct vnode *vp)
1231 {
1232 	struct inode *ip;
1233 
1234 	ip = VTOI(vp);
1235 	VI_LOCK(vp);
1236 	while (ip->i_flag & IN_EA_LOCKED) {
1237 		UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1238 		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1239 		    0);
1240 	}
1241 	UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
1242 	VI_UNLOCK(vp);
1243 }
1244 
1245 static void
1246 ffs_unlock_ea(struct vnode *vp)
1247 {
1248 	struct inode *ip;
1249 
1250 	ip = VTOI(vp);
1251 	VI_LOCK(vp);
1252 	if (ip->i_flag & IN_EA_LOCKWAIT)
1253 		wakeup(&ip->i_ea_refs);
1254 	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1255 	VI_UNLOCK(vp);
1256 }
1257 
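/*
 * Vnode extattr transaction start: read and cache the extended attribute
 * area in ip->i_ea_area, or just bump i_ea_refs when another transaction
 * already has it cached.
 */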
1258 static int
1259 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1260 {
1261 	struct inode *ip;
1262 	struct ufs2_dinode *dp;
1263 	int error;
1264 
1265 	ip = VTOI(vp);
1266 
1267 	ffs_lock_ea(vp);
1268 	if (ip->i_ea_area != NULL) {
1269 		ip->i_ea_refs++;
1270 		ffs_unlock_ea(vp);
1271 		return (0);
1272 	}
1273 	dp = ip->i_din2;
1274 	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1275 	if (error) {
1276 		ffs_unlock_ea(vp);
1277 		return (error);
1278 	}
1279 	ip->i_ea_len = dp->di_extsize;
1280 	ip->i_ea_error = 0;
1281 	ip->i_ea_refs++;
1282 	ffs_unlock_ea(vp);
1283 	return (0);
1284 }
1285 
1286 /*
1287  * Vnode extattr transaction commit/abort
1288  */
1289 static int
1290 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1291 {
1292 	struct inode *ip;
1293 	struct uio luio;
1294 	struct iovec liovec;
1295 	int error;
1296 	struct ufs2_dinode *dp;
1297 
1298 	ip = VTOI(vp);
1299 
1300 	ffs_lock_ea(vp);
1301 	if (ip->i_ea_area == NULL) {
1302 		ffs_unlock_ea(vp);
1303 		return (EINVAL);
1304 	}
1305 	dp = ip->i_din2;
1306 	error = ip->i_ea_error;
1307 	if (commit && error == 0) {
1308 		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1309 		if (cred == NOCRED)
1310 			cred =  vp->v_mount->mnt_cred;
1311 		liovec.iov_base = ip->i_ea_area;
1312 		liovec.iov_len = ip->i_ea_len;
1313 		luio.uio_iov = &liovec;
1314 		luio.uio_iovcnt = 1;
1315 		luio.uio_offset = 0;
1316 		luio.uio_resid = ip->i_ea_len;
1317 		luio.uio_segflg = UIO_SYSSPACE;
1318 		luio.uio_rw = UIO_WRITE;
1319 		luio.uio_td = td;
1320 		/* XXX: I'm not happy about truncating to zero size */
1321 		if (ip->i_ea_len < dp->di_extsize)
1322 			error = ffs_truncate(vp, 0, IO_EXT, cred);
1323 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1324 	}
1325 	if (--ip->i_ea_refs == 0) {
1326 		free(ip->i_ea_area, M_TEMP);
1327 		ip->i_ea_area = NULL;
1328 		ip->i_ea_len = 0;
1329 		ip->i_ea_error = 0;
1330 	}
1331 	ffs_unlock_ea(vp);
1332 	return (error);
1333 }
1334 
1335 /*
1336  * Vnode extattr strategy routine for fifos.
1337  *
1338  * We need to check for a read or write of the external attributes.
1339  * Otherwise we just fall through and do the usual thing.
1340  */
1341 static int
1342 ffsext_strategy(struct vop_strategy_args *ap)
1343 /*
1344 struct vop_strategy_args {
1345 	struct vnodeop_desc *a_desc;
1346 	struct vnode *a_vp;
1347 	struct buf *a_bp;
1348 };
1349 */
1350 {
1351 	struct vnode *vp;
1352 	daddr_t lbn;
1353 
1354 	vp = ap->a_vp;
1355 	lbn = ap->a_bp->b_lblkno;
1356 	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1357 		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1358 	if (vp->v_type == VFIFO)
1359 		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1360 	panic("spec nodes went here");
1361 }
1362 
1363 /*
1364  * Vnode extattr transaction start.
1365  */
1366 static int
1367 ffs_openextattr(struct vop_openextattr_args *ap)
1368 /*
1369 struct vop_openextattr_args {
1370 	struct vnodeop_desc *a_desc;
1371 	struct vnode *a_vp;
1372 	IN struct ucred *a_cred;
1373 	IN struct thread *a_td;
1374 };
1375 */
1376 {
1377 
1378 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1379 		return (EOPNOTSUPP);
1380 
1381 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1382 }
1383 
1384 
1385 /*
1386  * Vnode extattr transaction commit/abort
1387  */
1388 static int
1389 ffs_closeextattr(struct vop_closeextattr_args *ap)
1390 /*
1391 struct vop_closeextattr_args {
1392 	struct vnodeop_desc *a_desc;
1393 	struct vnode *a_vp;
1394 	int a_commit;
1395 	IN struct ucred *a_cred;
1396 	IN struct thread *a_td;
1397 };
1398 */
1399 {
1400 
1401 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1402 		return (EOPNOTSUPP);
1403 
1404 	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
1405 		return (EROFS);
1406 
1407 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1408 }
1409 
1410 /*
1411  * Vnode operation to remove a named attribute.
1412  */
1413 static int
1414 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1415 /*
1416 vop_deleteextattr {
1417 	IN struct vnode *a_vp;
1418 	IN int a_attrnamespace;
1419 	IN const char *a_name;
1420 	IN struct ucred *a_cred;
1421 	IN struct thread *a_td;
1422 };
1423 */
1424 {
1425 	struct inode *ip;
1426 	struct extattr *eap;
1427 	uint32_t ul;
1428 	int olen, error, i, easize;
1429 	u_char *eae;
1430 	void *tmp;
1431 
1432 	ip = VTOI(ap->a_vp);
1433 
1434 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1435 		return (EOPNOTSUPP);
1436 
1437 	if (strlen(ap->a_name) == 0)
1438 		return (EINVAL);
1439 
1440 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1441 		return (EROFS);
1442 
1443 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1444 	    ap->a_cred, ap->a_td, VWRITE);
1445 	if (error) {
1446 
1447 		/*
1448 		 * ffs_lock_ea is not needed there, because the vnode
1449 		 * must be exclusively locked.
1450 		 */
1451 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1452 			ip->i_ea_error = error;
1453 		return (error);
1454 	}
1455 
1456 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1457 	if (error)
1458 		return (error);
1459 
1460 	/* CEM: delete could be done in-place instead */
1461 	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1462 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1463 	easize = ip->i_ea_len;
1464 
1465 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1466 	    &eap, NULL);
1467 	if (olen == -1) {
1468 		/* delete but nonexistent */
1469 		free(eae, M_TEMP);
1470 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1471 		return (ENOATTR);
1472 	}
1473 	ul = eap->ea_length;
1474 	i = (u_char *)EXTATTR_NEXT(eap) - eae;
1475 	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1476 	easize -= ul;
1477 
1478 	tmp = ip->i_ea_area;
1479 	ip->i_ea_area = eae;
1480 	ip->i_ea_len = easize;
1481 	free(tmp, M_TEMP);
1482 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1483 	return (error);
1484 }
1485 
1486 /*
1487  * Vnode operation to retrieve a named extended attribute.
1488  */
1489 static int
1490 ffs_getextattr(struct vop_getextattr_args *ap)
1491 /*
1492 vop_getextattr {
1493 	IN struct vnode *a_vp;
1494 	IN int a_attrnamespace;
1495 	IN const char *a_name;
1496 	INOUT struct uio *a_uio;
1497 	OUT size_t *a_size;
1498 	IN struct ucred *a_cred;
1499 	IN struct thread *a_td;
1500 };
1501 */
1502 {
1503 	struct inode *ip;
1504 	u_char *eae, *p;
1505 	unsigned easize;
1506 	int error, ealen;
1507 
1508 	ip = VTOI(ap->a_vp);
1509 
1510 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1511 		return (EOPNOTSUPP);
1512 
1513 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1514 	    ap->a_cred, ap->a_td, VREAD);
1515 	if (error)
1516 		return (error);
1517 
1518 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1519 	if (error)
1520 		return (error);
1521 
1522 	eae = ip->i_ea_area;
1523 	easize = ip->i_ea_len;
1524 
1525 	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1526 	    NULL, &p);
1527 	if (ealen >= 0) {
1528 		error = 0;
1529 		if (ap->a_size != NULL)
1530 			*ap->a_size = ealen;
1531 		else if (ap->a_uio != NULL)
1532 			error = uiomove(p, ealen, ap->a_uio);
1533 	} else
1534 		error = ENOATTR;
1535 
1536 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1537 	return (error);
1538 }
1539 
1540 /*
1541  * Vnode operation to list extended attribute names on a vnode.
1542  */
1543 static int
1544 ffs_listextattr(struct vop_listextattr_args *ap)
1545 /*
1546 vop_listextattr {
1547 	IN struct vnode *a_vp;
1548 	IN int a_attrnamespace;
1549 	INOUT struct uio *a_uio;
1550 	OUT size_t *a_size;
1551 	IN struct ucred *a_cred;
1552 	IN struct thread *a_td;
1553 };
1554 */
1555 {
1556 	struct inode *ip;
1557 	struct extattr *eap, *eaend;
1558 	int error, ealen;
1559 
1560 	ip = VTOI(ap->a_vp);
1561 
1562 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1563 		return (EOPNOTSUPP);
1564 
1565 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1566 	    ap->a_cred, ap->a_td, VREAD);
1567 	if (error)
1568 		return (error);
1569 
1570 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1571 	if (error)
1572 		return (error);
1573 
1574 	error = 0;
1575 	if (ap->a_size != NULL)
1576 		*ap->a_size = 0;
1577 
1578 	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1579 	eap = (struct extattr *)ip->i_ea_area;
1580 	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1581 	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1582 		/* make sure this entry is complete */
1583 		if (EXTATTR_NEXT(eap) > eaend)
1584 			break;
1585 		if (eap->ea_namespace != ap->a_attrnamespace)
1586 			continue;
1587 
1588 		ealen = eap->ea_namelength;
1589 		if (ap->a_size != NULL)
1590 			*ap->a_size += ealen + 1;
1591 		else if (ap->a_uio != NULL)
1592 			error = uiomove(&eap->ea_namelength, ealen + 1,
1593 			    ap->a_uio);
1594 	}
1595 
1596 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1597 	return (error);
1598 }
1599 
1600 /*
1601  * Vnode operation to set a named attribute.
1602  */
1603 static int
1604 ffs_setextattr(struct vop_setextattr_args *ap)
1605 /*
1606 vop_setextattr {
1607 	IN struct vnode *a_vp;
1608 	IN int a_attrnamespace;
1609 	IN const char *a_name;
1610 	INOUT struct uio *a_uio;
1611 	IN struct ucred *a_cred;
1612 	IN struct thread *a_td;
1613 };
1614 */
1615 {
1616 	struct inode *ip;
1617 	struct fs *fs;
1618 	struct extattr *eap;
1619 	uint32_t ealength, ul;
1620 	ssize_t ealen;
1621 	int olen, eapad1, eapad2, error, i, easize;
1622 	u_char *eae;
1623 	void *tmp;
1624 
1625 	ip = VTOI(ap->a_vp);
1626 	fs = ITOFS(ip);
1627 
1628 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1629 		return (EOPNOTSUPP);
1630 
1631 	if (strlen(ap->a_name) == 0)
1632 		return (EINVAL);
1633 
1634 	/* XXX Now unsupported API to delete EAs using NULL uio. */
1635 	if (ap->a_uio == NULL)
1636 		return (EOPNOTSUPP);
1637 
1638 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1639 		return (EROFS);
1640 
1641 	ealen = ap->a_uio->uio_resid;
1642 	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1643 		return (EINVAL);
1644 
1645 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1646 	    ap->a_cred, ap->a_td, VWRITE);
1647 	if (error) {
1648 
1649 		/*
1650 		 * ffs_lock_ea is not needed there, because the vnode
1651 		 * must be exclusively locked.
1652 		 */
1653 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1654 			ip->i_ea_error = error;
1655 		return (error);
1656 	}
1657 
1658 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1659 	if (error)
1660 		return (error);
1661 
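	/*
	 * Compute the size of the record laid out below: a 32-bit total
	 * length, three one-byte fields (namespace, content pad length,
	 * name length), the name padded out to an 8-byte boundary, then
	 * the content padded to an 8-byte boundary.
	 */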
1662 	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1663 	eapad1 = roundup2(ealength, 8) - ealength;
1664 	eapad2 = roundup2(ealen, 8) - ealen;
1665 	ealength += eapad1 + ealen + eapad2;
1666 
1667 	/*
1668 	 * CEM: rewrites of the same size or smaller could be done in-place
1669 	 * instead.  (We don't acquire any fine-grained locks in here either,
1670 	 * so we could also do bigger writes in-place.)
1671 	 */
1672 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1673 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1674 	easize = ip->i_ea_len;
1675 
1676 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1677 	    &eap, NULL);
1678 	if (olen == -1) {
1679 		/* new, append at end */
1680 		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1681 		    ("unaligned"));
1682 		eap = (struct extattr *)(eae + easize);
1683 		easize += ealength;
1684 	} else {
1685 		ul = eap->ea_length;
1686 		i = (u_char *)EXTATTR_NEXT(eap) - eae;
1687 		if (ul != ealength) {
1688 			bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
1689 			    easize - i);
1690 			easize += (ealength - ul);
1691 		}
1692 	}
1693 	if (easize > lblktosize(fs, UFS_NXADDR)) {
1694 		free(eae, M_TEMP);
1695 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1696 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1697 			ip->i_ea_error = ENOSPC;
1698 		return (ENOSPC);
1699 	}
1700 	eap->ea_length = ealength;
1701 	eap->ea_namespace = ap->a_attrnamespace;
1702 	eap->ea_contentpadlen = eapad2;
1703 	eap->ea_namelength = strlen(ap->a_name);
1704 	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1705 	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1706 	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1707 	if (error) {
1708 		free(eae, M_TEMP);
1709 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1710 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1711 			ip->i_ea_error = error;
1712 		return (error);
1713 	}
1714 	bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1715 
1716 	tmp = ip->i_ea_area;
1717 	ip->i_ea_area = eae;
1718 	ip->i_ea_len = easize;
1719 	free(tmp, M_TEMP);
1720 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1721 	return (error);
1722 }
1723 
1724 /*
1725  * Vnode pointer to File handle
1726  */
1727 static int
1728 ffs_vptofh(struct vop_vptofh_args *ap)
1729 /*
1730 vop_vptofh {
1731 	IN struct vnode *a_vp;
1732 	IN struct fid *a_fhp;
1733 };
1734 */
1735 {
1736 	struct inode *ip;
1737 	struct ufid *ufhp;
1738 
1739 	ip = VTOI(ap->a_vp);
1740 	ufhp = (struct ufid *)ap->a_fhp;
1741 	ufhp->ufid_len = sizeof(struct ufid);
1742 	ufhp->ufid_ino = ip->i_number;
1743 	ufhp->ufid_gen = ip->i_gen;
1744 	return (0);
1745 }
1746 
1747 SYSCTL_DECL(_vfs_ffs);
1748 static int use_buf_pager = 1;
1749 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1750     "Always use buffer pager instead of bmap");
1751 
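/*
 * Buffer pager callbacks used by vfs_bio_getpages() below: translate a
 * byte offset into a logical block number and report that block's size.
 */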
1752 static daddr_t
1753 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1754 {
1755 
1756 	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1757 }
1758 
1759 static int
1760 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
1761 {
1762 
1763 	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
1764 }
1765 
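/*
 * Hand getpages requests to the buffer cache pager by default; fall back
 * to the bmap-based generic pager only when vfs.ffs.use_buf_pager is
 * turned off and the underlying device block size fits in a page.
 */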
1766 static int
1767 ffs_getpages(struct vop_getpages_args *ap)
1768 {
1769 	struct vnode *vp;
1770 	struct ufsmount *um;
1771 
1772 	vp = ap->a_vp;
1773 	um = VFSTOUFS(vp->v_mount);
1774 
1775 	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1776 		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1777 		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
1778 	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1779 	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1780 }
1781 
1782 static int
1783 ffs_getpages_async(struct vop_getpages_async_args *ap)
1784 {
1785 	struct vnode *vp;
1786 	struct ufsmount *um;
1787 	int error;
1788 
1789 	vp = ap->a_vp;
1790 	um = VFSTOUFS(vp->v_mount);
1791 
1792 	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1793 		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1794 		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
1795 
1796 	error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1797 	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz);
1798 	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1799 
1800 	return (error);
1801 }
1802 
1803