xref: /freebsd/sys/ufs/ffs/ffs_vnops.c (revision 729362425c09cf6b362366aabc6fb547eee8035a)
1 /*
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Copyright (c) 1982, 1986, 1989, 1993
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
43  * $FreeBSD$
44  */
45 
46 #include <sys/param.h>
47 #include <sys/bio.h>
48 #include <sys/systm.h>
49 #include <sys/buf.h>
50 #include <sys/conf.h>
51 #include <sys/extattr.h>
52 #include <sys/kernel.h>
53 #include <sys/malloc.h>
54 #include <sys/mount.h>
55 #include <sys/proc.h>
56 #include <sys/resourcevar.h>
57 #include <sys/signalvar.h>
58 #include <sys/stat.h>
59 #include <sys/vmmeter.h>
60 #include <sys/vnode.h>
61 
62 #include <machine/limits.h>
63 
64 #include <vm/vm.h>
65 #include <vm/vm_extern.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vnode_pager.h>
70 
71 #include <ufs/ufs/extattr.h>
72 #include <ufs/ufs/quota.h>
73 #include <ufs/ufs/inode.h>
74 #include <ufs/ufs/ufs_extern.h>
75 #include <ufs/ufs/ufsmount.h>
76 
77 #include <ufs/ffs/fs.h>
78 #include <ufs/ffs/ffs_extern.h>
79 #include "opt_directio.h"
80 
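/*
 * When the kernel is built with "options DIRECTIO", ffs_rawread() (see
 * ffs_rawread.c) provides an uncached read path; ffs_read() hands
 * IO_DIRECT requests to it below and falls back to the buffered path
 * only when that routine reports no work done.
 */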
81 #ifdef DIRECTIO
82 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
83 #endif
84 static int	ffs_fsync(struct vop_fsync_args *);
85 static int	ffs_getpages(struct vop_getpages_args *);
86 static int	ffs_read(struct vop_read_args *);
87 static int	ffs_write(struct vop_write_args *);
88 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
89 static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
90 		    struct ucred *cred);
91 static int	ffsext_strategy(struct vop_strategy_args *);
92 static int	ffs_closeextattr(struct vop_closeextattr_args *);
93 static int	ffs_getextattr(struct vop_getextattr_args *);
94 static int	ffs_openextattr(struct vop_openextattr_args *);
95 static int	ffs_setextattr(struct vop_setextattr_args *);
96 
97 
98 /* Global vfs data structures for ufs. */
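/*
 * Any operation not listed in a table below falls through to the
 * corresponding generic UFS handler via its vop_default_desc entry.
 */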
99 vop_t **ffs_vnodeop_p;
100 static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
101 	{ &vop_default_desc,		(vop_t *) ufs_vnoperate },
102 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
103 	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
104 	{ &vop_read_desc,		(vop_t *) ffs_read },
105 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
106 	{ &vop_write_desc,		(vop_t *) ffs_write },
107 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
108 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
109 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
110 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
111 	{ NULL, NULL }
112 };
113 static struct vnodeopv_desc ffs_vnodeop_opv_desc =
114 	{ &ffs_vnodeop_p, ffs_vnodeop_entries };
115 
116 vop_t **ffs_specop_p;
117 static struct vnodeopv_entry_desc ffs_specop_entries[] = {
118 	{ &vop_default_desc,		(vop_t *) ufs_vnoperatespec },
119 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
120 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
121 	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
122 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
123 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
124 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
125 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
126 	{ NULL, NULL }
127 };
128 static struct vnodeopv_desc ffs_specop_opv_desc =
129 	{ &ffs_specop_p, ffs_specop_entries };
130 
131 vop_t **ffs_fifoop_p;
132 static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
133 	{ &vop_default_desc,		(vop_t *) ufs_vnoperatefifo },
134 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
135 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
136 	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
137 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
138 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
139 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
140 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
141 	{ NULL, NULL }
142 };
143 static struct vnodeopv_desc ffs_fifoop_opv_desc =
144 	{ &ffs_fifoop_p, ffs_fifoop_entries };
145 
146 VNODEOP_SET(ffs_vnodeop_opv_desc);
147 VNODEOP_SET(ffs_specop_opv_desc);
148 VNODEOP_SET(ffs_fifoop_opv_desc);
149 
150 /*
151  * Synch an open file.
152  */
153 /* ARGSUSED */
154 static int
155 ffs_fsync(ap)
156 	struct vop_fsync_args /* {
157 		struct vnode *a_vp;
158 		struct ucred *a_cred;
159 		int a_waitfor;
160 		struct thread *a_td;
161 	} */ *ap;
162 {
163 	struct vnode *vp = ap->a_vp;
164 	struct inode *ip = VTOI(vp);
165 	struct buf *bp;
166 	struct buf *nbp;
167 	int s, error, wait, passes, skipmeta;
168 	ufs_lbn_t lbn;
169 
170 	wait = (ap->a_waitfor == MNT_WAIT);
171 	if (vn_isdisk(vp, NULL)) {
172 		lbn = INT_MAX;
173 		if (vp->v_rdev->si_mountpoint != NULL &&
174 		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
175 			softdep_fsync_mountdev(vp);
176 	} else {
177 		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
178 	}
179 
180 	/*
181 	 * Flush all dirty buffers associated with a vnode.
182 	 */
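	/*
	 * Up to NIADDR + 1 passes are made: flushing one pass of buffers
	 * may leave the next level of indirect blocks dirty, so one pass
	 * per level of indirection, plus one for the data blocks, should
	 * suffice for a regular file.
	 */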
183 	passes = NIADDR + 1;
184 	skipmeta = 0;
185 	if (wait)
186 		skipmeta = 1;
187 	s = splbio();
188 	VI_LOCK(vp);
189 loop:
190 	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
191 		bp->b_vflags &= ~BV_SCANNED;
192 	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
193 		nbp = TAILQ_NEXT(bp, b_vnbufs);
194 		/*
195 		 * Reasons to skip this buffer: it has already been considered
196 		 * on this pass, this pass is the first time through on a
197 		 * synchronous flush request and the buffer being considered
198 		 * is metadata, the buffer has dependencies that will cause
199 		 * it to be redirtied and it has not already been deferred,
200 		 * or it is already being written.
201 		 */
202 		if ((bp->b_vflags & BV_SCANNED) != 0)
203 			continue;
204 		bp->b_vflags |= BV_SCANNED;
205 		if ((skipmeta == 1 && bp->b_lblkno < 0))
206 			continue;
207 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
208 			continue;
209 		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
210 		    (bp->b_flags & B_DEFERRED) == 0 &&
211 		    buf_countdeps(bp, 0)) {
212 			bp->b_flags |= B_DEFERRED;
213 			BUF_UNLOCK(bp);
214 			continue;
215 		}
216 		VI_UNLOCK(vp);
217 		if ((bp->b_flags & B_DELWRI) == 0)
218 			panic("ffs_fsync: not dirty");
219 		if (vp != bp->b_vp)
220 			panic("ffs_fsync: vp != vp->b_vp");
221 		/*
222 		 * If this is a synchronous flush request, or it is not a
223 		 * file or device, start the write on this buffer immediately.
224 		 */
225 		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
226 
227 			/*
228 			 * On our final pass through, do all I/O synchronously
229 			 * so that we can find out if our flush is failing
230 			 * because of write errors.
231 			 */
232 			if (passes > 0 || !wait) {
233 				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
234 					(void) vfs_bio_awrite(bp);
235 				} else {
236 					bremfree(bp);
237 					splx(s);
238 					(void) bawrite(bp);
239 					s = splbio();
240 				}
241 			} else {
242 				bremfree(bp);
243 				splx(s);
244 				if ((error = bwrite(bp)) != 0)
245 					return (error);
246 				s = splbio();
247 			}
248 		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
249 			/*
250 			 * If the buffer is for data that has been truncated
251 			 * off the file, then throw it away.
252 			 */
253 			bremfree(bp);
254 			bp->b_flags |= B_INVAL | B_NOCACHE;
255 			splx(s);
256 			brelse(bp);
257 			s = splbio();
258 		} else
259 			vfs_bio_awrite(bp);
260 
261 		/*
262 		 * Since we may have slept during the I/O, we need
263 		 * to start from a known point.
264 		 */
265 		VI_LOCK(vp);
266 		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
267 	}
268 	/*
269 	 * If we were asked to do this synchronously, then go back for
270 	 * another pass, this time doing the metadata.
271 	 */
272 	if (skipmeta) {
273 		skipmeta = 0;
274 		goto loop;
275 	}
276 
277 	if (wait) {
278 		while (vp->v_numoutput) {
279 			vp->v_iflag |= VI_BWAIT;
280 			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
281 			    PRIBIO + 4, "ffsfsn", 0);
282 		}
283 		VI_UNLOCK(vp);
284 
285 		/*
286 		 * Ensure that any filesystem metadata associated
287 		 * with the vnode has been written.
288 		 */
289 		splx(s);
290 		if ((error = softdep_sync_metadata(ap)) != 0)
291 			return (error);
292 		s = splbio();
293 
294 		VI_LOCK(vp);
295 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
296 			/*
297 			 * Block devices associated with filesystems may
298 			 * have new I/O requests posted for them even if
299 			 * the vnode is locked, so no amount of trying will
300 			 * get them clean. Thus we give block devices a
301 			 * good effort, then just give up. For all other file
302 			 * types, go around and try again until it is clean.
303 			 */
304 			if (passes > 0) {
305 				passes -= 1;
306 				goto loop;
307 			}
308 #ifdef DIAGNOSTIC
309 			if (!vn_isdisk(vp, NULL))
310 				vprint("ffs_fsync: dirty", vp);
311 #endif
312 		}
313 	}
314 	VI_UNLOCK(vp);
315 	splx(s);
316 	return (UFS_UPDATE(vp, wait));
317 }
318 
319 
320 /*
321  * Vnode op for reading.
322  */
323 /* ARGSUSED */
324 static int
325 ffs_read(ap)
326 	struct vop_read_args /* {
327 		struct vnode *a_vp;
328 		struct uio *a_uio;
329 		int a_ioflag;
330 		struct ucred *a_cred;
331 	} */ *ap;
332 {
333 	struct vnode *vp;
334 	struct inode *ip;
335 	struct uio *uio;
336 	struct fs *fs;
337 	struct buf *bp;
338 	ufs_lbn_t lbn, nextlbn;
339 	off_t bytesinfile;
340 	long size, xfersize, blkoffset;
341 	int error, orig_resid;
342 	mode_t mode;
343 	int seqcount;
344 	int ioflag;
345 	vm_object_t object;
346 
347 	vp = ap->a_vp;
348 	uio = ap->a_uio;
349 	ioflag = ap->a_ioflag;
350 	if (ap->a_ioflag & IO_EXT)
351 #ifdef notyet
352 		return (ffs_extread(vp, uio, ioflag));
353 #else
354 		panic("ffs_read+IO_EXT");
355 #endif
356 #ifdef DIRECTIO
357 	if ((ioflag & IO_DIRECT) != 0) {
358 		int workdone;
359 
360 		error = ffs_rawread(vp, uio, &workdone);
361 		if (error != 0 || workdone != 0)
362 			return error;
363 	}
364 #endif
365 
366 	GIANT_REQUIRED;
367 
368 	seqcount = ap->a_ioflag >> 16;
369 	ip = VTOI(vp);
370 	mode = ip->i_mode;
371 
372 #ifdef DIAGNOSTIC
373 	if (uio->uio_rw != UIO_READ)
374 		panic("ffs_read: mode");
375 
376 	if (vp->v_type == VLNK) {
377 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
378 			panic("ffs_read: short symlink");
379 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
380 		panic("ffs_read: type %d",  vp->v_type);
381 #endif
382 	fs = ip->i_fs;
383 	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
384 		return (EFBIG);
385 
386 	orig_resid = uio->uio_resid;
387 	if (orig_resid <= 0)
388 		return (0);
389 
390 	object = vp->v_object;
391 
392 	bytesinfile = ip->i_size - uio->uio_offset;
393 	if (bytesinfile <= 0) {
394 		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
395 			ip->i_flag |= IN_ACCESS;
396 		return 0;
397 	}
398 
399 	if (object) {
400 		vm_object_reference(object);
401 	}
402 
403 	/*
404 	 * Ok so we couldn't do it all in one vm trick...
405 	 * so cycle around trying smaller bites..
406 	 */
407 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
408 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
409 			break;
410 
411 		lbn = lblkno(fs, uio->uio_offset);
412 		nextlbn = lbn + 1;
413 
414 		/*
415 		 * size of buffer.  The buffer representing the
416 		 * end of the file is rounded up to the size of
417 		 * the block type ( fragment or full block,
418 		 * depending ).
419 		 */
420 		size = blksize(fs, ip, lbn);
421 		blkoffset = blkoff(fs, uio->uio_offset);
422 
423 		/*
424 		 * The amount we want to transfer in this iteration is
425 		 * one FS block less the amount of the data before
426 		 * our startpoint (duh!)
427 		 */
428 		xfersize = fs->fs_bsize - blkoffset;
429 
430 		/*
431 		 * But if we actually want less than the block,
432 		 * or the file doesn't have a whole block more of data,
433 		 * then use the lesser number.
434 		 */
435 		if (uio->uio_resid < xfersize)
436 			xfersize = uio->uio_resid;
437 		if (bytesinfile < xfersize)
438 			xfersize = bytesinfile;
439 
440 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
441 			/*
442 			 * Don't do readahead if this is the end of the file.
443 			 */
444 			error = bread(vp, lbn, size, NOCRED, &bp);
445 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
446 			/*
447 			 * Otherwise if we are allowed to cluster,
448 			 * grab as much as we can.
449 			 *
450 			 * XXX  This may not be a win if we are not
451 			 * doing sequential access.
452 			 */
453 			error = cluster_read(vp, ip->i_size, lbn,
454 				size, NOCRED, uio->uio_resid, seqcount, &bp);
455 		} else if (seqcount > 1) {
456 			/*
457 			 * If we are NOT allowed to cluster, then
458 			 * if we appear to be acting sequentially,
459 			 * fire off a request for a readahead
460 			 * as well as a read. Note that the 4th and 5th
461 			 * arguments point to arrays of the size specified in
462 			 * the 6th argument.
463 			 */
464 			int nextsize = blksize(fs, ip, nextlbn);
465 			error = breadn(vp, lbn,
466 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
467 		} else {
468 			/*
469 			 * Failing all of the above, just read what the
470 			 * user asked for. Interestingly, the same as
471 			 * the first option above.
472 			 */
473 			error = bread(vp, lbn, size, NOCRED, &bp);
474 		}
475 		if (error) {
476 			brelse(bp);
477 			bp = NULL;
478 			break;
479 		}
480 
481 		/*
482 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
483 		 * will cause us to attempt to release the buffer later on
484 		 * and will cause the buffer cache to attempt to free the
485 		 * underlying pages.
486 		 */
487 		if (ioflag & IO_DIRECT)
488 			bp->b_flags |= B_DIRECT;
489 
490 		/*
491 		 * We should only get non-zero b_resid when an I/O error
492 		 * has occurred, which should cause us to break above.
493 		 * However, if the short read did not cause an error,
494 		 * then we want to ensure that we do not uiomove bad
495 		 * or uninitialized data.
496 		 */
497 		size -= bp->b_resid;
498 		if (size < xfersize) {
499 			if (size == 0)
500 				break;
501 			xfersize = size;
502 		}
503 
504 		{
505 			/*
506 			 * Copy the data from the buffer out to the user.
507 			 */
508 			error =
509 				uiomove((char *)bp->b_data + blkoffset,
510 					(int)xfersize, uio);
511 		}
512 
513 		if (error)
514 			break;
515 
516 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
517 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
518 			/*
519 			 * If there are no dependencies, and it's VMIO,
520 			 * then we don't need the buf, mark it available
521 			 * for freeing. The VM has the data.
522 			 */
523 			bp->b_flags |= B_RELBUF;
524 			brelse(bp);
525 		} else {
526 			/*
527 			 * Otherwise let whoever
528 			 * made the request take care of
529 			 * freeing it. We just queue
530 			 * it onto another list.
531 			 */
532 			bqrelse(bp);
533 		}
534 	}
535 
536 	/*
537 	 * This can only happen in the case of an error
538 	 * because the loop above resets bp to NULL on each iteration
539 	 * and on normal completion has not set a new value into it,
540 	 * so it must have come from a 'break' statement.
541 	 */
542 	if (bp != NULL) {
543 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
544 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
545 			bp->b_flags |= B_RELBUF;
546 			brelse(bp);
547 		} else {
548 			bqrelse(bp);
549 		}
550 	}
551 
552 	if (object) {
553 		vm_object_vndeallocate(object);
554 	}
555 	if ((error == 0 || uio->uio_resid != orig_resid) &&
556 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
557 		ip->i_flag |= IN_ACCESS;
558 	return (error);
559 }
560 
561 /*
562  * Vnode op for writing.
563  */
564 static int
565 ffs_write(ap)
566 	struct vop_write_args /* {
567 		struct vnode *a_vp;
568 		struct uio *a_uio;
569 		int a_ioflag;
570 		struct ucred *a_cred;
571 	} */ *ap;
572 {
573 	struct vnode *vp;
574 	struct uio *uio;
575 	struct inode *ip;
576 	struct fs *fs;
577 	struct buf *bp;
578 	struct thread *td;
579 	ufs_lbn_t lbn;
580 	off_t osize;
581 	int seqcount;
582 	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
583 	vm_object_t object;
584 
585 	vp = ap->a_vp;
586 	uio = ap->a_uio;
587 	ioflag = ap->a_ioflag;
588 	if (ap->a_ioflag & IO_EXT)
589 #ifdef notyet
590 		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
591 #else
592 		panic("ffs_write+IO_EXT");
593 #endif
594 
595 	GIANT_REQUIRED;
596 
597 	extended = 0;
598 	seqcount = ap->a_ioflag >> 16;
599 	ip = VTOI(vp);
600 
601 	object = vp->v_object;
602 	if (object) {
603 		vm_object_reference(object);
604 	}
605 
606 #ifdef DIAGNOSTIC
607 	if (uio->uio_rw != UIO_WRITE)
608 		panic("ffswrite: mode");
609 #endif
610 
611 	switch (vp->v_type) {
612 	case VREG:
613 		if (ioflag & IO_APPEND)
614 			uio->uio_offset = ip->i_size;
615 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
616 			if (object) {
617 				vm_object_vndeallocate(object);
618 			}
619 			return (EPERM);
620 		}
621 		/* FALLTHROUGH */
622 	case VLNK:
623 		break;
624 	case VDIR:
625 		panic("ffswrite: dir write");
626 		break;
627 	default:
628 		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
629 			(int)uio->uio_offset,
630 			(int)uio->uio_resid
631 		);
632 	}
633 
634 	fs = ip->i_fs;
635 	if (uio->uio_offset < 0 ||
636 	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
637 		if (object) {
638 			vm_object_vndeallocate(object);
639 		}
640 		return (EFBIG);
641 	}
642 	/*
643 	 * Maybe this should be above the vnode op call, but so long as
644 	 * file servers have no limits, I don't think it matters.
645 	 */
646 	td = uio->uio_td;
647 	if (vp->v_type == VREG && td &&
648 	    uio->uio_offset + uio->uio_resid >
649 	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
650 		PROC_LOCK(td->td_proc);
651 		psignal(td->td_proc, SIGXFSZ);
652 		PROC_UNLOCK(td->td_proc);
653 		if (object) {
654 			vm_object_vndeallocate(object);
655 		}
656 		return (EFBIG);
657 	}
658 
659 	resid = uio->uio_resid;
660 	osize = ip->i_size;
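	/*
	 * Encode the sequential-access hint carried in the upper bits of
	 * a_ioflag into the BA_SEQ* flags so that UFS_BALLOC() can take
	 * the access pattern into account when allocating blocks.
	 */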
661 	if (seqcount > BA_SEQMAX)
662 		flags = BA_SEQMAX << BA_SEQSHIFT;
663 	else
664 		flags = seqcount << BA_SEQSHIFT;
665 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
666 		flags |= IO_SYNC;
667 
668 	for (error = 0; uio->uio_resid > 0;) {
669 		lbn = lblkno(fs, uio->uio_offset);
670 		blkoffset = blkoff(fs, uio->uio_offset);
671 		xfersize = fs->fs_bsize - blkoffset;
672 		if (uio->uio_resid < xfersize)
673 			xfersize = uio->uio_resid;
674 
675 		if (uio->uio_offset + xfersize > ip->i_size)
676 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
677 
678                 /*
679 		 * We must perform a read-before-write if the transfer size
680 		 * does not cover the entire buffer.
681                  */
682 		if (fs->fs_bsize > xfersize)
683 			flags |= BA_CLRBUF;
684 		else
685 			flags &= ~BA_CLRBUF;
686 /* XXX is uio->uio_offset the right thing here? */
687 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
688 		    ap->a_cred, flags, &bp);
689 		if (error != 0)
690 			break;
691 		/*
692 		 * If the buffer is not valid we have to clear out any
693 		 * garbage data from the pages instantiated for the buffer.
694 		 * If we do not, a failed uiomove() during a write can leave
695 		 * the prior contents of the pages exposed to a userland
696 		 * mmap().  XXX deal with uiomove() errors a better way.
697 		 */
698 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
699 			vfs_bio_clrbuf(bp);
700 		if (ioflag & IO_DIRECT)
701 			bp->b_flags |= B_DIRECT;
702 		if (ioflag & IO_NOWDRAIN)
703 			bp->b_flags |= B_NOWDRAIN;
704 
705 		if (uio->uio_offset + xfersize > ip->i_size) {
706 			ip->i_size = uio->uio_offset + xfersize;
707 			DIP(ip, i_size) = ip->i_size;
708 			extended = 1;
709 		}
710 
711 		size = blksize(fs, ip, lbn) - bp->b_resid;
712 		if (size < xfersize)
713 			xfersize = size;
714 
715 		error =
716 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
717 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
718 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
719 			bp->b_flags |= B_RELBUF;
720 		}
721 
722 		/*
723 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
724 		 * if we have a severe page deficiency write the buffer
725 		 * asynchronously.  Otherwise try to cluster, and if that
726 		 * doesn't do it then either do an async write (if O_DIRECT),
727 		 * or a delayed write (if not).
728 		 */
729 		if (ioflag & IO_SYNC) {
730 			(void)bwrite(bp);
731 		} else if (vm_page_count_severe() ||
732 			    buf_dirty_count_severe() ||
733 			    (ioflag & IO_ASYNC)) {
734 			bp->b_flags |= B_CLUSTEROK;
735 			bawrite(bp);
736 		} else if (xfersize + blkoffset == fs->fs_bsize) {
737 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
738 				bp->b_flags |= B_CLUSTEROK;
739 				cluster_write(bp, ip->i_size, seqcount);
740 			} else {
741 				bawrite(bp);
742 			}
743 		} else if (ioflag & IO_DIRECT) {
744 			bp->b_flags |= B_CLUSTEROK;
745 			bawrite(bp);
746 		} else {
747 			bp->b_flags |= B_CLUSTEROK;
748 			bdwrite(bp);
749 		}
750 		if (error || xfersize == 0)
751 			break;
752 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
753 	}
754 	/*
755 	 * If we successfully wrote any data, and we are not the superuser,
756 	 * we clear the setuid and setgid bits as a precaution against
757 	 * tampering.
758 	 */
759 	if (resid > uio->uio_resid && ap->a_cred &&
760 	    suser_cred(ap->a_cred, PRISON_ROOT)) {
761 		ip->i_mode &= ~(ISUID | ISGID);
762 		DIP(ip, i_mode) = ip->i_mode;
763 	}
764 	if (resid > uio->uio_resid)
765 		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
766 	if (error) {
767 		if (ioflag & IO_UNIT) {
768 			(void)UFS_TRUNCATE(vp, osize,
769 			    IO_NORMAL | (ioflag & IO_SYNC),
770 			    ap->a_cred, uio->uio_td);
771 			uio->uio_offset -= resid - uio->uio_resid;
772 			uio->uio_resid = resid;
773 		}
774 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
775 		error = UFS_UPDATE(vp, 1);
776 
777 	if (object) {
778 		vm_object_vndeallocate(object);
779 	}
780 
781 	return (error);
782 }
783 
784 /*
785  * get page routine
786  */
787 static int
788 ffs_getpages(ap)
789 	struct vop_getpages_args *ap;
790 {
791 	off_t foff, physoffset;
792 	int i, size, bsize;
793 	struct vnode *dp, *vp;
794 	vm_object_t obj;
795 	vm_pindex_t pindex, firstindex;
796 	vm_page_t mreq;
797 	int bbackwards, bforwards;
798 	int pbackwards, pforwards;
799 	int firstpage;
800 	ufs2_daddr_t reqblkno, reqlblkno;
801 	int poff;
802 	int pcount;
803 	int rtval;
804 	int pagesperblock;
805 
806 	GIANT_REQUIRED;
807 
808 	pcount = round_page(ap->a_count) / PAGE_SIZE;
809 	mreq = ap->a_m[ap->a_reqpage];
810 	firstindex = ap->a_m[0]->pindex;
811 
812 	/*
813 	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
814 	 * then the entire page is valid.  Since the page may be mapped,
815 	 * user programs might reference data beyond the actual end of file
816 	 * occurring within the page.  We have to zero that data.
817 	 */
818 	if (mreq->valid) {
819 		if (mreq->valid != VM_PAGE_BITS_ALL)
820 			vm_page_zero_invalid(mreq, TRUE);
821 		vm_page_lock_queues();
822 		for (i = 0; i < pcount; i++) {
823 			if (i != ap->a_reqpage) {
824 				vm_page_free(ap->a_m[i]);
825 			}
826 		}
827 		vm_page_unlock_queues();
828 		return VM_PAGER_OK;
829 	}
830 
831 	vp = ap->a_vp;
832 	obj = vp->v_object;
833 	bsize = vp->v_mount->mnt_stat.f_iosize;
834 	pindex = mreq->pindex;
835 	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
836 
837 	if (bsize < PAGE_SIZE)
838 		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
839 						    ap->a_count,
840 						    ap->a_reqpage);
841 
842 	/*
843 	 * foff is the file offset of the required page
844 	 * reqlblkno is the logical block that contains the page
845 	 * poff is the index of the page into the logical block
846 	 */
847 	reqlblkno = foff / bsize;
848 	poff = (foff % bsize) / PAGE_SIZE;
849 
850 	dp = VTOI(vp)->i_devvp;
851 	if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
852 	    || (reqblkno == -1)) {
853 		vm_page_lock_queues();
854 		for(i = 0; i < pcount; i++) {
855 			if (i != ap->a_reqpage)
856 				vm_page_free(ap->a_m[i]);
857 		}
858 		vm_page_unlock_queues();
859 		if (reqblkno == -1) {
860 			if ((mreq->flags & PG_ZERO) == 0)
861 				pmap_zero_page(mreq);
862 			vm_page_undirty(mreq);
863 			mreq->valid = VM_PAGE_BITS_ALL;
864 			return VM_PAGER_OK;
865 		} else {
866 			return VM_PAGER_ERROR;
867 		}
868 	}
869 
870 	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
871 	pagesperblock = bsize / PAGE_SIZE;
872 	/*
873 	 * find the first page that is contiguous...
874 	 * note that pbackwards is the number of pages that are contiguous
875 	 * backwards.
876 	 */
877 	firstpage = 0;
878 	if (ap->a_count) {
879 		pbackwards = poff + bbackwards * pagesperblock;
880 		if (ap->a_reqpage > pbackwards) {
881 			firstpage = ap->a_reqpage - pbackwards;
882 			vm_page_lock_queues();
883 			for(i=0;i<firstpage;i++)
884 				vm_page_free(ap->a_m[i]);
885 			vm_page_unlock_queues();
886 		}
887 
888 	/*
889 	 * pforwards is the number of pages that are contiguous
890 	 * after the current page.
891 	 */
892 		pforwards = (pagesperblock - (poff + 1)) +
893 			bforwards * pagesperblock;
894 		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
895 			vm_page_lock_queues();
896 			for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
897 				vm_page_free(ap->a_m[i]);
898 			vm_page_unlock_queues();
899 			pcount = ap->a_reqpage + pforwards + 1;
900 		}
901 
902 	/*
903 	 * number of pages for I/O corrected for the non-contig pages at
904 	 * the beginning of the array.
905 	 */
906 		pcount -= firstpage;
907 	}
908 
909 	/*
910 	 * calculate the size of the transfer
911 	 */
912 
913 	size = pcount * PAGE_SIZE;
914 
915 	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
916 		obj->un_pager.vnp.vnp_size)
917 		size = obj->un_pager.vnp.vnp_size -
918 			IDX_TO_OFF(ap->a_m[firstpage]->pindex);
919 
920 	physoffset -= foff;
921 	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
922 		(ap->a_reqpage - firstpage), physoffset);
923 
924 	return (rtval);
925 }
926 
927 /*
928  * Extended attribute area reading.
929  */
930 static int
931 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
932 {
933 	struct inode *ip;
934 	struct ufs2_dinode *dp;
935 	struct fs *fs;
936 	struct buf *bp;
937 	ufs_lbn_t lbn, nextlbn;
938 	off_t bytesinfile;
939 	long size, xfersize, blkoffset;
940 	int error, orig_resid;
941 	mode_t mode;
942 
943 	GIANT_REQUIRED;
944 
945 	ip = VTOI(vp);
946 	fs = ip->i_fs;
947 	dp = ip->i_din2;
948 	mode = ip->i_mode;
949 
950 #ifdef DIAGNOSTIC
951 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
952 		panic("ffs_extread: mode");
953 
954 #endif
955 	orig_resid = uio->uio_resid;
956 	if (orig_resid <= 0)
957 		return (0);
958 
959 	bytesinfile = dp->di_extsize - uio->uio_offset;
960 	if (bytesinfile <= 0) {
961 		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
962 			ip->i_flag |= IN_ACCESS;
963 		return 0;
964 	}
965 
966 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
967 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
968 			break;
969 
970 		lbn = lblkno(fs, uio->uio_offset);
971 		nextlbn = lbn + 1;
972 
973 		/*
974 		 * size of buffer.  The buffer representing the
975 		 * end of the file is rounded up to the size of
976 		 * the block type ( fragment or full block,
977 		 * depending ).
978 		 */
979 		size = sblksize(fs, dp->di_extsize, lbn);
980 		blkoffset = blkoff(fs, uio->uio_offset);
981 
982 		/*
983 		 * The amount we want to transfer in this iteration is
984 		 * one FS block less the amount of the data before
985 		 * our startpoint (duh!)
986 		 */
987 		xfersize = fs->fs_bsize - blkoffset;
988 
989 		/*
990 		 * But if we actually want less than the block,
991 		 * or the file doesn't have a whole block more of data,
992 		 * then use the lesser number.
993 		 */
994 		if (uio->uio_resid < xfersize)
995 			xfersize = uio->uio_resid;
996 		if (bytesinfile < xfersize)
997 			xfersize = bytesinfile;
998 
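		/*
		 * Blocks of the external attribute area are addressed with
		 * negative logical block numbers: ext block N is read as
		 * logical block -1 - N, which the UFS bmap code resolves
		 * through the inode's external block pointers.
		 */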
999 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1000 			/*
1001 			 * Don't do readahead if this is the end of the info.
1002 			 */
1003 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1004 		} else {
1005 			/*
1006 			 * If we have a second block, then
1007 			 * fire off a request for a readahead
1008 			 * as well as a read. Note that the 4th and 5th
1009 			 * arguments point to arrays of the size specified in
1010 			 * the 6th argument.
1011 			 */
1012 			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1013 
1014 			nextlbn = -1 - nextlbn;
1015 			error = breadn(vp, -1 - lbn,
1016 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1017 		}
1018 		if (error) {
1019 			brelse(bp);
1020 			bp = NULL;
1021 			break;
1022 		}
1023 
1024 		/*
1025 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
1026 		 * will cause us to attempt to release the buffer later on
1027 		 * and will cause the buffer cache to attempt to free the
1028 		 * underlying pages.
1029 		 */
1030 		if (ioflag & IO_DIRECT)
1031 			bp->b_flags |= B_DIRECT;
1032 
1033 		/*
1034 		 * We should only get non-zero b_resid when an I/O error
1035 		 * has occurred, which should cause us to break above.
1036 		 * However, if the short read did not cause an error,
1037 		 * then we want to ensure that we do not uiomove bad
1038 		 * or uninitialized data.
1039 		 */
1040 		size -= bp->b_resid;
1041 		if (size < xfersize) {
1042 			if (size == 0)
1043 				break;
1044 			xfersize = size;
1045 		}
1046 
1047 		error = uiomove((char *)bp->b_data + blkoffset,
1048 					(int)xfersize, uio);
1049 		if (error)
1050 			break;
1051 
1052 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1053 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1054 			/*
1055 			 * If there are no dependencies, and it's VMIO,
1056 			 * then we don't need the buf, mark it available
1057 			 * for freeing. The VM has the data.
1058 			 */
1059 			bp->b_flags |= B_RELBUF;
1060 			brelse(bp);
1061 		} else {
1062 			/*
1063 			 * Otherwise let whoever
1064 			 * made the request take care of
1065 			 * freeing it. We just queue
1066 			 * it onto another list.
1067 			 */
1068 			bqrelse(bp);
1069 		}
1070 	}
1071 
1072 	/*
1073 	 * This can only happen in the case of an error
1074 	 * because the loop above resets bp to NULL on each iteration
1075 	 * and on normal completion has not set a new value into it,
1076 	 * so it must have come from a 'break' statement.
1077 	 */
1078 	if (bp != NULL) {
1079 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1080 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1081 			bp->b_flags |= B_RELBUF;
1082 			brelse(bp);
1083 		} else {
1084 			bqrelse(bp);
1085 		}
1086 	}
1087 
1088 	if ((error == 0 || uio->uio_resid != orig_resid) &&
1089 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1090 		ip->i_flag |= IN_ACCESS;
1091 	return (error);
1092 }
1093 
1094 /*
1095  * Extended attribute area writing.
1096  */
1097 static int
1098 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1099 {
1100 	struct inode *ip;
1101 	struct ufs2_dinode *dp;
1102 	struct fs *fs;
1103 	struct buf *bp;
1104 	ufs_lbn_t lbn;
1105 	off_t osize;
1106 	int blkoffset, error, flags, resid, size, xfersize;
1107 
1108 	GIANT_REQUIRED;
1109 
1110 	ip = VTOI(vp);
1111 	fs = ip->i_fs;
1112 	dp = ip->i_din2;
1113 
1114 #ifdef DIAGNOSTIC
1115 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1116 		panic("ext_write: mode");
1117 #endif
1118 
1119 	if (ioflag & IO_APPEND)
1120 		uio->uio_offset = dp->di_extsize;
1121 
1122 	if (uio->uio_offset < 0 ||
1123 	    (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1124 		return (EFBIG);
1125 
1126 	resid = uio->uio_resid;
1127 	osize = dp->di_extsize;
1128 	flags = IO_EXT;
1129 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1130 		flags |= IO_SYNC;
1131 
1132 	for (error = 0; uio->uio_resid > 0;) {
1133 		lbn = lblkno(fs, uio->uio_offset);
1134 		blkoffset = blkoff(fs, uio->uio_offset);
1135 		xfersize = fs->fs_bsize - blkoffset;
1136 		if (uio->uio_resid < xfersize)
1137 			xfersize = uio->uio_resid;
1138 
1139                 /*
1140 		 * We must perform a read-before-write if the transfer size
1141 		 * does not cover the entire buffer.
1142                  */
1143 		if (fs->fs_bsize > xfersize)
1144 			flags |= BA_CLRBUF;
1145 		else
1146 			flags &= ~BA_CLRBUF;
1147 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1148 		    ucred, flags, &bp);
1149 		if (error != 0)
1150 			break;
1151 		/*
1152 		 * If the buffer is not valid we have to clear out any
1153 		 * garbage data from the pages instantiated for the buffer.
1154 		 * If we do not, a failed uiomove() during a write can leave
1155 		 * the prior contents of the pages exposed to a userland
1156 		 * mmap().  XXX deal with uiomove() errors a better way.
1157 		 */
1158 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1159 			vfs_bio_clrbuf(bp);
1160 		if (ioflag & IO_DIRECT)
1161 			bp->b_flags |= B_DIRECT;
1162 		if (ioflag & IO_NOWDRAIN)
1163 			bp->b_flags |= B_NOWDRAIN;
1164 
1165 		if (uio->uio_offset + xfersize > dp->di_extsize)
1166 			dp->di_extsize = uio->uio_offset + xfersize;
1167 
1168 		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1169 		if (size < xfersize)
1170 			xfersize = size;
1171 
1172 		error =
1173 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1174 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1175 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1176 			bp->b_flags |= B_RELBUF;
1177 		}
1178 
1179 		/*
1180 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
1181 		 * if we have a severe page deficiency write the buffer
1182 		 * asynchronously.  Otherwise try to cluster, and if that
1183 		 * doesn't do it then either do an async write (if O_DIRECT),
1184 		 * or a delayed write (if not).
1185 		 */
1186 		if (ioflag & IO_SYNC) {
1187 			(void)bwrite(bp);
1188 		} else if (vm_page_count_severe() ||
1189 			    buf_dirty_count_severe() ||
1190 			    xfersize + blkoffset == fs->fs_bsize ||
1191 			    (ioflag & (IO_ASYNC | IO_DIRECT)))
1192 			bawrite(bp);
1193 		else
1194 			bdwrite(bp);
1195 		if (error || xfersize == 0)
1196 			break;
1197 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1198 	}
1199 	/*
1200 	 * If we successfully wrote any data, and we are not the superuser,
1201 	 * we clear the setuid and setgid bits as a precaution against
1202 	 * tampering.
1203 	 */
1204 	if (resid > uio->uio_resid && ucred &&
1205 	    suser_cred(ucred, PRISON_ROOT)) {
1206 		ip->i_mode &= ~(ISUID | ISGID);
1207 		dp->di_mode = ip->i_mode;
1208 	}
1209 	if (error) {
1210 		if (ioflag & IO_UNIT) {
1211 			(void)UFS_TRUNCATE(vp, osize,
1212 			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1213 			uio->uio_offset -= resid - uio->uio_resid;
1214 			uio->uio_resid = resid;
1215 		}
1216 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1217 		error = UFS_UPDATE(vp, 1);
1218 	return (error);
1219 }
1220 
1221 
1222 /*
1223  * Helper for the extended attribute vnode operations below.
1224  *
1225  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1226  * the length of the EA, and possibly the pointer to the entry and to the data.
1227  */
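/*
 * Each extended attribute record, as parsed here, consists of a 32-bit
 * total record length, a one-byte namespace, a one-byte count of content
 * padding, a one-byte name length, the name itself padded out to an
 * 8-byte boundary, and finally the content followed by its padding.
 */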
1228 static int
1229 ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1230 {
1231 	u_char *p, *pe, *pn, *p0;
1232 	int eapad1, eapad2, ealength, ealen, nlen;
1233 	uint32_t ul;
1234 
1235 	pe = ptr + length;
1236 	nlen = strlen(name);
1237 
1238 	for (p = ptr; p < pe; p = pn) {
1239 		p0 = p;
1240 		bcopy(p, &ul, sizeof(ul));
1241 		pn = p + ul;
1242 		/* make sure this entry is complete */
1243 		if (pn > pe)
1244 			break;
1245 		p += sizeof(uint32_t);
1246 		if (*p != nspace)
1247 			continue;
1248 		p++;
1249 		eapad2 = *p++;
1250 		if (*p != nlen)
1251 			continue;
1252 		p++;
1253 		if (bcmp(p, name, nlen))
1254 			continue;
1255 		ealength = sizeof(uint32_t) + 3 + nlen;
1256 		eapad1 = 8 - (ealength % 8);
1257 		if (eapad1 == 8)
1258 			eapad1 = 0;
1259 		ealength += eapad1;
1260 		ealen = ul - ealength - eapad2;
1261 		p += nlen + eapad1;
1262 		if (eap != NULL)
1263 			*eap = p0;
1264 		if (eac != NULL)
1265 			*eac = p;
1266 		return (ealen);
1267 	}
1268 	return(-1);
1269 }
1270 
1271 static int
1272 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1273 {
1274 	struct inode *ip;
1275 	struct fs *fs;
1276 	struct ufs2_dinode *dp;
1277 	struct uio luio;
1278 	struct iovec liovec;
1279 	int easize, error;
1280 	u_char *eae;
1281 
1282 	ip = VTOI(vp);
1283 	fs = ip->i_fs;
1284 	dp = ip->i_din2;
1285 	easize = dp->di_extsize;
1286 
1287 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1288 
1289 	liovec.iov_base = eae;
1290 	liovec.iov_len = easize;
1291 	luio.uio_iov = &liovec;
1292 	luio.uio_iovcnt = 1;
1293 	luio.uio_offset = 0;
1294 	luio.uio_resid = easize;
1295 	luio.uio_segflg = UIO_SYSSPACE;
1296 	luio.uio_rw = UIO_READ;
1297 	luio.uio_td = td;
1298 
1299 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1300 	if (error) {
1301 		free(eae, M_TEMP);
1302 		return(error);
1303 	}
1304 	*p = eae;
1305 	return (0);
1306 }
1307 
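/*
 * Vnode extattr transaction begin: snapshot the inode's extended
 * attribute area into memory (ip->i_ea_area / ip->i_ea_len) so that the
 * get/set/delete operations below can work on the copy until it is
 * committed or discarded by ffs_close_ea().
 */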
1308 static int
1309 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1310 {
1311 	struct inode *ip;
1312 	struct fs *fs;
1313 	struct ufs2_dinode *dp;
1314 	int error;
1315 
1316 	ip = VTOI(vp);
1317 	fs = ip->i_fs;
1318 
1319 	if (ip->i_ea_area != NULL)
1320 		return (EBUSY);
1321 	dp = ip->i_din2;
1322 	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1323 	if (error)
1324 		return (error);
1325 	ip->i_ea_len = dp->di_extsize;
1326 	ip->i_ea_error = 0;
1327 	return (0);
1328 }
1329 
1330 /*
1331  * Vnode extattr transaction commit/abort
1332  */
1333 static int
1334 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1335 {
1336 	struct inode *ip;
1337 	struct fs *fs;
1338 	struct uio luio;
1339 	struct iovec liovec;
1340 	int error;
1341 	struct ufs2_dinode *dp;
1342 
1343 	ip = VTOI(vp);
1344 	fs = ip->i_fs;
1345 	if (ip->i_ea_area == NULL)
1346 		return (EINVAL);
1347 	dp = ip->i_din2;
1348 	error = ip->i_ea_error;
1349 	if (commit && error == 0) {
1350 		if (cred == NOCRED)
1351 			cred =  vp->v_mount->mnt_cred;
1352 		liovec.iov_base = ip->i_ea_area;
1353 		liovec.iov_len = ip->i_ea_len;
1354 		luio.uio_iov = &liovec;
1355 		luio.uio_iovcnt = 1;
1356 		luio.uio_offset = 0;
1357 		luio.uio_resid = ip->i_ea_len;
1358 		luio.uio_segflg = UIO_SYSSPACE;
1359 		luio.uio_rw = UIO_WRITE;
1360 		luio.uio_td = td;
1361 		/* XXX: I'm not happy about truncating to zero size */
1362 		if (ip->i_ea_len < dp->di_extsize)
1363 			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1364 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1365 	}
1366 	free(ip->i_ea_area, M_TEMP);
1367 	ip->i_ea_area = NULL;
1368 	ip->i_ea_len = 0;
1369 	ip->i_ea_error = 0;
1370 	return (error);
1371 }
1372 
1373 /*
1374  * Vnode extattr strategy routine for special devices and fifos.
1375  *
1376  * We need to check for a read or write of the external attributes.
1377  * Otherwise we just fall through and do the usual thing.
1378  */
1379 static int
1380 ffsext_strategy(struct vop_strategy_args *ap)
1381 /*
1382 struct vop_strategy_args {
1383 	struct vnodeop_desc *a_desc;
1384 	struct vnode *a_vp;
1385 	struct buf *a_bp;
1386 };
1387 */
1388 {
1389 	struct vnode *vp;
1390 	daddr_t lbn;
1391 
1392 	vp = ap->a_vp;
1393 	lbn = ap->a_bp->b_lblkno;
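	/*
	 * Buffers for the external attribute area carry negative logical
	 * block numbers (-1 down to -NXADDR); send those through the
	 * regular UFS strategy path so they are mapped through the inode
	 * rather than passed straight to the device or fifo code.
	 */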
1394 	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1395 	    lbn < 0 && lbn >= -NXADDR)
1396 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1397 	if (vp->v_type == VFIFO)
1398 		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1399 	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1400 }
1401 
1402 /*
1403  * Vnode extattr transaction start (open).
1404  */
1405 static int
1406 ffs_openextattr(struct vop_openextattr_args *ap)
1407 /*
1408 struct vop_openextattr_args {
1409 	struct vnodeop_desc *a_desc;
1410 	struct vnode *a_vp;
1411 	IN struct ucred *a_cred;
1412 	IN struct thread *a_td;
1413 };
1414 */
1415 {
1416 	struct inode *ip;
1417 	struct fs *fs;
1418 
1419 	ip = VTOI(ap->a_vp);
1420 	fs = ip->i_fs;
1421 	if (fs->fs_magic == FS_UFS1_MAGIC)
1422 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1423 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1424 }
1425 
1426 
1427 /*
1428  * Vnode extattr transaction commit/abort
1429  */
1430 static int
1431 ffs_closeextattr(struct vop_closeextattr_args *ap)
1432 /*
1433 struct vop_closeextattr_args {
1434 	struct vnodeop_desc *a_desc;
1435 	struct vnode *a_vp;
1436 	int a_commit;
1437 	IN struct ucred *a_cred;
1438 	IN struct thread *a_td;
1439 };
1440 */
1441 {
1442 	struct inode *ip;
1443 	struct fs *fs;
1444 
1445 	ip = VTOI(ap->a_vp);
1446 	fs = ip->i_fs;
1447 	if (fs->fs_magic == FS_UFS1_MAGIC)
1448 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1449 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1450 }
1451 
1454 /*
1455  * Vnode operation to retrieve a named extended attribute.
1456  */
1457 static int
1458 ffs_getextattr(struct vop_getextattr_args *ap)
1459 /*
1460 vop_getextattr {
1461 	IN struct vnode *a_vp;
1462 	IN int a_attrnamespace;
1463 	IN const char *a_name;
1464 	INOUT struct uio *a_uio;
1465 	OUT size_t *a_size;
1466 	IN struct ucred *a_cred;
1467 	IN struct thread *a_td;
1468 };
1469 */
1470 {
1471 	struct inode *ip;
1472 	struct fs *fs;
1473 	u_char *eae, *p, *pe, *pn;
1474 	struct ufs2_dinode *dp;
1475 	unsigned easize;
1476 	uint32_t ul;
1477 	int error, ealen, stand_alone;
1478 
1479 	ip = VTOI(ap->a_vp);
1480 	fs = ip->i_fs;
1481 
1482 	if (fs->fs_magic == FS_UFS1_MAGIC)
1483 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1484 
1485 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1486 	    ap->a_cred, ap->a_td, IREAD);
1487 	if (error)
1488 		return (error);
1489 
1490 	if (ip->i_ea_area == NULL) {
1491 		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1492 		if (error)
1493 			return (error);
1494 		stand_alone = 1;
1495 	} else {
1496 		stand_alone = 0;
1497 	}
1498 	dp = ip->i_din2;
1499 	eae = ip->i_ea_area;
1500 	easize = ip->i_ea_len;
1501 	if (strlen(ap->a_name) > 0) {
1502 		ealen = ffs_findextattr(eae, easize,
1503 		    ap->a_attrnamespace, ap->a_name, NULL, &p);
1504 		if (ealen >= 0) {
1505 			error = 0;
1506 			if (ap->a_size != NULL)
1507 				*ap->a_size = ealen;
1508 			else if (ap->a_uio != NULL)
1509 				error = uiomove(p, ealen, ap->a_uio);
1510 		} else {
1511 			error = ENOATTR;
1512 		}
1513 	} else {
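		/*
		 * An empty name requests a listing of the namespace: walk
		 * every record and return (or just size) each attribute as
		 * a one-byte name length followed by the name itself.
		 */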
1514 		error = 0;
1515 		if (ap->a_size != NULL)
1516 			*ap->a_size = 0;
1517 		pe = eae + easize;
1518 		for(p = eae; error == 0 && p < pe; p = pn) {
1519 			bcopy(p, &ul, sizeof(ul));
1520 			pn = p + ul;
1521 			if (pn > pe)
1522 				break;
1523 			p += sizeof(ul);
1524 			if (*p++ != ap->a_attrnamespace)
1525 				continue;
1526 			p++;	/* pad2 */
1527 			ealen = *p;
1528 			if (ap->a_size != NULL) {
1529 				*ap->a_size += ealen + 1;
1530 			} else if (ap->a_uio != NULL) {
1531 				error = uiomove(p, ealen + 1, ap->a_uio);
1532 			}
1533 		}
1534 	}
1535 	if (stand_alone)
1536 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1537 	return(error);
1538 }
1539 
1540 /*
1541  * Vnode operation to set a named attribute.
1542  */
1543 static int
1544 ffs_setextattr(struct vop_setextattr_args *ap)
1545 /*
1546 vop_setextattr {
1547 	IN struct vnode *a_vp;
1548 	IN int a_attrnamespace;
1549 	IN const char *a_name;
1550 	INOUT struct uio *a_uio;
1551 	IN struct ucred *a_cred;
1552 	IN struct thread *a_td;
1553 };
1554 */
1555 {
1556 	struct inode *ip;
1557 	struct fs *fs;
1558 	uint32_t ealength, ul;
1559 	int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
1560 	u_char *eae, *p;
1561 	struct ufs2_dinode *dp;
1562 	struct ucred *cred;
1563 	int stand_alone;
1564 
1565 	ip = VTOI(ap->a_vp);
1566 	fs = ip->i_fs;
1567 
1568 	if (fs->fs_magic == FS_UFS1_MAGIC)
1569 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1570 
1571 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1572 	    ap->a_cred, ap->a_td, IWRITE);
1573 	if (error) {
1574 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1575 			ip->i_ea_error = error;
1576 		return (error);
1577 	}
1578 
1579 	if (ap->a_cred != NOCRED)
1580 		cred = ap->a_cred;
1581 	else
1582 		cred = ap->a_vp->v_mount->mnt_cred;
1583 
1584 	dp = ip->i_din2;
1585 
1586 	if (ip->i_ea_area == NULL) {
1587 		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1588 		if (error)
1589 			return (error);
1590 		stand_alone = 1;
1591 	} else {
1592 		stand_alone = 0;
1593 	}
1594 
1595 	/* Calculate the length of the EA entry */
1596 	if (ap->a_uio == NULL) {
1597 		/* delete */
1598 		ealength = eapad1 = ealen = eapad2 = eacont = 0;
1599 	} else {
1600 		ealen = ap->a_uio->uio_resid;
1601 		ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1602 		eapad1 = 8 - (ealength % 8);
1603 		if (eapad1 == 8)
1604 			eapad1 = 0;
1605 		eacont = ealength + eapad1;
1606 		eapad2 = 8 - (ealen % 8);
1607 		if (eapad2 == 8)
1608 			eapad2 = 0;
1609 		ealength += eapad1 + ealen + eapad2;
1610 	}
1611 
1612 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1613 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1614 	easize = ip->i_ea_len;
1615 
1616 	olen = ffs_findextattr(eae, easize,
1617 	    ap->a_attrnamespace, ap->a_name, &p, NULL);
1618 	if (olen == -1 && ealength == 0) {
1619 		/* delete but nonexistent */
1620 		free(eae, M_TEMP);
1621 		if (stand_alone)
1622 			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1623 		return(ENOATTR);
1624 	}
1625 	if (olen == -1) {
1626 		/* new, append at end */
1627 		p = eae + easize;
1628 		easize += ealength;
1629 	} else {
1630 		bcopy(p, &ul, sizeof ul);
1631 		i = p - eae + ul;
1632 		if (ul != ealength) {
1633 			bcopy(p + ul, p + ealength, easize - i);
1634 			easize += (ealength - ul);
1635 		}
1636 	}
1637 	if (easize > NXADDR * fs->fs_bsize) {
1638 		free(eae, M_TEMP);
1639 		if (stand_alone)
1640 			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1641 		else if (ip->i_ea_error == 0)
1642 			ip->i_ea_error = ENOSPC;
1643 		return(ENOSPC);
1644 	}
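	/*
	 * Write the new record in place: length, namespace, content pad
	 * count, name length, the name padded to an 8-byte boundary, then
	 * the content and its padding, matching the layout that
	 * ffs_findextattr() parses above.
	 */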
1645 	if (ealength != 0) {
1646 		bcopy(&ealength, p, sizeof(ealength));
1647 		p += sizeof(ealength);
1648 		*p++ = ap->a_attrnamespace;
1649 		*p++ = eapad2;
1650 		*p++ = strlen(ap->a_name);
1651 		strcpy(p, ap->a_name);
1652 		p += strlen(ap->a_name);
1653 		bzero(p, eapad1);
1654 		p += eapad1;
1655 		error = uiomove(p, ealen, ap->a_uio);
1656 		if (error) {
1657 			free(eae, M_TEMP);
1658 			if (stand_alone)
1659 				ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1660 			else if (ip->i_ea_error == 0)
1661 				ip->i_ea_error = error;
1662 			return(error);
1663 		}
1664 		p += ealen;
1665 		bzero(p, eapad2);
1666 	}
1667 	p = ip->i_ea_area;
1668 	ip->i_ea_area = eae;
1669 	ip->i_ea_len = easize;
1670 	free(p, M_TEMP);
1671 	if (stand_alone)
1672 		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1673 	return(error);
1674 }
1675