xref: /freebsd/sys/ufs/ffs/ffs_vnops.c (revision 81d1ffee089aab2652954909acbe6aadd8a1a72c)
1 /*
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Copyright (c) 1982, 1986, 1989, 1993
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
43  * $FreeBSD$
44  */
45 
46 #include <sys/param.h>
47 #include <sys/bio.h>
48 #include <sys/systm.h>
49 #include <sys/buf.h>
50 #include <sys/conf.h>
51 #include <sys/extattr.h>
52 #include <sys/kernel.h>
53 #include <sys/malloc.h>
54 #include <sys/mount.h>
55 #include <sys/proc.h>
56 #include <sys/resourcevar.h>
57 #include <sys/signalvar.h>
58 #include <sys/stat.h>
59 #include <sys/vmmeter.h>
60 #include <sys/vnode.h>
61 
62 #include <machine/limits.h>
63 
64 #include <vm/vm.h>
65 #include <vm/vm_extern.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vnode_pager.h>
70 
71 #include <ufs/ufs/extattr.h>
72 #include <ufs/ufs/quota.h>
73 #include <ufs/ufs/inode.h>
74 #include <ufs/ufs/ufs_extern.h>
75 #include <ufs/ufs/ufsmount.h>
76 
77 #include <ufs/ffs/fs.h>
78 #include <ufs/ffs/ffs_extern.h>
79 
80 static int	ffs_fsync(struct vop_fsync_args *);
81 static int	ffs_getpages(struct vop_getpages_args *);
82 static int	ffs_read(struct vop_read_args *);
83 static int	ffs_write(struct vop_write_args *);
84 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
85 static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
86 		    struct ucred *cred);
87 static int	ffsext_strategy(struct vop_strategy_args *);
88 static int	ffs_closeextattr(struct vop_closeextattr_args *);
89 static int	ffs_getextattr(struct vop_getextattr_args *);
90 static int	ffs_openextattr(struct vop_openextattr_args *);
91 static int	ffs_setextattr(struct vop_setextattr_args *);
92 
93 
94 /* Global vfs data structures for ufs. */
95 vop_t **ffs_vnodeop_p;
96 static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
97 	{ &vop_default_desc,		(vop_t *) ufs_vnoperate },
98 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
99 	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
100 	{ &vop_read_desc,		(vop_t *) ffs_read },
101 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
102 	{ &vop_write_desc,		(vop_t *) ffs_write },
103 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
104 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
105 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
106 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
107 	{ NULL, NULL }
108 };
109 static struct vnodeopv_desc ffs_vnodeop_opv_desc =
110 	{ &ffs_vnodeop_p, ffs_vnodeop_entries };
111 
112 vop_t **ffs_specop_p;
113 static struct vnodeopv_entry_desc ffs_specop_entries[] = {
114 	{ &vop_default_desc,		(vop_t *) ufs_vnoperatespec },
115 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
116 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
117 	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
118 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
119 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
120 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
121 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
122 	{ NULL, NULL }
123 };
124 static struct vnodeopv_desc ffs_specop_opv_desc =
125 	{ &ffs_specop_p, ffs_specop_entries };
126 
127 vop_t **ffs_fifoop_p;
128 static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
129 	{ &vop_default_desc,		(vop_t *) ufs_vnoperatefifo },
130 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
131 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
132 	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
133 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
134 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
135 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
136 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
137 	{ NULL, NULL }
138 };
139 static struct vnodeopv_desc ffs_fifoop_opv_desc =
140 	{ &ffs_fifoop_p, ffs_fifoop_entries };
141 
142 VNODEOP_SET(ffs_vnodeop_opv_desc);
143 VNODEOP_SET(ffs_specop_opv_desc);
144 VNODEOP_SET(ffs_fifoop_opv_desc);
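
/*
 * The three operation vectors above layer the FFS-specific entry points
 * (fsync, read/write, reallocblks, getpages and the extended attribute
 * operations) over the generic UFS operate routines for regular files,
 * special devices and fifos respectively; anything not listed falls
 * through to ufs_vnoperate()/ufs_vnoperatespec()/ufs_vnoperatefifo()
 * via the vop_default_desc entries.
 */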
145 
146 /*
147  * Synchronize an open file.
148  */
149 /* ARGSUSED */
150 static int
151 ffs_fsync(ap)
152 	struct vop_fsync_args /* {
153 		struct vnode *a_vp;
154 		struct ucred *a_cred;
155 		int a_waitfor;
156 		struct thread *a_td;
157 	} */ *ap;
158 {
159 	struct vnode *vp = ap->a_vp;
160 	struct inode *ip = VTOI(vp);
161 	struct buf *bp;
162 	struct buf *nbp;
163 	int s, error, wait, passes, skipmeta;
164 	ufs_lbn_t lbn;
165 
166 	wait = (ap->a_waitfor == MNT_WAIT);
167 	if (vn_isdisk(vp, NULL)) {
168 		lbn = INT_MAX;
169 		if (vp->v_rdev->si_mountpoint != NULL &&
170 		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
171 			softdep_fsync_mountdev(vp);
172 	} else {
173 		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
174 	}
175 
176 	/*
177 	 * Flush all dirty buffers associated with a vnode.
178 	 */
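	/*
	 * Up to NIADDR + 1 passes may be made over the dirty buffer list
	 * below.  For a synchronous request the first pass skips metadata
	 * buffers (negative b_lblkno), presumably so that the data blocks
	 * are pushed out before the indirect blocks and inode that
	 * reference them.
	 */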
179 	passes = NIADDR + 1;
180 	skipmeta = 0;
181 	if (wait)
182 		skipmeta = 1;
183 	s = splbio();
184 	VI_LOCK(vp);
185 loop:
186 	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
187 		bp->b_vflags &= ~BV_SCANNED;
188 	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
189 		nbp = TAILQ_NEXT(bp, b_vnbufs);
190 		/*
191 		 * Reasons to skip this buffer: it has already been considered
192 		 * on this pass, this pass is the first time through on a
193 		 * synchronous flush request and the buffer being considered
194 		 * is metadata, the buffer has dependencies that will cause
195 		 * it to be redirtied and it has not already been deferred,
196 		 * or it is already being written.
197 		 */
198 		if ((bp->b_vflags & BV_SCANNED) != 0)
199 			continue;
200 		bp->b_vflags |= BV_SCANNED;
201 		if ((skipmeta == 1 && bp->b_lblkno < 0))
202 			continue;
203 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
204 			continue;
205 		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
206 		    (bp->b_flags & B_DEFERRED) == 0 &&
207 		    buf_countdeps(bp, 0)) {
208 			bp->b_flags |= B_DEFERRED;
209 			BUF_UNLOCK(bp);
210 			continue;
211 		}
212 		VI_UNLOCK(vp);
213 		if ((bp->b_flags & B_DELWRI) == 0)
214 			panic("ffs_fsync: not dirty");
215 		if (vp != bp->b_vp)
216 			panic("ffs_fsync: vp != bp->b_vp");
217 		/*
218 		 * If this is a synchronous flush request, or this is not a
219 		 * regular file or block device, start the write immediately.
220 		 */
221 		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
222 
223 			/*
224 			 * On our final pass through, do all I/O synchronously
225 			 * so that we can find out if our flush is failing
226 			 * because of write errors.
227 			 */
228 			if (passes > 0 || !wait) {
229 				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
230 					BUF_UNLOCK(bp);
231 					(void) vfs_bio_awrite(bp);
232 				} else {
233 					bremfree(bp);
234 					splx(s);
235 					(void) bawrite(bp);
236 					s = splbio();
237 				}
238 			} else {
239 				bremfree(bp);
240 				splx(s);
241 				if ((error = bwrite(bp)) != 0)
242 					return (error);
243 				s = splbio();
244 			}
245 		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
246 			/*
247 			 * If the buffer is for data that has been truncated
248 			 * off the file, then throw it away.
249 			 */
250 			bremfree(bp);
251 			bp->b_flags |= B_INVAL | B_NOCACHE;
252 			splx(s);
253 			brelse(bp);
254 			s = splbio();
255 		} else {
256 			BUF_UNLOCK(bp);
257 			vfs_bio_awrite(bp);
258 		}
259 		/*
260 		 * Since we may have slept during the I/O, we need
261 		 * to start from a known point.
262 		 */
263 		VI_LOCK(vp);
264 		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
265 	}
266 	/*
267 	 * If we were asked to do this synchronously, then go back for
268 	 * another pass, this time doing the metadata.
269 	 */
270 	if (skipmeta) {
271 		skipmeta = 0;
272 		goto loop;
273 	}
274 
275 	if (wait) {
276 		while (vp->v_numoutput) {
277 			vp->v_iflag |= VI_BWAIT;
278 			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
279 			    PRIBIO + 4, "ffsfsn", 0);
280   		}
281 		VI_UNLOCK(vp);
282 
283 		/*
284 		 * Ensure that any filesystem metadata associated
285 		 * with the vnode has been written.
286 		 */
287 		splx(s);
288 		if ((error = softdep_sync_metadata(ap)) != 0)
289 			return (error);
290 		s = splbio();
291 
292 		VI_LOCK(vp);
293 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
294 			/*
295 			 * Block devices associated with filesystems may
296 			 * have new I/O requests posted for them even if
297 			 * the vnode is locked, so no amount of trying will
298 			 * get them clean. Thus we give block devices a
299 			 * good effort, then just give up. For all other file
300 			 * types, go around and try again until it is clean.
301 			 */
302 			if (passes > 0) {
303 				passes -= 1;
304 				goto loop;
305 			}
306 #ifdef DIAGNOSTIC
307 			if (!vn_isdisk(vp, NULL))
308 				vprint("ffs_fsync: dirty", vp);
309 #endif
310 		}
311 	}
312 	VI_UNLOCK(vp);
313 	splx(s);
314 	return (UFS_UPDATE(vp, wait));
315 }
316 
317 
318 /*
319  * Vnode op for reading.
320  */
321 /* ARGSUSED */
322 static int
323 ffs_read(ap)
324 	struct vop_read_args /* {
325 		struct vnode *a_vp;
326 		struct uio *a_uio;
327 		int a_ioflag;
328 		struct ucred *a_cred;
329 	} */ *ap;
330 {
331 	struct vnode *vp;
332 	struct inode *ip;
333 	struct uio *uio;
334 	struct fs *fs;
335 	struct buf *bp;
336 	ufs_lbn_t lbn, nextlbn;
337 	off_t bytesinfile;
338 	long size, xfersize, blkoffset;
339 	int error, orig_resid;
340 	mode_t mode;
341 	int seqcount;
342 	int ioflag;
343 	vm_object_t object;
344 
345 	vp = ap->a_vp;
346 	uio = ap->a_uio;
347 	ioflag = ap->a_ioflag;
348 	if (ap->a_ioflag & IO_EXT)
349 #ifdef notyet
350 		return (ffs_extread(vp, uio, ioflag));
351 #else
352 		panic("ffs_read+IO_EXT");
353 #endif
354 
355 	GIANT_REQUIRED;
356 
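	/*
	 * The upper 16 bits of a_ioflag carry the caller's sequential
	 * access heuristic; it is used below to decide how aggressively
	 * to cluster and read ahead.
	 */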
357 	seqcount = ap->a_ioflag >> 16;
358 	ip = VTOI(vp);
359 	mode = ip->i_mode;
360 
361 #ifdef DIAGNOSTIC
362 	if (uio->uio_rw != UIO_READ)
363 		panic("ffs_read: mode");
364 
365 	if (vp->v_type == VLNK) {
366 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
367 			panic("ffs_read: short symlink");
368 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
369 		panic("ffs_read: type %d",  vp->v_type);
370 #endif
371 	fs = ip->i_fs;
372 	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
373 		return (EFBIG);
374 
375 	orig_resid = uio->uio_resid;
376 	if (orig_resid <= 0)
377 		return (0);
378 
379 	object = vp->v_object;
380 
381 	bytesinfile = ip->i_size - uio->uio_offset;
382 	if (bytesinfile <= 0) {
383 		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
384 			ip->i_flag |= IN_ACCESS;
385 		return 0;
386 	}
387 
388 	if (object) {
389 		vm_object_reference(object);
390 	}
391 
392 	/*
393 	 * Loop over the request, transferring at most one filesystem
394 	 * block of data per iteration.
395 	 */
396 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
397 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
398 			break;
399 
400 		lbn = lblkno(fs, uio->uio_offset);
401 		nextlbn = lbn + 1;
402 
403 		/*
404 		 * Size of buffer.  The buffer representing the
405 		 * end of the file is rounded up to the size of
406 		 * the block type (fragment or full block,
407 		 * depending).
408 		 */
409 		size = blksize(fs, ip, lbn);
410 		blkoffset = blkoff(fs, uio->uio_offset);
411 
412 		/*
413 		 * The amount we want to transfer in this iteration is
414 		 * one filesystem block minus the amount of data that
415 		 * precedes our start point within that block.
416 		 */
417 		xfersize = fs->fs_bsize - blkoffset;
418 
419 		/*
420 		 * But if we actually want less than the block,
421 		 * or the file doesn't have a whole block more of data,
422 		 * then use the lesser number.
423 		 */
424 		if (uio->uio_resid < xfersize)
425 			xfersize = uio->uio_resid;
426 		if (bytesinfile < xfersize)
427 			xfersize = bytesinfile;
428 
429 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
430 			/*
431 			 * Don't do readahead if this is the end of the file.
432 			 */
433 			error = bread(vp, lbn, size, NOCRED, &bp);
434 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
435 			/*
436 			 * Otherwise if we are allowed to cluster,
437 			 * grab as much as we can.
438 			 *
439 			 * XXX  This may not be a win if we are not
440 			 * doing sequential access.
441 			 */
442 			error = cluster_read(vp, ip->i_size, lbn,
443 				size, NOCRED, uio->uio_resid, seqcount, &bp);
444 		} else if (seqcount > 1) {
445 			/*
446 			 * If we are NOT allowed to cluster, then
447 			 * if we appear to be acting sequentially,
448 			 * fire off a request for a readahead
449 			 * as well as a read. Note that the 4th and 5th
450 			 * arguments point to arrays of the size specified in
451 			 * the 6th argument.
452 			 */
453 			int nextsize = blksize(fs, ip, nextlbn);
454 			error = breadn(vp, lbn,
455 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
456 		} else {
457 			/*
458 			 * Failing all of the above, just read what the
459 			 * user asked for. Interestingly, the same as
460 			 * the first option above.
461 			 */
462 			error = bread(vp, lbn, size, NOCRED, &bp);
463 		}
464 		if (error) {
465 			brelse(bp);
466 			bp = NULL;
467 			break;
468 		}
469 
470 		/*
471 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
472 		 * will cause us to attempt to release the buffer later on
473 		 * and will cause the buffer cache to attempt to free the
474 		 * underlying pages.
475 		 */
476 		if (ioflag & IO_DIRECT)
477 			bp->b_flags |= B_DIRECT;
478 
479 		/*
480 		 * We should only get non-zero b_resid when an I/O error
481 		 * has occurred, which should cause us to break above.
482 		 * However, if the short read did not cause an error,
483 		 * then we want to ensure that we do not uiomove bad
484 		 * or uninitialized data.
485 		 */
486 		size -= bp->b_resid;
487 		if (size < xfersize) {
488 			if (size == 0)
489 				break;
490 			xfersize = size;
491 		}
492 
493 		error = uiomove((char *)bp->b_data + blkoffset,
494 		    (int)xfersize, uio);
501 
502 		if (error)
503 			break;
504 
505 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
506 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
507 			/*
508 			 * If there are no dependencies, and it's VMIO,
509 			 * then we don't need the buf, mark it available
510 			 * for freeing. The VM has the data.
511 			 */
512 			bp->b_flags |= B_RELBUF;
513 			brelse(bp);
514 		} else {
515 			/*
516 			 * Otherwise let whoever
517 			 * made the request take care of
518 			 * freeing it. We just queue
519 			 * it onto another list.
520 			 */
521 			bqrelse(bp);
522 		}
523 	}
524 
525 	/*
526 	 * This can only happen in the case of an error, because the loop
527 	 * above resets bp to NULL on each iteration and on normal
528 	 * completion leaves no new value in it; a non-NULL bp here must
529 	 * have come from a 'break' out of the loop.
530 	 */
531 	if (bp != NULL) {
532 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
533 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
534 			bp->b_flags |= B_RELBUF;
535 			brelse(bp);
536 		} else {
537 			bqrelse(bp);
538 		}
539 	}
540 
541 	if (object) {
542 		vm_object_vndeallocate(object);
543 	}
544 	if ((error == 0 || uio->uio_resid != orig_resid) &&
545 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
546 		ip->i_flag |= IN_ACCESS;
547 	return (error);
548 }
549 
550 /*
551  * Vnode op for writing.
552  */
553 static int
554 ffs_write(ap)
555 	struct vop_write_args /* {
556 		struct vnode *a_vp;
557 		struct uio *a_uio;
558 		int a_ioflag;
559 		struct ucred *a_cred;
560 	} */ *ap;
561 {
562 	struct vnode *vp;
563 	struct uio *uio;
564 	struct inode *ip;
565 	struct fs *fs;
566 	struct buf *bp;
567 	struct thread *td;
568 	ufs_lbn_t lbn;
569 	off_t osize;
570 	int seqcount;
571 	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
572 	vm_object_t object;
573 
574 	vp = ap->a_vp;
575 	uio = ap->a_uio;
576 	ioflag = ap->a_ioflag;
577 	if (ap->a_ioflag & IO_EXT)
578 #ifdef notyet
579 		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
580 #else
581 		panic("ffs_write+IO_EXT");
582 #endif
583 
584 	GIANT_REQUIRED;
585 
586 	extended = 0;
587 	seqcount = ap->a_ioflag >> 16;
588 	ip = VTOI(vp);
589 
590 	object = vp->v_object;
591 	if (object) {
592 		vm_object_reference(object);
593 	}
594 
595 #ifdef DIAGNOSTIC
596 	if (uio->uio_rw != UIO_WRITE)
597 		panic("ffswrite: mode");
598 #endif
599 
600 	switch (vp->v_type) {
601 	case VREG:
602 		if (ioflag & IO_APPEND)
603 			uio->uio_offset = ip->i_size;
604 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
605 			if (object) {
606 				vm_object_vndeallocate(object);
607 			}
608 			return (EPERM);
609 		}
610 		/* FALLTHROUGH */
611 	case VLNK:
612 		break;
613 	case VDIR:
614 		panic("ffswrite: dir write");
615 		break;
616 	default:
617 		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
618 			(int)uio->uio_offset,
619 			(int)uio->uio_resid
620 		);
621 	}
622 
623 	fs = ip->i_fs;
624 	if (uio->uio_offset < 0 ||
625 	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
626 		if (object) {
627 			vm_object_vndeallocate(object);
628 		}
629 		return (EFBIG);
630 	}
631 	/*
632 	 * Maybe this should be above the vnode op call, but so long as
633 	 * file servers have no limits, I don't think it matters.
634 	 */
635 	td = uio->uio_td;
636 	if (vp->v_type == VREG && td &&
637 	    uio->uio_offset + uio->uio_resid >
638 	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
639 		PROC_LOCK(td->td_proc);
640 		psignal(td->td_proc, SIGXFSZ);
641 		PROC_UNLOCK(td->td_proc);
642 		if (object) {
643 			vm_object_vndeallocate(object);
644 		}
645 		return (EFBIG);
646 	}
647 
648 	resid = uio->uio_resid;
649 	osize = ip->i_size;
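	/*
	 * Encode the (clamped) sequential access hint into the flags
	 * handed to UFS_BALLOC() below, presumably so that the block
	 * allocator can take the access pattern into account.
	 */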
650 	if (seqcount > BA_SEQMAX)
651 		flags = BA_SEQMAX << BA_SEQSHIFT;
652 	else
653 		flags = seqcount << BA_SEQSHIFT;
654 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
655 		flags |= IO_SYNC;
656 
657 	for (error = 0; uio->uio_resid > 0;) {
658 		lbn = lblkno(fs, uio->uio_offset);
659 		blkoffset = blkoff(fs, uio->uio_offset);
660 		xfersize = fs->fs_bsize - blkoffset;
661 		if (uio->uio_resid < xfersize)
662 			xfersize = uio->uio_resid;
663 
664 		if (uio->uio_offset + xfersize > ip->i_size)
665 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
666 
667 		/*
668 		 * We must perform a read-before-write if the transfer size
669 		 * does not cover the entire buffer.
670 		 */
671 		if (fs->fs_bsize > xfersize)
672 			flags |= BA_CLRBUF;
673 		else
674 			flags &= ~BA_CLRBUF;
675 /* XXX is uio->uio_offset the right thing here? */
676 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
677 		    ap->a_cred, flags, &bp);
678 		if (error != 0)
679 			break;
680 		/*
681 		 * If the buffer is not valid we have to clear out any
682 		 * garbage data from the pages instantiated for the buffer.
683 		 * If we do not, a failed uiomove() during a write can leave
684 		 * the prior contents of the pages exposed to a userland
685 		 * mmap().  XXX deal with uiomove() errors a better way.
686 		 */
687 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
688 			vfs_bio_clrbuf(bp);
689 		if (ioflag & IO_DIRECT)
690 			bp->b_flags |= B_DIRECT;
691 		if (ioflag & IO_NOWDRAIN)
692 			bp->b_flags |= B_NOWDRAIN;
693 
694 		if (uio->uio_offset + xfersize > ip->i_size) {
695 			ip->i_size = uio->uio_offset + xfersize;
696 			DIP(ip, i_size) = ip->i_size;
697 			extended = 1;
698 		}
699 
700 		size = blksize(fs, ip, lbn) - bp->b_resid;
701 		if (size < xfersize)
702 			xfersize = size;
703 
704 		error =
705 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
706 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
707 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
708 			bp->b_flags |= B_RELBUF;
709 		}
710 
711 		/*
712 		 * If IO_SYNC, each buffer is written synchronously.  Otherwise,
713 		 * if we have a severe page deficiency, write the buffer
714 		 * asynchronously.  Otherwise try to cluster and, if that
715 		 * doesn't do it, then either do an async write (if O_DIRECT)
716 		 * or a delayed write (if not).
717 		 */
718 		if (ioflag & IO_SYNC) {
719 			(void)bwrite(bp);
720 		} else if (vm_page_count_severe() ||
721 			    buf_dirty_count_severe() ||
722 			    (ioflag & IO_ASYNC)) {
723 			bp->b_flags |= B_CLUSTEROK;
724 			bawrite(bp);
725 		} else if (xfersize + blkoffset == fs->fs_bsize) {
726 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
727 				bp->b_flags |= B_CLUSTEROK;
728 				cluster_write(bp, ip->i_size, seqcount);
729 			} else {
730 				bawrite(bp);
731 			}
732 		} else if (ioflag & IO_DIRECT) {
733 			bp->b_flags |= B_CLUSTEROK;
734 			bawrite(bp);
735 		} else {
736 			bp->b_flags |= B_CLUSTEROK;
737 			bdwrite(bp);
738 		}
739 		if (error || xfersize == 0)
740 			break;
741 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
742 	}
743 	/*
744 	 * If we successfully wrote any data and we are not the superuser,
745 	 * we clear the setuid and setgid bits as a precaution against
746 	 * tampering.
747 	 */
748 	if (resid > uio->uio_resid && ap->a_cred &&
749 	    suser_cred(ap->a_cred, PRISON_ROOT)) {
750 		ip->i_mode &= ~(ISUID | ISGID);
751 		DIP(ip, i_mode) = ip->i_mode;
752 	}
753 	if (resid > uio->uio_resid)
754 		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
755 	if (error) {
756 		if (ioflag & IO_UNIT) {
757 			(void)UFS_TRUNCATE(vp, osize,
758 			    IO_NORMAL | (ioflag & IO_SYNC),
759 			    ap->a_cred, uio->uio_td);
760 			uio->uio_offset -= resid - uio->uio_resid;
761 			uio->uio_resid = resid;
762 		}
763 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
764 		error = UFS_UPDATE(vp, 1);
765 
766 	if (object) {
767 		vm_object_vndeallocate(object);
768 	}
769 
770 	return (error);
771 }
772 
773 /*
774  * Vnode op for getting pages.
775  */
776 static int
777 ffs_getpages(ap)
778 	struct vop_getpages_args *ap;
779 {
780 	off_t foff, physoffset;
781 	int i, size, bsize;
782 	struct vnode *dp, *vp;
783 	vm_object_t obj;
784 	vm_pindex_t pindex, firstindex;
785 	vm_page_t mreq;
786 	int bbackwards, bforwards;
787 	int pbackwards, pforwards;
788 	int firstpage;
789 	ufs2_daddr_t reqblkno, reqlblkno;
790 	int poff;
791 	int pcount;
792 	int rtval;
793 	int pagesperblock;
794 
795 	GIANT_REQUIRED;
796 
797 	pcount = round_page(ap->a_count) / PAGE_SIZE;
798 	mreq = ap->a_m[ap->a_reqpage];
799 	firstindex = ap->a_m[0]->pindex;
800 
801 	/*
802 	 * If ANY DEV_BSIZE blocks are valid on a large filesystem block,
803 	 * then the entire page is valid.  Since the page may be mapped,
804 	 * user programs might reference data beyond the actual end of file
805 	 * occurring within the page.  We have to zero that data.
806 	 */
807 	if (mreq->valid) {
808 		if (mreq->valid != VM_PAGE_BITS_ALL)
809 			vm_page_zero_invalid(mreq, TRUE);
810 		vm_page_lock_queues();
811 		for (i = 0; i < pcount; i++) {
812 			if (i != ap->a_reqpage) {
813 				vm_page_free(ap->a_m[i]);
814 			}
815 		}
816 		vm_page_unlock_queues();
817 		return VM_PAGER_OK;
818 	}
819 
820 	vp = ap->a_vp;
821 	obj = vp->v_object;
822 	bsize = vp->v_mount->mnt_stat.f_iosize;
823 	pindex = mreq->pindex;
824 	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
825 
826 	if (bsize < PAGE_SIZE)
827 		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
828 						    ap->a_count,
829 						    ap->a_reqpage);
830 
831 	/*
832 	 * foff is the file offset of the required page
833 	 * reqlblkno is the logical block that contains the page
834 	 * poff is the index of the page into the logical block
835 	 */
836 	reqlblkno = foff / bsize;
837 	poff = (foff % bsize) / PAGE_SIZE;
838 
839 	dp = VTOI(vp)->i_devvp;
840 	if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
841 	    || (reqblkno == -1)) {
842 		vm_page_lock_queues();
843 		for(i = 0; i < pcount; i++) {
844 			if (i != ap->a_reqpage)
845 				vm_page_free(ap->a_m[i]);
846 		}
847 		vm_page_unlock_queues();
848 		if (reqblkno == -1) {
849 			if ((mreq->flags & PG_ZERO) == 0)
850 				pmap_zero_page(mreq);
851 			vm_page_undirty(mreq);
852 			mreq->valid = VM_PAGE_BITS_ALL;
853 			return VM_PAGER_OK;
854 		} else {
855 			return VM_PAGER_ERROR;
856 		}
857 	}
858 
859 	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
860 	pagesperblock = bsize / PAGE_SIZE;
861 	/*
862 	 * find the first page that is contiguous...
863 	 * note that pbackwards is the number of pages that are contiguous
864 	 * backwards.
865 	 */
866 	firstpage = 0;
867 	if (ap->a_count) {
868 		pbackwards = poff + bbackwards * pagesperblock;
869 		if (ap->a_reqpage > pbackwards) {
870 			firstpage = ap->a_reqpage - pbackwards;
871 			vm_page_lock_queues();
872 			for(i=0;i<firstpage;i++)
873 				vm_page_free(ap->a_m[i]);
874 			vm_page_unlock_queues();
875 		}
876 
877 	/*
878 	 * pforwards is the number of pages that are contiguous
879 	 * after the current page.
880 	 */
881 		pforwards = (pagesperblock - (poff + 1)) +
882 			bforwards * pagesperblock;
883 		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
884 			vm_page_lock_queues();
885 			for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
886 				vm_page_free(ap->a_m[i]);
887 			vm_page_unlock_queues();
888 			pcount = ap->a_reqpage + pforwards + 1;
889 		}
890 
891 	/*
892 	 * number of pages for I/O corrected for the non-contig pages at
893 	 * the beginning of the array.
894 	 */
895 		pcount -= firstpage;
896 	}
897 
898 	/*
899 	 * calculate the size of the transfer
900 	 */
901 
902 	size = pcount * PAGE_SIZE;
903 
904 	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
905 		obj->un_pager.vnp.vnp_size)
906 		size = obj->un_pager.vnp.vnp_size -
907 			IDX_TO_OFF(ap->a_m[firstpage]->pindex);
908 
909 	physoffset -= foff;
910 	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
911 		(ap->a_reqpage - firstpage), physoffset);
912 
913 	return (rtval);
914 }
915 
916 /*
917  * Extended attribute area reading.
918  */
919 static int
920 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
921 {
922 	struct inode *ip;
923 	struct ufs2_dinode *dp;
924 	struct fs *fs;
925 	struct buf *bp;
926 	ufs_lbn_t lbn, nextlbn;
927 	off_t bytesinfile;
928 	long size, xfersize, blkoffset;
929 	int error, orig_resid;
930 	mode_t mode;
931 
932 	GIANT_REQUIRED;
933 
934 	ip = VTOI(vp);
935 	fs = ip->i_fs;
936 	dp = ip->i_din2;
937 	mode = ip->i_mode;
938 
939 #ifdef DIAGNOSTIC
940 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
941 		panic("ffs_extread: mode");
942 
943 #endif
944 	orig_resid = uio->uio_resid;
945 	if (orig_resid <= 0)
946 		return (0);
947 
948 	bytesinfile = dp->di_extsize - uio->uio_offset;
949 	if (bytesinfile <= 0) {
950 		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
951 			ip->i_flag |= IN_ACCESS;
952 		return 0;
953 	}
954 
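	/*
	 * The external attribute area is addressed with negative logical
	 * block numbers: block lbn of the area is read as (-1 - lbn).
	 * ffsext_strategy() uses the same convention (lbn < 0 &&
	 * lbn >= -NXADDR) to tell EA I/O apart from ordinary data.
	 */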
955 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
956 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
957 			break;
958 
959 		lbn = lblkno(fs, uio->uio_offset);
960 		nextlbn = lbn + 1;
961 
962 		/*
963 		 * Size of buffer.  The buffer representing the
964 		 * end of the file is rounded up to the size of
965 		 * the block type (fragment or full block,
966 		 * depending).
967 		 */
968 		size = sblksize(fs, dp->di_extsize, lbn);
969 		blkoffset = blkoff(fs, uio->uio_offset);
970 
971 		/*
972 		 * The amount we want to transfer in this iteration is
973 		 * one filesystem block minus the amount of data that
974 		 * precedes our start point within that block.
975 		 */
976 		xfersize = fs->fs_bsize - blkoffset;
977 
978 		/*
979 		 * But if we actually want less than the block,
980 		 * or the file doesn't have a whole block more of data,
981 		 * then use the lesser number.
982 		 */
983 		if (uio->uio_resid < xfersize)
984 			xfersize = uio->uio_resid;
985 		if (bytesinfile < xfersize)
986 			xfersize = bytesinfile;
987 
988 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
989 			/*
990 			 * Don't do readahead if this is the end of the EA area.
991 			 */
992 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
993 		} else {
994 			/*
995 			 * If we have a second block, then
996 			 * fire off a request for a readahead
997 			 * as well as a read. Note that the 4th and 5th
998 			 * arguments point to arrays of the size specified in
999 			 * the 6th argument.
1000 			 */
1001 			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1002 
1003 			nextlbn = -1 - nextlbn;
1004 			error = breadn(vp, -1 - lbn,
1005 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1006 		}
1007 		if (error) {
1008 			brelse(bp);
1009 			bp = NULL;
1010 			break;
1011 		}
1012 
1013 		/*
1014 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
1015 		 * will cause us to attempt to release the buffer later on
1016 		 * and will cause the buffer cache to attempt to free the
1017 		 * underlying pages.
1018 		 */
1019 		if (ioflag & IO_DIRECT)
1020 			bp->b_flags |= B_DIRECT;
1021 
1022 		/*
1023 		 * We should only get non-zero b_resid when an I/O error
1024 		 * has occurred, which should cause us to break above.
1025 		 * However, if the short read did not cause an error,
1026 		 * then we want to ensure that we do not uiomove bad
1027 		 * or uninitialized data.
1028 		 */
1029 		size -= bp->b_resid;
1030 		if (size < xfersize) {
1031 			if (size == 0)
1032 				break;
1033 			xfersize = size;
1034 		}
1035 
1036 		error = uiomove((char *)bp->b_data + blkoffset,
1037 					(int)xfersize, uio);
1038 		if (error)
1039 			break;
1040 
1041 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1042 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1043 			/*
1044 			 * If there are no dependencies, and it's VMIO,
1045 			 * then we don't need the buf, mark it available
1046 			 * for freeing. The VM has the data.
1047 			 */
1048 			bp->b_flags |= B_RELBUF;
1049 			brelse(bp);
1050 		} else {
1051 			/*
1052 			 * Otherwise let whoever
1053 			 * made the request take care of
1054 			 * freeing it. We just queue
1055 			 * it onto another list.
1056 			 */
1057 			bqrelse(bp);
1058 		}
1059 	}
1060 
1061 	/*
1062 	 * This can only happen in the case of an error, because the loop
1063 	 * above resets bp to NULL on each iteration and on normal
1064 	 * completion leaves no new value in it; a non-NULL bp here must
1065 	 * have come from a 'break' out of the loop.
1066 	 */
1067 	if (bp != NULL) {
1068 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1069 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1070 			bp->b_flags |= B_RELBUF;
1071 			brelse(bp);
1072 		} else {
1073 			bqrelse(bp);
1074 		}
1075 	}
1076 
1077 	if ((error == 0 || uio->uio_resid != orig_resid) &&
1078 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1079 		ip->i_flag |= IN_ACCESS;
1080 	return (error);
1081 }
1082 
1083 /*
1084  * Extended attribute area writing.
1085  */
1086 static int
1087 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1088 {
1089 	struct inode *ip;
1090 	struct ufs2_dinode *dp;
1091 	struct fs *fs;
1092 	struct buf *bp;
1093 	ufs_lbn_t lbn;
1094 	off_t osize;
1095 	int blkoffset, error, flags, resid, size, xfersize;
1096 
1097 	GIANT_REQUIRED;
1098 
1099 	ip = VTOI(vp);
1100 	fs = ip->i_fs;
1101 	dp = ip->i_din2;
1102 
1103 #ifdef DIAGNOSTIC
1104 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1105 		panic("ext_write: mode");
1106 #endif
1107 
1108 	if (ioflag & IO_APPEND)
1109 		uio->uio_offset = dp->di_extsize;
1110 
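	/*
	 * The extended attribute area lives in the NXADDR external block
	 * pointers of the UFS2 dinode, so writes that would grow it past
	 * NXADDR * fs_bsize are rejected with EFBIG below.
	 */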
1111 	if (uio->uio_offset < 0 ||
1112 	    (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1113 		return (EFBIG);
1114 
1115 	resid = uio->uio_resid;
1116 	osize = dp->di_extsize;
1117 	flags = IO_EXT;
1118 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1119 		flags |= IO_SYNC;
1120 
1121 	for (error = 0; uio->uio_resid > 0;) {
1122 		lbn = lblkno(fs, uio->uio_offset);
1123 		blkoffset = blkoff(fs, uio->uio_offset);
1124 		xfersize = fs->fs_bsize - blkoffset;
1125 		if (uio->uio_resid < xfersize)
1126 			xfersize = uio->uio_resid;
1127 
1128 		/*
1129 		 * We must perform a read-before-write if the transfer size
1130 		 * does not cover the entire buffer.
1131 		 */
1132 		if (fs->fs_bsize > xfersize)
1133 			flags |= BA_CLRBUF;
1134 		else
1135 			flags &= ~BA_CLRBUF;
1136 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1137 		    ucred, flags, &bp);
1138 		if (error != 0)
1139 			break;
1140 		/*
1141 		 * If the buffer is not valid we have to clear out any
1142 		 * garbage data from the pages instantiated for the buffer.
1143 		 * If we do not, a failed uiomove() during a write can leave
1144 		 * the prior contents of the pages exposed to a userland
1145 		 * mmap().  XXX deal with uiomove() errors a better way.
1146 		 */
1147 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1148 			vfs_bio_clrbuf(bp);
1149 		if (ioflag & IO_DIRECT)
1150 			bp->b_flags |= B_DIRECT;
1151 		if (ioflag & IO_NOWDRAIN)
1152 			bp->b_flags |= B_NOWDRAIN;
1153 
1154 		if (uio->uio_offset + xfersize > dp->di_extsize)
1155 			dp->di_extsize = uio->uio_offset + xfersize;
1156 
1157 		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1158 		if (size < xfersize)
1159 			xfersize = size;
1160 
1161 		error =
1162 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1163 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1164 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1165 			bp->b_flags |= B_RELBUF;
1166 		}
1167 
1168 		/*
1169 		 * If IO_SYNC, each buffer is written synchronously.  Otherwise,
1170 		 * if we have a severe page deficiency, write the buffer
1171 		 * asynchronously.  Otherwise try to cluster and, if that
1172 		 * doesn't do it, then either do an async write (if O_DIRECT)
1173 		 * or a delayed write (if not).
1174 		 */
1175 		if (ioflag & IO_SYNC) {
1176 			(void)bwrite(bp);
1177 		} else if (vm_page_count_severe() ||
1178 			    buf_dirty_count_severe() ||
1179 			    xfersize + blkoffset == fs->fs_bsize ||
1180 			    (ioflag & (IO_ASYNC | IO_DIRECT)))
1181 			bawrite(bp);
1182 		else
1183 			bdwrite(bp);
1184 		if (error || xfersize == 0)
1185 			break;
1186 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1187 	}
1188 	/*
1189 	 * If we successfully wrote any data and we are not the superuser,
1190 	 * we clear the setuid and setgid bits as a precaution against
1191 	 * tampering.
1192 	 */
1193 	if (resid > uio->uio_resid && ucred &&
1194 	    suser_cred(ucred, PRISON_ROOT)) {
1195 		ip->i_mode &= ~(ISUID | ISGID);
1196 		dp->di_mode = ip->i_mode;
1197 	}
1198 	if (error) {
1199 		if (ioflag & IO_UNIT) {
1200 			(void)UFS_TRUNCATE(vp, osize,
1201 			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1202 			uio->uio_offset -= resid - uio->uio_resid;
1203 			uio->uio_resid = resid;
1204 		}
1205 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1206 		error = UFS_UPDATE(vp, 1);
1207 	return (error);
1208 }
1209 
1210 
1211 /*
1212  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1213  * the length of the EA, and possibly the pointer to the entry and to the data.
1214  */
1217 static int
1218 ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1219 {
1220 	u_char *p, *pe, *pn, *p0;
1221 	int eapad1, eapad2, ealength, ealen, nlen;
1222 	uint32_t ul;
1223 
1224 	pe = ptr + length;
1225 	nlen = strlen(name);
1226 
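	/*
	 * Each EA record in the area, as parsed below (and as built by
	 * ffs_setextattr()), is laid out as:
	 *
	 *	uint32_t  total record length, including all padding
	 *	u_char    attribute namespace
	 *	u_char    number of pad bytes after the content (eapad2)
	 *	u_char    length of the attribute name
	 *	          name, then eapad1 pad bytes to an 8-byte boundary
	 *	          content, then eapad2 pad bytes
	 */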
1227 	for (p = ptr; p < pe; p = pn) {
1228 		p0 = p;
1229 		bcopy(p, &ul, sizeof(ul));
1230 		pn = p + ul;
1231 		/* make sure this entry is complete */
1232 		if (pn > pe)
1233 			break;
1234 		p += sizeof(uint32_t);
1235 		if (*p != nspace)
1236 			continue;
1237 		p++;
1238 		eapad2 = *p++;
1239 		if (*p != nlen)
1240 			continue;
1241 		p++;
1242 		if (bcmp(p, name, nlen))
1243 			continue;
1244 		ealength = sizeof(uint32_t) + 3 + nlen;
1245 		eapad1 = 8 - (ealength % 8);
1246 		if (eapad1 == 8)
1247 			eapad1 = 0;
1248 		ealength += eapad1;
1249 		ealen = ul - ealength - eapad2;
1250 		p += nlen + eapad1;
1251 		if (eap != NULL)
1252 			*eap = p0;
1253 		if (eac != NULL)
1254 			*eac = p;
1255 		return (ealen);
1256 	}
1257 	return(-1);
1258 }
1259 
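/*
 * Read the inode's entire extended attribute area into a malloc(9)'ed
 * buffer ('extra' additional bytes are allocated beyond the current EA
 * size) and return it through *p; the caller owns the buffer and is
 * responsible for freeing it.
 */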
1260 static int
1261 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1262 {
1263 	struct inode *ip;
1264 	struct fs *fs;
1265 	struct ufs2_dinode *dp;
1266 	struct uio luio;
1267 	struct iovec liovec;
1268 	int easize, error;
1269 	u_char *eae;
1270 
1271 	ip = VTOI(vp);
1272 	fs = ip->i_fs;
1273 	dp = ip->i_din2;
1274 	easize = dp->di_extsize;
1275 
1276 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1277 
1278 	liovec.iov_base = eae;
1279 	liovec.iov_len = easize;
1280 	luio.uio_iov = &liovec;
1281 	luio.uio_iovcnt = 1;
1282 	luio.uio_offset = 0;
1283 	luio.uio_resid = easize;
1284 	luio.uio_segflg = UIO_SYSSPACE;
1285 	luio.uio_rw = UIO_READ;
1286 	luio.uio_td = td;
1287 
1288 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1289 	if (error) {
1290 		free(eae, M_TEMP);
1291 		return(error);
1292 	}
1293 	*p = eae;
1294 	return (0);
1295 }
1296 
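/*
 * Vnode extattr transaction start: snapshot the on-disk extended
 * attribute area into ip->i_ea_area / ip->i_ea_len so that subsequent
 * get/set operations work on the in-memory copy.  Returns EBUSY if a
 * transaction is already open on this inode.
 */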
1297 static int
1298 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1299 {
1300 	struct inode *ip;
1301 	struct fs *fs;
1302 	struct ufs2_dinode *dp;
1303 	int error;
1304 
1305 	ip = VTOI(vp);
1306 	fs = ip->i_fs;
1307 
1308 	if (ip->i_ea_area != NULL)
1309 		return (EBUSY);
1310 	dp = ip->i_din2;
1311 	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1312 	if (error)
1313 		return (error);
1314 	ip->i_ea_len = dp->di_extsize;
1315 	ip->i_ea_error = 0;
1316 	return (0);
1317 }
1318 
1319 /*
1320  * Vnode extattr transaction commit/abort
1321  */
1322 static int
1323 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1324 {
1325 	struct inode *ip;
1326 	struct fs *fs;
1327 	struct uio luio;
1328 	struct iovec liovec;
1329 	int error;
1330 	struct ufs2_dinode *dp;
1331 
1332 	ip = VTOI(vp);
1333 	fs = ip->i_fs;
1334 	if (ip->i_ea_area == NULL)
1335 		return (EINVAL);
1336 	dp = ip->i_din2;
1337 	error = ip->i_ea_error;
1338 	if (commit && error == 0) {
1339 		if (cred == NOCRED)
1340 			cred =  vp->v_mount->mnt_cred;
1341 		liovec.iov_base = ip->i_ea_area;
1342 		liovec.iov_len = ip->i_ea_len;
1343 		luio.uio_iov = &liovec;
1344 		luio.uio_iovcnt = 1;
1345 		luio.uio_offset = 0;
1346 		luio.uio_resid = ip->i_ea_len;
1347 		luio.uio_segflg = UIO_SYSSPACE;
1348 		luio.uio_rw = UIO_WRITE;
1349 		luio.uio_td = td;
1350 		/* XXX: I'm not happy about truncating to zero size */
1351 		if (ip->i_ea_len < dp->di_extsize)
1352 			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1353 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1354 	}
1355 	free(ip->i_ea_area, M_TEMP);
1356 	ip->i_ea_area = NULL;
1357 	ip->i_ea_len = 0;
1358 	ip->i_ea_error = 0;
1359 	return (error);
1360 }
1361 
1362 /*
1363  * Vnode extattr strategy routine for special devices and fifos.
1364  *
1365  * We need to check for a read or write of the external attributes.
1366  * Otherwise we just fall through and do the usual thing.
1367  */
1368 static int
1369 ffsext_strategy(struct vop_strategy_args *ap)
1370 /*
1371 struct vop_strategy_args {
1372 	struct vnodeop_desc *a_desc;
1373 	struct vnode *a_vp;
1374 	struct buf *a_bp;
1375 };
1376 */
1377 {
1378 	struct vnode *vp;
1379 	daddr_t lbn;
1380 
1381 	vp = ap->a_vp;
1382 	lbn = ap->a_bp->b_lblkno;
1383 	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1384 	    lbn < 0 && lbn >= -NXADDR)
1385 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1386 	if (vp->v_type == VFIFO)
1387 		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1388 	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1389 }
1390 
1391 /*
1392  * Vnode extattr transaction start
1393  */
1394 static int
1395 ffs_openextattr(struct vop_openextattr_args *ap)
1396 /*
1397 struct vop_openextattr_args {
1398 	struct vnodeop_desc *a_desc;
1399 	struct vnode *a_vp;
1400 	IN struct ucred *a_cred;
1401 	IN struct thread *a_td;
1402 };
1403 */
1404 {
1405 	struct inode *ip;
1406 	struct fs *fs;
1407 
1408 	ip = VTOI(ap->a_vp);
1409 	fs = ip->i_fs;
1410 	if (fs->fs_magic == FS_UFS1_MAGIC)
1411 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1412 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1413 }
1414 
1415 
1416 /*
1417  * Vnode extattr transaction commit/abort
1418  */
1419 static int
1420 ffs_closeextattr(struct vop_closeextattr_args *ap)
1421 /*
1422 struct vop_closeextattr_args {
1423 	struct vnodeop_desc *a_desc;
1424 	struct vnode *a_vp;
1425 	int a_commit;
1426 	IN struct ucred *a_cred;
1427 	IN struct thread *a_td;
1428 };
1429 */
1430 {
1431 	struct inode *ip;
1432 	struct fs *fs;
1433 
1434 	ip = VTOI(ap->a_vp);
1435 	fs = ip->i_fs;
1436 	if (fs->fs_magic == FS_UFS1_MAGIC)
1437 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1438 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1439 }
1440 
1441 
1442 
1443 /*
1444  * Vnode operation to retrieve a named extended attribute.
1445  */
1446 static int
1447 ffs_getextattr(struct vop_getextattr_args *ap)
1448 /*
1449 vop_getextattr {
1450 	IN struct vnode *a_vp;
1451 	IN int a_attrnamespace;
1452 	IN const char *a_name;
1453 	INOUT struct uio *a_uio;
1454 	OUT size_t *a_size;
1455 	IN struct ucred *a_cred;
1456 	IN struct thread *a_td;
1457 };
1458 */
1459 {
1460 	struct inode *ip;
1461 	struct fs *fs;
1462 	u_char *eae, *p, *pe, *pn;
1463 	struct ufs2_dinode *dp;
1464 	unsigned easize;
1465 	uint32_t ul;
1466 	int error, ealen, stand_alone;
1467 
1468 	ip = VTOI(ap->a_vp);
1469 	fs = ip->i_fs;
1470 
1471 	if (fs->fs_magic == FS_UFS1_MAGIC)
1472 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1473 
1474 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1475 	    ap->a_cred, ap->a_td, IREAD);
1476 	if (error)
1477 		return (error);
1478 
1479 	if (ip->i_ea_area == NULL) {
1480 		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1481 		if (error)
1482 			return (error);
1483 		stand_alone = 1;
1484 	} else {
1485 		stand_alone = 0;
1486 	}
1487 	dp = ip->i_din2;
1488 	eae = ip->i_ea_area;
1489 	easize = ip->i_ea_len;
1490 	if (strlen(ap->a_name) > 0) {
1491 		ealen = ffs_findextattr(eae, easize,
1492 		    ap->a_attrnamespace, ap->a_name, NULL, &p);
1493 		if (ealen >= 0) {
1494 			error = 0;
1495 			if (ap->a_size != NULL)
1496 				*ap->a_size = ealen;
1497 			else if (ap->a_uio != NULL)
1498 				error = uiomove(p, ealen, ap->a_uio);
1499 		} else {
1500 			error = ENOATTR;
1501 		}
1502 	} else {
1503 		error = 0;
1504 		if (ap->a_size != NULL)
1505 			*ap->a_size = 0;
1506 		pe = eae + easize;
1507 		for(p = eae; error == 0 && p < pe; p = pn) {
1508 			bcopy(p, &ul, sizeof(ul));
1509 			pn = p + ul;
1510 			if (pn > pe)
1511 				break;
1512 			p += sizeof(ul);
1513 			if (*p++ != ap->a_attrnamespace)
1514 				continue;
1515 			p++;	/* pad2 */
1516 			ealen = *p;
1517 			if (ap->a_size != NULL) {
1518 				*ap->a_size += ealen + 1;
1519 			} else if (ap->a_uio != NULL) {
1520 				error = uiomove(p, ealen + 1, ap->a_uio);
1521 			}
1522 		}
1523 	}
1524 	if (stand_alone)
1525 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1526 	return(error);
1527 }
1528 
1529 /*
1530  * Vnode operation to set a named extended attribute.
1531  */
1532 static int
1533 ffs_setextattr(struct vop_setextattr_args *ap)
1534 /*
1535 vop_setextattr {
1536 	IN struct vnode *a_vp;
1537 	IN int a_attrnamespace;
1538 	IN const char *a_name;
1539 	INOUT struct uio *a_uio;
1540 	IN struct ucred *a_cred;
1541 	IN struct thread *a_td;
1542 };
1543 */
1544 {
1545 	struct inode *ip;
1546 	struct fs *fs;
1547 	uint32_t ealength, ul;
1548 	int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
1549 	u_char *eae, *p;
1550 	struct ufs2_dinode *dp;
1551 	struct ucred *cred;
1552 	int stand_alone;
1553 
1554 	ip = VTOI(ap->a_vp);
1555 	fs = ip->i_fs;
1556 
1557 	if (fs->fs_magic == FS_UFS1_MAGIC)
1558 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1559 
1560 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1561 	    ap->a_cred, ap->a_td, IWRITE);
1562 	if (error) {
1563 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1564 			ip->i_ea_error = error;
1565 		return (error);
1566 	}
1567 
1568 	if (ap->a_cred != NOCRED)
1569 		cred = ap->a_cred;
1570 	else
1571 		cred = ap->a_vp->v_mount->mnt_cred;
1572 
1573 	dp = ip->i_din2;
1574 
1575 	if (ip->i_ea_area == NULL) {
1576 		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1577 		if (error)
1578 			return (error);
1579 		stand_alone = 1;
1580 	} else {
1581 		stand_alone = 0;
1582 	}
1583 
1584 	/* Calculate the length of the EA entry */
1585 	if (ap->a_uio == NULL) {
1586 		/* delete */
1587 		ealength = eapad1 = ealen = eapad2 = eacont = 0;
1588 	} else {
1589 		ealen = ap->a_uio->uio_resid;
1590 		ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1591 		eapad1 = 8 - (ealength % 8);
1592 		if (eapad1 == 8)
1593 			eapad1 = 0;
1594 		eacont = ealength + eapad1;
1595 		eapad2 = 8 - (ealen % 8);
1596 		if (eapad2 == 8)
1597 			eapad2 = 0;
1598 		ealength += eapad1 + ealen + eapad2;
1599 	}
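	/*
	 * ealength now covers the whole record: the 4-byte length word,
	 * one byte each for namespace, content pad length and name length,
	 * the name padded to an 8-byte boundary (eapad1), and the content
	 * padded to an 8-byte boundary (eapad2); eacont is the offset of
	 * the content within the record.
	 */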
1600 
1601 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1602 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1603 	easize = ip->i_ea_len;
1604 
1605 	olen = ffs_findextattr(eae, easize,
1606 	    ap->a_attrnamespace, ap->a_name, &p, NULL);
1607 	if (olen == -1 && ealength == 0) {
1608 		/* delete but nonexistent */
1609 		free(eae, M_TEMP);
1610 		if (stand_alone)
1611 			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1612 		return(ENOATTR);
1613 	}
1614 	if (olen == -1) {
1615 		/* new, append at end */
1616 		p = eae + easize;
1617 		easize += ealength;
1618 	} else {
1619 		bcopy(p, &ul, sizeof ul);
1620 		i = p - eae + ul;
1621 		if (ul != ealength) {
1622 			bcopy(p + ul, p + ealength, easize - i);
1623 			easize += (ealength - ul);
1624 		}
1625 	}
1626 	if (easize > NXADDR * fs->fs_bsize) {
1627 		free(eae, M_TEMP);
1628 		if (stand_alone)
1629 			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1630 		else if (ip->i_ea_error == 0)
1631 			ip->i_ea_error = ENOSPC;
1632 		return(ENOSPC);
1633 	}
1634 	if (ealength != 0) {
1635 		bcopy(&ealength, p, sizeof(ealength));
1636 		p += sizeof(ealength);
1637 		*p++ = ap->a_attrnamespace;
1638 		*p++ = eapad2;
1639 		*p++ = strlen(ap->a_name);
1640 		strcpy(p, ap->a_name);
1641 		p += strlen(ap->a_name);
1642 		bzero(p, eapad1);
1643 		p += eapad1;
1644 		error = uiomove(p, ealen, ap->a_uio);
1645 		if (error) {
1646 			free(eae, M_TEMP);
1647 			if (stand_alone)
1648 				ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1649 			else if (ip->i_ea_error == 0)
1650 				ip->i_ea_error = error;
1651 			return(error);
1652 		}
1653 		p += ealen;
1654 		bzero(p, eapad2);
1655 	}
1656 	p = ip->i_ea_area;
1657 	ip->i_ea_area = eae;
1658 	ip->i_ea_len = easize;
1659 	free(p, M_TEMP);
1660 	if (stand_alone)
1661 		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1662 	return(error);
1663 }
1664