xref: /freebsd/sys/ufs/ffs/ffs_vnops.c (revision f9218d3d4fd34f082473b3a021c6d4d109fb47cf)
1 /*
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Copyright (c) 1982, 1986, 1989, 1993
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
43  * $FreeBSD$
44  */
45 
46 #include <sys/param.h>
47 #include <sys/bio.h>
48 #include <sys/systm.h>
49 #include <sys/buf.h>
50 #include <sys/conf.h>
51 #include <sys/extattr.h>
52 #include <sys/kernel.h>
53 #include <sys/malloc.h>
54 #include <sys/mount.h>
55 #include <sys/proc.h>
56 #include <sys/resourcevar.h>
57 #include <sys/signalvar.h>
58 #include <sys/stat.h>
59 #include <sys/vmmeter.h>
60 #include <sys/vnode.h>
61 
62 #include <machine/limits.h>
63 
64 #include <vm/vm.h>
65 #include <vm/vm_extern.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vnode_pager.h>
70 
71 #include <ufs/ufs/extattr.h>
72 #include <ufs/ufs/quota.h>
73 #include <ufs/ufs/inode.h>
74 #include <ufs/ufs/ufs_extern.h>
75 #include <ufs/ufs/ufsmount.h>
76 
77 #include <ufs/ffs/fs.h>
78 #include <ufs/ffs/ffs_extern.h>
79 
80 static int	ffs_fsync(struct vop_fsync_args *);
81 static int	ffs_getpages(struct vop_getpages_args *);
82 static int	ffs_read(struct vop_read_args *);
83 static int	ffs_write(struct vop_write_args *);
84 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
85 static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
86 		    struct ucred *cred);
87 static int	ffsext_strategy(struct vop_strategy_args *);
88 static int	ffs_closeextattr(struct vop_closeextattr_args *);
89 static int	ffs_getextattr(struct vop_getextattr_args *);
90 static int	ffs_openextattr(struct vop_openextattr_args *);
91 static int	ffs_setextattr(struct vop_setextattr_args *);
92 
93 
94 /* Global vfs data structures for ufs. */
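/*
 * Note: entries not listed in a table below fall through to the default
 * handler named in its vop_default_desc slot (ufs_vnoperate,
 * ufs_vnoperatespec or ufs_vnoperatefifo).
 */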
95 vop_t **ffs_vnodeop_p;
96 static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
97 	{ &vop_default_desc,		(vop_t *) ufs_vnoperate },
98 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
99 	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
100 	{ &vop_read_desc,		(vop_t *) ffs_read },
101 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
102 	{ &vop_write_desc,		(vop_t *) ffs_write },
103 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
104 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
105 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
106 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
107 	{ NULL, NULL }
108 };
109 static struct vnodeopv_desc ffs_vnodeop_opv_desc =
110 	{ &ffs_vnodeop_p, ffs_vnodeop_entries };
111 
112 vop_t **ffs_specop_p;
113 static struct vnodeopv_entry_desc ffs_specop_entries[] = {
114 	{ &vop_default_desc,		(vop_t *) ufs_vnoperatespec },
115 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
116 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
117 	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
118 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
119 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
120 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
121 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
122 	{ NULL, NULL }
123 };
124 static struct vnodeopv_desc ffs_specop_opv_desc =
125 	{ &ffs_specop_p, ffs_specop_entries };
126 
127 vop_t **ffs_fifoop_p;
128 static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
129 	{ &vop_default_desc,		(vop_t *) ufs_vnoperatefifo },
130 	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
131 	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
132 	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
133 	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
134 	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
135 	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
136 	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
137 	{ NULL, NULL }
138 };
139 static struct vnodeopv_desc ffs_fifoop_opv_desc =
140 	{ &ffs_fifoop_p, ffs_fifoop_entries };
141 
142 VNODEOP_SET(ffs_vnodeop_opv_desc);
143 VNODEOP_SET(ffs_specop_opv_desc);
144 VNODEOP_SET(ffs_fifoop_opv_desc);
145 
146 /*
147  * Synch an open file.
148  */
149 /* ARGSUSED */
150 static int
151 ffs_fsync(ap)
152 	struct vop_fsync_args /* {
153 		struct vnode *a_vp;
154 		struct ucred *a_cred;
155 		int a_waitfor;
156 		struct thread *a_td;
157 	} */ *ap;
158 {
159 	struct vnode *vp = ap->a_vp;
160 	struct inode *ip = VTOI(vp);
161 	struct buf *bp;
162 	struct buf *nbp;
163 	int s, error, wait, passes, skipmeta;
164 	ufs_lbn_t lbn;
165 
166 	wait = (ap->a_waitfor == MNT_WAIT);
167 	if (vn_isdisk(vp, NULL)) {
168 		lbn = INT_MAX;
169 		if (vp->v_rdev->si_mountpoint != NULL &&
170 		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
171 			softdep_fsync_mountdev(vp);
172 	} else {
173 		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
174 	}
175 
176 	/*
177 	 * Flush all dirty buffers associated with a vnode.
178 	 */
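	/*
	 * For a synchronous request (MNT_WAIT) the first sweep skips the
	 * metadata buffers (those with negative logical block numbers) so
	 * that file data goes out first; skipmeta is then cleared and the
	 * metadata is flushed on a later sweep.  passes bounds how many
	 * extra sweeps are made for buffers that stay dirty (e.g. on block
	 * devices) before we give up and fall through to the inode update.
	 */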
179 	passes = NIADDR + 1;
180 	skipmeta = 0;
181 	if (wait)
182 		skipmeta = 1;
183 	s = splbio();
184 	VI_LOCK(vp);
185 loop:
186 	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
187 		bp->b_vflags &= ~BV_SCANNED;
188 	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
189 		nbp = TAILQ_NEXT(bp, b_vnbufs);
190 		/*
191 		 * Reasons to skip this buffer: it has already been considered
192 		 * on this pass, this pass is the first time through on a
193 		 * synchronous flush request and the buffer being considered
194 		 * is metadata, the buffer has dependencies that will cause
195 		 * it to be redirtied and it has not already been deferred,
196 		 * or it is already being written.
197 		 */
198 		if ((bp->b_vflags & BV_SCANNED) != 0)
199 			continue;
200 		bp->b_vflags |= BV_SCANNED;
201 		if ((skipmeta == 1 && bp->b_lblkno < 0))
202 			continue;
203 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
204 			continue;
205 		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
206 		    (bp->b_flags & B_DEFERRED) == 0 &&
207 		    buf_countdeps(bp, 0)) {
208 			bp->b_flags |= B_DEFERRED;
209 			BUF_UNLOCK(bp);
210 			continue;
211 		}
212 		VI_UNLOCK(vp);
213 		if ((bp->b_flags & B_DELWRI) == 0)
214 			panic("ffs_fsync: not dirty");
215 		if (vp != bp->b_vp)
216 			panic("ffs_fsync: vp != vp->b_vp");
217 		/*
218 		 * If this is a synchronous flush request, or it is not a
219 		 * file or device, start the write on this buffer immediately.
220 		 */
221 		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
222 
223 			/*
224 			 * On our final pass through, do all I/O synchronously
225 			 * so that we can find out if our flush is failing
226 			 * because of write errors.
227 			 */
228 			if (passes > 0 || !wait) {
229 				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
230 					BUF_UNLOCK(bp);
231 					(void) vfs_bio_awrite(bp);
232 				} else {
233 					bremfree(bp);
234 					splx(s);
235 					(void) bawrite(bp);
236 					s = splbio();
237 				}
238 			} else {
239 				bremfree(bp);
240 				splx(s);
241 				if ((error = bwrite(bp)) != 0)
242 					return (error);
243 				s = splbio();
244 			}
245 		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
246 			/*
247 			 * If the buffer is for data that has been truncated
248 			 * off the file, then throw it away.
249 			 */
250 			bremfree(bp);
251 			bp->b_flags |= B_INVAL | B_NOCACHE;
252 			splx(s);
253 			brelse(bp);
254 			s = splbio();
255 		} else {
256 			BUF_UNLOCK(bp);
257 			vfs_bio_awrite(bp);
258 		}
259 		/*
260 		 * Since we may have slept during the I/O, we need
261 		 * to start from a known point.
262 		 */
263 		VI_LOCK(vp);
264 		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
265 	}
266 	/*
267 	 * If we were asked to do this synchronously, then go back for
268 	 * another pass, this time doing the metadata.
269 	 */
270 	if (skipmeta) {
271 		skipmeta = 0;
272 		goto loop;
273 	}
274 
275 	if (wait) {
276 		while (vp->v_numoutput) {
277 			vp->v_iflag |= VI_BWAIT;
278 			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
279 			    PRIBIO + 4, "ffsfsn", 0);
280   		}
281 		VI_UNLOCK(vp);
282 
283 		/*
284 		 * Ensure that any filesystem metadata associated
285 		 * with the vnode has been written.
286 		 */
287 		splx(s);
288 		if ((error = softdep_sync_metadata(ap)) != 0)
289 			return (error);
290 		s = splbio();
291 
292 		VI_LOCK(vp);
293 		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
294 			/*
295 			 * Block devices associated with filesystems may
296 			 * have new I/O requests posted for them even if
297 			 * the vnode is locked, so no amount of trying will
298 			 * get them clean. Thus we give block devices a
299 			 * good effort, then just give up. For all other file
300 			 * types, go around and try again until it is clean.
301 			 */
302 			if (passes > 0) {
303 				passes -= 1;
304 				goto loop;
305 			}
306 #ifdef DIAGNOSTIC
307 			if (!vn_isdisk(vp, NULL))
308 				vprint("ffs_fsync: dirty", vp);
309 #endif
310 		}
311 	}
312 	VI_UNLOCK(vp);
313 	splx(s);
314 	return (UFS_UPDATE(vp, wait));
315 }
316 
317 
318 /*
319  * Vnode op for reading.
320  */
321 /* ARGSUSED */
322 static int
323 ffs_read(ap)
324 	struct vop_read_args /* {
325 		struct vnode *a_vp;
326 		struct uio *a_uio;
327 		int a_ioflag;
328 		struct ucred *a_cred;
329 	} */ *ap;
330 {
331 	struct vnode *vp;
332 	struct inode *ip;
333 	struct uio *uio;
334 	struct fs *fs;
335 	struct buf *bp;
336 	ufs_lbn_t lbn, nextlbn;
337 	off_t bytesinfile;
338 	long size, xfersize, blkoffset;
339 	int error, orig_resid;
340 	mode_t mode;
341 	int seqcount;
342 	int ioflag;
343 	vm_object_t object;
344 
345 	vp = ap->a_vp;
346 	uio = ap->a_uio;
347 	ioflag = ap->a_ioflag;
348 	if (ap->a_ioflag & IO_EXT)
349 #ifdef notyet
350 		return (ffs_extread(vp, uio, ioflag));
351 #else
352 		panic("ffs_read+IO_EXT");
353 #endif
354 
355 	GIANT_REQUIRED;
356 
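	/*
	 * The caller passes its sequential-access estimate in the upper
	 * bits of a_ioflag; it drives the clustering and read-ahead
	 * decisions below.
	 */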
357 	seqcount = ap->a_ioflag >> 16;
358 	ip = VTOI(vp);
359 	mode = ip->i_mode;
360 
361 #ifdef DIAGNOSTIC
362 	if (uio->uio_rw != UIO_READ)
363 		panic("ffs_read: mode");
364 
365 	if (vp->v_type == VLNK) {
366 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
367 			panic("ffs_read: short symlink");
368 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
369 		panic("ffs_read: type %d",  vp->v_type);
370 #endif
371 	fs = ip->i_fs;
372 	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
373 		return (EFBIG);
374 
375 	orig_resid = uio->uio_resid;
376 	if (orig_resid <= 0)
377 		return (0);
378 
379 	object = vp->v_object;
380 
381 	bytesinfile = ip->i_size - uio->uio_offset;
382 	if (bytesinfile <= 0) {
383 		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
384 			ip->i_flag |= IN_ACCESS;
385 		return 0;
386 	}
387 
388 	if (object) {
389 		vm_object_reference(object);
390 	}
391 
392 #ifdef ENABLE_VFS_IOOPT
393 	/*
394 	 * If IO optimisation is turned on,
395 	 * and we are NOT a VM based IO request,
396 	 * (i.e. not headed for the buffer cache)
397 	 * but there IS a vm object associated with it.
398 	 */
399 	if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
400 		int nread, toread;
401 
402 		toread = uio->uio_resid;
403 		if (toread > bytesinfile)
404 			toread = bytesinfile;
405 		if (toread >= PAGE_SIZE) {
406 			/*
407 			 * Then if it's at least a page in size, try to
408 			 * get the data from the object using vm tricks
409 			 */
410 			error = uioread(toread, uio, object, &nread);
411 			if ((uio->uio_resid == 0) || (error != 0)) {
412 				/*
413 				 * If we finished or there was an error
414 				 * then finish up (the reference previously
415 				 * obtained on object must be released).
416 				 */
417 				if ((error == 0 ||
418 				    uio->uio_resid != orig_resid) &&
419 				    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
420 					ip->i_flag |= IN_ACCESS;
421 
422 				if (object) {
423 					vm_object_vndeallocate(object);
424 				}
425 				return error;
426 			}
427 		}
428 	}
429 #endif
430 
431 	/*
432 	 * Ok so we couldn't do it all in one vm trick...
433 	 * so cycle around trying smaller bites..
434 	 */
435 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
436 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
437 			break;
438 #ifdef ENABLE_VFS_IOOPT
439 		if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
440 			/*
441 			 * Obviously we didn't finish above, but we
442 			 * didn't get an error either. Try the same trick again,
443 			 * but this time we are looping.
444 			 */
445 			int nread, toread;
446 			toread = uio->uio_resid;
447 			if (toread > bytesinfile)
448 				toread = bytesinfile;
449 
450 			/*
451 			 * Once again, if there isn't enough for a
452 			 * whole page, don't try optimising.
453 			 */
454 			if (toread >= PAGE_SIZE) {
455 				error = uioread(toread, uio, object, &nread);
456 				if ((uio->uio_resid == 0) || (error != 0)) {
457 					/*
458 					 * If we finished or there was an
459 					 * error then finish up (the reference
460 					 * previously obtained on object must
461 					 * be released).
462 					 */
463 					if ((error == 0 ||
464 					    uio->uio_resid != orig_resid) &&
465 					    (vp->v_mount->mnt_flag &
466 					    MNT_NOATIME) == 0)
467 						ip->i_flag |= IN_ACCESS;
468 					if (object) {
469 						vm_object_vndeallocate(object);
470 					}
471 					return error;
472 				}
473 				/*
474 				 * To get here we didn't finish or err.
475 				 * If we did get some data,
476 				 * loop to try another bite.
477 				 */
478 				if (nread > 0) {
479 					continue;
480 				}
481 			}
482 		}
483 #endif
484 
485 		lbn = lblkno(fs, uio->uio_offset);
486 		nextlbn = lbn + 1;
487 
488 		/*
489 		 * Size of buffer.  The buffer representing the
490 		 * end of the file is rounded up to the size of
491 		 * the block type (fragment or full block,
492 		 * depending).
493 		 */
494 		size = blksize(fs, ip, lbn);
495 		blkoffset = blkoff(fs, uio->uio_offset);
496 
497 		/*
498 		 * The amount we want to transfer in this iteration is
499 		 * one FS block less the amount of the data before
500 		 * our startpoint (duh!)
501 		 */
502 		xfersize = fs->fs_bsize - blkoffset;
503 
504 		/*
505 		 * But if we actually want less than the block,
506 		 * or the file doesn't have a whole block more of data,
507 		 * then use the lesser number.
508 		 */
509 		if (uio->uio_resid < xfersize)
510 			xfersize = uio->uio_resid;
511 		if (bytesinfile < xfersize)
512 			xfersize = bytesinfile;
513 
514 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
515 			/*
516 			 * Don't do readahead if this is the end of the file.
517 			 */
518 			error = bread(vp, lbn, size, NOCRED, &bp);
519 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
520 			/*
521 			 * Otherwise if we are allowed to cluster,
522 			 * grab as much as we can.
523 			 *
524 			 * XXX  This may not be a win if we are not
525 			 * doing sequential access.
526 			 */
527 			error = cluster_read(vp, ip->i_size, lbn,
528 				size, NOCRED, uio->uio_resid, seqcount, &bp);
529 		} else if (seqcount > 1) {
530 			/*
531 			 * If we are NOT allowed to cluster, then
532 			 * if we appear to be acting sequentially,
533 			 * fire off a request for a readahead
534 			 * as well as a read. Note that the 4th and 5th
535 			 * arguments point to arrays of the size specified in
536 			 * the 6th argument.
537 			 */
538 			int nextsize = blksize(fs, ip, nextlbn);
539 			error = breadn(vp, lbn,
540 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
541 		} else {
542 			/*
543 			 * Failing all of the above, just read what the
544 			 * user asked for. Interestingly, the same as
545 			 * the first option above.
546 			 */
547 			error = bread(vp, lbn, size, NOCRED, &bp);
548 		}
549 		if (error) {
550 			brelse(bp);
551 			bp = NULL;
552 			break;
553 		}
554 
555 		/*
556 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
557 		 * will cause us to attempt to release the buffer later on
558 		 * and will cause the buffer cache to attempt to free the
559 		 * underlying pages.
560 		 */
561 		if (ioflag & IO_DIRECT)
562 			bp->b_flags |= B_DIRECT;
563 
564 		/*
565 		 * We should only get non-zero b_resid when an I/O error
566 		 * has occurred, which should cause us to break above.
567 		 * However, if the short read did not cause an error,
568 		 * then we want to ensure that we do not uiomove bad
569 		 * or uninitialized data.
570 		 */
571 		size -= bp->b_resid;
572 		if (size < xfersize) {
573 			if (size == 0)
574 				break;
575 			xfersize = size;
576 		}
577 
578 #ifdef ENABLE_VFS_IOOPT
579 		if (vfs_ioopt && object &&
580 		    (bp->b_flags & B_VMIO) &&
581 		    ((blkoffset & PAGE_MASK) == 0) &&
582 		    ((xfersize & PAGE_MASK) == 0)) {
583 			/*
584 			 * If VFS IO optimisation is turned on,
585 			 * and it's an exact page multiple
586 			 * and a normal VM based op,
587 			 * then use uiomoveco().
588 			 */
589 			error =
590 				uiomoveco((char *)bp->b_data + blkoffset,
591 					(int)xfersize, uio, object, 0);
592 		} else
593 #endif
594 		{
595 			/*
596 			 * otherwise use the general form
597 			 */
598 			error =
599 				uiomove((char *)bp->b_data + blkoffset,
600 					(int)xfersize, uio);
601 		}
602 
603 		if (error)
604 			break;
605 
606 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
607 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
608 			/*
609 			 * If there are no dependencies, and it's VMIO,
610 			 * then we don't need the buf, mark it available
611 			 * for freeing. The VM has the data.
612 			 */
613 			bp->b_flags |= B_RELBUF;
614 			brelse(bp);
615 		} else {
616 			/*
617 			 * Otherwise let whoever
618 			 * made the request take care of
619 			 * freeing it. We just queue
620 			 * it onto another list.
621 			 */
622 			bqrelse(bp);
623 		}
624 	}
625 
626 	/*
627 	 * This can only happen in the case of an error
628 	 * because the loop above resets bp to NULL on each iteration
629 	 * and on normal completion has not set a new value into it,
630 	 * so it must have come from a 'break' statement.
631 	 */
632 	if (bp != NULL) {
633 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
634 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
635 			bp->b_flags |= B_RELBUF;
636 			brelse(bp);
637 		} else {
638 			bqrelse(bp);
639 		}
640 	}
641 
642 	if (object) {
643 		vm_object_vndeallocate(object);
644 	}
645 	if ((error == 0 || uio->uio_resid != orig_resid) &&
646 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
647 		ip->i_flag |= IN_ACCESS;
648 	return (error);
649 }
650 
651 /*
652  * Vnode op for writing.
653  */
654 static int
655 ffs_write(ap)
656 	struct vop_write_args /* {
657 		struct vnode *a_vp;
658 		struct uio *a_uio;
659 		int a_ioflag;
660 		struct ucred *a_cred;
661 	} */ *ap;
662 {
663 	struct vnode *vp;
664 	struct uio *uio;
665 	struct inode *ip;
666 	struct fs *fs;
667 	struct buf *bp;
668 	struct thread *td;
669 	ufs_lbn_t lbn;
670 	off_t osize;
671 	int seqcount;
672 	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
673 	vm_object_t object;
674 
675 	vp = ap->a_vp;
676 	uio = ap->a_uio;
677 	ioflag = ap->a_ioflag;
678 	if (ap->a_ioflag & IO_EXT)
679 #ifdef notyet
680 		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
681 #else
682 		panic("ffs_write+IO_EXT");
683 #endif
684 
685 	GIANT_REQUIRED;
686 
687 	extended = 0;
688 	seqcount = ap->a_ioflag >> 16;
689 	ip = VTOI(vp);
690 
691 	object = vp->v_object;
692 	if (object) {
693 		vm_object_reference(object);
694 	}
695 
696 #ifdef DIAGNOSTIC
697 	if (uio->uio_rw != UIO_WRITE)
698 		panic("ffswrite: mode");
699 #endif
700 
701 	switch (vp->v_type) {
702 	case VREG:
703 		if (ioflag & IO_APPEND)
704 			uio->uio_offset = ip->i_size;
705 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
706 			if (object) {
707 				vm_object_vndeallocate(object);
708 			}
709 			return (EPERM);
710 		}
711 		/* FALLTHROUGH */
712 	case VLNK:
713 		break;
714 	case VDIR:
715 		panic("ffswrite: dir write");
716 		break;
717 	default:
718 		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
719 			(int)uio->uio_offset,
720 			(int)uio->uio_resid
721 		);
722 	}
723 
724 	fs = ip->i_fs;
725 	if (uio->uio_offset < 0 ||
726 	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
727 		if (object) {
728 			vm_object_vndeallocate(object);
729 		}
730 		return (EFBIG);
731 	}
732 	/*
733 	 * Maybe this should be above the vnode op call, but so long as
734 	 * file servers have no limits, I don't think it matters.
735 	 */
736 	td = uio->uio_td;
737 	if (vp->v_type == VREG && td &&
738 	    uio->uio_offset + uio->uio_resid >
739 	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
740 		PROC_LOCK(td->td_proc);
741 		psignal(td->td_proc, SIGXFSZ);
742 		PROC_UNLOCK(td->td_proc);
743 		if (object) {
744 			vm_object_vndeallocate(object);
745 		}
746 		return (EFBIG);
747 	}
748 
749 	resid = uio->uio_resid;
750 	osize = ip->i_size;
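	/*
	 * Encode the sequential-access hint in the upper bits of the flags
	 * word handed to UFS_BALLOC(), capping it at BA_SEQMAX.
	 */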
751 	if (seqcount > BA_SEQMAX)
752 		flags = BA_SEQMAX << BA_SEQSHIFT;
753 	else
754 		flags = seqcount << BA_SEQSHIFT;
755 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
756 		flags |= IO_SYNC;
757 
758 #ifdef ENABLE_VFS_IOOPT
759 	if (object && (object->flags & OBJ_OPT)) {
760 		vm_freeze_copyopts(object,
761 			OFF_TO_IDX(uio->uio_offset),
762 			OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
763 	}
764 #endif
765 	for (error = 0; uio->uio_resid > 0;) {
766 		lbn = lblkno(fs, uio->uio_offset);
767 		blkoffset = blkoff(fs, uio->uio_offset);
768 		xfersize = fs->fs_bsize - blkoffset;
769 		if (uio->uio_resid < xfersize)
770 			xfersize = uio->uio_resid;
771 
772 		if (uio->uio_offset + xfersize > ip->i_size)
773 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
774 
775 		/*
776 		 * We must perform a read-before-write if the transfer size
777 		 * does not cover the entire buffer.
778 		 */
779 		if (fs->fs_bsize > xfersize)
780 			flags |= BA_CLRBUF;
781 		else
782 			flags &= ~BA_CLRBUF;
783 /* XXX is uio->uio_offset the right thing here? */
784 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
785 		    ap->a_cred, flags, &bp);
786 		if (error != 0)
787 			break;
788 		/*
789 		 * If the buffer is not valid we have to clear out any
790 		 * garbage data from the pages instantiated for the buffer.
791 		 * If we do not, a failed uiomove() during a write can leave
792 		 * the prior contents of the pages exposed to a userland
793 		 * mmap().  XXX deal with uiomove() errors a better way.
794 		 */
795 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
796 			vfs_bio_clrbuf(bp);
797 		if (ioflag & IO_DIRECT)
798 			bp->b_flags |= B_DIRECT;
799 		if (ioflag & IO_NOWDRAIN)
800 			bp->b_flags |= B_NOWDRAIN;
801 
802 		if (uio->uio_offset + xfersize > ip->i_size) {
803 			ip->i_size = uio->uio_offset + xfersize;
804 			DIP(ip, i_size) = ip->i_size;
805 			extended = 1;
806 		}
807 
808 		size = blksize(fs, ip, lbn) - bp->b_resid;
809 		if (size < xfersize)
810 			xfersize = size;
811 
812 		error =
813 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
814 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
815 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
816 			bp->b_flags |= B_RELBUF;
817 		}
818 
819 		/*
820 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
821 		 * if we have a severe page deficiency write the buffer
822 		 * asynchronously.  Otherwise try to cluster, and if that
823 		 * doesn't do it then either do an async write (if O_DIRECT),
824 		 * or a delayed write (if not).
825 		 */
826 		if (ioflag & IO_SYNC) {
827 			(void)bwrite(bp);
828 		} else if (vm_page_count_severe() ||
829 			    buf_dirty_count_severe() ||
830 			    (ioflag & IO_ASYNC)) {
831 			bp->b_flags |= B_CLUSTEROK;
832 			bawrite(bp);
833 		} else if (xfersize + blkoffset == fs->fs_bsize) {
834 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
835 				bp->b_flags |= B_CLUSTEROK;
836 				cluster_write(bp, ip->i_size, seqcount);
837 			} else {
838 				bawrite(bp);
839 			}
840 		} else if (ioflag & IO_DIRECT) {
841 			bp->b_flags |= B_CLUSTEROK;
842 			bawrite(bp);
843 		} else {
844 			bp->b_flags |= B_CLUSTEROK;
845 			bdwrite(bp);
846 		}
847 		if (error || xfersize == 0)
848 			break;
849 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
850 	}
851 	/*
852 	 * If we successfully wrote any data, and we are not the superuser,
853 	 * we clear the setuid and setgid bits as a precaution against
854 	 * tampering.
855 	 */
856 	if (resid > uio->uio_resid && ap->a_cred &&
857 	    suser_cred(ap->a_cred, PRISON_ROOT)) {
858 		ip->i_mode &= ~(ISUID | ISGID);
859 		DIP(ip, i_mode) = ip->i_mode;
860 	}
861 	if (resid > uio->uio_resid)
862 		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
863 	if (error) {
864 		if (ioflag & IO_UNIT) {
865 			(void)UFS_TRUNCATE(vp, osize,
866 			    IO_NORMAL | (ioflag & IO_SYNC),
867 			    ap->a_cred, uio->uio_td);
868 			uio->uio_offset -= resid - uio->uio_resid;
869 			uio->uio_resid = resid;
870 		}
871 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
872 		error = UFS_UPDATE(vp, 1);
873 
874 	if (object) {
875 		vm_object_vndeallocate(object);
876 	}
877 
878 	return (error);
879 }
880 
881 /*
882  * get page routine
883  */
884 static int
885 ffs_getpages(ap)
886 	struct vop_getpages_args *ap;
887 {
888 	off_t foff, physoffset;
889 	int i, size, bsize;
890 	struct vnode *dp, *vp;
891 	vm_object_t obj;
892 	vm_pindex_t pindex, firstindex;
893 	vm_page_t mreq;
894 	int bbackwards, bforwards;
895 	int pbackwards, pforwards;
896 	int firstpage;
897 	ufs2_daddr_t reqblkno, reqlblkno;
898 	int poff;
899 	int pcount;
900 	int rtval;
901 	int pagesperblock;
902 
903 	GIANT_REQUIRED;
904 
905 	pcount = round_page(ap->a_count) / PAGE_SIZE;
906 	mreq = ap->a_m[ap->a_reqpage];
907 	firstindex = ap->a_m[0]->pindex;
908 
909 	/*
910 	 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
911 	 * then the entire page is valid.  Since the page may be mapped,
912 	 * user programs might reference data beyond the actual end of file
913 	 * occurring within the page.  We have to zero that data.
914 	 */
915 	if (mreq->valid) {
916 		if (mreq->valid != VM_PAGE_BITS_ALL)
917 			vm_page_zero_invalid(mreq, TRUE);
918 		vm_page_lock_queues();
919 		for (i = 0; i < pcount; i++) {
920 			if (i != ap->a_reqpage) {
921 				vm_page_free(ap->a_m[i]);
922 			}
923 		}
924 		vm_page_unlock_queues();
925 		return VM_PAGER_OK;
926 	}
927 
928 	vp = ap->a_vp;
929 	obj = vp->v_object;
930 	bsize = vp->v_mount->mnt_stat.f_iosize;
931 	pindex = mreq->pindex;
932 	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
933 
934 	if (bsize < PAGE_SIZE)
935 		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
936 						    ap->a_count,
937 						    ap->a_reqpage);
938 
939 	/*
940 	 * foff is the file offset of the required page
941 	 * reqlblkno is the logical block that contains the page
942 	 * poff is the index of the page into the logical block
943 	 */
944 	reqlblkno = foff / bsize;
945 	poff = (foff % bsize) / PAGE_SIZE;
946 
947 	dp = VTOI(vp)->i_devvp;
948 	if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
949 	    || (reqblkno == -1)) {
950 		vm_page_lock_queues();
951 		for(i = 0; i < pcount; i++) {
952 			if (i != ap->a_reqpage)
953 				vm_page_free(ap->a_m[i]);
954 		}
955 		vm_page_unlock_queues();
956 		if (reqblkno == -1) {
957 			if ((mreq->flags & PG_ZERO) == 0)
958 				pmap_zero_page(mreq);
959 			vm_page_undirty(mreq);
960 			mreq->valid = VM_PAGE_BITS_ALL;
961 			return VM_PAGER_OK;
962 		} else {
963 			return VM_PAGER_ERROR;
964 		}
965 	}
966 
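	/*
	 * reqblkno is expressed in DEV_BSIZE units, so this is the byte
	 * offset of the requested page on the underlying device; foff is
	 * subtracted from it below before the device's VOP_GETPAGES() is
	 * called.
	 */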
967 	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
968 	pagesperblock = bsize / PAGE_SIZE;
969 	/*
970 	 * find the first page that is contiguous...
971 	 * note that pbackwards is the number of pages that are contiguous
972 	 * backwards.
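	 *
	 * For example (hypothetical numbers): with 16K blocks and 4K pages,
	 * pagesperblock is 4; if the requested page sits at poff 1 and bmap
	 * reports one contiguous block before it (bbackwards == 1), then
	 * pbackwards = 1 + 1 * 4 = 5.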
973 	 */
974 	firstpage = 0;
975 	if (ap->a_count) {
976 		pbackwards = poff + bbackwards * pagesperblock;
977 		if (ap->a_reqpage > pbackwards) {
978 			firstpage = ap->a_reqpage - pbackwards;
979 			vm_page_lock_queues();
980 			for(i=0;i<firstpage;i++)
981 				vm_page_free(ap->a_m[i]);
982 			vm_page_unlock_queues();
983 		}
984 
985 	/*
986 	 * pforwards is the number of pages that are contiguous
987 	 * after the current page.
988 	 */
989 		pforwards = (pagesperblock - (poff + 1)) +
990 			bforwards * pagesperblock;
991 		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
992 			vm_page_lock_queues();
993 			for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
994 				vm_page_free(ap->a_m[i]);
995 			vm_page_unlock_queues();
996 			pcount = ap->a_reqpage + pforwards + 1;
997 		}
998 
999 	/*
1000 	 * number of pages for I/O corrected for the non-contig pages at
1001 	 * the beginning of the array.
1002 	 */
1003 		pcount -= firstpage;
1004 	}
1005 
1006 	/*
1007 	 * calculate the size of the transfer
1008 	 */
1009 
1010 	size = pcount * PAGE_SIZE;
1011 
1012 	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
1013 		obj->un_pager.vnp.vnp_size)
1014 		size = obj->un_pager.vnp.vnp_size -
1015 			IDX_TO_OFF(ap->a_m[firstpage]->pindex);
1016 
1017 	physoffset -= foff;
1018 	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
1019 		(ap->a_reqpage - firstpage), physoffset);
1020 
1021 	return (rtval);
1022 }
1023 
1024 /*
1025  * Extended attribute area reading.
1026  */
1027 static int
1028 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
1029 {
1030 	struct inode *ip;
1031 	struct ufs2_dinode *dp;
1032 	struct fs *fs;
1033 	struct buf *bp;
1034 	ufs_lbn_t lbn, nextlbn;
1035 	off_t bytesinfile;
1036 	long size, xfersize, blkoffset;
1037 	int error, orig_resid;
1038 	mode_t mode;
1039 
1040 	GIANT_REQUIRED;
1041 
1042 	ip = VTOI(vp);
1043 	fs = ip->i_fs;
1044 	dp = ip->i_din2;
1045 	mode = ip->i_mode;
1046 
1047 #ifdef DIAGNOSTIC
1048 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
1049 		panic("ffs_extread: mode");
1050 
1051 #endif
1052 	orig_resid = uio->uio_resid;
1053 	if (orig_resid <= 0)
1054 		return (0);
1055 
1056 	bytesinfile = dp->di_extsize - uio->uio_offset;
1057 	if (bytesinfile <= 0) {
1058 		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1059 			ip->i_flag |= IN_ACCESS;
1060 		return 0;
1061 	}
1062 
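	/*
	 * The external attribute area is addressed with negative logical
	 * block numbers: EA block lbn is read as (-1 - lbn), which the
	 * strategy/bmap code maps onto the inode's external block pointers
	 * (see ffsext_strategy()).
	 */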
1063 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
1064 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
1065 			break;
1066 
1067 		lbn = lblkno(fs, uio->uio_offset);
1068 		nextlbn = lbn + 1;
1069 
1070 		/*
1071 		 * Size of buffer.  The buffer representing the
1072 		 * end of the file is rounded up to the size of
1073 		 * the block type (fragment or full block,
1074 		 * depending).
1075 		 */
1076 		size = sblksize(fs, dp->di_extsize, lbn);
1077 		blkoffset = blkoff(fs, uio->uio_offset);
1078 
1079 		/*
1080 		 * The amount we want to transfer in this iteration is
1081 		 * one FS block less the amount of the data before
1082 		 * our startpoint (duh!)
1083 		 */
1084 		xfersize = fs->fs_bsize - blkoffset;
1085 
1086 		/*
1087 		 * But if we actually want less than the block,
1088 		 * or the file doesn't have a whole block more of data,
1089 		 * then use the lesser number.
1090 		 */
1091 		if (uio->uio_resid < xfersize)
1092 			xfersize = uio->uio_resid;
1093 		if (bytesinfile < xfersize)
1094 			xfersize = bytesinfile;
1095 
1096 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1097 			/*
1098 			 * Don't do readahead if this is the end of the info.
1099 			 */
1100 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1101 		} else {
1102 			/*
1103 			 * If we have a second block, then
1104 			 * fire off a request for a readahead
1105 			 * as well as a read. Note that the 4th and 5th
1106 			 * arguments point to arrays of the size specified in
1107 			 * the 6th argument.
1108 			 */
1109 			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1110 
1111 			nextlbn = -1 - nextlbn;
1112 			error = breadn(vp, -1 - lbn,
1113 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1114 		}
1115 		if (error) {
1116 			brelse(bp);
1117 			bp = NULL;
1118 			break;
1119 		}
1120 
1121 		/*
1122 		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
1123 		 * will cause us to attempt to release the buffer later on
1124 		 * and will cause the buffer cache to attempt to free the
1125 		 * underlying pages.
1126 		 */
1127 		if (ioflag & IO_DIRECT)
1128 			bp->b_flags |= B_DIRECT;
1129 
1130 		/*
1131 		 * We should only get non-zero b_resid when an I/O error
1132 		 * has occurred, which should cause us to break above.
1133 		 * However, if the short read did not cause an error,
1134 		 * then we want to ensure that we do not uiomove bad
1135 		 * or uninitialized data.
1136 		 */
1137 		size -= bp->b_resid;
1138 		if (size < xfersize) {
1139 			if (size == 0)
1140 				break;
1141 			xfersize = size;
1142 		}
1143 
1144 		error = uiomove((char *)bp->b_data + blkoffset,
1145 					(int)xfersize, uio);
1146 		if (error)
1147 			break;
1148 
1149 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1150 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1151 			/*
1152 			 * If there are no dependencies, and it's VMIO,
1153 			 * then we don't need the buf, mark it available
1154 			 * for freeing. The VM has the data.
1155 			 */
1156 			bp->b_flags |= B_RELBUF;
1157 			brelse(bp);
1158 		} else {
1159 			/*
1160 			 * Otherwise let whoever
1161 			 * made the request take care of
1162 			 * freeing it. We just queue
1163 			 * it onto another list.
1164 			 */
1165 			bqrelse(bp);
1166 		}
1167 	}
1168 
1169 	/*
1170 	 * This can only happen in the case of an error
1171 	 * because the loop above resets bp to NULL on each iteration
1172 	 * and on normal completion has not set a new value into it,
1173 	 * so it must have come from a 'break' statement.
1174 	 */
1175 	if (bp != NULL) {
1176 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1177 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1178 			bp->b_flags |= B_RELBUF;
1179 			brelse(bp);
1180 		} else {
1181 			bqrelse(bp);
1182 		}
1183 	}
1184 
1185 	if ((error == 0 || uio->uio_resid != orig_resid) &&
1186 	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1187 		ip->i_flag |= IN_ACCESS;
1188 	return (error);
1189 }
1190 
1191 /*
1192  * Extended attribute area writing.
1193  */
1194 static int
1195 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1196 {
1197 	struct inode *ip;
1198 	struct ufs2_dinode *dp;
1199 	struct fs *fs;
1200 	struct buf *bp;
1201 	ufs_lbn_t lbn;
1202 	off_t osize;
1203 	int blkoffset, error, flags, resid, size, xfersize;
1204 
1205 	GIANT_REQUIRED;
1206 
1207 	ip = VTOI(vp);
1208 	fs = ip->i_fs;
1209 	dp = ip->i_din2;
1210 
1211 #ifdef DIAGNOSTIC
1212 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1213 		panic("ext_write: mode");
1214 #endif
1215 
1216 	if (ioflag & IO_APPEND)
1217 		uio->uio_offset = dp->di_extsize;
1218 
1219 	if (uio->uio_offset < 0 ||
1220 	    (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1221 		return (EFBIG);
1222 
1223 	resid = uio->uio_resid;
1224 	osize = dp->di_extsize;
1225 	flags = IO_EXT;
1226 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1227 		flags |= IO_SYNC;
1228 
1229 	for (error = 0; uio->uio_resid > 0;) {
1230 		lbn = lblkno(fs, uio->uio_offset);
1231 		blkoffset = blkoff(fs, uio->uio_offset);
1232 		xfersize = fs->fs_bsize - blkoffset;
1233 		if (uio->uio_resid < xfersize)
1234 			xfersize = uio->uio_resid;
1235 
1236 		/*
1237 		 * We must perform a read-before-write if the transfer size
1238 		 * does not cover the entire buffer.
1239 		 */
1240 		if (fs->fs_bsize > xfersize)
1241 			flags |= BA_CLRBUF;
1242 		else
1243 			flags &= ~BA_CLRBUF;
1244 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1245 		    ucred, flags, &bp);
1246 		if (error != 0)
1247 			break;
1248 		/*
1249 		 * If the buffer is not valid we have to clear out any
1250 		 * garbage data from the pages instantiated for the buffer.
1251 		 * If we do not, a failed uiomove() during a write can leave
1252 		 * the prior contents of the pages exposed to a userland
1253 		 * mmap().  XXX deal with uiomove() errors a better way.
1254 		 */
1255 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1256 			vfs_bio_clrbuf(bp);
1257 		if (ioflag & IO_DIRECT)
1258 			bp->b_flags |= B_DIRECT;
1259 		if (ioflag & IO_NOWDRAIN)
1260 			bp->b_flags |= B_NOWDRAIN;
1261 
1262 		if (uio->uio_offset + xfersize > dp->di_extsize)
1263 			dp->di_extsize = uio->uio_offset + xfersize;
1264 
1265 		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1266 		if (size < xfersize)
1267 			xfersize = size;
1268 
1269 		error =
1270 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1271 		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1272 		   (LIST_FIRST(&bp->b_dep) == NULL)) {
1273 			bp->b_flags |= B_RELBUF;
1274 		}
1275 
1276 		/*
1277 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
1278 		 * if we have a severe page deficiency write the buffer
1279 		 * asynchronously.  Otherwise try to cluster, and if that
1280 		 * doesn't do it then either do an async write (if O_DIRECT),
1281 		 * or a delayed write (if not).
1282 		 */
1283 		if (ioflag & IO_SYNC) {
1284 			(void)bwrite(bp);
1285 		} else if (vm_page_count_severe() ||
1286 			    buf_dirty_count_severe() ||
1287 			    xfersize + blkoffset == fs->fs_bsize ||
1288 			    (ioflag & (IO_ASYNC | IO_DIRECT)))
1289 			bawrite(bp);
1290 		else
1291 			bdwrite(bp);
1292 		if (error || xfersize == 0)
1293 			break;
1294 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1295 	}
1296 	/*
1297 	 * If we successfully wrote any data, and we are not the superuser,
1298 	 * we clear the setuid and setgid bits as a precaution against
1299 	 * tampering.
1300 	 */
1301 	if (resid > uio->uio_resid && ucred &&
1302 	    suser_cred(ucred, PRISON_ROOT)) {
1303 		ip->i_mode &= ~(ISUID | ISGID);
1304 		dp->di_mode = ip->i_mode;
1305 	}
1306 	if (error) {
1307 		if (ioflag & IO_UNIT) {
1308 			(void)UFS_TRUNCATE(vp, osize,
1309 			    IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1310 			uio->uio_offset -= resid - uio->uio_resid;
1311 			uio->uio_resid = resid;
1312 		}
1313 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1314 		error = UFS_UPDATE(vp, 1);
1315 	return (error);
1316 }
1317 
1318 
1319 /*
1320  * Helper routine for the extended attribute vnode operations.
1321  *
1322  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1323  * the length of the EA, and possibly the pointer to the entry and to the data.
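 *
 * Each record in the EA area is laid out as: a 32-bit total record length,
 * a namespace byte, a content-pad-length byte, a name-length byte, the name
 * itself, padding out to an 8-byte boundary, the content, and finally the
 * content padding.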
1324  */
1325 static int
1326 ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1327 {
1328 	u_char *p, *pe, *pn, *p0;
1329 	int eapad1, eapad2, ealength, ealen, nlen;
1330 	uint32_t ul;
1331 
1332 	pe = ptr + length;
1333 	nlen = strlen(name);
1334 
1335 	for (p = ptr; p < pe; p = pn) {
1336 		p0 = p;
1337 		bcopy(p, &ul, sizeof(ul));
1338 		pn = p + ul;
1339 		/* make sure this entry is complete */
1340 		if (pn > pe)
1341 			break;
1342 		p += sizeof(uint32_t);
1343 		if (*p != nspace)
1344 			continue;
1345 		p++;
1346 		eapad2 = *p++;
1347 		if (*p != nlen)
1348 			continue;
1349 		p++;
1350 		if (bcmp(p, name, nlen))
1351 			continue;
1352 		ealength = sizeof(uint32_t) + 3 + nlen;
1353 		eapad1 = 8 - (ealength % 8);
1354 		if (eapad1 == 8)
1355 			eapad1 = 0;
1356 		ealength += eapad1;
1357 		ealen = ul - ealength - eapad2;
1358 		p += nlen + eapad1;
1359 		if (eap != NULL)
1360 			*eap = p0;
1361 		if (eac != NULL)
1362 			*eac = p;
1363 		return (ealen);
1364 	}
1365 	return(-1);
1366 }
1367 
1368 static int
1369 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1370 {
1371 	struct inode *ip;
1372 	struct fs *fs;
1373 	struct ufs2_dinode *dp;
1374 	struct uio luio;
1375 	struct iovec liovec;
1376 	int easize, error;
1377 	u_char *eae;
1378 
1379 	ip = VTOI(vp);
1380 	fs = ip->i_fs;
1381 	dp = ip->i_din2;
1382 	easize = dp->di_extsize;
1383 
1384 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1385 
1386 	liovec.iov_base = eae;
1387 	liovec.iov_len = easize;
1388 	luio.uio_iov = &liovec;
1389 	luio.uio_iovcnt = 1;
1390 	luio.uio_offset = 0;
1391 	luio.uio_resid = easize;
1392 	luio.uio_segflg = UIO_SYSSPACE;
1393 	luio.uio_rw = UIO_READ;
1394 	luio.uio_td = td;
1395 
1396 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1397 	if (error) {
1398 		free(eae, M_TEMP);
1399 		return(error);
1400 	}
1401 	*p = eae;
1402 	return (0);
1403 }
1404 
1405 static int
1406 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1407 {
1408 	struct inode *ip;
1409 	struct fs *fs;
1410 	struct ufs2_dinode *dp;
1411 	int error;
1412 
1413 	ip = VTOI(vp);
1414 	fs = ip->i_fs;
1415 
1416 	if (ip->i_ea_area != NULL)
1417 		return (EBUSY);
1418 	dp = ip->i_din2;
1419 	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1420 	if (error)
1421 		return (error);
1422 	ip->i_ea_len = dp->di_extsize;
1423 	ip->i_ea_error = 0;
1424 	return (0);
1425 }
1426 
1427 /*
1428  * Vnode extattr transaction commit/abort
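 *
 * ffs_open_ea() reads the whole EA area into a malloc(9)'d buffer hung off
 * the inode (i_ea_area).  A commit writes that in-core copy back with
 * ffs_extwrite(); an abort simply discards it.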
1429  */
1430 static int
1431 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1432 {
1433 	struct inode *ip;
1434 	struct fs *fs;
1435 	struct uio luio;
1436 	struct iovec liovec;
1437 	int error;
1438 	struct ufs2_dinode *dp;
1439 
1440 	ip = VTOI(vp);
1441 	fs = ip->i_fs;
1442 	if (ip->i_ea_area == NULL)
1443 		return (EINVAL);
1444 	dp = ip->i_din2;
1445 	error = ip->i_ea_error;
1446 	if (commit && error == 0) {
1447 		if (cred == NOCRED)
1448 			cred =  vp->v_mount->mnt_cred;
1449 		liovec.iov_base = ip->i_ea_area;
1450 		liovec.iov_len = ip->i_ea_len;
1451 		luio.uio_iov = &liovec;
1452 		luio.uio_iovcnt = 1;
1453 		luio.uio_offset = 0;
1454 		luio.uio_resid = ip->i_ea_len;
1455 		luio.uio_segflg = UIO_SYSSPACE;
1456 		luio.uio_rw = UIO_WRITE;
1457 		luio.uio_td = td;
1458 		/* XXX: I'm not happy about truncating to zero size */
1459 		if (ip->i_ea_len < dp->di_extsize)
1460 			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1461 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1462 	}
1463 	free(ip->i_ea_area, M_TEMP);
1464 	ip->i_ea_area = NULL;
1465 	ip->i_ea_len = 0;
1466 	ip->i_ea_error = 0;
1467 	return (error);
1468 }
1469 
1470 /*
1471  * Vnode extattr strategy routine for special devices and fifos.
1472  *
1473  * We need to check for a read or write of the external attributes.
1474  * Otherwise we just fall through and do the usual thing.
1475  */
1476 static int
1477 ffsext_strategy(struct vop_strategy_args *ap)
1478 /*
1479 struct vop_strategy_args {
1480 	struct vnodeop_desc *a_desc;
1481 	struct vnode *a_vp;
1482 	struct buf *a_bp;
1483 };
1484 */
1485 {
1486 	struct vnode *vp;
1487 	daddr_t lbn;
1488 
1489 	vp = ap->a_vp;
1490 	lbn = ap->a_bp->b_lblkno;
1491 	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1492 	    lbn < 0 && lbn >= -NXADDR)
1493 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1494 	if (vp->v_type == VFIFO)
1495 		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1496 	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1497 }
1498 
1499 /*
1500  * Vnode operation to open an extended attribute transaction.
1501  */
1502 static int
1503 ffs_openextattr(struct vop_openextattr_args *ap)
1504 /*
1505 struct vop_openextattr_args {
1506 	struct vnodeop_desc *a_desc;
1507 	struct vnode *a_vp;
1508 	IN struct ucred *a_cred;
1509 	IN struct thread *a_td;
1510 };
1511 */
1512 {
1513 	struct inode *ip;
1514 	struct fs *fs;
1515 
1516 	ip = VTOI(ap->a_vp);
1517 	fs = ip->i_fs;
1518 	if (fs->fs_magic == FS_UFS1_MAGIC)
1519 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1520 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1521 }
1522 
1523 
1524 /*
1525  * Vnode extattr transaction commit/abort
1526  */
1527 static int
1528 ffs_closeextattr(struct vop_closeextattr_args *ap)
1529 /*
1530 struct vop_closeextattr_args {
1531 	struct vnodeop_desc *a_desc;
1532 	struct vnode *a_vp;
1533 	int a_commit;
1534 	IN struct ucred *a_cred;
1535 	IN struct thread *a_td;
1536 };
1537 */
1538 {
1539 	struct inode *ip;
1540 	struct fs *fs;
1541 
1542 	ip = VTOI(ap->a_vp);
1543 	fs = ip->i_fs;
1544 	if (fs->fs_magic == FS_UFS1_MAGIC)
1545 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1546 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1547 }
1548 
1549 
1550 
1551 /*
1552  * Vnode operation to retrieve a named extended attribute.
1553  */
1554 static int
1555 ffs_getextattr(struct vop_getextattr_args *ap)
1556 /*
1557 vop_getextattr {
1558 	IN struct vnode *a_vp;
1559 	IN int a_attrnamespace;
1560 	IN const char *a_name;
1561 	INOUT struct uio *a_uio;
1562 	OUT size_t *a_size;
1563 	IN struct ucred *a_cred;
1564 	IN struct thread *a_td;
1565 };
1566 */
1567 {
1568 	struct inode *ip;
1569 	struct fs *fs;
1570 	u_char *eae, *p, *pe, *pn;
1571 	struct ufs2_dinode *dp;
1572 	unsigned easize;
1573 	uint32_t ul;
1574 	int error, ealen, stand_alone;
1575 
1576 	ip = VTOI(ap->a_vp);
1577 	fs = ip->i_fs;
1578 
1579 	if (fs->fs_magic == FS_UFS1_MAGIC)
1580 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1581 
1582 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1583 	    ap->a_cred, ap->a_td, IREAD);
1584 	if (error)
1585 		return (error);
1586 
1587 	if (ip->i_ea_area == NULL) {
1588 		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1589 		if (error)
1590 			return (error);
1591 		stand_alone = 1;
1592 	} else {
1593 		stand_alone = 0;
1594 	}
1595 	dp = ip->i_din2;
1596 	eae = ip->i_ea_area;
1597 	easize = ip->i_ea_len;
1598 	if (strlen(ap->a_name) > 0) {
1599 		ealen = ffs_findextattr(eae, easize,
1600 		    ap->a_attrnamespace, ap->a_name, NULL, &p);
1601 		if (ealen >= 0) {
1602 			error = 0;
1603 			if (ap->a_size != NULL)
1604 				*ap->a_size = ealen;
1605 			else if (ap->a_uio != NULL)
1606 				error = uiomove(p, ealen, ap->a_uio);
1607 		} else {
1608 			error = ENOATTR;
1609 		}
1610 	} else {
1611 		error = 0;
1612 		if (ap->a_size != NULL)
1613 			*ap->a_size = 0;
1614 		pe = eae + easize;
1615 		for(p = eae; error == 0 && p < pe; p = pn) {
1616 			bcopy(p, &ul, sizeof(ul));
1617 			pn = p + ul;
1618 			if (pn > pe)
1619 				break;
1620 			p += sizeof(ul);
1621 			if (*p++ != ap->a_attrnamespace)
1622 				continue;
1623 			p++;	/* pad2 */
1624 			ealen = *p;
1625 			if (ap->a_size != NULL) {
1626 				*ap->a_size += ealen + 1;
1627 			} else if (ap->a_uio != NULL) {
1628 				error = uiomove(p, ealen + 1, ap->a_uio);
1629 			}
1630 		}
1631 	}
1632 	if (stand_alone)
1633 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1634 	return(error);
1635 }
1636 
1637 /*
1638  * Vnode operation to set a named attribute.
1639  */
1640 static int
1641 ffs_setextattr(struct vop_setextattr_args *ap)
1642 /*
1643 vop_setextattr {
1644 	IN struct vnode *a_vp;
1645 	IN int a_attrnamespace;
1646 	IN const char *a_name;
1647 	INOUT struct uio *a_uio;
1648 	IN struct ucred *a_cred;
1649 	IN struct thread *a_td;
1650 };
1651 */
1652 {
1653 	struct inode *ip;
1654 	struct fs *fs;
1655 	uint32_t ealength, ul;
1656 	int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
1657 	u_char *eae, *p;
1658 	struct ufs2_dinode *dp;
1659 	struct ucred *cred;
1660 	int stand_alone;
1661 
1662 	ip = VTOI(ap->a_vp);
1663 	fs = ip->i_fs;
1664 
1665 	if (fs->fs_magic == FS_UFS1_MAGIC)
1666 		return (ufs_vnoperate((struct vop_generic_args *)ap));
1667 
1668 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1669 	    ap->a_cred, ap->a_td, IWRITE);
1670 	if (error) {
1671 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1672 			ip->i_ea_error = error;
1673 		return (error);
1674 	}
1675 
1676 	if (ap->a_cred != NOCRED)
1677 		cred = ap->a_cred;
1678 	else
1679 		cred = ap->a_vp->v_mount->mnt_cred;
1680 
1681 	dp = ip->i_din2;
1682 
1683 	if (ip->i_ea_area == NULL) {
1684 		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1685 		if (error)
1686 			return (error);
1687 		stand_alone = 1;
1688 	} else {
1689 		stand_alone = 0;
1690 	}
1691 
1692 	/* Calculate the length of the EA entry */
1693 	if (ap->a_uio == NULL) {
1694 		/* delete */
1695 		ealength = eapad1 = ealen = eapad2 = eacont = 0;
1696 	} else {
1697 		ealen = ap->a_uio->uio_resid;
1698 		ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1699 		eapad1 = 8 - (ealength % 8);
1700 		if (eapad1 == 8)
1701 			eapad1 = 0;
1702 		eacont = ealength + eapad1;
1703 		eapad2 = 8 - (ealen % 8);
1704 		if (eapad2 == 8)
1705 			eapad2 = 0;
1706 		ealength += eapad1 + ealen + eapad2;
1707 	}
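	/*
	 * For example, a 4-character name gives ealength = 4 + 3 + 4 = 11,
	 * so eapad1 = 5 and the header plus name occupy 16 bytes; ealen is
	 * then padded with eapad2 bytes so the record ends on an 8-byte
	 * boundary.
	 */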
1708 
1709 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1710 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1711 	easize = ip->i_ea_len;
1712 
1713 	olen = ffs_findextattr(eae, easize,
1714 	    ap->a_attrnamespace, ap->a_name, &p, NULL);
1715 	if (olen == -1 && ealength == 0) {
1716 		/* delete but nonexistent */
1717 		free(eae, M_TEMP);
1718 		if (stand_alone)
1719 			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1720 		return(ENOATTR);
1721 	}
1722 	if (olen == -1) {
1723 		/* new, append at end */
1724 		p = eae + easize;
1725 		easize += ealength;
1726 	} else {
1727 		bcopy(p, &ul, sizeof ul);
1728 		i = p - eae + ul;
1729 		if (ul != ealength) {
1730 			bcopy(p + ul, p + ealength, easize - i);
1731 			easize += (ealength - ul);
1732 		}
1733 	}
1734 	if (easize > NXADDR * fs->fs_bsize) {
1735 		free(eae, M_TEMP);
1736 		if (stand_alone)
1737 			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1738 		else if (ip->i_ea_error == 0)
1739 			ip->i_ea_error = ENOSPC;
1740 		return(ENOSPC);
1741 	}
1742 	if (ealength != 0) {
1743 		bcopy(&ealength, p, sizeof(ealength));
1744 		p += sizeof(ealength);
1745 		*p++ = ap->a_attrnamespace;
1746 		*p++ = eapad2;
1747 		*p++ = strlen(ap->a_name);
1748 		strcpy(p, ap->a_name);
1749 		p += strlen(ap->a_name);
1750 		bzero(p, eapad1);
1751 		p += eapad1;
1752 		error = uiomove(p, ealen, ap->a_uio);
1753 		if (error) {
1754 			free(eae, M_TEMP);
1755 			if (stand_alone)
1756 				ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1757 			else if (ip->i_ea_error == 0)
1758 				ip->i_ea_error = error;
1759 			return(error);
1760 		}
1761 		p += ealen;
1762 		bzero(p, eapad2);
1763 	}
1764 	p = ip->i_ea_area;
1765 	ip->i_ea_area = eae;
1766 	ip->i_ea_len = easize;
1767 	free(p, M_TEMP);
1768 	if (stand_alone)
1769 		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1770 	return(error);
1771 }
1772