xref: /freebsd/sys/ufs/ffs/ffs_balloc.c (revision 39beb93c3f8bdbf72a61fda42300b5ebed7390c8)
1 /*-
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * Copyright (c) 1982, 1986, 1989, 1993
33  *	The Regents of the University of California.  All rights reserved.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 4. Neither the name of the University nor the names of its contributors
44  *    may be used to endorse or promote products derived from this software
45  *    without specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
60  */
61 
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64 
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/bio.h>
68 #include <sys/buf.h>
69 #include <sys/lock.h>
70 #include <sys/mount.h>
71 #include <sys/vnode.h>
72 
73 #include <ufs/ufs/quota.h>
74 #include <ufs/ufs/inode.h>
75 #include <ufs/ufs/ufs_extern.h>
76 #include <ufs/ufs/extattr.h>
77 #include <ufs/ufs/ufsmount.h>
78 
79 #include <ufs/ffs/fs.h>
80 #include <ufs/ffs/ffs_extern.h>
81 
82 /*
83  * Balloc defines the structure of filesystem storage
84  * by allocating the physical blocks on a device given
85  * the inode and the logical block number in a file.
86  * This is the allocation strategy for UFS1. Below is
87  * the allocation strategy for UFS2.
88  */
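/*
 * Illustrative sketch, not part of the original source: the normal write
 * path in ffs_write() reaches this routine through the UFS_BALLOC() macro,
 * roughly as follows (the exact flag handling in the caller may differ):
 *
 *	flags = seqcount << BA_SEQSHIFT;
 *	if (ioflag & IO_SYNC)
 *		flags |= IO_SYNC;
 *	if (the write does not cover the whole block)
 *		flags |= BA_CLRBUF;
 *	error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ucred, flags, &bp);
 */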
89 int
90 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91     struct ucred *cred, int flags, struct buf **bpp)
92 {
93 	struct inode *ip;
94 	struct ufs1_dinode *dp;
95 	ufs_lbn_t lbn, lastlbn;
96 	struct fs *fs;
97 	ufs1_daddr_t nb;
98 	struct buf *bp, *nbp;
99 	struct ufsmount *ump;
100 	struct indir indirs[NIADDR + 2];
101 	int deallocated, osize, nsize, num, i, error;
102 	ufs2_daddr_t newb;
103 	ufs1_daddr_t *bap, pref;
104 	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105 	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106 	int unwindidx = -1;
107 	int saved_inbdflush;
108 
109 	ip = VTOI(vp);
110 	dp = ip->i_din1;
111 	fs = ip->i_fs;
112 	ump = ip->i_ump;
113 	lbn = lblkno(fs, startoffset);
114 	size = blkoff(fs, startoffset) + size;
115 	if (size > fs->fs_bsize)
116 		panic("ffs_balloc_ufs1: blk too big");
117 	*bpp = NULL;
118 	if (flags & IO_EXT)
119 		return (EOPNOTSUPP);
120 	if (lbn < 0)
121 		return (EFBIG);
122 
123 	/*
124 	 * If the next write will extend the file into a new block,
125 	 * and the file is currently composed of a fragment,
126 	 * this fragment has to be extended to be a full block.
127 	 */
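	/*
	 * Worked example (illustrative, assuming a common 16384/2048
	 * block/fragment layout): a file whose last block is a 6144-byte
	 * fragment is first grown to a full 16384-byte block by
	 * ffs_realloccg() below before any later block is allocated.
	 */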
128 	lastlbn = lblkno(fs, ip->i_size);
129 	if (lastlbn < NDADDR && lastlbn < lbn) {
130 		nb = lastlbn;
131 		osize = blksize(fs, ip, nb);
132 		if (osize < fs->fs_bsize && osize > 0) {
133 			UFS_LOCK(ump);
134 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
135 			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
136 			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
137 			   cred, &bp);
138 			if (error)
139 				return (error);
140 			if (DOINGSOFTDEP(vp))
141 				softdep_setup_allocdirect(ip, nb,
142 				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
143 				    fs->fs_bsize, osize, bp);
144 			ip->i_size = smalllblktosize(fs, nb + 1);
145 			dp->di_size = ip->i_size;
146 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
147 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
148 			if (flags & IO_SYNC)
149 				bwrite(bp);
150 			else
151 				bawrite(bp);
152 		}
153 	}
154 	/*
155 	 * The first NDADDR blocks are direct blocks
156 	 */
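	/*
	 * The inode holds NDADDR (12) direct block pointers in di_db[];
	 * logical blocks at or beyond NDADDR are reached through the
	 * indirect-block chain handled further below.
	 */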
157 	if (lbn < NDADDR) {
158 		if (flags & BA_METAONLY)
159 			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
160 		nb = dp->di_db[lbn];
161 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
162 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
163 			if (error) {
164 				brelse(bp);
165 				return (error);
166 			}
167 			bp->b_blkno = fsbtodb(fs, nb);
168 			*bpp = bp;
169 			return (0);
170 		}
171 		if (nb != 0) {
172 			/*
173 			 * Consider need to reallocate a fragment.
174 			 */
175 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
176 			nsize = fragroundup(fs, size);
177 			if (nsize <= osize) {
178 				error = bread(vp, lbn, osize, NOCRED, &bp);
179 				if (error) {
180 					brelse(bp);
181 					return (error);
182 				}
183 				bp->b_blkno = fsbtodb(fs, nb);
184 			} else {
185 				UFS_LOCK(ump);
186 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
187 				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
188 				    &dp->di_db[0]), osize, nsize, flags,
189 				    cred, &bp);
190 				if (error)
191 					return (error);
192 				if (DOINGSOFTDEP(vp))
193 					softdep_setup_allocdirect(ip, lbn,
194 					    dbtofsb(fs, bp->b_blkno), nb,
195 					    nsize, osize, bp);
196 			}
197 		} else {
198 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
199 				nsize = fragroundup(fs, size);
200 			else
201 				nsize = fs->fs_bsize;
202 			UFS_LOCK(ump);
203 			error = ffs_alloc(ip, lbn,
204 			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
205 			    nsize, flags, cred, &newb);
206 			if (error)
207 				return (error);
208 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
209 			bp->b_blkno = fsbtodb(fs, newb);
210 			if (flags & BA_CLRBUF)
211 				vfs_bio_clrbuf(bp);
212 			if (DOINGSOFTDEP(vp))
213 				softdep_setup_allocdirect(ip, lbn, newb, 0,
214 				    nsize, 0, bp);
215 		}
216 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
217 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
218 		*bpp = bp;
219 		return (0);
220 	}
221 	/*
222 	 * Determine the number of levels of indirection.
223 	 */
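	/*
	 * ufs_getlbns() fills indirs[]: indirs[0].in_off is the index into
	 * dp->di_ib[], and each deeper entry gives the logical block number
	 * (in_lbn) used to read or create that indirect block plus the slot
	 * (in_off) to follow within it; the final in_off selects the data
	 * block itself.  For a singly-indirect lbn, for example, num is
	 * expected to come back as 2 (illustrative; see ufs_getlbns() in
	 * ufs_bmap.c for the authoritative description).
	 */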
224 	pref = 0;
225 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
226 		return (error);
227 #ifdef INVARIANTS
228 	if (num < 1)
229 		panic("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
230 #endif
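	/*
	 * Record whether TDP_INBDFLUSH was already set so that the
	 * "curthread->td_pflags &= saved_inbdflush" on every exit path
	 * below restores the flag to its prior state instead of
	 * unconditionally clearing it.
	 */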
231 	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
232 	    TDP_INBDFLUSH);
233 	curthread->td_pflags |= TDP_INBDFLUSH;
234 	/*
235 	 * Fetch the first indirect block, allocating if necessary.
236 	 */
237 	--num;
238 	nb = dp->di_ib[indirs[0].in_off];
239 	allocib = NULL;
240 	allocblk = allociblk;
241 	lbns_remfree = lbns;
242 	if (nb == 0) {
243 		UFS_LOCK(ump);
244 		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
245 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
246 		    flags, cred, &newb)) != 0) {
247 			curthread->td_pflags &= saved_inbdflush;
248 			return (error);
249 		}
250 		nb = newb;
251 		*allocblk++ = nb;
252 		*lbns_remfree++ = indirs[1].in_lbn;
253 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
254 		bp->b_blkno = fsbtodb(fs, nb);
255 		vfs_bio_clrbuf(bp);
256 		if (DOINGSOFTDEP(vp)) {
257 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
258 			    newb, 0, fs->fs_bsize, 0, bp);
259 			bdwrite(bp);
260 		} else {
261 			/*
262 			 * Write synchronously so that indirect blocks
263 			 * never point at garbage.
264 			 */
265 			if (DOINGASYNC(vp))
266 				bdwrite(bp);
267 			else if ((error = bwrite(bp)) != 0)
268 				goto fail;
269 		}
270 		allocib = &dp->di_ib[indirs[0].in_off];
271 		*allocib = nb;
272 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
273 	}
274 	/*
275 	 * Fetch through the indirect blocks, allocating as necessary.
276 	 */
277 	for (i = 1;;) {
278 		error = bread(vp,
279 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
280 		if (error) {
281 			brelse(bp);
282 			goto fail;
283 		}
284 		bap = (ufs1_daddr_t *)bp->b_data;
285 		nb = bap[indirs[i].in_off];
286 		if (i == num)
287 			break;
288 		i += 1;
289 		if (nb != 0) {
290 			bqrelse(bp);
291 			continue;
292 		}
293 		UFS_LOCK(ump);
294 		if (pref == 0)
295 			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
296 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
297 		    flags, cred, &newb)) != 0) {
298 			brelse(bp);
299 			goto fail;
300 		}
301 		nb = newb;
302 		*allocblk++ = nb;
303 		*lbns_remfree++ = indirs[i].in_lbn;
304 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
305 		nbp->b_blkno = fsbtodb(fs, nb);
306 		vfs_bio_clrbuf(nbp);
307 		if (DOINGSOFTDEP(vp)) {
308 			softdep_setup_allocindir_meta(nbp, ip, bp,
309 			    indirs[i - 1].in_off, nb);
310 			bdwrite(nbp);
311 		} else {
312 			/*
313 			 * Write synchronously so that indirect blocks
314 			 * never point at garbage.
315 			 */
316 			if ((error = bwrite(nbp)) != 0) {
317 				brelse(bp);
318 				goto fail;
319 			}
320 		}
321 		bap[indirs[i - 1].in_off] = nb;
322 		if (allocib == NULL && unwindidx < 0)
323 			unwindidx = i - 1;
324 		/*
325 		 * If required, write synchronously, otherwise use
326 		 * delayed write.
327 		 */
328 		if (flags & IO_SYNC) {
329 			bwrite(bp);
330 		} else {
331 			if (bp->b_bufsize == fs->fs_bsize)
332 				bp->b_flags |= B_CLUSTEROK;
333 			bdwrite(bp);
334 		}
335 	}
336 	/*
337 	 * If asked only for the indirect block, then return it.
338 	 */
339 	if (flags & BA_METAONLY) {
340 		curthread->td_pflags &= saved_inbdflush;
341 		*bpp = bp;
342 		return (0);
343 	}
344 	/*
345 	 * Get the data block, allocating if necessary.
346 	 */
347 	if (nb == 0) {
348 		UFS_LOCK(ump);
349 		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
350 		error = ffs_alloc(ip,
351 		    lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
352 		if (error) {
353 			brelse(bp);
354 			goto fail;
355 		}
356 		nb = newb;
357 		*allocblk++ = nb;
358 		*lbns_remfree++ = lbn;
359 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
360 		nbp->b_blkno = fsbtodb(fs, nb);
361 		if (flags & BA_CLRBUF)
362 			vfs_bio_clrbuf(nbp);
363 		if (DOINGSOFTDEP(vp))
364 			softdep_setup_allocindir_page(ip, lbn, bp,
365 			    indirs[i].in_off, nb, 0, nbp);
366 		bap[indirs[i].in_off] = nb;
367 		/*
368 		 * If required, write synchronously, otherwise use
369 		 * delayed write.
370 		 */
371 		if (flags & IO_SYNC) {
372 			bwrite(bp);
373 		} else {
374 			if (bp->b_bufsize == fs->fs_bsize)
375 				bp->b_flags |= B_CLUSTEROK;
376 			bdwrite(bp);
377 		}
378 		curthread->td_pflags &= saved_inbdflush;
379 		*bpp = nbp;
380 		return (0);
381 	}
382 	brelse(bp);
383 	if (flags & BA_CLRBUF) {
384 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
385 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
386 			error = cluster_read(vp, ip->i_size, lbn,
387 			    (int)fs->fs_bsize, NOCRED,
388 			    MAXBSIZE, seqcount, &nbp);
389 		} else {
390 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
391 		}
392 		if (error) {
393 			brelse(nbp);
394 			goto fail;
395 		}
396 	} else {
397 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
398 		nbp->b_blkno = fsbtodb(fs, nb);
399 	}
400 	curthread->td_pflags &= saved_inbdflush;
401 	*bpp = nbp;
402 	return (0);
403 fail:
404 	curthread->td_pflags &= saved_inbdflush;
405 	/*
406 	 * If we have failed to allocate any blocks, simply return the error.
407 	 * This is the usual case and avoids the need to fsync the file.
408 	 */
409 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
410 		return (error);
411 	/*
412 	 * If we have failed part way through block allocation, we
413 	 * have to deallocate any indirect blocks that we have allocated.
414 	 * We have to fsync the file before we start to get rid of all
415 	 * of its dependencies so that we do not leave them dangling.
416 	 * We have to sync it at the end so that the soft updates code
417 	 * does not find any untracked changes. Although this is really
418 	 * slow, running out of disk space is not expected to be a common
419 	 * occurrence. The error return from fsync is ignored as we already
420 	 * have an error to return to the user.
421 	 */
422 	(void) ffs_syncvnode(vp, MNT_WAIT);
423 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
424 	     blkp < allocblk; blkp++, lbns_remfree++) {
425 		/*
426 		 * We shall not leave the freed blocks on the vnode
427 		 * buffer object lists.
428 		 */
429 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
430 		if (bp != NULL) {
431 			bp->b_flags |= (B_INVAL | B_RELBUF);
432 			bp->b_flags &= ~B_ASYNC;
433 			brelse(bp);
434 		}
435 		deallocated += fs->fs_bsize;
436 	}
437 	if (allocib != NULL) {
438 		*allocib = 0;
439 	} else if (unwindidx >= 0) {
440 		int r;
441 
442 		r = bread(vp, indirs[unwindidx].in_lbn,
443 		    (int)fs->fs_bsize, NOCRED, &bp);
444 		if (r) {
445 			panic("Could not unwind indirect block, error %d", r);
446 			brelse(bp);
447 		} else {
448 			bap = (ufs1_daddr_t *)bp->b_data;
449 			bap[indirs[unwindidx].in_off] = 0;
450 			if (flags & IO_SYNC) {
451 				bwrite(bp);
452 			} else {
453 				if (bp->b_bufsize == fs->fs_bsize)
454 					bp->b_flags |= B_CLUSTEROK;
455 				bdwrite(bp);
456 			}
457 		}
458 	}
459 	if (deallocated) {
460 #ifdef QUOTA
461 		/*
462 		 * Restore user's disk quota because allocation failed.
463 		 */
464 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
465 #endif
466 		dp->di_blocks -= btodb(deallocated);
467 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
468 	}
469 	(void) ffs_syncvnode(vp, MNT_WAIT);
470 	/*
471 	 * After the buffers are invalidated and on-disk pointers are
472 	 * cleared, free the blocks.
473 	 */
474 	for (blkp = allociblk; blkp < allocblk; blkp++) {
475 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
476 		    ip->i_number);
477 	}
478 	return (error);
479 }
480 
481 /*
482  * Balloc defines the structure of file system storage
483  * by allocating the physical blocks on a device given
484  * the inode and the logical block number in a file.
485  * This is the allocation strategy for UFS2. Above is
486  * the allocation strategy for UFS1.
487  */
488 int
489 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
490     struct ucred *cred, int flags, struct buf **bpp)
491 {
492 	struct inode *ip;
493 	struct ufs2_dinode *dp;
494 	ufs_lbn_t lbn, lastlbn;
495 	struct fs *fs;
496 	struct buf *bp, *nbp;
497 	struct ufsmount *ump;
498 	struct indir indirs[NIADDR + 2];
499 	ufs2_daddr_t nb, newb, *bap, pref;
500 	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
501 	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
502 	int deallocated, osize, nsize, num, i, error;
503 	int unwindidx = -1;
504 	int saved_inbdflush;
505 
506 	ip = VTOI(vp);
507 	dp = ip->i_din2;
508 	fs = ip->i_fs;
509 	ump = ip->i_ump;
510 	lbn = lblkno(fs, startoffset);
511 	size = blkoff(fs, startoffset) + size;
512 	if (size > fs->fs_bsize)
513 		panic("ffs_balloc_ufs2: blk too big");
514 	*bpp = NULL;
515 	if (lbn < 0)
516 		return (EFBIG);
517 
518 	/*
519 	 * Check for allocating external data.
520 	 */
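	/*
	 * Extended attribute data has its own small set of direct blocks,
	 * dp->di_extb[0..NXADDR-1], and is addressed below with negative
	 * logical block numbers (-1 - lbn) so that it stays distinct from
	 * regular file data on the vnode's buffer lists.
	 */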
521 	if (flags & IO_EXT) {
522 		if (lbn >= NXADDR)
523 			return (EFBIG);
524 		/*
525 		 * If the next write will extend the data into a new block,
526 		 * and the data is currently composed of a fragment,
527 		 * this fragment has to be extended to be a full block.
528 		 */
529 		lastlbn = lblkno(fs, dp->di_extsize);
530 		if (lastlbn < lbn) {
531 			nb = lastlbn;
532 			osize = sblksize(fs, dp->di_extsize, nb);
533 			if (osize < fs->fs_bsize && osize > 0) {
534 				UFS_LOCK(ump);
535 				error = ffs_realloccg(ip, -1 - nb,
536 				    dp->di_extb[nb],
537 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
538 				    &dp->di_extb[0]), osize,
539 				    (int)fs->fs_bsize, flags, cred, &bp);
540 				if (error)
541 					return (error);
542 				if (DOINGSOFTDEP(vp))
543 					softdep_setup_allocext(ip, nb,
544 					    dbtofsb(fs, bp->b_blkno),
545 					    dp->di_extb[nb],
546 					    fs->fs_bsize, osize, bp);
547 				dp->di_extsize = smalllblktosize(fs, nb + 1);
548 				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
549 				bp->b_xflags |= BX_ALTDATA;
550 				ip->i_flag |= IN_CHANGE;
551 				if (flags & IO_SYNC)
552 					bwrite(bp);
553 				else
554 					bawrite(bp);
555 			}
556 		}
557 		/*
558 		 * All blocks are direct blocks
559 		 */
560 		if (flags & BA_METAONLY)
561 			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
562 		nb = dp->di_extb[lbn];
563 		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
564 			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
565 			if (error) {
566 				brelse(bp);
567 				return (error);
568 			}
569 			bp->b_blkno = fsbtodb(fs, nb);
570 			bp->b_xflags |= BX_ALTDATA;
571 			*bpp = bp;
572 			return (0);
573 		}
574 		if (nb != 0) {
575 			/*
576 			 * Consider need to reallocate a fragment.
577 			 */
578 			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
579 			nsize = fragroundup(fs, size);
580 			if (nsize <= osize) {
581 				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
582 				if (error) {
583 					brelse(bp);
584 					return (error);
585 				}
586 				bp->b_blkno = fsbtodb(fs, nb);
587 				bp->b_xflags |= BX_ALTDATA;
588 			} else {
589 				UFS_LOCK(ump);
590 				error = ffs_realloccg(ip, -1 - lbn,
591 				    dp->di_extb[lbn],
592 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
593 				    &dp->di_extb[0]), osize, nsize, flags,
594 				    cred, &bp);
595 				if (error)
596 					return (error);
597 				bp->b_xflags |= BX_ALTDATA;
598 				if (DOINGSOFTDEP(vp))
599 					softdep_setup_allocext(ip, lbn,
600 					    dbtofsb(fs, bp->b_blkno), nb,
601 					    nsize, osize, bp);
602 			}
603 		} else {
604 			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
605 				nsize = fragroundup(fs, size);
606 			else
607 				nsize = fs->fs_bsize;
608 			UFS_LOCK(ump);
609 			error = ffs_alloc(ip, lbn,
610 			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
611 			   nsize, flags, cred, &newb);
612 			if (error)
613 				return (error);
614 			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
615 			bp->b_blkno = fsbtodb(fs, newb);
616 			bp->b_xflags |= BX_ALTDATA;
617 			if (flags & BA_CLRBUF)
618 				vfs_bio_clrbuf(bp);
619 			if (DOINGSOFTDEP(vp))
620 				softdep_setup_allocext(ip, lbn, newb, 0,
621 				    nsize, 0, bp);
622 		}
623 		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
624 		ip->i_flag |= IN_CHANGE;
625 		*bpp = bp;
626 		return (0);
627 	}
628 	/*
629 	 * If the next write will extend the file into a new block,
630 	 * and the file is currently composed of a fragment,
631 	 * this fragment has to be extended to be a full block.
632 	 */
633 	lastlbn = lblkno(fs, ip->i_size);
634 	if (lastlbn < NDADDR && lastlbn < lbn) {
635 		nb = lastlbn;
636 		osize = blksize(fs, ip, nb);
637 		if (osize < fs->fs_bsize && osize > 0) {
638 			UFS_LOCK(ump);
639 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
640 				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
641 				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
642 				    flags, cred, &bp);
643 			if (error)
644 				return (error);
645 			if (DOINGSOFTDEP(vp))
646 				softdep_setup_allocdirect(ip, nb,
647 				    dbtofsb(fs, bp->b_blkno),
648 				    dp->di_db[nb],
649 				    fs->fs_bsize, osize, bp);
650 			ip->i_size = smalllblktosize(fs, nb + 1);
651 			dp->di_size = ip->i_size;
652 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
653 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
654 			if (flags & IO_SYNC)
655 				bwrite(bp);
656 			else
657 				bawrite(bp);
658 		}
659 	}
660 	/*
661 	 * The first NDADDR blocks are direct blocks
662 	 */
663 	if (lbn < NDADDR) {
664 		if (flags & BA_METAONLY)
665 			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
666 		nb = dp->di_db[lbn];
667 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
668 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
669 			if (error) {
670 				brelse(bp);
671 				return (error);
672 			}
673 			bp->b_blkno = fsbtodb(fs, nb);
674 			*bpp = bp;
675 			return (0);
676 		}
677 		if (nb != 0) {
678 			/*
679 			 * Consider need to reallocate a fragment.
680 			 */
681 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
682 			nsize = fragroundup(fs, size);
683 			if (nsize <= osize) {
684 				error = bread(vp, lbn, osize, NOCRED, &bp);
685 				if (error) {
686 					brelse(bp);
687 					return (error);
688 				}
689 				bp->b_blkno = fsbtodb(fs, nb);
690 			} else {
691 				UFS_LOCK(ump);
692 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
693 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
694 				       &dp->di_db[0]), osize, nsize, flags,
695 				    cred, &bp);
696 				if (error)
697 					return (error);
698 				if (DOINGSOFTDEP(vp))
699 					softdep_setup_allocdirect(ip, lbn,
700 					    dbtofsb(fs, bp->b_blkno), nb,
701 					    nsize, osize, bp);
702 			}
703 		} else {
704 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
705 				nsize = fragroundup(fs, size);
706 			else
707 				nsize = fs->fs_bsize;
708 			UFS_LOCK(ump);
709 			error = ffs_alloc(ip, lbn,
710 			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
711 				&dp->di_db[0]), nsize, flags, cred, &newb);
712 			if (error)
713 				return (error);
714 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
715 			bp->b_blkno = fsbtodb(fs, newb);
716 			if (flags & BA_CLRBUF)
717 				vfs_bio_clrbuf(bp);
718 			if (DOINGSOFTDEP(vp))
719 				softdep_setup_allocdirect(ip, lbn, newb, 0,
720 				    nsize, 0, bp);
721 		}
722 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
723 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
724 		*bpp = bp;
725 		return (0);
726 	}
727 	/*
728 	 * Determine the number of levels of indirection.
729 	 */
730 	pref = 0;
731 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
732 		return (error);
733 #ifdef INVARIANTS
734 	if (num < 1)
735 		panic("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
736 #endif
737 	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
738 	    TDP_INBDFLUSH);
739 	curthread->td_pflags |= TDP_INBDFLUSH;
740 	/*
741 	 * Fetch the first indirect block, allocating if necessary.
742 	 */
743 	--num;
744 	nb = dp->di_ib[indirs[0].in_off];
745 	allocib = NULL;
746 	allocblk = allociblk;
747 	lbns_remfree = lbns;
748 	if (nb == 0) {
749 		UFS_LOCK(ump);
750 		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
751 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
752 		    flags, cred, &newb)) != 0) {
753 			curthread->td_pflags &= saved_inbdflush;
754 			return (error);
755 		}
756 		nb = newb;
757 		*allocblk++ = nb;
758 		*lbns_remfree++ = indirs[1].in_lbn;
759 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
760 		bp->b_blkno = fsbtodb(fs, nb);
761 		vfs_bio_clrbuf(bp);
762 		if (DOINGSOFTDEP(vp)) {
763 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
764 			    newb, 0, fs->fs_bsize, 0, bp);
765 			bdwrite(bp);
766 		} else {
767 			/*
768 			 * Write synchronously so that indirect blocks
769 			 * never point at garbage.
770 			 */
771 			if (DOINGASYNC(vp))
772 				bdwrite(bp);
773 			else if ((error = bwrite(bp)) != 0)
774 				goto fail;
775 		}
776 		allocib = &dp->di_ib[indirs[0].in_off];
777 		*allocib = nb;
778 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
779 	}
780 	/*
781 	 * Fetch through the indirect blocks, allocating as necessary.
782 	 */
783 	for (i = 1;;) {
784 		error = bread(vp,
785 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
786 		if (error) {
787 			brelse(bp);
788 			goto fail;
789 		}
790 		bap = (ufs2_daddr_t *)bp->b_data;
791 		nb = bap[indirs[i].in_off];
792 		if (i == num)
793 			break;
794 		i += 1;
795 		if (nb != 0) {
796 			bqrelse(bp);
797 			continue;
798 		}
799 		UFS_LOCK(ump);
800 		if (pref == 0)
801 			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
802 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
803 		    flags, cred, &newb)) != 0) {
804 			brelse(bp);
805 			goto fail;
806 		}
807 		nb = newb;
808 		*allocblk++ = nb;
809 		*lbns_remfree++ = indirs[i].in_lbn;
810 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
811 		nbp->b_blkno = fsbtodb(fs, nb);
812 		vfs_bio_clrbuf(nbp);
813 		if (DOINGSOFTDEP(vp)) {
814 			softdep_setup_allocindir_meta(nbp, ip, bp,
815 			    indirs[i - 1].in_off, nb);
816 			bdwrite(nbp);
817 		} else {
818 			/*
819 			 * Write synchronously so that indirect blocks
820 			 * never point at garbage.
821 			 */
822 			if ((error = bwrite(nbp)) != 0) {
823 				brelse(bp);
824 				goto fail;
825 			}
826 		}
827 		bap[indirs[i - 1].in_off] = nb;
828 		if (allocib == NULL && unwindidx < 0)
829 			unwindidx = i - 1;
830 		/*
831 		 * If required, write synchronously, otherwise use
832 		 * delayed write.
833 		 */
834 		if (flags & IO_SYNC) {
835 			bwrite(bp);
836 		} else {
837 			if (bp->b_bufsize == fs->fs_bsize)
838 				bp->b_flags |= B_CLUSTEROK;
839 			bdwrite(bp);
840 		}
841 	}
842 	/*
843 	 * If asked only for the indirect block, then return it.
844 	 */
845 	if (flags & BA_METAONLY) {
846 		curthread->td_pflags &= saved_inbdflush;
847 		*bpp = bp;
848 		return (0);
849 	}
850 	/*
851 	 * Get the data block, allocating if necessary.
852 	 */
853 	if (nb == 0) {
854 		UFS_LOCK(ump);
855 		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
856 		error = ffs_alloc(ip,
857 		    lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
858 		if (error) {
859 			brelse(bp);
860 			goto fail;
861 		}
862 		nb = newb;
863 		*allocblk++ = nb;
864 		*lbns_remfree++ = lbn;
865 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
866 		nbp->b_blkno = fsbtodb(fs, nb);
867 		if (flags & BA_CLRBUF)
868 			vfs_bio_clrbuf(nbp);
869 		if (DOINGSOFTDEP(vp))
870 			softdep_setup_allocindir_page(ip, lbn, bp,
871 			    indirs[i].in_off, nb, 0, nbp);
872 		bap[indirs[i].in_off] = nb;
873 		/*
874 		 * If required, write synchronously, otherwise use
875 		 * delayed write.
876 		 */
877 		if (flags & IO_SYNC) {
878 			bwrite(bp);
879 		} else {
880 			if (bp->b_bufsize == fs->fs_bsize)
881 				bp->b_flags |= B_CLUSTEROK;
882 			bdwrite(bp);
883 		}
884 		curthread->td_pflags &= saved_inbdflush;
885 		*bpp = nbp;
886 		return (0);
887 	}
888 	brelse(bp);
889 	/*
890 	 * If requested, clear invalid portions of the buffer.  If we
891 	 * have to do a read-before-write (typical if BA_CLRBUF is set),
892 	 * try to do some read-ahead in the sequential case to reduce
893 	 * the number of I/O transactions.
894 	 */
895 	if (flags & BA_CLRBUF) {
896 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
897 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
898 			error = cluster_read(vp, ip->i_size, lbn,
899 			    (int)fs->fs_bsize, NOCRED,
900 			    MAXBSIZE, seqcount, &nbp);
901 		} else {
902 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
903 		}
904 		if (error) {
905 			brelse(nbp);
906 			goto fail;
907 		}
908 	} else {
909 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
910 		nbp->b_blkno = fsbtodb(fs, nb);
911 	}
912 	curthread->td_pflags &= saved_inbdflush;
913 	*bpp = nbp;
914 	return (0);
915 fail:
916 	curthread->td_pflags &= saved_inbdflush;
917 	/*
918 	 * If we have failed to allocate any blocks, simply return the error.
919 	 * This is the usual case and avoids the need to fsync the file.
920 	 */
921 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
922 		return (error);
923 	/*
924 	 * If we have failed part way through block allocation, we
925 	 * have to deallocate any indirect blocks that we have allocated.
926 	 * We have to fsync the file before we start to get rid of all
927 	 * of its dependencies so that we do not leave them dangling.
928 	 * We have to sync it at the end so that the soft updates code
929 	 * does not find any untracked changes. Although this is really
930 	 * slow, running out of disk space is not expected to be a common
931 	 * occurrence. The error return from fsync is ignored as we already
932 	 * have an error to return to the user.
933 	 */
934 	(void) ffs_syncvnode(vp, MNT_WAIT);
935 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
936 	     blkp < allocblk; blkp++, lbns_remfree++) {
937 		/*
938 		 * We shall not leave the freed blocks on the vnode
939 		 * buffer object lists.
940 		 */
941 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
942 		if (bp != NULL) {
943 			bp->b_flags |= (B_INVAL | B_RELBUF);
944 			bp->b_flags &= ~B_ASYNC;
945 			brelse(bp);
946 		}
947 		deallocated += fs->fs_bsize;
948 	}
949 	if (allocib != NULL) {
950 		*allocib = 0;
951 	} else if (unwindidx >= 0) {
952 		int r;
953 
954 		r = bread(vp, indirs[unwindidx].in_lbn,
955 		    (int)fs->fs_bsize, NOCRED, &bp);
956 		if (r) {
957 			panic("Could not unwind indirect block, error %d", r);
958 			brelse(bp);
959 		} else {
960 			bap = (ufs2_daddr_t *)bp->b_data;
961 			bap[indirs[unwindidx].in_off] = 0;
962 			if (flags & IO_SYNC) {
963 				bwrite(bp);
964 			} else {
965 				if (bp->b_bufsize == fs->fs_bsize)
966 					bp->b_flags |= B_CLUSTEROK;
967 				bdwrite(bp);
968 			}
969 		}
970 	}
971 	if (deallocated) {
972 #ifdef QUOTA
973 		/*
974 		 * Restore user's disk quota because allocation failed.
975 		 */
976 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
977 #endif
978 		dp->di_blocks -= btodb(deallocated);
979 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
980 	}
981 	(void) ffs_syncvnode(vp, MNT_WAIT);
982 	/*
983 	 * After the buffers are invalidated and on-disk pointers are
984 	 * cleared, free the blocks.
985 	 */
986 	for (blkp = allociblk; blkp < allocblk; blkp++) {
987 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
988 		    ip->i_number);
989 	}
990 	return (error);
991 }
992