xref: /freebsd/sys/ufs/ffs/ffs_balloc.c (revision 271c3a9060f2ee55607ebe146523f888e1db2654)
1 /*-
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * Copyright (c) 1982, 1986, 1989, 1993
33  *	The Regents of the University of California.  All rights reserved.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 4. Neither the name of the University nor the names of its contributors
44  *    may be used to endorse or promote products derived from this software
45  *    without specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
60  */
61 
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64 
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/bio.h>
68 #include <sys/buf.h>
69 #include <sys/lock.h>
70 #include <sys/mount.h>
71 #include <sys/vnode.h>
72 
73 #include <ufs/ufs/quota.h>
74 #include <ufs/ufs/inode.h>
75 #include <ufs/ufs/ufs_extern.h>
76 #include <ufs/ufs/extattr.h>
77 #include <ufs/ufs/ufsmount.h>
78 
79 #include <ufs/ffs/fs.h>
80 #include <ufs/ffs/ffs_extern.h>
81 
82 /*
83  * Balloc defines the structure of filesystem storage
84  * by allocating the physical blocks on a device given
85  * the inode and the logical block number in a file.
86  * This is the allocation strategy for UFS1. Below is
87  * the allocation strategy for UFS2.
    *
    * The logical block containing startoffset is located and, when it or
    * any indirect block on the way to it is missing, allocated.
    *
    *	vp		vnode of the file being written or extended.
    *	startoffset	byte offset in the file; selects the logical block
    *			and the starting offset inside that block.
    *	size		byte count wanted from startoffset; the in-block
    *			offset plus size may not exceed fs_bsize (the
    *			function panics if it does).
    *	cred		credentials handed to the allocator calls.
    *	flags		IO_SYNC, IO_EXT, BA_CLRBUF, BA_METAONLY and the
    *			BA_SEQMASK sequential count are examined here.
    *	bpp		set to NULL on entry; on success receives the buffer
    *			of the data block or, with BA_METAONLY, of the last
    *			indirect block on the path to it.
    *
    * Returns 0 on success, EOPNOTSUPP when IO_EXT is set, EFBIG for a
    * negative logical block number, or the error reported by the failing
    * lookup, allocation, read or synchronous write.
88  */
89 int
90 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91     struct ucred *cred, int flags, struct buf **bpp)
92 {
93 	struct inode *ip;
94 	struct ufs1_dinode *dp;
95 	ufs_lbn_t lbn, lastlbn;
96 	struct fs *fs;
97 	ufs1_daddr_t nb;
98 	struct buf *bp, *nbp;
99 	struct ufsmount *ump;
100 	struct indir indirs[NIADDR + 2];
101 	int deallocated, osize, nsize, num, i, error;
102 	ufs2_daddr_t newb;
103 	ufs1_daddr_t *bap, pref;
    	/*
    	 * Unwind bookkeeping for the indirect-block path: allociblk[]
    	 * collects the blocks allocated by this call (allocblk points past
    	 * the last one) and lbns[] the matching logical block numbers
    	 * (lbns_remfree points past the last one).  allocib points at the
    	 * inode's indirect pointer when this call filled it in; otherwise
    	 * unwindidx is the indirs[] index of the first indirect block that
    	 * had a pointer stored into it.
    	 */
104 	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105 	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106 	int unwindidx = -1;
107 	int saved_inbdflush;
108 
109 	ip = VTOI(vp);
110 	dp = ip->i_din1;
111 	fs = ip->i_fs;
112 	ump = ip->i_ump;
113 	lbn = lblkno(fs, startoffset);
    	/* From here on size is the end offset of the request in its block. */
114 	size = blkoff(fs, startoffset) + size;
115 	if (size > fs->fs_bsize)
116 		panic("ffs_balloc_ufs1: blk too big");
117 	*bpp = NULL;
    	/* IO_EXT requests are rejected by the UFS1 variant. */
118 	if (flags & IO_EXT)
119 		return (EOPNOTSUPP);
120 	if (lbn < 0)
121 		return (EFBIG);
122 
123 	/*
124 	 * If the next write will extend the file into a new block,
125 	 * and the file is currently composed of a fragment
126 	 * this fragment has to be extended to be a full block.
127 	 */
128 	lastlbn = lblkno(fs, ip->i_size);
129 	if (lastlbn < NDADDR && lastlbn < lbn) {
130 		nb = lastlbn;
131 		osize = blksize(fs, ip, nb);
132 		if (osize < fs->fs_bsize && osize > 0) {
    			/*
    			 * NOTE(review): UFS_LOCK() is acquired before every
    			 * ffs_alloc()/ffs_realloccg() call in this function
    			 * and no matching unlock is visible here; presumably
    			 * the allocator returns with it released -- confirm.
    			 */
133 			UFS_LOCK(ump);
134 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
135 			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
136 			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
137 			if (error)
138 				return (error);
139 			if (DOINGSOFTDEP(vp))
140 				softdep_setup_allocdirect(ip, nb,
141 				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
142 				    fs->fs_bsize, osize, bp);
    			/* The file now ends at the end of block nb. */
143 			ip->i_size = smalllblktosize(fs, nb + 1);
144 			dp->di_size = ip->i_size;
145 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
146 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
147 			if (flags & IO_SYNC)
148 				bwrite(bp);
149 			else
150 				bawrite(bp);
151 		}
152 	}
153 	/*
154 	 * The first NDADDR blocks are direct blocks
155 	 */
156 	if (lbn < NDADDR) {
157 		if (flags & BA_METAONLY)
158 			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
159 		nb = dp->di_db[lbn];
    		/*
    		 * Already allocated and i_size reaches past this block:
    		 * nothing to allocate, just read the full block.
    		 */
160 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
161 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
162 			if (error) {
163 				brelse(bp);
164 				return (error);
165 			}
166 			bp->b_blkno = fsbtodb(fs, nb);
167 			*bpp = bp;
168 			return (0);
169 		}
170 		if (nb != 0) {
171 			/*
172 			 * Consider need to reallocate a fragment.
173 			 */
174 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
175 			nsize = fragroundup(fs, size);
176 			if (nsize <= osize) {
    				/* Existing fragment is big enough: read it. */
177 				error = bread(vp, lbn, osize, NOCRED, &bp);
178 				if (error) {
179 					brelse(bp);
180 					return (error);
181 				}
182 				bp->b_blkno = fsbtodb(fs, nb);
183 			} else {
    				/* Grow the fragment from osize to nsize. */
184 				UFS_LOCK(ump);
185 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
186 				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
187 				    &dp->di_db[0]), osize, nsize, cred, &bp);
188 				if (error)
189 					return (error);
190 				if (DOINGSOFTDEP(vp))
191 					softdep_setup_allocdirect(ip, lbn,
192 					    dbtofsb(fs, bp->b_blkno), nb,
193 					    nsize, osize, bp);
194 			}
195 		} else {
    			/*
    			 * Nothing allocated yet.  Allocate only the fragments
    			 * needed when i_size ends before the end of this
    			 * block, otherwise a full block.
    			 */
196 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
197 				nsize = fragroundup(fs, size);
198 			else
199 				nsize = fs->fs_bsize;
200 			UFS_LOCK(ump);
201 			error = ffs_alloc(ip, lbn,
202 			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
203 			    nsize, cred, &newb);
204 			if (error)
205 				return (error);
206 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
207 			bp->b_blkno = fsbtodb(fs, newb);
208 			if (flags & BA_CLRBUF)
209 				vfs_bio_clrbuf(bp);
210 			if (DOINGSOFTDEP(vp))
211 				softdep_setup_allocdirect(ip, lbn, newb, 0,
212 				    nsize, 0, bp);
213 		}
    		/* Record the (possibly new) block address in the inode. */
214 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
215 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
216 		*bpp = bp;
217 		return (0);
218 	}
219 	/*
220 	 * Determine the number of levels of indirection.
221 	 */
222 	pref = 0;
223 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
224 		return(error);
225 #ifdef INVARIANTS
226 	if (num < 1)
227 		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
228 #endif
    	/*
    	 * Build a mask with every bit set except TDP_INBDFLUSH, which is
    	 * set only if the thread already had it.  Each exit path below ANDs
    	 * td_pflags with this mask, which clears TDP_INBDFLUSH again unless
    	 * it was set before this call.
    	 */
229 	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
230 	    TDP_INBDFLUSH);
231 	curthread->td_pflags |= TDP_INBDFLUSH;
232 	/*
233 	 * Fetch the first indirect block allocating if necessary.
234 	 */
    	/* After the decrement the walk below stops once i == num. */
235 	--num;
236 	nb = dp->di_ib[indirs[0].in_off];
237 	allocib = NULL;
238 	allocblk = allociblk;
239 	lbns_remfree = lbns;
240 	if (nb == 0) {
241 		UFS_LOCK(ump);
242 		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
243 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
244 		    cred, &newb)) != 0) {
    			/* Nothing allocated yet, so no unwinding is needed. */
245 			curthread->td_pflags &= saved_inbdflush;
246 			return (error);
247 		}
248 		nb = newb;
    		/* Remember the block and its lbn for the failure path. */
249 		*allocblk++ = nb;
250 		*lbns_remfree++ = indirs[1].in_lbn;
251 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
252 		bp->b_blkno = fsbtodb(fs, nb);
253 		vfs_bio_clrbuf(bp);
254 		if (DOINGSOFTDEP(vp)) {
255 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
256 			    newb, 0, fs->fs_bsize, 0, bp);
257 			bdwrite(bp);
258 		} else {
259 			/*
260 			 * Write synchronously so that indirect blocks
261 			 * never point at garbage.
262 			 */
263 			if (DOINGASYNC(vp))
264 				bdwrite(bp);
265 			else if ((error = bwrite(bp)) != 0)
266 				goto fail;
267 		}
    		/* Only now hook the new indirect block into the inode. */
268 		allocib = &dp->di_ib[indirs[0].in_off];
269 		*allocib = nb;
270 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
271 	}
272 	/*
273 	 * Fetch through the indirect blocks, allocating as necessary.
274 	 */
275 	for (i = 1;;) {
276 		error = bread(vp,
277 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
278 		if (error) {
279 			brelse(bp);
280 			goto fail;
281 		}
282 		bap = (ufs1_daddr_t *)bp->b_data;
283 		nb = bap[indirs[i].in_off];
    		/* Last level reached: bp/bap/nb describe the data pointer. */
284 		if (i == num)
285 			break;
    		/*
    		 * i is advanced before the allocation code, so from here on
    		 * indirs[i - 1] is the block just read (bp) and indirs[i]
    		 * the next-level indirect block.
    		 */
286 		i += 1;
287 		if (nb != 0) {
288 			bqrelse(bp);
289 			continue;
290 		}
291 		UFS_LOCK(ump);
292 		if (pref == 0)
293 			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
294 		if ((error =
295 		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
296 			brelse(bp);
297 			goto fail;
298 		}
299 		nb = newb;
300 		*allocblk++ = nb;
301 		*lbns_remfree++ = indirs[i].in_lbn;
302 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
303 		nbp->b_blkno = fsbtodb(fs, nb);
304 		vfs_bio_clrbuf(nbp);
305 		if (DOINGSOFTDEP(vp)) {
306 			softdep_setup_allocindir_meta(nbp, ip, bp,
307 			    indirs[i - 1].in_off, nb);
308 			bdwrite(nbp);
309 		} else {
310 			/*
311 			 * Write synchronously so that indirect blocks
312 			 * never point at garbage.
313 			 */
314 			if ((error = bwrite(nbp)) != 0) {
315 				brelse(bp);
316 				goto fail;
317 			}
318 		}
    		/* Point the parent indirect block at the new child. */
319 		bap[indirs[i - 1].in_off] = nb;
    		/*
    		 * Remember the first parent modified here, but only when the
    		 * inode pointer itself was not set by this call (that case
    		 * is unwound through allocib instead).
    		 */
320 		if (allocib == NULL && unwindidx < 0)
321 			unwindidx = i - 1;
322 		/*
323 		 * If required, write synchronously, otherwise use
324 		 * delayed write.
325 		 */
326 		if (flags & IO_SYNC) {
327 			bwrite(bp);
328 		} else {
329 			if (bp->b_bufsize == fs->fs_bsize)
330 				bp->b_flags |= B_CLUSTEROK;
331 			bdwrite(bp);
332 		}
333 	}
334 	/*
335 	 * If asked only for the indirect block, then return it.
336 	 */
337 	if (flags & BA_METAONLY) {
338 		curthread->td_pflags &= saved_inbdflush;
339 		*bpp = bp;
340 		return (0);
341 	}
342 	/*
343 	 * Get the data block, allocating if necessary.
344 	 */
345 	if (nb == 0) {
346 		UFS_LOCK(ump);
347 		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
348 		error = ffs_alloc(ip,
349 		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
350 		if (error) {
351 			brelse(bp);
352 			goto fail;
353 		}
354 		nb = newb;
355 		*allocblk++ = nb;
356 		*lbns_remfree++ = lbn;
357 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
358 		nbp->b_blkno = fsbtodb(fs, nb);
359 		if (flags & BA_CLRBUF)
360 			vfs_bio_clrbuf(nbp);
361 		if (DOINGSOFTDEP(vp))
362 			softdep_setup_allocindir_page(ip, lbn, bp,
363 			    indirs[i].in_off, nb, 0, nbp);
    		/* Store the data block address in the last indirect block. */
364 		bap[indirs[i].in_off] = nb;
365 		/*
366 		 * If required, write synchronously, otherwise use
367 		 * delayed write.
368 		 */
369 		if (flags & IO_SYNC) {
370 			bwrite(bp);
371 		} else {
372 			if (bp->b_bufsize == fs->fs_bsize)
373 				bp->b_flags |= B_CLUSTEROK;
374 			bdwrite(bp);
375 		}
376 		curthread->td_pflags &= saved_inbdflush;
377 		*bpp = nbp;
378 		return (0);
379 	}
    	/*
    	 * The data block already exists, so the indirect block is no longer
    	 * needed.  With BA_CLRBUF the block is read in, through
    	 * cluster_read() when a sequential count was passed in flags and
    	 * the mount does not have MNT_NOCLUSTERR set; without BA_CLRBUF the
    	 * buffer is only obtained and given its disk address.
    	 */
380 	brelse(bp);
381 	if (flags & BA_CLRBUF) {
382 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
383 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
384 			error = cluster_read(vp, ip->i_size, lbn,
385 			    (int)fs->fs_bsize, NOCRED,
386 			    MAXBSIZE, seqcount, &nbp);
387 		} else {
388 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
389 		}
390 		if (error) {
391 			brelse(nbp);
392 			goto fail;
393 		}
394 	} else {
395 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
396 		nbp->b_blkno = fsbtodb(fs, nb);
397 	}
398 	curthread->td_pflags &= saved_inbdflush;
399 	*bpp = nbp;
400 	return (0);
401 fail:
402 	curthread->td_pflags &= saved_inbdflush;
403 	/*
404 	 * If we have failed to allocate any blocks, simply return the error.
405 	 * This is the usual case and avoids the need to fsync the file.
406 	 */
407 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
408 		return (error);
409 	/*
410 	 * If we have failed part way through block allocation, we
411 	 * have to deallocate any indirect blocks that we have allocated.
412 	 * We have to fsync the file before we start to get rid of all
413 	 * of its dependencies so that we do not leave them dangling.
414 	 * We have to sync it at the end so that the soft updates code
415 	 * does not find any untracked changes. Although this is really
416 	 * slow, running out of disk space is not expected to be a common
417 	 * occurrence. The error return from fsync is ignored as we already
418 	 * have an error to return to the user.
419 	 */
420 	(void) ffs_syncvnode(vp, MNT_WAIT);
    	/*
    	 * Walk the blocks recorded in allociblk[]/lbns[], invalidating any
    	 * buffer still held for them and counting the bytes to give back.
    	 */
421 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
422 	     blkp < allocblk; blkp++, lbns_remfree++) {
423 		/*
424 		 * We shall not leave the freed blocks on the vnode
425 		 * buffer object lists.
426 		 */
427 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
428 		if (bp != NULL) {
429 			bp->b_flags |= (B_INVAL | B_RELBUF);
430 			bp->b_flags &= ~B_ASYNC;
431 			brelse(bp);
432 		}
433 		deallocated += fs->fs_bsize;
434 	}
    	/*
    	 * Clear the pointer that first referenced the new chain: either the
    	 * inode's indirect pointer or the entry written into the existing
    	 * indirect block indirs[unwindidx].
    	 */
435 	if (allocib != NULL) {
436 		*allocib = 0;
437 	} else if (unwindidx >= 0) {
438 		int r;
439 
440 		r = bread(vp, indirs[unwindidx].in_lbn,
441 		    (int)fs->fs_bsize, NOCRED, &bp);
442 		if (r) {
443 			panic("Could not unwind indirect block, error %d", r);
    			/* NOTE(review): not reached if panic() never returns. */
444 			brelse(bp);
445 		} else {
446 			bap = (ufs1_daddr_t *)bp->b_data;
447 			bap[indirs[unwindidx].in_off] = 0;
448 			if (flags & IO_SYNC) {
449 				bwrite(bp);
450 			} else {
451 				if (bp->b_bufsize == fs->fs_bsize)
452 					bp->b_flags |= B_CLUSTEROK;
453 				bdwrite(bp);
454 			}
455 		}
456 	}
457 	if (deallocated) {
458 #ifdef QUOTA
459 		/*
460 		 * Restore user's disk quota because allocation failed.
461 		 */
462 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
463 #endif
464 		dp->di_blocks -= btodb(deallocated);
465 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
466 	}
467 	(void) ffs_syncvnode(vp, MNT_WAIT);
468 	/*
469 	 * After the buffers are invalidated and on-disk pointers are
470 	 * cleared, free the blocks.
471 	 */
472 	for (blkp = allociblk; blkp < allocblk; blkp++) {
473 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
474 		    ip->i_number);
475 	}
476 	return (error);
477 }
478 
479 /*
480  * Balloc defines the structure of file system storage
481  * by allocating the physical blocks on a device given
482  * the inode and the logical block number in a file.
483  * This is the allocation strategy for UFS2. Above is
484  * the allocation strategy for UFS1.
     *
     * The logical block containing startoffset is located and, when it or
     * any indirect block on the way to it is missing, allocated.  With
     * IO_EXT the request addresses the inode's external data blocks
     * (di_extb[]) instead of the file's data blocks.
     *
     *	vp		vnode of the file being written or extended.
     *	startoffset	byte offset; selects the logical block and the
     *			starting offset inside that block.
     *	size		byte count wanted from startoffset; the in-block
     *			offset plus size may not exceed fs_bsize (the
     *			function panics if it does).
     *	cred		credentials handed to the allocator calls.
     *	flags		IO_SYNC, IO_EXT, BA_CLRBUF, BA_METAONLY and the
     *			BA_SEQMASK sequential count are examined here.
     *	bpp		set to NULL on entry; on success receives the buffer
     *			of the data block or, with BA_METAONLY, of the last
     *			indirect block on the path to it.
     *
     * Returns 0 on success, EFBIG for a negative logical block number or
     * for an IO_EXT request at or beyond block NXADDR, or the error
     * reported by the failing lookup, allocation, read or synchronous
     * write.
485  */
486 int
487 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
488     struct ucred *cred, int flags, struct buf **bpp)
489 {
490 	struct inode *ip;
491 	struct ufs2_dinode *dp;
492 	ufs_lbn_t lbn, lastlbn;
493 	struct fs *fs;
494 	struct buf *bp, *nbp;
495 	struct ufsmount *ump;
496 	struct indir indirs[NIADDR + 2];
497 	ufs2_daddr_t nb, newb, *bap, pref;
     	/*
     	 * Unwind bookkeeping for the indirect-block path: allociblk[]
     	 * collects the blocks allocated by this call (allocblk points past
     	 * the last one) and lbns[] the matching logical block numbers
     	 * (lbns_remfree points past the last one).  allocib points at the
     	 * inode's indirect pointer when this call filled it in; otherwise
     	 * unwindidx is the indirs[] index of the first indirect block that
     	 * had a pointer stored into it.
     	 */
498 	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
499 	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
500 	int deallocated, osize, nsize, num, i, error;
501 	int unwindidx = -1;
502 	int saved_inbdflush;
503 
504 	ip = VTOI(vp);
505 	dp = ip->i_din2;
506 	fs = ip->i_fs;
507 	ump = ip->i_ump;
508 	lbn = lblkno(fs, startoffset);
     	/* From here on size is the end offset of the request in its block. */
509 	size = blkoff(fs, startoffset) + size;
510 	if (size > fs->fs_bsize)
511 		panic("ffs_balloc_ufs2: blk too big");
512 	*bpp = NULL;
513 	if (lbn < 0)
514 		return (EFBIG);
515 
516 	/*
517 	 * Check for allocating external data.
518 	 */
519 	if (flags & IO_EXT) {
     		/*
     		 * External data only has the direct pointers di_extb[];
     		 * its buffers are addressed by the negative logical block
     		 * number -1 - lbn and are marked BX_ALTDATA.
     		 */
520 		if (lbn >= NXADDR)
521 			return (EFBIG);
522 		/*
523 		 * If the next write will extend the data into a new block,
524 		 * and the data is currently composed of a fragment
525 		 * this fragment has to be extended to be a full block.
526 		 */
527 		lastlbn = lblkno(fs, dp->di_extsize);
528 		if (lastlbn < lbn) {
529 			nb = lastlbn;
530 			osize = sblksize(fs, dp->di_extsize, nb);
531 			if (osize < fs->fs_bsize && osize > 0) {
     				/*
     				 * NOTE(review): UFS_LOCK() is acquired before
     				 * every ffs_alloc()/ffs_realloccg() call in
     				 * this function and no matching unlock is
     				 * visible here; presumably the allocator
     				 * returns with it released -- confirm.
     				 */
532 				UFS_LOCK(ump);
533 				error = ffs_realloccg(ip, -1 - nb,
534 				    dp->di_extb[nb],
535 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
536 				    &dp->di_extb[0]), osize,
537 				    (int)fs->fs_bsize, cred, &bp);
538 				if (error)
539 					return (error);
540 				if (DOINGSOFTDEP(vp))
541 					softdep_setup_allocext(ip, nb,
542 					    dbtofsb(fs, bp->b_blkno),
543 					    dp->di_extb[nb],
544 					    fs->fs_bsize, osize, bp);
     				/* The external data now ends with block nb. */
545 				dp->di_extsize = smalllblktosize(fs, nb + 1);
546 				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
547 				bp->b_xflags |= BX_ALTDATA;
548 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
549 				if (flags & IO_SYNC)
550 					bwrite(bp);
551 				else
552 					bawrite(bp);
553 			}
554 		}
555 		/*
556 		 * All blocks are direct blocks
557 		 */
558 		if (flags & BA_METAONLY)
559 			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
560 		nb = dp->di_extb[lbn];
     		/*
     		 * Already allocated and di_extsize reaches past this block:
     		 * nothing to allocate, just read the full block.
     		 */
561 		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
562 			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
563 			if (error) {
564 				brelse(bp);
565 				return (error);
566 			}
567 			bp->b_blkno = fsbtodb(fs, nb);
568 			bp->b_xflags |= BX_ALTDATA;
569 			*bpp = bp;
570 			return (0);
571 		}
572 		if (nb != 0) {
573 			/*
574 			 * Consider need to reallocate a fragment.
575 			 */
576 			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
577 			nsize = fragroundup(fs, size);
578 			if (nsize <= osize) {
     				/* Existing fragment is big enough: read it. */
579 				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
580 				if (error) {
581 					brelse(bp);
582 					return (error);
583 				}
584 				bp->b_blkno = fsbtodb(fs, nb);
585 				bp->b_xflags |= BX_ALTDATA;
586 			} else {
     				/* Grow the fragment from osize to nsize. */
587 				UFS_LOCK(ump);
588 				error = ffs_realloccg(ip, -1 - lbn,
589 				    dp->di_extb[lbn],
590 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
591 				    &dp->di_extb[0]), osize, nsize, cred, &bp);
592 				if (error)
593 					return (error);
594 				bp->b_xflags |= BX_ALTDATA;
595 				if (DOINGSOFTDEP(vp))
596 					softdep_setup_allocext(ip, lbn,
597 					    dbtofsb(fs, bp->b_blkno), nb,
598 					    nsize, osize, bp);
599 			}
600 		} else {
     			/*
     			 * Nothing allocated yet.  Allocate only the fragments
     			 * needed when di_extsize ends before the end of this
     			 * block, otherwise a full block.
     			 */
601 			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
602 				nsize = fragroundup(fs, size);
603 			else
604 				nsize = fs->fs_bsize;
605 			UFS_LOCK(ump);
606 			error = ffs_alloc(ip, lbn,
607 			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
608 			   nsize, cred, &newb);
609 			if (error)
610 				return (error);
611 			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
612 			bp->b_blkno = fsbtodb(fs, newb);
613 			bp->b_xflags |= BX_ALTDATA;
614 			if (flags & BA_CLRBUF)
615 				vfs_bio_clrbuf(bp);
616 			if (DOINGSOFTDEP(vp))
617 				softdep_setup_allocext(ip, lbn, newb, 0,
618 				    nsize, 0, bp);
619 		}
     		/* Record the (possibly new) block address in the inode. */
620 		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
621 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
622 		*bpp = bp;
623 		return (0);
624 	}
625 	/*
626 	 * If the next write will extend the file into a new block,
627 	 * and the file is currently composed of a fragment
628 	 * this fragment has to be extended to be a full block.
629 	 */
630 	lastlbn = lblkno(fs, ip->i_size);
631 	if (lastlbn < NDADDR && lastlbn < lbn) {
632 		nb = lastlbn;
633 		osize = blksize(fs, ip, nb);
634 		if (osize < fs->fs_bsize && osize > 0) {
635 			UFS_LOCK(ump);
636 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
637 				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
638 				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
639 				    cred, &bp);
640 			if (error)
641 				return (error);
642 			if (DOINGSOFTDEP(vp))
643 				softdep_setup_allocdirect(ip, nb,
644 				    dbtofsb(fs, bp->b_blkno),
645 				    dp->di_db[nb],
646 				    fs->fs_bsize, osize, bp);
     			/* The file now ends at the end of block nb. */
647 			ip->i_size = smalllblktosize(fs, nb + 1);
648 			dp->di_size = ip->i_size;
649 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
650 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
651 			if (flags & IO_SYNC)
652 				bwrite(bp);
653 			else
654 				bawrite(bp);
655 		}
656 	}
657 	/*
658 	 * The first NDADDR blocks are direct blocks
659 	 */
660 	if (lbn < NDADDR) {
661 		if (flags & BA_METAONLY)
662 			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
663 		nb = dp->di_db[lbn];
     		/*
     		 * Already allocated and i_size reaches past this block:
     		 * nothing to allocate, just read the full block.
     		 */
664 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
665 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
666 			if (error) {
667 				brelse(bp);
668 				return (error);
669 			}
670 			bp->b_blkno = fsbtodb(fs, nb);
671 			*bpp = bp;
672 			return (0);
673 		}
674 		if (nb != 0) {
675 			/*
676 			 * Consider need to reallocate a fragment.
677 			 */
678 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
679 			nsize = fragroundup(fs, size);
680 			if (nsize <= osize) {
     				/* Existing fragment is big enough: read it. */
681 				error = bread(vp, lbn, osize, NOCRED, &bp);
682 				if (error) {
683 					brelse(bp);
684 					return (error);
685 				}
686 				bp->b_blkno = fsbtodb(fs, nb);
687 			} else {
     				/* Grow the fragment from osize to nsize. */
688 				UFS_LOCK(ump);
689 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
690 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
691 				       &dp->di_db[0]), osize, nsize, cred, &bp);
692 				if (error)
693 					return (error);
694 				if (DOINGSOFTDEP(vp))
695 					softdep_setup_allocdirect(ip, lbn,
696 					    dbtofsb(fs, bp->b_blkno), nb,
697 					    nsize, osize, bp);
698 			}
699 		} else {
     			/*
     			 * Nothing allocated yet.  Allocate only the fragments
     			 * needed when i_size ends before the end of this
     			 * block, otherwise a full block.
     			 */
700 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
701 				nsize = fragroundup(fs, size);
702 			else
703 				nsize = fs->fs_bsize;
704 			UFS_LOCK(ump);
705 			error = ffs_alloc(ip, lbn,
706 			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
707 				&dp->di_db[0]), nsize, cred, &newb);
708 			if (error)
709 				return (error);
710 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
711 			bp->b_blkno = fsbtodb(fs, newb);
712 			if (flags & BA_CLRBUF)
713 				vfs_bio_clrbuf(bp);
714 			if (DOINGSOFTDEP(vp))
715 				softdep_setup_allocdirect(ip, lbn, newb, 0,
716 				    nsize, 0, bp);
717 		}
     		/* Record the (possibly new) block address in the inode. */
718 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
719 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
720 		*bpp = bp;
721 		return (0);
722 	}
723 	/*
724 	 * Determine the number of levels of indirection.
725 	 */
726 	pref = 0;
727 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
728 		return(error);
729 #ifdef INVARIANTS
730 	if (num < 1)
731 		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
732 #endif
     	/*
     	 * Build a mask with every bit set except TDP_INBDFLUSH, which is
     	 * set only if the thread already had it.  Each exit path below ANDs
     	 * td_pflags with this mask, which clears TDP_INBDFLUSH again unless
     	 * it was set before this call.
     	 */
733 	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
734 	    TDP_INBDFLUSH);
735 	curthread->td_pflags |= TDP_INBDFLUSH;
736 	/*
737 	 * Fetch the first indirect block allocating if necessary.
738 	 */
     	/* After the decrement the walk below stops once i == num. */
739 	--num;
740 	nb = dp->di_ib[indirs[0].in_off];
741 	allocib = NULL;
742 	allocblk = allociblk;
743 	lbns_remfree = lbns;
744 	if (nb == 0) {
745 		UFS_LOCK(ump);
746 		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
747 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
748 		    cred, &newb)) != 0) {
     			/* Nothing allocated yet, so no unwinding is needed. */
749 			curthread->td_pflags &= saved_inbdflush;
750 			return (error);
751 		}
752 		nb = newb;
     		/* Remember the block and its lbn for the failure path. */
753 		*allocblk++ = nb;
754 		*lbns_remfree++ = indirs[1].in_lbn;
755 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
756 		bp->b_blkno = fsbtodb(fs, nb);
757 		vfs_bio_clrbuf(bp);
758 		if (DOINGSOFTDEP(vp)) {
759 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
760 			    newb, 0, fs->fs_bsize, 0, bp);
761 			bdwrite(bp);
762 		} else {
763 			/*
764 			 * Write synchronously so that indirect blocks
765 			 * never point at garbage.
766 			 */
767 			if (DOINGASYNC(vp))
768 				bdwrite(bp);
769 			else if ((error = bwrite(bp)) != 0)
770 				goto fail;
771 		}
     		/* Only now hook the new indirect block into the inode. */
772 		allocib = &dp->di_ib[indirs[0].in_off];
773 		*allocib = nb;
774 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
775 	}
776 	/*
777 	 * Fetch through the indirect blocks, allocating as necessary.
778 	 */
779 	for (i = 1;;) {
780 		error = bread(vp,
781 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
782 		if (error) {
783 			brelse(bp);
784 			goto fail;
785 		}
786 		bap = (ufs2_daddr_t *)bp->b_data;
787 		nb = bap[indirs[i].in_off];
     		/* Last level reached: bp/bap/nb describe the data pointer. */
788 		if (i == num)
789 			break;
     		/*
     		 * i is advanced before the allocation code, so from here on
     		 * indirs[i - 1] is the block just read (bp) and indirs[i]
     		 * the next-level indirect block.
     		 */
790 		i += 1;
791 		if (nb != 0) {
792 			bqrelse(bp);
793 			continue;
794 		}
795 		UFS_LOCK(ump);
796 		if (pref == 0)
797 			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
798 		if ((error =
799 		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
800 			brelse(bp);
801 			goto fail;
802 		}
803 		nb = newb;
804 		*allocblk++ = nb;
805 		*lbns_remfree++ = indirs[i].in_lbn;
806 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
807 		nbp->b_blkno = fsbtodb(fs, nb);
808 		vfs_bio_clrbuf(nbp);
809 		if (DOINGSOFTDEP(vp)) {
810 			softdep_setup_allocindir_meta(nbp, ip, bp,
811 			    indirs[i - 1].in_off, nb);
812 			bdwrite(nbp);
813 		} else {
814 			/*
815 			 * Write synchronously so that indirect blocks
816 			 * never point at garbage.
817 			 */
818 			if ((error = bwrite(nbp)) != 0) {
819 				brelse(bp);
820 				goto fail;
821 			}
822 		}
     		/* Point the parent indirect block at the new child. */
823 		bap[indirs[i - 1].in_off] = nb;
     		/*
     		 * Remember the first parent modified here, but only when the
     		 * inode pointer itself was not set by this call (that case
     		 * is unwound through allocib instead).
     		 */
824 		if (allocib == NULL && unwindidx < 0)
825 			unwindidx = i - 1;
826 		/*
827 		 * If required, write synchronously, otherwise use
828 		 * delayed write.
829 		 */
830 		if (flags & IO_SYNC) {
831 			bwrite(bp);
832 		} else {
833 			if (bp->b_bufsize == fs->fs_bsize)
834 				bp->b_flags |= B_CLUSTEROK;
835 			bdwrite(bp);
836 		}
837 	}
838 	/*
839 	 * If asked only for the indirect block, then return it.
840 	 */
841 	if (flags & BA_METAONLY) {
842 		curthread->td_pflags &= saved_inbdflush;
843 		*bpp = bp;
844 		return (0);
845 	}
846 	/*
847 	 * Get the data block, allocating if necessary.
848 	 */
849 	if (nb == 0) {
850 		UFS_LOCK(ump);
851 		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
852 		error = ffs_alloc(ip,
853 		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
854 		if (error) {
855 			brelse(bp);
856 			goto fail;
857 		}
858 		nb = newb;
859 		*allocblk++ = nb;
860 		*lbns_remfree++ = lbn;
861 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
862 		nbp->b_blkno = fsbtodb(fs, nb);
863 		if (flags & BA_CLRBUF)
864 			vfs_bio_clrbuf(nbp);
865 		if (DOINGSOFTDEP(vp))
866 			softdep_setup_allocindir_page(ip, lbn, bp,
867 			    indirs[i].in_off, nb, 0, nbp);
     		/* Store the data block address in the last indirect block. */
868 		bap[indirs[i].in_off] = nb;
869 		/*
870 		 * If required, write synchronously, otherwise use
871 		 * delayed write.
872 		 */
873 		if (flags & IO_SYNC) {
874 			bwrite(bp);
875 		} else {
876 			if (bp->b_bufsize == fs->fs_bsize)
877 				bp->b_flags |= B_CLUSTEROK;
878 			bdwrite(bp);
879 		}
880 		curthread->td_pflags &= saved_inbdflush;
881 		*bpp = nbp;
882 		return (0);
883 	}
     	/* The data block already exists; the indirect block is done with. */
884 	brelse(bp);
885 	/*
886 	 * If requested clear invalid portions of the buffer.  If we
887 	 * have to do a read-before-write (typical if BA_CLRBUF is set),
888 	 * try to do some read-ahead in the sequential case to reduce
889 	 * the number of I/O transactions.
890 	 */
891 	if (flags & BA_CLRBUF) {
892 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
893 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
894 			error = cluster_read(vp, ip->i_size, lbn,
895 			    (int)fs->fs_bsize, NOCRED,
896 			    MAXBSIZE, seqcount, &nbp);
897 		} else {
898 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
899 		}
900 		if (error) {
901 			brelse(nbp);
902 			goto fail;
903 		}
904 	} else {
905 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
906 		nbp->b_blkno = fsbtodb(fs, nb);
907 	}
908 	curthread->td_pflags &= saved_inbdflush;
909 	*bpp = nbp;
910 	return (0);
911 fail:
912 	curthread->td_pflags &= saved_inbdflush;
913 	/*
914 	 * If we have failed to allocate any blocks, simply return the error.
915 	 * This is the usual case and avoids the need to fsync the file.
916 	 */
917 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
918 		return (error);
919 	/*
920 	 * If we have failed part way through block allocation, we
921 	 * have to deallocate any indirect blocks that we have allocated.
922 	 * We have to fsync the file before we start to get rid of all
923 	 * of its dependencies so that we do not leave them dangling.
924 	 * We have to sync it at the end so that the soft updates code
925 	 * does not find any untracked changes. Although this is really
926 	 * slow, running out of disk space is not expected to be a common
927 	 * occurrence. The error return from fsync is ignored as we already
928 	 * have an error to return to the user.
929 	 */
930 	(void) ffs_syncvnode(vp, MNT_WAIT);
     	/*
     	 * Walk the blocks recorded in allociblk[]/lbns[], invalidating any
     	 * buffer still held for them and counting the bytes to give back.
     	 */
931 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
932 	     blkp < allocblk; blkp++, lbns_remfree++) {
933 		/*
934 		 * We shall not leave the freed blocks on the vnode
935 		 * buffer object lists.
936 		 */
937 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
938 		if (bp != NULL) {
939 			bp->b_flags |= (B_INVAL | B_RELBUF);
940 			bp->b_flags &= ~B_ASYNC;
941 			brelse(bp);
942 		}
943 		deallocated += fs->fs_bsize;
944 	}
     	/*
     	 * Clear the pointer that first referenced the new chain: either the
     	 * inode's indirect pointer or the entry written into the existing
     	 * indirect block indirs[unwindidx].
     	 */
945 	if (allocib != NULL) {
946 		*allocib = 0;
947 	} else if (unwindidx >= 0) {
948 		int r;
949 
950 		r = bread(vp, indirs[unwindidx].in_lbn,
951 		    (int)fs->fs_bsize, NOCRED, &bp);
952 		if (r) {
953 			panic("Could not unwind indirect block, error %d", r);
     			/* NOTE(review): not reached if panic() never returns. */
954 			brelse(bp);
955 		} else {
956 			bap = (ufs2_daddr_t *)bp->b_data;
957 			bap[indirs[unwindidx].in_off] = 0;
958 			if (flags & IO_SYNC) {
959 				bwrite(bp);
960 			} else {
961 				if (bp->b_bufsize == fs->fs_bsize)
962 					bp->b_flags |= B_CLUSTEROK;
963 				bdwrite(bp);
964 			}
965 		}
966 	}
967 	if (deallocated) {
968 #ifdef QUOTA
969 		/*
970 		 * Restore user's disk quota because allocation failed.
971 		 */
972 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
973 #endif
974 		dp->di_blocks -= btodb(deallocated);
975 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
976 	}
977 	(void) ffs_syncvnode(vp, MNT_WAIT);
978 	/*
979 	 * After the buffers are invalidated and on-disk pointers are
980 	 * cleared, free the blocks.
981 	 */
982 	for (blkp = allociblk; blkp < allocblk; blkp++) {
983 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
984 		    ip->i_number);
985 	}
986 	return (error);
987 }
988