xref: /freebsd/sys/ufs/ffs/ffs_balloc.c (revision e0c27215058b5786c78fcfb3963eebe61a989511)
1 /*
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Copyright (c) 1982, 1986, 1989, 1993
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
43  */
44 
45 #include <sys/cdefs.h>
46 __FBSDID("$FreeBSD$");
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/bio.h>
51 #include <sys/buf.h>
52 #include <sys/lock.h>
53 #include <sys/mount.h>
54 #include <sys/vnode.h>
55 
56 #include <ufs/ufs/quota.h>
57 #include <ufs/ufs/inode.h>
58 #include <ufs/ufs/ufs_extern.h>
59 
60 #include <ufs/ffs/fs.h>
61 #include <ufs/ffs/ffs_extern.h>
62 
63 /*
64  * Balloc defines the structure of filesystem storage
65  * by allocating the physical blocks on a device given
66  * the inode and the logical block number in a file.
67  * This is the allocation strategy for UFS1. Below is
68  * the allocation strategy for UFS2.
69  */
/*
 * Returns 0 with *bpp pointing at a locked buffer for the requested
 * data block (or for the final indirect block when BA_METAONLY is
 * set), or an errno on failure.  On failure, any blocks allocated on
 * the way down are freed and the on-disk indirect pointers are
 * unwound (see the "fail:" label).  IO_EXT is rejected here: the
 * UFS1 path has no external-attribute area (contrast ffs_balloc_ufs2
 * below, which services IO_EXT requests).
 */
70 int
71 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
72     struct ucred *cred, int flags, struct buf **bpp)
73 {
74 	struct inode *ip;
75 	struct ufs1_dinode *dp;
76 	ufs_lbn_t lbn, lastlbn;
77 	struct fs *fs;
78 	ufs1_daddr_t nb;
79 	struct buf *bp, *nbp;
80 	struct indir indirs[NIADDR + 2];
81 	int deallocated, osize, nsize, num, i, error;
82 	ufs2_daddr_t newb;
83 	ufs1_daddr_t *bap, pref;
84 	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
85 	int unwindidx = -1;
86 	struct thread *td = curthread;	/* XXX */
87 
88 	ip = VTOI(vp);
89 	dp = ip->i_din1;
90 	fs = ip->i_fs;
91 	lbn = lblkno(fs, startoffset);
	/* "size" becomes the byte extent of the write within its block. */
92 	size = blkoff(fs, startoffset) + size;
93 	if (size > fs->fs_bsize)
94 		panic("ffs_balloc_ufs1: blk too big");
95 	*bpp = NULL;
96 	if (flags & IO_EXT)
97 		return (EOPNOTSUPP);
98 	if (lbn < 0)
99 		return (EFBIG);
100 
101 	/*
102 	 * If the next write will extend the file into a new block,
103 	 * and the file is currently composed of a fragment
104 	 * this fragment has to be extended to be a full block.
105 	 */
106 	lastlbn = lblkno(fs, ip->i_size);
107 	if (lastlbn < NDADDR && lastlbn < lbn) {
108 		nb = lastlbn;
109 		osize = blksize(fs, ip, nb);
110 		if (osize < fs->fs_bsize && osize > 0) {
111 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
112 			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
113 			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
114 			if (error)
115 				return (error);
116 			if (DOINGSOFTDEP(vp))
117 				softdep_setup_allocdirect(ip, nb,
118 				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
119 				    fs->fs_bsize, osize, bp);
120 			ip->i_size = smalllblktosize(fs, nb + 1);
121 			dp->di_size = ip->i_size;
122 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
123 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
124 			if (flags & IO_SYNC)
125 				bwrite(bp);
126 			else
127 				bawrite(bp);
128 		}
129 	}
130 	/*
131 	 * The first NDADDR blocks are direct blocks
132 	 */
133 	if (lbn < NDADDR) {
134 		if (flags & BA_METAONLY)
135 			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
136 		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and fully covered by the file
		 * size: just read it in and hand it back.
		 */
137 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
138 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
139 			if (error) {
140 				brelse(bp);
141 				return (error);
142 			}
143 			bp->b_blkno = fsbtodb(fs, nb);
144 			*bpp = bp;
145 			return (0);
146 		}
147 		if (nb != 0) {
148 			/*
149 			 * Consider need to reallocate a fragment.
150 			 */
151 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
152 			nsize = fragroundup(fs, size);
153 			if (nsize <= osize) {
154 				error = bread(vp, lbn, osize, NOCRED, &bp);
155 				if (error) {
156 					brelse(bp);
157 					return (error);
158 				}
159 				bp->b_blkno = fsbtodb(fs, nb);
160 			} else {
161 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
162 				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
163 				    &dp->di_db[0]), osize, nsize, cred, &bp);
164 				if (error)
165 					return (error);
166 				if (DOINGSOFTDEP(vp))
167 					softdep_setup_allocdirect(ip, lbn,
168 					    dbtofsb(fs, bp->b_blkno), nb,
169 					    nsize, osize, bp);
170 			}
171 		} else {
			/*
			 * No block yet: allocate a fragment if the write
			 * stays within the last partial block, otherwise
			 * a full block.
			 */
172 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
173 				nsize = fragroundup(fs, size);
174 			else
175 				nsize = fs->fs_bsize;
176 			error = ffs_alloc(ip, lbn,
177 			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
178 			    nsize, cred, &newb);
179 			if (error)
180 				return (error);
181 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
182 			bp->b_blkno = fsbtodb(fs, newb);
183 			if (flags & BA_CLRBUF)
184 				vfs_bio_clrbuf(bp);
185 			if (DOINGSOFTDEP(vp))
186 				softdep_setup_allocdirect(ip, lbn, newb, 0,
187 				    nsize, 0, bp);
188 		}
189 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
190 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
191 		*bpp = bp;
192 		return (0);
193 	}
194 	/*
195 	 * Determine the number of levels of indirection.
196 	 */
197 	pref = 0;
198 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
199 		return(error);
200 #ifdef DIAGNOSTIC
201 	if (num < 1)
202 		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
203 #endif
204 	/*
205 	 * Fetch the first indirect block allocating if necessary.
206 	 */
	/*
	 * After the decrement, indirs[num] names the data block's slot
	 * in the deepest indirect block.  allociblk[] records every
	 * block allocated below so the "fail:" path can free them;
	 * allocib/unwindidx remember which on-disk indirect pointer
	 * must be cleared if we have to unwind.
	 */
207 	--num;
208 	nb = dp->di_ib[indirs[0].in_off];
209 	allocib = NULL;
210 	allocblk = allociblk;
211 	if (nb == 0) {
212 		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
213 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
214 		    cred, &newb)) != 0)
215 			return (error);
216 		nb = newb;
217 		*allocblk++ = nb;
218 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
219 		bp->b_blkno = fsbtodb(fs, nb);
220 		vfs_bio_clrbuf(bp);
221 		if (DOINGSOFTDEP(vp)) {
222 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
223 			    newb, 0, fs->fs_bsize, 0, bp);
224 			bdwrite(bp);
225 		} else {
226 			/*
227 			 * Write synchronously so that indirect blocks
228 			 * never point at garbage.
229 			 */
230 			if (DOINGASYNC(vp))
231 				bdwrite(bp);
232 			else if ((error = bwrite(bp)) != 0)
233 				goto fail;
234 		}
235 		allocib = &dp->di_ib[indirs[0].in_off];
236 		*allocib = nb;
237 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
238 	}
239 	/*
240 	 * Fetch through the indirect blocks, allocating as necessary.
241 	 */
242 	for (i = 1;;) {
243 		error = bread(vp,
244 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
245 		if (error) {
246 			brelse(bp);
247 			goto fail;
248 		}
249 		bap = (ufs1_daddr_t *)bp->b_data;
250 		nb = bap[indirs[i].in_off];
		/* At the deepest level "nb" is the data block itself. */
251 		if (i == num)
252 			break;
253 		i += 1;
254 		if (nb != 0) {
255 			bqrelse(bp);
256 			continue;
257 		}
258 		if (pref == 0)
259 			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
260 		if ((error =
261 		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
262 			brelse(bp);
263 			goto fail;
264 		}
265 		nb = newb;
266 		*allocblk++ = nb;
267 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
268 		nbp->b_blkno = fsbtodb(fs, nb);
269 		vfs_bio_clrbuf(nbp);
270 		if (DOINGSOFTDEP(vp)) {
271 			softdep_setup_allocindir_meta(nbp, ip, bp,
272 			    indirs[i - 1].in_off, nb);
273 			bdwrite(nbp);
274 		} else {
275 			/*
276 			 * Write synchronously so that indirect blocks
277 			 * never point at garbage.
278 			 */
279 			if ((error = bwrite(nbp)) != 0) {
280 				brelse(bp);
281 				goto fail;
282 			}
283 		}
		/* Hook the new block into its parent indirect block. */
284 		bap[indirs[i - 1].in_off] = nb;
285 		if (allocib == NULL && unwindidx < 0)
286 			unwindidx = i - 1;
287 		/*
288 		 * If required, write synchronously, otherwise use
289 		 * delayed write.
290 		 */
291 		if (flags & IO_SYNC) {
292 			bwrite(bp);
293 		} else {
294 			if (bp->b_bufsize == fs->fs_bsize)
295 				bp->b_flags |= B_CLUSTEROK;
296 			bdwrite(bp);
297 		}
298 	}
299 	/*
300 	 * If asked only for the indirect block, then return it.
301 	 */
302 	if (flags & BA_METAONLY) {
303 		*bpp = bp;
304 		return (0);
305 	}
306 	/*
307 	 * Get the data block, allocating if necessary.
308 	 */
309 	if (nb == 0) {
310 		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
311 		error = ffs_alloc(ip,
312 		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
313 		if (error) {
314 			brelse(bp);
315 			goto fail;
316 		}
317 		nb = newb;
318 		*allocblk++ = nb;
319 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
320 		nbp->b_blkno = fsbtodb(fs, nb);
321 		if (flags & BA_CLRBUF)
322 			vfs_bio_clrbuf(nbp);
323 		if (DOINGSOFTDEP(vp))
324 			softdep_setup_allocindir_page(ip, lbn, bp,
325 			    indirs[i].in_off, nb, 0, nbp);
326 		bap[indirs[i].in_off] = nb;
327 		/*
328 		 * If required, write synchronously, otherwise use
329 		 * delayed write.
330 		 */
331 		if (flags & IO_SYNC) {
332 			bwrite(bp);
333 		} else {
334 			if (bp->b_bufsize == fs->fs_bsize)
335 				bp->b_flags |= B_CLUSTEROK;
336 			bdwrite(bp);
337 		}
338 		*bpp = nbp;
339 		return (0);
340 	}
341 	brelse(bp);
	/*
	 * Existing data block: read it in, clustering the read when a
	 * sequential access hint was passed in the flags.
	 */
342 	if (flags & BA_CLRBUF) {
343 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
344 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
345 			error = cluster_read(vp, ip->i_size, lbn,
346 			    (int)fs->fs_bsize, NOCRED,
347 			    MAXBSIZE, seqcount, &nbp);
348 		} else {
349 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
350 		}
351 		if (error) {
352 			brelse(nbp);
353 			goto fail;
354 		}
355 	} else {
356 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
357 		nbp->b_blkno = fsbtodb(fs, nb);
358 	}
359 	*bpp = nbp;
360 	return (0);
361 fail:
362 	/*
363 	 * If we have failed to allocate any blocks, simply return the error.
364 	 * This is the usual case and avoids the need to fsync the file.
365 	 */
366 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
367 		return (error);
368 	/*
369 	 * If we have failed part way through block allocation, we
370 	 * have to deallocate any indirect blocks that we have allocated.
371 	 * We have to fsync the file before we start to get rid of all
372 	 * of its dependencies so that we do not leave them dangling.
373 	 * We have to sync it at the end so that the soft updates code
374 	 * does not find any untracked changes. Although this is really
375 	 * slow, running out of disk space is not expected to be a common
376 	 * occurrence. The error return from fsync is ignored as we already
377 	 * have an error to return to the user.
378 	 */
379 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
380 	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
381 		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
382 		deallocated += fs->fs_bsize;
383 	}
384 	if (allocib != NULL) {
385 		*allocib = 0;
386 	} else if (unwindidx >= 0) {
387 		int r;
388 
389 		r = bread(vp, indirs[unwindidx].in_lbn,
390 		    (int)fs->fs_bsize, NOCRED, &bp);
391 		if (r) {
392 			panic("Could not unwind indirect block, error %d", r);
			/* NOTE(review): unreachable after panic(); kept defensively. */
393 			brelse(bp);
394 		} else {
395 			bap = (ufs1_daddr_t *)bp->b_data;
396 			bap[indirs[unwindidx].in_off] = 0;
397 			if (flags & IO_SYNC) {
398 				bwrite(bp);
399 			} else {
400 				if (bp->b_bufsize == fs->fs_bsize)
401 					bp->b_flags |= B_CLUSTEROK;
402 				bdwrite(bp);
403 			}
404 		}
405 	}
406 	if (deallocated) {
407 #ifdef QUOTA
408 		/*
409 		 * Restore user's disk quota because allocation failed.
410 		 */
411 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
412 #endif
413 		dp->di_blocks -= btodb(deallocated);
414 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
415 	}
416 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
417 	return (error);
418 }
419 
420 /*
421  * Balloc defines the structure of file system storage
422  * by allocating the physical blocks on a device given
423  * the inode and the logical block number in a file.
424  * This is the allocation strategy for UFS2. Above is
425  * the allocation strategy for UFS1.
426  */
/*
 * Returns 0 with *bpp pointing at a locked buffer for the requested
 * block (or for the final indirect block when BA_METAONLY is set),
 * or an errno on failure.  With IO_EXT the allocation is made in the
 * inode's external-attribute area: those blocks are addressed with
 * negative logical block numbers (-1 - lbn) and the buffers are
 * tagged BX_ALTDATA.  On failure, any blocks allocated on the way
 * down are freed and the on-disk indirect pointers are unwound (see
 * the "fail:" label).
 */
427 int
428 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
429     struct ucred *cred, int flags, struct buf **bpp)
430 {
431 	struct inode *ip;
432 	struct ufs2_dinode *dp;
433 	ufs_lbn_t lbn, lastlbn;
434 	struct fs *fs;
435 	struct buf *bp, *nbp;
436 	struct indir indirs[NIADDR + 2];
437 	ufs2_daddr_t nb, newb, *bap, pref;
438 	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
439 	int deallocated, osize, nsize, num, i, error;
440 	int unwindidx = -1;
441 	struct thread *td = curthread;	/* XXX */
442 
443 	ip = VTOI(vp);
444 	dp = ip->i_din2;
445 	fs = ip->i_fs;
446 	lbn = lblkno(fs, startoffset);
	/* "size" becomes the byte extent of the write within its block. */
447 	size = blkoff(fs, startoffset) + size;
448 	if (size > fs->fs_bsize)
449 		panic("ffs_balloc_ufs2: blk too big");
450 	*bpp = NULL;
451 	if (lbn < 0)
452 		return (EFBIG);
453 
454 	/*
455 	 * Check for allocating external data.
456 	 */
457 	if (flags & IO_EXT) {
458 		if (lbn >= NXADDR)
459 			return (EFBIG);
460 		/*
461 		 * If the next write will extend the data into a new block,
462 		 * and the data is currently composed of a fragment
463 		 * this fragment has to be extended to be a full block.
464 		 */
465 		lastlbn = lblkno(fs, dp->di_extsize);
466 		if (lastlbn < lbn) {
467 			nb = lastlbn;
468 			osize = sblksize(fs, dp->di_extsize, nb);
469 			if (osize < fs->fs_bsize && osize > 0) {
470 				error = ffs_realloccg(ip, -1 - nb,
471 				    dp->di_extb[nb],
472 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
473 				    &dp->di_extb[0]), osize,
474 				    (int)fs->fs_bsize, cred, &bp);
475 				if (error)
476 					return (error);
477 				if (DOINGSOFTDEP(vp))
478 					softdep_setup_allocext(ip, nb,
479 					    dbtofsb(fs, bp->b_blkno),
480 					    dp->di_extb[nb],
481 					    fs->fs_bsize, osize, bp);
482 				dp->di_extsize = smalllblktosize(fs, nb + 1);
483 				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
484 				bp->b_xflags |= BX_ALTDATA;
485 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
486 				if (flags & IO_SYNC)
487 					bwrite(bp);
488 				else
489 					bawrite(bp);
490 			}
491 		}
492 		/*
493 		 * All blocks are direct blocks
494 		 */
495 		if (flags & BA_METAONLY)
496 			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
497 		nb = dp->di_extb[lbn];
498 		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
499 			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
500 			if (error) {
501 				brelse(bp);
502 				return (error);
503 			}
504 			bp->b_blkno = fsbtodb(fs, nb);
505 			bp->b_xflags |= BX_ALTDATA;
506 			*bpp = bp;
507 			return (0);
508 		}
509 		if (nb != 0) {
510 			/*
511 			 * Consider need to reallocate a fragment.
512 			 */
513 			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
514 			nsize = fragroundup(fs, size);
515 			if (nsize <= osize) {
516 				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
517 				if (error) {
518 					brelse(bp);
519 					return (error);
520 				}
521 				bp->b_blkno = fsbtodb(fs, nb);
522 				bp->b_xflags |= BX_ALTDATA;
523 			} else {
524 				error = ffs_realloccg(ip, -1 - lbn,
525 				    dp->di_extb[lbn],
526 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
527 				    &dp->di_extb[0]), osize, nsize, cred, &bp);
528 				if (error)
529 					return (error);
530 				bp->b_xflags |= BX_ALTDATA;
531 				if (DOINGSOFTDEP(vp))
532 					softdep_setup_allocext(ip, lbn,
533 					    dbtofsb(fs, bp->b_blkno), nb,
534 					    nsize, osize, bp);
535 			}
536 		} else {
			/*
			 * No ext block yet: allocate a fragment if the
			 * write stays within the last partial block,
			 * otherwise a full block.
			 */
537 			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
538 				nsize = fragroundup(fs, size);
539 			else
540 				nsize = fs->fs_bsize;
541 			error = ffs_alloc(ip, lbn,
542 			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
543 			   nsize, cred, &newb);
544 			if (error)
545 				return (error);
546 			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
547 			bp->b_blkno = fsbtodb(fs, newb);
548 			bp->b_xflags |= BX_ALTDATA;
549 			if (flags & BA_CLRBUF)
550 				vfs_bio_clrbuf(bp);
551 			if (DOINGSOFTDEP(vp))
552 				softdep_setup_allocext(ip, lbn, newb, 0,
553 				    nsize, 0, bp);
554 		}
555 		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
556 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
557 		*bpp = bp;
558 		return (0);
559 	}
560 	/*
561 	 * If the next write will extend the file into a new block,
562 	 * and the file is currently composed of a fragment
563 	 * this fragment has to be extended to be a full block.
564 	 */
565 	lastlbn = lblkno(fs, ip->i_size);
566 	if (lastlbn < NDADDR && lastlbn < lbn) {
567 		nb = lastlbn;
568 		osize = blksize(fs, ip, nb);
569 		if (osize < fs->fs_bsize && osize > 0) {
570 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
571 				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
572 				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
573 				    cred, &bp);
574 			if (error)
575 				return (error);
576 			if (DOINGSOFTDEP(vp))
577 				softdep_setup_allocdirect(ip, nb,
578 				    dbtofsb(fs, bp->b_blkno),
579 				    dp->di_db[nb],
580 				    fs->fs_bsize, osize, bp);
581 			ip->i_size = smalllblktosize(fs, nb + 1);
582 			dp->di_size = ip->i_size;
583 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
584 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
585 			if (flags & IO_SYNC)
586 				bwrite(bp);
587 			else
588 				bawrite(bp);
589 		}
590 	}
591 	/*
592 	 * The first NDADDR blocks are direct blocks
593 	 */
594 	if (lbn < NDADDR) {
595 		if (flags & BA_METAONLY)
596 			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
597 		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and fully covered by the file
		 * size: just read it in and hand it back.
		 */
598 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
599 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
600 			if (error) {
601 				brelse(bp);
602 				return (error);
603 			}
604 			bp->b_blkno = fsbtodb(fs, nb);
605 			*bpp = bp;
606 			return (0);
607 		}
608 		if (nb != 0) {
609 			/*
610 			 * Consider need to reallocate a fragment.
611 			 */
612 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
613 			nsize = fragroundup(fs, size);
614 			if (nsize <= osize) {
615 				error = bread(vp, lbn, osize, NOCRED, &bp);
616 				if (error) {
617 					brelse(bp);
618 					return (error);
619 				}
620 				bp->b_blkno = fsbtodb(fs, nb);
621 			} else {
622 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
623 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
624 				       &dp->di_db[0]), osize, nsize, cred, &bp);
625 				if (error)
626 					return (error);
627 				if (DOINGSOFTDEP(vp))
628 					softdep_setup_allocdirect(ip, lbn,
629 					    dbtofsb(fs, bp->b_blkno), nb,
630 					    nsize, osize, bp);
631 			}
632 		} else {
			/*
			 * No block yet: allocate a fragment if the write
			 * stays within the last partial block, otherwise
			 * a full block.
			 */
633 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
634 				nsize = fragroundup(fs, size);
635 			else
636 				nsize = fs->fs_bsize;
637 			error = ffs_alloc(ip, lbn,
638 			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
639 				&dp->di_db[0]), nsize, cred, &newb);
640 			if (error)
641 				return (error);
642 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
643 			bp->b_blkno = fsbtodb(fs, newb);
644 			if (flags & BA_CLRBUF)
645 				vfs_bio_clrbuf(bp);
646 			if (DOINGSOFTDEP(vp))
647 				softdep_setup_allocdirect(ip, lbn, newb, 0,
648 				    nsize, 0, bp);
649 		}
650 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
651 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
652 		*bpp = bp;
653 		return (0);
654 	}
655 	/*
656 	 * Determine the number of levels of indirection.
657 	 */
658 	pref = 0;
659 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
660 		return(error);
661 #ifdef DIAGNOSTIC
662 	if (num < 1)
663 		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
664 #endif
665 	/*
666 	 * Fetch the first indirect block allocating if necessary.
667 	 */
	/*
	 * After the decrement, indirs[num] names the data block's slot
	 * in the deepest indirect block.  allociblk[] records every
	 * block allocated below so the "fail:" path can free them;
	 * allocib/unwindidx remember which on-disk indirect pointer
	 * must be cleared if we have to unwind.
	 */
668 	--num;
669 	nb = dp->di_ib[indirs[0].in_off];
670 	allocib = NULL;
671 	allocblk = allociblk;
672 	if (nb == 0) {
673 		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
674 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
675 		    cred, &newb)) != 0)
676 			return (error);
677 		nb = newb;
678 		*allocblk++ = nb;
679 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
680 		bp->b_blkno = fsbtodb(fs, nb);
681 		vfs_bio_clrbuf(bp);
682 		if (DOINGSOFTDEP(vp)) {
683 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
684 			    newb, 0, fs->fs_bsize, 0, bp);
685 			bdwrite(bp);
686 		} else {
687 			/*
688 			 * Write synchronously so that indirect blocks
689 			 * never point at garbage.
690 			 */
691 			if (DOINGASYNC(vp))
692 				bdwrite(bp);
693 			else if ((error = bwrite(bp)) != 0)
694 				goto fail;
695 		}
696 		allocib = &dp->di_ib[indirs[0].in_off];
697 		*allocib = nb;
698 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
699 	}
700 	/*
701 	 * Fetch through the indirect blocks, allocating as necessary.
702 	 */
703 	for (i = 1;;) {
704 		error = bread(vp,
705 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
706 		if (error) {
707 			brelse(bp);
708 			goto fail;
709 		}
710 		bap = (ufs2_daddr_t *)bp->b_data;
711 		nb = bap[indirs[i].in_off];
		/* At the deepest level "nb" is the data block itself. */
712 		if (i == num)
713 			break;
714 		i += 1;
715 		if (nb != 0) {
716 			bqrelse(bp);
717 			continue;
718 		}
719 		if (pref == 0)
720 			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
721 		if ((error =
722 		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
723 			brelse(bp);
724 			goto fail;
725 		}
726 		nb = newb;
727 		*allocblk++ = nb;
728 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
729 		nbp->b_blkno = fsbtodb(fs, nb);
730 		vfs_bio_clrbuf(nbp);
731 		if (DOINGSOFTDEP(vp)) {
732 			softdep_setup_allocindir_meta(nbp, ip, bp,
733 			    indirs[i - 1].in_off, nb);
734 			bdwrite(nbp);
735 		} else {
736 			/*
737 			 * Write synchronously so that indirect blocks
738 			 * never point at garbage.
739 			 */
740 			if ((error = bwrite(nbp)) != 0) {
741 				brelse(bp);
742 				goto fail;
743 			}
744 		}
		/* Hook the new block into its parent indirect block. */
745 		bap[indirs[i - 1].in_off] = nb;
746 		if (allocib == NULL && unwindidx < 0)
747 			unwindidx = i - 1;
748 		/*
749 		 * If required, write synchronously, otherwise use
750 		 * delayed write.
751 		 */
752 		if (flags & IO_SYNC) {
753 			bwrite(bp);
754 		} else {
755 			if (bp->b_bufsize == fs->fs_bsize)
756 				bp->b_flags |= B_CLUSTEROK;
757 			bdwrite(bp);
758 		}
759 	}
760 	/*
761 	 * If asked only for the indirect block, then return it.
762 	 */
763 	if (flags & BA_METAONLY) {
764 		*bpp = bp;
765 		return (0);
766 	}
767 	/*
768 	 * Get the data block, allocating if necessary.
769 	 */
770 	if (nb == 0) {
771 		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
772 		error = ffs_alloc(ip,
773 		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
774 		if (error) {
775 			brelse(bp);
776 			goto fail;
777 		}
778 		nb = newb;
779 		*allocblk++ = nb;
780 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
781 		nbp->b_blkno = fsbtodb(fs, nb);
782 		if (flags & BA_CLRBUF)
783 			vfs_bio_clrbuf(nbp);
784 		if (DOINGSOFTDEP(vp))
785 			softdep_setup_allocindir_page(ip, lbn, bp,
786 			    indirs[i].in_off, nb, 0, nbp);
787 		bap[indirs[i].in_off] = nb;
788 		/*
789 		 * If required, write synchronously, otherwise use
790 		 * delayed write.
791 		 */
792 		if (flags & IO_SYNC) {
793 			bwrite(bp);
794 		} else {
795 			if (bp->b_bufsize == fs->fs_bsize)
796 				bp->b_flags |= B_CLUSTEROK;
797 			bdwrite(bp);
798 		}
799 		*bpp = nbp;
800 		return (0);
801 	}
802 	brelse(bp);
803 	/*
804 	 * If requested clear invalid portions of the buffer.  If we
805 	 * have to do a read-before-write (typical if BA_CLRBUF is set),
806 	 * try to do some read-ahead in the sequential case to reduce
807 	 * the number of I/O transactions.
808 	 */
809 	if (flags & BA_CLRBUF) {
810 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
811 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
812 			error = cluster_read(vp, ip->i_size, lbn,
813 			    (int)fs->fs_bsize, NOCRED,
814 			    MAXBSIZE, seqcount, &nbp);
815 		} else {
816 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
817 		}
818 		if (error) {
819 			brelse(nbp);
820 			goto fail;
821 		}
822 	} else {
823 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
824 		nbp->b_blkno = fsbtodb(fs, nb);
825 	}
826 	*bpp = nbp;
827 	return (0);
828 fail:
829 	/*
830 	 * If we have failed to allocate any blocks, simply return the error.
831 	 * This is the usual case and avoids the need to fsync the file.
832 	 */
833 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
834 		return (error);
835 	/*
836 	 * If we have failed part way through block allocation, we
837 	 * have to deallocate any indirect blocks that we have allocated.
838 	 * We have to fsync the file before we start to get rid of all
839 	 * of its dependencies so that we do not leave them dangling.
840 	 * We have to sync it at the end so that the soft updates code
841 	 * does not find any untracked changes. Although this is really
842 	 * slow, running out of disk space is not expected to be a common
843 	 * occurrence. The error return from fsync is ignored as we already
844 	 * have an error to return to the user.
845 	 */
846 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
847 	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
848 		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
849 		deallocated += fs->fs_bsize;
850 	}
851 	if (allocib != NULL) {
852 		*allocib = 0;
853 	} else if (unwindidx >= 0) {
854 		int r;
855 
856 		r = bread(vp, indirs[unwindidx].in_lbn,
857 		    (int)fs->fs_bsize, NOCRED, &bp);
858 		if (r) {
859 			panic("Could not unwind indirect block, error %d", r);
			/* NOTE(review): unreachable after panic(); kept defensively. */
860 			brelse(bp);
861 		} else {
862 			bap = (ufs2_daddr_t *)bp->b_data;
863 			bap[indirs[unwindidx].in_off] = 0;
864 			if (flags & IO_SYNC) {
865 				bwrite(bp);
866 			} else {
867 				if (bp->b_bufsize == fs->fs_bsize)
868 					bp->b_flags |= B_CLUSTEROK;
869 				bdwrite(bp);
870 			}
871 		}
872 	}
873 	if (deallocated) {
874 #ifdef QUOTA
875 		/*
876 		 * Restore user's disk quota because allocation failed.
877 		 */
878 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
879 #endif
880 		dp->di_blocks -= btodb(deallocated);
881 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
882 	}
883 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
884 	return (error);
885 }
886