xref: /freebsd/sys/ufs/ffs/ffs_balloc.c (revision 4b2eaea43fec8e8792be611dea204071a10b655a)
1 /*
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Copyright (c) 1982, 1989, 1993
12  *	The Regents of the University of California.  All rights reserved.
13  * (c) UNIX System Laboratories, Inc.
14  * Copyright (c) 1982, 1986, 1989, 1993
15  *	The Regents of the University of California.  All rights reserved.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  * 3. All advertising materials mentioning features or use of this software
26  *    must display the following acknowledgement:
27  *	This product includes software developed by the University of
28  *	California, Berkeley and its contributors.
29  * 4. Neither the name of the University nor the names of its contributors
30  *    may be used to endorse or promote products derived from this software
31  *    without specific prior written permission.
32  *
33  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
34  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
37  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
38  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
39  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
40  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
41  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
42  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
43  * SUCH DAMAGE.
44  *
45  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
46  * $FreeBSD$
47  */
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/bio.h>
52 #include <sys/buf.h>
53 #include <sys/lock.h>
54 #include <sys/mount.h>
55 #include <sys/vnode.h>
56 
57 #include <ufs/ufs/quota.h>
58 #include <ufs/ufs/inode.h>
59 #include <ufs/ufs/ufs_extern.h>
60 
61 #include <ufs/ffs/fs.h>
62 #include <ufs/ffs/ffs_extern.h>
63 
64 /*
65  * Balloc defines the structure of filesystem storage
66  * by allocating the physical blocks on a device given
67  * the inode and the logical block number in a file.
68  * This is the allocation strategy for UFS1. Below is
69  * the allocation strategy for UFS2.
 *
 * vp/startoffset/size describe the write that needs backing store;
 * cred is charged for the new allocations; flags carries the IO_SYNC,
 * BA_CLRBUF, BA_METAONLY and BA_SEQMASK bits.  On success 0 is
 * returned and *bpp holds a buffer covering the requested block
 * (or the last indirect block when BA_METAONLY is set).  On failure
 * an errno is returned and every block allocated along the way is
 * released again (see the "fail:" unwind code below).
70  */
71 int
72 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
73     struct ucred *cred, int flags, struct buf **bpp)
74 {
75 	struct inode *ip;
76 	struct ufs1_dinode *dp;
77 	ufs_lbn_t lbn, lastlbn;
78 	struct fs *fs;
79 	ufs1_daddr_t nb;
80 	struct buf *bp, *nbp;
81 	struct indir indirs[NIADDR + 2];
82 	int deallocated, osize, nsize, num, i, error;
83 	ufs2_daddr_t newb;
84 	ufs1_daddr_t *bap, pref;
	/*
	 * allociblk[] records every block allocated in this call so the
	 * fail path can free them; allocib points at the inode slot to
	 * clear if the first indirect block was allocated here.
	 */
85 	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
86 	int unwindidx = -1;
87 	struct thread *td = curthread;	/* XXX */
88 
89 	ip = VTOI(vp);
90 	dp = ip->i_din1;
91 	fs = ip->i_fs;
92 	lbn = lblkno(fs, startoffset);
	/*
	 * From here on "size" is the in-block offset plus the request,
	 * i.e. how many bytes of this logical block must be valid.
	 */
93 	size = blkoff(fs, startoffset) + size;
94 	if (size > fs->fs_bsize)
95 		panic("ffs_balloc_ufs1: blk too big");
96 	*bpp = NULL;
	/* UFS1 inodes have no external attribute area. */
97 	if (flags & IO_EXT)
98 		return (EOPNOTSUPP);
99 	if (lbn < 0)
100 		return (EFBIG);
101 
102 	/*
103 	 * If the next write will extend the file into a new block,
104 	 * and the file is currently composed of a fragment
105 	 * this fragment has to be extended to be a full block.
106 	 */
107 	lastlbn = lblkno(fs, ip->i_size);
108 	if (lastlbn < NDADDR && lastlbn < lbn) {
109 		nb = lastlbn;
110 		osize = blksize(fs, ip, nb);
111 		if (osize < fs->fs_bsize && osize > 0) {
112 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
113 			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
114 			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
115 			if (error)
116 				return (error);
117 			if (DOINGSOFTDEP(vp))
118 				softdep_setup_allocdirect(ip, nb,
119 				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
120 				    fs->fs_bsize, osize, bp);
121 			ip->i_size = smalllblktosize(fs, nb + 1);
122 			dp->di_size = ip->i_size;
123 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
124 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
125 			if (flags & IO_SYNC)
126 				bwrite(bp);
127 			else
128 				bawrite(bp);
129 		}
130 	}
131 	/*
132 	 * The first NDADDR blocks are direct blocks
133 	 */
134 	if (lbn < NDADDR) {
135 		if (flags & BA_METAONLY)
136 			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
137 		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and fully covered by the file
		 * size: just read it in and hand it back.
		 */
138 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
139 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
140 			if (error) {
141 				brelse(bp);
142 				return (error);
143 			}
144 			bp->b_blkno = fsbtodb(fs, nb);
145 			*bpp = bp;
146 			return (0);
147 		}
148 		if (nb != 0) {
149 			/*
150 			 * Consider need to reallocate a fragment.
151 			 */
152 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
153 			nsize = fragroundup(fs, size);
154 			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
155 				error = bread(vp, lbn, osize, NOCRED, &bp);
156 				if (error) {
157 					brelse(bp);
158 					return (error);
159 				}
160 				bp->b_blkno = fsbtodb(fs, nb);
161 			} else {
				/* Grow the fragment, possibly relocating it. */
162 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
163 				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
164 				    &dp->di_db[0]), osize, nsize, cred, &bp);
165 				if (error)
166 					return (error);
167 				if (DOINGSOFTDEP(vp))
168 					softdep_setup_allocdirect(ip, lbn,
169 					    dbtofsb(fs, bp->b_blkno), nb,
170 					    nsize, osize, bp);
171 			}
172 		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last (partial) block of the file, else a full block.
			 */
173 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
174 				nsize = fragroundup(fs, size);
175 			else
176 				nsize = fs->fs_bsize;
177 			error = ffs_alloc(ip, lbn,
178 			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
179 			    nsize, cred, &newb);
180 			if (error)
181 				return (error);
182 			bp = getblk(vp, lbn, nsize, 0, 0);
183 			bp->b_blkno = fsbtodb(fs, newb);
184 			if (flags & BA_CLRBUF)
185 				vfs_bio_clrbuf(bp);
186 			if (DOINGSOFTDEP(vp))
187 				softdep_setup_allocdirect(ip, lbn, newb, 0,
188 				    nsize, 0, bp);
189 		}
190 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
191 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
192 		*bpp = bp;
193 		return (0);
194 	}
195 	/*
196 	 * Determine the number of levels of indirection.
197 	 */
198 	pref = 0;
199 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
200 		return(error);
201 #ifdef DIAGNOSTIC
	/* Direct blocks were handled above, so at least one level must remain. */
202 	if (num < 1)
203 		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
204 #endif
205 	/*
206 	 * Fetch the first indirect block allocating if necessary.
207 	 */
	/* After this decrement, indirs[num] names the data block's slot
	 * in the last indirect block (the loop below breaks at i == num). */
208 	--num;
209 	nb = dp->di_ib[indirs[0].in_off];
210 	allocib = NULL;
211 	allocblk = allociblk;
212 	if (nb == 0) {
213 		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
214 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
215 		    cred, &newb)) != 0)
216 			return (error);
217 		nb = newb;
218 		*allocblk++ = nb;
219 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
220 		bp->b_blkno = fsbtodb(fs, nb);
221 		vfs_bio_clrbuf(bp);
222 		if (DOINGSOFTDEP(vp)) {
223 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
224 			    newb, 0, fs->fs_bsize, 0, bp);
225 			bdwrite(bp);
226 		} else {
227 			/*
228 			 * Write synchronously so that indirect blocks
229 			 * never point at garbage.
230 			 */
231 			if (DOINGASYNC(vp))
232 				bdwrite(bp);
233 			else if ((error = bwrite(bp)) != 0)
234 				goto fail;
235 		}
236 		allocib = &dp->di_ib[indirs[0].in_off];
237 		*allocib = nb;
238 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
239 	}
240 	/*
241 	 * Fetch through the indirect blocks, allocating as necessary.
242 	 */
243 	for (i = 1;;) {
244 		error = bread(vp,
245 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
246 		if (error) {
247 			brelse(bp);
248 			goto fail;
249 		}
250 		bap = (ufs1_daddr_t *)bp->b_data;
251 		nb = bap[indirs[i].in_off];
		/* Reached the last indirect level; bp/nb now describe the
		 * data block's slot. */
252 		if (i == num)
253 			break;
254 		i += 1;
255 		if (nb != 0) {
			/* Next level already allocated; drop bp and descend. */
256 			bqrelse(bp);
257 			continue;
258 		}
259 		if (pref == 0)
260 			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
261 		if ((error =
262 		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
263 			brelse(bp);
264 			goto fail;
265 		}
266 		nb = newb;
267 		*allocblk++ = nb;
268 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
269 		nbp->b_blkno = fsbtodb(fs, nb);
270 		vfs_bio_clrbuf(nbp);
271 		if (DOINGSOFTDEP(vp)) {
272 			softdep_setup_allocindir_meta(nbp, ip, bp,
273 			    indirs[i - 1].in_off, nb);
274 			bdwrite(nbp);
275 		} else {
276 			/*
277 			 * Write synchronously so that indirect blocks
278 			 * never point at garbage.
279 			 */
280 			if ((error = bwrite(nbp)) != 0) {
281 				brelse(bp);
282 				goto fail;
283 			}
284 		}
		/* Only now, with the child safely on disk (or tracked by
		 * softdep), link it into the parent indirect block. */
285 		bap[indirs[i - 1].in_off] = nb;
286 		if (allocib == NULL && unwindidx < 0)
287 			unwindidx = i - 1;
288 		/*
289 		 * If required, write synchronously, otherwise use
290 		 * delayed write.
291 		 */
292 		if (flags & IO_SYNC) {
293 			bwrite(bp);
294 		} else {
295 			if (bp->b_bufsize == fs->fs_bsize)
296 				bp->b_flags |= B_CLUSTEROK;
297 			bdwrite(bp);
298 		}
299 	}
300 	/*
301 	 * If asked only for the indirect block, then return it.
302 	 */
303 	if (flags & BA_METAONLY) {
304 		*bpp = bp;
305 		return (0);
306 	}
307 	/*
308 	 * Get the data block, allocating if necessary.
309 	 */
310 	if (nb == 0) {
311 		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
312 		error = ffs_alloc(ip,
313 		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
314 		if (error) {
315 			brelse(bp);
316 			goto fail;
317 		}
318 		nb = newb;
319 		*allocblk++ = nb;
320 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
321 		nbp->b_blkno = fsbtodb(fs, nb);
322 		if (flags & BA_CLRBUF)
323 			vfs_bio_clrbuf(nbp);
324 		if (DOINGSOFTDEP(vp))
325 			softdep_setup_allocindir_page(ip, lbn, bp,
326 			    indirs[i].in_off, nb, 0, nbp);
327 		bap[indirs[i].in_off] = nb;
328 		/*
329 		 * If required, write synchronously, otherwise use
330 		 * delayed write.
331 		 */
332 		if (flags & IO_SYNC) {
333 			bwrite(bp);
334 		} else {
335 			if (bp->b_bufsize == fs->fs_bsize)
336 				bp->b_flags |= B_CLUSTEROK;
337 			bdwrite(bp);
338 		}
339 		*bpp = nbp;
340 		return (0);
341 	}
342 	brelse(bp);
	/*
	 * Data block already exists.  With BA_CLRBUF do a read (with
	 * clustered read-ahead when the access looks sequential); else
	 * just get an empty buffer mapped to the block.
	 */
343 	if (flags & BA_CLRBUF) {
344 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
345 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
346 			error = cluster_read(vp, ip->i_size, lbn,
347 			    (int)fs->fs_bsize, NOCRED,
348 			    MAXBSIZE, seqcount, &nbp);
349 		} else {
350 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
351 		}
352 		if (error) {
353 			brelse(nbp);
354 			goto fail;
355 		}
356 	} else {
357 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
358 		nbp->b_blkno = fsbtodb(fs, nb);
359 	}
360 	*bpp = nbp;
361 	return (0);
362 fail:
363 	/*
364 	 * If we have failed to allocate any blocks, simply return the error.
365 	 * This is the usual case and avoids the need to fsync the file.
366 	 */
367 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
368 		return (error);
369 	/*
370 	 * If we have failed part way through block allocation, we
371 	 * have to deallocate any indirect blocks that we have allocated.
372 	 * We have to fsync the file before we start to get rid of all
373 	 * of its dependencies so that we do not leave them dangling.
374 	 * We have to sync it at the end so that the soft updates code
375 	 * does not find any untracked changes. Although this is really
376 	 * slow, running out of disk space is not expected to be a common
377 	 * occurrence. The error return from fsync is ignored as we already
378 	 * have an error to return to the user.
379 	 */
380 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
381 	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
382 		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
383 		deallocated += fs->fs_bsize;
384 	}
385 	if (allocib != NULL) {
386 		*allocib = 0;
387 	} else if (unwindidx >= 0) {
388 		int r;
389 
390 		r = bread(vp, indirs[unwindidx].in_lbn,
391 		    (int)fs->fs_bsize, NOCRED, &bp);
392 		if (r) {
393 			panic("Could not unwind indirect block, error %d", r);
			/* NOTE(review): brelse() below is unreachable after
			 * panic(); kept for symmetry with the non-panic path. */
394 			brelse(bp);
395 		} else {
396 			bap = (ufs1_daddr_t *)bp->b_data;
397 			bap[indirs[unwindidx].in_off] = 0;
398 			if (flags & IO_SYNC) {
399 				bwrite(bp);
400 			} else {
401 				if (bp->b_bufsize == fs->fs_bsize)
402 					bp->b_flags |= B_CLUSTEROK;
403 				bdwrite(bp);
404 			}
405 		}
406 	}
407 	if (deallocated) {
408 #ifdef QUOTA
409 		/*
410 		 * Restore user's disk quota because allocation failed.
411 		 */
412 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
413 #endif
414 		dp->di_blocks -= btodb(deallocated);
415 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
416 	}
417 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
418 	return (error);
419 }
420 
421 /*
422  * Balloc defines the structure of file system storage
423  * by allocating the physical blocks on a device given
424  * the inode and the logical block number in a file.
425  * This is the allocation strategy for UFS2. Above is
426  * the allocation strategy for UFS1.
 *
 * Same contract as ffs_balloc_ufs1(), with one addition: when IO_EXT
 * is set the allocation is done in the inode's external attribute
 * area (di_extb[]) rather than in the file data.  External attribute
 * blocks are addressed at negative logical block numbers (-1 - lbn)
 * to keep them distinct from file data in the buffer cache, and their
 * buffers are marked BX_ALTDATA.
427  */
428 int
429 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
430     struct ucred *cred, int flags, struct buf **bpp)
431 {
432 	struct inode *ip;
433 	struct ufs2_dinode *dp;
434 	ufs_lbn_t lbn, lastlbn;
435 	struct fs *fs;
436 	struct buf *bp, *nbp;
437 	struct indir indirs[NIADDR + 2];
438 	ufs2_daddr_t nb, newb, *bap, pref;
	/*
	 * allociblk[] records every block allocated in this call so the
	 * fail path can free them; allocib points at the inode slot to
	 * clear if the first indirect block was allocated here.
	 */
439 	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
440 	int deallocated, osize, nsize, num, i, error;
441 	int unwindidx = -1;
442 	struct thread *td = curthread;	/* XXX */
443 
444 	ip = VTOI(vp);
445 	dp = ip->i_din2;
446 	fs = ip->i_fs;
447 	lbn = lblkno(fs, startoffset);
	/*
	 * From here on "size" is the in-block offset plus the request,
	 * i.e. how many bytes of this logical block must be valid.
	 */
448 	size = blkoff(fs, startoffset) + size;
449 	if (size > fs->fs_bsize)
450 		panic("ffs_balloc_ufs2: blk too big");
451 	*bpp = NULL;
452 	if (lbn < 0)
453 		return (EFBIG);
454 
455 	/*
456 	 * Check for allocating external data.
457 	 */
458 	if (flags & IO_EXT) {
459 		if (lbn >= NXADDR)
460 			return (EFBIG);
461 		/*
462 		 * If the next write will extend the data into a new block,
463 		 * and the data is currently composed of a fragment
464 		 * this fragment has to be extended to be a full block.
465 		 */
466 		lastlbn = lblkno(fs, dp->di_extsize);
467 		if (lastlbn < lbn) {
468 			nb = lastlbn;
469 			osize = sblksize(fs, dp->di_extsize, nb);
470 			if (osize < fs->fs_bsize && osize > 0) {
471 				error = ffs_realloccg(ip, -1 - nb,
472 				    dp->di_extb[nb],
473 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
474 				    &dp->di_extb[0]), osize,
475 				    (int)fs->fs_bsize, cred, &bp);
476 				if (error)
477 					return (error);
478 				if (DOINGSOFTDEP(vp))
479 					softdep_setup_allocext(ip, nb,
480 					    dbtofsb(fs, bp->b_blkno),
481 					    dp->di_extb[nb],
482 					    fs->fs_bsize, osize, bp);
483 				dp->di_extsize = smalllblktosize(fs, nb + 1);
484 				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
485 				bp->b_xflags |= BX_ALTDATA;
486 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
487 				if (flags & IO_SYNC)
488 					bwrite(bp);
489 				else
490 					bawrite(bp);
491 			}
492 		}
493 		/*
494 		 * All blocks are direct blocks
495 		 */
496 		if (flags & BA_METAONLY)
497 			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
498 		nb = dp->di_extb[lbn];
		/*
		 * Block already allocated and fully covered by the ext
		 * area size: just read it in and hand it back.
		 */
499 		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
500 			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
501 			if (error) {
502 				brelse(bp);
503 				return (error);
504 			}
505 			bp->b_blkno = fsbtodb(fs, nb);
506 			bp->b_xflags |= BX_ALTDATA;
507 			*bpp = bp;
508 			return (0);
509 		}
510 		if (nb != 0) {
511 			/*
512 			 * Consider need to reallocate a fragment.
513 			 */
514 			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
515 			nsize = fragroundup(fs, size);
516 			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
517 				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
518 				if (error) {
519 					brelse(bp);
520 					return (error);
521 				}
522 				bp->b_blkno = fsbtodb(fs, nb);
523 				bp->b_xflags |= BX_ALTDATA;
524 			} else {
				/* Grow the fragment, possibly relocating it. */
525 				error = ffs_realloccg(ip, -1 - lbn,
526 				    dp->di_extb[lbn],
527 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
528 				    &dp->di_extb[0]), osize, nsize, cred, &bp);
529 				if (error)
530 					return (error);
531 				bp->b_xflags |= BX_ALTDATA;
532 				if (DOINGSOFTDEP(vp))
533 					softdep_setup_allocext(ip, lbn,
534 					    dbtofsb(fs, bp->b_blkno), nb,
535 					    nsize, osize, bp);
536 			}
537 		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last (partial) block of the ext area, else a full
			 * block.
			 */
538 			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
539 				nsize = fragroundup(fs, size);
540 			else
541 				nsize = fs->fs_bsize;
542 			error = ffs_alloc(ip, lbn,
543 			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
544 			   nsize, cred, &newb);
545 			if (error)
546 				return (error);
547 			bp = getblk(vp, -1 - lbn, nsize, 0, 0);
548 			bp->b_blkno = fsbtodb(fs, newb);
549 			bp->b_xflags |= BX_ALTDATA;
550 			if (flags & BA_CLRBUF)
551 				vfs_bio_clrbuf(bp);
552 			if (DOINGSOFTDEP(vp))
553 				softdep_setup_allocext(ip, lbn, newb, 0,
554 				    nsize, 0, bp);
555 		}
556 		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
557 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
558 		*bpp = bp;
559 		return (0);
560 	}
561 	/*
562 	 * If the next write will extend the file into a new block,
563 	 * and the file is currently composed of a fragment
564 	 * this fragment has to be extended to be a full block.
565 	 */
566 	lastlbn = lblkno(fs, ip->i_size);
567 	if (lastlbn < NDADDR && lastlbn < lbn) {
568 		nb = lastlbn;
569 		osize = blksize(fs, ip, nb);
570 		if (osize < fs->fs_bsize && osize > 0) {
571 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
572 				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
573 				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
574 				    cred, &bp);
575 			if (error)
576 				return (error);
577 			if (DOINGSOFTDEP(vp))
578 				softdep_setup_allocdirect(ip, nb,
579 				    dbtofsb(fs, bp->b_blkno),
580 				    dp->di_db[nb],
581 				    fs->fs_bsize, osize, bp);
582 			ip->i_size = smalllblktosize(fs, nb + 1);
583 			dp->di_size = ip->i_size;
584 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
585 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
586 			if (flags & IO_SYNC)
587 				bwrite(bp);
588 			else
589 				bawrite(bp);
590 		}
591 	}
592 	/*
593 	 * The first NDADDR blocks are direct blocks
594 	 */
595 	if (lbn < NDADDR) {
596 		if (flags & BA_METAONLY)
597 			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
598 		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and fully covered by the file
		 * size: just read it in and hand it back.
		 */
599 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
600 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
601 			if (error) {
602 				brelse(bp);
603 				return (error);
604 			}
605 			bp->b_blkno = fsbtodb(fs, nb);
606 			*bpp = bp;
607 			return (0);
608 		}
609 		if (nb != 0) {
610 			/*
611 			 * Consider need to reallocate a fragment.
612 			 */
613 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
614 			nsize = fragroundup(fs, size);
615 			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
616 				error = bread(vp, lbn, osize, NOCRED, &bp);
617 				if (error) {
618 					brelse(bp);
619 					return (error);
620 				}
621 				bp->b_blkno = fsbtodb(fs, nb);
622 			} else {
				/* Grow the fragment, possibly relocating it. */
623 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
624 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
625 				       &dp->di_db[0]), osize, nsize, cred, &bp);
626 				if (error)
627 					return (error);
628 				if (DOINGSOFTDEP(vp))
629 					softdep_setup_allocdirect(ip, lbn,
630 					    dbtofsb(fs, bp->b_blkno), nb,
631 					    nsize, osize, bp);
632 			}
633 		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last (partial) block of the file, else a full block.
			 */
634 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
635 				nsize = fragroundup(fs, size);
636 			else
637 				nsize = fs->fs_bsize;
638 			error = ffs_alloc(ip, lbn,
639 			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
640 				&dp->di_db[0]), nsize, cred, &newb);
641 			if (error)
642 				return (error);
643 			bp = getblk(vp, lbn, nsize, 0, 0);
644 			bp->b_blkno = fsbtodb(fs, newb);
645 			if (flags & BA_CLRBUF)
646 				vfs_bio_clrbuf(bp);
647 			if (DOINGSOFTDEP(vp))
648 				softdep_setup_allocdirect(ip, lbn, newb, 0,
649 				    nsize, 0, bp);
650 		}
651 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
652 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
653 		*bpp = bp;
654 		return (0);
655 	}
656 	/*
657 	 * Determine the number of levels of indirection.
658 	 */
659 	pref = 0;
660 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
661 		return(error);
662 #ifdef DIAGNOSTIC
	/* Direct blocks were handled above, so at least one level must remain. */
663 	if (num < 1)
664 		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
665 #endif
666 	/*
667 	 * Fetch the first indirect block allocating if necessary.
668 	 */
	/* After this decrement, indirs[num] names the data block's slot
	 * in the last indirect block (the loop below breaks at i == num). */
669 	--num;
670 	nb = dp->di_ib[indirs[0].in_off];
671 	allocib = NULL;
672 	allocblk = allociblk;
673 	if (nb == 0) {
674 		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
675 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
676 		    cred, &newb)) != 0)
677 			return (error);
678 		nb = newb;
679 		*allocblk++ = nb;
680 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0);
681 		bp->b_blkno = fsbtodb(fs, nb);
682 		vfs_bio_clrbuf(bp);
683 		if (DOINGSOFTDEP(vp)) {
684 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
685 			    newb, 0, fs->fs_bsize, 0, bp);
686 			bdwrite(bp);
687 		} else {
688 			/*
689 			 * Write synchronously so that indirect blocks
690 			 * never point at garbage.
691 			 */
692 			if (DOINGASYNC(vp))
693 				bdwrite(bp);
694 			else if ((error = bwrite(bp)) != 0)
695 				goto fail;
696 		}
697 		allocib = &dp->di_ib[indirs[0].in_off];
698 		*allocib = nb;
699 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
700 	}
701 	/*
702 	 * Fetch through the indirect blocks, allocating as necessary.
703 	 */
704 	for (i = 1;;) {
705 		error = bread(vp,
706 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
707 		if (error) {
708 			brelse(bp);
709 			goto fail;
710 		}
711 		bap = (ufs2_daddr_t *)bp->b_data;
712 		nb = bap[indirs[i].in_off];
		/* Reached the last indirect level; bp/nb now describe the
		 * data block's slot. */
713 		if (i == num)
714 			break;
715 		i += 1;
716 		if (nb != 0) {
			/* Next level already allocated; drop bp and descend. */
717 			bqrelse(bp);
718 			continue;
719 		}
720 		if (pref == 0)
721 			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
722 		if ((error =
723 		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
724 			brelse(bp);
725 			goto fail;
726 		}
727 		nb = newb;
728 		*allocblk++ = nb;
729 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0);
730 		nbp->b_blkno = fsbtodb(fs, nb);
731 		vfs_bio_clrbuf(nbp);
732 		if (DOINGSOFTDEP(vp)) {
733 			softdep_setup_allocindir_meta(nbp, ip, bp,
734 			    indirs[i - 1].in_off, nb);
735 			bdwrite(nbp);
736 		} else {
737 			/*
738 			 * Write synchronously so that indirect blocks
739 			 * never point at garbage.
740 			 */
741 			if ((error = bwrite(nbp)) != 0) {
742 				brelse(bp);
743 				goto fail;
744 			}
745 		}
		/* Only now, with the child safely on disk (or tracked by
		 * softdep), link it into the parent indirect block. */
746 		bap[indirs[i - 1].in_off] = nb;
747 		if (allocib == NULL && unwindidx < 0)
748 			unwindidx = i - 1;
749 		/*
750 		 * If required, write synchronously, otherwise use
751 		 * delayed write.
752 		 */
753 		if (flags & IO_SYNC) {
754 			bwrite(bp);
755 		} else {
756 			if (bp->b_bufsize == fs->fs_bsize)
757 				bp->b_flags |= B_CLUSTEROK;
758 			bdwrite(bp);
759 		}
760 	}
761 	/*
762 	 * If asked only for the indirect block, then return it.
763 	 */
764 	if (flags & BA_METAONLY) {
765 		*bpp = bp;
766 		return (0);
767 	}
768 	/*
769 	 * Get the data block, allocating if necessary.
770 	 */
771 	if (nb == 0) {
772 		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
773 		error = ffs_alloc(ip,
774 		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
775 		if (error) {
776 			brelse(bp);
777 			goto fail;
778 		}
779 		nb = newb;
780 		*allocblk++ = nb;
781 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
782 		nbp->b_blkno = fsbtodb(fs, nb);
783 		if (flags & BA_CLRBUF)
784 			vfs_bio_clrbuf(nbp);
785 		if (DOINGSOFTDEP(vp))
786 			softdep_setup_allocindir_page(ip, lbn, bp,
787 			    indirs[i].in_off, nb, 0, nbp);
788 		bap[indirs[i].in_off] = nb;
789 		/*
790 		 * If required, write synchronously, otherwise use
791 		 * delayed write.
792 		 */
793 		if (flags & IO_SYNC) {
794 			bwrite(bp);
795 		} else {
796 			if (bp->b_bufsize == fs->fs_bsize)
797 				bp->b_flags |= B_CLUSTEROK;
798 			bdwrite(bp);
799 		}
800 		*bpp = nbp;
801 		return (0);
802 	}
803 	brelse(bp);
804 	/*
805 	 * If requested clear invalid portions of the buffer.  If we
806 	 * have to do a read-before-write (typical if BA_CLRBUF is set),
807 	 * try to do some read-ahead in the sequential case to reduce
808 	 * the number of I/O transactions.
809 	 */
810 	if (flags & BA_CLRBUF) {
811 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
812 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
813 			error = cluster_read(vp, ip->i_size, lbn,
814 			    (int)fs->fs_bsize, NOCRED,
815 			    MAXBSIZE, seqcount, &nbp);
816 		} else {
817 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
818 		}
819 		if (error) {
820 			brelse(nbp);
821 			goto fail;
822 		}
823 	} else {
824 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
825 		nbp->b_blkno = fsbtodb(fs, nb);
826 	}
827 	*bpp = nbp;
828 	return (0);
829 fail:
830 	/*
831 	 * If we have failed to allocate any blocks, simply return the error.
832 	 * This is the usual case and avoids the need to fsync the file.
833 	 */
834 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
835 		return (error);
836 	/*
837 	 * If we have failed part way through block allocation, we
838 	 * have to deallocate any indirect blocks that we have allocated.
839 	 * We have to fsync the file before we start to get rid of all
840 	 * of its dependencies so that we do not leave them dangling.
841 	 * We have to sync it at the end so that the soft updates code
842 	 * does not find any untracked changes. Although this is really
843 	 * slow, running out of disk space is not expected to be a common
844 	 * occurrence. The error return from fsync is ignored as we already
845 	 * have an error to return to the user.
846 	 */
847 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
848 	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
849 		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
850 		deallocated += fs->fs_bsize;
851 	}
852 	if (allocib != NULL) {
853 		*allocib = 0;
854 	} else if (unwindidx >= 0) {
855 		int r;
856 
857 		r = bread(vp, indirs[unwindidx].in_lbn,
858 		    (int)fs->fs_bsize, NOCRED, &bp);
859 		if (r) {
860 			panic("Could not unwind indirect block, error %d", r);
			/* NOTE(review): brelse() below is unreachable after
			 * panic(); kept for symmetry with the non-panic path. */
861 			brelse(bp);
862 		} else {
863 			bap = (ufs2_daddr_t *)bp->b_data;
864 			bap[indirs[unwindidx].in_off] = 0;
865 			if (flags & IO_SYNC) {
866 				bwrite(bp);
867 			} else {
868 				if (bp->b_bufsize == fs->fs_bsize)
869 					bp->b_flags |= B_CLUSTEROK;
870 				bdwrite(bp);
871 			}
872 		}
873 	}
874 	if (deallocated) {
875 #ifdef QUOTA
876 		/*
877 		 * Restore user's disk quota because allocation failed.
878 		 */
879 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
880 #endif
881 		dp->di_blocks -= btodb(deallocated);
882 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
883 	}
884 	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
885 	return (error);
886 }
887