xref: /freebsd/sys/ufs/ffs/ffs_balloc.c (revision 729362425c09cf6b362366aabc6fb547eee8035a)
1 /*
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Copyright (c) 1982, 1986, 1989, 1993
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
43  * $FreeBSD$
44  */
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/bio.h>
49 #include <sys/buf.h>
50 #include <sys/lock.h>
51 #include <sys/mount.h>
52 #include <sys/vnode.h>
53 
54 #include <ufs/ufs/quota.h>
55 #include <ufs/ufs/inode.h>
56 #include <ufs/ufs/ufs_extern.h>
57 
58 #include <ufs/ffs/fs.h>
59 #include <ufs/ffs/ffs_extern.h>
60 
61 /*
62  * Balloc defines the structure of filesystem storage
63  * by allocating the physical blocks on a device given
64  * the inode and the logical block number in a file.
65  * This is the allocation strategy for UFS1. Below is
66  * the allocation strategy for UFS2.
67  */
/*
 * ffs_balloc_ufs1(vp, startoffset, size, cred, flags, bpp)
 *
 * Find or allocate the filesystem block underlying the byte range
 * [startoffset, startoffset + size) of vnode 'vp' and return a locked
 * buffer for it in '*bpp'.  The in-block offset plus 'size' must not
 * exceed one filesystem block (we panic otherwise).
 *
 *	vp	- vnode of the file; must reference a UFS1 inode (i_din1)
 *	startoffset - byte offset in the file where the write begins
 *	size	- number of bytes of the pending write
 *	cred	- credentials charged for any new block allocation
 *	flags	- IO_SYNC (write metadata synchronously), IO_EXT
 *		  (rejected: UFS1 has no extended attribute area),
 *		  BA_METAONLY (return the final indirect block instead of
 *		  the data block), BA_CLRBUF (zero/read-validate the new
 *		  buffer), BA_SEQMASK/BA_SEQSHIFT (sequential hint used
 *		  for read-ahead clustering)
 *	bpp	- out: buffer for the requested block; set to NULL first
 *
 * Returns 0 on success; EOPNOTSUPP for IO_EXT, EFBIG for a negative
 * logical block, or the error from the underlying allocation or I/O
 * routines.  On a partial failure the 'fail:' path frees every block
 * recorded in allociblk[] and unhooks it from the inode/indirects.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	int unwindidx = -1;
	struct thread *td = curthread;	/* XXX */

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ip->i_fs;
	lbn = lblkno(fs, startoffset);
	/* 'size' now measures from the start of the block to the write end. */
	size = blkoff(fs, startoffset) + size;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	/* UFS1 inodes have no external (extended attribute) data area. */
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			/* Block already fully allocated: just read it in. */
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment from osize to nsize. */
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last block of the file, else a full block.
			 */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef DIAGNOSTIC
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	if (nb == 0) {
		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    cred, &newb)) != 0)
			return (error);
		nb = newb;
		/* Record every new block so the fail: path can free it. */
		*allocblk++ = nb;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
		if ((error =
		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Only now hook the new block into its parent indirect. */
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
		error = ffs_alloc(ip,
		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
		if (error) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * Block exists: do the read-before-write if BA_CLRBUF is set,
	 * with clustered read-ahead for sequential access when allowed.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, &nbp);
		} else {
			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	*bpp = nbp;
	return (0);
fail:
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 */
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		/* First indirect was new: clear its inode pointer. */
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): panic() does not return, so the
			 * brelse() below is unreachable dead code.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	return (error);
}
417 
418 /*
419  * Balloc defines the structure of file system storage
420  * by allocating the physical blocks on a device given
421  * the inode and the logical block number in a file.
422  * This is the allocation strategy for UFS2. Above is
423  * the allocation strategy for UFS1.
424  */
/*
 * ffs_balloc_ufs2(vp, startoffset, size, cred, flags, bpp)
 *
 * Find or allocate the filesystem block underlying the byte range
 * [startoffset, startoffset + size) of vnode 'vp' and return a locked
 * buffer for it in '*bpp'.  The in-block offset plus 'size' must not
 * exceed one filesystem block (we panic otherwise).
 *
 *	vp	- vnode of the file; must reference a UFS2 inode (i_din2)
 *	startoffset - byte offset where the write begins
 *	size	- number of bytes of the pending write
 *	cred	- credentials charged for any new block allocation
 *	flags	- IO_SYNC (write metadata synchronously), IO_EXT
 *		  (allocate in the external/extended attribute area,
 *		  which is direct-block only and addressed by negative
 *		  logical block numbers -1 - lbn), BA_METAONLY (return
 *		  the final indirect block), BA_CLRBUF (zero/read-validate
 *		  the new buffer), BA_SEQMASK/BA_SEQSHIFT (sequential
 *		  hint for read-ahead clustering)
 *	bpp	- out: buffer for the requested block; set to NULL first
 *
 * Returns 0 on success; EFBIG for a negative block number or an ext
 * block beyond NXADDR, or the error from the underlying allocation or
 * I/O routines.  On a partial failure the 'fail:' path frees every
 * block recorded in allociblk[] and unhooks it from the inode/indirects.
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;
	struct thread *td = curthread;	/* XXX */

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ip->i_fs;
	lbn = lblkno(fs, startoffset);
	/* 'size' now measures from the start of the block to the write end. */
	size = blkoff(fs, startoffset) + size;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				/* Ext blocks use negative lbns: -1 - lbn. */
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
				/* BX_ALTDATA marks ext-area buffers. */
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			/* Block already fully allocated: just read it in. */
			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				/* Grow the fragment from osize to nsize. */
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last ext block, else a full block.
			 */
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
				    cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			/* Block already fully allocated: just read it in. */
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is already big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment from osize to nsize. */
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				       &dp->di_db[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if this is the
			 * last block of the file, else a full block.
			 */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				&dp->di_db[0]), nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef DIAGNOSTIC
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	if (nb == 0) {
		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    cred, &newb)) != 0)
			return (error);
		nb = newb;
		/* Record every new block so the fail: path can free it. */
		*allocblk++ = nb;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
		if ((error =
		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Only now hook the new block into its parent indirect. */
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
		error = ffs_alloc(ip,
		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
		if (error) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, &nbp);
		} else {
			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	*bpp = nbp;
	return (0);
fail:
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 */
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		/* First indirect was new: clear its inode pointer. */
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			/*
			 * NOTE(review): panic() does not return, so the
			 * brelse() below is unreachable dead code.
			 */
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) VOP_FSYNC(vp, cred, MNT_WAIT, td);
	return (error);
}
884