/*-
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/*
 * Balloc defines the structure of filesystem storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS1. Below is
 * the allocation strategy for UFS2.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ip->i_fs;
	ump = ip->i_ump;
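	/*
	 * Note: a single balloc request never spans more than one
	 * filesystem block; the offset within the starting block plus
	 * the requested size must fit in fs_bsize, which the panic
	 * below asserts.
	 */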
	lbn = lblkno(fs, startoffset);
	size = blkoff(fs, startoffset) + size;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
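	/*
	 * UFS1 inodes have no extended attribute (external data) area,
	 * so requests flagged IO_EXT cannot be satisfied here.
	 */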
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment,
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	/*
	 * Fetch the first indirect block, allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
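	/*
	 * Every block allocated below is recorded in allociblk[] along
	 * with its logical block number in lbns[], so that the fail:
	 * path can invalidate the corresponding buffers and free the
	 * blocks if a later allocation in the chain fails.
	 */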
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    cred, &newb)) != 0)
			return (error);
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
	for (i = 1;;) {
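		/*
		 * Read the indirect block at each level of the chain.
		 * The loop exits with bp holding the level that contains
		 * the pointer to the data block (i == num); intermediate
		 * levels that are missing are allocated and written out
		 * before descending further.
		 */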
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
		if ((error =
		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
		error = ffs_alloc(ip,
		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
		if (error) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
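	/*
	 * If requested, clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */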
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, &nbp);
		} else {
			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	*bpp = nbp;
	return (0);
fail:
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
		if (bp != NULL) {
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
		    ip->i_number);
	}
	return (error);
}

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
 * the inode and the logical block number in a file.
 * This is the allocation strategy for UFS2. Above is
 * the allocation strategy for UFS1.
 */
int
ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t nb, newb, *bap, pref;
	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int deallocated, osize, nsize, num, i, error;
	int unwindidx = -1;

	ip = VTOI(vp);
	dp = ip->i_din2;
	fs = ip->i_fs;
	ump = ip->i_ump;
	lbn = lblkno(fs, startoffset);
	size = blkoff(fs, startoffset) + size;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs2: blk too big");
	*bpp = NULL;
	if (lbn < 0)
		return (EFBIG);

	/*
	 * Check for allocating external data.
	 */
	if (flags & IO_EXT) {
		if (lbn >= NXADDR)
			return (EFBIG);
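		/*
		 * The extended attribute area consists of only the
		 * NXADDR direct blocks in di_extb[]; there is no
		 * indirection for external data.
		 */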
		/*
		 * If the next write will extend the data into a new block,
		 * and the data is currently composed of a fragment,
		 * this fragment has to be extended to be a full block.
		 */
		lastlbn = lblkno(fs, dp->di_extsize);
		if (lastlbn < lbn) {
			nb = lastlbn;
			osize = sblksize(fs, dp->di_extsize, nb);
			if (osize < fs->fs_bsize && osize > 0) {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - nb,
				    dp->di_extb[nb],
				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_extb[0]), osize,
				    (int)fs->fs_bsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, nb,
					    dbtofsb(fs, bp->b_blkno),
					    dp->di_extb[nb],
					    fs->fs_bsize, osize, bp);
				dp->di_extsize = smalllblktosize(fs, nb + 1);
				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
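				/*
				 * BX_ALTDATA marks the buffer as holding
				 * extended attribute data rather than
				 * ordinary file data, so the buffer cache
				 * can tell the two apart.
				 */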
				bp->b_xflags |= BX_ALTDATA;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
				if (flags & IO_SYNC)
					bwrite(bp);
				else
					bawrite(bp);
			}
		}
		/*
		 * All blocks are direct blocks
		 */
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
		nb = dp->di_extb[lbn];
		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			bp->b_xflags |= BX_ALTDATA;
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
				bp->b_xflags |= BX_ALTDATA;
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, -1 - lbn,
				    dp->di_extb[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				    &dp->di_extb[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				bp->b_xflags |= BX_ALTDATA;
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocext(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
			   nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			bp->b_xflags |= BX_ALTDATA;
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocext(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment,
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
				    cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno),
				    dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				       &dp->di_db[0]), osize, nsize, cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
				&dp->di_db[0]), nsize, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, 0);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
#endif
	/*
	 * Fetch the first indirect block, allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    cred, &newb)) != 0)
			return (error);
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs2_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		if (pref == 0)
			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
		if ((error =
		    ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb)) != 0) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
		error = ffs_alloc(ip,
		    lbn, pref, (int)fs->fs_bsize, cred, &newb);
		if (error) {
			brelse(bp);
			goto fail;
		}
		nb = newb;
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * If requested, clear invalid portions of the buffer.  If we
	 * have to do a read-before-write (typical if BA_CLRBUF is set),
	 * try to do some read-ahead in the sequential case to reduce
	 * the number of I/O transactions.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, &nbp);
		} else {
			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	*bpp = nbp;
	return (0);
fail:
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
		if (bp != NULL) {
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			panic("Could not unwind indirect block, error %d", r);
			brelse(bp);
		} else {
			bap = (ufs2_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
		    ip->i_number);
	}
	return (error);
}