xref: /freebsd/sys/ufs/ffs/ffs_balloc.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*-
2  * Copyright (c) 2002 Networks Associates Technology, Inc.
3  * All rights reserved.
4  *
5  * This software was developed for the FreeBSD Project by Marshall
6  * Kirk McKusick and Network Associates Laboratories, the Security
7  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9  * research program
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  * Copyright (c) 1982, 1986, 1989, 1993
33  *	The Regents of the University of California.  All rights reserved.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  * 4. Neither the name of the University nor the names of its contributors
44  *    may be used to endorse or promote products derived from this software
45  *    without specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
60  */
61 
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64 
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/bio.h>
68 #include <sys/buf.h>
69 #include <sys/lock.h>
70 #include <sys/mount.h>
71 #include <sys/vnode.h>
72 
73 #include <ufs/ufs/quota.h>
74 #include <ufs/ufs/inode.h>
75 #include <ufs/ufs/ufs_extern.h>
76 #include <ufs/ufs/extattr.h>
77 #include <ufs/ufs/ufsmount.h>
78 
79 #include <ufs/ffs/fs.h>
80 #include <ufs/ffs/ffs_extern.h>
81 
82 /*
83  * Balloc defines the structure of filesystem storage
84  * by allocating the physical blocks on a device given
85  * the inode and the logical block number in a file.
86  * This is the allocation strategy for UFS1. Below is
87  * the allocation strategy for UFS2.
 *
 * Arguments:
 *	vp		vnode of the file; the inode comes from VTOI(vp).
 *	startoffset	byte offset of the request within the file.
 *	size		length of the request in bytes; blkoff() of
 *			startoffset plus size may not exceed fs_bsize
 *			(the function panics otherwise).
 *	cred		credential handed to ffs_alloc()/ffs_realloccg() and,
 *			under QUOTA, to chkdq() on the failure path.
 *	flags		IO_* and BA_* bits.  Tested here: IO_EXT, IO_SYNC,
 *			BA_CLRBUF, BA_METAONLY and the BA_SEQMASK field.
 *			BA_METAONLY on a direct block causes a panic.
 *	bpp		result; cleared to NULL on entry and set on success
 *			to the buffer for logical block lbn, or to the buffer
 *			of the last indirect block when BA_METAONLY is set.
 *
 * Returns 0 on success.  Fails with EOPNOTSUPP when IO_EXT is set, with
 * EFBIG when the logical block number is negative, and otherwise with
 * the error reported by the helper that failed (ufs_getlbns, ffs_alloc,
 * ffs_realloccg, bread, bwrite or cluster_read).
88  */
89 int
90 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91     struct ucred *cred, int flags, struct buf **bpp)
92 {
93 	struct inode *ip;
94 	struct ufs1_dinode *dp;
95 	ufs_lbn_t lbn, lastlbn;
96 	struct fs *fs;
97 	ufs1_daddr_t nb;
98 	struct buf *bp, *nbp;
99 	struct ufsmount *ump;
100 	struct indir indirs[NIADDR + 2];
101 	int deallocated, osize, nsize, num, i, error;
102 	ufs2_daddr_t newb;
103 	ufs1_daddr_t *bap, pref;
	/*
	 * Bookkeeping consumed by the "fail" path at the bottom:
	 *   allociblk[] / allocblk   block numbers allocated by this call
	 *   lbns[] / lbns_remfree    logical block numbers of those blocks
	 *   allocib                  inode di_ib[] slot filled in by this call
	 *   unwindidx                indirs[] index of the first pointer this
	 *                            call stored into an existing indirect block
	 */
104 	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105 	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106 	int unwindidx = -1;
107 	int saved_inbdflush;
108 
109 	ip = VTOI(vp);
110 	dp = ip->i_din1;
111 	fs = ip->i_fs;
112 	ump = ip->i_ump;
113 	lbn = lblkno(fs, startoffset);
	/*
	 * From here on "size" is blkoff() of startoffset plus the caller's
	 * length; it is what the fragment rounding below is applied to.
	 */
114 	size = blkoff(fs, startoffset) + size;
115 	if (size > fs->fs_bsize)
116 		panic("ffs_balloc_ufs1: blk too big");
117 	*bpp = NULL;
118 	if (flags & IO_EXT)
119 		return (EOPNOTSUPP);
120 	if (lbn < 0)
121 		return (EFBIG);
122 
123 	if (DOINGSOFTDEP(vp))
124 		softdep_prealloc(vp, MNT_WAIT);
125 	/*
126 	 * If the next write will extend the file into a new block,
127 	 * and the file is currently composed of a fragment
128 	 * this fragment has to be extended to be a full block.
129 	 */
130 	lastlbn = lblkno(fs, ip->i_size);
131 	if (lastlbn < NDADDR && lastlbn < lbn) {
132 		nb = lastlbn;
133 		osize = blksize(fs, ip, nb);
134 		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * NOTE(review): no UFS_UNLOCK() appears anywhere in
			 * this function; presumably ffs_realloccg() and
			 * ffs_alloc() release the lock taken here before
			 * they return -- confirm against ffs_alloc.c.
			 */
135 			UFS_LOCK(ump);
136 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
137 			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
138 			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
139 			   cred, &bp);
140 			if (error)
141 				return (error);
142 			if (DOINGSOFTDEP(vp))
143 				softdep_setup_allocdirect(ip, nb,
144 				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
145 				    fs->fs_bsize, osize, bp);
			/* The file now ends exactly at the end of block nb. */
146 			ip->i_size = smalllblktosize(fs, nb + 1);
147 			dp->di_size = ip->i_size;
148 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
149 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
150 			if (flags & IO_SYNC)
151 				bwrite(bp);
152 			else
153 				bawrite(bp);
154 		}
155 	}
156 	/*
157 	 * The first NDADDR blocks are direct blocks
158 	 */
159 	if (lbn < NDADDR) {
160 		if (flags & BA_METAONLY)
161 			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
162 		nb = dp->di_db[lbn];
		/*
		 * The block is already allocated and the file size reaches
		 * at least to the end of it: read it and return.
		 */
163 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
164 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
165 			if (error) {
166 				brelse(bp);
167 				return (error);
168 			}
169 			bp->b_blkno = fsbtodb(fs, nb);
170 			*bpp = bp;
171 			return (0);
172 		}
173 		if (nb != 0) {
174 			/*
175 			 * Consider need to reallocate a fragment.
176 			 */
177 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
178 			nsize = fragroundup(fs, size);
179 			if (nsize <= osize) {
				/* Existing allocation is large enough. */
180 				error = bread(vp, lbn, osize, NOCRED, &bp);
181 				if (error) {
182 					brelse(bp);
183 					return (error);
184 				}
185 				bp->b_blkno = fsbtodb(fs, nb);
186 			} else {
				/* Grow the allocation from osize to nsize. */
187 				UFS_LOCK(ump);
188 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
189 				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
190 				    &dp->di_db[0]), osize, nsize, flags,
191 				    cred, &bp);
192 				if (error)
193 					return (error);
194 				if (DOINGSOFTDEP(vp))
195 					softdep_setup_allocdirect(ip, lbn,
196 					    dbtofsb(fs, bp->b_blkno), nb,
197 					    nsize, osize, bp);
198 			}
199 		} else {
			/*
			 * Nothing allocated yet.  Request a fragment-rounded
			 * size when the file size falls short of the end of
			 * this block, otherwise a full block.
			 */
200 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
201 				nsize = fragroundup(fs, size);
202 			else
203 				nsize = fs->fs_bsize;
204 			UFS_LOCK(ump);
205 			error = ffs_alloc(ip, lbn,
206 			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
207 			    nsize, flags, cred, &newb);
208 			if (error)
209 				return (error);
210 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
211 			bp->b_blkno = fsbtodb(fs, newb);
212 			if (flags & BA_CLRBUF)
213 				vfs_bio_clrbuf(bp);
214 			if (DOINGSOFTDEP(vp))
215 				softdep_setup_allocdirect(ip, lbn, newb, 0,
216 				    nsize, 0, bp);
217 		}
218 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
219 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
220 		*bpp = bp;
221 		return (0);
222 	}
223 	/*
224 	 * Determine the number of levels of indirection.
225 	 */
226 	pref = 0;
227 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
228 		return(error);
229 #ifdef INVARIANTS
230 	if (num < 1)
231 		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
232 #endif
	/*
	 * Build a mask with every bit set except TDP_INBDFLUSH, which keeps
	 * the thread's current value.  ANDing td_pflags with this mask on
	 * every exit below restores that one flag to its previous state and
	 * leaves all other flags untouched.
	 */
233 	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
234 	    TDP_INBDFLUSH);
235 	curthread->td_pflags |= TDP_INBDFLUSH;
236 	/*
237 	 * Fetch the first indirect block allocating if necessary.
238 	 */
	/* After the decrement the walk below stops when i == num. */
239 	--num;
240 	nb = dp->di_ib[indirs[0].in_off];
241 	allocib = NULL;
242 	allocblk = allociblk;
243 	lbns_remfree = lbns;
244 	if (nb == 0) {
245 		UFS_LOCK(ump);
246 		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
247 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
248 		    flags, cred, &newb)) != 0) {
			/* Nothing has been allocated yet; no unwind needed. */
249 			curthread->td_pflags &= saved_inbdflush;
250 			return (error);
251 		}
252 		nb = newb;
		/* Record the block and its logical number for "fail". */
253 		*allocblk++ = nb;
254 		*lbns_remfree++ = indirs[1].in_lbn;
255 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
256 		bp->b_blkno = fsbtodb(fs, nb);
257 		vfs_bio_clrbuf(bp);
258 		if (DOINGSOFTDEP(vp)) {
259 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
260 			    newb, 0, fs->fs_bsize, 0, bp);
261 			bdwrite(bp);
262 		} else {
263 			/*
264 			 * Write synchronously so that indirect blocks
265 			 * never point at garbage.
266 			 */
267 			if (DOINGASYNC(vp))
268 				bdwrite(bp);
269 			else if ((error = bwrite(bp)) != 0)
270 				goto fail;
271 		}
272 		allocib = &dp->di_ib[indirs[0].in_off];
273 		*allocib = nb;
274 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
275 	}
276 	/*
277 	 * Fetch through the indirect blocks, allocating as necessary.
278 	 */
279 	for (i = 1;;) {
280 		error = bread(vp,
281 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
282 		if (error) {
283 			brelse(bp);
284 			goto fail;
285 		}
286 		bap = (ufs1_daddr_t *)bp->b_data;
287 		nb = bap[indirs[i].in_off];
288 		if (i == num)
289 			break;
		/*
		 * After the increment, indirs[i - 1] refers to the block
		 * held in bp and indirs[i] to the next-level block it
		 * points to.
		 */
290 		i += 1;
291 		if (nb != 0) {
292 			bqrelse(bp);
293 			continue;
294 		}
295 		UFS_LOCK(ump);
296 		if (pref == 0)
297 			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
298 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
299 		    flags, cred, &newb)) != 0) {
300 			brelse(bp);
301 			goto fail;
302 		}
303 		nb = newb;
304 		*allocblk++ = nb;
305 		*lbns_remfree++ = indirs[i].in_lbn;
306 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
307 		nbp->b_blkno = fsbtodb(fs, nb);
308 		vfs_bio_clrbuf(nbp);
309 		if (DOINGSOFTDEP(vp)) {
310 			softdep_setup_allocindir_meta(nbp, ip, bp,
311 			    indirs[i - 1].in_off, nb);
312 			bdwrite(nbp);
313 		} else {
314 			/*
315 			 * Write synchronously so that indirect blocks
316 			 * never point at garbage.
317 			 */
318 			if ((error = bwrite(nbp)) != 0) {
319 				brelse(bp);
320 				goto fail;
321 			}
322 		}
323 		bap[indirs[i - 1].in_off] = nb;
		/*
		 * Remember only the first pointer stored into an indirect
		 * block that existed before this call; it is the one the
		 * fail path zeroes when allocib is not set.
		 */
324 		if (allocib == NULL && unwindidx < 0)
325 			unwindidx = i - 1;
326 		/*
327 		 * If required, write synchronously, otherwise use
328 		 * delayed write.
329 		 */
330 		if (flags & IO_SYNC) {
331 			bwrite(bp);
332 		} else {
333 			if (bp->b_bufsize == fs->fs_bsize)
334 				bp->b_flags |= B_CLUSTEROK;
335 			bdwrite(bp);
336 		}
337 	}
338 	/*
339 	 * If asked only for the indirect block, then return it.
340 	 */
341 	if (flags & BA_METAONLY) {
342 		curthread->td_pflags &= saved_inbdflush;
343 		*bpp = bp;
344 		return (0);
345 	}
346 	/*
347 	 * Get the data block, allocating if necessary.
348 	 */
349 	if (nb == 0) {
350 		UFS_LOCK(ump);
351 		pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, &bap[0]);
352 		error = ffs_alloc(ip,
353 		    lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
354 		if (error) {
355 			brelse(bp);
356 			goto fail;
357 		}
358 		nb = newb;
359 		*allocblk++ = nb;
360 		*lbns_remfree++ = lbn;
361 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
362 		nbp->b_blkno = fsbtodb(fs, nb);
363 		if (flags & BA_CLRBUF)
364 			vfs_bio_clrbuf(nbp);
365 		if (DOINGSOFTDEP(vp))
366 			softdep_setup_allocindir_page(ip, lbn, bp,
367 			    indirs[i].in_off, nb, 0, nbp);
368 		bap[indirs[i].in_off] = nb;
369 		/*
370 		 * If required, write synchronously, otherwise use
371 		 * delayed write.
372 		 */
373 		if (flags & IO_SYNC) {
374 			bwrite(bp);
375 		} else {
376 			if (bp->b_bufsize == fs->fs_bsize)
377 				bp->b_flags |= B_CLUSTEROK;
378 			bdwrite(bp);
379 		}
380 		curthread->td_pflags &= saved_inbdflush;
381 		*bpp = nbp;
382 		return (0);
383 	}
	/*
	 * The data block already exists.  Release the indirect block and
	 * either read the data block (BA_CLRBUF, using cluster_read() when
	 * a sequential count was passed and MNT_NOCLUSTERR is not set) or
	 * just obtain a buffer for it without reading.
	 */
384 	brelse(bp);
385 	if (flags & BA_CLRBUF) {
386 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
387 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
388 			error = cluster_read(vp, ip->i_size, lbn,
389 			    (int)fs->fs_bsize, NOCRED,
390 			    MAXBSIZE, seqcount, &nbp);
391 		} else {
392 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
393 		}
394 		if (error) {
395 			brelse(nbp);
396 			goto fail;
397 		}
398 	} else {
399 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
400 		nbp->b_blkno = fsbtodb(fs, nb);
401 	}
402 	curthread->td_pflags &= saved_inbdflush;
403 	*bpp = nbp;
404 	return (0);
405 fail:
406 	curthread->td_pflags &= saved_inbdflush;
407 	/*
408 	 * If we have failed to allocate any blocks, simply return the error.
409 	 * This is the usual case and avoids the need to fsync the file.
410 	 */
411 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
412 		return (error);
413 	/*
414 	 * If we have failed part way through block allocation, we
415 	 * have to deallocate any indirect blocks that we have allocated.
416 	 * We have to fsync the file before we start to get rid of all
417 	 * of its dependencies so that we do not leave them dangling.
418 	 * We have to sync it at the end so that the soft updates code
419 	 * does not find any untracked changes. Although this is really
420 	 * slow, running out of disk space is not expected to be a common
421 	 * occurrence. The error return from fsync is ignored as we already
422 	 * have an error to return to the user.
423 	 *
424 	 * XXX Still have to journal the free below
425 	 */
426 	(void) ffs_syncvnode(vp, MNT_WAIT);
	/*
	 * Walk the recorded allocations in parallel with their logical
	 * block numbers; "deallocated" accumulates fs_bsize bytes per block.
	 */
427 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
428 	     blkp < allocblk; blkp++, lbns_remfree++) {
429 		/*
430 		 * We shall not leave the freed blocks on the vnode
431 		 * buffer object lists.
432 		 */
433 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
434 		if (bp != NULL) {
435 			bp->b_flags |= (B_INVAL | B_RELBUF);
436 			bp->b_flags &= ~B_ASYNC;
437 			brelse(bp);
438 		}
439 		deallocated += fs->fs_bsize;
440 	}
441 	if (allocib != NULL) {
		/* This call filled the inode's di_ib[] slot: clear it. */
442 		*allocib = 0;
443 	} else if (unwindidx >= 0) {
444 		int r;
445 
		/*
		 * Otherwise clear the first pointer this call stored into
		 * an already existing indirect block.
		 */
446 		r = bread(vp, indirs[unwindidx].in_lbn,
447 		    (int)fs->fs_bsize, NOCRED, &bp);
448 		if (r) {
449 			panic("Could not unwind indirect block, error %d", r);
			/* NOTE(review): follows panic(); presumably unreachable. */
450 			brelse(bp);
451 		} else {
452 			bap = (ufs1_daddr_t *)bp->b_data;
453 			bap[indirs[unwindidx].in_off] = 0;
454 			if (flags & IO_SYNC) {
455 				bwrite(bp);
456 			} else {
457 				if (bp->b_bufsize == fs->fs_bsize)
458 					bp->b_flags |= B_CLUSTEROK;
459 				bdwrite(bp);
460 			}
461 		}
462 	}
463 	if (deallocated) {
464 #ifdef QUOTA
465 		/*
466 		 * Restore user's disk quota because allocation failed.
467 		 */
468 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
469 #endif
470 		dp->di_blocks -= btodb(deallocated);
471 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
472 	}
473 	(void) ffs_syncvnode(vp, MNT_WAIT);
474 	/*
475 	 * After the buffers are invalidated and on-disk pointers are
476 	 * cleared, free the blocks.
477 	 */
478 	for (blkp = allociblk; blkp < allocblk; blkp++) {
479 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
480 		    ip->i_number, NULL);
481 	}
482 	return (error);
483 }
484 
485 /*
486  * Balloc defines the structure of file system storage
487  * by allocating the physical blocks on a device given
488  * the inode and the logical block number in a file.
489  * This is the allocation strategy for UFS2. Above is
490  * the allocation strategy for UFS1.
 *
 * Arguments:
 *	vp		vnode of the file; the inode comes from VTOI(vp).
 *	startoffset	byte offset of the request within the file, or
 *			within the external data area when IO_EXT is set.
 *	size		length of the request in bytes; blkoff() of
 *			startoffset plus size may not exceed fs_bsize
 *			(the function panics otherwise).
 *	cred		credential handed to ffs_alloc()/ffs_realloccg() and,
 *			under QUOTA, to chkdq() on the failure path.
 *	flags		IO_* and BA_* bits.  Tested here: IO_EXT, IO_SYNC,
 *			BA_CLRBUF, BA_METAONLY and the BA_SEQMASK field.
 *			BA_METAONLY on a direct or external block causes
 *			a panic.
 *	bpp		result; cleared to NULL on entry and set on success
 *			to the buffer for the requested block, or to the
 *			buffer of the last indirect block when BA_METAONLY
 *			is set.
 *
 * Returns 0 on success.  Fails with EFBIG when the logical block number
 * is negative or, for IO_EXT, not below NXADDR, and otherwise with the
 * error reported by the helper that failed (ufs_getlbns, ffs_alloc,
 * ffs_realloccg, bread, bwrite or cluster_read).
491  */
492 int
493 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
494     struct ucred *cred, int flags, struct buf **bpp)
495 {
496 	struct inode *ip;
497 	struct ufs2_dinode *dp;
498 	ufs_lbn_t lbn, lastlbn;
499 	struct fs *fs;
500 	struct buf *bp, *nbp;
501 	struct ufsmount *ump;
502 	struct indir indirs[NIADDR + 2];
503 	ufs2_daddr_t nb, newb, *bap, pref;
	/*
	 * Bookkeeping consumed by the "fail" path at the bottom:
	 *   allociblk[] / allocblk   block numbers allocated by this call
	 *   lbns[] / lbns_remfree    logical block numbers of those blocks
	 *   allocib                  inode di_ib[] slot filled in by this call
	 *   unwindidx                indirs[] index of the first pointer this
	 *                            call stored into an existing indirect block
	 */
504 	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
505 	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
506 	int deallocated, osize, nsize, num, i, error;
507 	int unwindidx = -1;
508 	int saved_inbdflush;
509 
510 	ip = VTOI(vp);
511 	dp = ip->i_din2;
512 	fs = ip->i_fs;
513 	ump = ip->i_ump;
514 	lbn = lblkno(fs, startoffset);
	/*
	 * From here on "size" is blkoff() of startoffset plus the caller's
	 * length; it is what the fragment rounding below is applied to.
	 */
515 	size = blkoff(fs, startoffset) + size;
516 	if (size > fs->fs_bsize)
517 		panic("ffs_balloc_ufs2: blk too big");
518 	*bpp = NULL;
519 	if (lbn < 0)
520 		return (EFBIG);
521 
522 	if (DOINGSOFTDEP(vp))
523 		softdep_prealloc(vp, MNT_WAIT);
524 
525 	/*
526 	 * Check for allocating external data.
	 *
	 * This branch works on dp->di_extb[] and dp->di_extsize instead
	 * of di_db[] and i_size.  Its buffers are addressed with the
	 * negative logical block number (-1 - lbn) and are tagged with
	 * BX_ALTDATA.  Every path through the branch returns.
527 	 */
528 	if (flags & IO_EXT) {
529 		if (lbn >= NXADDR)
530 			return (EFBIG);
531 		/*
532 		 * If the next write will extend the data into a new block,
533 		 * and the data is currently composed of a fragment
534 		 * this fragment has to be extended to be a full block.
535 		 */
536 		lastlbn = lblkno(fs, dp->di_extsize);
537 		if (lastlbn < lbn) {
538 			nb = lastlbn;
539 			osize = sblksize(fs, dp->di_extsize, nb);
540 			if (osize < fs->fs_bsize && osize > 0) {
				/*
				 * NOTE(review): no UFS_UNLOCK() appears
				 * anywhere in this function; presumably
				 * ffs_realloccg() and ffs_alloc() release
				 * the lock taken here before they return --
				 * confirm against ffs_alloc.c.
				 */
541 				UFS_LOCK(ump);
542 				error = ffs_realloccg(ip, -1 - nb,
543 				    dp->di_extb[nb],
544 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
545 				    &dp->di_extb[0]), osize,
546 				    (int)fs->fs_bsize, flags, cred, &bp);
547 				if (error)
548 					return (error);
549 				if (DOINGSOFTDEP(vp))
550 					softdep_setup_allocext(ip, nb,
551 					    dbtofsb(fs, bp->b_blkno),
552 					    dp->di_extb[nb],
553 					    fs->fs_bsize, osize, bp);
554 				dp->di_extsize = smalllblktosize(fs, nb + 1);
555 				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
556 				bp->b_xflags |= BX_ALTDATA;
557 				ip->i_flag |= IN_CHANGE;
558 				if (flags & IO_SYNC)
559 					bwrite(bp);
560 				else
561 					bawrite(bp);
562 			}
563 		}
564 		/*
565 		 * All blocks are direct blocks
566 		 */
567 		if (flags & BA_METAONLY)
568 			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
569 		nb = dp->di_extb[lbn];
		/*
		 * The block is already allocated and di_extsize reaches at
		 * least to the end of it: read it and return.
		 */
570 		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
571 			error = bread(vp, -1 - lbn, fs->fs_bsize, NOCRED, &bp);
572 			if (error) {
573 				brelse(bp);
574 				return (error);
575 			}
576 			bp->b_blkno = fsbtodb(fs, nb);
577 			bp->b_xflags |= BX_ALTDATA;
578 			*bpp = bp;
579 			return (0);
580 		}
581 		if (nb != 0) {
582 			/*
583 			 * Consider need to reallocate a fragment.
584 			 */
585 			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
586 			nsize = fragroundup(fs, size);
587 			if (nsize <= osize) {
				/* Existing allocation is large enough. */
588 				error = bread(vp, -1 - lbn, osize, NOCRED, &bp);
589 				if (error) {
590 					brelse(bp);
591 					return (error);
592 				}
593 				bp->b_blkno = fsbtodb(fs, nb);
594 				bp->b_xflags |= BX_ALTDATA;
595 			} else {
				/* Grow the allocation from osize to nsize. */
596 				UFS_LOCK(ump);
597 				error = ffs_realloccg(ip, -1 - lbn,
598 				    dp->di_extb[lbn],
599 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
600 				    &dp->di_extb[0]), osize, nsize, flags,
601 				    cred, &bp);
602 				if (error)
603 					return (error);
604 				bp->b_xflags |= BX_ALTDATA;
605 				if (DOINGSOFTDEP(vp))
606 					softdep_setup_allocext(ip, lbn,
607 					    dbtofsb(fs, bp->b_blkno), nb,
608 					    nsize, osize, bp);
609 			}
610 		} else {
			/*
			 * Nothing allocated yet.  Request a fragment-rounded
			 * size when di_extsize falls short of the end of
			 * this block, otherwise a full block.
			 */
611 			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
612 				nsize = fragroundup(fs, size);
613 			else
614 				nsize = fs->fs_bsize;
615 			UFS_LOCK(ump);
616 			error = ffs_alloc(ip, lbn,
617 			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
618 			   nsize, flags, cred, &newb);
619 			if (error)
620 				return (error);
621 			bp = getblk(vp, -1 - lbn, nsize, 0, 0, 0);
622 			bp->b_blkno = fsbtodb(fs, newb);
623 			bp->b_xflags |= BX_ALTDATA;
624 			if (flags & BA_CLRBUF)
625 				vfs_bio_clrbuf(bp);
626 			if (DOINGSOFTDEP(vp))
627 				softdep_setup_allocext(ip, lbn, newb, 0,
628 				    nsize, 0, bp);
629 		}
630 		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
631 		ip->i_flag |= IN_CHANGE;
632 		*bpp = bp;
633 		return (0);
634 	}
635 	/*
636 	 * If the next write will extend the file into a new block,
637 	 * and the file is currently composed of a fragment
638 	 * this fragment has to be extended to be a full block.
639 	 */
640 	lastlbn = lblkno(fs, ip->i_size);
641 	if (lastlbn < NDADDR && lastlbn < lbn) {
642 		nb = lastlbn;
643 		osize = blksize(fs, ip, nb);
644 		if (osize < fs->fs_bsize && osize > 0) {
645 			UFS_LOCK(ump);
646 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
647 				ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
648 				    &dp->di_db[0]), osize, (int)fs->fs_bsize,
649 				    flags, cred, &bp);
650 			if (error)
651 				return (error);
652 			if (DOINGSOFTDEP(vp))
653 				softdep_setup_allocdirect(ip, nb,
654 				    dbtofsb(fs, bp->b_blkno),
655 				    dp->di_db[nb],
656 				    fs->fs_bsize, osize, bp);
			/* The file now ends exactly at the end of block nb. */
657 			ip->i_size = smalllblktosize(fs, nb + 1);
658 			dp->di_size = ip->i_size;
659 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
660 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
661 			if (flags & IO_SYNC)
662 				bwrite(bp);
663 			else
664 				bawrite(bp);
665 		}
666 	}
667 	/*
668 	 * The first NDADDR blocks are direct blocks
669 	 */
670 	if (lbn < NDADDR) {
671 		if (flags & BA_METAONLY)
672 			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
673 		nb = dp->di_db[lbn];
		/*
		 * The block is already allocated and the file size reaches
		 * at least to the end of it: read it and return.
		 */
674 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
675 			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
676 			if (error) {
677 				brelse(bp);
678 				return (error);
679 			}
680 			bp->b_blkno = fsbtodb(fs, nb);
681 			*bpp = bp;
682 			return (0);
683 		}
684 		if (nb != 0) {
685 			/*
686 			 * Consider need to reallocate a fragment.
687 			 */
688 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
689 			nsize = fragroundup(fs, size);
690 			if (nsize <= osize) {
				/* Existing allocation is large enough. */
691 				error = bread(vp, lbn, osize, NOCRED, &bp);
692 				if (error) {
693 					brelse(bp);
694 					return (error);
695 				}
696 				bp->b_blkno = fsbtodb(fs, nb);
697 			} else {
				/* Grow the allocation from osize to nsize. */
698 				UFS_LOCK(ump);
699 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
700 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
701 				       &dp->di_db[0]), osize, nsize, flags,
702 				    cred, &bp);
703 				if (error)
704 					return (error);
705 				if (DOINGSOFTDEP(vp))
706 					softdep_setup_allocdirect(ip, lbn,
707 					    dbtofsb(fs, bp->b_blkno), nb,
708 					    nsize, osize, bp);
709 			}
710 		} else {
			/*
			 * Nothing allocated yet.  Request a fragment-rounded
			 * size when the file size falls short of the end of
			 * this block, otherwise a full block.
			 */
711 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
712 				nsize = fragroundup(fs, size);
713 			else
714 				nsize = fs->fs_bsize;
715 			UFS_LOCK(ump);
716 			error = ffs_alloc(ip, lbn,
717 			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
718 				&dp->di_db[0]), nsize, flags, cred, &newb);
719 			if (error)
720 				return (error);
721 			bp = getblk(vp, lbn, nsize, 0, 0, 0);
722 			bp->b_blkno = fsbtodb(fs, newb);
723 			if (flags & BA_CLRBUF)
724 				vfs_bio_clrbuf(bp);
725 			if (DOINGSOFTDEP(vp))
726 				softdep_setup_allocdirect(ip, lbn, newb, 0,
727 				    nsize, 0, bp);
728 		}
729 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
730 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
731 		*bpp = bp;
732 		return (0);
733 	}
734 	/*
735 	 * Determine the number of levels of indirection.
736 	 */
737 	pref = 0;
738 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
739 		return(error);
740 #ifdef INVARIANTS
741 	if (num < 1)
742 		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
743 #endif
	/*
	 * Build a mask with every bit set except TDP_INBDFLUSH, which keeps
	 * the thread's current value.  ANDing td_pflags with this mask on
	 * every exit below restores that one flag to its previous state and
	 * leaves all other flags untouched.
	 */
744 	saved_inbdflush = ~TDP_INBDFLUSH | (curthread->td_pflags &
745 	    TDP_INBDFLUSH);
746 	curthread->td_pflags |= TDP_INBDFLUSH;
747 	/*
748 	 * Fetch the first indirect block allocating if necessary.
749 	 */
	/* After the decrement the walk below stops when i == num. */
750 	--num;
751 	nb = dp->di_ib[indirs[0].in_off];
752 	allocib = NULL;
753 	allocblk = allociblk;
754 	lbns_remfree = lbns;
755 	if (nb == 0) {
756 		UFS_LOCK(ump);
757 		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
758 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
759 		    flags, cred, &newb)) != 0) {
			/* Nothing has been allocated yet; no unwind needed. */
760 			curthread->td_pflags &= saved_inbdflush;
761 			return (error);
762 		}
763 		nb = newb;
		/* Record the block and its logical number for "fail". */
764 		*allocblk++ = nb;
765 		*lbns_remfree++ = indirs[1].in_lbn;
766 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, 0);
767 		bp->b_blkno = fsbtodb(fs, nb);
768 		vfs_bio_clrbuf(bp);
769 		if (DOINGSOFTDEP(vp)) {
770 			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
771 			    newb, 0, fs->fs_bsize, 0, bp);
772 			bdwrite(bp);
773 		} else {
774 			/*
775 			 * Write synchronously so that indirect blocks
776 			 * never point at garbage.
777 			 */
778 			if (DOINGASYNC(vp))
779 				bdwrite(bp);
780 			else if ((error = bwrite(bp)) != 0)
781 				goto fail;
782 		}
783 		allocib = &dp->di_ib[indirs[0].in_off];
784 		*allocib = nb;
785 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
786 	}
787 	/*
788 	 * Fetch through the indirect blocks, allocating as necessary.
789 	 */
790 	for (i = 1;;) {
791 		error = bread(vp,
792 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
793 		if (error) {
794 			brelse(bp);
795 			goto fail;
796 		}
797 		bap = (ufs2_daddr_t *)bp->b_data;
798 		nb = bap[indirs[i].in_off];
799 		if (i == num)
800 			break;
		/*
		 * After the increment, indirs[i - 1] refers to the block
		 * held in bp and indirs[i] to the next-level block it
		 * points to.
		 */
801 		i += 1;
802 		if (nb != 0) {
803 			bqrelse(bp);
804 			continue;
805 		}
806 		UFS_LOCK(ump);
807 		if (pref == 0)
808 			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
809 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
810 		    flags, cred, &newb)) != 0) {
811 			brelse(bp);
812 			goto fail;
813 		}
814 		nb = newb;
815 		*allocblk++ = nb;
816 		*lbns_remfree++ = indirs[i].in_lbn;
817 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
818 		nbp->b_blkno = fsbtodb(fs, nb);
819 		vfs_bio_clrbuf(nbp);
820 		if (DOINGSOFTDEP(vp)) {
821 			softdep_setup_allocindir_meta(nbp, ip, bp,
822 			    indirs[i - 1].in_off, nb);
823 			bdwrite(nbp);
824 		} else {
825 			/*
826 			 * Write synchronously so that indirect blocks
827 			 * never point at garbage.
828 			 */
829 			if ((error = bwrite(nbp)) != 0) {
830 				brelse(bp);
831 				goto fail;
832 			}
833 		}
834 		bap[indirs[i - 1].in_off] = nb;
		/*
		 * Remember only the first pointer stored into an indirect
		 * block that existed before this call; it is the one the
		 * fail path zeroes when allocib is not set.
		 */
835 		if (allocib == NULL && unwindidx < 0)
836 			unwindidx = i - 1;
837 		/*
838 		 * If required, write synchronously, otherwise use
839 		 * delayed write.
840 		 */
841 		if (flags & IO_SYNC) {
842 			bwrite(bp);
843 		} else {
844 			if (bp->b_bufsize == fs->fs_bsize)
845 				bp->b_flags |= B_CLUSTEROK;
846 			bdwrite(bp);
847 		}
848 	}
849 	/*
850 	 * If asked only for the indirect block, then return it.
851 	 */
852 	if (flags & BA_METAONLY) {
853 		curthread->td_pflags &= saved_inbdflush;
854 		*bpp = bp;
855 		return (0);
856 	}
857 	/*
858 	 * Get the data block, allocating if necessary.
859 	 */
860 	if (nb == 0) {
861 		UFS_LOCK(ump);
862 		pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, &bap[0]);
863 		error = ffs_alloc(ip,
864 		    lbn, pref, (int)fs->fs_bsize, flags, cred, &newb);
865 		if (error) {
866 			brelse(bp);
867 			goto fail;
868 		}
869 		nb = newb;
870 		*allocblk++ = nb;
871 		*lbns_remfree++ = lbn;
872 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
873 		nbp->b_blkno = fsbtodb(fs, nb);
874 		if (flags & BA_CLRBUF)
875 			vfs_bio_clrbuf(nbp);
876 		if (DOINGSOFTDEP(vp))
877 			softdep_setup_allocindir_page(ip, lbn, bp,
878 			    indirs[i].in_off, nb, 0, nbp);
879 		bap[indirs[i].in_off] = nb;
880 		/*
881 		 * If required, write synchronously, otherwise use
882 		 * delayed write.
883 		 */
884 		if (flags & IO_SYNC) {
885 			bwrite(bp);
886 		} else {
887 			if (bp->b_bufsize == fs->fs_bsize)
888 				bp->b_flags |= B_CLUSTEROK;
889 			bdwrite(bp);
890 		}
891 		curthread->td_pflags &= saved_inbdflush;
892 		*bpp = nbp;
893 		return (0);
894 	}
	/* The data block already exists; the indirect block is done with. */
895 	brelse(bp);
896 	/*
897 	 * If requested clear invalid portions of the buffer.  If we
898 	 * have to do a read-before-write (typical if BA_CLRBUF is set),
899 	 * try to do some read-ahead in the sequential case to reduce
900 	 * the number of I/O transactions.
901 	 */
902 	if (flags & BA_CLRBUF) {
903 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
904 		if (seqcount && (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
905 			error = cluster_read(vp, ip->i_size, lbn,
906 			    (int)fs->fs_bsize, NOCRED,
907 			    MAXBSIZE, seqcount, &nbp);
908 		} else {
909 			error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
910 		}
911 		if (error) {
912 			brelse(nbp);
913 			goto fail;
914 		}
915 	} else {
916 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, 0);
917 		nbp->b_blkno = fsbtodb(fs, nb);
918 	}
919 	curthread->td_pflags &= saved_inbdflush;
920 	*bpp = nbp;
921 	return (0);
922 fail:
923 	curthread->td_pflags &= saved_inbdflush;
924 	/*
925 	 * If we have failed to allocate any blocks, simply return the error.
926 	 * This is the usual case and avoids the need to fsync the file.
927 	 */
928 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
929 		return (error);
930 	/*
931 	 * If we have failed part way through block allocation, we
932 	 * have to deallocate any indirect blocks that we have allocated.
933 	 * We have to fsync the file before we start to get rid of all
934 	 * of its dependencies so that we do not leave them dangling.
935 	 * We have to sync it at the end so that the soft updates code
936 	 * does not find any untracked changes. Although this is really
937 	 * slow, running out of disk space is not expected to be a common
938 	 * occurrence. The error return from fsync is ignored as we already
939 	 * have an error to return to the user.
940 	 *
941 	 * XXX Still have to journal the free below
942 	 */
943 	(void) ffs_syncvnode(vp, MNT_WAIT);
	/*
	 * Walk the recorded allocations in parallel with their logical
	 * block numbers; "deallocated" accumulates fs_bsize bytes per block.
	 */
944 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
945 	     blkp < allocblk; blkp++, lbns_remfree++) {
946 		/*
947 		 * We shall not leave the freed blocks on the vnode
948 		 * buffer object lists.
949 		 */
950 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
951 		if (bp != NULL) {
952 			bp->b_flags |= (B_INVAL | B_RELBUF);
953 			bp->b_flags &= ~B_ASYNC;
954 			brelse(bp);
955 		}
956 		deallocated += fs->fs_bsize;
957 	}
958 	if (allocib != NULL) {
		/* This call filled the inode's di_ib[] slot: clear it. */
959 		*allocib = 0;
960 	} else if (unwindidx >= 0) {
961 		int r;
962 
		/*
		 * Otherwise clear the first pointer this call stored into
		 * an already existing indirect block.
		 */
963 		r = bread(vp, indirs[unwindidx].in_lbn,
964 		    (int)fs->fs_bsize, NOCRED, &bp);
965 		if (r) {
966 			panic("Could not unwind indirect block, error %d", r);
			/* NOTE(review): follows panic(); presumably unreachable. */
967 			brelse(bp);
968 		} else {
969 			bap = (ufs2_daddr_t *)bp->b_data;
970 			bap[indirs[unwindidx].in_off] = 0;
971 			if (flags & IO_SYNC) {
972 				bwrite(bp);
973 			} else {
974 				if (bp->b_bufsize == fs->fs_bsize)
975 					bp->b_flags |= B_CLUSTEROK;
976 				bdwrite(bp);
977 			}
978 		}
979 	}
980 	if (deallocated) {
981 #ifdef QUOTA
982 		/*
983 		 * Restore user's disk quota because allocation failed.
984 		 */
985 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
986 #endif
987 		dp->di_blocks -= btodb(deallocated);
988 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
989 	}
990 	(void) ffs_syncvnode(vp, MNT_WAIT);
991 	/*
992 	 * After the buffers are invalidated and on-disk pointers are
993 	 * cleared, free the blocks.
994 	 */
995 	for (blkp = allociblk; blkp < allocblk; blkp++) {
996 		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
997 		    ip->i_number, NULL);
998 	}
999 	return (error);
1000 }
1001