xref: /freebsd/sys/ufs/ffs/ffs_balloc.c (revision 2e3f49888ec8851bafb22011533217487764fdb0)
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause)
3  *
4  * Copyright (c) 2002 Networks Associates Technology, Inc.
5  * All rights reserved.
6  *
7  * This software was developed for the FreeBSD Project by Marshall
8  * Kirk McKusick and Network Associates Laboratories, the Security
9  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
11  * research program
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1982, 1986, 1989, 1993
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  */
61 
62 #include <sys/param.h>
63 #include <sys/systm.h>
64 #include <sys/bio.h>
65 #include <sys/buf.h>
66 #include <sys/lock.h>
67 #include <sys/mount.h>
68 #include <sys/stat.h>
69 #include <sys/vnode.h>
70 #include <sys/vmmeter.h>
71 
72 #include <ufs/ufs/quota.h>
73 #include <ufs/ufs/inode.h>
74 #include <ufs/ufs/ufs_extern.h>
75 #include <ufs/ufs/extattr.h>
76 #include <ufs/ufs/ufsmount.h>
77 
78 #include <ufs/ffs/fs.h>
79 #include <ufs/ffs/ffs_extern.h>
80 
81 /*
82  * Balloc defines the structure of filesystem storage
83  * by allocating the physical blocks on a device given
84  * the inode and the logical block number in a file.
85  * This is the allocation strategy for UFS1. Below is
86  * the allocation strategy for UFS2.
 *
 * Parameters:
 *	vp		vnode of the file; its inode is obtained with VTOI().
 *	startoffset	byte offset in the file; selects logical block
 *			lblkno(fs, startoffset).
 *	size		byte count starting at startoffset.  The offset
 *			within the block is added to it and the sum may not
 *			exceed fs_bsize (the function panics otherwise).
 *	cred		credential handed to the allocator, the soft updates
 *			cleanup request and the quota call.
 *	flags		IO_ and BA_ flags.  Tested here: IO_EXT (rejected
 *			with EOPNOTSUPP), IO_SYNC, BA_UNMAPPED, BA_CLRBUF,
 *			BA_METAONLY and the count stored under BA_SEQMASK.
 *	bpp		out: set to NULL on entry and to the resulting
 *			buffer on success.  With BA_METAONLY on an indirect
 *			lookup this is the last-level indirect block rather
 *			than the data block.
 *
 * Returns 0 on success.  Errors produced directly by this function are
 * EOPNOTSUPP (IO_EXT), EFBIG (negative logical block) and EIO (getblk()
 * returned NULL); any other value is passed through from a called routine.
 *
 * The function has three stages: extend a trailing fragment of the file
 * to a full block when writing beyond it, handle direct blocks
 * (lbn < UFS_NDADDR), and otherwise walk/allocate the chain of indirect
 * blocks followed by the data block.  If the indirect stage fails after
 * blocks were allocated, the "fail" path undoes those allocations.
87  */
88 int
89 ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
90     struct ucred *cred, int flags, struct buf **bpp)
91 {
92 	struct inode *ip;
93 	struct ufs1_dinode *dp;
94 	ufs_lbn_t lbn, lastlbn;
95 	struct fs *fs;
96 	ufs1_daddr_t nb;
97 	struct buf *bp, *nbp;
98 	struct mount *mp;
99 	struct ufsmount *ump;
100 	struct indir indirs[UFS_NIADDR + 2];
101 	int deallocated, osize, nsize, num, i, error;
102 	ufs2_daddr_t newb;
103 	ufs1_daddr_t *bap, pref;
104 	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1];
105 	ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1];
106 	int unwindidx = -1;
107 	int saved_inbdflush;
108 	int gbflags, reclaimed;
109 
110 	ip = VTOI(vp);
111 	dp = ip->i_din1;
112 	fs = ITOFS(ip);
113 	mp = ITOVFS(ip);
114 	ump = ITOUMP(ip);
	/*
	 * From here on "size" is measured from the start of logical block
	 * lbn rather than from startoffset.
	 */
115 	lbn = lblkno(fs, startoffset);
116 	size = blkoff(fs, startoffset) + size;
117 	reclaimed = 0;
118 	if (size > fs->fs_bsize)
119 		panic("ffs_balloc_ufs1: blk too big");
120 	*bpp = NULL;
	/*
	 * The two early returns below happen before vn_seqc_write_begin()
	 * and therefore bypass the "done" label, which calls
	 * vn_seqc_write_end().
	 */
121 	if (flags & IO_EXT)
122 		return (EOPNOTSUPP);
123 	if (lbn < 0)
124 		return (EFBIG);
	/* Translate BA_UNMAPPED into the GB_UNMAPPED flag used for buffers. */
125 	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
126 
127 	vn_seqc_write_begin(vp);
128 
129 	/*
130 	 * If the next write will extend the file into a new block,
131 	 * and the file is currently composed of a fragment
132 	 * this fragment has to be extended to be a full block.
133 	 */
134 	lastlbn = lblkno(fs, ip->i_size);
135 	if (lastlbn < UFS_NDADDR && lastlbn < lbn) {
136 		nb = lastlbn;
137 		osize = blksize(fs, ip, nb);
138 		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * NOTE(review): no UFS_UNLOCK() in this function is
			 * paired with the UFS_LOCK() taken before
			 * ffs_realloccg()/ffs_alloc(); those routines
			 * presumably return with the lock released -- confirm.
			 */
139 			UFS_LOCK(ump);
140 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
141 			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
142 			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
143 			   cred, &bp);
144 			if (error)
145 				goto done;
146 			if (DOINGSOFTDEP(vp))
147 				softdep_setup_allocdirect(ip, nb,
148 				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
149 				    fs->fs_bsize, osize, bp);
			/*
			 * Grow the recorded file size to
			 * smalllblktosize(fs, nb + 1) in both the in-core
			 * inode and the dinode, and store the block address
			 * returned in bp.
			 */
150 			ip->i_size = smalllblktosize(fs, nb + 1);
151 			dp->di_size = ip->i_size;
152 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
153 			UFS_INODE_SET_FLAG(ip,
154 			    IN_SIZEMOD | IN_CHANGE | IN_UPDATE | IN_IBLKDATA);
			/*
			 * Write the extended block: bwrite() for IO_SYNC,
			 * bdwrite() when DOINGASYNC(vp), bawrite() otherwise.
			 */
155 			if (flags & IO_SYNC)
156 				bwrite(bp);
157 			else if (DOINGASYNC(vp))
158 				bdwrite(bp);
159 			else
160 				bawrite(bp);
161 		}
162 	}
163 	/*
164 	 * The first UFS_NDADDR blocks are direct blocks
165 	 */
166 	if (lbn < UFS_NDADDR) {
167 		if (flags & BA_METAONLY)
168 			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
169 		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and the file size reaches at least
		 * smalllblktosize(fs, lbn + 1): nothing to allocate, just
		 * hand back its buffer.
		 */
170 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
171 			if ((flags & BA_CLRBUF) != 0) {
172 				error = bread(vp, lbn, fs->fs_bsize, NOCRED,
173 				    &bp);
174 				if (error != 0)
175 					goto done;
176 			} else {
				/*
				 * BA_CLRBUF not set: the buffer is obtained
				 * with getblk() instead of bread() and
				 * vfs_bio_clrbuf() is applied to it.
				 */
177 				bp = getblk(vp, lbn, fs->fs_bsize, 0, 0,
178 				    gbflags);
179 				if (bp == NULL) {
180 					error = EIO;
181 					goto done;
182 				}
183 				vfs_bio_clrbuf(bp);
184 			}
185 			bp->b_blkno = fsbtodb(fs, nb);
186 			*bpp = bp;
187 			error = 0;
188 			goto done;
189 		}
190 		if (nb != 0) {
191 			/*
192 			 * Consider need to reallocate a fragment.
193 			 */
194 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
195 			nsize = fragroundup(fs, size);
196 			if (nsize <= osize) {
				/* The existing fragment covers the request. */
197 				error = bread(vp, lbn, osize, NOCRED, &bp);
198 				if (error)
199 					goto done;
200 				bp->b_blkno = fsbtodb(fs, nb);
201 			} else {
				/* Grow the fragment from osize to nsize. */
202 				UFS_LOCK(ump);
203 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
204 				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
205 				    &dp->di_db[0]), osize, nsize, flags,
206 				    cred, &bp);
207 				if (error)
208 					goto done;
209 				if (DOINGSOFTDEP(vp))
210 					softdep_setup_allocdirect(ip, lbn,
211 					    dbtofsb(fs, bp->b_blkno), nb,
212 					    nsize, osize, bp);
213 			}
214 		} else {
			/*
			 * No block here yet.  Allocate only the fragments
			 * needed when the file size is below
			 * smalllblktosize(fs, lbn + 1), a full block
			 * otherwise.
			 */
215 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
216 				nsize = fragroundup(fs, size);
217 			else
218 				nsize = fs->fs_bsize;
219 			UFS_LOCK(ump);
220 			error = ffs_alloc(ip, lbn,
221 			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
222 			    nsize, flags, cred, &newb);
223 			if (error)
224 				goto done;
225 			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
226 			bp->b_blkno = fsbtodb(fs, newb);
227 			if (flags & BA_CLRBUF)
228 				vfs_bio_clrbuf(bp);
229 			if (DOINGSOFTDEP(vp))
230 				softdep_setup_allocdirect(ip, lbn, newb, 0,
231 				    nsize, 0, bp);
232 		}
		/* Record the (possibly new) block address in the dinode. */
233 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
234 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE | IN_IBLKDATA);
235 		*bpp = bp;
236 		error = 0;
237 		goto done;
238 	}
239 	/*
240 	 * Determine the number of levels of indirection.
241 	 */
242 	pref = 0;
243 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
244 		goto done;
245 #ifdef INVARIANTS
246 	if (num < 1)
247 		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
248 #endif
	/*
	 * TDP_INBDFLUSH is set on the current thread for the rest of the
	 * indirect-block work; every exit path below restores the saved
	 * state with curthread_pflags_restore().
	 */
249 	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
250 	/*
251 	 * Fetch the first indirect block allocating if necessary.
252 	 */
	/*
	 * After the decrement, indirs[num] is the last indirect level: the
	 * loop below stops when i == num.
	 */
253 	--num;
254 	nb = dp->di_ib[indirs[0].in_off];
	/*
	 * allociblk[] and lbns[] collect, in parallel, the physical block
	 * number and the logical block number of every block allocated by
	 * this call so that the "fail" path can undo them.  allocib points
	 * at the dinode slot of a newly allocated first-level indirect
	 * block, or stays NULL when that block already existed.
	 */
255 	allocib = NULL;
256 	allocblk = allociblk;
257 	lbns_remfree = lbns;
258 	if (nb == 0) {
259 		UFS_LOCK(ump);
260 		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
261 		    (ufs1_daddr_t *)0);
262 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
263 		    flags, cred, &newb)) != 0) {
			/* Nothing recorded for unwinding yet; skip "fail". */
264 			curthread_pflags_restore(saved_inbdflush);
265 			goto done;
266 		}
		/* Prefer the position right after this block next time. */
267 		pref = newb + fs->fs_frag;
268 		nb = newb;
269 		MPASS(allocblk < allociblk + nitems(allociblk));
270 		MPASS(lbns_remfree < lbns + nitems(lbns));
271 		*allocblk++ = nb;
272 		*lbns_remfree++ = indirs[1].in_lbn;
273 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
274 		bp->b_blkno = fsbtodb(fs, nb);
275 		vfs_bio_clrbuf(bp);
276 		if (DOINGSOFTDEP(vp)) {
277 			softdep_setup_allocdirect(ip,
278 			    UFS_NDADDR + indirs[0].in_off, newb, 0,
279 			    fs->fs_bsize, 0, bp);
280 			bdwrite(bp);
281 		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
282 			if (bp->b_bufsize == fs->fs_bsize)
283 				bp->b_flags |= B_CLUSTEROK;
284 			bdwrite(bp);
285 		} else {
			/*
			 * On failure allocib is still NULL: the block is in
			 * allociblk[] but not yet referenced by the dinode.
			 */
286 			if ((error = bwrite(bp)) != 0)
287 				goto fail;
288 		}
		/*
		 * Hook the new indirect block into the dinode and remember
		 * the slot so that "fail" can clear it.
		 */
289 		allocib = &dp->di_ib[indirs[0].in_off];
290 		*allocib = nb;
291 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE | IN_IBLKDATA);
292 	}
293 	/*
294 	 * Fetch through the indirect blocks, allocating as necessary.
295 	 */
296 retry:
297 	for (i = 1;;) {
		/*
		 * Each pass reads indirect block indirs[i] and picks up the
		 * entry for the next level.  i is advanced before the
		 * allocation code, so below indirs[i] is the block being
		 * created and indirs[i - 1] is its parent, still held in
		 * bp/bap.
		 */
298 		error = bread(vp,
299 		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
300 		if (error) {
301 			goto fail;
302 		}
303 		bap = (ufs1_daddr_t *)bp->b_data;
304 		nb = bap[indirs[i].in_off];
305 		if ((error = UFS_CHECK_BLKNO(mp, ip->i_number, nb,
306 		    fs->fs_bsize)) != 0) {
307 			brelse(bp);
308 			goto fail;
309 		}
		/* Last level reached: leave the loop with bp still held. */
310 		if (i == num)
311 			break;
312 		i += 1;
313 		if (nb != 0) {
			/* The next level already exists; go read it. */
314 			bqrelse(bp);
315 			continue;
316 		}
317 		UFS_LOCK(ump);
318 		/*
319 		 * If parent indirect has just been allocated, try to cluster
320 		 * immediately following it.
321 		 */
322 		if (pref == 0)
323 			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
324 			    (ufs1_daddr_t *)0);
		/*
		 * NOTE(review): IO_BUFLOCKED is added here, presumably
		 * because bp is still held across the allocation -- confirm.
		 */
325 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
326 		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
327 			brelse(bp);
			/*
			 * With soft updates, request a cleanup and retry
			 * once; "reclaimed" limits this to a single retry,
			 * shared with the data block allocation below.  The
			 * retry restarts the walk at i = 1 while blocks
			 * already recorded in allociblk[] stay recorded.
			 * Otherwise report "filesystem full", rate-limited by
			 * ppsratecheck(), unless ffs_fsfail_cleanup_locked()
			 * returns true.
			 */
328 			UFS_LOCK(ump);
329 			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
330 				softdep_request_cleanup(fs, vp, cred,
331 				    FLUSH_BLOCKS_WAIT);
332 				UFS_UNLOCK(ump);
333 				goto retry;
334 			}
335 			if (!ffs_fsfail_cleanup_locked(ump, error) &&
336 			    ppsratecheck(&ump->um_last_fullmsg,
337 			    &ump->um_secs_fullmsg, 1)) {
338 				UFS_UNLOCK(ump);
339 				ffs_fserr(fs, ip->i_number, "filesystem full");
340 				uprintf("\n%s: write failed, filesystem "
341 				    "is full\n", fs->fs_fsmnt);
342 			} else {
343 				UFS_UNLOCK(ump);
344 			}
345 			goto fail;
346 		}
347 		pref = newb + fs->fs_frag;
348 		nb = newb;
349 		MPASS(allocblk < allociblk + nitems(allociblk));
350 		MPASS(lbns_remfree < lbns + nitems(lbns));
351 		*allocblk++ = nb;
352 		*lbns_remfree++ = indirs[i].in_lbn;
		/* Note: this getblk() passes 0, not gbflags. */
353 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
354 		nbp->b_blkno = fsbtodb(fs, nb);
355 		vfs_bio_clrbuf(nbp);
356 		if (DOINGSOFTDEP(vp)) {
357 			softdep_setup_allocindir_meta(nbp, ip, bp,
358 			    indirs[i - 1].in_off, nb);
359 			bdwrite(nbp);
360 		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
361 			if (nbp->b_bufsize == fs->fs_bsize)
362 				nbp->b_flags |= B_CLUSTEROK;
363 			bdwrite(nbp);
364 		} else {
365 			if ((error = bwrite(nbp)) != 0) {
366 				brelse(bp);
367 				goto fail;
368 			}
369 		}
		/*
		 * Point the parent at the new indirect block.  When the
		 * first-level indirect already existed (allocib == NULL),
		 * remember the first indirs[] entry whose pointer was set by
		 * this call; "fail" zeroes that slot.
		 */
370 		bap[indirs[i - 1].in_off] = nb;
371 		if (allocib == NULL && unwindidx < 0)
372 			unwindidx = i - 1;
373 		/*
374 		 * If required, write synchronously, otherwise use
375 		 * delayed write.
376 		 */
377 		if (flags & IO_SYNC) {
378 			bwrite(bp);
379 		} else {
380 			if (bp->b_bufsize == fs->fs_bsize)
381 				bp->b_flags |= B_CLUSTEROK;
382 			bdwrite(bp);
383 		}
384 	}
385 	/*
386 	 * If asked only for the indirect block, then return it.
387 	 */
388 	if (flags & BA_METAONLY) {
389 		curthread_pflags_restore(saved_inbdflush);
390 		*bpp = bp;
391 		error = 0;
392 		goto done;
393 	}
394 	/*
395 	 * Get the data block, allocating if necessary.
396 	 */
397 	if (nb == 0) {
398 		UFS_LOCK(ump);
399 		/*
400 		 * If allocating metadata at the front of the cylinder
401 		 * group and parent indirect block has just been allocated,
402 		 * then cluster next to it if it is the first indirect in
403 		 * the file. Otherwise it has been allocated in the metadata
404 		 * area, so we want to find our own place out in the data area.
405 		 */
406 		if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0))
407 			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
408 			    &bap[0]);
409 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
410 		    flags | IO_BUFLOCKED, cred, &newb);
411 		if (error) {
			/* Same recovery as for the indirect blocks above. */
412 			brelse(bp);
413 			UFS_LOCK(ump);
414 			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
415 				softdep_request_cleanup(fs, vp, cred,
416 				    FLUSH_BLOCKS_WAIT);
417 				UFS_UNLOCK(ump);
418 				goto retry;
419 			}
420 			if (!ffs_fsfail_cleanup_locked(ump, error) &&
421 			    ppsratecheck(&ump->um_last_fullmsg,
422 			    &ump->um_secs_fullmsg, 1)) {
423 				UFS_UNLOCK(ump);
424 				ffs_fserr(fs, ip->i_number, "filesystem full");
425 				uprintf("\n%s: write failed, filesystem "
426 				    "is full\n", fs->fs_fsmnt);
427 			} else {
428 				UFS_UNLOCK(ump);
429 			}
430 			goto fail;
431 		}
		/* Record the data block like the indirect blocks before it. */
432 		nb = newb;
433 		MPASS(allocblk < allociblk + nitems(allociblk));
434 		MPASS(lbns_remfree < lbns + nitems(lbns));
435 		*allocblk++ = nb;
436 		*lbns_remfree++ = lbn;
437 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
438 		nbp->b_blkno = fsbtodb(fs, nb);
439 		if (flags & BA_CLRBUF)
440 			vfs_bio_clrbuf(nbp);
441 		if (DOINGSOFTDEP(vp))
442 			softdep_setup_allocindir_page(ip, lbn, bp,
443 			    indirs[i].in_off, nb, 0, nbp);
		/* Store the new data block in the last-level indirect. */
444 		bap[indirs[i].in_off] = nb;
445 		/*
446 		 * If required, write synchronously, otherwise use
447 		 * delayed write.
448 		 */
449 		if (flags & IO_SYNC) {
450 			bwrite(bp);
451 		} else {
452 			if (bp->b_bufsize == fs->fs_bsize)
453 				bp->b_flags |= B_CLUSTEROK;
454 			bdwrite(bp);
455 		}
456 		curthread_pflags_restore(saved_inbdflush);
457 		*bpp = nbp;
458 		error = 0;
459 		goto done;
460 	}
	/*
	 * The data block already exists, so the indirect block is released
	 * and only the data buffer is returned.
	 */
461 	brelse(bp);
462 	if (flags & BA_CLRBUF) {
		/*
		 * Read the existing block.  cluster_read() is used when a
		 * non-zero count was passed under BA_SEQMASK, the mount does
		 * not have MNT_NOCLUSTERR set, and neither
		 * vm_page_count_severe() nor buf_dirty_count_severe()
		 * reports true; bread_gb() is used otherwise.
		 */
463 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
464 		if (seqcount != 0 &&
465 		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
466 		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
467 			error = cluster_read(vp, ip->i_size, lbn,
468 			    (int)fs->fs_bsize, NOCRED,
469 			    MAXBSIZE, seqcount, gbflags, &nbp);
470 		} else {
471 			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
472 			    gbflags, &nbp);
473 		}
474 		if (error) {
			/*
			 * NOTE(review): nbp is handed to brelse() after a
			 * failed read; this relies on the read routines
			 * leaving nbp in a state brelse() accepts -- confirm.
			 */
475 			brelse(nbp);
476 			goto fail;
477 		}
478 	} else {
		/* BA_CLRBUF not set: get the buffer without reading it. */
479 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
480 		nbp->b_blkno = fsbtodb(fs, nb);
481 	}
482 	curthread_pflags_restore(saved_inbdflush);
483 	*bpp = nbp;
484 	error = 0;
485 	goto done;
486 fail:
487 	curthread_pflags_restore(saved_inbdflush);
488 	/*
489 	 * If we have failed to allocate any blocks, simply return the error.
490 	 * This is the usual case and avoids the need to fsync the file.
491 	 */
492 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
493 		goto done;
494 	/*
495 	 * If we have failed part way through block allocation, we
496 	 * have to deallocate any indirect blocks that we have allocated.
497 	 * We have to fsync the file before we start to get rid of all
498 	 * of its dependencies so that we do not leave them dangling.
499 	 * We have to sync it at the end so that the soft updates code
500 	 * does not find any untracked changes. Although this is really
501 	 * slow, running out of disk space is not expected to be a common
502 	 * occurrence. The error return from fsync is ignored as we already
503 	 * have an error to return to the user.
504 	 *
505 	 * XXX Still have to journal the free below
506 	 */
507 	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * Walk allociblk[] and lbns[] in step; "deallocated" accumulates
	 * one fs_bsize per recorded block, whether or not a buffer for it
	 * was found.
	 */
508 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
509 	     blkp < allocblk; blkp++, lbns_remfree++) {
510 		/*
511 		 * We shall not leave the freed blocks on the vnode
512 		 * buffer object lists.
513 		 */
514 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
515 		    GB_NOCREAT | GB_UNMAPPED);
516 		if (bp != NULL) {
517 			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
518 			    ("mismatch1 l %jd %jd b %ju %ju",
519 			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
520 			    (uintmax_t)bp->b_blkno,
521 			    (uintmax_t)fsbtodb(fs, *blkp)));
522 			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
523 			bp->b_flags &= ~(B_ASYNC | B_CACHE);
524 			brelse(bp);
525 		}
526 		deallocated += fs->fs_bsize;
527 	}
	/*
	 * Clear the pointer that linked the new blocks into the file: the
	 * dinode slot when the first-level indirect was allocated here,
	 * otherwise the entry remembered in unwindidx.
	 */
528 	if (allocib != NULL) {
529 		*allocib = 0;
530 	} else if (unwindidx >= 0) {
531 		int r;
532 
533 		r = bread(vp, indirs[unwindidx].in_lbn,
534 		    (int)fs->fs_bsize, NOCRED, &bp);
535 		if (r) {
			/*
			 * NOTE(review): panic() presumably does not return,
			 * which would make the brelse() below unreachable
			 * -- confirm.
			 */
536 			panic("Could not unwind indirect block, error %d", r);
537 			brelse(bp);
538 		} else {
539 			bap = (ufs1_daddr_t *)bp->b_data;
540 			bap[indirs[unwindidx].in_off] = 0;
541 			if (flags & IO_SYNC) {
542 				bwrite(bp);
543 			} else {
544 				if (bp->b_bufsize == fs->fs_bsize)
545 					bp->b_flags |= B_CLUSTEROK;
546 				bdwrite(bp);
547 			}
548 		}
549 	}
550 	if (deallocated) {
551 #ifdef QUOTA
552 		/*
553 		 * Restore user's disk quota because allocation failed.
554 		 */
555 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
556 #endif
		/* Take the undone blocks back out of the dinode's count. */
557 		dp->di_blocks -= btodb(deallocated);
558 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
559 	}
560 	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
561 	/*
562 	 * After the buffers are invalidated and on-disk pointers are
563 	 * cleared, free the blocks.
564 	 */
565 	for (blkp = allociblk; blkp < allocblk; blkp++) {
566 #ifdef INVARIANTS
		/*
		 * Under INVARIANTS, panic if a buffer for one of the freed
		 * logical blocks can still be found on the vnode.
		 */
567 		if (blkp == allociblk)
568 			lbns_remfree = lbns;
569 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
570 		    GB_NOCREAT | GB_UNMAPPED);
571 		if (bp != NULL) {
572 			panic("zombie1 %jd %ju %ju",
573 			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
574 			    (uintmax_t)fsbtodb(fs, *blkp));
575 		}
576 		lbns_remfree++;
577 #endif
578 		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
579 		    ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
580 	}
	/* Common exit for every path taken after vn_seqc_write_begin(). */
581 done:
582 	vn_seqc_write_end(vp);
583 	return (error);
584 }
585 
586 /*
587  * Balloc defines the structure of file system storage
588  * by allocating the physical blocks on a device given
589  * the inode and the logical block number in a file.
590  * This is the allocation strategy for UFS2. Above is
591  * the allocation strategy for UFS1.
592  */
593 int
594 ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
595     struct ucred *cred, int flags, struct buf **bpp)
596 {
597 	struct inode *ip;
598 	struct ufs2_dinode *dp;
599 	ufs_lbn_t lbn, lastlbn;
600 	struct fs *fs;
601 	struct buf *bp, *nbp;
602 	struct mount *mp;
603 	struct ufsmount *ump;
604 	struct indir indirs[UFS_NIADDR + 2];
605 	ufs2_daddr_t nb, newb, *bap, pref;
606 	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[UFS_NIADDR + 1];
607 	ufs2_daddr_t *lbns_remfree, lbns[UFS_NIADDR + 1];
608 	int deallocated, osize, nsize, num, i, error;
609 	int unwindidx = -1;
610 	int saved_inbdflush;
611 	int gbflags, gbwflag, reclaimed;
612 
613 	ip = VTOI(vp);
614 	dp = ip->i_din2;
615 	fs = ITOFS(ip);
616 	mp = ITOVFS(ip);
617 	ump = ITOUMP(ip);
618 	lbn = lblkno(fs, startoffset);
619 	size = blkoff(fs, startoffset) + size;
620 	reclaimed = 0;
621 	if (size > fs->fs_bsize)
622 		panic("ffs_balloc_ufs2: blk too big");
623 	*bpp = NULL;
624 	if (lbn < 0)
625 		return (EFBIG);
626 	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
627 #ifdef WITNESS
628 	gbwflag = IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0;
629 	gbflags |= gbwflag;
630 #else
631 	gbwflag = 0;
632 #endif
633 
634 	vn_seqc_write_begin(vp);
635 
636 	/*
637 	 * Check for allocating external data.
638 	 */
639 	if (flags & IO_EXT) {
640 		if (lbn >= UFS_NXADDR) {
641 			error = EFBIG;
642 			goto done;
643 		}
644 
645 		/*
646 		 * If the next write will extend the data into a new block,
647 		 * and the data is currently composed of a fragment
648 		 * this fragment has to be extended to be a full block.
649 		 */
650 		lastlbn = lblkno(fs, dp->di_extsize);
651 		if (lastlbn < lbn) {
652 			nb = lastlbn;
653 			osize = sblksize(fs, dp->di_extsize, nb);
654 			if (osize < fs->fs_bsize && osize > 0) {
655 				UFS_LOCK(ump);
656 				error = ffs_realloccg(ip, -1 - nb,
657 				    dp->di_extb[nb],
658 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
659 				    &dp->di_extb[0]), osize,
660 				    (int)fs->fs_bsize, flags, cred, &bp);
661 				if (error)
662 					goto done;
663 				if (DOINGSOFTDEP(vp))
664 					softdep_setup_allocext(ip, nb,
665 					    dbtofsb(fs, bp->b_blkno),
666 					    dp->di_extb[nb],
667 					    fs->fs_bsize, osize, bp);
668 				dp->di_extsize = smalllblktosize(fs, nb + 1);
669 				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
670 				bp->b_xflags |= BX_ALTDATA;
671 				UFS_INODE_SET_FLAG(ip,
672 				    IN_SIZEMOD | IN_CHANGE | IN_IBLKDATA);
673 				if (flags & IO_SYNC)
674 					bwrite(bp);
675 				else
676 					bawrite(bp);
677 			}
678 		}
679 		/*
680 		 * All blocks are direct blocks
681 		 */
682 		if (flags & BA_METAONLY)
683 			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
684 		nb = dp->di_extb[lbn];
685 		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
686 			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
687 			    gbflags, &bp);
688 			if (error)
689 				goto done;
690 			bp->b_blkno = fsbtodb(fs, nb);
691 			bp->b_xflags |= BX_ALTDATA;
692 			*bpp = bp;
693 			goto done;
694 		}
695 		if (nb != 0) {
696 			/*
697 			 * Consider need to reallocate a fragment.
698 			 */
699 			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
700 			nsize = fragroundup(fs, size);
701 			if (nsize <= osize) {
702 				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
703 				    gbflags, &bp);
704 				if (error)
705 					goto done;
706 				bp->b_blkno = fsbtodb(fs, nb);
707 				bp->b_xflags |= BX_ALTDATA;
708 			} else {
709 				UFS_LOCK(ump);
710 				error = ffs_realloccg(ip, -1 - lbn,
711 				    dp->di_extb[lbn],
712 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
713 				    &dp->di_extb[0]), osize, nsize, flags,
714 				    cred, &bp);
715 				if (error)
716 					goto done;
717 				bp->b_xflags |= BX_ALTDATA;
718 				if (DOINGSOFTDEP(vp))
719 					softdep_setup_allocext(ip, lbn,
720 					    dbtofsb(fs, bp->b_blkno), nb,
721 					    nsize, osize, bp);
722 			}
723 		} else {
724 			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
725 				nsize = fragroundup(fs, size);
726 			else
727 				nsize = fs->fs_bsize;
728 			UFS_LOCK(ump);
729 			error = ffs_alloc(ip, lbn,
730 			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
731 			   nsize, flags, cred, &newb);
732 			if (error)
733 				goto done;
734 			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
735 			bp->b_blkno = fsbtodb(fs, newb);
736 			bp->b_xflags |= BX_ALTDATA;
737 			if (flags & BA_CLRBUF)
738 				vfs_bio_clrbuf(bp);
739 			if (DOINGSOFTDEP(vp))
740 				softdep_setup_allocext(ip, lbn, newb, 0,
741 				    nsize, 0, bp);
742 		}
743 		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
744 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_IBLKDATA);
745 		*bpp = bp;
746 		error = 0;
747 		goto done;
748 	}
749 	/*
750 	 * If the next write will extend the file into a new block,
751 	 * and the file is currently composed of a fragment
752 	 * this fragment has to be extended to be a full block.
753 	 */
754 	lastlbn = lblkno(fs, ip->i_size);
755 	if (lastlbn < UFS_NDADDR && lastlbn < lbn) {
756 		nb = lastlbn;
757 		osize = blksize(fs, ip, nb);
758 		if (osize < fs->fs_bsize && osize > 0) {
759 			UFS_LOCK(ump);
760 			error = ffs_realloccg(ip, nb, dp->di_db[nb],
761 			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
762 			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
763 			    flags, cred, &bp);
764 			if (error)
765 				goto done;
766 			if (DOINGSOFTDEP(vp))
767 				softdep_setup_allocdirect(ip, nb,
768 				    dbtofsb(fs, bp->b_blkno),
769 				    dp->di_db[nb],
770 				    fs->fs_bsize, osize, bp);
771 			ip->i_size = smalllblktosize(fs, nb + 1);
772 			dp->di_size = ip->i_size;
773 			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
774 			UFS_INODE_SET_FLAG(ip,
775 			    IN_SIZEMOD |IN_CHANGE | IN_UPDATE | IN_IBLKDATA);
776 			if (flags & IO_SYNC)
777 				bwrite(bp);
778 			else
779 				bawrite(bp);
780 		}
781 	}
782 	/*
783 	 * The first UFS_NDADDR blocks are direct blocks
784 	 */
785 	if (lbn < UFS_NDADDR) {
786 		if (flags & BA_METAONLY)
787 			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
788 		nb = dp->di_db[lbn];
789 		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
790 			if ((flags & BA_CLRBUF) != 0) {
791 				error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
792 				    gbflags, &bp);
793 				if (error != 0)
794 					goto done;
795 			} else {
796 				bp = getblk(vp, lbn, fs->fs_bsize, 0, 0,
797 				    gbflags);
798 				if (bp == NULL) {
799 					error = EIO;
800 					goto done;
801 				}
802 				vfs_bio_clrbuf(bp);
803 			}
804 			bp->b_blkno = fsbtodb(fs, nb);
805 			*bpp = bp;
806 			error = 0;
807 			goto done;
808 		}
809 		if (nb != 0) {
810 			/*
811 			 * Consider need to reallocate a fragment.
812 			 */
813 			osize = fragroundup(fs, blkoff(fs, ip->i_size));
814 			nsize = fragroundup(fs, size);
815 			if (nsize <= osize) {
816 				error = bread_gb(vp, lbn, osize, NOCRED,
817 				    gbflags, &bp);
818 				if (error)
819 					goto done;
820 				bp->b_blkno = fsbtodb(fs, nb);
821 			} else {
822 				UFS_LOCK(ump);
823 				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
824 				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
825 				    &dp->di_db[0]), osize, nsize, flags,
826 				    cred, &bp);
827 				if (error)
828 					goto done;
829 				if (DOINGSOFTDEP(vp))
830 					softdep_setup_allocdirect(ip, lbn,
831 					    dbtofsb(fs, bp->b_blkno), nb,
832 					    nsize, osize, bp);
833 			}
834 		} else {
835 			if (ip->i_size < smalllblktosize(fs, lbn + 1))
836 				nsize = fragroundup(fs, size);
837 			else
838 				nsize = fs->fs_bsize;
839 			UFS_LOCK(ump);
840 			error = ffs_alloc(ip, lbn,
841 			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
842 				&dp->di_db[0]), nsize, flags, cred, &newb);
843 			if (error)
844 				goto done;
845 			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
846 			bp->b_blkno = fsbtodb(fs, newb);
847 			if (flags & BA_CLRBUF)
848 				vfs_bio_clrbuf(bp);
849 			if (DOINGSOFTDEP(vp))
850 				softdep_setup_allocdirect(ip, lbn, newb, 0,
851 				    nsize, 0, bp);
852 		}
853 		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
854 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE | IN_IBLKDATA);
855 		*bpp = bp;
856 		error = 0;
857 		goto done;
858 	}
859 	/*
860 	 * Determine the number of levels of indirection.
861 	 */
862 	pref = 0;
863 	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
864 		goto done;
865 #ifdef INVARIANTS
866 	if (num < 1)
867 		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
868 #endif
869 	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
870 	/*
871 	 * Fetch the first indirect block allocating if necessary.
872 	 */
873 	--num;
874 	nb = dp->di_ib[indirs[0].in_off];
875 	allocib = NULL;
876 	allocblk = allociblk;
877 	lbns_remfree = lbns;
878 	if (nb == 0) {
879 		UFS_LOCK(ump);
880 		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
881 		    (ufs2_daddr_t *)0);
882 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
883 		    flags, cred, &newb)) != 0) {
884 			curthread_pflags_restore(saved_inbdflush);
885 			goto done;
886 		}
887 		pref = newb + fs->fs_frag;
888 		nb = newb;
889 		MPASS(allocblk < allociblk + nitems(allociblk));
890 		MPASS(lbns_remfree < lbns + nitems(lbns));
891 		*allocblk++ = nb;
892 		*lbns_remfree++ = indirs[1].in_lbn;
893 		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
894 		    GB_UNMAPPED | gbwflag);
895 		bp->b_blkno = fsbtodb(fs, nb);
896 		vfs_bio_clrbuf(bp);
897 		if (DOINGSOFTDEP(vp)) {
898 			softdep_setup_allocdirect(ip,
899 			    UFS_NDADDR + indirs[0].in_off, newb, 0,
900 			    fs->fs_bsize, 0, bp);
901 			bdwrite(bp);
902 		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
903 			if (bp->b_bufsize == fs->fs_bsize)
904 				bp->b_flags |= B_CLUSTEROK;
905 			bdwrite(bp);
906 		} else {
907 			if ((error = bwrite(bp)) != 0)
908 				goto fail;
909 		}
910 		allocib = &dp->di_ib[indirs[0].in_off];
911 		*allocib = nb;
912 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE | IN_IBLKDATA);
913 	}
914 	/*
915 	 * Fetch through the indirect blocks, allocating as necessary.
916 	 */
917 retry:
918 	for (i = 1;;) {
919 		error = bread_gb(vp, indirs[i].in_lbn, (int)fs->fs_bsize,
920 		    NOCRED, gbwflag, &bp);
921 		if (error) {
922 			goto fail;
923 		}
924 		bap = (ufs2_daddr_t *)bp->b_data;
925 		nb = bap[indirs[i].in_off];
926 		if ((error = UFS_CHECK_BLKNO(mp, ip->i_number, nb,
927 		    fs->fs_bsize)) != 0) {
928 			brelse(bp);
929 			goto fail;
930 		}
931 		if (i == num)
932 			break;
933 		i += 1;
934 		if (nb != 0) {
935 			bqrelse(bp);
936 			continue;
937 		}
938 		UFS_LOCK(ump);
939 		/*
940 		 * If parent indirect has just been allocated, try to cluster
941 		 * immediately following it.
942 		 */
943 		if (pref == 0)
944 			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
945 			    (ufs2_daddr_t *)0);
946 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
947 		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
948 			brelse(bp);
949 			UFS_LOCK(ump);
950 			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
951 				softdep_request_cleanup(fs, vp, cred,
952 				    FLUSH_BLOCKS_WAIT);
953 				UFS_UNLOCK(ump);
954 				goto retry;
955 			}
956 			if (!ffs_fsfail_cleanup_locked(ump, error) &&
957 			    ppsratecheck(&ump->um_last_fullmsg,
958 			    &ump->um_secs_fullmsg, 1)) {
959 				UFS_UNLOCK(ump);
960 				ffs_fserr(fs, ip->i_number, "filesystem full");
961 				uprintf("\n%s: write failed, filesystem "
962 				    "is full\n", fs->fs_fsmnt);
963 			} else {
964 				UFS_UNLOCK(ump);
965 			}
966 			goto fail;
967 		}
968 		pref = newb + fs->fs_frag;
969 		nb = newb;
970 		MPASS(allocblk < allociblk + nitems(allociblk));
971 		MPASS(lbns_remfree < lbns + nitems(lbns));
972 		*allocblk++ = nb;
973 		*lbns_remfree++ = indirs[i].in_lbn;
974 		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
975 		    GB_UNMAPPED);
976 		nbp->b_blkno = fsbtodb(fs, nb);
977 		vfs_bio_clrbuf(nbp);
978 		if (DOINGSOFTDEP(vp)) {
979 			softdep_setup_allocindir_meta(nbp, ip, bp,
980 			    indirs[i - 1].in_off, nb);
981 			bdwrite(nbp);
982 		} else if ((flags & IO_SYNC) == 0 && DOINGASYNC(vp)) {
983 			if (nbp->b_bufsize == fs->fs_bsize)
984 				nbp->b_flags |= B_CLUSTEROK;
985 			bdwrite(nbp);
986 		} else {
987 			if ((error = bwrite(nbp)) != 0) {
988 				brelse(bp);
989 				goto fail;
990 			}
991 		}
992 		bap[indirs[i - 1].in_off] = nb;
993 		if (allocib == NULL && unwindidx < 0)
994 			unwindidx = i - 1;
995 		/*
996 		 * If required, write synchronously, otherwise use
997 		 * delayed write.
998 		 */
999 		if (flags & IO_SYNC) {
1000 			bwrite(bp);
1001 		} else {
1002 			if (bp->b_bufsize == fs->fs_bsize)
1003 				bp->b_flags |= B_CLUSTEROK;
1004 			bdwrite(bp);
1005 		}
1006 	}
1007 	/*
1008 	 * If asked only for the indirect block, then return it.
1009 	 */
1010 	if (flags & BA_METAONLY) {
1011 		curthread_pflags_restore(saved_inbdflush);
1012 		*bpp = bp;
1013 		error = 0;
1014 		goto done;
1015 	}
1016 	/*
1017 	 * Get the data block, allocating if necessary.
1018 	 */
1019 	if (nb == 0) {
1020 		UFS_LOCK(ump);
1021 		/*
1022 		 * If allocating metadata at the front of the cylinder
1023 		 * group and parent indirect block has just been allocated,
1024 		 * then cluster next to it if it is the first indirect in
1025 		 * the file. Otherwise it has been allocated in the metadata
1026 		 * area, so we want to find our own place out in the data area.
1027 		 */
1028 		if (pref == 0 || (lbn > UFS_NDADDR && fs->fs_metaspace != 0))
1029 			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
1030 			    &bap[0]);
1031 		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
1032 		    flags | IO_BUFLOCKED, cred, &newb);
1033 		if (error) {
1034 			brelse(bp);
1035 			UFS_LOCK(ump);
1036 			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
1037 				softdep_request_cleanup(fs, vp, cred,
1038 				    FLUSH_BLOCKS_WAIT);
1039 				UFS_UNLOCK(ump);
1040 				goto retry;
1041 			}
1042 			if (!ffs_fsfail_cleanup_locked(ump, error) &&
1043 			    ppsratecheck(&ump->um_last_fullmsg,
1044 			    &ump->um_secs_fullmsg, 1)) {
1045 				UFS_UNLOCK(ump);
1046 				ffs_fserr(fs, ip->i_number, "filesystem full");
1047 				uprintf("\n%s: write failed, filesystem "
1048 				    "is full\n", fs->fs_fsmnt);
1049 			} else {
1050 				UFS_UNLOCK(ump);
1051 			}
1052 			goto fail;
1053 		}
1054 		nb = newb;
1055 		MPASS(allocblk < allociblk + nitems(allociblk));
1056 		MPASS(lbns_remfree < lbns + nitems(lbns));
1057 		*allocblk++ = nb;
1058 		*lbns_remfree++ = lbn;
1059 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1060 		nbp->b_blkno = fsbtodb(fs, nb);
1061 		if (flags & BA_CLRBUF)
1062 			vfs_bio_clrbuf(nbp);
1063 		if (DOINGSOFTDEP(vp))
1064 			softdep_setup_allocindir_page(ip, lbn, bp,
1065 			    indirs[i].in_off, nb, 0, nbp);
1066 		bap[indirs[i].in_off] = nb;
1067 		/*
1068 		 * If required, write synchronously, otherwise use
1069 		 * delayed write.
1070 		 */
1071 		if (flags & IO_SYNC) {
1072 			bwrite(bp);
1073 		} else {
1074 			if (bp->b_bufsize == fs->fs_bsize)
1075 				bp->b_flags |= B_CLUSTEROK;
1076 			bdwrite(bp);
1077 		}
1078 		curthread_pflags_restore(saved_inbdflush);
1079 		*bpp = nbp;
1080 		error = 0;
1081 		goto done;
1082 	}
1083 	brelse(bp);
1084 	/*
1085 	 * If requested clear invalid portions of the buffer.  If we
1086 	 * have to do a read-before-write (typical if BA_CLRBUF is set),
1087 	 * try to do some read-ahead in the sequential case to reduce
1088 	 * the number of I/O transactions.
1089 	 */
1090 	if (flags & BA_CLRBUF) {
1091 		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
1092 		if (seqcount != 0 &&
1093 		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
1094 		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
1095 			error = cluster_read(vp, ip->i_size, lbn,
1096 			    (int)fs->fs_bsize, NOCRED,
1097 			    MAXBSIZE, seqcount, gbflags, &nbp);
1098 		} else {
1099 			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
1100 			    NOCRED, gbflags, &nbp);
1101 		}
1102 		if (error) {
1103 			brelse(nbp);
1104 			goto fail;
1105 		}
1106 	} else {
1107 		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1108 		nbp->b_blkno = fsbtodb(fs, nb);
1109 	}
1110 	curthread_pflags_restore(saved_inbdflush);
1111 	*bpp = nbp;
1112 	error = 0;
1113 	goto done;
1114 fail:
1115 	curthread_pflags_restore(saved_inbdflush);
1116 	/*
1117 	 * If we have failed to allocate any blocks, simply return the error.
1118 	 * This is the usual case and avoids the need to fsync the file.
1119 	 */
1120 	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
1121 		goto done;
1122 	/*
1123 	 * If we have failed part way through block allocation, we
1124 	 * have to deallocate any indirect blocks that we have allocated.
1125 	 * We have to fsync the file before we start to get rid of all
1126 	 * of its dependencies so that we do not leave them dangling.
1127 	 * We have to sync it at the end so that the soft updates code
1128 	 * does not find any untracked changes. Although this is really
1129 	 * slow, running out of disk space is not expected to be a common
1130 	 * occurrence. The error return from fsync is ignored as we already
1131 	 * have an error to return to the user.
1132 	 *
1133 	 * XXX Still have to journal the free below
1134 	 */
1135 	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1136 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
1137 	     blkp < allocblk; blkp++, lbns_remfree++) {
1138 		/*
1139 		 * We shall not leave the freed blocks on the vnode
1140 		 * buffer object lists.
1141 		 */
1142 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1143 		    GB_NOCREAT | GB_UNMAPPED | gbwflag);
1144 		if (bp != NULL) {
1145 			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
1146 			    ("mismatch2 l %jd %jd b %ju %ju",
1147 			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
1148 			    (uintmax_t)bp->b_blkno,
1149 			    (uintmax_t)fsbtodb(fs, *blkp)));
1150 			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
1151 			bp->b_flags &= ~(B_ASYNC | B_CACHE);
1152 			brelse(bp);
1153 		}
1154 		deallocated += fs->fs_bsize;
1155 	}
1156 	if (allocib != NULL) {
1157 		*allocib = 0;
1158 	} else if (unwindidx >= 0) {
1159 		int r;
1160 
1161 		r = bread_gb(vp, indirs[unwindidx].in_lbn,
1162 		    (int)fs->fs_bsize, NOCRED, gbwflag, &bp);
1163 		if (r) {
1164 			panic("Could not unwind indirect block, error %d", r);
1165 			brelse(bp);
1166 		} else {
1167 			bap = (ufs2_daddr_t *)bp->b_data;
1168 			bap[indirs[unwindidx].in_off] = 0;
1169 			if (flags & IO_SYNC) {
1170 				bwrite(bp);
1171 			} else {
1172 				if (bp->b_bufsize == fs->fs_bsize)
1173 					bp->b_flags |= B_CLUSTEROK;
1174 				bdwrite(bp);
1175 			}
1176 		}
1177 	}
1178 	if (deallocated) {
1179 #ifdef QUOTA
1180 		/*
1181 		 * Restore user's disk quota because allocation failed.
1182 		 */
1183 		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
1184 #endif
1185 		dp->di_blocks -= btodb(deallocated);
1186 		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1187 	}
1188 	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1189 	/*
1190 	 * After the buffers are invalidated and on-disk pointers are
1191 	 * cleared, free the blocks.
1192 	 */
1193 	for (blkp = allociblk; blkp < allocblk; blkp++) {
1194 #ifdef INVARIANTS
1195 		if (blkp == allociblk)
1196 			lbns_remfree = lbns;
1197 		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1198 		    GB_NOCREAT | GB_UNMAPPED | gbwflag);
1199 		if (bp != NULL) {
1200 			panic("zombie2 %jd %ju %ju",
1201 			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
1202 			    (uintmax_t)fsbtodb(fs, *blkp));
1203 		}
1204 		lbns_remfree++;
1205 #endif
1206 		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
1207 		    ip->i_number, vp->v_type, NULL, SINGLETON_KEY);
1208 	}
1209 done:
1210 	vn_seqc_write_end(vp);
1211 	return (error);
1212 }
1213