xref: /titanic_52/usr/src/uts/common/fs/ufs/ufs_bmap.c (revision 72c9c96770c90a8042add81e56bb8a2f9e4a29dc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
26 /*	  All Rights Reserved  	*/
27 
28 /*
29  * University Copyright- Copyright (c) 1982, 1986, 1988
30  * The Regents of the University of California
31  * All Rights Reserved
32  *
33  * University Acknowledgment- Portions of this document are derived from
34  * software developed by the University of California, Berkeley, and its
35  * contributors.
36  */
37 
38 
39 #include <sys/types.h>
40 #include <sys/t_lock.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/signal.h>
44 #include <sys/user.h>
45 #include <sys/vnode.h>
46 #include <sys/buf.h>
47 #include <sys/disp.h>
48 #include <sys/proc.h>
49 #include <sys/conf.h>
50 #include <sys/fs/ufs_inode.h>
51 #include <sys/fs/ufs_fs.h>
52 #include <sys/fs/ufs_quota.h>
53 #include <sys/fs/ufs_trans.h>
54 #include <sys/fs/ufs_bio.h>
55 #include <vm/seg.h>
56 #include <sys/errno.h>
57 #include <sys/sysmacros.h>
58 #include <sys/vfs.h>
59 #include <sys/debug.h>
60 #include <sys/kmem.h>
61 #include <sys/cmn_err.h>
62 
63 /*
64  * This structure is used to track blocks as we allocate them, so that
65  * we can free them if we encounter an error during allocation.  We
66  * keep track of five pieces of information for each allocated block:
67  *   - The number of the newly allocated block
68  *   - The size of the block (lets us deal with fragments if we want)
69  *   - The number of the block containing a pointer to it; or whether
70  *     the pointer is in the inode
71  *   - The offset within the block (or inode) containing a pointer to it.
72  *   - A flag indicating the usage of the block.  (Logging needs to know
73  *     this to avoid overwriting a data block if it was previously used
74  *     for metadata.)
75  */
76 
/*
 * Records which kind of pointer (if any) currently references a block
 * that is being tracked in a struct ufs_allocated_block, so that the
 * pointer can be cleared again when an allocation has to be undone.
 */
enum ufs_owner_type {
	ufs_no_owner,		/* Owner has not yet been updated */
	ufs_inode_direct,	/* Listed in inode's direct block table */
	ufs_inode_indirect,	/* Listed in inode's indirect block table */
	ufs_indirect_block	/* Listed in an indirect block */
};
83 
/*
 * One undo record per block allocated by bmap_write() on the indirect
 * path.  bmap_write() fills in this_block, block_size and usage_flags
 * right after the allocation (with owner == ufs_no_owner) and sets the
 * owner fields only once a pointer to the block has been stored.
 */
struct ufs_allocated_block {
	daddr_t this_block;	    /* Number of this block */
	off_t block_size;	    /* Size of this block, in bytes */
	enum ufs_owner_type owner;  /* Who points to this block? */
	/* bmap_write() sets owner_block only for ufs_indirect_block owners */
	daddr_t owner_block;	    /* Number of the owning block */
	/* an index into i_ib[] or into the indirect block's pointer array */
	uint_t owner_offset;	    /* Offset within that block or inode */
	int usage_flags;	    /* Usage flags, as expected by free() */
};
92 
93 
/*
 * Counts the contiguous blocks starting at *sbp; defined later in this
 * file and used by the DOEXTENT macro.
 */
static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
		int maxtrans);

/*
 * Frees blocks recorded in table[] after an allocation error; defined
 * later in this file and called from the error paths of bmap_write().
 */
static void ufs_undo_allocation(inode_t *ip, int block_count,
	struct ufs_allocated_block table[], int inode_sector_adjust);
99 
/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 *
 * Arguments:
 *	fs	 - struct fs * of the file system
 *	lbn	 - logical block number being mapped
 *	boff	 - byte offset of the request within that block
 *	bnp	 - out: disk block number, or UFS_HOLE
 *	lenp	 - in: passed through to findextent(); out: usable length in
 *		   bytes (only written when a block is found)
 *	size	 - file size; used only when chkfrag is non-zero
 *	tblp	 - pointer to the block-pointer table entry for lbn
 *	n	 - number of table entries that may be examined from tblp
 *	chkfrag	 - non-zero: clamp the length to the frag-rounded file size
 *	maxtrans - maximum transfer size, passed through to findextent()
 *
 * *(bnp) is set to UFS_HOLE when the table entry is zero or when nothing
 * of the extent remains after subtracting boff.
 *
 * Every argument is parenthesized in the expansion so that expression
 * arguments bind as the caller intended.  The macro declares the locals
 * dp, len, tmp and _chkfrag; arguments must not use those names.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	register daddr32_t *dp = (tblp);				\
	register int _chkfrag = (chkfrag); /* for lint. sigh */		\
									\
	if (*dp == 0) {							\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		register int len;					\
									\
		len = findextent((fs), dp, (int)(n), (lenp),		\
		    (maxtrans)) << (fs)->fs_bshift;			\
		if (_chkfrag) {						\
			register u_offset_t tmp;			\
									\
			tmp = fragroundup((fs), (size)) -		\
			    (((u_offset_t)(lbn)) << (fs)->fs_bshift);	\
			len = (int)MIN(tmp, len);			\
		}							\
		len -= (boff);						\
		if (len <= 0) {						\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb((fs), *dp) + btodb(boff);	\
			*(lenp) = len;					\
		}							\
	}								\
}
139 
/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
 * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * greater than or equal to VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS file can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000
162 
163 /*
164  * bmap{read,write} define the structure of file system storage by mapping
165  * a logical offset in a file to a physical block number on the device.
166  * It should be called with a locked inode when allocation is to be
167  * done (bmap_write).  Note this strangeness: bmap_write is always called from
168  * getpage(), not putpage(), since getpage() is where all the allocation
169  * is done.
170  *
171  * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
172  *
173  * NOTICE: the block number returned is the disk block number, not the
174  * file system block number.  All the worries about block offsets and
175  * page/block sizes are hidden inside of bmap.  Well, not quite,
176  * unfortunately.  It's impossible to find one place to hide all this
177  * mess.  There are 3 cases:
178  *
 * PAGESIZE < bsize
 *	In this case, the {get,put}page routines will attempt to align to
 *	a file system block boundary (XXX - maybe this is a mistake?).  Since
 *	the kluster routines may be out of memory, we don't always get all
 *	the pages we wanted.  If we called bmap first, to find out how much
 *	to kluster, we handed in the block aligned offset.  If we didn't get
 *	all the pages, we have to chop off the amount we didn't get from the
 *	amount handed back by bmap.
187  *
188  * PAGESIZE == bsize
189  *	Life is quite pleasant here, no extra work needed, mainly because we
190  *	(probably?) won't kluster backwards, just forwards.
191  *
192  * PAGESIZE > bsize
193  *	This one has a different set of problems, specifically, we may have to
194  *	do N reads to fill one page.  Let us hope that Sun will stay with small
195  *	pages.
196  *
197  * Returns 0 on success, or a non-zero errno if an error occurs.
198  *
199  * TODO
200  *	LMXXX - add a bmap cache.  This could be a couple of extents in the
201  *	inode.  Two is nice for PAGESIZE > bsize.
202  */
203 
204 int
205 bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
206 {
207 	daddr_t lbn;
208 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
209 	struct	fs *fs = ufsvfsp->vfs_fs;
210 	struct	buf *bp;
211 	int	i, j, boff;
212 	int	shft;			/* we maintain sh = 1 << shft */
213 	daddr_t	ob, nb, tbn;
214 	daddr32_t *bap;
215 	int	nindirshift, nindiroffset;
216 
217 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
218 	lbn = (daddr_t)lblkno(fs, off);
219 	boff = (int)blkoff(fs, off);
220 	if (lbn < 0)
221 		return (EFBIG);
222 
223 	/*
224 	 * The first NDADDR blocks are direct blocks.
225 	 */
226 	if (lbn < NDADDR) {
227 		DOEXTENT(fs, lbn, boff, bnp, lenp,
228 		    ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
229 		    ufsvfsp->vfs_iotransz);
230 		return (0);
231 	}
232 
233 	nindirshift = ufsvfsp->vfs_nindirshift;
234 	nindiroffset = ufsvfsp->vfs_nindiroffset;
235 	/*
236 	 * Determine how many levels of indirection.
237 	 */
238 	shft = 0;				/* sh = 1 */
239 	tbn = lbn - NDADDR;
240 	for (j = NIADDR; j > 0; j--) {
241 		longlong_t	sh;
242 
243 		shft += nindirshift;		/* sh *= nindir */
244 		sh = 1LL << shft;
245 		if (tbn < sh)
246 			break;
247 		tbn -= sh;
248 	}
249 	if (j == 0)
250 		return (EFBIG);
251 
252 	/*
253 	 * Fetch the first indirect block.
254 	 */
255 	nb = ip->i_ib[NIADDR - j];
256 	if (nb == 0) {
257 		*bnp = UFS_HOLE;
258 		return (0);
259 	}
260 
261 	/*
262 	 * Fetch through the indirect blocks.
263 	 */
264 	for (; j <= NIADDR; j++) {
265 		ob = nb;
266 		bp = UFS_BREAD(ufsvfsp,
267 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
268 		if (bp->b_flags & B_ERROR) {
269 			brelse(bp);
270 			return (EIO);
271 		}
272 		bap = bp->b_un.b_daddr;
273 
274 		ASSERT(!ufs_indir_badblock(ip, bap));
275 
276 		shft -= nindirshift;		/* sh / nindir */
277 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
278 		nb = bap[i];
279 		if (nb == 0) {
280 			*bnp = UFS_HOLE;
281 			brelse(bp);
282 			return (0);
283 		}
284 		if (j != NIADDR)
285 			brelse(bp);
286 	}
287 	DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
288 	    MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
289 	    0, ufsvfsp->vfs_iotransz);
290 	brelse(bp);
291 	return (0);
292 }
293 
/*
 * See bmap_read for general notes.
 *
 * The block must be at least size bytes and will be extended or
 * allocated as needed.  If alloc_type is of type BI_ALLOC_ONLY, then bmap
 * will not create any in-core pages that correspond to the new disk allocation.
 * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
 * and security is maintained b/c upon reading a negative block number pages
 * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
 * be created and initialized as needed.
 *
 * Note that in the code below the negated address is only stored for data
 * blocks referenced from an indirect block; a direct block allocated with
 * BI_FALLOCATE is zero-filled through fbzero() like a BI_NORMAL one.
 *
 * Arguments:
 *	ip	   - inode being written; i_contents must be held as writer
 *	off	   - byte offset in the file that must be backed by a block
 *	size	   - number of bytes needed in that block (used to size a
 *		     trailing fragment)
 *	alloc_type - BI_NORMAL, BI_ALLOC_ONLY or BI_FALLOCATE; forced to
 *		     BI_NORMAL for directories, the quota inode and ISYNC
 *	allocblk   - if non-NULL, set to 0 on entry and to the number of a
 *		     block allocated by this call
 *	cr	   - credentials handed to the allocator
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 */
int
bmap_write(struct inode	*ip, u_offset_t	off, int size,
    enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
{
	struct	fs *fs;
	struct	buf *bp;
	int	i;
	struct	buf *nbp;
	int	j;
	int	shft;				/* we maintain sh = 1 << shft */
	daddr_t	ob, nb, pref, lbn, llbn, tbn;
	daddr32_t *bap;
	struct	vnode *vp = ITOV(ip);
	long	bsize = VBSIZE(vp);
	long	osize, nsize;
	int	issync, metaflag, isdirquota;
	int	err;
	dev_t	dev;
	struct	fbuf *fbp;
	int	nindirshift;
	int	nindiroffset;
	struct	ufsvfs	*ufsvfsp;
	int	added_sectors;		/* sectors added to this inode */
	int	alloced_blocks;		/* fs blocks newly allocated */
	/*
	 * At most one block is allocated per level of indirection plus
	 * one data block, hence NIADDR + 1 undo records.
	 */
	struct  ufs_allocated_block undo_table[NIADDR+1];
	int	verylargefile = 0;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));

	/* Until something is allocated, report "no block". */
	if (allocblk)
		*allocblk = 0;

	ufsvfsp = ip->i_ufsvfs;
	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);
	/*
	 * Only files whose block count is already close to the i_blocks
	 * limit pay for the overflow checks further down.
	 */
	if (ip->i_blocks >= VERYLARGEFILESIZE)
		verylargefile = 1;
	/* llbn: logical block holding the last byte (0 for an empty file) */
	llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
	/*
	 * metaflag is the usage flag handed to free(); isdirquota is
	 * non-zero for directories, attribute directories and the quota
	 * inode, whose new blocks are written out synchronously below.
	 */
	metaflag = isdirquota = 0;
	if (((ip->i_mode & IFMT) == IFDIR) ||
	    ((ip->i_mode & IFMT) == IFATTRDIR))
		isdirquota = metaflag = I_DIR;
	else if ((ip->i_mode & IFMT) == IFSHAD)
		metaflag = I_SHAD;
	else if (ip->i_ufsvfs->vfs_qinod == ip)
		isdirquota = metaflag = I_QUOTA;

	issync = ((ip->i_flag & ISYNC) != 0);

	if (isdirquota || issync) {
		alloc_type = BI_NORMAL;	/* make sure */
	}

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
		osize = blksize(fs, ip, llbn);
		if (osize < bsize && osize > 0) {
			/*
			 * Check to see if doing this will make the file too
			 * big.  Only check if we are dealing with a very
			 * large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks +
				    btodb(bsize - osize)) > INT_MAX) {
					return (EFBIG);
				}
			}
			/*
			 * Make sure we have all needed pages setup correctly.
			 *
			 * We pass S_OTHER to fbread here because we want
			 * an exclusive lock on the page in question
			 * (see ufs_getpage). I/O to the old block location
			 * may still be in progress and we are about to free
			 * the old block. We don't want anyone else to get
			 * a hold of the old block once we free it until
			 * the I/O is complete.
			 */
			err =
			    fbread(ITOV(ip), ((offset_t)llbn << fs->fs_bshift),
			    (uint_t)bsize, S_OTHER, &fbp);
			if (err)
				return (err);
			pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
			err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
			    &nb, cr);
			if (err) {
				if (fbp)
					fbrelse(fbp, S_OTHER);
				return (err);
			}
			ASSERT(!ufs_badblock(ip, nb));

			/*
			 * Update the inode before releasing the
			 * lock on the page. If we released the page
			 * lock first, the data could be written to its
			 * old address and then destroyed.
			 */
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
			ip->i_db[llbn] = nb;
			UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
			    ip);
			ip->i_blocks += btodb(bsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;

			/* Caller is responsible for updating i_seq */
			/*
			 * Don't check metaflag here, directories won't do this
			 *
			 */
			if (issync) {
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else {
				ASSERT(fbp);
				fbrelse(fbp, S_WRITE);
			}

			/* realloccg moved the data: give back the old frag */
			if (nb != ob) {
				(void) free(ip, ob, (off_t)osize, metaflag);
			}
		}
	}

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		nb = ip->i_db[lbn];
		/*
		 * Work is needed if nothing is allocated yet, or if the
		 * file currently ends inside this block (so it may be
		 * backed by a fragment that is too small).
		 */
		if (nb == 0 ||
		    ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
			if (nb != 0) {
				/* consider need to reallocate a frag */
				osize = fragroundup(fs, blkoff(fs, ip->i_size));
				nsize = fragroundup(fs, size);
				if (nsize <= osize)
					goto gotit;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				/*
				 * need to re-allocate a block or frag
				 */
				ob = nb;
				pref = blkpref(ip, lbn, (int)lbn,
				    &ip->i_db[0]);
				err = realloccg(ip, ob, pref, (int)osize,
				    (int)nsize, &nb, cr);
				if (err)
					return (err);
				if (allocblk)
					*allocblk = nb;
				ASSERT(!ufs_badblock(ip, nb));

			} else {
				/*
				 * need to allocate a block or frag
				 */
				osize = 0;
				if (ip->i_size <
				    ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
					nsize = fragroundup(fs, size);
				else
					nsize = bsize;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
				err = alloc(ip, pref, (int)nsize, &nb, cr);
				if (err)
					return (err);
				if (allocblk)
					*allocblk = nb;
				ASSERT(!ufs_badblock(ip, nb));
				/* ob == nb: nothing old to free afterwards */
				ob = nb;
			}

			/*
			 * Read old/create new zero pages
			 */
			fbp = NULL;
			if (osize == 0) {
				/*
				 * mmap S_WRITE faults always enter here
				 */
				/*
				 * We zero it if it's also BI_FALLOCATE, but
				 * only for direct blocks!
				 */
				if (alloc_type == BI_NORMAL ||
				    alloc_type == BI_FALLOCATE ||
				    P2ROUNDUP_TYPED(size,
				    PAGESIZE, u_offset_t) < nsize) {
					/* fbzero doesn't cause a pagefault */
					fbzero(ITOV(ip),
					    ((offset_t)lbn << fs->fs_bshift),
					    (uint_t)nsize, &fbp);
				}
			} else {
				err = fbread(vp,
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)nsize, S_OTHER, &fbp);
				if (err) {
					/*
					 * Back out the reallocation: free
					 * the whole new block if the data
					 * moved, otherwise only the frags
					 * added past the old size, then
					 * return the quota charge.
					 */
					if (nb != ob) {
						(void) free(ip, nb,
						    (off_t)nsize, metaflag);
					} else {
						(void) free(ip,
						    ob + numfrags(fs, osize),
						    (off_t)(nsize - osize),
						    metaflag);
					}
					ASSERT(nsize >= osize);
					(void) chkdq(ip,
					    -(long)btodb(nsize - osize),
					    0, cr, (char **)NULL,
					    (size_t *)NULL);
					return (err);
				}
			}
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
			ip->i_db[lbn] = nb;
			ip->i_blocks += btodb(nsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;

			/* Caller is responsible for updating i_seq */

			/*
			 * Write directory and shadow blocks synchronously so
			 * that they never appear with garbage in them on the
			 * disk.
			 *
			 */
			if (isdirquota && (ip->i_size ||
			    TRANS_ISTRANS(ufsvfsp))) {
			/*
			 * XXX may not be necessary with harpy trans
			 * bug id 1130055
			 */
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else if (fbp) {
				fbrelse(fbp, S_WRITE);
			}

			if (nb != ob)
				(void) free(ip, ob, (off_t)osize, metaflag);
		}
gotit:
		return (0);
	}

	added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */

	/*
	 * Determine how many levels of indirection.
	 */
	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	pref = 0;
	shft = 0;				/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;		/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}

	/* j == 0: the block lies beyond what NIADDR levels can address */
	if (j == 0)
		return (EFBIG);

	/*
	 * Fetch the first indirect block.
	 */
	dev = ip->i_dev;
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		/*
		 * Check to see if doing this will make the
		 * file too big.  Only check if we are dealing
		 * with a very large file.
		 */
		if (verylargefile == 1) {
			if (((unsigned)ip->i_blocks + btodb(bsize))
			    > INT_MAX) {
				return (EFBIG);
			}
		}
		/*
		 * Need to allocate an indirect block.
		 */
		pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
		err = alloc(ip, pref, (int)bsize, &nb, cr);
		if (err)
			return (err);
		TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
		ASSERT(!ufs_badblock(ip, nb));

		/*
		 * Keep track of this allocation so we can undo it if we
		 * get an error later.
		 */

		ASSERT(alloced_blocks <= NIADDR);

		undo_table[alloced_blocks].this_block = nb;
		undo_table[alloced_blocks].block_size = bsize;
		undo_table[alloced_blocks].owner = ufs_no_owner;
		undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;

		alloced_blocks++;

		/*
		 * Write zero block synchronously so that
		 * indirect blocks never point at garbage.
		 */
		bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);

		clrbuf(bp);
		/* XXX Maybe special-case this? */
		TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
		UFS_BWRITE2(ufsvfsp, bp);
		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		brelse(bp);

		/* Only now is the new indirect block linked into the inode. */
		ip->i_ib[NIADDR - j] = nb;
		added_sectors += btodb(bsize);
		ip->i_blocks += btodb(bsize);
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		/* Caller is responsible for updating i_seq */

		/*
		 * Update the 'undo table' now that we've linked this block
		 * to an inode.
		 */

		undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
		undo_table[alloced_blocks-1].owner_offset = NIADDR - j;

		/*
		 * In the ISYNC case, wrip will notice that the block
		 * count on the inode has changed and will be sure to
		 * ufs_iupdat the inode at the end of wrip.
		 */
	}

	/*
	 * Fetch through the indirect blocks.
	 *
	 * On each pass "ob" is the indirect block being examined and "nb"
	 * the block its slot i points to.  For j < NIADDR that is another
	 * indirect block; for j == NIADDR it is the data block itself.
	 */
	for (; j <= NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);

		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			/*
			 * Return any partial allocations.
			 *
			 * It is possible that we have not yet made any
			 * allocations at this point (if this is the first
			 * pass through the loop and we didn't have to
			 * allocate the first indirect block, above).
			 * In this case, alloced_blocks and added_sectors will
			 * be zero, and ufs_undo_allocation will do nothing.
			 */
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh /= nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];

		if (nb == 0) {
			/*
			 * Check to see if doing this will make the
			 * file too big.  Only check if we are dealing
			 * with a very large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks + btodb(bsize))
				    > INT_MAX) {
					brelse(bp);
					ufs_undo_allocation(ip, alloced_blocks,
					    undo_table, added_sectors);
					return (EFBIG);
				}
			}
			/*
			 * A preference computed on an earlier pass (or for
			 * the first indirect block) is reused as is.
			 */
			if (pref == 0) {
				if (j < NIADDR) {
					/* Indirect block */
					pref = blkpref(ip, lbn, 0,
					    (daddr32_t *)0);
				} else {
					/* Data block */
					pref = blkpref(ip, lbn, i, &bap[0]);
				}
			}

			/*
			 * release "bp" buf to avoid deadlock (re-bread later)
			 */
			brelse(bp);

			err = alloc(ip, pref, (int)bsize, &nb, cr);
			if (err) {
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip, alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}

			ASSERT(!ufs_badblock(ip, nb));
			ASSERT(alloced_blocks <= NIADDR);

			/*
			 * Overwritten on every allocating pass, so the
			 * caller ends up with the block allocated last.
			 */
			if (allocblk)
				*allocblk = nb;

			undo_table[alloced_blocks].this_block = nb;
			undo_table[alloced_blocks].block_size = bsize;
			undo_table[alloced_blocks].owner = ufs_no_owner;
			undo_table[alloced_blocks].usage_flags = metaflag |
			    ((j < NIADDR) ? I_IBLK : 0);

			alloced_blocks++;

			if (j < NIADDR) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
				/*
				 * Write synchronously so indirect
				 * blocks never point at garbage.
				 */
				nbp = UFS_GETBLK(
				    ufsvfsp, dev, fsbtodb(fs, nb), bsize);

				clrbuf(nbp);
				/* XXX Maybe special-case this? */
				TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
				UFS_BWRITE2(ufsvfsp, nbp);
				if (nbp->b_flags & B_ERROR) {
					err = geterror(nbp);
					brelse(nbp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(nbp);
			} else if (alloc_type == BI_NORMAL ||
			    P2ROUNDUP_TYPED(size,
			    PAGESIZE, u_offset_t) < bsize) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
				fbzero(ITOV(ip),
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)bsize, &fbp);

				/*
				 * Cases which we need to do a synchronous
				 * write of the zeroed data pages:
				 *
				 * 1) If we are writing a directory then we
				 * want to write synchronously so blocks in
				 * directories never contain garbage.
				 *
				 * 2) If we are filling in a hole and the
				 * indirect block is going to be synchronously
				 * written back below we need to make sure
				 * that the zeroes are written here before
				 * the indirect block is updated so that if
				 * we crash before the real data is pushed
				 * we will not end up with random data in
				 * the middle of the file.
				 *
				 * 3) If the size of the request rounded up
				 * to the system page size is smaller than
				 * the file system block size, we want to
				 * write out all the pages now so that
				 * they are not aborted before they actually
				 * make it to ufs_putpage since the length
				 * of the inode will not include the pages.
				 */

				if (isdirquota || (issync &&
				    lbn < llbn))
					(void) ufs_fbiwrite(fbp, ip, nb,
					    fs->fs_fsize);
				else
					fbrelse(fbp, S_WRITE);
			}

			/*
			 * re-acquire "bp" buf
			 */
			bp = UFS_BREAD(ufsvfsp,
			    ip->i_dev, fsbtodb(fs, ob), bsize);
			if (bp->b_flags & B_ERROR) {
				err = geterror(bp);
				brelse(bp);
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip,
				    alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}
			bap = bp->b_un.b_daddr;
			bap[i] = nb;

			/*
			 * The magic explained: j will be equal to NIADDR
			 * when we are at the lowest level, this is where the
			 * array entries point directly to data blocks. Since
			 * we will be 'fallocate'ing we will go ahead and negate
			 * the addresses.
			 */
			if (alloc_type == BI_FALLOCATE && j == NIADDR)
				bap[i] = -bap[i];

			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
			added_sectors += btodb(bsize);
			ip->i_blocks += btodb(bsize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;

			/* Caller is responsible for updating i_seq */

			/*
			 * The new block is now referenced from indirect
			 * block "ob"; record that for a later undo.
			 */
			undo_table[alloced_blocks-1].owner =
			    ufs_indirect_block;
			undo_table[alloced_blocks-1].owner_block = ob;
			undo_table[alloced_blocks-1].owner_offset = i;

			/*
			 * ISYNC: push the updated indirect block now and
			 * check the result; otherwise hand it to bdrwrite().
			 */
			if (issync) {
				UFS_BWRITE2(ufsvfsp, bp);
				if (bp->b_flags & B_ERROR) {
					err = geterror(bp);
					brelse(bp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(bp);
			} else {
				bdrwrite(bp);
			}
		} else {
			brelse(bp);
		}
	}
	return (0);
}
910 
911 /*
912  * Return 1 if inode has unmapped blocks (UFS holes) or if another thread
913  * is in the critical region of wrip().
914  */
915 int
916 bmap_has_holes(struct inode *ip)
917 {
918 	struct fs *fs = ip->i_fs;
919 	uint_t	dblks; 			/* # of data blocks */
920 	uint_t	mblks;			/* # of data + metadata blocks */
921 	int	nindirshift;
922 	int	nindiroffset;
923 	uint_t	cnt;
924 	int	n, j, shft;
925 	uint_t nindirblks;
926 
927 	int	fsbshift = fs->fs_bshift;
928 	int	fsboffset = (1 << fsbshift) - 1;
929 
930 	/*
931 	 * Check for writer in critical region, if found then we
932 	 * cannot trust the values of i_size and i_blocks
933 	 * simply return true.
934 	 */
935 	if (ip->i_writer != NULL && ip->i_writer != curthread) {
936 		return (1);
937 	}
938 
939 	dblks = (ip->i_size + fsboffset) >> fsbshift;
940 	mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;
941 
942 	/*
943 	 * File has only direct blocks.
944 	 */
945 	if (dblks <= NDADDR)
946 		return (mblks < dblks);
947 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
948 
949 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
950 	nindirblks = nindiroffset + 1;
951 
952 	dblks -= NDADDR;
953 	shft = 0;
954 	/*
955 	 * Determine how many levels of indirection.
956 	 */
957 	for (j = NIADDR; j > 0; j--) {
958 		longlong_t	sh;
959 
960 		shft += nindirshift;	/* sh *= nindir */
961 		sh = 1LL << shft;
962 		if (dblks <= sh)
963 			break;
964 		dblks -= sh;
965 	}
966 	/* LINTED: warning: logical expression always true: op "||" */
967 	ASSERT(NIADDR <= 3);
968 	ASSERT(j <= NIADDR);
969 	if (j == NIADDR)	/* single level indirection */
970 		cnt = NDADDR + 1 + dblks;
971 	else if (j == NIADDR-1) /* double indirection */
972 		cnt = NDADDR + 1 + nindirblks +
973 		    1 + (dblks + nindiroffset)/nindirblks + dblks;
974 	else if (j == NIADDR-2) { /* triple indirection */
975 		n = (dblks + nindiroffset)/nindirblks;
976 		cnt = NDADDR + 1 + nindirblks +
977 		    1 + nindirblks + nindirblks*nindirblks +
978 		    1 + (n + nindiroffset)/nindirblks + n + dblks;
979 	}
980 
981 	return (mblks < cnt);
982 }
983 
984 /*
985  * find some contig blocks starting at *sbp and going for min(n, max_contig)
986  * return the number of blocks (not frags) found.
987  * The array passed in must be at least [0..n-1].
988  */
989 static int
990 findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
991 {
992 	register daddr_t bn, nextbn;
993 	register daddr32_t *bp;
994 	register int diff;
995 	int maxtransblk;
996 
997 	if (n <= 0)
998 		return (0);
999 	bn = *sbp;
1000 	if (bn == 0)
1001 		return (0);
1002 
1003 	diff = fs->fs_frag;
1004 	if (*lenp) {
1005 		n = MIN(n, lblkno(fs, *lenp));
1006 	} else {
1007 		/*
1008 		 * If the user has set the value for maxcontig lower than
1009 		 * the drive transfer size, then assume they want this
1010 		 * to be the maximum value for the size of the data transfer.
1011 		 */
1012 		maxtransblk = maxtransfer >> DEV_BSHIFT;
1013 		if (fs->fs_maxcontig < maxtransblk) {
1014 			n = MIN(n, fs->fs_maxcontig);
1015 		} else {
1016 			n = MIN(n, maxtransblk);
1017 		}
1018 	}
1019 	bp = sbp;
1020 	while (--n > 0) {
1021 		nextbn = *(bp + 1);
1022 		if (nextbn == 0 || bn + diff != nextbn)
1023 			break;
1024 		bn = nextbn;
1025 		bp++;
1026 	}
1027 	return ((int)(bp - sbp) + 1);
1028 }
1029 
1030 /*
1031  * Free any blocks which had been successfully allocated.  Always called
1032  * as a result of an error, so we don't bother returning an error code
1033  * from here.
1034  *
1035  * If block_count and inode_sector_adjust are both zero, we'll do nothing.
1036  * Thus it is safe to call this as part of error handling, whether or not
1037  * any blocks have been allocated.
1038  *
1039  * The ufs_inode_direct case is currently unused.
1040  */
1041 
1042 static void
1043 ufs_undo_allocation(
1044 	inode_t *ip,
1045 	int block_count,
1046 	struct ufs_allocated_block table[],
1047 	int inode_sector_adjust)
1048 {
1049 	int i;
1050 	int inode_changed;
1051 	int error_updating_pointers;
1052 	struct ufsvfs *ufsvfsp;
1053 
1054 	inode_changed = 0;
1055 	error_updating_pointers = 0;
1056 
1057 	ufsvfsp = ip->i_ufsvfs;
1058 
1059 	/*
1060 	 * Update pointers on disk before freeing blocks.  If we fail,
1061 	 * some blocks may remain busy; but they will be reclaimed by
1062 	 * an fsck.  (This is better than letting a block wind up with
1063 	 * two owners if we successfully freed it but could not remove
1064 	 * the pointer to it.)
1065 	 */
1066 
1067 	for (i = 0; i < block_count; i++) {
1068 		switch (table[i].owner) {
1069 		case ufs_no_owner:
1070 			/* Nothing to do here, nobody points to us */
1071 			break;
1072 		case ufs_inode_direct:
1073 			ASSERT(table[i].owner_offset < NDADDR);
1074 			ip->i_db[table[i].owner_offset] = 0;
1075 			inode_changed = 1;
1076 			break;
1077 		case ufs_inode_indirect:
1078 			ASSERT(table[i].owner_offset < NIADDR);
1079 			ip->i_ib[table[i].owner_offset] = 0;
1080 			inode_changed = 1;
1081 			break;
1082 		case ufs_indirect_block: {
1083 			buf_t *bp;
1084 			daddr32_t *block_data;
1085 
1086 			/* Read/modify/log/write. */
1087 
1088 			ASSERT(table[i].owner_offset <
1089 			    (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));
1090 
1091 			bp = UFS_BREAD(ufsvfsp, ip->i_dev,
1092 			    fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
1093 			    VBSIZE(ITOV(ip)));
1094 
1095 			if (bp->b_flags & B_ERROR) {
1096 				/* Couldn't read this block; give up. */
1097 				error_updating_pointers = 1;
1098 				brelse(bp);
1099 				break;		/* out of SWITCH */
1100 			}
1101 
1102 			block_data = bp->b_un.b_daddr;
1103 			block_data[table[i].owner_offset] = 0;
1104 
1105 			/* Write a log entry which includes the zero. */
1106 			/* It might be possible to optimize this by using */
1107 			/* TRANS_BUF directly and zeroing only the four */
1108 			/* bytes involved, but an attempt to do that led */
1109 			/* to panics in the logging code.  The attempt was */
1110 			/* TRANS_BUF(ufsvfsp,				  */
1111 			/*    table[i].owner_offset * sizeof (daddr32_t), */
1112 			/*    sizeof (daddr32_t),			  */
1113 			/*    bp,					  */
1114 			/*    DT_ABZERO);				  */
1115 
1116 			TRANS_BUF_ITEM_128(ufsvfsp,
1117 			    block_data[table[i].owner_offset],
1118 			    block_data, bp, DT_AB);
1119 
1120 			/* Now we can write the buffer itself. */
1121 
1122 			UFS_BWRITE2(ufsvfsp, bp);
1123 
1124 			if (bp->b_flags & B_ERROR) {
1125 				error_updating_pointers = 1;
1126 			}
1127 
1128 			brelse(bp);
1129 			break;
1130 		}
1131 		default:
1132 			(void) ufs_fault(ITOV(ip),
1133 			    "ufs_undo_allocation failure\n");
1134 			break;
1135 		}
1136 	}
1137 
1138 	/*
1139 	 * If the inode changed, or if we need to update its block count,
1140 	 * then do that now.  We update the inode synchronously on disk
1141 	 * to ensure that it won't transiently point at a block we've
1142 	 * freed (only necessary if we're not logging).
1143 	 *
1144 	 * NOTE: Currently ufs_iupdat() does not check for errors.  When
1145 	 * it is fixed, we should verify that we successfully updated the
1146 	 * inode before freeing blocks below.
1147 	 */
1148 
1149 	if (inode_changed || (inode_sector_adjust != 0)) {
1150 		ip->i_blocks -= inode_sector_adjust;
1151 		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
1152 		TRANS_INODE(ufsvfsp, ip);
1153 		ip->i_flag |= IUPD | ICHG | IATTCHG;
1154 		ip->i_seq++;
1155 		if (!TRANS_ISTRANS(ufsvfsp))
1156 			ufs_iupdat(ip, I_SYNC);
1157 	}
1158 
1159 	/*
1160 	 * Now we go through and actually free the blocks, but only if we
1161 	 * successfully removed the pointers to them.
1162 	 */
1163 
1164 	if (!error_updating_pointers) {
1165 		for (i = 0; i < block_count; i++) {
1166 			free(ip, table[i].this_block, table[i].block_size,
1167 			    table[i].usage_flags);
1168 		}
1169 	}
1170 }
1171 
1172 /*
1173  * Find the next hole or data block in file starting at *off
1174  * Return found offset in *off, which can be less than the
1175  * starting offset if not block aligned.
1176  * This code is based on bmap_read().
1177  * Errors: ENXIO for end of file
1178  *         EIO for block read error.
1179  */
1180 int
1181 bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
1182 {
1183 	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
1184 	struct fs *fs = ufsvfsp->vfs_fs;
1185 	buf_t *bp[NIADDR];
1186 	int i, j;
1187 	int shft;			/* we maintain sh = 1 << shft */
1188 	int nindirshift, nindiroffset;
1189 	daddr_t	ob, nb, tbn, lbn, skip;
1190 	daddr32_t *bap;
1191 	u_offset_t isz = (offset_t)ip->i_size;
1192 	int32_t bs = fs->fs_bsize; /* file system block size */
1193 	int32_t nindir = fs->fs_nindir;
1194 	dev_t dev;
1195 	int error = 0;
1196 	daddr_t limits[NIADDR];
1197 
1198 	ASSERT(*off < isz);
1199 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1200 	lbn = (daddr_t)lblkno(fs, *off);
1201 	ASSERT(lbn >= 0);
1202 
1203 	for (i = 0; i < NIADDR; i++)
1204 		bp[i] = NULL;
1205 
1206 	/*
1207 	 * The first NDADDR blocks are direct blocks.
1208 	 */
1209 	if (lbn < NDADDR) {
1210 		for (; lbn < NDADDR; lbn++) {
1211 			if ((hole && (ip->i_db[lbn] == 0)) ||
1212 			    (!hole && (ip->i_db[lbn] != 0))) {
1213 				goto out;
1214 			}
1215 		}
1216 		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1217 			goto out;
1218 	}
1219 
1220 	nindir = fs->fs_nindir;
1221 	nindirshift = ufsvfsp->vfs_nindirshift;
1222 	nindiroffset = ufsvfsp->vfs_nindiroffset;
1223 	dev = ip->i_dev;
1224 
1225 	/* Set up limits array */
1226 	for (limits[0] = NDADDR, j = 1; j  < NIADDR; j++)
1227 		limits[j] = limits[j-1] + (1ULL << (nindirshift * j));
1228 
1229 loop:
1230 	/*
1231 	 * Determine how many levels of indirection.
1232 	 */
1233 	shft = 0;				/* sh = 1 */
1234 	tbn = lbn - NDADDR;
1235 	for (j = NIADDR; j > 0; j--) {
1236 		longlong_t sh;
1237 
1238 		shft += nindirshift;		/* sh *= nindir */
1239 		sh = 1LL << shft;
1240 		if (tbn < sh)
1241 			break;
1242 		tbn -= sh;
1243 	}
1244 	if (j == 0) {
1245 		/* must have passed end of file */
1246 		ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
1247 		goto out;
1248 	}
1249 
1250 	/*
1251 	 * Fetch the first indirect block.
1252 	 */
1253 	nb = ip->i_ib[NIADDR - j];
1254 	if (nb == 0) {
1255 		if (hole) {
1256 			lbn = limits[NIADDR - j];
1257 			goto out;
1258 		} else {
1259 			lbn = limits[NIADDR - j + 1];
1260 			if ((u_offset_t)lbn << fs->fs_bshift >= isz)
1261 				goto out;
1262 			goto loop;
1263 		}
1264 	}
1265 
1266 	/*
1267 	 * Fetch through the indirect blocks.
1268 	 */
1269 	for (; ((j <= NIADDR) && (nb != 0)); j++) {
1270 		ob = nb;
1271 		/*
1272 		 * if there's a different block at this level then release
1273 		 * the old one and in with the new.
1274 		 */
1275 		if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
1276 			if (bp[j-1] != NULL)
1277 				brelse(bp[j-1]);
1278 			bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
1279 			if (bp[j-1]->b_flags & B_ERROR) {
1280 				error = EIO;
1281 				goto out;
1282 			}
1283 		}
1284 		bap = bp[j-1]->b_un.b_daddr;
1285 
1286 		shft -= nindirshift;		/* sh / nindir */
1287 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1288 		nb = bap[i];
1289 		skip = 1LL << (nindirshift * (NIADDR - j));
1290 	}
1291 
1292 	/*
1293 	 * Scan through the blocks in this array.
1294 	 */
1295 	for (; i < nindir; i++, lbn += skip) {
1296 		if (hole && (bap[i] == 0))
1297 			goto out;
1298 		if (!hole && (bap[i] != 0)) {
1299 			if (skip == 1) {
1300 				/* we're at the lowest level */
1301 				goto out;
1302 			} else {
1303 				goto loop;
1304 			}
1305 		}
1306 	}
1307 	if (((u_offset_t)lbn << fs->fs_bshift) < isz)
1308 		goto loop;
1309 out:
1310 	for (i = 0; i < NIADDR; i++) {
1311 		if (bp[i])
1312 			brelse(bp[i]);
1313 	}
1314 	if (error == 0) {
1315 		if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
1316 			error = ENXIO;
1317 		} else {
1318 			/* success */
1319 			*off = (u_offset_t)lbn << fs->fs_bshift;
1320 		}
1321 	}
1322 	return (error);
1323 }
1324 
1325 /*
1326  * Set a particular offset in the inode list to be a certain block.
1327  * User is responsible for calling TRANS* functions
1328  */
1329 int
1330 bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
1331 {
1332 	daddr_t lbn;
1333 	struct inode *ip;
1334 	ufsvfs_t *ufsvfsp;
1335 	struct	fs *fs;
1336 	struct	buf *bp;
1337 	int	i, j;
1338 	int	shft;			/* we maintain sh = 1 << shft */
1339 	int err;
1340 	daddr_t	ob, nb, tbn;
1341 	daddr32_t *bap;
1342 	int	nindirshift, nindiroffset;
1343 
1344 	ip = VTOI(vp);
1345 	ufsvfsp = ip->i_ufsvfs;
1346 	fs = ufsvfsp->vfs_fs;
1347 	lbn = (daddr_t)lblkno(fs, off);
1348 
1349 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1350 
1351 	if (lbn < 0)
1352 		return (EFBIG);
1353 
1354 	/*
1355 	 * Take care of direct block assignment
1356 	 */
1357 	if (lbn < NDADDR) {
1358 		ip->i_db[lbn] = bn;
1359 		return (0);
1360 	}
1361 
1362 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
1363 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
1364 	/*
1365 	 * Determine how many levels of indirection.
1366 	 */
1367 	shft = 0;				/* sh = 1 */
1368 	tbn = lbn - NDADDR;
1369 	for (j = NIADDR; j > 0; j--) {
1370 		longlong_t	sh;
1371 
1372 		shft += nindirshift;		/* sh *= nindir */
1373 		sh = 1LL << shft;
1374 		if (tbn < sh)
1375 			break;
1376 		tbn -= sh;
1377 	}
1378 	if (j == 0)
1379 		return (EFBIG);
1380 
1381 	/*
1382 	 * Fetch the first indirect block.
1383 	 */
1384 	nb = ip->i_ib[NIADDR - j];
1385 	if (nb == 0) {
1386 		err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1387 		return (err);
1388 	}
1389 
1390 	/*
1391 	 * Fetch through the indirect blocks.
1392 	 */
1393 	for (; j <= NIADDR; j++) {
1394 		ob = nb;
1395 		bp = UFS_BREAD(ufsvfsp,
1396 		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
1397 		if (bp->b_flags & B_ERROR) {
1398 			err = geterror(bp);
1399 			brelse(bp);
1400 			return (err);
1401 		}
1402 		bap = bp->b_un.b_daddr;
1403 
1404 		ASSERT(!ufs_indir_badblock(ip, bap));
1405 
1406 		shft -= nindirshift;		/* sh / nindir */
1407 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1408 
1409 		nb = bap[i];
1410 		if (nb == 0) {
1411 			err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1412 			return (err);
1413 		}
1414 
1415 		if (j == NIADDR) {
1416 			bap[i] = bn;
1417 			bdrwrite(bp);
1418 			return (0);
1419 		}
1420 
1421 		brelse(bp);
1422 	}
1423 	return (0);
1424 }
1425