/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/disp.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_bio.h>
#include <vm/seg.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>

/*
 * This structure is used to track blocks as we allocate them, so that
 * we can free them if we encounter an error during allocation.  We
 * keep track of five pieces of information for each allocated block:
 *   - The number of the newly allocated block
 *   - The size of the block (lets us deal with fragments if we want)
 *   - The number of the block containing a pointer to it; or whether
 *     the pointer is in the inode
 *   - The offset within the block (or inode) containing a pointer to it.
 *   - A flag indicating the usage of the block.  (Logging needs to know
 *     this to avoid overwriting a data block if it was previously used
 *     for metadata.)
 */

enum ufs_owner_type {
	ufs_no_owner,		/* Owner has not yet been updated */
	ufs_inode_direct,	/* Listed in inode's direct block table */
	ufs_inode_indirect,	/* Listed in inode's indirect block table */
	ufs_indirect_block	/* Listed in an indirect block */
};

struct ufs_allocated_block {
	daddr_t this_block;	    /* Number of this block */
	off_t block_size;	    /* Size of this block, in bytes */
	enum ufs_owner_type owner;  /* Who points to this block? */
	daddr_t owner_block;	    /* Number of the owning block */
	uint_t owner_offset;	    /* Offset within that block or inode */
	int usage_flags;	    /* Usage flags, as expected by free() */
};
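
/*
 * Illustrative sketch (not part of the driver logic; the lines mirror
 * the real code in bmap_write() below): an allocation is recorded with
 * no owner first, and the owner is only filled in once the pointer to
 * the block has actually been stored, so an undo never clears a
 * pointer that was never written:
 *
 *	undo_table[alloced_blocks].this_block = nb;
 *	undo_table[alloced_blocks].block_size = bsize;
 *	undo_table[alloced_blocks].owner = ufs_no_owner;
 *	undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;
 *	alloced_blocks++;
 *	...
 *	undo_table[alloced_blocks - 1].owner = ufs_inode_indirect;
 *	undo_table[alloced_blocks - 1].owner_offset = NIADDR - j;
 */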


static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
		int maxtrans);

static void ufs_undo_allocation(inode_t *ip, int block_count,
	struct ufs_allocated_block table[], int inode_sector_adjust);

/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	register daddr32_t *dp = (tblp);				\
	register int _chkfrag = chkfrag; /* for lint. sigh */		\
									\
	if (*dp == 0) {							\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		register int len;					\
									\
		len = findextent(fs, dp, (int)(n), lenp, maxtrans) <<	\
			(fs)->fs_bshift;				\
		if (_chkfrag) {						\
			register u_offset_t tmp;			\
									\
			tmp = fragroundup((fs), size) -			\
			    (((u_offset_t)lbn) << fs->fs_bshift);	\
			len = (int)MIN(tmp, len);			\
		}							\
		len -= (boff);						\
		if (len <= 0) {						\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb(fs, *dp) + btodb(boff);	\
			*(lenp) = len;					\
		}							\
	}								\
}
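
/*
 * Worked example of DOEXTENT (assumed sizes, for illustration only):
 * with 8K file system blocks and 4K pages, a request for the page at
 * file offset 12K has lbn == 1 and boff == 4K.  If findextent() finds
 * two contiguous blocks (and the file extends past them, so the frag
 * clamp does not bite), len starts out as 2 << fs_bshift == 16K;
 * subtracting boff leaves a 12K transfer starting at disk block
 * fsbtodb(fs, *dp) + btodb(4K).  When PAGESIZE >= bsize, boff is
 * always 0 and no such adjustment happens.
 */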

/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small, but
 * just to make sure, we check for overflow in the ic_blocks field
 * for all files whose total block count is within 1 GB of a
 * terabyte.  VERYLARGEFILESIZE below is the number of 512-byte blocks
 * in a terabyte (2^31), minus the number of 512-byte blocks in a
 * gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * greater than VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS file can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000
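
/*
 * Arithmetic check of the value above: a terabyte is 2^31 512-byte
 * blocks (0x80000000) and a gigabyte is 2^21 512-byte blocks
 * (0x00200000), so 0x80000000 - 0x00200000 == 0x7FE00000.
 */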

/*
 * bmap{read,write} define the structure of file system storage by mapping
 * a logical offset in a file to a physical block number on the device.
 * It should be called with a locked inode when allocation is to be
 * done (bmap_write).  Note this strangeness: bmap_write is always called from
 * getpage(), not putpage(), since getpage() is where all the allocation
 * is done.
 *
 * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
 *
 * NOTICE: the block number returned is the disk block number, not the
 * file system block number.  All the worries about block offsets and
 * page/block sizes are hidden inside of bmap.  Well, not quite,
 * unfortunately.  It's impossible to find one place to hide all this
 * mess.  There are 3 cases:
 *
 * PAGESIZE < bsize
 *	In this case, the {get,put}page routines will attempt to align to
 *	a file system block boundary (XXX - maybe this is a mistake?).  Since
 *	the kluster routines may be out of memory, we don't always get all
 *	the pages we wanted.  If we called bmap first, to find out how much
 *	to kluster, we handed in the block aligned offset.  If we didn't get
 *	all the pages, we have to chop off the amount we didn't get from the
 *	amount handed back by bmap.
 *
 * PAGESIZE == bsize
 *	Life is quite pleasant here, no extra work needed, mainly because we
 *	(probably?) won't kluster backwards, just forwards.
 *
 * PAGESIZE > bsize
 *	This one has a different set of problems, specifically, we may have to
 *	do N reads to fill one page.  Let us hope that Sun will stay with small
 *	pages.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 *
 * TODO
 *	LMXXX - add a bmap cache.  This could be a couple of extents in the
 *	inode.  Two is nice for PAGESIZE > bsize.
 */
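
/*
 * Hedged usage sketch (illustrative only; locking and error handling
 * elided, and 'uoff' is a hypothetical caller variable).  A caller
 * maps a byte offset to a disk block and checks for a hole before
 * issuing I/O; *lenp is primed with the desired transfer length, or 0
 * to let bmap pick the extent length:
 *
 *	daddr_t bn;
 *	int contig = 0;		(0 = let bmap pick the length)
 *
 *	err = bmap_read(ip, uoff, &bn, &contig);
 *	if (err == 0 && bn != UFS_HOLE) {
 *		... transfer up to 'contig' bytes starting at
 *		... disk block 'bn'
 *	}
 */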

int
bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
{
	daddr_t lbn;
	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
	struct	fs *fs = ufsvfsp->vfs_fs;
	struct	buf *bp;
	int	i, j, boff;
	int	shft;			/* we maintain sh = 1 << shft */
	daddr_t	ob, nb, tbn;
	daddr32_t *bap;
	int	nindirshift, nindiroffset;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbn = (daddr_t)lblkno(fs, off);
	boff = (int)blkoff(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		DOEXTENT(fs, lbn, boff, bnp, lenp,
		    ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
			ufsvfsp->vfs_iotransz);
		return (0);
	}

	nindirshift = ufsvfsp->vfs_nindirshift;
	nindiroffset = ufsvfsp->vfs_nindiroffset;
	/*
	 * Determine how many levels of indirection.
	 */
	shft = 0;				/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;		/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}
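	/*
	 * Worked example of the loop above (assumed geometry: 8K
	 * blocks, so an indirect block holds 2048 entries and
	 * nindirshift == 11, with NDADDR == 12): for lbn == 5000,
	 * tbn == 5000 - 12 == 4988.  Single indirection covers 2048
	 * blocks, so tbn becomes 2940 and the loop breaks with
	 * j == NIADDR - 1, i.e. the double indirect block i_ib[1].
	 * The fetch loop below then selects entry
	 * (2940 >> 11) & 2047 == 1 at the first level and entry
	 * 2940 & 2047 == 892 at the second.
	 */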
	if (j == 0)
		return (EFBIG);

	/*
	 * Fetch the first indirect block.
	 */
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		*bnp = UFS_HOLE;
		return (0);
	}

	/*
	 * Fetch through the indirect blocks.
	 */
	for (; j <= NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp,
				ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		bap = bp->b_un.b_daddr;

		ASSERT(!ufs_indir_badblock(ip, bap));

		shft -= nindirshift;		/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		if (nb == 0) {
			*bnp = UFS_HOLE;
			brelse(bp);
			return (0);
		}
		if (j != NIADDR)
			brelse(bp);
	}
	DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &bap[i],
	    MIN(NINDIR(fs) - i, (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
		0, ufsvfsp->vfs_iotransz);
	brelse(bp);
	return (0);
}

/*
 * See bmap_read for general notes.
 *
 * The block must be at least size bytes and will be extended or
 * allocated as needed.  If alloc_type is BI_ALLOC_ONLY, then bmap
 * will not create any in-core pages that correspond to the new disk
 * allocation.  If alloc_type is BI_FALLOCATE, blocks will be stored as
 * (-1) * block addr, and security is maintained because upon reading a
 * negative block number pages are zeroed.  For all other allocation
 * types (BI_NORMAL) the in-core pages will be created and initialized
 * as needed.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 */
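/*
 * Hedged sketch of the BI_FALLOCATE convention above (illustrative
 * only): at the level whose entries point directly at data blocks
 * (j == NIADDR in the loop below), a preallocated block address is
 * stored negated,
 *
 *	bap[i] = -bap[i];
 *
 * so a later bmap_read() sees a negative address, the range is
 * treated as unwritten, and the pages come back zeroed rather than
 * exposing stale disk contents.
 */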
int
bmap_write(struct inode	*ip, u_offset_t	off, int size,
    enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
{
	struct	fs *fs;
	struct	buf *bp;
	int	i;
	struct	buf *nbp;
	int	j;
	int	shft;				/* we maintain sh = 1 << shft */
	daddr_t	ob, nb, pref, lbn, llbn, tbn;
	daddr32_t *bap;
	struct	vnode *vp = ITOV(ip);
	long	bsize = VBSIZE(vp);
	long	osize, nsize;
	int	issync, metaflag, isdirquota;
	int	err;
	dev_t	dev;
	struct	fbuf *fbp;
	int	nindirshift;
	int	nindiroffset;
	struct	ufsvfs	*ufsvfsp;
	int	added_sectors;		/* sectors added to this inode */
	int	alloced_blocks;		/* fs blocks newly allocated */
	struct  ufs_allocated_block undo_table[NIADDR+1];
	int	verylargefile = 0;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));

	if (allocblk)
		*allocblk = 0;

	ufsvfsp = ip->i_ufsvfs;
	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);
	if (ip->i_blocks >= VERYLARGEFILESIZE)
		verylargefile = 1;
	llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
	metaflag = isdirquota = 0;
	if (((ip->i_mode & IFMT) == IFDIR) ||
	    ((ip->i_mode & IFMT) == IFATTRDIR))
		isdirquota = metaflag = I_DIR;
	else if ((ip->i_mode & IFMT) == IFSHAD)
		metaflag = I_SHAD;
	else if (ip->i_ufsvfs->vfs_qinod == ip)
		isdirquota = metaflag = I_QUOTA;

	issync = ((ip->i_flag & ISYNC) != 0);

	if (isdirquota || issync) {
		alloc_type = BI_NORMAL;	/* make sure */
	}

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment,
	 * this fragment has to be extended to be a full block.
	 */
	if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
		osize = blksize(fs, ip, llbn);
		if (osize < bsize && osize > 0) {
			/*
			 * Check to see if doing this will make the file too
			 * big.  Only check if we are dealing with a very
			 * large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks +
				    btodb(bsize - osize)) > INT_MAX) {
					return (EFBIG);
				}
			}
			/*
			 * Make sure we have all needed pages setup correctly.
			 *
			 * We pass S_OTHER to fbread here because we want
			 * an exclusive lock on the page in question
			 * (see ufs_getpage). I/O to the old block location
			 * may still be in progress and we are about to free
			 * the old block. We don't want anyone else to get
			 * a hold of the old block once we free it until
			 * the I/O is complete.
			 */
			err = fbread(ITOV(ip),
				    ((offset_t)llbn << fs->fs_bshift),
					(uint_t)bsize, S_OTHER, &fbp);
			if (err)
				return (err);
			pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
			err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
					&nb, cr);
			if (err) {
				if (fbp)
					fbrelse(fbp, S_OTHER);
				return (err);
			}
			ASSERT(!ufs_badblock(ip, nb));

			/*
			 * Update the inode before releasing the
			 * lock on the page. If we released the page
			 * lock first, the data could be written to its
			 * old address and then destroyed.
			 */
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
			ip->i_db[llbn] = nb;
			UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
			    ip);
			ip->i_blocks += btodb(bsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;

			/* Caller is responsible for updating i_seq */
			/*
			 * Don't check metaflag here, directories won't do this
			 */
			if (issync) {
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else {
				ASSERT(fbp);
				fbrelse(fbp, S_WRITE);
			}

			if (nb != ob) {
				(void) free(ip, ob, (off_t)osize, metaflag);
			}
		}
	}

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		nb = ip->i_db[lbn];
		if (nb == 0 ||
		    ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
			if (nb != 0) {
				/* consider need to reallocate a frag */
				osize = fragroundup(fs, blkoff(fs, ip->i_size));
				nsize = fragroundup(fs, size);
				if (nsize <= osize)
					goto gotit;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				/*
				 * need to re-allocate a block or frag
				 */
				ob = nb;
				pref = blkpref(ip, lbn, (int)lbn,
								&ip->i_db[0]);
				err = realloccg(ip, ob, pref, (int)osize,
						(int)nsize, &nb, cr);
				if (err)
					return (err);
				if (allocblk)
					*allocblk = nb;
				ASSERT(!ufs_badblock(ip, nb));

			} else {
				/*
				 * need to allocate a block or frag
				 */
				osize = 0;
				if (ip->i_size <
				    ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
					nsize = fragroundup(fs, size);
				else
					nsize = bsize;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
				err = alloc(ip, pref, (int)nsize, &nb, cr);
				if (err)
					return (err);
				if (allocblk)
					*allocblk = nb;
				ASSERT(!ufs_badblock(ip, nb));
				ob = nb;
			}

			/*
			 * Read old/create new zero pages
			 */
			fbp = NULL;
			if (osize == 0) {
				/*
				 * mmap S_WRITE faults always enter here
				 */
				/*
				 * We zero it if it's also BI_FALLOCATE, but
				 * only for direct blocks!
				 */
				if (alloc_type == BI_NORMAL ||
				    alloc_type == BI_FALLOCATE ||
				    P2ROUNDUP_TYPED(size,
				    PAGESIZE, u_offset_t) < nsize) {
					/* fbzero doesn't cause a pagefault */
					fbzero(ITOV(ip),
					    ((offset_t)lbn << fs->fs_bshift),
					    (uint_t)nsize, &fbp);
				}
			} else {
				err = fbread(vp,
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)nsize, S_OTHER, &fbp);
				if (err) {
					if (nb != ob) {
						(void) free(ip, nb,
						    (off_t)nsize, metaflag);
					} else {
						(void) free(ip,
						    ob + numfrags(fs, osize),
						    (off_t)(nsize - osize),
						    metaflag);
					}
					ASSERT(nsize >= osize);
					(void) chkdq(ip,
						-(long)btodb(nsize - osize),
						0, cr, (char **)NULL,
						(size_t *)NULL);
					return (err);
				}
			}
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
			ip->i_db[lbn] = nb;
			ip->i_blocks += btodb(nsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;

			/* Caller is responsible for updating i_seq */

			/*
			 * Write directory and shadow blocks synchronously so
			 * that they never appear with garbage in them on the
			 * disk.
			 */
			if (isdirquota && (ip->i_size ||
			    TRANS_ISTRANS(ufsvfsp))) {
			/*
			 * XXX may not be necessary with harpy trans
			 * bug id 1130055
			 */
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else if (fbp) {
				fbrelse(fbp, S_WRITE);
			}

			if (nb != ob)
				(void) free(ip, ob, (off_t)osize, metaflag);
		}
gotit:
		return (0);
	}

	added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */

	/*
	 * Determine how many levels of indirection.
	 */
	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	pref = 0;
	shft = 0;				/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;		/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}

	if (j == 0)
		return (EFBIG);

	/*
	 * Fetch the first indirect block.
	 */
	dev = ip->i_dev;
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		/*
		 * Check to see if doing this will make the
		 * file too big.  Only check if we are dealing
		 * with a very large file.
		 */
		if (verylargefile == 1) {
			if (((unsigned)ip->i_blocks + btodb(bsize))
			    > INT_MAX) {
				return (EFBIG);
			}
		}
		/*
		 * Need to allocate an indirect block.
		 */
		pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
		err = alloc(ip, pref, (int)bsize, &nb, cr);
		if (err)
			return (err);
		TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
		ASSERT(!ufs_badblock(ip, nb));

		/*
		 * Keep track of this allocation so we can undo it if we
		 * get an error later.
		 */

		ASSERT(alloced_blocks <= NIADDR);

		undo_table[alloced_blocks].this_block = nb;
		undo_table[alloced_blocks].block_size = bsize;
		undo_table[alloced_blocks].owner = ufs_no_owner;
		undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;

		alloced_blocks++;

		/*
		 * Write zero block synchronously so that
		 * indirect blocks never point at garbage.
		 */
		bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);

		clrbuf(bp);
		/* XXX Maybe special-case this? */
		TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
		UFS_BWRITE2(ufsvfsp, bp);
		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		brelse(bp);

		ip->i_ib[NIADDR - j] = nb;
		added_sectors += btodb(bsize);
		ip->i_blocks += btodb(bsize);
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		/* Caller is responsible for updating i_seq */

		/*
		 * Update the 'undo table' now that we've linked this block
		 * to an inode.
		 */

		undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
		undo_table[alloced_blocks-1].owner_offset = NIADDR - j;

		/*
		 * In the ISYNC case, wrip will notice that the block
		 * count on the inode has changed and will be sure to
		 * ufs_iupdat the inode at the end of wrip.
		 */
	}

	/*
	 * Fetch through the indirect blocks.
	 */
	for (; j <= NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);

		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			/*
			 * Return any partial allocations.
			 *
			 * It is possible that we have not yet made any
			 * allocations at this point (if this is the first
			 * pass through the loop and we didn't have to
			 * allocate the first indirect block, above).
			 * In this case, alloced_blocks and added_sectors will
			 * be zero, and ufs_undo_allocation will do nothing.
			 */
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh /= nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];

		if (nb == 0) {
			/*
			 * Check to see if doing this will make the
			 * file too big.  Only check if we are dealing
			 * with a very large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks + btodb(bsize))
				    > INT_MAX) {
					brelse(bp);
					ufs_undo_allocation(ip, alloced_blocks,
					    undo_table, added_sectors);
					return (EFBIG);
				}
			}
			if (pref == 0) {
				if (j < NIADDR) {
					/* Indirect block */
					pref = blkpref(ip, lbn, 0,
						(daddr32_t *)0);
				} else {
					/* Data block */
					pref = blkpref(ip, lbn, i, &bap[0]);
				}
			}

			/*
			 * release "bp" buf to avoid deadlock (re-bread later)
			 */
			brelse(bp);

			err = alloc(ip, pref, (int)bsize, &nb, cr);
			if (err) {
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip, alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}

			ASSERT(!ufs_badblock(ip, nb));
			ASSERT(alloced_blocks <= NIADDR);

			if (allocblk)
				*allocblk = nb;

			undo_table[alloced_blocks].this_block = nb;
			undo_table[alloced_blocks].block_size = bsize;
			undo_table[alloced_blocks].owner = ufs_no_owner;
			undo_table[alloced_blocks].usage_flags = metaflag |
			    ((j < NIADDR) ? I_IBLK : 0);

			alloced_blocks++;

			if (j < NIADDR) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
				/*
				 * Write synchronously so indirect
				 * blocks never point at garbage.
				 */
				nbp = UFS_GETBLK(
					ufsvfsp, dev, fsbtodb(fs, nb), bsize);

				clrbuf(nbp);
				/* XXX Maybe special-case this? */
				TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
				UFS_BWRITE2(ufsvfsp, nbp);
				if (nbp->b_flags & B_ERROR) {
					err = geterror(nbp);
					brelse(nbp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(nbp);
			} else if (alloc_type == BI_NORMAL ||
			    P2ROUNDUP_TYPED(size,
			    PAGESIZE, u_offset_t) < bsize) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
				fbzero(ITOV(ip),
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)bsize, &fbp);

				/*
				 * Cases which we need to do a synchronous
				 * write of the zeroed data pages:
				 *
				 * 1) If we are writing a directory then we
				 * want to write synchronously so blocks in
				 * directories never contain garbage.
				 *
				 * 2) If we are filling in a hole and the
				 * indirect block is going to be synchronously
				 * written back below we need to make sure
				 * that the zeroes are written here before
				 * the indirect block is updated so that if
				 * we crash before the real data is pushed
				 * we will not end up with random data in
				 * the middle of the file.
				 *
				 * 3) If the size of the request rounded up
				 * to the system page size is smaller than
				 * the file system block size, we want to
				 * write out all the pages now so that
				 * they are not aborted before they actually
				 * make it to ufs_putpage since the length
				 * of the inode will not include the pages.
				 */

				if (isdirquota || (issync &&
				    lbn < llbn))
					(void) ufs_fbiwrite(fbp, ip, nb,
						fs->fs_fsize);
				else
					fbrelse(fbp, S_WRITE);
			}

			/*
			 * re-acquire "bp" buf
			 */
			bp = UFS_BREAD(ufsvfsp,
					ip->i_dev, fsbtodb(fs, ob), bsize);
			if (bp->b_flags & B_ERROR) {
				err = geterror(bp);
				brelse(bp);
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip,
				    alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}
			bap = bp->b_un.b_daddr;
			bap[i] = nb;

			/*
			 * The magic explained: j will be equal to NIADDR
			 * when we are at the lowest level, this is where the
			 * array entries point directly to data blocks. Since
			 * we will be 'fallocate'ing we will go ahead and negate
			 * the addresses.
			 */
			if (alloc_type == BI_FALLOCATE && j == NIADDR)
				bap[i] = -bap[i];

			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
			added_sectors += btodb(bsize);
			ip->i_blocks += btodb(bsize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;

			/* Caller is responsible for updating i_seq */

			undo_table[alloced_blocks-1].owner =
			    ufs_indirect_block;
			undo_table[alloced_blocks-1].owner_block = ob;
			undo_table[alloced_blocks-1].owner_offset = i;

			if (issync) {
				UFS_BWRITE2(ufsvfsp, bp);
				if (bp->b_flags & B_ERROR) {
					err = geterror(bp);
					brelse(bp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(bp);
			} else {
				bdrwrite(bp);
			}
		} else {
			brelse(bp);
		}
	}
	return (0);
}

/*
 * Return 1 if inode has unmapped blocks (UFS holes).
 */
int
bmap_has_holes(struct inode *ip)
{
	struct fs *fs = ip->i_fs;
	uint_t	dblks;			/* # of data blocks */
	uint_t	mblks;			/* # of data + metadata blocks */
	int	nindirshift;
	int	nindiroffset;
	uint_t	cnt;
	int	n, j, shft;
	uint_t nindirblks;

	int	fsbshift = fs->fs_bshift;
	int	fsboffset = (1 << fsbshift) - 1;

	dblks = (ip->i_size + fsboffset) >> fsbshift;
	mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;

	/*
	 * File has only direct blocks.
	 */
	if (dblks <= NDADDR)
		return (mblks < dblks);
	nindirshift = ip->i_ufsvfs->vfs_nindirshift;

	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	nindirblks = nindiroffset + 1;

	dblks -= NDADDR;
	shft = 0;
	/*
	 * Determine how many levels of indirection.
	 */
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;	/* sh *= nindir */
		sh = 1LL << shft;
		if (dblks <= sh)
			break;
		dblks -= sh;
	}
	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT(NIADDR <= 3);
	ASSERT(j <= NIADDR);
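	/*
	 * Worked example of the accounting below (illustrative): a
	 * fully mapped file whose last data block falls under single
	 * indirection needs NDADDR direct blocks, 1 indirect block,
	 * and dblks data blocks past the direct ones, hence
	 * cnt == NDADDR + 1 + dblks.  If the actual block count mblks
	 * falls short of cnt, some block must be unmapped, i.e. the
	 * file has a hole.
	 */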
	if (j == NIADDR)	/* single level indirection */
		cnt = NDADDR + 1 + dblks;
	else if (j == NIADDR-1) /* double indirection */
		cnt = NDADDR + 1 + nindirblks +
			1 + (dblks + nindiroffset)/nindirblks + dblks;
	else if (j == NIADDR-2) { /* triple indirection */
		n = (dblks + nindiroffset)/nindirblks;
		cnt = NDADDR + 1 + nindirblks +
			1 + nindirblks + nindirblks*nindirblks +
			1 + (n + nindiroffset)/nindirblks + n + dblks;
	}

	return (mblks < cnt);
}

/*
 * Find some contiguous blocks starting at *sbp and going for
 * min(n, max_contig).  Return the number of blocks (not frags) found.
 * The array passed in must be at least [0..n-1].
 */
static int
findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
{
	register daddr_t bn, nextbn;
	register daddr32_t *bp;
	register int diff;
	int maxtransblk;

	if (n <= 0)
		return (0);
	bn = *sbp;
	if (bn == 0)
		return (0);

	diff = fs->fs_frag;
	if (*lenp) {
		n = MIN(n, lblkno(fs, *lenp));
	} else {
		/*
		 * If the user has set the value for maxcontig lower than
		 * the drive transfer size, then assume they want this
		 * to be the maximum value for the size of the data transfer.
		 */
		maxtransblk = maxtransfer >> DEV_BSHIFT;
		if (fs->fs_maxcontig < maxtransblk) {
			n = MIN(n, fs->fs_maxcontig);
		} else {
			n = MIN(n, maxtransblk);
		}
	}
	bp = sbp;
	while (--n > 0) {
		nextbn = *(bp + 1);
		if (nextbn == 0 || bn + diff != nextbn)
			break;
		bn = nextbn;
		bp++;
	}
	return ((int)(bp - sbp) + 1);
}
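
/*
 * Worked example (hypothetical block numbers, fs_frag == 8): given
 * *sbp -> { 640, 648, 656, 800, ... } and a sufficiently large n, the
 * run 640, 648, 656 advances by exactly fs_frag frags per block, so
 * those blocks are physically adjacent; but 656 + 8 != 800, so the
 * extent ends there and findextent() returns 3.
 */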

/*
 * Free any blocks which had been successfully allocated.  Always called
 * as a result of an error, so we don't bother returning an error code
 * from here.
 *
 * If block_count and inode_sector_adjust are both zero, we'll do nothing.
 * Thus it is safe to call this as part of error handling, whether or not
 * any blocks have been allocated.
 *
 * The ufs_inode_direct case is currently unused.
 */

static void
ufs_undo_allocation(
	inode_t *ip,
	int block_count,
	struct ufs_allocated_block table[],
	int inode_sector_adjust)
{
	int i;
	int inode_changed;
	int error_updating_pointers;
	struct ufsvfs *ufsvfsp;

	inode_changed = 0;
	error_updating_pointers = 0;

	ufsvfsp = ip->i_ufsvfs;

	/*
	 * Update pointers on disk before freeing blocks.  If we fail,
	 * some blocks may remain busy; but they will be reclaimed by
	 * an fsck.  (This is better than letting a block wind up with
	 * two owners if we successfully freed it but could not remove
	 * the pointer to it.)
	 */

	for (i = 0; i < block_count; i++) {
		switch (table[i].owner) {
		case ufs_no_owner:
			/* Nothing to do here, nobody points to us */
			break;
		case ufs_inode_direct:
			ASSERT(table[i].owner_offset < NDADDR);
			ip->i_db[table[i].owner_offset] = 0;
			inode_changed = 1;
			break;
		case ufs_inode_indirect:
			ASSERT(table[i].owner_offset < NIADDR);
			ip->i_ib[table[i].owner_offset] = 0;
			inode_changed = 1;
			break;
		case ufs_indirect_block: {
			buf_t *bp;
			daddr32_t *block_data;

			/* Read/modify/log/write. */

			ASSERT(table[i].owner_offset <
			    (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));

			bp = UFS_BREAD(ufsvfsp, ip->i_dev,
			    fsbtodb(ufsvfsp->vfs_fs, table[i].owner_block),
			    VBSIZE(ITOV(ip)));

			if (bp->b_flags & B_ERROR) {
				/* Couldn't read this block; give up. */
				error_updating_pointers = 1;
				brelse(bp);
				break;		/* out of SWITCH */
			}

			block_data = bp->b_un.b_daddr;
			block_data[table[i].owner_offset] = 0;

			/* Write a log entry which includes the zero. */
			/* It might be possible to optimize this by using */
			/* TRANS_BUF directly and zeroing only the four */
			/* bytes involved, but an attempt to do that led */
			/* to panics in the logging code.  The attempt was */
			/* TRANS_BUF(ufsvfsp,				  */
			/*    table[i].owner_offset * sizeof (daddr32_t), */
			/*    sizeof (daddr32_t),			  */
			/*    bp,					  */
			/*    DT_ABZERO);				  */

			TRANS_BUF_ITEM_128(ufsvfsp,
			    block_data[table[i].owner_offset],
			    block_data, bp, DT_AB);

			/* Now we can write the buffer itself. */

			UFS_BWRITE2(ufsvfsp, bp);

			if (bp->b_flags & B_ERROR) {
				error_updating_pointers = 1;
			}

			brelse(bp);
			break;
		}
		default:
			(void) ufs_fault(ITOV(ip),
			    "ufs_undo_allocation failure\n");
			break;
		}
	}

	/*
	 * If the inode changed, or if we need to update its block count,
	 * then do that now.  We update the inode synchronously on disk
	 * to ensure that it won't transiently point at a block we've
	 * freed (only necessary if we're not logging).
	 *
	 * NOTE: Currently ufs_iupdat() does not check for errors.  When
	 * it is fixed, we should verify that we successfully updated the
	 * inode before freeing blocks below.
	 */

	if (inode_changed || (inode_sector_adjust != 0)) {
		ip->i_blocks -= inode_sector_adjust;
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		ip->i_seq++;
		if (!TRANS_ISTRANS(ufsvfsp))
			ufs_iupdat(ip, I_SYNC);
	}

	/*
	 * Now we go through and actually free the blocks, but only if we
	 * successfully removed the pointers to them.
	 */

	if (!error_updating_pointers) {
		for (i = 0; i < block_count; i++) {
			free(ip, table[i].this_block, table[i].block_size,
			    table[i].usage_flags);
		}
	}
}

/*
 * Find the next hole or data block in a file starting at *off.
 * Return the found offset in *off, which can be less than the
 * starting offset if not block aligned.
 * This code is based on bmap_read().
 * Errors: ENXIO for end of file,
 *         EIO for block read error.
 */
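/*
 * Hedged usage sketch (illustrative only; 'pos' is a hypothetical
 * caller variable): find the first hole at or after byte offset 'pos'
 * in a file known to extend past 'pos':
 *
 *	u_offset_t off = pos;
 *
 *	err = bmap_find(ip, B_TRUE, &off);
 *	if (err == 0)
 *		... 'off' is the block-aligned start of the hole
 *	else if (err == ENXIO)
 *		... no hole before end of file
 */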
int
bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
{
	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	buf_t *bp[NIADDR];
	int i, j;
	int shft;			/* we maintain sh = 1 << shft */
	int nindirshift, nindiroffset;
	daddr_t	ob, nb, tbn, lbn, skip;
	daddr32_t *bap;
	u_offset_t isz = (offset_t)ip->i_size;
	int32_t bs = fs->fs_bsize; /* file system block size */
	int32_t nindir = fs->fs_nindir;
	dev_t dev;
	int error = 0;
	daddr_t limits[NIADDR];

	ASSERT(*off < isz);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbn = (daddr_t)lblkno(fs, *off);
	ASSERT(lbn >= 0);

	for (i = 0; i < NIADDR; i++)
		bp[i] = NULL;

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		for (; lbn < NDADDR; lbn++) {
			if ((hole && (ip->i_db[lbn] == 0)) ||
			    (!hole && (ip->i_db[lbn] != 0))) {
				goto out;
			}
		}
		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
			goto out;
	}

	nindir = fs->fs_nindir;
	nindirshift = ufsvfsp->vfs_nindirshift;
	nindiroffset = ufsvfsp->vfs_nindiroffset;
	dev = ip->i_dev;

	/* Set up limits array */
	for (limits[0] = NDADDR, j = 1; j < NIADDR; j++)
		limits[j] = limits[j-1] + (1ULL << (nindirshift * j));

loop:
	/*
	 * Determine how many levels of indirection.
	 */
	shft = 0;				/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t sh;

		shft += nindirshift;		/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}
	if (j == 0) {
		/* must have passed end of file */
		ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
		goto out;
	}

	/*
	 * Fetch the first indirect block.
	 */
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		if (hole) {
			lbn = limits[NIADDR - j];
			goto out;
		} else {
			lbn = limits[NIADDR - j + 1];
			if ((u_offset_t)lbn << fs->fs_bshift >= isz)
				goto out;
			goto loop;
		}
	}

	/*
	 * Fetch through the indirect blocks.
	 */
	for (; ((j <= NIADDR) && (nb != 0)); j++) {
		ob = nb;
		/*
		 * if there's a different block at this level then release
		 * the old one and in with the new.
		 */
		if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
			if (bp[j-1] != NULL)
				brelse(bp[j-1]);
			bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
			if (bp[j-1]->b_flags & B_ERROR) {
				error = EIO;
				goto out;
			}
		}
		bap = bp[j-1]->b_un.b_daddr;

		shft -= nindirshift;		/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		skip = 1LL << (nindirshift * (NIADDR - j));
	}

	/*
	 * Scan through the blocks in this array.
	 */
	for (; i < nindir; i++, lbn += skip) {
		if (hole && (bap[i] == 0))
			goto out;
		if (!hole && (bap[i] != 0)) {
			if (skip == 1) {
				/* we're at the lowest level */
				goto out;
			} else {
				goto loop;
			}
		}
	}
	if (((u_offset_t)lbn << fs->fs_bshift) < isz)
		goto loop;
out:
	for (i = 0; i < NIADDR; i++) {
		if (bp[i])
			brelse(bp[i]);
	}
	if (error == 0) {
		if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
			error = ENXIO;
		} else {
			/* success */
			*off = (u_offset_t)lbn << fs->fs_bshift;
		}
	}
	return (error);
}

/*
 * Set a particular offset in the inode list to be a certain block.
 * The user is responsible for calling the TRANS* functions.
 */
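/*
 * Hedged usage sketch (hypothetical caller; 'newblk' is assumed):
 * repoint the block backing byte offset 'off' at 'newblk', then log
 * the inode change as the contract above requires:
 *
 *	err = bmap_set_bn(vp, off, newblk);
 *	if (err == 0) {
 *		TRANS_INODE(ufsvfsp, ip);
 *		ip->i_flag |= IUPD | ICHG;
 *	}
 */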
1324 bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
1325 {
1326 	daddr_t lbn;
1327 	struct inode *ip;
1328 	ufsvfs_t *ufsvfsp;
1329 	struct	fs *fs;
1330 	struct	buf *bp;
1331 	int	i, j;
1332 	int	shft;			/* we maintain sh = 1 << shft */
1333 	int err;
1334 	daddr_t	ob, nb, tbn;
1335 	daddr32_t *bap;
1336 	int	nindirshift, nindiroffset;
1337 
1338 	ip = VTOI(vp);
1339 	ufsvfsp = ip->i_ufsvfs;
1340 	fs = ufsvfsp->vfs_fs;
1341 	lbn = (daddr_t)lblkno(fs, off);
1342 
1343 	ASSERT(RW_LOCK_HELD(&ip->i_contents));
1344 
1345 	if (lbn < 0)
1346 		return (EFBIG);
1347 
1348 	/*
1349 	 * Take care of direct block assignment
1350 	 */
1351 	if (lbn < NDADDR) {
1352 		ip->i_db[lbn] = bn;
1353 		return (0);
1354 	}
1355 
1356 	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
1357 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
1358 	/*
1359 	 * Determine how many levels of indirection.
1360 	 */
1361 	shft = 0;				/* sh = 1 */
1362 	tbn = lbn - NDADDR;
1363 	for (j = NIADDR; j > 0; j--) {
1364 		longlong_t	sh;
1365 
1366 		shft += nindirshift;		/* sh *= nindir */
1367 		sh = 1LL << shft;
1368 		if (tbn < sh)
1369 			break;
1370 		tbn -= sh;
1371 	}
1372 	if (j == 0)
1373 		return (EFBIG);
1374 
1375 	/*
1376 	 * Fetch the first indirect block.
1377 	 */
1378 	nb = ip->i_ib[NIADDR - j];
1379 	if (nb == 0)
1380 		err = ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE");
1381 
1382 	/*
1383 	 * Fetch through the indirect blocks.
1384 	 */
1385 	for (; j <= NIADDR; j++) {
1386 		ob = nb;
1387 		bp = UFS_BREAD(ufsvfsp,
1388 				ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
1389 		if (bp->b_flags & B_ERROR) {
1390 			err = geterror(bp);
1391 			brelse(bp);
1392 			return (err);
1393 		}
1394 		bap = bp->b_un.b_daddr;
1395 
1396 		ASSERT(!ufs_indir_badblock(ip, bap));
1397 
1398 		shft -= nindirshift;		/* sh / nindir */
1399 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
1400 
1401 		if (j == NIADDR) {
1402 			bap[i] = bn;
1403 			bdrwrite(bp);
1404 			return (0);
1405 		}
1406 		brelse(bp);
1407 	}
1408 	return (0);
1409 }
1410