fs/ufs/ufs_bmap.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/disp.h>
#include <sys/proc.h>
#include <sys/conf.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_bio.h>
#include <vm/seg.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kmem.h>

/*
 * This structure is used to track blocks as we allocate them, so that
 * we can free them if we encounter an error during allocation.  We
 * keep track of five pieces of information for each allocated block:
 *   - The number of the newly allocated block
 *   - The size of the block (lets us deal with fragments if we want)
 *   - The number of the block containing a pointer to it; or whether
 *     the pointer is in the inode
 *   - The offset within the block (or inode) containing a pointer to it.
 *   - A flag indicating the usage of the block.  (Logging needs to know
 *     this to avoid overwriting a data block if it was previously used
 *     for metadata.)
 */

/*
 * Records where the pointer to a newly allocated block currently lives,
 * so that ufs_undo_allocation() knows which pointer (if any) it has to
 * clear before the block can be freed again.
 */
enum ufs_owner_type {
	ufs_no_owner,		/* Allocated, but nothing points to it yet */
	ufs_inode_direct,	/* Pointer is in the inode's i_db[] table */
	ufs_inode_indirect,	/* Pointer is in the inode's i_ib[] table */
	ufs_indirect_block	/* Pointer is a slot in an indirect block */
};

/*
 * One entry of the undo table kept by bmap_write().  An entry is filled
 * in when a block is allocated and completed (owner, owner_block,
 * owner_offset) once the block has been linked into the inode or into
 * an indirect block.
 *
 * owner_block is only set and read for ufs_indirect_block entries.
 * owner_offset is used as an array index (into i_db[], i_ib[] or the
 * indirect block's daddr32_t array), not as a byte offset.
 */
struct ufs_allocated_block {
	daddr_t this_block;	    /* Number of this block */
	off_t block_size;	    /* Size of this block, in bytes */
	enum ufs_owner_type owner;  /* Who points to this block? */
	daddr_t owner_block;	    /* Number of the owning indirect block */
	uint_t owner_offset;	    /* Index within that block or inode */
	int usage_flags;	    /* Usage flags, as expected by free() */
};


/*
 * Forward declarations for the file-local helpers defined further down.
 */

/* Count contiguous file system blocks starting at *sbp. */
static int findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp,
		int maxtrans);

/* Unlink and free the blocks recorded in an undo table after an error. */
static void ufs_undo_allocation(inode_t *ip, int block_count,
	struct ufs_allocated_block table[], int inode_sector_adjust);

/*
 * Find the extent and the matching block number.
 *
 * bsize > PAGESIZE
 *	boff indicates that we want a page in the middle
 *	min expression is supposed to make sure no extra page[s] after EOF
 * PAGESIZE >= bsize
 *	we assume that a page is a multiple of bsize, i.e.,
 *	boff always == 0
 *
 * We always return a length that is suitable for a disk transfer.
 *
 * Arguments:
 *	fs	 - superblock pointer (struct fs *)
 *	lbn	 - logical block number being mapped
 *	boff	 - byte offset of the request within that block
 *	bnp	 - out: disk block number, or UFS_HOLE
 *	lenp	 - in: caller's length hint (see findextent());
 *		   out: usable extent length in bytes (only written when
 *		   a block is found)
 *	size	 - file size, used only when chkfrag is non-zero
 *	tblp	 - pointer to the block-pointer table entry for lbn
 *	n	 - number of valid entries available starting at tblp
 *	chkfrag	 - non-zero to clamp the length to the fragment-rounded
 *		   end of file
 *	maxtrans - maximum transfer size, passed through to findextent()
 *
 * Every macro parameter is parenthesized in the expansion so that an
 * expression argument cannot change the meaning of the surrounding
 * operators.
 */
#define	DOEXTENT(fs, lbn, boff, bnp, lenp, size, tblp, n, chkfrag, maxtrans) {\
	register daddr32_t *dp = (tblp);				\
	register int _chkfrag = (chkfrag); /* for lint. sigh */		\
									\
	if (*dp == 0) {							\
		*(bnp) = UFS_HOLE;					\
	} else {							\
		register int len;					\
									\
		len = findextent((fs), dp, (int)(n), (lenp),		\
			(maxtrans)) << (fs)->fs_bshift;			\
		if (_chkfrag) {						\
			register u_offset_t tmp;			\
									\
			tmp = fragroundup((fs), (size)) -		\
			    (((u_offset_t)(lbn)) << (fs)->fs_bshift);	\
			len = (int)MIN(tmp, len);			\
		}							\
		len -= (boff);						\
		if (len <= 0) {						\
			*(bnp) = UFS_HOLE;				\
		} else {						\
			*(bnp) = fsbtodb((fs), *dp) + btodb(boff);	\
			*(lenp) = len;					\
		}							\
	}								\
}

/*
 * The maximum supported file size is actually somewhat less than 1
 * terabyte.  This is because the total number of blocks used for the
 * file and its metadata must fit into the ic_blocks field of the
 * inode, which is a signed 32-bit quantity.  The metadata allocated
 * for a file (that is, the single, double, and triple indirect blocks
 * used to reference the file blocks) is actually quite small,
 * but just to make sure, we check for overflow in the ic_blocks
 * field for all files whose total block count is
 * within 1 GB of a terabyte.  VERYLARGEFILESIZE below is the number of
 * 512-byte blocks in a terabyte (2^31), minus the number of 512-byte blocks
 * in a gigabyte (2^21).  We only check for overflow in the ic_blocks
 * field if the number of blocks currently allocated to the file is
 * greater than or equal to VERYLARGEFILESIZE.
 *
 * Note that file "size" is not the same as file "length".  A
 * file's "size" is the number of blocks allocated to it.  A file's
 * "length" is the maximum offset in the file.  A UFS file can have a
 * length of a terabyte, but the size is limited to somewhat less than
 * a terabyte, as described above.
 */
#define	VERYLARGEFILESIZE	0x7FE00000

/*
 * bmap{rd,wr} define the structure of file system storage by mapping
 * a logical offset in a file to a physical block number on the device.
 * It should be called with a locked inode when allocation is to be
 * done (bmapwr).  Note this strangeness: bmapwr is always called from
 * getpage(), not putpage(), since getpage() is where all the allocation
 * is done.
 *
 * S_READ, S_OTHER -> bmaprd; S_WRITE -> bmapwr.
 *
 * NOTICE: the block number returned is the disk block number, not the
 * file system block number.  All the worries about block offsets and
 * page/block sizes are hidden inside of bmap.  Well, not quite,
 * unfortunately.  It's impossible to find one place to hide all this
 * mess.  There are 3 cases:
 *
 * PAGESIZE < bsize
 *	In this case, the {get,put}page routines will attempt to align to
 *	a file system block boundary (XXX - maybe this is a mistake?).  Since
 *	the kluster routines may be out of memory, we don't always get all
 *	the pages we wanted.  If we called bmap first, to find out how much
 *	to kluster, we handed in the block aligned offset.  If we didn't get
 *	all the pages, we have to chop off the amount we didn't get from the
 *	amount handed back by bmap.
 *
 * PAGESIZE == bsize
 *	Life is quite pleasant here, no extra work needed, mainly because we
 *	(probably?) won't kluster backwards, just forwards.
 *
 * PAGESIZE > bsize
 *	This one has a different set of problems, specifically, we may have to
 *	do N reads to fill one page.  Let us hope that Sun will stay with small
 *	pages.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs.
 *
 * TODO
 *	LMXXX - add a bmap cache.  This could be a couple of extents in the
 *	inode.  Two is nice for PAGESIZE > bsize.
 */

/*
 * Read-only mapping of file offset `off' of inode `ip'.
 *
 * On return *bnp holds either the disk block number backing the offset
 * or UFS_HOLE.  When a block is found, *lenp is set to the number of
 * contiguous bytes that can be transferred starting there; *lenp is
 * also consulted on entry as a length hint (see findextent()).
 *
 * The caller must hold ip->i_contents.  No allocation is performed.
 *
 * Returns 0 on success, EFBIG if the offset cannot be addressed and
 * EIO if an indirect block cannot be read.
 */
int
bmap_read(struct inode *ip, u_offset_t off, daddr_t *bnp, int *lenp)
{
	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	struct buf *ibp;
	daddr32_t *ptrs;
	daddr_t lbn, rel, blk;
	longlong_t span;
	int slot, level, boff;
	int shift;			/* log2 of blocks covered per slot */
	int indir_shift, indir_mask;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbn = (daddr_t)lblkno(fs, off);
	boff = (int)blkoff(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/*
	 * Direct blocks are resolved straight out of the inode.
	 */
	if (lbn < NDADDR) {
		DOEXTENT(fs, lbn, boff, bnp, lenp,
		    ip->i_size, &ip->i_db[lbn], NDADDR - lbn, 1,
		    ufsvfsp->vfs_iotransz);
		return (0);
	}

	indir_shift = ufsvfsp->vfs_nindirshift;
	indir_mask = ufsvfsp->vfs_nindiroffset;

	/*
	 * Work out which indirection tree the block lives in.  `rel' ends
	 * up as the block's index within that tree, and `level' counts
	 * down from NIADDR (single indirect) towards 1 (deepest tree).
	 */
	shift = 0;
	rel = lbn - NDADDR;
	level = NIADDR;
	while (level > 0) {
		shift += indir_shift;
		span = 1LL << shift;
		if (rel < span)
			break;
		rel -= span;
		level--;
	}
	if (level == 0)
		return (EFBIG);

	/*
	 * Root pointer of the selected tree; a zero means the whole
	 * tree is a hole.
	 */
	blk = ip->i_ib[NIADDR - level];
	if (blk == 0) {
		*bnp = UFS_HOLE;
		return (0);
	}

	/*
	 * Descend one indirect block per iteration.  The buffer of the
	 * last (lowest) indirect block is kept held so that DOEXTENT
	 * can scan its pointer array below.
	 */
	for (;;) {
		ibp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, blk),
		    fs->fs_bsize);
		if (ibp->b_flags & B_ERROR) {
			brelse(ibp);
			return (EIO);
		}
		ptrs = ibp->b_un.b_daddr;

		ASSERT(!ufs_indir_badblock(ip, ptrs));

		shift -= indir_shift;
		slot = (rel >> shift) & indir_mask;
		blk = ptrs[slot];
		if (blk == 0) {
			*bnp = UFS_HOLE;
			brelse(ibp);
			return (0);
		}
		if (level == NIADDR)
			break;
		brelse(ibp);
		level++;
	}

	DOEXTENT(fs, lbn, boff, bnp, lenp, ip->i_size, &ptrs[slot],
	    MIN(NINDIR(fs) - slot,
	    (daddr_t)lblkno(fs, ip->i_size - 1) - lbn + 1),
	    0, ufsvfsp->vfs_iotransz);
	brelse(ibp);
	return (0);
}

/*
 * See the comment above bmap_read() for general notes.
 *
 * Make sure disk space backs the file system block that contains byte
 * offset `off' of inode `ip', allocating data, fragment and indirect
 * blocks as required.
 *
 * The block must be at least size bytes and will be extended or
 * allocated as needed.  If alloc_only is set, bmap will not create
 * any in-core pages that correspond to the new disk allocation.
 * Otherwise, the in-core pages will be created and initialized as
 * needed.  alloc_only is forced to 0 for directories, attribute
 * directories, the quota inode and inodes with ISYNC set; and even
 * with alloc_only set, pages are still zeroed when size rounded up to
 * PAGESIZE is smaller than the amount being allocated.
 *
 * The caller must hold ip->i_contents as writer.  cr is handed to
 * alloc(), realloccg() and chkdq().
 *
 * Indirect and data blocks allocated below the direct range are
 * recorded in a local undo table so that ufs_undo_allocation() can
 * back them out if a later step fails.
 *
 * Returns 0 on success, or a non-zero errno if an error occurs
 * (EFBIG when the offset cannot be addressed or i_blocks would
 * overflow, otherwise the error from the failing allocation or I/O).
 */

int
bmap_write(
	struct inode	*ip,
	u_offset_t	off,
	int		size,
	int		alloc_only,
	struct cred	*cr)
{
	struct	fs *fs;
	struct	buf *bp;
	int	i;
	struct	buf *nbp;
	int	j;
	int	shft;				/* we maintain sh = 1 << shft */
	daddr_t	ob, nb, pref, lbn, llbn, tbn;
	daddr32_t *bap;
	struct	vnode *vp = ITOV(ip);
	long	bsize = VBSIZE(vp);
	long	osize, nsize;
	int	issync, metaflag, isdirquota;
	int	err;
	dev_t	dev;
	struct	fbuf *fbp;
	int	nindirshift;
	int	nindiroffset;
	struct	ufsvfs	*ufsvfsp;
	int	added_sectors;		/* sectors added to this inode */
	int	alloced_blocks;		/* fs blocks newly allocated */
	struct  ufs_allocated_block undo_table[NIADDR+1];
	int	verylargefile = 0;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));

	ufsvfsp = ip->i_ufsvfs;
	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);
	/* Only files this large get the i_blocks overflow checks below. */
	if (ip->i_blocks >= VERYLARGEFILESIZE)
		verylargefile = 1;
	/* llbn: logical block holding the current last byte of the file. */
	llbn = (daddr_t)((ip->i_size) ? lblkno(fs, ip->i_size - 1) : 0);
	/*
	 * Classify the inode.  metaflag is what gets passed to free();
	 * isdirquota is set only for directories, attribute directories
	 * and the quota inode.
	 */
	metaflag = isdirquota = 0;
	if (((ip->i_mode & IFMT) == IFDIR) ||
	    ((ip->i_mode & IFMT) == IFATTRDIR))
		isdirquota = metaflag = I_DIR;
	else if ((ip->i_mode & IFMT) == IFSHAD)
		metaflag = I_SHAD;
	else if (ip->i_ufsvfs->vfs_qinod == ip)
		isdirquota = metaflag = I_QUOTA;

	issync = ((ip->i_flag & ISYNC) != 0);

	if (isdirquota || issync) {
		alloc_only = 0;		/* make sure */
	}

	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	if (llbn < NDADDR && llbn < lbn && (ob = ip->i_db[llbn]) != 0) {
		osize = blksize(fs, ip, llbn);
		if (osize < bsize && osize > 0) {
			/*
			 * Check to see if doing this will make the file too
			 * big.  Only check if we are dealing with a very
			 * large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks +
				    btodb(bsize - osize)) > INT_MAX) {
					return (EFBIG);
				}
			}
			/*
			 * Make sure we have all needed pages setup correctly.
			 *
			 * We pass S_OTHER to fbread here because we want
			 * an exclusive lock on the page in question
			 * (see ufs_getpage). I/O to the old block location
			 * may still be in progress and we are about to free
			 * the old block. We don't want anyone else to get
			 * a hold of the old block once we free it until
			 * the I/O is complete.
			 */
			err = fbread(ITOV(ip),
				    ((offset_t)llbn << fs->fs_bshift),
					(uint_t)bsize, S_OTHER, &fbp);
			if (err)
				return (err);
			pref = blkpref(ip, llbn, (int)llbn, &ip->i_db[0]);
			err = realloccg(ip, ob, pref, (int)osize, (int)bsize,
					&nb, cr);
			if (err) {
				if (fbp)
					fbrelse(fbp, S_OTHER);
				return (err);
			}
			ASSERT(!ufs_badblock(ip, nb));

			/*
			 * Update the inode before releasing the
			 * lock on the page. If we released the page
			 * lock first, the data could be written to its
			 * old address and then destroyed.
			 */
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
			ip->i_db[llbn] = nb;
			UFS_SET_ISIZE(((u_offset_t)(llbn + 1)) << fs->fs_bshift,
			    ip);
			ip->i_blocks += btodb(bsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */
			/*
			 * Don't check metaflag here, directories won't do this
			 *
			 */
			if (issync) {
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else {
				ASSERT(fbp);
				fbrelse(fbp, S_WRITE);
			}

			/*
			 * realloccg() may have moved the data to a new
			 * location; if so the old fragment is released.
			 */
			if (nb != ob) {
				(void) free(ip, ob, (off_t)osize, metaflag);
			}
		}
	}

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		nb = ip->i_db[lbn];
		/*
		 * Work is needed if the block is unallocated, or if the
		 * file currently ends inside this block (so it may be a
		 * fragment that has to grow).
		 */
		if (nb == 0 ||
		    ip->i_size < ((u_offset_t)(lbn + 1)) << fs->fs_bshift) {
			if (nb != 0) {
				/* consider need to reallocate a frag */
				osize = fragroundup(fs, blkoff(fs, ip->i_size));
				nsize = fragroundup(fs, size);
				if (nsize <= osize)
					goto gotit;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				/*
				 * need to allocate a block or frag
				 */
				ob = nb;
				pref = blkpref(ip, lbn, (int)lbn,
								&ip->i_db[0]);
				err = realloccg(ip, ob, pref, (int)osize,
						(int)nsize, &nb, cr);
				if (err)
					return (err);
				ASSERT(!ufs_badblock(ip, nb));

			} else {
				/*
				 * need to allocate a block or frag
				 */
				osize = 0;
				if (ip->i_size <
				    ((u_offset_t)(lbn + 1)) << fs->fs_bshift)
					nsize = fragroundup(fs, size);
				else
					nsize = bsize;
				/*
				 * Check to see if doing this will make the
				 * file too big.  Only check if we are dealing
				 * with a very large file.
				 */
				if (verylargefile == 1) {
					if (((unsigned)ip->i_blocks +
					    btodb(nsize - osize)) > INT_MAX) {
						return (EFBIG);
					}
				}
				pref = blkpref(ip, lbn, (int)lbn, &ip->i_db[0]);
				err = alloc(ip, pref, (int)nsize, &nb, cr);
				if (err)
					return (err);
				ASSERT(!ufs_badblock(ip, nb));
				/* Nothing old to free: make nb == ob below. */
				ob = nb;
			}

			/*
			 * Read old/create new zero pages
			 */
			fbp = NULL;
			if (osize == 0) {
				/*
				 * mmap S_WRITE faults always enter here
				 */
				if (!alloc_only || P2ROUNDUP_TYPED(size,
				    PAGESIZE, u_offset_t) < nsize) {
					/* fbzero doesn't cause a pagefault */
					fbzero(ITOV(ip),
					    ((offset_t)lbn << fs->fs_bshift),
					    (uint_t)nsize, &fbp);
				}
			} else {
				err = fbread(vp,
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)nsize, S_OTHER, &fbp);
				if (err) {
					/*
					 * Back out the reallocation: free
					 * the whole new block if the data
					 * moved, otherwise only the part
					 * that was added to the old one,
					 * then return the quota charge.
					 */
					if (nb != ob) {
						(void) free(ip, nb,
						    (off_t)nsize, metaflag);
					} else {
						(void) free(ip,
						    ob + numfrags(fs, osize),
						    (off_t)(nsize - osize),
						    metaflag);
					}
					ASSERT(nsize >= osize);
					(void) chkdq(ip,
						-(long)btodb(nsize - osize),
						0, cr, (char **)NULL,
						(size_t *)NULL);
					return (err);
				}
			}
			TRANS_MATA_ALLOC(ufsvfsp, ip, nb, nsize, 0);
			ip->i_db[lbn] = nb;
			ip->i_blocks += btodb(nsize - osize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */

			/*
			 * Write directory and shadow blocks synchronously so
			 * that they never appear with garbage in them on the
			 * disk.
			 *
			 */
			if (isdirquota && (ip->i_size ||
			    TRANS_ISTRANS(ufsvfsp))) {
			/*
			 * XXX may not be necessary with harpy trans
			 * bug id 1130055
			 */
				(void) ufs_fbiwrite(fbp, ip, nb, fs->fs_fsize);
			} else if (fbp) {
				fbrelse(fbp, S_WRITE);
			}

			if (nb != ob)
				(void) free(ip, ob, (off_t)osize, metaflag);
		}
gotit:
		return (0);
	}

	added_sectors = alloced_blocks = 0;	/* No blocks alloced yet */

	/*
	 * Determine how many levels of indirection.
	 *
	 * On exit from the loop tbn is the block's index within the
	 * selected indirection tree; j == 0 means the block lies beyond
	 * what the triple indirect tree can address.
	 */
	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	pref = 0;
	shft = 0;				/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;		/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}

	if (j == 0)
		return (EFBIG);

	/*
	 * Fetch the first indirect block.
	 */
	dev = ip->i_dev;
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		/*
		 * Check to see if doing this will make the
		 * file too big.  Only check if we are dealing
		 * with a very large file.
		 */
		if (verylargefile == 1) {
			if (((unsigned)ip->i_blocks + btodb(bsize))
			    > INT_MAX) {
				return (EFBIG);
			}
		}
		/*
		 * Need to allocate an indirect block.
		 */
		pref = blkpref(ip, lbn, 0, (daddr32_t *)0);
		err = alloc(ip, pref, (int)bsize, &nb, cr);
		if (err)
			return (err);
		TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
		ASSERT(!ufs_badblock(ip, nb));

		/*
		 * Keep track of this allocation so we can undo it if we
		 * get an error later.
		 */

		ASSERT(alloced_blocks <= NIADDR);

		undo_table[alloced_blocks].this_block = nb;
		undo_table[alloced_blocks].block_size = bsize;
		undo_table[alloced_blocks].owner = ufs_no_owner;
		undo_table[alloced_blocks].usage_flags = metaflag | I_IBLK;

		alloced_blocks++;

		/*
		 * Write zero block synchronously so that
		 * indirect blocks never point at garbage.
		 */
		bp = UFS_GETBLK(ufsvfsp, dev, fsbtodb(fs, nb), bsize);

		clrbuf(bp);
		/* XXX Maybe special-case this? */
		TRANS_BUF(ufsvfsp, 0, bsize, bp, DT_ABZERO);
		UFS_BWRITE2(ufsvfsp, bp);
		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		brelse(bp);

		ip->i_ib[NIADDR - j] = nb;
		added_sectors += btodb(bsize);
		ip->i_blocks += btodb(bsize);
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		/* Caller is responsible for updating i_seq */

		/*
		 * Update the 'undo table' now that we've linked this block
		 * to an inode.
		 */

		undo_table[alloced_blocks-1].owner = ufs_inode_indirect;
		undo_table[alloced_blocks-1].owner_offset = NIADDR - j;

		/*
		 * In the ISYNC case, wrip will notice that the block
		 * count on the inode has changed and will be sure to
		 * ufs_iupdat the inode at the end of wrip.
		 */
	}

	/*
	 * Fetch through the indirect blocks.
	 *
	 * Each pass reads the indirect block `ob', picks slot i for this
	 * level and, if the slot is empty, allocates the next-level block
	 * (another indirect block while j < NIADDR, the data block when
	 * j == NIADDR) and stores it in the slot.
	 */
	for (; j <= NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp, ip->i_dev, fsbtodb(fs, ob), bsize);

		if (bp->b_flags & B_ERROR) {
			err = geterror(bp);
			brelse(bp);
			/*
			 * Return any partial allocations.
			 *
			 * It is possible that we have not yet made any
			 * allocations at this point (if this is the first
			 * pass through the loop and we didn't have to
			 * allocate the first indirect block, above).
			 * In this case, alloced_blocks and added_sectors will
			 * be zero, and ufs_undo_allocation will do nothing.
			 */
			ufs_undo_allocation(ip, alloced_blocks,
			    undo_table, added_sectors);
			return (err);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh /= nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		if (nb == 0) {
			/*
			 * Check to see if doing this will make the
			 * file too big.  Only check if we are dealing
			 * with a very large file.
			 */
			if (verylargefile == 1) {
				if (((unsigned)ip->i_blocks + btodb(bsize))
				    > INT_MAX) {
					brelse(bp);
					ufs_undo_allocation(ip, alloced_blocks,
					    undo_table, added_sectors);
					return (EFBIG);
				}
			}
			/*
			 * A preference computed earlier in this call (for
			 * a block allocated above) is reused as is.
			 */
			if (pref == 0) {
				if (j < NIADDR) {
					/* Indirect block */
					pref = blkpref(ip, lbn, 0,
						(daddr32_t *)0);
				} else {
					/* Data block */
					pref = blkpref(ip, lbn, i, &bap[0]);
				}
			}

			/*
			 * release "bp" buf to avoid deadlock (re-bread later)
			 */
			brelse(bp);

			err = alloc(ip, pref, (int)bsize, &nb, cr);
			if (err) {
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip, alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}

			ASSERT(!ufs_badblock(ip, nb));

			ASSERT(alloced_blocks <= NIADDR);

			/*
			 * Record the block as unowned until it has been
			 * stored in the parent indirect block below.
			 */
			undo_table[alloced_blocks].this_block = nb;
			undo_table[alloced_blocks].block_size = bsize;
			undo_table[alloced_blocks].owner = ufs_no_owner;
			undo_table[alloced_blocks].usage_flags = metaflag |
			    ((j < NIADDR) ? I_IBLK : 0);

			alloced_blocks++;

			if (j < NIADDR) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 1);
				/*
				 * Write synchronously so indirect
				 * blocks never point at garbage.
				 */
				nbp = UFS_GETBLK(
					ufsvfsp, dev, fsbtodb(fs, nb), bsize);

				clrbuf(nbp);
				/* XXX Maybe special-case this? */
				TRANS_BUF(ufsvfsp, 0, bsize, nbp, DT_ABZERO);
				UFS_BWRITE2(ufsvfsp, nbp);
				if (nbp->b_flags & B_ERROR) {
					err = geterror(nbp);
					brelse(nbp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(nbp);
			} else if (!alloc_only || P2ROUNDUP_TYPED(size,
			    PAGESIZE, u_offset_t) < bsize) {
				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
				fbzero(ITOV(ip),
				    ((offset_t)lbn << fs->fs_bshift),
				    (uint_t)bsize, &fbp);

				/*
				 * Cases which we need to do a synchronous
				 * write of the zeroed data pages:
				 *
				 * 1) If we are writing a directory then we
				 * want to write synchronously so blocks in
				 * directories never contain garbage.
				 *
				 * 2) If we are filling in a hole and the
				 * indirect block is going to be synchronously
				 * written back below we need to make sure
				 * that the zeroes are written here before
				 * the indirect block is updated so that if
				 * we crash before the real data is pushed
				 * we will not end up with random data in
				 * the middle of the file.
				 *
				 * 3) If the size of the request rounded up
				 * to the system page size is smaller than
				 * the file system block size, we want to
				 * write out all the pages now so that
				 * they are not aborted before they actually
				 * make it to ufs_putpage since the length
				 * of the inode will not include the pages.
				 */

				if (isdirquota || (issync &&
				    lbn < llbn))
					(void) ufs_fbiwrite(fbp, ip, nb,
						fs->fs_fsize);
				else
					fbrelse(fbp, S_WRITE);
			}

			/*
			 * re-acquire "bp" buf
			 */
			bp = UFS_BREAD(ufsvfsp,
					ip->i_dev, fsbtodb(fs, ob), bsize);
			if (bp->b_flags & B_ERROR) {
				err = geterror(bp);
				brelse(bp);
				/*
				 * Return any partial allocations.
				 */
				ufs_undo_allocation(ip,
				    alloced_blocks,
				    undo_table, added_sectors);
				return (err);
			}
			/* Link the new block into its parent and account it. */
			bap = bp->b_un.b_daddr;
			bap[i] = nb;
			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
			added_sectors += btodb(bsize);
			ip->i_blocks += btodb(bsize);
			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
			TRANS_INODE(ufsvfsp, ip);
			ip->i_flag |= IUPD | ICHG | IATTCHG;
			/* Caller is responsible for updating i_seq */

			undo_table[alloced_blocks-1].owner =
			    ufs_indirect_block;
			undo_table[alloced_blocks-1].owner_block = ob;
			undo_table[alloced_blocks-1].owner_offset = i;

			/*
			 * ISYNC inodes get the updated indirect block
			 * written out immediately; otherwise the write is
			 * handed to bdrwrite().
			 */
			if (issync) {
				UFS_BWRITE2(ufsvfsp, bp);
				if (bp->b_flags & B_ERROR) {
					err = geterror(bp);
					brelse(bp);
					/*
					 * Return any partial
					 * allocations.
					 */
					ufs_undo_allocation(ip,
					    alloced_blocks,
					    undo_table, added_sectors);
					return (err);
				}
				brelse(bp);
			} else {
				bdrwrite(bp);
			}
		} else {
			brelse(bp);
		}
	}
	return (0);
}

/*
 * Return 1 if inode has unmapped blocks (UFS holes).
 *
 * The test is arithmetic only; no block pointers are read.  The number
 * of file system blocks actually charged to the inode (derived from
 * i_blocks) is compared against the number a hole-free file of length
 * i_size would need, counting the indirect blocks required to reach
 * every data block.  Fewer charged blocks than required means at least
 * one block is missing.
 */
int
bmap_has_holes(struct inode *ip)
{
	struct fs *fs = ip->i_fs;
	uint_t	dblks; 			/* # of data blocks */
	uint_t	mblks;			/* # of data + metadata blocks */
	int	nindirshift;
	int	nindiroffset;
	uint_t	cnt;			/* blocks a hole-free file needs */
	int	n, j, shft;
	uint_t nindirblks;

	int	fsbshift = fs->fs_bshift;
	int	fsboffset = (1 << fsbshift) - 1;

	dblks = (ip->i_size + fsboffset) >> fsbshift;
	mblks = (ldbtob((u_offset_t)ip->i_blocks) + fsboffset) >> fsbshift;

	/*
	 * File has only direct blocks.
	 */
	if (dblks <= NDADDR)
		return (mblks < dblks);

	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
	nindirblks = nindiroffset + 1;

	dblks -= NDADDR;
	shft = 0;
	/*
	 * Determine how many levels of indirection.  On exit dblks is the
	 * number of data blocks that live in the deepest tree in use.
	 */
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;	/* sh *= nindir */
		sh = 1LL << shft;
		if (dblks <= sh)
			break;
		dblks -= sh;
	}
	ASSERT(NIADDR <= 3);
	ASSERT(j <= NIADDR);

	/*
	 * j == 0 means the length exceeds what even the triple indirect
	 * tree can map.  No block count can satisfy such a file, so
	 * report holes instead of comparing against an unset cnt.
	 */
	if (j == 0)
		return (1);

	if (j == NIADDR) {		/* single level indirection */
		cnt = NDADDR + 1 + dblks;
	} else if (j == NIADDR-1) {	/* double indirection */
		cnt = NDADDR + 1 + nindirblks +
			1 + (dblks + nindiroffset)/nindirblks + dblks;
	} else {			/* triple indirection */
		n = (dblks + nindiroffset)/nindirblks;
		cnt = NDADDR + 1 + nindirblks +
			1 + nindirblks + nindirblks*nindirblks +
			1 + (n + nindiroffset)/nindirblks + n + dblks;
	}

	return (mblks < cnt);
}

/*
 * Count how many file system blocks, beginning with the one at *sbp,
 * are laid out back to back on disk.
 *
 * fs		superblock; supplies the fragments-per-block stride and
 *		the maxcontig tunable
 * sbp		first entry of a block-pointer array that must be valid
 *		for at least indices [0..n-1]
 * n		upper bound on the number of entries to examine
 * lenp		if *lenp is non-zero it is a byte count that further
 *		limits the search; *lenp is only read here
 * maxtransfer	maximum transfer size, consulted only when *lenp is zero
 *
 * Returns the number of blocks (not frags) found, or 0 when n is not
 * positive or the first entry is unallocated.
 */
static int
findextent(struct fs *fs, daddr32_t *sbp, int n, int *lenp, int maxtransfer)
{
	daddr_t prev, next;
	int stride;
	int cap;
	int count;

	if (n <= 0)
		return (0);
	prev = *sbp;
	if (prev == 0)
		return (0);

	/* Consecutive blocks differ by one block's worth of fragments. */
	stride = fs->fs_frag;

	if (*lenp) {
		n = MIN(n, lblkno(fs, *lenp));
	} else {
		/*
		 * If the user has set the value for maxcontig lower than
		 * the drive transfer size, then assume they want this
		 * to be the maximum value for the size of the data transfer.
		 */
		cap = maxtransfer >> DEV_BSHIFT;
		if (fs->fs_maxcontig < cap)
			cap = fs->fs_maxcontig;
		n = MIN(n, cap);
	}

	/*
	 * The first block always counts; extend the run for as long as
	 * each following entry is allocated and directly adjacent.
	 */
	for (count = 1; count < n; count++) {
		next = sbp[count];
		if (next == 0 || prev + stride != next)
			break;
		prev = next;
	}
	return (count);
}

/*
 * Zero the slot in an indirect block that points at the block described
 * by `entry' (read/modify/log/write).  Returns 0 on success, or 1 if
 * the indirect block could not be read or written back.
 */
static int
ufs_undo_clear_indirect(inode_t *ip, struct ufs_allocated_block *entry)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	buf_t *bp;
	daddr32_t *slots;
	int failed;

	ASSERT(entry->owner_offset <
	    (VBSIZE(ITOV(ip)) / sizeof (daddr32_t)));

	bp = UFS_BREAD(ufsvfsp, ip->i_dev,
	    fsbtodb(ufsvfsp->vfs_fs, entry->owner_block),
	    VBSIZE(ITOV(ip)));

	if (bp->b_flags & B_ERROR) {
		/* Couldn't read this block; give up. */
		brelse(bp);
		return (1);
	}

	slots = bp->b_un.b_daddr;
	slots[entry->owner_offset] = 0;

	/*
	 * Write a log entry which includes the zero.
	 * It might be possible to optimize this by using TRANS_BUF
	 * directly and zeroing only the four bytes involved, but an
	 * attempt to do that led to panics in the logging code.  The
	 * attempt was:
	 *	TRANS_BUF(ufsvfsp,
	 *	    table[i].owner_offset * sizeof (daddr32_t),
	 *	    sizeof (daddr32_t),
	 *	    bp,
	 *	    DT_ABZERO);
	 */
	TRANS_BUF_ITEM_128(ufsvfsp,
	    slots[entry->owner_offset],
	    slots, bp, DT_AB);

	/* Now we can write the buffer itself. */
	UFS_BWRITE2(ufsvfsp, bp);
	failed = (bp->b_flags & B_ERROR) ? 1 : 0;

	brelse(bp);
	return (failed);
}

/*
 * Free any blocks which had been successfully allocated.  Always called
 * as a result of an error, so we don't bother returning an error code
 * from here.
 *
 * ip			inode the blocks were allocated for
 * block_count		number of valid entries in table[]
 * table		undo records built up by bmap_write()
 * inode_sector_adjust	amount to subtract from ip->i_blocks
 *
 * If block_count and inode_sector_adjust are both zero, we'll do nothing.
 * Thus it is safe to call this as part of error handling, whether or not
 * any blocks have been allocated.
 *
 * The ufs_inode_direct case is currently unused.
 */

static void
ufs_undo_allocation(
	inode_t *ip,
	int block_count,
	struct ufs_allocated_block table[],
	int inode_sector_adjust)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct ufs_allocated_block *entry;
	int inode_dirty = 0;
	int unlink_failed = 0;
	int idx;

	/*
	 * Update pointers on disk before freeing blocks.  If we fail,
	 * some blocks may remain busy; but they will be reclaimed by
	 * an fsck.  (This is better than letting a block wind up with
	 * two owners if we successfully freed it but could not remove
	 * the pointer to it.)
	 */
	for (idx = 0; idx < block_count; idx++) {
		entry = &table[idx];

		switch (entry->owner) {
		case ufs_no_owner:
			/* Nothing to do here, nobody points to us */
			break;
		case ufs_inode_direct:
			ASSERT(entry->owner_offset < NDADDR);
			ip->i_db[entry->owner_offset] = 0;
			inode_dirty = 1;
			break;
		case ufs_inode_indirect:
			ASSERT(entry->owner_offset < NIADDR);
			ip->i_ib[entry->owner_offset] = 0;
			inode_dirty = 1;
			break;
		case ufs_indirect_block:
			if (ufs_undo_clear_indirect(ip, entry))
				unlink_failed = 1;
			break;
		default:
			(void) ufs_fault(ITOV(ip),
			    "ufs_undo_allocation failure\n");
			break;
		}
	}

	/*
	 * If the inode changed, or if we need to update its block count,
	 * then do that now.  We update the inode synchronously on disk
	 * to ensure that it won't transiently point at a block we've
	 * freed (only necessary if we're not logging).
	 *
	 * NOTE: Currently ufs_iupdat() does not check for errors.  When
	 * it is fixed, we should verify that we successfully updated the
	 * inode before freeing blocks below.
	 */
	if (inode_dirty || (inode_sector_adjust != 0)) {
		ip->i_blocks -= inode_sector_adjust;
		ASSERT((unsigned)ip->i_blocks <= INT_MAX);
		TRANS_INODE(ufsvfsp, ip);
		ip->i_flag |= IUPD | ICHG | IATTCHG;
		ip->i_seq++;
		if (!TRANS_ISTRANS(ufsvfsp))
			ufs_iupdat(ip, I_SYNC);
	}

	/*
	 * Now we go through and actually free the blocks, but only if we
	 * successfully removed the pointers to them.
	 */
	if (unlink_failed)
		return;

	for (idx = 0; idx < block_count; idx++) {
		entry = &table[idx];
		free(ip, entry->this_block, entry->block_size,
		    entry->usage_flags);
	}
}

/*
 * Find the next hole or data block in file starting at *off
 * Return found offset in *off, which can be less than the
 * starting offset if not block aligned.
 * This code is based on bmap_read().
 *
 * ip	inode to search; the caller must hold ip->i_contents
 * hole	B_TRUE to look for the next unallocated block, B_FALSE to look
 *	for the next allocated one
 * off	in: byte offset to start from (must be below i_size);
 *	out: block-aligned byte offset of what was found (only written
 *	on success)
 *
 * Returns 0 on success.
 * Errors: ENXIO for end of file
 *         EIO for block read error.
 */
int
bmap_find(struct inode *ip, boolean_t hole, u_offset_t *off)
{
	ufsvfs_t *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	buf_t *bp[NIADDR];		/* held indirect block per level */
	int i, j;
	int shft;			/* we maintain sh = 1 << shft */
	int nindirshift, nindiroffset;
	daddr_t	ob, nb, tbn, lbn, skip;
	daddr32_t *bap;
	u_offset_t isz = (offset_t)ip->i_size;
	int32_t bs = fs->fs_bsize; /* file system block size */
	int32_t nindir = fs->fs_nindir;
	dev_t dev;
	int error = 0;
	daddr_t limits[NIADDR];		/* first lbn of each indirect tree */

	ASSERT(*off < isz);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbn = (daddr_t)lblkno(fs, *off);
	ASSERT(lbn >= 0);

	for (i = 0; i < NIADDR; i++)
		bp[i] = NULL;

	/*
	 * The first NDADDR blocks are direct blocks.
	 */
	if (lbn < NDADDR) {
		for (; lbn < NDADDR; lbn++) {
			if ((hole && (ip->i_db[lbn] == 0)) ||
			    (!hole && (ip->i_db[lbn] != 0))) {
				goto out;
			}
		}
		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
			goto out;
	}

	nindirshift = ufsvfsp->vfs_nindirshift;
	nindiroffset = ufsvfsp->vfs_nindiroffset;
	dev = ip->i_dev;

	/*
	 * Set up limits array: limits[k] is the first logical block
	 * mapped through i_ib[k].
	 */
	for (limits[0] = NDADDR, j = 1; j  < NIADDR; j++)
		limits[j] = limits[j-1] + (1ULL << (nindirshift * j));

loop:
	/*
	 * Determine how many levels of indirection.
	 */
	shft = 0;				/* sh = 1 */
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t sh;

		shft += nindirshift;		/* sh *= nindir */
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= sh;
	}
	if (j == 0) {
		/* must have passed end of file */
		ASSERT(((u_offset_t)lbn << fs->fs_bshift) >= isz);
		goto out;
	}

	/*
	 * Fetch the first indirect block.
	 */
	nb = ip->i_ib[NIADDR - j];
	if (nb == 0) {
		if (hole) {
			lbn = limits[NIADDR - j];
			goto out;
		}
		if (j == 1) {
			/*
			 * The deepest indirection tree is entirely
			 * unallocated and there is no further tree to
			 * move on to (limits[] has no entry past this
			 * one), so no data follows.
			 */
			error = ENXIO;
			goto out;
		}
		/* Skip the whole empty tree and try the next one. */
		lbn = limits[NIADDR - j + 1];
		if ((u_offset_t)lbn << fs->fs_bshift >= isz)
			goto out;
		goto loop;
	}

	/*
	 * Fetch through the indirect blocks.  The loop stops either at
	 * the lowest level or at the first empty slot on the way down;
	 * `skip' is the number of file blocks one slot of the last
	 * block read stands for.
	 */
	for (; ((j <= NIADDR) && (nb != 0)); j++) {
		ob = nb;
		/*
		 * if there's a different block at this level then release
		 * the old one and in with the new.
		 */
		if ((bp[j-1] == NULL) || bp[j-1]->b_blkno != fsbtodb(fs, ob)) {
			if (bp[j-1] != NULL)
				brelse(bp[j-1]);
			bp[j-1] = UFS_BREAD(ufsvfsp, dev, fsbtodb(fs, ob), bs);
			if (bp[j-1]->b_flags & B_ERROR) {
				error = EIO;
				goto out;
			}
		}
		bap = bp[j-1]->b_un.b_daddr;

		shft -= nindirshift;		/* sh / nindir */
		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
		nb = bap[i];
		skip = 1LL << (nindirshift * (NIADDR - j));
	}

	/*
	 * When looking for data and the descent stopped at an empty slot
	 * above the lowest level, the scan below steps over whole
	 * subtrees.  Rewind lbn to the first block covered by the current
	 * slot so that each step lands exactly on the start of the next
	 * subtree; otherwise the blocks at the front of the next populated
	 * subtree would never be examined.
	 */
	if (!hole && skip > 1)
		lbn -= tbn & (skip - 1);

	/*
	 * Scan through the blocks in this array.
	 */
	for (; i < nindir; i++, lbn += skip) {
		if (hole && (bap[i] == 0))
			goto out;
		if (!hole && (bap[i] != 0)) {
			if (skip == 1) {
				/* we're at the lowest level */
				goto out;
			} else {
				/* descend into the populated subtree */
				goto loop;
			}
		}
	}
	if (((u_offset_t)lbn << fs->fs_bshift) < isz)
		goto loop;
out:
	for (i = 0; i < NIADDR; i++) {
		if (bp[i])
			brelse(bp[i]);
	}
	if (error == 0) {
		if (((u_offset_t)lbn << fs->fs_bshift) >= isz) {
			error = ENXIO;
		} else {
			/* success */
			*off = (u_offset_t)lbn << fs->fs_bshift;
		}
	}
	return (error);
}