fs/ufs/ufs_directio.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>		/* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

/*
 * Private buffer of zeroes and its length in bytes.  The buffer is
 * allocated (zero-filled) by ufs_directio_init() and is the data source
 * that directio_hole() copies out when a direct read lands on a hole.
 */
static void	*ufs_directio_zero_buf;
static int	ufs_directio_zero_len	= 8192;

/*
 * Global on/off switch.  ufs_directio_read() and ufs_directio_write()
 * return immediately with DIRECTIO_FAILURE status when this is 0.
 */
int	ufs_directio_enabled = 1;	/* feature is enabled */

/*
 * for kstats reader
 *
 * Named counters exported through the "ufs"/"directio" kstat created in
 * ufs_directio_init().  The number of entries handed to kstat_create() is
 * computed from sizeof this structure, so it must contain nothing but
 * kstat_named_t members, and the initializer below must list them in
 * declaration order.
 */
struct ufs_directio_kstats {
	kstat_named_t	logical_reads;	/* ufs_directio_read() direct passes */
	kstat_named_t	phys_reads;	/* read bufs issued by directio_start */
	kstat_named_t	hole_reads;	/* calls to directio_hole() */
	kstat_named_t	nread;		/* bytes read (physical + hole) */
	kstat_named_t	logical_writes;	/* ufs_directio_write() direct passes */
	kstat_named_t	phys_writes;	/* write bufs issued by directio_start */
	kstat_named_t	nwritten;	/* bytes handed to write bufs */
	kstat_named_t	nflushes;	/* cached-page invalidations performed */
} ufs_directio_kstats = {
	{ "logical_reads",	KSTAT_DATA_UINT64 },
	{ "phys_reads",		KSTAT_DATA_UINT64 },
	{ "hole_reads",		KSTAT_DATA_UINT64 },
	{ "nread",		KSTAT_DATA_UINT64 },
	{ "logical_writes",	KSTAT_DATA_UINT64 },
	{ "phys_writes",	KSTAT_DATA_UINT64 },
	{ "nwritten",		KSTAT_DATA_UINT64 },
	{ "nflushes",		KSTAT_DATA_UINT64 },
};

/* kstat handle returned by kstat_create(); NULL if creation failed */
kstat_t	*ufs_directio_kstatsp;

/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 * per request
 *
 * One directio_buf is allocated per request issued by directio_start()
 * and freed by directio_wait_one() once that request has completed.
 */
struct directio_buf {
	struct directio_buf	*next;	/* previously issued request, or NULL */
	char		*addr;		/* data address given to directio_start */
	size_t		nbytes;		/* length of this request in bytes */
	struct buf	buf;		/* embedded buf; bioinit'ed by the cache */
};
/* object cache for struct directio_buf, created by directio_bufs_init() */
static struct kmem_cache *directio_buf_cache;


/*
 * kmem cache constructor for struct directio_buf: runs bioinit() on the
 * embedded buf header.  Always reports success (0).  The cdrarg and
 * kmflags arguments are not used.
 */
/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
	struct directio_buf	*entry = dbp;

	bioinit(&entry->buf);
	return (0);
}

/*
 * kmem cache destructor for struct directio_buf: undoes the constructor
 * by running biofini() on the embedded buf header.  cdrarg is not used.
 */
/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
	struct directio_buf	*entry = dbp;

	biofini(&entry->buf);
}

void
directio_bufs_init(void)
{
	directio_buf_cache = kmem_cache_create("directio_buf_cache",
		sizeof (struct directio_buf), 0,
		directio_buf_constructor, directio_buf_destructor,
		NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
	/*
	 * kstats
	 */
	ufs_directio_kstatsp = kstat_create("ufs", 0,
	    "directio", "ufs", KSTAT_TYPE_NAMED,
	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
	if (ufs_directio_kstatsp) {
		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
		kstat_install(ufs_directio_kstatsp);
	}
	/*
	 * kzero is broken so we have to use a private buf of zeroes
	 */
	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
	directio_bufs_init();
}

/*
 * Wait for a single direct I/O request to complete, fold its outcome into
 * the caller's running byte count, and release its directio_buf.
 *
 *	dbp	  - request to wait for; freed before returning
 *	bytes_iop - running count of bytes transferred.  On success the
 *		    request's b_bcount is added to it.  On failure it is
 *		    overwritten: with (b_bcount - b_resid) when the buf
 *		    reports a non-zero b_resid, otherwise with 0.
 *
 * Returns the value of biowait() for the request.
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
	buf_t	*iobuf = &dbp->buf;
	int	status;

	status = biowait(iobuf);

	/*
	 * The byte count is later turned into a residual for the caller,
	 * approximated as everything following the first failed I/O.
	 * b_resid is only trusted when it is non-zero, because it is
	 * unclear how reliably disk drivers maintain it.
	 */
	if (status == 0) {
		*bytes_iop += iobuf->b_bcount;
	} else if (iobuf->b_resid) {
		*bytes_iop = iobuf->b_bcount - iobuf->b_resid;
	} else {
		*bytes_iop = 0;
	}

	/*
	 * Clear the per-request flags and hand the header back to the cache.
	 */
	iobuf->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
	kmem_cache_free(directio_buf_cache, dbp);

	return (status);
}

/*
 * Tunable: when non-zero, directio_wait() clears the current thread's
 * t_kpri_req while it sleeps and restores it afterwards (see below).
 */
uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */

/*
 * Wait for all of the direct IO operations to finish
 *
 *	tail	  - most recently issued request; each request's ``next''
 *		    points at the one issued before it.  May be NULL.
 *	bytes_iop - running byte count maintained by directio_wait_one()
 *
 * Every request on the list is waited for and freed, even after a
 * failure.  Returns the first non-zero error encountered while walking
 * the list (i.e. the error of the latest-issued failing request), or 0.
 */
static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
	int	error = 0, newerror;
	struct directio_buf	*dbp;
	uint_t	kpri_req_save = 0;
	/*
	 * Read the tunable exactly once.  It is a writable global, and this
	 * function can sleep for a long time between the save and the
	 * restore; re-reading it could restore a value that was never saved
	 * (or skip restoring one that was).
	 */
	const uint32_t	drop_kpri = ufs_directio_drop_kpri;

	/*
	 * The linked list of directio buf structures is maintained
	 * in reverse order (tail->last request->penultimate request->...)
	 */
	/*
	 * This is the k_pri_req hack. Large numbers of threads
	 * sleeping with kernel priority will cause scheduler thrashing
	 * on an MP machine. This can be seen running Oracle using
	 * directio to ufs files. Sleep at normal priority here to
	 * more closely mimic physio to a device partition. This
	 * workaround is disabled by default as a niced thread could
	 * be starved from running while holding i_rwlock and i_contents.
	 */
	if (drop_kpri) {
		kpri_req_save = curthread->t_kpri_req;
		curthread->t_kpri_req = 0;
	}
	while ((dbp = tail) != NULL) {
		/* dbp is freed by directio_wait_one(); fetch next first */
		tail = dbp->next;
		newerror = directio_wait_one(dbp, bytes_iop);
		if (error == 0)
			error = newerror;
	}
	if (drop_kpri)
		curthread->t_kpri_req = kpri_req_save;
	return (error);
}
/*
 * Initiate direct IO request
 *
 * Allocates a directio_buf, fills in its embedded buf header and hands it
 * to the snapshot driver or the block device strategy routine.  The
 * request is pushed onto the front of the caller's list so that it can be
 * reaped later with directio_wait().
 *
 *	ufsvfsp	- file system the I/O belongs to (I/O timestamp, snapshot)
 *	dev	- device to issue the I/O against
 *	nbytes	- transfer length in bytes
 *	offset	- byte offset on the device; converted with btodt()
 *	addr	- data address for the transfer
 *	rw	- S_WRITE for a read from disk into memory, anything else
 *		  for a write from memory to disk (see note in the body)
 *	procp	- stored in b_proc
 *	tailp	- in/out: head of the reverse-ordered request list
 *	pplist	- shadow page list for addr, or NULL
 */
static void
directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes,
	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
	struct directio_buf **tailp, page_t **pplist)
{
	buf_t *bp;
	struct directio_buf *dbp;

	/*
	 * Allocate a directio buf header
	 *   Note - list is maintained in reverse order.
	 *   directio_wait_one() depends on this fact when
	 *   adjusting the ``bytes_io'' param. bytes_io
	 *   is used to compute a residual in the case of error.
	 */
	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
	dbp->next = *tailp;
	*tailp = dbp;

	/*
	 * Initialize buf header
	 */
	dbp->addr = addr;
	dbp->nbytes = nbytes;
	bp = &dbp->buf;
	bp->b_edev = dev;
	bp->b_lblkno = btodt(offset);
	bp->b_bcount = nbytes;
	bp->b_un.b_addr = addr;
	bp->b_proc = procp;

	/*
	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
	 * will B_READ data from the filesystem and S_WRITE it into
	 * the user's buffer; a write(2) will S_READ data from the
	 * user's buffer and B_WRITE it to the filesystem.
	 */
	if (rw == S_WRITE) {
		bp->b_flags = B_BUSY | B_PHYS | B_READ;
		ufs_directio_kstats.phys_reads.value.ui64++;
		ufs_directio_kstats.nread.value.ui64 += nbytes;
	} else {
		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
		ufs_directio_kstats.phys_writes.value.ui64++;
		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
	}
	bp->b_shadow = pplist;
	if (pplist != NULL)
		bp->b_flags |= B_SHADOW;

	/*
	 * Issue I/O request.
	 */
	ufsvfsp->vfs_iotstamp = lbolt;
	if (ufsvfsp->vfs_snapshot)
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	/*
	 * Per-lwp block I/O accounting follows the direction of the disk
	 * transfer, not the direction of the memory access: S_WRITE is a
	 * B_READ (block input), everything else is a B_WRITE (block output).
	 */
	if (rw == S_WRITE)
		lwp_stat_update(LWP_STAT_INBLK, 1);
	else
		lwp_stat_update(LWP_STAT_OUBLK, 1);
}

/*
 * Direct write statistics, maintained by ufs_directio_write().  Only
 * ufs_cur_writes is updated atomically; the others are plain increments
 * and stores.
 */
uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done w/ lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 * When non-zero, every rewrite is subjected to the single-request check
 * in ufs_directio_write() as if FDSYNC had been set.
 */
uint32_t	ufs_force_posix_sdi = 0;

/*
 * Direct Write
 *
 * Try to write the data described by arg_uio straight to the device
 * blocks backing ip, bypassing the page cache.
 *
 *	ip	- inode being written
 *	arg_uio	- caller's uio.  Its uio_resid is rewritten once direct I/O
 *		  has actually been attempted.
 *	ioflag	- I/O flags; only FDSYNC is examined here
 *	rewrite	- non-zero when no block allocation is to be done; the
 *		  allocation loop is skipped and i_contents may be held
 *		  shared
 *	cr	- credentials handed to bmap_write(), ufs_itrunc() and
 *		  VOP_PUTPAGE()
 *	statusp	- out: DIRECTIO_FAILURE if this routine declined to do the
 *		  write, DIRECTIO_SUCCESS if it attempted it
 *
 * Returns 0 whenever *statusp is DIRECTIO_FAILURE.  When *statusp is
 * DIRECTIO_SUCCESS the return value is 0 or the first error hit while
 * locking pages, mapping blocks or waiting for the I/O.
 *
 * This routine drops and re-takes ip->i_contents in several places, so it
 * presumably must be entered with i_contents held - TODO confirm the exact
 * lock state expected from the caller.
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
	cred_t *cr, int *statusp)
{
	long		resid, bytes_written;
	u_offset_t	size, uoff;
	uio_t		*uio = arg_uio;
	rlim64_t	limit = uio->uio_llimit;
	int		on, n, error, newerror, len, has_holes;
	daddr_t		bn;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;
	int		exclusive, ncur, bmap_peek;
	uio_t		copy_uio;
	iovec_t		copy_iov;
	char		*copy_base;
	long		copy_resid;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;

	/*
	 * beyond limit
	 */
	if (uoff + resid > limit)
		return (0);

	/*
	 * must be sector aligned
	 */
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);

	/*
	 * SHOULD WE DO DIRECT IO?
	 *
	 * Remember the original file size so that a failed or partial
	 * write can be rolled back.  has_holes is tri-state: -1 means
	 * "not yet determined".
	 */
	size = ip->i_size;
	has_holes = -1;

	/*
	 * only on regular files; no metadata
	 */
	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
		return (0);

	/*
	 * Synchronous, allocating writes run very slow in Direct-Mode
	 * 	XXX - can be fixed with bmap_write changes for large writes!!!
	 *	XXX - can be fixed for updates to "almost-full" files
	 *	XXX - WARNING - system hangs if bmap_write() has to
	 * 			allocate lots of pages since pageout
	 * 			suspends on locked inode
	 */
	if (!rewrite && (ip->i_flag & ISYNC)) {
		if ((uoff + resid) > size)
			return (0);
		has_holes = bmap_has_holes(ip);
		if (has_holes)
			return (0);
	}

	/*
	 * Each iovec must be short aligned and sector aligned.  If
	 * one is not, then kmem_alloc a new buffer and copy all of
	 * the smaller buffers into the new buffer.  This new
	 * buffer will be short aligned and sector aligned.
	 *
	 * nbytes is borrowed here as the iovec loop counter.
	 *
	 * NOTE(review): uiomove() is given the caller's uio, so it
	 * presumably advances it.  If so, both the failure return inside
	 * this loop and every later "goto errout" hand a partly or fully
	 * consumed uio back to the non-direct path - confirm against the
	 * caller.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
		    (intptr_t)(iov->iov_base) & 1) {
			copy_resid = uio->uio_resid;
			/* NOSLEEP: simply decline directio if memory is short */
			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
			if (copy_base == NULL)
				return (0);
			copy_iov.iov_base = copy_base;
			copy_iov.iov_len = copy_resid;
			copy_uio.uio_iov = &copy_iov;
			copy_uio.uio_iovcnt = 1;
			copy_uio.uio_segflg = UIO_SYSSPACE;
			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
			copy_uio.uio_loffset = uio->uio_loffset;
			copy_uio.uio_resid = uio->uio_resid;
			copy_uio.uio_llimit = uio->uio_llimit;
			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
			if (error) {
				kmem_free(copy_base, copy_resid);
				return (0);
			}
			/* from here on, work from the bounce buffer */
			uio = &copy_uio;
			break;
		}
		iov++;
	}

	/*
	 * From here on down, all error exits must go to errout and
	 * not simply return a 0.
	 */

	/*
	 * DIRECTIO
	 */

	fs = ip->i_fs;

	/*
	 * POSIX check. If attempting a concurrent re-write, make sure
	 * that this will be a single request to the driver to meet
	 * POSIX synchronous data integrity requirements.
	 */
	bmap_peek = 0;
	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
		int upgrade = 0;

		/* check easy conditions first */
		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
			upgrade = 1;
		} else {
			/* now look for contiguous allocation */
			len = (ssize_t)blkroundup(fs, resid);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error || bn == UFS_HOLE || len == 0)
				goto errout;
			/* save a call to bmap_read later */
			bmap_peek = 1;
			if (len < resid)
				upgrade = 1;
		}
		if (upgrade) {
			/*
			 * Cannot be done as one request: take i_contents
			 * exclusively (drop, then re-acquire as writer).
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ufs_posix_hits++;
		}
	}


	/*
	 * allocate space
	 */

	/*
	 * If attempting a re-write, there is no allocation to do.
	 * bmap_write would trip an ASSERT if i_contents is held shared.
	 */
	if (rewrite)
		goto skip_alloc;

	/*
	 * Walk the target range one file system block at a time.  uoff and
	 * resid are consumed here and re-derived from the uio afterwards.
	 */
	do {
		on = (int)blkoff(fs, uoff);
		n = (int)MIN(fs->fs_bsize - on, resid);
		if ((uoff + n) > ip->i_size) {
			/* extending the file */
			error = bmap_write(ip, uoff, (int)(on + n),
			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
			    NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
			if (error)
				break;
			ip->i_size = uoff + n;
			ip->i_flag |= IATTCHG;
		} else if (n == MAXBSIZE) {
			/* overwriting a whole MAXBSIZE block */
			error = bmap_write(ip, uoff, (int)(on + n),
			    BI_ALLOC_ONLY, NULL, cr);
			/* Caller is responsible for updating i_seq if needed */
		} else {
			/* partial block inside the file; fill any hole */
			if (has_holes < 0)
				has_holes = bmap_has_holes(ip);
			if (has_holes) {
				uint_t	blk_size;
				u_offset_t offset;

				offset = uoff & (offset_t)fs->fs_bmask;
				blk_size = (int)blksize(fs, ip,
				    (daddr_t)lblkno(fs, offset));
				error = bmap_write(ip, uoff, blk_size,
				    BI_NORMAL, NULL, cr);
				/*
				 * Caller is responsible for updating
				 * i_seq if needed
				 */
			} else
				error = 0;
		}
		if (error)
			break;
		uoff += n;
		resid -= n;
		/*
		 * if file has grown larger than 2GB, set flag
		 * in superblock if not already set
		 */
		if ((ip->i_size > MAXOFF32_T) &&
		    !(fs->fs_flags & FSLARGEFILES)) {
			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
			mutex_enter(&ufsvfsp->vfs_lock);
			fs->fs_flags |= FSLARGEFILES;
			ufs_sbwrite(ufsvfsp);
			mutex_exit(&ufsvfsp->vfs_lock);
		}
	} while (resid);

	if (error) {
		/*
		 * restore original state
		 */
		if (resid) {
			/* nothing to undo if the file never grew */
			if (size == ip->i_size)
				goto errout;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
		/*
		 * try non-directio path
		 */
		goto errout;
	}
skip_alloc:

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	exclusive = rw_write_held(&ip->i_contents);
	if (vn_has_cached_data(vp)) {
		if (!exclusive) {
			/*
			 * Still holding i_rwlock, so no allocations
			 * can happen after dropping contents.
			 */
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
		}
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		/*
		 * Pages could not all be invalidated; give up on directio.
		 * Note that in the !exclusive case i_contents is still held
		 * as writer on this exit.
		 */
		if (vn_has_cached_data(vp))
			goto errout;
		if (!exclusive)
			rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}

	/*
	 * Direct Writes
	 */

	if (!exclusive) {
		ufs_shared_writes++;
		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
		/* high-water mark; this update is not itself atomic */
		if (ncur > ufs_maxcur_writes)
			ufs_maxcur_writes = ncur;
	}

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}
	/* committed: from here on the caller must not retry via the cache */
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	resid = uio->uio_resid;
	bytes_written = 0;
	ufs_directio_kstats.logical_writes.value.ui64++;
	/*
	 * Outer loop: one pass per locked-down chunk of the current iovec.
	 * Inner loop: one request per contiguous run of disk blocks.
	 */
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			/* this iovec is used up; move to the next one */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to Lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

		if (error)
			break;

		/* pglck_len is consumed below; keep the size for the unlock */
		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			if (!bmap_peek) {

				/*
				 * Re-adjust number of bytes to contiguous
				 * range. May have already called bmap_read
				 * in the case of a concurrent rewrite.
				 */
				len = (ssize_t)blkroundup(fs, nbytes);
				error = bmap_read(ip, uoff, &bn, &len);
				if (error)
					break;
				/*
				 * NOTE(review): this leaves the inner loop
				 * with error == 0 and resid unchanged, so the
				 * outer loop will come back to the same
				 * offset - confirm a hole cannot be seen here.
				 */
				if (bn == UFS_HOLE || len == 0)
					break;
			}
			nbytes = (size_t)MIN(nbytes, len);
			/* the peeked mapping is only valid for the first pass */
			bmap_peek = 0;

			/*
			 * Get the pagelist pointer for this offset to be
			 * passed to directio_start.
			 */

			if (pplist != NULL)
				spplist = pplist +
				btop((uintptr_t)iov->iov_base -
					((uintptr_t)pglck_base & PAGEMASK));
			else
				spplist = NULL;

			/*
			 * Kick off the direct write requests
			 */
			directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn),
				iov->iov_base, S_READ, procp, &tail, spplist);

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests
		 */
		newerror = directio_wait(tail, &bytes_written);

		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

	}

	if (!exclusive) {
		atomic_add_32(&ufs_cur_writes, -1);
		/*
		 * If this write was done shared, readers may
		 * have pulled in unmodified pages. Get rid of
		 * these potentially stale pages.
		 */
		if (vn_has_cached_data(vp)) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
				B_INVAL, cr);
			ufs_directio_kstats.nflushes.value.ui64++;
			rw_downgrade(&ip->i_contents);
		}
	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-writable byte.
	 */
	if (error == 0)
		error = newerror;
	if (error)
		resid = uio->uio_resid - bytes_written;
	arg_uio->uio_resid = resid;

	if (!rewrite) {
		ip->i_flag |= IUPD | ICHG;
		/* Caller will update i_seq */
		TRANS_INODE(ip->i_ufsvfs, ip);
	}
	/*
	 * If there is a residual; adjust the EOF if necessary.  The file
	 * is cut back to the larger of its original size and the offset
	 * actually reached.
	 */
	if (resid) {
		if (size != ip->i_size) {
			if (uio->uio_loffset > size)
				size = uio->uio_loffset;
			(void) ufs_itrunc(ip, size, 0, cr);
		}
	}

	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (error);

errout:
	/*
	 * Declined after the bounce buffer may have been set up: release it
	 * and report "not done" (*statusp is still DIRECTIO_FAILURE).
	 */
	if (uio == &copy_uio)
		kmem_free(copy_base, copy_resid);

	return (0);
}
/*
 * Direct read of a hole
 *
 * Copies nbytes of zeroes to the address at the front of uio's current
 * iovec, in pieces no larger than the private zero buffer.  A private
 * uio/iovec pair is used for the copy, so the caller's uio is not advanced
 * here.  Returns 0, or the first error reported by uiomove().
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
	iovec_t		zero_iov;
	uio_t		zero_uio;
	int		chunk;
	int		error;

	ufs_directio_kstats.hole_reads.value.ui64++;
	ufs_directio_kstats.nread.value.ui64 += nbytes;

	/*
	 * Describe the destination: one segment of nbytes starting where
	 * the caller's current iovec starts, in the caller's address space.
	 */
	zero_iov.iov_base = uio->uio_iov->iov_base;
	zero_iov.iov_len = nbytes;

	zero_uio.uio_iov = &zero_iov;
	zero_uio.uio_iovcnt = 1;
	zero_uio.uio_resid = zero_iov.iov_len;
	zero_uio.uio_segflg = uio->uio_segflg;
	zero_uio.uio_extflg = uio->uio_extflg;

	/*
	 * Each pass moves at most one zero buffer's worth of data.
	 */
	for (error = 0; error == 0 && zero_uio.uio_resid; ) {
		chunk = (int)MIN(zero_iov.iov_len, ufs_directio_zero_len);
		error = uiomove(ufs_directio_zero_buf, chunk, UIO_READ,
		    &zero_uio);
	}

	return (error);
}

/*
 * Direct Read
 *
 * Try to read the data described by uio straight from the device blocks
 * backing ip, bypassing the page cache.  Holes are satisfied with zeroes
 * by directio_hole().
 *
 *	ip	- inode being read
 *	uio	- caller's uio; advanced as requests are issued, and its
 *		  uio_resid reduced by the number of bytes read
 *	cr	- credentials handed to VOP_PUTPAGE()
 *	statusp	- out: DIRECTIO_FAILURE if this routine declined to do the
 *		  read, DIRECTIO_SUCCESS if it handled it (including the
 *		  "offset at or past EOF" case, where nothing is read)
 *
 * Returns 0 whenever *statusp is DIRECTIO_FAILURE.  When *statusp is
 * DIRECTIO_SUCCESS the return value is 0 or the first error hit while
 * locking pages, mapping blocks, zero-filling or waiting for the I/O.
 *
 * This routine drops and re-takes ip->i_contents, so it presumably must be
 * entered with i_contents held - TODO confirm the exact lock state
 * expected from the caller.
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);
	/*
	 * must be short aligned and sector aligned
	 *
	 * nbytes is borrowed here as the iovec loop counter.  Unlike the
	 * write path, a misaligned iovec simply disqualifies the request.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */
	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF logic".
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		/* drop and re-acquire as writer for the invalidation */
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		/*
		 * Pages could not all be invalidated; decline.  Note that
		 * i_contents is still held as writer on this return.
		 */
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		procp = NULL;
		as = &kas;
	}

	/* committed: from here on the caller must not retry via the cache */
	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads.value.ui64++;
	/*
	 * Outer loop: one pass per locked-down chunk of the current iovec.
	 * Inner loop: one request (or one zero-fill) per contiguous run.
	 */
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;

		/*
		 * Adjust number of bytes
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			/* this iovec is used up; move to the next one */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to Lock down the largest chunk of pages possible.
		 */
		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
		error = as_pagelock(as, &pplist, pglck_base,
							pglck_len, S_WRITE);

		if (error)
			break;

		/* pglck_len is consumed below; keep the size for the unlock */
		pglck_size = pglck_len;
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				/* zero-fill up to the end of this fs block */
				nbytes = (size_t)MIN(fs->fs_bsize -
						(long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					btop((uintptr_t)iov->iov_base -
					((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 */
				directio_start(ufsvfsp, ip->i_dev, nbytes,
						ldbtob(bn), iov->iov_base,
						S_WRITE, procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests.  This runs even after an
		 * error above so that every issued request is reaped before
		 * the pages are unlocked.
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-read byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}