/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <inet/nca/ncadoorhdr.h>
#include <inet/nca/ncaio.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *);
extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
		ssize32_t *);
extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
		int);

/*
 * kstrwritemp() has semantics very similar to those of strwrite().
 * The main difference is that it obtains its mblks from the caller
 * and does not copy from user buffers to kernel buffers the way
 * strwrite() does.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use
 * the synchronous stream interface, since the synchronous stream
 * interface implies copying.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
	struct stdata *stp;
	struct queue *wqp;
	char waitflag;
	int tempmode;
	int error = 0;
	int done = 0;
	struct sonode *so;
	boolean_t direct;

	ASSERT(vp->v_stream);
	stp = vp->v_stream;

	so = VTOSO(vp);
	direct = (so->so_state & SS_DIRECT);

	/*
	 * This is the sockfs direct fast path. canputnext() need
	 * not be accurate so we don't grab the sd_lock here. If
	 * we get flow-controlled, we grab sd_lock just before the
	 * do..while loop below to emulate what strwrite() does.
	 */
	wqp = stp->sd_wrq;
	if (canputnext(wqp) && direct &&
	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
		return (sostream_direct(so, NULL, mp, CRED()));
	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
		/* Fast check of flags before acquiring the lock */
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);
		if (error != 0) {
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
			}
			return (error);
		}
	}

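	/*
	 * Flow-controlled or non-direct case: emulate strwrite() by
	 * stripping FNDELAY when the stream head uses the old NDELAY
	 * semantics and waiting below for flow control to ease before
	 * doing the putnext().
	 */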
	waitflag = WRITEWAIT;
	if (stp->sd_flag & OLDNDELAY)
		tempmode = fmode & ~FNDELAY;
	else
		tempmode = fmode;

	mutex_enter(&stp->sd_lock);
	do {
		if (canputnext(wqp)) {
			mutex_exit(&stp->sd_lock);
			putnext(wqp, mp);
			return (0);
		}
		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
		    &done);
	} while (error == 0 && !done);

	mutex_exit(&stp->sd_lock);
	/*
	 * EAGAIN tells the application to try again. ENOMEM
	 * is returned only when an allocation request exceeds the
	 * physical limits of the system, which cannot be the case
	 * here, so map ENOMEM to EAGAIN.
	 */
	if (error == ENOMEM)
		error = EAGAIN;
	return (error);
}

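/*
 * Number of sendfilevec entries copied in from user space and processed
 * per iteration of the chunk loops below.
 */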
#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64-bit offset support for 32-bit applications, running on either a
 * 64-bit or a 32-bit kernel. A 32-bit application still cannot transfer
 * more than 2GB of data per call.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do the same checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0)
			continue;

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
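		/*
		 * SFV_FD_SELF: the data for this entry resides in the
		 * caller's address space and sfv_off is its user address,
		 * so write it to "vp" directly with a user-space uio.
		 */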
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t	*ffp;
			vnode_t	*readvp;
			int	readflg = 0;
			size_t	size;
			caddr_t	ptr;

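			/*
			 * The source is another file descriptor: read it
			 * into a kernel staging buffer and write the buffer
			 * out to "vp" one blocksize-sized chunk at a time.
			 */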
			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * There is no point in reading from and writing to
			 * the same vp when both are regular files, so reject
			 * that case. readvp is not locked, but since we got
			 * it from an open file descriptor its contents remain
			 * valid for the duration of the access.
			 */
			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_SLEEP);

			while (sfv_len > 0) {
				size_t	iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
	size32_t *xferred, int fildes)
{
	int			rwflag;
	u_offset_t		fileoff;
	int			copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;
	int osfvcnt;

	rwflag = 1;
	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, rwflag, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;
	osfvcnt = sfvcnt;

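	/*
	 * Copy the vector in and process it SEND_MAX_CHUNK entries at
	 * a time, so an arbitrarily large sfvcnt never needs more than
	 * a fixed amount of kernel stack.
	 */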
	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		/*
		 * Optimize the common case of a single regular
		 * file being sent over a socket.
		 */
		if (vp->v_type == VSOCK && osfvcnt == 1 &&
		    sfv->sfv_fd != SFV_FD_SELF) {
			file_t *rfp;
			vnode_t *rvp;

			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
				error = EBADF;
				break;
			}
			if ((rfp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				error = EBADF;
				break;
			}
			rvp = rfp->f_vnode;
			if (rvp->v_type == VREG) {
				error = sosendfile64(fp, rfp, sfv, &count);
				break;
			}
			releasef(sfv->sfv_fd);
		}
		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, rwflag, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif

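/*
 * Send a relatively small amount of data over a socket. The data from all
 * of the vector entries is gathered into a single chain of maxblk-sized
 * mblks, each with sd_wroff bytes of headroom for protocol headers, and
 * the whole chain is handed to the transport with one kstrwritemp() call.
 * This keeps latency low for small transfers.
 */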
int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
		MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t	iov_len;
	mblk_t  *head, *tmp;
	size_t  size = total_size;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

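	/*
	 * Pre-allocate the first mblk, reserving sd_wroff bytes of
	 * headroom so the transport can prepend its headers without
	 * allocating another mblk.
	 */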
	wroff = (int)vp->v_stream->sd_wroff;
	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + wroff, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do the same checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if (sfv_len < 0) {
			freemsg(head);
			return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

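		/*
		 * Gather the data for this entry into the mblk chain,
		 * allocating a fresh maxblk-sized mblk whenever the
		 * current one fills up.
		 */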
		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + wroff, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t	*ffp;
			vnode_t	*readvp;
			int	readflg = 0;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * There is no point in reading from and writing to
			 * the same vp when both are regular files, so reject
			 * that case. readvp is not locked, but since we got
			 * it from an open file descriptor its contents remain
			 * valid for the duration of the access.
			 */

			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + wroff, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = kstrwritemp(vp, head, fflag);
	if (error != 0) {
		freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}


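/*
 * General-purpose sendfilev() path: process each vector entry in turn.
 * Data read from another file descriptor is moved in blocksize-sized
 * chunks; each chunk is sent as an mblk via kstrwritemp() when the
 * destination is a socket, or written with VOP_WRITE() when it is a
 * regular file.
 */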
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
		MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t	*dmp = NULL;
	char	*buf = NULL;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do the same checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;

			if (vp->v_type == VSOCK) {

				/*
				 * Optimize for the socket case
				 */
				int wroff = (int)vp->v_stream->sd_wroff;

				dmp = allocb(sfv_len + wroff, BPRI_HI);
				if (dmp == NULL)
					return (ENOMEM);
				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
				error = uiomove((caddr_t)dmp->b_wptr,
				    sfv_len, UIO_WRITE, &auio);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				dmp->b_wptr += sfv_len;
				error = kstrwritemp(vp, dmp, fflag);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				ttolwp(curthread)->lwp_ru.ioch +=
				    (ulong_t)sfv_len;
				*count += sfv_len;
			} else {
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			file_t	*ffp;
			vnode_t	*readvp;
			int	readflg = 0;
			size_t	size;
			caddr_t	ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * There is no point in reading from and writing to
			 * the same vp when both are regular files, so reject
			 * that case. readvp is not locked, but since we got
			 * it from an open file descriptor its contents remain
			 * valid for the duration of the access.
			 */
			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

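			/*
			 * For a regular-file destination, stage the data
			 * through a single reusable kernel buffer; for a
			 * socket, each chunk is read straight into a
			 * freshly allocated mblk instead.
			 */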
			if (vp->v_type != VSOCK) {
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			}

			while (sfv_len > 0) {
				size_t	iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = kstrwritemp(vp, dmp, fflag);
					if (error != 0) {
						freeb(dmp);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

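/*
 * sendfilev() system call handler. Validates the destination descriptor,
 * then copies the user's vector in from user space SEND_MAX_CHUNK entries
 * at a time and dispatches each piece to the appropriate chunk routine
 * above.
 */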
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size = 0;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		/* sendfile not supported for SCTP */
		if (so->so_protocol == IPPROTO_SCTP) {
			error = EPROTONOSUPPORT;
			goto err;
		}
		is_sock = B_TRUE;
		switch (so->so_family) {
		case AF_NCA:
		case AF_INET:
		case AF_INET6:
			/*
			 * Make the same checks as SOP_WRITE() does.
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
				goto err;
			}
			if (so->so_type != SOCK_STREAM) {
				error = EOPNOTSUPP;
				goto err;
			}

			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
			    (SS_ISCONNECTED|SS_ISBOUND)) {
				error = ENOTCONN;
				goto err;
			}

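			/*
			 * Pick the mblk size used by sendvec_small_chunk():
			 * the TCP MSS when the direct sockfs/TCP path is
			 * in use, otherwise the stream head's default
			 * maximum block size.
			 */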
			if ((so->so_state & SS_DIRECT) &&
			    (so->so_priv != NULL)) {
				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
			} else {
				maxblk = (int)vp->v_stream->sd_maxblk;
			}
			break;
		default:
			error = EAFNOSUPPORT;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV :
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64 :
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default :
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need their sendfilevec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
					(off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
			}
		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
			}
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk() and
		 * sendvec_chunk() depends on several things:
		 *
		 * i) latency matters most for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, use sendvec_small_chunk, which creates
		 * maxblk-sized mblks, chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' bytes
		 * of space for the headers in each mblk.
		 *
		 * ii) for a total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it is probably real file data that
		 * dominates. So it is better to use sendvec_chunk, because
		 * performance suffers badly if we don't do pagesize reads.
		 * sendvec_chunk does pagesize reads and writes them
		 * in pagesize mblks to TCP.
		 *
		 * Side note: a write to a file has not been optimized.
		 * Future zero-copy code will plug into sendvec_chunk
		 * only, because zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note: if the socket has NL7C enabled, call NL7C's
		 * nl7c_sendfilev() function to give NL7C a chance to
		 * copy the vec for caching, then continue processing
		 * as normal.
		 */
		if (is_sock) {
			switch (so->so_family) {
			case AF_INET:
			case AF_INET6:
				if (so->so_nl7c_flags != 0) {
					nl7c_sendfilev(so, fileoff,
					    sfv, copy_cnt);
				}
				if (total_size <= (4 * maxblk))
					error = sendvec_small_chunk(fp,
					    &fileoff, sfv, copy_cnt,
					    total_size, maxblk, &count);
				else
					error = sendvec_chunk(fp, &fileoff,
					    sfv, copy_cnt, &count);
				break;
			case AF_NCA:
				error = nca_sendfilev(fp, sfv, copy_cnt,
				    &count);
				break;
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32)
		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
		    (copy_cnt * sizeof (ksendfilevec32_t)));
	else
#endif
		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;


	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}