/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/types.h> #include <sys/t_lock.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/conf.h> #include <sys/cred.h> #include <sys/kmem.h> #include <sys/sysmacros.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/debug.h> #include <sys/errno.h> #include <sys/time.h> #include <sys/file.h> #include <sys/open.h> #include <sys/user.h> #include <sys/termios.h> #include <sys/stream.h> #include <sys/strsubr.h> #include <sys/esunddi.h> #include <sys/flock.h> #include <sys/modctl.h> #include <sys/cmn_err.h> #include <sys/vmsystm.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <netinet/in.h> #include <sys/sendfile.h> #include <sys/un.h> #include <inet/nca/ncadoorhdr.h> #include <inet/nca/ncaio.h> #include <sys/tihdr.h> #include <sys/atomic.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> #include <inet/tcp.h> extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *); extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, ssize32_t *); extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *, int); /* * kstrwritemp() has very similar semantics as that of strwrite(). * The main difference is it obtains mblks from the caller and also * does not do any copy as done in strwrite() from user buffers to * kernel buffers. * * Currently, this routine is used by sendfile to send data allocated * within the kernel without any copying. This interface does not use the * synchronous stream interface as synch. stream interface implies * copying. */ int kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) { struct stdata *stp; struct queue *wqp; char waitflag; int tempmode; int error = 0; int done = 0; struct sonode *so; boolean_t direct; ASSERT(vp->v_stream); stp = vp->v_stream; so = VTOSO(vp); direct = (so->so_state & SS_DIRECT); /* * This is the sockfs direct fast path. canputnext() need * not be accurate so we don't grab the sd_lock here. If * we get flow-controlled, we grab sd_lock just before the * do..while loop below to emulate what strwrite() does. */ wqp = stp->sd_wrq; if (canputnext(wqp) && direct && !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) { return (sostream_direct(so, NULL, mp, CRED())); } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { /* Fast check of flags before acquiring the lock */ mutex_enter(&stp->sd_lock); error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0); mutex_exit(&stp->sd_lock); if (error != 0) { if (!(stp->sd_flag & STPLEX) && (stp->sd_wput_opt & SW_SIGPIPE)) { tsignal(curthread, SIGPIPE); error = EPIPE; } return (error); } } waitflag = WRITEWAIT; if (stp->sd_flag & OLDNDELAY) tempmode = fmode & ~FNDELAY; else tempmode = fmode; mutex_enter(&stp->sd_lock); do { if (canputnext(wqp)) { mutex_exit(&stp->sd_lock); putnext(wqp, mp); return (0); } error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, &done); } while (error == 0 && !done); mutex_exit(&stp->sd_lock); /* * EAGAIN tells the application to try again. ENOMEM * is returned only if the memory allocation size * exceeds the physical limits of the system. ENOMEM * can't be true here. */ if (error == ENOMEM) error = EAGAIN; return (error); } #define SEND_MAX_CHUNK 16 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) /* * 64 bit offsets for 32 bit applications only running either on * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer * more than 2GB of data. */ int sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, int copy_cnt, ssize32_t *count) { struct vnode *vp; ushort_t fflag; int ioflag; size32_t cnt; ssize32_t sfv_len; ssize32_t tmpcount; u_offset_t sfv_off; struct uio auio; struct iovec aiov; int i, error; fflag = fp->f_flag; vp = fp->f_vnode; for (i = 0; i < copy_cnt; i++) { if (ISSIG(curthread, JUSTLOOKING)) return (EINTR); /* * Do similar checks as "write" as we are writing * sfv_len bytes into "vp". */ sfv_len = (ssize32_t)sfv->sfv_len; if (sfv_len == 0) continue; if (sfv_len < 0) return (EINVAL); if (vp->v_type == VREG) { if (*fileoff >= curproc->p_fsz_ctl) { mutex_enter(&curproc->p_lock); (void) rctl_action( rctlproc_legacy[RLIMIT_FSIZE], curproc->p_rctls, curproc, RCA_SAFE); mutex_exit(&curproc->p_lock); return (EFBIG); } if (*fileoff >= OFFSET_MAX(fp)) return (EFBIG); if (*fileoff + sfv_len > OFFSET_MAX(fp)) return (EINVAL); } tmpcount = *count + sfv_len; if (tmpcount < 0) return (EINVAL); sfv_off = sfv->sfv_off; auio.uio_extflg = UIO_COPY_DEFAULT; if (sfv->sfv_fd == SFV_FD_SELF) { aiov.iov_len = sfv_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; auio.uio_loffset = *fileoff; auio.uio_iovcnt = 1; auio.uio_resid = sfv_len; auio.uio_iov = &aiov; auio.uio_segflg = UIO_USERSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); while (sfv_len > 0) { error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); cnt = sfv_len - auio.uio_resid; sfv_len -= cnt; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; if (vp->v_type == VREG) *fileoff += cnt; *count += cnt; if (error != 0) return (error); } } else { file_t *ffp; vnode_t *readvp; int readflg = 0; size_t size; caddr_t ptr; if ((ffp = getf(sfv->sfv_fd)) == NULL) return (EBADF); if ((ffp->f_flag & FREAD) == 0) { releasef(sfv->sfv_fd); return (EBADF); } readvp = ffp->f_vnode; if (readvp->v_type != VREG) { releasef(sfv->sfv_fd); return (EINVAL); } /* * No point reading and writing to same vp, * as long as both are regular files. readvp is not * locked; but since we got it from an open file the * contents will be valid during the time of access. */ if (VN_CMP(vp, readvp)) { releasef(sfv->sfv_fd); return (EINVAL); } /* * Note: we assume readvp != vp. "vp" is already * locked, and "readvp" must not be. */ (void) VOP_RWLOCK(readvp, readflg, NULL); /* * Same checks as in pread64. */ if (sfv_off > MAXOFFSET_T) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (EINVAL); } if (sfv_off + sfv_len > MAXOFFSET_T) sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); /* Find the native blocksize to transfer data */ size = MIN(vp->v_vfsp->vfs_bsize, readvp->v_vfsp->vfs_bsize); size = sfv_len < size ? sfv_len : size; ptr = kmem_alloc(size, KM_SLEEP); while (sfv_len > 0) { size_t iov_len; iov_len = MIN(size, sfv_len); aiov.iov_base = ptr; aiov.iov_len = iov_len; auio.uio_loffset = sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = MAXOFFSET_T; auio.uio_fmode = ffp->f_flag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); /* * If read sync is not asked for, * filter sync flags */ if ((ioflag & FRSYNC) == 0) ioflag &= ~(FSYNC|FDSYNC); error = VOP_READ(readvp, &auio, ioflag, fp->f_cred, NULL); if (error) { kmem_free(ptr, size); VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (error); } /* * Check how must data was really read. * Decrement the 'len' and increment the * 'off' appropriately. */ cnt = iov_len - auio.uio_resid; if (cnt == 0) { /* * If we were reading a pipe (currently * not implemented), we may now lose * data. */ kmem_free(ptr, size); VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (EINVAL); } sfv_len -= cnt; sfv_off += cnt; aiov.iov_base = ptr; aiov.iov_len = cnt; auio.uio_loffset = *fileoff; auio.uio_resid = cnt; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); /* * Check how much data was written. Increment * the 'len' and decrement the 'off' if all * the data was not written. */ cnt -= auio.uio_resid; sfv_len += auio.uio_resid; sfv_off -= auio.uio_resid; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; if (vp->v_type == VREG) *fileoff += cnt; *count += cnt; if (error != 0) { kmem_free(ptr, size); VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (error); } } VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); kmem_free(ptr, size); } sfv++; } return (0); } ssize32_t sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, size32_t *xferred, int fildes) { int rwflag; u_offset_t fileoff; int copy_cnt; const struct ksendfilevec64 *copy_vec; struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; struct vnode *vp; int error; ssize32_t count = 0; int osfvcnt; rwflag = 1; vp = fp->f_vnode; (void) VOP_RWLOCK(vp, rwflag, NULL); copy_vec = vec; fileoff = fp->f_offset; osfvcnt = sfvcnt; do { copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); if (copyin(copy_vec, sfv, copy_cnt * sizeof (struct ksendfilevec64))) { error = EFAULT; break; } /* * Optimize the single regular file over * the socket case. */ if (vp->v_type == VSOCK && osfvcnt == 1 && sfv->sfv_fd != SFV_FD_SELF) { file_t *rfp; vnode_t *rvp; if ((rfp = getf(sfv->sfv_fd)) == NULL) { error = EBADF; break; } if ((rfp->f_flag & FREAD) == 0) { releasef(sfv->sfv_fd); error = EBADF; break; } rvp = rfp->f_vnode; if (rvp->v_type == VREG) { error = sosendfile64(fp, rfp, sfv, &count); break; } releasef(sfv->sfv_fd); } error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); if (error != 0) break; copy_vec += copy_cnt; sfvcnt -= copy_cnt; } while (sfvcnt > 0); if (vp->v_type == VREG) fp->f_offset += count; VOP_RWUNLOCK(vp, rwflag, NULL); if (copyout(&count, xferred, sizeof (count))) error = EFAULT; releasef(fildes); if (error != 0) return (set_errno(error)); return (count); } #endif int sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) { struct vnode *vp; struct uio auio; struct iovec aiov; ushort_t fflag; int ioflag; int i, error; size_t cnt; ssize_t sfv_len; u_offset_t sfv_off; #ifdef _SYSCALL32_IMPL model_t model = get_udatamodel(); u_offset_t maxoff = (model == DATAMODEL_ILP32) ? MAXOFF32_T : MAXOFFSET_T; #else const u_offset_t maxoff = MAXOFF32_T; #endif mblk_t *dmp = NULL; int wroff; int buf_left = 0; size_t iov_len; mblk_t *head, *tmp; size_t size = total_size; fflag = fp->f_flag; vp = fp->f_vnode; ASSERT(vp->v_type == VSOCK); ASSERT(maxblk > 0); wroff = (int)vp->v_stream->sd_wroff; buf_left = MIN(total_size, maxblk); head = dmp = allocb(buf_left + wroff, BPRI_HI); if (head == NULL) return (ENOMEM); head->b_wptr = head->b_rptr = head->b_rptr + wroff; auio.uio_extflg = UIO_COPY_DEFAULT; for (i = 0; i < copy_cnt; i++) { if (ISSIG(curthread, JUSTLOOKING)) return (EINTR); /* * Do similar checks as "write" as we are writing * sfv_len bytes into "vp". */ sfv_len = (ssize_t)sfv->sfv_len; if (sfv_len == 0) { sfv++; continue; } /* Make sure sfv_len is not negative */ #ifdef _SYSCALL32_IMPL if (model == DATAMODEL_ILP32) { if ((ssize32_t)sfv_len < 0) return (EINVAL); } else #endif if (sfv_len < 0) return (EINVAL); /* Check for overflow */ #ifdef _SYSCALL32_IMPL if (model == DATAMODEL_ILP32) { if (((ssize32_t)(*count + sfv_len)) < 0) return (EINVAL); } else #endif if ((*count + sfv_len) < 0) return (EINVAL); sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; if (sfv->sfv_fd == SFV_FD_SELF) { while (sfv_len > 0) { if (buf_left == 0) { tmp = dmp; buf_left = MIN(total_size, maxblk); iov_len = MIN(buf_left, sfv_len); dmp = allocb(buf_left + wroff, BPRI_HI); if (dmp == NULL) { freemsg(head); return (ENOMEM); } dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; tmp->b_cont = dmp; } else { iov_len = MIN(buf_left, sfv_len); } aiov.iov_len = iov_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; auio.uio_loffset = *fileoff; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_iov = &aiov; auio.uio_segflg = UIO_USERSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; buf_left -= iov_len; total_size -= iov_len; sfv_len -= iov_len; sfv_off += iov_len; error = uiomove((caddr_t)dmp->b_wptr, iov_len, UIO_WRITE, &auio); if (error != 0) { freemsg(head); return (error); } dmp->b_wptr += iov_len; } } else { file_t *ffp; vnode_t *readvp; int readflg = 0; if ((ffp = getf(sfv->sfv_fd)) == NULL) { freemsg(head); return (EBADF); } if ((ffp->f_flag & FREAD) == 0) { releasef(sfv->sfv_fd); freemsg(head); return (EACCES); } readvp = ffp->f_vnode; if (readvp->v_type != VREG) { releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } /* * No point reading and writing to same vp, * as long as both are regular files. readvp is not * locked; but since we got it from an open file the * contents will be valid during the time of access. */ if (VN_CMP(vp, readvp)) { releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } /* * Note: we assume readvp != vp. "vp" is already * locked, and "readvp" must not be. */ (void) VOP_RWLOCK(readvp, readflg, NULL); /* Same checks as in pread */ if (sfv_off > maxoff) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } if (sfv_off + sfv_len > maxoff) { sfv_len = (ssize_t)((offset_t)maxoff - sfv_off); } while (sfv_len > 0) { if (buf_left == 0) { tmp = dmp; buf_left = MIN(total_size, maxblk); iov_len = MIN(buf_left, sfv_len); dmp = allocb(buf_left + wroff, BPRI_HI); if (dmp == NULL) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); freemsg(head); return (ENOMEM); } dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; tmp->b_cont = dmp; } else { iov_len = MIN(buf_left, sfv_len); } aiov.iov_base = (caddr_t)dmp->b_wptr; aiov.iov_len = iov_len; auio.uio_loffset = sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = MAXOFFSET_T; auio.uio_fmode = ffp->f_flag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); /* * If read sync is not asked for, * filter sync flags */ if ((ioflag & FRSYNC) == 0) ioflag &= ~(FSYNC|FDSYNC); error = VOP_READ(readvp, &auio, ioflag, fp->f_cred, NULL); if (error != 0) { /* * If we were reading a pipe (currently * not implemented), we may now loose * data. */ VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); freemsg(head); return (error); } /* * Check how much data was really read. * Decrement the 'len' and increment the * 'off' appropriately. */ cnt = iov_len - auio.uio_resid; if (cnt == 0) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } sfv_len -= cnt; sfv_off += cnt; total_size -= cnt; buf_left -= cnt; dmp->b_wptr += cnt; } VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); } sfv++; } ASSERT(total_size == 0); error = kstrwritemp(vp, head, fflag); if (error != 0) { freemsg(head); return (error); } ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; *count += size; return (0); } int sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t *count) { struct vnode *vp; struct uio auio; struct iovec aiov; ushort_t fflag; int ioflag; int i, error; size_t cnt; ssize_t sfv_len; u_offset_t sfv_off; #ifdef _SYSCALL32_IMPL model_t model = get_udatamodel(); u_offset_t maxoff = (model == DATAMODEL_ILP32) ? MAXOFF32_T : MAXOFFSET_T; #else const u_offset_t maxoff = MAXOFF32_T; #endif mblk_t *dmp = NULL; char *buf = NULL; fflag = fp->f_flag; vp = fp->f_vnode; auio.uio_extflg = UIO_COPY_DEFAULT; for (i = 0; i < copy_cnt; i++) { if (ISSIG(curthread, JUSTLOOKING)) return (EINTR); /* * Do similar checks as "write" as we are writing * sfv_len bytes into "vp". */ sfv_len = (ssize_t)sfv->sfv_len; if (sfv_len == 0) { sfv++; continue; } /* Make sure sfv_len is not negative */ #ifdef _SYSCALL32_IMPL if (model == DATAMODEL_ILP32) { if ((ssize32_t)sfv_len < 0) return (EINVAL); } else #endif if (sfv_len < 0) return (EINVAL); if (vp->v_type == VREG) { if (*fileoff >= curproc->p_fsz_ctl) { mutex_enter(&curproc->p_lock); (void) rctl_action( rctlproc_legacy[RLIMIT_FSIZE], curproc->p_rctls, curproc, RCA_SAFE); mutex_exit(&curproc->p_lock); return (EFBIG); } if (*fileoff >= maxoff) return (EFBIG); if (*fileoff + sfv_len > maxoff) return (EINVAL); } /* Check for overflow */ #ifdef _SYSCALL32_IMPL if (model == DATAMODEL_ILP32) { if (((ssize32_t)(*count + sfv_len)) < 0) return (EINVAL); } else #endif if ((*count + sfv_len) < 0) return (EINVAL); sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; if (sfv->sfv_fd == SFV_FD_SELF) { aiov.iov_len = sfv_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; auio.uio_loffset = *fileoff; auio.uio_iovcnt = 1; auio.uio_resid = sfv_len; auio.uio_iov = &aiov; auio.uio_segflg = UIO_USERSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; if (vp->v_type == VSOCK) { /* * Optimize for the socket case */ int wroff = (int)vp->v_stream->sd_wroff; dmp = allocb(sfv_len + wroff, BPRI_HI); if (dmp == NULL) return (ENOMEM); dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; error = uiomove((caddr_t)dmp->b_wptr, sfv_len, UIO_WRITE, &auio); if (error != 0) { freeb(dmp); return (error); } dmp->b_wptr += sfv_len; error = kstrwritemp(vp, dmp, fflag); if (error != 0) { freeb(dmp); return (error); } ttolwp(curthread)->lwp_ru.ioch += (ulong_t)sfv_len; *count += sfv_len; } else { ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); while (sfv_len > 0) { error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); cnt = sfv_len - auio.uio_resid; sfv_len -= cnt; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; *fileoff += cnt; *count += cnt; if (error != 0) return (error); } } } else { file_t *ffp; vnode_t *readvp; int readflg = 0; size_t size; caddr_t ptr; if ((ffp = getf(sfv->sfv_fd)) == NULL) return (EBADF); if ((ffp->f_flag & FREAD) == 0) { releasef(sfv->sfv_fd); return (EBADF); } readvp = ffp->f_vnode; if (readvp->v_type != VREG) { releasef(sfv->sfv_fd); return (EINVAL); } /* * No point reading and writing to same vp, * as long as both are regular files. readvp is not * locked; but since we got it from an open file the * contents will be valid during the time of access. */ if (VN_CMP(vp, readvp)) { releasef(sfv->sfv_fd); return (EINVAL); } /* * Note: we assume readvp != vp. "vp" is already * locked, and "readvp" must not be. */ (void) VOP_RWLOCK(readvp, readflg, NULL); /* Same checks as in pread */ if (sfv_off > maxoff) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (EINVAL); } if (sfv_off + sfv_len > maxoff) { sfv_len = (ssize_t)((offset_t)maxoff - sfv_off); } /* Find the native blocksize to transfer data */ size = MIN(vp->v_vfsp->vfs_bsize, readvp->v_vfsp->vfs_bsize); size = sfv_len < size ? sfv_len : size; if (vp->v_type != VSOCK) { buf = kmem_alloc(size, KM_NOSLEEP); if (buf == NULL) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (ENOMEM); } } while (sfv_len > 0) { size_t iov_len; iov_len = MIN(size, sfv_len); if (vp->v_type == VSOCK) { dmp = allocb(iov_len, BPRI_HI); if (dmp == NULL) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (ENOMEM); } ptr = (caddr_t)dmp->b_rptr; } else { ptr = buf; } aiov.iov_base = ptr; aiov.iov_len = iov_len; auio.uio_loffset = sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = MAXOFFSET_T; auio.uio_fmode = ffp->f_flag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); /* * If read sync is not asked for, * filter sync flags */ if ((ioflag & FRSYNC) == 0) ioflag &= ~(FSYNC|FDSYNC); error = VOP_READ(readvp, &auio, ioflag, fp->f_cred, NULL); if (error != 0) { /* * If we were reading a pipe (currently * not implemented), we may now lose * data. */ if (vp->v_type == VSOCK) freeb(dmp); else kmem_free(buf, size); VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (error); } /* * Check how much data was really read. * Decrement the 'len' and increment the * 'off' appropriately. */ cnt = iov_len - auio.uio_resid; if (cnt == 0) { if (vp->v_type == VSOCK) freeb(dmp); else kmem_free(buf, size); VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (EINVAL); } sfv_len -= cnt; sfv_off += cnt; if (vp->v_type == VSOCK) { dmp->b_wptr = dmp->b_rptr + cnt; error = kstrwritemp(vp, dmp, fflag); if (error != 0) { freeb(dmp); VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (error); } ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; *count += cnt; } else { aiov.iov_base = ptr; aiov.iov_len = cnt; auio.uio_loffset = *fileoff; auio.uio_resid = cnt; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); /* * Check how much data was written. * Increment the 'len' and decrement the * 'off' if all the data was not * written. */ cnt -= auio.uio_resid; sfv_len += auio.uio_resid; sfv_off -= auio.uio_resid; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; *fileoff += cnt; *count += cnt; if (error != 0) { VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); return (error); } } } if (buf) { kmem_free(buf, size); buf = NULL; } VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); } sfv++; } return (0); } ssize_t sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, size_t *xferred) { int error; file_t *fp; struct vnode *vp; struct sonode *so; u_offset_t fileoff; int copy_cnt; const struct sendfilevec *copy_vec; struct sendfilevec sfv[SEND_MAX_CHUNK]; ssize_t count = 0; #ifdef _SYSCALL32_IMPL struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; #endif ssize_t total_size = 0; int i; boolean_t is_sock = B_FALSE; int maxblk = 0; if (sfvcnt <= 0) return (set_errno(EINVAL)); if ((fp = getf(fildes)) == NULL) return (set_errno(EBADF)); if (((fp->f_flag) & FWRITE) == 0) { error = EBADF; goto err; } fileoff = fp->f_offset; vp = fp->f_vnode; switch (vp->v_type) { case VSOCK: so = VTOSO(vp); /* sendfile not supported for SCTP */ if (so->so_protocol == IPPROTO_SCTP) { error = EPROTONOSUPPORT; goto err; } is_sock = B_TRUE; switch (so->so_family) { case AF_NCA: case AF_INET: case AF_INET6: /* * Make similar checks done in SOP_WRITE(). */ if (so->so_state & SS_CANTSENDMORE) { tsignal(curthread, SIGPIPE); error = EPIPE; goto err; } if (so->so_type != SOCK_STREAM) { error = EOPNOTSUPP; goto err; } if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != (SS_ISCONNECTED|SS_ISBOUND)) { error = ENOTCONN; goto err; } if ((so->so_state & SS_DIRECT) && (so->so_priv != NULL)) { maxblk = ((tcp_t *)so->so_priv)->tcp_mss; } else { maxblk = (int)vp->v_stream->sd_maxblk; } break; default: error = EAFNOSUPPORT; goto err; } break; case VREG: break; default: error = EINVAL; goto err; } switch (opcode) { case SENDFILEV : break; #if defined(_SYSCALL32_IMPL) || defined(_ILP32) case SENDFILEV64 : return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, (size32_t *)xferred, fildes)); #endif default : error = ENOSYS; break; } (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); copy_vec = vec; do { copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); #ifdef _SYSCALL32_IMPL /* 32-bit callers need to have their iovec expanded. */ if (get_udatamodel() == DATAMODEL_ILP32) { if (copyin(copy_vec, sfv32, copy_cnt * sizeof (ksendfilevec32_t))) { error = EFAULT; break; } for (i = 0; i < copy_cnt; i++) { sfv[i].sfv_fd = sfv32[i].sfv_fd; sfv[i].sfv_off = (off_t)(uint32_t)sfv32[i].sfv_off; sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; total_size += sfv[i].sfv_len; sfv[i].sfv_flag = sfv32[i].sfv_flag; } } else { #endif if (copyin(copy_vec, sfv, copy_cnt * sizeof (sendfilevec_t))) { error = EFAULT; break; } for (i = 0; i < copy_cnt; i++) { total_size += sfv[i].sfv_len; } #ifdef _SYSCALL32_IMPL } #endif /* * The task between deciding to use sendvec_small_chunk * and sendvec_chunk is dependant on multiple things: * * i) latency is important for smaller files. So if the * data is smaller than 'tcp_slow_start_initial' times * maxblk, then use sendvec_small_chunk which creates * maxblk size mblks and chains then together and sends * them to TCP in one shot. It also leaves 'wroff' size * space for the headers in each mblk. * * ii) for total size bigger than 'tcp_slow_start_initial' * time maxblk, its probably real file data which is * dominating. So its better to use sendvec_chunk because * performance goes to dog if we don't do pagesize reads. * sendvec_chunk will do pagesize reads and write them * in pagesize mblks to TCP. * * Side Notes: A write to file has not been optimized. * Future zero copy code will plugin into sendvec_chunk * only because doing zero copy for files smaller then * pagesize is useless. * * Note, if socket has NL7C enabled then call NL7C's * senfilev() function to give NL7C a chance to copy * the vec for caching, then continue processing as * normal. */ if (is_sock) { switch (so->so_family) { case AF_INET: case AF_INET6: if (so->so_nl7c_flags != 0) { nl7c_sendfilev(so, fileoff, sfv, copy_cnt); } if (total_size <= (4 * maxblk)) error = sendvec_small_chunk(fp, &fileoff, sfv, copy_cnt, total_size, maxblk, &count); else error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, &count); break; case AF_NCA: error = nca_sendfilev(fp, sfv, copy_cnt, &count); break; } } else { ASSERT(vp->v_type == VREG); error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, &count); } #ifdef _SYSCALL32_IMPL if (get_udatamodel() == DATAMODEL_ILP32) copy_vec = (const struct sendfilevec *)((char *)copy_vec + (copy_cnt * sizeof (ksendfilevec32_t))); else #endif copy_vec += copy_cnt; sfvcnt -= copy_cnt; } while (sfvcnt > 0); if (vp->v_type == VREG) fp->f_offset += count; VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); #ifdef _SYSCALL32_IMPL if (get_udatamodel() == DATAMODEL_ILP32) { ssize32_t count32 = (ssize32_t)count; if (copyout(&count32, xferred, sizeof (count32))) error = EFAULT; releasef(fildes); if (error != 0) return (set_errno(error)); return (count32); } #endif if (copyout(&count, xferred, sizeof (count))) error = EFAULT; releasef(fildes); if (error != 0) return (set_errno(error)); return (count); err: ASSERT(error != 0); releasef(fildes); return (set_errno(error)); }