/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, ssize32_t *); extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, int, ssize_t *); extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, boolean_t); extern sotpi_info_t *sotpi_sototpi(struct sonode *); #define SEND_MAX_CHUNK 16 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) /* * 64 bit offsets for 32 bit applications only running either on * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer * more than 2GB of data. */ int sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, int copy_cnt, ssize32_t *count) { struct vnode *vp; ushort_t fflag; int ioflag; size32_t cnt; ssize32_t sfv_len; ssize32_t tmpcount; u_offset_t sfv_off; struct uio auio; struct iovec aiov; int i, error; fflag = fp->f_flag; vp = fp->f_vnode; for (i = 0; i < copy_cnt; i++) { if (ISSIG(curthread, JUSTLOOKING)) return (EINTR); /* * Do similar checks as "write" as we are writing * sfv_len bytes into "vp". */ sfv_len = (ssize32_t)sfv->sfv_len; if (sfv_len == 0) { sfv++; continue; } if (sfv_len < 0) return (EINVAL); if (vp->v_type == VREG) { if (*fileoff >= curproc->p_fsz_ctl) { mutex_enter(&curproc->p_lock); (void) rctl_action( rctlproc_legacy[RLIMIT_FSIZE], curproc->p_rctls, curproc, RCA_SAFE); mutex_exit(&curproc->p_lock); return (EFBIG); } if (*fileoff >= OFFSET_MAX(fp)) return (EFBIG); if (*fileoff + sfv_len > OFFSET_MAX(fp)) return (EINVAL); } tmpcount = *count + sfv_len; if (tmpcount < 0) return (EINVAL); sfv_off = sfv->sfv_off; auio.uio_extflg = UIO_COPY_DEFAULT; if (sfv->sfv_fd == SFV_FD_SELF) { aiov.iov_len = sfv_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; auio.uio_loffset = *fileoff; auio.uio_iovcnt = 1; auio.uio_resid = sfv_len; auio.uio_iov = &aiov; auio.uio_segflg = UIO_USERSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); while (sfv_len > 0) { error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); cnt = sfv_len - auio.uio_resid; sfv_len -= cnt; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; if (vp->v_type == VREG) *fileoff += cnt; *count += cnt; if (error != 0) return (error); } } else { file_t *ffp; vnode_t *readvp; size_t size; caddr_t ptr; if ((ffp = getf(sfv->sfv_fd)) == NULL) return (EBADF); if ((ffp->f_flag & FREAD) == 0) { releasef(sfv->sfv_fd); return (EBADF); } readvp = ffp->f_vnode; if (readvp->v_type != VREG) { releasef(sfv->sfv_fd); return (EINVAL); } /* * No point reading and writing to same vp, * as long as both are regular files. readvp is not * locked; but since we got it from an open file the * contents will be valid during the time of access. */ if (vn_compare(vp, readvp)) { releasef(sfv->sfv_fd); return (EINVAL); } /* * Optimize the regular file over * the socket case. */ if (vp->v_type == VSOCK) { error = sosendfile64(fp, ffp, sfv, (ssize32_t *)&cnt); *count += cnt; if (error) return (error); sfv++; continue; } /* * Note: we assume readvp != vp. "vp" is already * locked, and "readvp" must not be. */ if (readvp < vp) { VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, NULL); (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); } else { (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, NULL); } /* * Same checks as in pread64. */ if (sfv_off > MAXOFFSET_T) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (EINVAL); } if (sfv_off + sfv_len > MAXOFFSET_T) sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); /* Find the native blocksize to transfer data */ size = MIN(vp->v_vfsp->vfs_bsize, readvp->v_vfsp->vfs_bsize); size = sfv_len < size ? sfv_len : size; ptr = kmem_alloc(size, KM_NOSLEEP); if (ptr == NULL) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (ENOMEM); } while (sfv_len > 0) { size_t iov_len; iov_len = MIN(size, sfv_len); aiov.iov_base = ptr; aiov.iov_len = iov_len; auio.uio_loffset = sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = MAXOFFSET_T; auio.uio_fmode = ffp->f_flag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); /* * If read sync is not asked for, * filter sync flags */ if ((ioflag & FRSYNC) == 0) ioflag &= ~(FSYNC|FDSYNC); error = VOP_READ(readvp, &auio, ioflag, fp->f_cred, NULL); if (error) { kmem_free(ptr, size); VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (error); } /* * Check how must data was really read. * Decrement the 'len' and increment the * 'off' appropriately. */ cnt = iov_len - auio.uio_resid; if (cnt == 0) { /* * If we were reading a pipe (currently * not implemented), we may now lose * data. */ kmem_free(ptr, size); VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (EINVAL); } sfv_len -= cnt; sfv_off += cnt; aiov.iov_base = ptr; aiov.iov_len = cnt; auio.uio_loffset = *fileoff; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = cnt; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); /* * Check how much data was written. Increment * the 'len' and decrement the 'off' if all * the data was not written. */ cnt -= auio.uio_resid; sfv_len += auio.uio_resid; sfv_off -= auio.uio_resid; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; if (vp->v_type == VREG) *fileoff += cnt; *count += cnt; if (error != 0) { kmem_free(ptr, size); VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (error); } } VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); kmem_free(ptr, size); } sfv++; } return (0); } ssize32_t sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, size32_t *xferred, int fildes) { u_offset_t fileoff; int copy_cnt; const struct ksendfilevec64 *copy_vec; struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; struct vnode *vp; int error; ssize32_t count = 0; vp = fp->f_vnode; (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); copy_vec = vec; fileoff = fp->f_offset; do { copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); if (copyin(copy_vec, sfv, copy_cnt * sizeof (struct ksendfilevec64))) { error = EFAULT; break; } error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); if (error != 0) break; copy_vec += copy_cnt; sfvcnt -= copy_cnt; } while (sfvcnt > 0); if (vp->v_type == VREG) fp->f_offset += count; VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); if (copyout(&count, xferred, sizeof (count))) error = EFAULT; releasef(fildes); if (error != 0) return (set_errno(error)); return (count); } #endif int sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) { struct vnode *vp; struct uio auio; struct iovec aiov; ushort_t fflag; int ioflag; int i, error; size_t cnt; ssize_t sfv_len; u_offset_t sfv_off; #ifdef _SYSCALL32_IMPL model_t model = get_udatamodel(); u_offset_t maxoff = (model == DATAMODEL_ILP32) ? MAXOFF32_T : MAXOFFSET_T; #else const u_offset_t maxoff = MAXOFF32_T; #endif mblk_t *dmp = NULL; int wroff; int buf_left = 0; size_t iov_len; mblk_t *head, *tmp; size_t size = total_size; size_t extra; int tail_len; struct nmsghdr msg; fflag = fp->f_flag; vp = fp->f_vnode; ASSERT(vp->v_type == VSOCK); ASSERT(maxblk > 0); /* If nothing to send, return */ if (total_size == 0) return (0); if (vp->v_stream != NULL) { wroff = (int)vp->v_stream->sd_wroff; tail_len = (int)vp->v_stream->sd_tail; } else { struct sonode *so; so = VTOSO(vp); wroff = so->so_proto_props.sopp_wroff; tail_len = so->so_proto_props.sopp_tail; } extra = wroff + tail_len; buf_left = MIN(total_size, maxblk); head = dmp = allocb(buf_left + extra, BPRI_HI); if (head == NULL) return (ENOMEM); head->b_wptr = head->b_rptr = head->b_rptr + wroff; bzero(&msg, sizeof (msg)); auio.uio_extflg = UIO_COPY_DEFAULT; for (i = 0; i < copy_cnt; i++) { if (ISSIG(curthread, JUSTLOOKING)) { freemsg(head); return (EINTR); } /* * Do similar checks as "write" as we are writing * sfv_len bytes into "vp". */ sfv_len = (ssize_t)sfv->sfv_len; if (sfv_len == 0) { sfv++; continue; } /* Check for overflow */ #ifdef _SYSCALL32_IMPL if (model == DATAMODEL_ILP32) { if (((ssize32_t)(*count + sfv_len)) < 0) { freemsg(head); return (EINVAL); } } else #endif if ((*count + sfv_len) < 0) { freemsg(head); return (EINVAL); } sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; if (sfv->sfv_fd == SFV_FD_SELF) { while (sfv_len > 0) { if (buf_left == 0) { tmp = dmp; buf_left = MIN(total_size, maxblk); iov_len = MIN(buf_left, sfv_len); dmp = allocb(buf_left + extra, BPRI_HI); if (dmp == NULL) { freemsg(head); return (ENOMEM); } dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; tmp->b_cont = dmp; } else { iov_len = MIN(buf_left, sfv_len); } aiov.iov_len = iov_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; auio.uio_loffset = *fileoff; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_iov = &aiov; auio.uio_segflg = UIO_USERSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; buf_left -= iov_len; total_size -= iov_len; sfv_len -= iov_len; sfv_off += iov_len; error = uiomove((caddr_t)dmp->b_wptr, iov_len, UIO_WRITE, &auio); if (error != 0) { freemsg(head); return (error); } dmp->b_wptr += iov_len; } } else { file_t *ffp; vnode_t *readvp; if ((ffp = getf(sfv->sfv_fd)) == NULL) { freemsg(head); return (EBADF); } if ((ffp->f_flag & FREAD) == 0) { releasef(sfv->sfv_fd); freemsg(head); return (EACCES); } readvp = ffp->f_vnode; if (readvp->v_type != VREG) { releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } /* * No point reading and writing to same vp, * as long as both are regular files. readvp is not * locked; but since we got it from an open file the * contents will be valid during the time of access. */ if (vn_compare(vp, readvp)) { releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } /* * Note: we assume readvp != vp. "vp" is already * locked, and "readvp" must not be. */ if (readvp < vp) { VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, NULL); (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); } else { (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, NULL); } /* Same checks as in pread */ if (sfv_off > maxoff) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } if (sfv_off + sfv_len > maxoff) { total_size -= (sfv_off + sfv_len - maxoff); sfv_len = (ssize_t)((offset_t)maxoff - sfv_off); } while (sfv_len > 0) { if (buf_left == 0) { tmp = dmp; buf_left = MIN(total_size, maxblk); iov_len = MIN(buf_left, sfv_len); dmp = allocb(buf_left + extra, BPRI_HI); if (dmp == NULL) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); freemsg(head); return (ENOMEM); } dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; tmp->b_cont = dmp; } else { iov_len = MIN(buf_left, sfv_len); } aiov.iov_base = (caddr_t)dmp->b_wptr; aiov.iov_len = iov_len; auio.uio_loffset = sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = MAXOFFSET_T; auio.uio_fmode = ffp->f_flag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); /* * If read sync is not asked for, * filter sync flags */ if ((ioflag & FRSYNC) == 0) ioflag &= ~(FSYNC|FDSYNC); error = VOP_READ(readvp, &auio, ioflag, fp->f_cred, NULL); if (error != 0) { /* * If we were reading a pipe (currently * not implemented), we may now loose * data. */ VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); freemsg(head); return (error); } /* * Check how much data was really read. * Decrement the 'len' and increment the * 'off' appropriately. */ cnt = iov_len - auio.uio_resid; if (cnt == 0) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); freemsg(head); return (EINVAL); } sfv_len -= cnt; sfv_off += cnt; total_size -= cnt; buf_left -= cnt; dmp->b_wptr += cnt; } VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); } sfv++; } ASSERT(total_size == 0); error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); if (error != 0) { if (head != NULL) freemsg(head); return (error); } ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; *count += size; return (0); } int sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t *count) { struct vnode *vp; struct uio auio; struct iovec aiov; ushort_t fflag; int ioflag; int i, error; size_t cnt; ssize_t sfv_len; u_offset_t sfv_off; #ifdef _SYSCALL32_IMPL model_t model = get_udatamodel(); u_offset_t maxoff = (model == DATAMODEL_ILP32) ? MAXOFF32_T : MAXOFFSET_T; #else const u_offset_t maxoff = MAXOFF32_T; #endif mblk_t *dmp = NULL; char *buf = NULL; size_t extra; int maxblk, wroff, tail_len; struct sonode *so; stdata_t *stp; struct nmsghdr msg; fflag = fp->f_flag; vp = fp->f_vnode; if (vp->v_type == VSOCK) { so = VTOSO(vp); if (vp->v_stream != NULL) { stp = vp->v_stream; wroff = (int)stp->sd_wroff; tail_len = (int)stp->sd_tail; maxblk = (int)stp->sd_maxblk; } else { stp = NULL; wroff = so->so_proto_props.sopp_wroff; tail_len = so->so_proto_props.sopp_tail; maxblk = so->so_proto_props.sopp_maxblk; } extra = wroff + tail_len; } bzero(&msg, sizeof (msg)); auio.uio_extflg = UIO_COPY_DEFAULT; for (i = 0; i < copy_cnt; i++) { if (ISSIG(curthread, JUSTLOOKING)) return (EINTR); /* * Do similar checks as "write" as we are writing * sfv_len bytes into "vp". */ sfv_len = (ssize_t)sfv->sfv_len; if (sfv_len == 0) { sfv++; continue; } if (vp->v_type == VREG) { if (*fileoff >= curproc->p_fsz_ctl) { mutex_enter(&curproc->p_lock); (void) rctl_action( rctlproc_legacy[RLIMIT_FSIZE], curproc->p_rctls, curproc, RCA_SAFE); mutex_exit(&curproc->p_lock); return (EFBIG); } if (*fileoff >= maxoff) return (EFBIG); if (*fileoff + sfv_len > maxoff) return (EINVAL); } /* Check for overflow */ #ifdef _SYSCALL32_IMPL if (model == DATAMODEL_ILP32) { if (((ssize32_t)(*count + sfv_len)) < 0) return (EINVAL); } else #endif if ((*count + sfv_len) < 0) return (EINVAL); sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; if (sfv->sfv_fd == SFV_FD_SELF) { if (vp->v_type == VSOCK) { while (sfv_len > 0) { size_t iov_len; iov_len = sfv_len; /* * Socket filters can limit the mblk * size, so limit reads to maxblk if * there are filters present. */ if (so->so_filter_active > 0 && maxblk != INFPSZ) iov_len = MIN(iov_len, maxblk); aiov.iov_len = iov_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_loffset = *fileoff; auio.uio_segflg = UIO_USERSPACE; auio.uio_fmode = fflag; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_resid = iov_len; dmp = allocb(iov_len + extra, BPRI_HI); if (dmp == NULL) return (ENOMEM); dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; error = uiomove((caddr_t)dmp->b_wptr, iov_len, UIO_WRITE, &auio); if (error != 0) { freeb(dmp); return (error); } dmp->b_wptr += iov_len; error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &dmp); if (error != 0) { if (dmp != NULL) freeb(dmp); return (error); } ttolwp(curthread)->lwp_ru.ioch += (ulong_t)iov_len; *count += iov_len; sfv_len -= iov_len; sfv_off += iov_len; } } else { aiov.iov_len = sfv_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_loffset = *fileoff; auio.uio_segflg = UIO_USERSPACE; auio.uio_fmode = fflag; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_resid = sfv_len; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); while (sfv_len > 0) { error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); cnt = sfv_len - auio.uio_resid; sfv_len -= cnt; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; *fileoff += cnt; *count += cnt; if (error != 0) return (error); } } } else { int segmapit = 0; file_t *ffp; vnode_t *readvp; struct vnode *realvp; size_t size; caddr_t ptr; if ((ffp = getf(sfv->sfv_fd)) == NULL) return (EBADF); if ((ffp->f_flag & FREAD) == 0) { releasef(sfv->sfv_fd); return (EBADF); } readvp = ffp->f_vnode; if (VOP_REALVP(readvp, &realvp, NULL) == 0) readvp = realvp; if (readvp->v_type != VREG) { releasef(sfv->sfv_fd); return (EINVAL); } /* * No point reading and writing to same vp, * as long as both are regular files. readvp is not * locked; but since we got it from an open file the * contents will be valid during the time of access. */ if (vn_compare(vp, readvp)) { releasef(sfv->sfv_fd); return (EINVAL); } /* * Note: we assume readvp != vp. "vp" is already * locked, and "readvp" must not be. */ if (readvp < vp) { VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, NULL); (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); } else { (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, NULL); } /* Same checks as in pread */ if (sfv_off > maxoff) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (EINVAL); } if (sfv_off + sfv_len > maxoff) { sfv_len = (ssize_t)((offset_t)maxoff - sfv_off); } /* Find the native blocksize to transfer data */ size = MIN(vp->v_vfsp->vfs_bsize, readvp->v_vfsp->vfs_bsize); size = sfv_len < size ? sfv_len : size; if (vp->v_type != VSOCK) { segmapit = 0; buf = kmem_alloc(size, KM_NOSLEEP); if (buf == NULL) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (ENOMEM); } } else { uint_t copyflag; copyflag = stp != NULL ? stp->sd_copyflag : so->so_proto_props.sopp_zcopyflag; /* * Socket filters can limit the mblk size, * so limit reads to maxblk if there are * filters present. */ if (so->so_filter_active > 0 && maxblk != INFPSZ) size = MIN(size, maxblk); if (vn_has_flocks(readvp) || readvp->v_flag & VNOMAP || copyflag & STZCVMUNSAFE) { segmapit = 0; } else if (copyflag & STZCVMSAFE) { segmapit = 1; } else { int on = 1; if (socket_setsockopt(VTOSO(vp), SOL_SOCKET, SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0) segmapit = 1; } } if (segmapit) { struct vattr va; boolean_t nowait; va.va_mask = AT_SIZE; error = VOP_GETATTR(readvp, &va, 0, kcred, NULL); if (error != 0 || sfv_off >= va.va_size) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (error); } /* Read as much as possible. */ if (sfv_off + sfv_len > va.va_size) sfv_len = va.va_size - sfv_off; nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; error = snf_segmap(fp, readvp, sfv_off, (u_offset_t)sfv_len, (ssize_t *)&cnt, nowait); releasef(sfv->sfv_fd); *count += cnt; if (error) return (error); sfv++; continue; } while (sfv_len > 0) { size_t iov_len; iov_len = MIN(size, sfv_len); if (vp->v_type == VSOCK) { dmp = allocb(iov_len + extra, BPRI_HI); if (dmp == NULL) { VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (ENOMEM); } dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; ptr = (caddr_t)dmp->b_rptr; } else { ptr = buf; } aiov.iov_base = ptr; aiov.iov_len = iov_len; auio.uio_loffset = sfv_off; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = iov_len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = MAXOFFSET_T; auio.uio_fmode = ffp->f_flag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); /* * If read sync is not asked for, * filter sync flags */ if ((ioflag & FRSYNC) == 0) ioflag &= ~(FSYNC|FDSYNC); error = VOP_READ(readvp, &auio, ioflag, fp->f_cred, NULL); if (error != 0) { /* * If we were reading a pipe (currently * not implemented), we may now lose * data. */ if (vp->v_type == VSOCK) freeb(dmp); else kmem_free(buf, size); VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (error); } /* * Check how much data was really read. * Decrement the 'len' and increment the * 'off' appropriately. */ cnt = iov_len - auio.uio_resid; if (cnt == 0) { if (vp->v_type == VSOCK) freeb(dmp); else kmem_free(buf, size); VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (EINVAL); } sfv_len -= cnt; sfv_off += cnt; if (vp->v_type == VSOCK) { dmp->b_wptr = dmp->b_rptr + cnt; error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &dmp); if (error != 0) { if (dmp != NULL) freeb(dmp); VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (error); } ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; *count += cnt; } else { aiov.iov_base = ptr; aiov.iov_len = cnt; auio.uio_loffset = *fileoff; auio.uio_resid = cnt; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_llimit = curproc->p_fsz_ctl; auio.uio_fmode = fflag; ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL); /* * Check how much data was written. * Increment the 'len' and decrement the * 'off' if all the data was not * written. */ cnt -= auio.uio_resid; sfv_len += auio.uio_resid; sfv_off -= auio.uio_resid; ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; *fileoff += cnt; *count += cnt; if (error != 0) { kmem_free(buf, size); VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); return (error); } } } if (buf) { kmem_free(buf, size); buf = NULL; } VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); releasef(sfv->sfv_fd); } sfv++; } return (0); } ssize_t sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, size_t *xferred) { int error = 0; int first_vector_error = 0; file_t *fp; struct vnode *vp; struct sonode *so; u_offset_t fileoff; int copy_cnt; const struct sendfilevec *copy_vec; struct sendfilevec sfv[SEND_MAX_CHUNK]; ssize_t count = 0; #ifdef _SYSCALL32_IMPL struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; #endif ssize_t total_size; int i; boolean_t is_sock = B_FALSE; int maxblk = 0; if (sfvcnt <= 0) return (set_errno(EINVAL)); if ((fp = getf(fildes)) == NULL) return (set_errno(EBADF)); if (((fp->f_flag) & FWRITE) == 0) { error = EBADF; goto err; } fileoff = fp->f_offset; vp = fp->f_vnode; switch (vp->v_type) { case VSOCK: so = VTOSO(vp); is_sock = B_TRUE; if (SOCK_IS_NONSTR(so)) { maxblk = so->so_proto_props.sopp_maxblk; } else { maxblk = (int)vp->v_stream->sd_maxblk; } break; case VREG: break; default: error = EINVAL; goto err; } switch (opcode) { case SENDFILEV : break; #if defined(_SYSCALL32_IMPL) || defined(_ILP32) case SENDFILEV64 : return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, (size32_t *)xferred, fildes)); #endif default : error = ENOSYS; break; } (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); copy_vec = vec; do { total_size = 0; copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); #ifdef _SYSCALL32_IMPL /* 32-bit callers need to have their iovec expanded. */ if (get_udatamodel() == DATAMODEL_ILP32) { if (copyin(copy_vec, sfv32, copy_cnt * sizeof (ksendfilevec32_t))) { error = EFAULT; break; } for (i = 0; i < copy_cnt; i++) { sfv[i].sfv_fd = sfv32[i].sfv_fd; sfv[i].sfv_off = (off_t)(uint32_t)sfv32[i].sfv_off; sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; total_size += sfv[i].sfv_len; sfv[i].sfv_flag = sfv32[i].sfv_flag; /* * Individual elements of the vector must not * wrap or overflow, as later math is signed. * Equally total_size needs to be checked after * each vector is added in, to be sure that * rogue values haven't overflowed the counter. */ if (((ssize32_t)sfv[i].sfv_len < 0) || ((ssize32_t)total_size < 0)) { /* * Truncate the vector to send data * described by elements before the * error. */ copy_cnt = i; first_vector_error = EINVAL; /* total_size can't be trusted */ if ((ssize32_t)total_size < 0) error = EINVAL; break; } } /* Nothing to do, process errors */ if (copy_cnt == 0) break; } else { #endif if (copyin(copy_vec, sfv, copy_cnt * sizeof (sendfilevec_t))) { error = EFAULT; break; } for (i = 0; i < copy_cnt; i++) { total_size += sfv[i].sfv_len; /* * Individual elements of the vector must not * wrap or overflow, as later math is signed. * Equally total_size needs to be checked after * each vector is added in, to be sure that * rogue values haven't overflowed the counter. */ if (((ssize_t)sfv[i].sfv_len < 0) || (total_size < 0)) { /* * Truncate the vector to send data * described by elements before the * error. */ copy_cnt = i; first_vector_error = EINVAL; /* total_size can't be trusted */ if (total_size < 0) error = EINVAL; break; } } /* Nothing to do, process errors */ if (copy_cnt == 0) break; #ifdef _SYSCALL32_IMPL } #endif /* * The task between deciding to use sendvec_small_chunk * and sendvec_chunk is dependant on multiple things: * * i) latency is important for smaller files. So if the * data is smaller than 'tcp_slow_start_initial' times * maxblk, then use sendvec_small_chunk which creates * maxblk size mblks and chains them together and sends * them to TCP in one shot. It also leaves 'wroff' size * space for the headers in each mblk. * * ii) for total size bigger than 'tcp_slow_start_initial' * time maxblk, its probably real file data which is * dominating. So its better to use sendvec_chunk because * performance goes to dog if we don't do pagesize reads. * sendvec_chunk will do pagesize reads and write them * in pagesize mblks to TCP. * * Side Notes: A write to file has not been optimized. * Future zero copy code will plugin into sendvec_chunk * only because doing zero copy for files smaller then * pagesize is useless. * * Note, if socket has NL7C enabled then call NL7C's * senfilev() function to consume the sfv[]. */ if (is_sock) { if (!SOCK_IS_NONSTR(so) && _SOTOTPI(so)->sti_nl7c_flags != 0) { error = nl7c_sendfilev(so, &fileoff, sfv, copy_cnt, &count); } else if ((total_size <= (4 * maxblk)) && error == 0) { error = sendvec_small_chunk(fp, &fileoff, sfv, copy_cnt, total_size, maxblk, &count); } else { error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, &count); } } else { ASSERT(vp->v_type == VREG); error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, &count); } #ifdef _SYSCALL32_IMPL if (get_udatamodel() == DATAMODEL_ILP32) copy_vec = (const struct sendfilevec *)((char *)copy_vec + (copy_cnt * sizeof (ksendfilevec32_t))); else #endif copy_vec += copy_cnt; sfvcnt -= copy_cnt; /* Process all vector members up to first error */ } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); if (vp->v_type == VREG) fp->f_offset += count; VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); #ifdef _SYSCALL32_IMPL if (get_udatamodel() == DATAMODEL_ILP32) { ssize32_t count32 = (ssize32_t)count; if (copyout(&count32, xferred, sizeof (count32))) error = EFAULT; releasef(fildes); if (error != 0) return (set_errno(error)); if (first_vector_error != 0) return (set_errno(first_vector_error)); return (count32); } #endif if (copyout(&count, xferred, sizeof (count))) error = EFAULT; releasef(fildes); if (error != 0) return (set_errno(error)); if (first_vector_error != 0) return (set_errno(first_vector_error)); return (count); err: ASSERT(error != 0); releasef(fildes); return (set_errno(error)); }