1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/debug.h> 39 #include <sys/errno.h> 40 #include <sys/time.h> 41 #include <sys/file.h> 42 #include <sys/open.h> 43 #include <sys/user.h> 44 #include <sys/termios.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/sunddi.h> 48 #include <sys/esunddi.h> 49 #include <sys/flock.h> 50 #include <sys/modctl.h> 51 #include <sys/cmn_err.h> 52 #include <sys/vmsystm.h> 53 54 #include <sys/socket.h> 55 #include <sys/socketvar.h> 56 57 #include <netinet/in.h> 58 #include <sys/sendfile.h> 59 #include <sys/un.h> 60 #include <sys/tihdr.h> 61 #include <sys/atomic.h> 62 63 #include <inet/common.h> 64 #include <inet/ip.h> 65 #include <inet/ip6.h> 66 #include <inet/tcp.h> 67 68 extern int sosendfile64(file_t *, 
    file_t *, const struct ksendfilevec64 *, ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
    int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
    boolean_t);

/* Shorthand for the reader/writer lock flavors used on vnodes below */
#define readflg (V_WRITELOCK_FALSE)
#define rwflag  (V_WRITELOCK_TRUE)

/*
 * kstrwritemp() has very similar semantics as that of strwrite().
 * The main difference is it obtains mblks from the caller and also
 * does not do any copy as done in strwrite() from user buffers to
 * kernel buffers.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface as synch. stream interface implies
 * copying.
 *
 * Ownership: on success 'mp' is consumed (passed downstream); on any
 * non-zero return the caller still owns 'mp' and must free it.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
    struct stdata *stp;
    struct queue *wqp;
    mblk_t *newmp;
    char waitflag;
    int tempmode;
    int error = 0;
    int done = 0;
    struct sonode *so;
    boolean_t direct;

    ASSERT(vp->v_stream);
    stp = vp->v_stream;

    so = VTOSO(vp);
    direct = (so->so_state & SS_DIRECT);

    /*
     * This is the sockfs direct fast path. canputnext() need
     * not be accurate so we don't grab the sd_lock here. If
     * we get flow-controlled, we grab sd_lock just before the
     * do..while loop below to emulate what strwrite() does.
     */
    wqp = stp->sd_wrq;
    if (canputnext(wqp) && direct &&
        !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
        return (sostream_direct(so, NULL, mp, CRED()));
    } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
        /* Fast check of flags before acquiring the lock */
        mutex_enter(&stp->sd_lock);
        error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
        mutex_exit(&stp->sd_lock);
        if (error != 0) {
            if (!(stp->sd_flag & STPLEX) &&
                (stp->sd_wput_opt & SW_SIGPIPE)) {
                /* Emulate write(2) semantics on a broken stream */
                tsignal(curthread, SIGPIPE);
                error = EPIPE;
            }
            return (error);
        }
    }

    waitflag = WRITEWAIT;
    /* OLDNDELAY streams ignore FNDELAY for writes (historical behavior) */
    if (stp->sd_flag & OLDNDELAY)
        tempmode = fmode & ~FNDELAY;
    else
        tempmode = fmode;

    mutex_enter(&stp->sd_lock);
    do {
        if (canputnext(wqp)) {
            mutex_exit(&stp->sd_lock);
            if (stp->sd_wputdatafunc != NULL) {
                newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
                    NULL, NULL, NULL);
                if (newmp == NULL) {
                    /* The caller will free mp */
                    return (ECOMM);
                }
                mp = newmp;
            }
            putnext(wqp, mp);
            return (0);
        }
        /* Flow-controlled: block until there is room, honoring signals */
        error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
            &done);
    } while (error == 0 && !done);

    mutex_exit(&stp->sd_lock);
    /*
     * EAGAIN tells the application to try again. ENOMEM
     * is returned only if the memory allocation size
     * exceeds the physical limits of the system. ENOMEM
     * can't be true here.
     */
    if (error == ENOMEM)
        error = EAGAIN;
    return (error);
}

/* Number of sendfilevec entries copied in and processed per batch */
#define SEND_MAX_CHUNK  16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
/*
 * Write one batch (at most SEND_MAX_CHUNK entries, already copied in by
 * sendvec64()) of a 32-bit caller's vector to 'vp'.  SFV_FD_SELF entries
 * are written directly from the caller's address space; other entries
 * are read from the donor descriptor in native-blocksize pieces through
 * a temporary kernel buffer and rewritten to 'vp'.  '*fileoff' and
 * '*count' are advanced as data is written.  Returns 0 or an errno.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
    struct vnode *vp;
    ushort_t fflag;
    int ioflag;
    size32_t cnt;
    ssize32_t sfv_len;
    ssize32_t tmpcount;
    u_offset_t sfv_off;
    struct uio auio;
    struct iovec aiov;
    int i, error;

    fflag = fp->f_flag;
    vp = fp->f_vnode;
    for (i = 0; i < copy_cnt; i++) {

        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize32_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        if (sfv_len < 0)
            return (EINVAL);

        if (vp->v_type == VREG) {
            /* Enforce the process file-size resource control */
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);
                return (EFBIG);
            }

            if (*fileoff >= OFFSET_MAX(fp))
                return (EFBIG);

            if (*fileoff + sfv_len > OFFSET_MAX(fp))
                return (EINVAL);
        }

        /* Guard the caller's 32-bit running total against wrapping */
        tmpcount = *count + sfv_len;
        if (tmpcount < 0)
            return (EINVAL);

        sfv_off = sfv->sfv_off;

        auio.uio_extflg = UIO_COPY_DEFAULT;
        if (sfv->sfv_fd == SFV_FD_SELF) {
            /* sfv_off is a user virtual address in this case */
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;
            ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
            while (sfv_len > 0) {
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);
                /* Account for whatever was written, even on error */
                cnt = sfv_len - auio.uio_resid;
                sfv_len -= cnt;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0)
                    return (error);
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /*
             * Same checks as in pread64.
             */
            if (sfv_off > MAXOFFSET_T) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            if (sfv_off + sfv_len > MAXOFFSET_T)
                sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;
            ptr = kmem_alloc(size, KM_SLEEP);

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);
                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                /* Rewrite the freshly read bytes to 'vp' */
                aiov.iov_base = ptr;
                aiov.iov_len = cnt;
                auio.uio_loffset = *fileoff;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = cnt;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);

                /*
                 * Check how much data was written. Increment
                 * the 'len' and decrement the 'off' if all
                 * the data was not written.
                 */
                cnt -= auio.uio_resid;
                sfv_len += auio.uio_resid;
                sfv_off -= auio.uio_resid;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
            kmem_free(ptr, size);
        }
        sfv++;
    }
    return (0);
}

/*
 * SENDFILEV64 path for 32-bit callers: copy the user's vector in
 * SEND_MAX_CHUNK batches and dispatch each batch, holding the target
 * vnode's writer lock for the duration.  Releases 'fildes' and returns
 * the byte count (also copied out via 'xferred') or sets errno.
 */
ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
    u_offset_t fileoff;
    int copy_cnt;
    const struct ksendfilevec64 *copy_vec;
    struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
    struct vnode *vp;
    int error;
    ssize32_t count = 0;

    vp = fp->f_vnode;
    (void) VOP_RWLOCK(vp, rwflag, NULL);

    copy_vec = vec;
    fileoff = fp->f_offset;

    do {
        copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
        if (copyin(copy_vec, sfv, copy_cnt *
            sizeof (struct ksendfilevec64))) {
            error = EFAULT;
            break;
        }

        /*
         * Optimize the regular file over
         * the socket case.
         */
        if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
            file_t *rfp;
            vnode_t *rvp;

            if ((rfp = getf(sfv->sfv_fd)) == NULL) {
                error = EBADF;
                break;
            }
            if ((rfp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                error = EBADF;
                break;
            }
            rvp = rfp->f_vnode;
            if (rvp->v_type == VREG) {
                /* file->socket fast path; consumes one vector entry */
                error = sosendfile64(fp, rfp, sfv, &count);
                if (error)
                    break;
                copy_vec++;
                sfvcnt--;
                continue;
            }
            releasef(sfv->sfv_fd);
        }
        error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
        if (error != 0)
            break;

        copy_vec += copy_cnt;
        sfvcnt -= copy_cnt;
    } while (sfvcnt > 0);

    if (vp->v_type == VREG)
        fp->f_offset += count;

    VOP_RWUNLOCK(vp, rwflag, NULL);
    if (copyout(&count, xferred, sizeof (count)))
        error = EFAULT;
    releasef(fildes);
    if (error != 0)
        return (set_errno(error));
    return (count);
}
#endif

/*
 * Small-transfer socket path: build a chain of maxblk-sized mblks
 * (reserving sd_wroff head room and sd_tail tail room in each), fill it
 * from user memory (SFV_FD_SELF) and/or donor regular files, and hand
 * the entire chain to the transport with a single kstrwritemp() call.
 * 'total_size' must equal the sum of the batch's sfv_len values (the
 * ASSERT at the end checks this).  Returns 0 or an errno; on error any
 * partially built chain is freed and nothing is counted.
 */
int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    int wroff;
    int buf_left = 0;
    size_t iov_len;
    mblk_t *head, *tmp;
    size_t size = total_size;   /* remember full size for accounting */
    size_t extra;
    int tail_len;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    ASSERT(vp->v_type == VSOCK);
    ASSERT(maxblk > 0);

    /* If nothing to send, return */
    if (total_size == 0)
        return (0);

    wroff = (int)vp->v_stream->sd_wroff;
    tail_len = (int)vp->v_stream->sd_tail;
    extra = wroff + tail_len;

    buf_left = MIN(total_size, maxblk);
    head = dmp = allocb(buf_left + extra, BPRI_HI);
    if (head == NULL)
        return (ENOMEM);
    /* Leave 'wroff' bytes of head room for downstream headers */
    head->b_wptr = head->b_rptr = head->b_rptr + wroff;

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING)) {
            freemsg(head);
            return (EINTR);
        }

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0) {
                freemsg(head);
                return (EINVAL);
            }
        } else
#endif
        if ((*count + sfv_len) < 0) {
            freemsg(head);
            return (EINVAL);
        }

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            /* sfv_off is a user virtual address in this case */
            while (sfv_len > 0) {
                if (buf_left == 0) {
                    /* Current mblk full: chain a new one */
                    tmp = dmp;
                    buf_left = MIN(total_size, maxblk);
                    iov_len = MIN(buf_left, sfv_len);
                    dmp = allocb(buf_left + extra, BPRI_HI);
                    if (dmp == NULL) {
                        freemsg(head);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    tmp->b_cont = dmp;
                } else {
                    iov_len = MIN(buf_left, sfv_len);
                }

                aiov.iov_len = iov_len;
                aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
                auio.uio_loffset = *fileoff;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_iov = &aiov;
                auio.uio_segflg = UIO_USERSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;

                buf_left -= iov_len;
                total_size -= iov_len;
                sfv_len -= iov_len;
                sfv_off += iov_len;

                error = uiomove((caddr_t)dmp->b_wptr,
                    iov_len, UIO_WRITE, &auio);
                if (error != 0) {
                    freemsg(head);
                    return (error);
                }
                dmp->b_wptr += iov_len;
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;

            if ((ffp = getf(sfv->sfv_fd)) == NULL) {
                freemsg(head);
                return (EBADF);
            }

            /*
             * NOTE(review): this path returns EACCES for a donor fd
             * not open for reading while sendvec_chunk() returns
             * EBADF for the same condition -- inconsistent, but
             * changing either would alter user-visible behavior.
             */
            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EACCES);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */

            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */

            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /* Same checks as in pread */
            if (sfv_off > maxoff) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }
            if (sfv_off + sfv_len > maxoff) {
                /* Clamp, and shrink total_size to match */
                total_size -= (sfv_off + sfv_len - maxoff);
                sfv_len = (ssize_t)((offset_t)maxoff -
                    sfv_off);
            }

            while (sfv_len > 0) {
                if (buf_left == 0) {
                    /* Current mblk full: chain a new one */
                    tmp = dmp;
                    buf_left = MIN(total_size, maxblk);
                    iov_len = MIN(buf_left, sfv_len);
                    dmp = allocb(buf_left + extra, BPRI_HI);
                    if (dmp == NULL) {
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        freemsg(head);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    tmp->b_cont = dmp;
                } else {
                    iov_len = MIN(buf_left, sfv_len);
                }
                /* Read from the donor file directly into the mblk */
                aiov.iov_base = (caddr_t)dmp->b_wptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error != 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    freemsg(head);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /* Unexpected EOF: vector overruns the file */
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    freemsg(head);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;
                total_size -= cnt;
                buf_left -= cnt;

                dmp->b_wptr += cnt;
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
        }
        sfv++;
    }

    /* The whole batch is now in the chain; send it in one shot */
    ASSERT(total_size == 0);
    error = kstrwritemp(vp, head, fflag);
    if (error != 0) {
        freemsg(head);
        return (error);
    }
    ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
    *count += size;

    return (0);
}


/*
 * General sendfilev() worker: write one batch of vector entries to 'vp'
 * (regular file or socket).  SFV_FD_SELF data comes from the caller's
 * address space; donor-file data is read in native-blocksize pieces and
 * either sent as individual mblks (socket) or rewritten through a
 * kernel buffer (regular file).  Large file->socket transfers may be
 * handed to snf_segmap() for copy avoidance.  Returns 0 or an errno.
 */
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    char *buf = NULL;
    size_t extra;
    int maxblk, wroff, tail_len;
    struct sonode *so;
    stdata_t *stp;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    /* so/stp/wroff/tail_len/maxblk/extra are only used on VSOCK paths */
    if (vp->v_type == VSOCK) {
        so = VTOSO(vp);
        stp = vp->v_stream;
        wroff = (int)stp->sd_wroff;
        tail_len = (int)stp->sd_tail;
        maxblk = (int)stp->sd_maxblk;
        extra = wroff + tail_len;
    }

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        if (vp->v_type == VREG) {
            /* Enforce the process file-size resource control */
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);

                return (EFBIG);
            }

            if (*fileoff >= maxoff)
                return (EFBIG);

            if (*fileoff + sfv_len > maxoff)
                return (EINVAL);
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0)
                return (EINVAL);
        } else
#endif
        if ((*count + sfv_len) < 0)
            return (EINVAL);

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            /* sfv_off is a user virtual address in this case */
            if (vp->v_type == VSOCK) {
                while (sfv_len > 0) {
                    size_t iov_len;

                    iov_len = sfv_len;
                    /* SSL proxy: cap each mblk at the record size */
                    if (so->so_kssl_ctx != NULL)
                        iov_len = MIN(iov_len, maxblk);

                    aiov.iov_len = iov_len;
                    aiov.iov_base =
                        (caddr_t)(uintptr_t)sfv_off;

                    auio.uio_iov = &aiov;
                    auio.uio_iovcnt = 1;
                    auio.uio_loffset = *fileoff;
                    auio.uio_segflg = UIO_USERSPACE;
                    auio.uio_fmode = fflag;
                    auio.uio_llimit = curproc->p_fsz_ctl;
                    auio.uio_resid = iov_len;

                    dmp = allocb(iov_len + extra, BPRI_HI);
                    if (dmp == NULL)
                        return (ENOMEM);
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    error = uiomove((caddr_t)dmp->b_wptr,
                        iov_len, UIO_WRITE, &auio);
                    if (error != 0) {
                        freeb(dmp);
                        return (error);
                    }
                    dmp->b_wptr += iov_len;
                    error = kstrwritemp(vp, dmp, fflag);
                    if (error != 0) {
                        freeb(dmp);
                        return (error);
                    }
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)iov_len;
                    *count += iov_len;
                    sfv_len -= iov_len;
                    sfv_off += iov_len;
                }
            } else {
                /* Regular-file target: plain VOP_WRITE from user space */
                aiov.iov_len = sfv_len;
                aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;

                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_loffset = *fileoff;
                auio.uio_segflg = UIO_USERSPACE;
                auio.uio_fmode = fflag;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_resid = sfv_len;

                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                while (sfv_len > 0) {
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);
                    cnt = sfv_len - auio.uio_resid;
                    sfv_len -= cnt;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0)
                        return (error);
                }
            }
        } else {
            int segmapit = 0;
            file_t *ffp;
            vnode_t *readvp;
            struct vnode *realvp;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            /* See past lofs/specfs style stacking to the real vnode */
            if (VOP_REALVP(readvp, &realvp, NULL) == 0)
                readvp = realvp;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /* Same checks as in pread */
            if (sfv_off > maxoff) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }
            if (sfv_off + sfv_len > maxoff) {
                sfv_len = (ssize_t)((offset_t)maxoff -
                    sfv_off);
            }
            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;

            if (vp->v_type != VSOCK) {
                segmapit = 0;
                buf = kmem_alloc(size, KM_NOSLEEP);
                if (buf == NULL) {
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (ENOMEM);
                }
            } else {
                /*
                 * For sockets acting as an SSL proxy, we
                 * need to adjust the size to the maximum
                 * SSL record size set in the stream head.
                 */
                if (so->so_kssl_ctx != NULL)
                    size = MIN(size, maxblk);

                /*
                 * Decide whether the segmap copy-avoidance path
                 * is usable for this donor file and socket.
                 */
                if (vn_has_flocks(readvp) ||
                    readvp->v_flag & VNOMAP ||
                    stp->sd_copyflag & STZCVMUNSAFE) {
                    segmapit = 0;
                } else if (stp->sd_copyflag & STZCVMSAFE) {
                    segmapit = 1;
                } else {
                    int on = 1;
                    if (SOP_SETSOCKOPT(VTOSO(vp),
                        SOL_SOCKET, SO_SND_COPYAVOID,
                        &on, sizeof (on)) == 0)
                        segmapit = 1;
                }
            }

            if (segmapit) {
                boolean_t nowait;

                nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
                /* snf_segmap() drops the readvp lock for us */
                error = snf_segmap(fp, readvp, sfv_off,
                    (u_offset_t)sfv_len, (ssize_t *)&cnt,
                    nowait);
                releasef(sfv->sfv_fd);
                *count += cnt;
                if (error)
                    return (error);
                sfv++;
                continue;
            }

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);

                if (vp->v_type == VSOCK) {
                    dmp = allocb(iov_len + extra, BPRI_HI);
                    if (dmp == NULL) {
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    ptr = (caddr_t)dmp->b_rptr;
                } else {
                    ptr = buf;
                }

                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error != 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /* Unexpected EOF: vector overruns the file */
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                if (vp->v_type == VSOCK) {
                    dmp->b_wptr = dmp->b_rptr + cnt;

                    error = kstrwritemp(vp, dmp, fflag);
                    if (error != 0) {
                        freeb(dmp);
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (error);
                    }

                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *count += cnt;
                } else {

                    aiov.iov_base = ptr;
                    aiov.iov_len = cnt;
                    auio.uio_loffset = *fileoff;
                    auio.uio_resid = cnt;
                    auio.uio_iov = &aiov;
                    auio.uio_iovcnt = 1;
                    auio.uio_segflg = UIO_SYSSPACE;
                    auio.uio_llimit = curproc->p_fsz_ctl;
                    auio.uio_fmode = fflag;
                    ioflag = auio.uio_fmode &
                        (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);

                    /*
                     * Check how much data was written.
                     * Increment the 'len' and decrement the
                     * 'off' if all the data was not
                     * written.
                     */
                    cnt -= auio.uio_resid;
                    sfv_len += auio.uio_resid;
                    sfv_off -= auio.uio_resid;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0) {
                        kmem_free(buf, size);
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (error);
                    }
                }
            }
            if (buf) {
                kmem_free(buf, size);
                buf = NULL;
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
        }
        sfv++;
    }
    return (0);
}

/*
 * sendfilev(2) system call implementation.  Validates the target
 * descriptor (regular file or connected AF_INET/AF_INET6 stream
 * socket), copies the user's vector in SEND_MAX_CHUNK batches and
 * dispatches each batch to the appropriate worker.  Returns the number
 * of bytes transferred (also copied out via 'xferred') or sets errno.
 */
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
    int error = 0;
    int first_vector_error = 0;
    file_t *fp;
    struct vnode *vp;
    struct sonode *so;
    u_offset_t fileoff;
    int copy_cnt;
    const struct sendfilevec *copy_vec;
    struct sendfilevec sfv[SEND_MAX_CHUNK];
    ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
    struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
    ssize_t total_size;
    int i;
    boolean_t is_sock = B_FALSE;
    int maxblk = 0;

    if (sfvcnt <= 0)
        return (set_errno(EINVAL));

    if ((fp = getf(fildes)) == NULL)
        return (set_errno(EBADF));

    if (((fp->f_flag) & FWRITE) == 0) {
        error = EBADF;
        goto err;
    }

    fileoff = fp->f_offset;
    vp = fp->f_vnode;

    switch (vp->v_type) {
    case VSOCK:
        so = VTOSO(vp);
        /* sendfile not supported for SCTP */
        if (so->so_protocol == IPPROTO_SCTP) {
            error = EPROTONOSUPPORT;
            goto err;
        }
        is_sock = B_TRUE;
        switch (so->so_family) {
        case AF_INET:
        case AF_INET6:
            /*
             * Make similar checks done in SOP_WRITE().
1200 */ 1201 if (so->so_state & SS_CANTSENDMORE) { 1202 tsignal(curthread, SIGPIPE); 1203 error = EPIPE; 1204 goto err; 1205 } 1206 if (so->so_type != SOCK_STREAM) { 1207 error = EOPNOTSUPP; 1208 goto err; 1209 } 1210 1211 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1212 (SS_ISCONNECTED|SS_ISBOUND)) { 1213 error = ENOTCONN; 1214 goto err; 1215 } 1216 1217 if ((so->so_state & SS_DIRECT) && 1218 (so->so_priv != NULL) && 1219 (so->so_kssl_ctx == NULL)) { 1220 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1221 } else { 1222 maxblk = (int)vp->v_stream->sd_maxblk; 1223 } 1224 break; 1225 default: 1226 error = EAFNOSUPPORT; 1227 goto err; 1228 } 1229 break; 1230 case VREG: 1231 break; 1232 default: 1233 error = EINVAL; 1234 goto err; 1235 } 1236 1237 switch (opcode) { 1238 case SENDFILEV : 1239 break; 1240 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1241 case SENDFILEV64 : 1242 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1243 (size32_t *)xferred, fildes)); 1244 #endif 1245 default : 1246 error = ENOSYS; 1247 break; 1248 } 1249 1250 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1251 copy_vec = vec; 1252 1253 do { 1254 total_size = 0; 1255 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1256 #ifdef _SYSCALL32_IMPL 1257 /* 32-bit callers need to have their iovec expanded. */ 1258 if (get_udatamodel() == DATAMODEL_ILP32) { 1259 if (copyin(copy_vec, sfv32, 1260 copy_cnt * sizeof (ksendfilevec32_t))) { 1261 error = EFAULT; 1262 break; 1263 } 1264 1265 for (i = 0; i < copy_cnt; i++) { 1266 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1267 sfv[i].sfv_off = 1268 (off_t)(uint32_t)sfv32[i].sfv_off; 1269 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1270 total_size += sfv[i].sfv_len; 1271 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1272 /* 1273 * Individual elements of the vector must not 1274 * wrap or overflow, as later math is signed. 1275 * Equally total_size needs to be checked after 1276 * each vector is added in, to be sure that 1277 * rogue values haven't overflowed the counter. 
1278 */ 1279 if (((ssize32_t)sfv[i].sfv_len < 0) || 1280 ((ssize32_t)total_size < 0)) { 1281 /* 1282 * Truncate the vector to send data 1283 * described by elements before the 1284 * error. 1285 */ 1286 copy_cnt = i; 1287 first_vector_error = EINVAL; 1288 /* total_size can't be trusted */ 1289 if ((ssize32_t)total_size < 0) 1290 error = EINVAL; 1291 break; 1292 } 1293 } 1294 /* Nothing to do, process errors */ 1295 if (copy_cnt == 0) 1296 break; 1297 1298 } else { 1299 #endif 1300 if (copyin(copy_vec, sfv, 1301 copy_cnt * sizeof (sendfilevec_t))) { 1302 error = EFAULT; 1303 break; 1304 } 1305 1306 for (i = 0; i < copy_cnt; i++) { 1307 total_size += sfv[i].sfv_len; 1308 /* 1309 * Individual elements of the vector must not 1310 * wrap or overflow, as later math is signed. 1311 * Equally total_size needs to be checked after 1312 * each vector is added in, to be sure that 1313 * rogue values haven't overflowed the counter. 1314 */ 1315 if (((ssize_t)sfv[i].sfv_len < 0) || 1316 (total_size < 0)) { 1317 /* 1318 * Truncate the vector to send data 1319 * described by elements before the 1320 * error. 1321 */ 1322 copy_cnt = i; 1323 first_vector_error = EINVAL; 1324 /* total_size can't be trusted */ 1325 if (total_size < 0) 1326 error = EINVAL; 1327 break; 1328 } 1329 } 1330 /* Nothing to do, process errors */ 1331 if (copy_cnt == 0) 1332 break; 1333 #ifdef _SYSCALL32_IMPL 1334 } 1335 #endif 1336 1337 /* 1338 * The task between deciding to use sendvec_small_chunk 1339 * and sendvec_chunk is dependant on multiple things: 1340 * 1341 * i) latency is important for smaller files. So if the 1342 * data is smaller than 'tcp_slow_start_initial' times 1343 * maxblk, then use sendvec_small_chunk which creates 1344 * maxblk size mblks and chains them together and sends 1345 * them to TCP in one shot. It also leaves 'wroff' size 1346 * space for the headers in each mblk. 
1347 * 1348 * ii) for total size bigger than 'tcp_slow_start_initial' 1349 * time maxblk, its probably real file data which is 1350 * dominating. So its better to use sendvec_chunk because 1351 * performance goes to dog if we don't do pagesize reads. 1352 * sendvec_chunk will do pagesize reads and write them 1353 * in pagesize mblks to TCP. 1354 * 1355 * Side Notes: A write to file has not been optimized. 1356 * Future zero copy code will plugin into sendvec_chunk 1357 * only because doing zero copy for files smaller then 1358 * pagesize is useless. 1359 * 1360 * Note, if socket has NL7C enabled then call NL7C's 1361 * senfilev() function to consume the sfv[]. 1362 */ 1363 if (is_sock) { 1364 switch (so->so_family) { 1365 case AF_INET: 1366 case AF_INET6: 1367 if (so->so_nl7c_flags != 0) 1368 error = nl7c_sendfilev(so, &fileoff, 1369 sfv, copy_cnt, &count); 1370 else if ((total_size <= (4 * maxblk)) && 1371 error == 0) 1372 error = sendvec_small_chunk(fp, 1373 &fileoff, sfv, copy_cnt, 1374 total_size, maxblk, &count); 1375 else 1376 error = sendvec_chunk(fp, &fileoff, 1377 sfv, copy_cnt, &count); 1378 break; 1379 } 1380 } else { 1381 ASSERT(vp->v_type == VREG); 1382 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1383 &count); 1384 } 1385 1386 1387 #ifdef _SYSCALL32_IMPL 1388 if (get_udatamodel() == DATAMODEL_ILP32) 1389 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1390 (copy_cnt * sizeof (ksendfilevec32_t))); 1391 else 1392 #endif 1393 copy_vec += copy_cnt; 1394 sfvcnt -= copy_cnt; 1395 1396 /* Process all vector members up to first error */ 1397 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1398 1399 if (vp->v_type == VREG) 1400 fp->f_offset += count; 1401 1402 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1403 1404 #ifdef _SYSCALL32_IMPL 1405 if (get_udatamodel() == DATAMODEL_ILP32) { 1406 ssize32_t count32 = (ssize32_t)count; 1407 if (copyout(&count32, xferred, sizeof (count32))) 1408 error = EFAULT; 1409 releasef(fildes); 1410 
if (error != 0) 1411 return (set_errno(error)); 1412 if (first_vector_error != 0) 1413 return (set_errno(first_vector_error)); 1414 return (count32); 1415 } 1416 #endif 1417 if (copyout(&count, xferred, sizeof (count))) 1418 error = EFAULT; 1419 releasef(fildes); 1420 if (error != 0) 1421 return (set_errno(error)); 1422 if (first_vector_error != 0) 1423 return (set_errno(first_vector_error)); 1424 return (count); 1425 err: 1426 ASSERT(error != 0); 1427 releasef(fildes); 1428 return (set_errno(error)); 1429 } 1430