1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/debug.h> 39 #include <sys/errno.h> 40 #include <sys/time.h> 41 #include <sys/file.h> 42 #include <sys/open.h> 43 #include <sys/user.h> 44 #include <sys/termios.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/sunddi.h> 48 #include <sys/esunddi.h> 49 #include <sys/flock.h> 50 #include <sys/modctl.h> 51 #include <sys/cmn_err.h> 52 #include <sys/vmsystm.h> 53 54 #include <sys/socket.h> 55 #include <sys/socketvar.h> 56 /* swilly code in sys/socketvar.h turns off DEBUG */ 57 #ifdef __lint 58 #define DEBUG 59 #endif 60 61 #include <netinet/in.h> 62 #include <sys/sendfile.h> 63 #include <sys/un.h> 64 #include <sys/tihdr.h> 65 #include <sys/atomic.h> 66 67 #include <inet/common.h> 68 #include <inet/ip.h> 69 #include <inet/ip6.h> 70 #include <inet/tcp.h> 71 72 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 73 ssize32_t *); 74 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, 75 int, ssize_t *); 76 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, 77 boolean_t); 78 79 #define readflg (V_WRITELOCK_FALSE) 80 #define rwflag (V_WRITELOCK_TRUE) 81 82 /* 83 * kstrwritemp() has very similar semantics as that of strwrite(). 84 * The main difference is it obtains mblks from the caller and also 85 * does not do any copy as done in strwrite() from user buffers to 86 * kernel buffers. 87 * 88 * Currently, this routine is used by sendfile to send data allocated 89 * within the kernel without any copying. This interface does not use the 90 * synchronous stream interface as synch. stream interface implies 91 * copying. 92 */ 93 int 94 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) 95 { 96 struct stdata *stp; 97 struct queue *wqp; 98 mblk_t *newmp; 99 char waitflag; 100 int tempmode; 101 int error = 0; 102 int done = 0; 103 struct sonode *so; 104 boolean_t direct; 105 106 ASSERT(vp->v_stream); 107 stp = vp->v_stream; 108 109 so = VTOSO(vp); 110 direct = (so->so_state & SS_DIRECT); 111 112 /* 113 * This is the sockfs direct fast path. canputnext() need 114 * not be accurate so we don't grab the sd_lock here. If 115 * we get flow-controlled, we grab sd_lock just before the 116 * do..while loop below to emulate what strwrite() does. 117 */ 118 wqp = stp->sd_wrq; 119 if (canputnext(wqp) && direct && 120 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) { 121 return (sostream_direct(so, NULL, mp, CRED())); 122 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { 123 /* Fast check of flags before acquiring the lock */ 124 mutex_enter(&stp->sd_lock); 125 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0); 126 mutex_exit(&stp->sd_lock); 127 if (error != 0) { 128 if (!(stp->sd_flag & STPLEX) && 129 (stp->sd_wput_opt & SW_SIGPIPE)) { 130 tsignal(curthread, SIGPIPE); 131 error = EPIPE; 132 } 133 return (error); 134 } 135 } 136 137 waitflag = WRITEWAIT; 138 if (stp->sd_flag & OLDNDELAY) 139 tempmode = fmode & ~FNDELAY; 140 else 141 tempmode = fmode; 142 143 mutex_enter(&stp->sd_lock); 144 do { 145 if (canputnext(wqp)) { 146 mutex_exit(&stp->sd_lock); 147 if (stp->sd_wputdatafunc != NULL) { 148 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL, 149 NULL, NULL, NULL); 150 if (newmp == NULL) { 151 /* The caller will free mp */ 152 return (ECOMM); 153 } 154 mp = newmp; 155 } 156 putnext(wqp, mp); 157 return (0); 158 } 159 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, 160 &done); 161 } while (error == 0 && !done); 162 163 mutex_exit(&stp->sd_lock); 164 /* 165 * EAGAIN tells the application to try again. ENOMEM 166 * is returned only if the memory allocation size 167 * exceeds the physical limits of the system. ENOMEM 168 * can't be true here. 169 */ 170 if (error == ENOMEM) 171 error = EAGAIN; 172 return (error); 173 } 174 175 #define SEND_MAX_CHUNK 16 176 177 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 178 /* 179 * 64 bit offsets for 32 bit applications only running either on 180 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 181 * more than 2GB of data. 182 */ 183 int 184 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 185 int copy_cnt, ssize32_t *count) 186 { 187 struct vnode *vp; 188 ushort_t fflag; 189 int ioflag; 190 size32_t cnt; 191 ssize32_t sfv_len; 192 ssize32_t tmpcount; 193 u_offset_t sfv_off; 194 struct uio auio; 195 struct iovec aiov; 196 int i, error; 197 198 fflag = fp->f_flag; 199 vp = fp->f_vnode; 200 for (i = 0; i < copy_cnt; i++) { 201 202 if (ISSIG(curthread, JUSTLOOKING)) 203 return (EINTR); 204 205 /* 206 * Do similar checks as "write" as we are writing 207 * sfv_len bytes into "vp". 208 */ 209 sfv_len = (ssize32_t)sfv->sfv_len; 210 211 if (sfv_len == 0) 212 continue; 213 214 if (sfv_len < 0) 215 return (EINVAL); 216 217 if (vp->v_type == VREG) { 218 if (*fileoff >= curproc->p_fsz_ctl) { 219 mutex_enter(&curproc->p_lock); 220 (void) rctl_action( 221 rctlproc_legacy[RLIMIT_FSIZE], 222 curproc->p_rctls, curproc, RCA_SAFE); 223 mutex_exit(&curproc->p_lock); 224 return (EFBIG); 225 } 226 227 if (*fileoff >= OFFSET_MAX(fp)) 228 return (EFBIG); 229 230 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 231 return (EINVAL); 232 } 233 234 tmpcount = *count + sfv_len; 235 if (tmpcount < 0) 236 return (EINVAL); 237 238 sfv_off = sfv->sfv_off; 239 240 auio.uio_extflg = UIO_COPY_DEFAULT; 241 if (sfv->sfv_fd == SFV_FD_SELF) { 242 aiov.iov_len = sfv_len; 243 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 244 auio.uio_loffset = *fileoff; 245 auio.uio_iovcnt = 1; 246 auio.uio_resid = sfv_len; 247 auio.uio_iov = &aiov; 248 auio.uio_segflg = UIO_USERSPACE; 249 auio.uio_llimit = curproc->p_fsz_ctl; 250 auio.uio_fmode = fflag; 251 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 252 while (sfv_len > 0) { 253 error = VOP_WRITE(vp, &auio, ioflag, 254 fp->f_cred, NULL); 255 cnt = sfv_len - auio.uio_resid; 256 sfv_len -= cnt; 257 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 258 if (vp->v_type == VREG) 259 *fileoff += cnt; 260 *count += cnt; 261 if (error != 0) 262 return (error); 263 } 264 } else { 265 file_t *ffp; 266 vnode_t *readvp; 267 size_t size; 268 caddr_t ptr; 269 270 if ((ffp = getf(sfv->sfv_fd)) == NULL) 271 return (EBADF); 272 273 if ((ffp->f_flag & FREAD) == 0) { 274 releasef(sfv->sfv_fd); 275 return (EBADF); 276 } 277 278 readvp = ffp->f_vnode; 279 if (readvp->v_type != VREG) { 280 releasef(sfv->sfv_fd); 281 return (EINVAL); 282 } 283 284 /* 285 * No point reading and writing to same vp, 286 * as long as both are regular files. readvp is not 287 * locked; but since we got it from an open file the 288 * contents will be valid during the time of access. 289 */ 290 if (vn_compare(vp, readvp)) { 291 releasef(sfv->sfv_fd); 292 return (EINVAL); 293 } 294 295 /* 296 * Note: we assume readvp != vp. "vp" is already 297 * locked, and "readvp" must not be. 298 */ 299 (void) VOP_RWLOCK(readvp, readflg, NULL); 300 301 /* 302 * Same checks as in pread64. 303 */ 304 if (sfv_off > MAXOFFSET_T) { 305 VOP_RWUNLOCK(readvp, readflg, NULL); 306 releasef(sfv->sfv_fd); 307 return (EINVAL); 308 } 309 310 if (sfv_off + sfv_len > MAXOFFSET_T) 311 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 312 313 /* Find the native blocksize to transfer data */ 314 size = MIN(vp->v_vfsp->vfs_bsize, 315 readvp->v_vfsp->vfs_bsize); 316 size = sfv_len < size ? sfv_len : size; 317 ptr = kmem_alloc(size, KM_SLEEP); 318 319 while (sfv_len > 0) { 320 size_t iov_len; 321 322 iov_len = MIN(size, sfv_len); 323 aiov.iov_base = ptr; 324 aiov.iov_len = iov_len; 325 auio.uio_loffset = sfv_off; 326 auio.uio_iov = &aiov; 327 auio.uio_iovcnt = 1; 328 auio.uio_resid = iov_len; 329 auio.uio_segflg = UIO_SYSSPACE; 330 auio.uio_llimit = MAXOFFSET_T; 331 auio.uio_fmode = ffp->f_flag; 332 ioflag = auio.uio_fmode & 333 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 334 335 /* 336 * If read sync is not asked for, 337 * filter sync flags 338 */ 339 if ((ioflag & FRSYNC) == 0) 340 ioflag &= ~(FSYNC|FDSYNC); 341 error = VOP_READ(readvp, &auio, ioflag, 342 fp->f_cred, NULL); 343 if (error) { 344 kmem_free(ptr, size); 345 VOP_RWUNLOCK(readvp, readflg, NULL); 346 releasef(sfv->sfv_fd); 347 return (error); 348 } 349 350 /* 351 * Check how must data was really read. 352 * Decrement the 'len' and increment the 353 * 'off' appropriately. 354 */ 355 cnt = iov_len - auio.uio_resid; 356 if (cnt == 0) { 357 /* 358 * If we were reading a pipe (currently 359 * not implemented), we may now lose 360 * data. 361 */ 362 kmem_free(ptr, size); 363 VOP_RWUNLOCK(readvp, readflg, NULL); 364 releasef(sfv->sfv_fd); 365 return (EINVAL); 366 } 367 sfv_len -= cnt; 368 sfv_off += cnt; 369 370 aiov.iov_base = ptr; 371 aiov.iov_len = cnt; 372 auio.uio_loffset = *fileoff; 373 auio.uio_iov = &aiov; 374 auio.uio_iovcnt = 1; 375 auio.uio_resid = cnt; 376 auio.uio_segflg = UIO_SYSSPACE; 377 auio.uio_llimit = curproc->p_fsz_ctl; 378 auio.uio_fmode = fflag; 379 ioflag = auio.uio_fmode & 380 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 381 error = VOP_WRITE(vp, &auio, ioflag, 382 fp->f_cred, NULL); 383 384 /* 385 * Check how much data was written. Increment 386 * the 'len' and decrement the 'off' if all 387 * the data was not written. 388 */ 389 cnt -= auio.uio_resid; 390 sfv_len += auio.uio_resid; 391 sfv_off -= auio.uio_resid; 392 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 393 if (vp->v_type == VREG) 394 *fileoff += cnt; 395 *count += cnt; 396 if (error != 0) { 397 kmem_free(ptr, size); 398 VOP_RWUNLOCK(readvp, readflg, NULL); 399 releasef(sfv->sfv_fd); 400 return (error); 401 } 402 } 403 VOP_RWUNLOCK(readvp, readflg, NULL); 404 releasef(sfv->sfv_fd); 405 kmem_free(ptr, size); 406 } 407 sfv++; 408 } 409 return (0); 410 } 411 412 ssize32_t 413 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 414 size32_t *xferred, int fildes) 415 { 416 u_offset_t fileoff; 417 int copy_cnt; 418 const struct ksendfilevec64 *copy_vec; 419 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 420 struct vnode *vp; 421 int error; 422 ssize32_t count = 0; 423 424 vp = fp->f_vnode; 425 (void) VOP_RWLOCK(vp, rwflag, NULL); 426 427 copy_vec = vec; 428 fileoff = fp->f_offset; 429 430 do { 431 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 432 if (copyin(copy_vec, sfv, copy_cnt * 433 sizeof (struct ksendfilevec64))) { 434 error = EFAULT; 435 break; 436 } 437 438 /* 439 * Optimize the regular file over 440 * the socket case. 441 */ 442 if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) { 443 file_t *rfp; 444 vnode_t *rvp; 445 446 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 447 error = EBADF; 448 break; 449 } 450 if ((rfp->f_flag & FREAD) == 0) { 451 releasef(sfv->sfv_fd); 452 error = EBADF; 453 break; 454 } 455 rvp = rfp->f_vnode; 456 if (rvp->v_type == VREG) { 457 error = sosendfile64(fp, rfp, sfv, &count); 458 if (error) 459 break; 460 copy_vec++; 461 sfvcnt--; 462 continue; 463 } 464 releasef(sfv->sfv_fd); 465 } 466 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 467 if (error != 0) 468 break; 469 470 copy_vec += copy_cnt; 471 sfvcnt -= copy_cnt; 472 } while (sfvcnt > 0); 473 474 if (vp->v_type == VREG) 475 fp->f_offset += count; 476 477 VOP_RWUNLOCK(vp, rwflag, NULL); 478 if (copyout(&count, xferred, sizeof (count))) 479 error = EFAULT; 480 releasef(fildes); 481 if (error != 0) 482 return (set_errno(error)); 483 return (count); 484 } 485 #endif 486 487 int 488 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 489 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 490 { 491 struct vnode *vp; 492 struct uio auio; 493 struct iovec aiov; 494 ushort_t fflag; 495 int ioflag; 496 int i, error; 497 size_t cnt; 498 ssize_t sfv_len; 499 u_offset_t sfv_off; 500 #ifdef _SYSCALL32_IMPL 501 model_t model = get_udatamodel(); 502 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 503 MAXOFF32_T : MAXOFFSET_T; 504 #else 505 const u_offset_t maxoff = MAXOFF32_T; 506 #endif 507 mblk_t *dmp = NULL; 508 int wroff; 509 int buf_left = 0; 510 size_t iov_len; 511 mblk_t *head, *tmp; 512 size_t size = total_size; 513 size_t extra; 514 int tail_len; 515 516 fflag = fp->f_flag; 517 vp = fp->f_vnode; 518 519 ASSERT(vp->v_type == VSOCK); 520 ASSERT(maxblk > 0); 521 522 wroff = (int)vp->v_stream->sd_wroff; 523 tail_len = (int)vp->v_stream->sd_tail; 524 extra = wroff + tail_len; 525 526 buf_left = MIN(total_size, maxblk); 527 head = dmp = allocb(buf_left + extra, BPRI_HI); 528 if (head == NULL) 529 return (ENOMEM); 530 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 531 532 auio.uio_extflg = UIO_COPY_DEFAULT; 533 for (i = 0; i < copy_cnt; i++) { 534 if (ISSIG(curthread, JUSTLOOKING)) { 535 freemsg(head); 536 return (EINTR); 537 } 538 539 /* 540 * Do similar checks as "write" as we are writing 541 * sfv_len bytes into "vp". 542 */ 543 sfv_len = (ssize_t)sfv->sfv_len; 544 545 if (sfv_len == 0) { 546 sfv++; 547 continue; 548 } 549 550 /* Check for overflow */ 551 #ifdef _SYSCALL32_IMPL 552 if (model == DATAMODEL_ILP32) { 553 if (((ssize32_t)(*count + sfv_len)) < 0) { 554 freemsg(head); 555 return (EINVAL); 556 } 557 } else 558 #endif 559 if ((*count + sfv_len) < 0) { 560 freemsg(head); 561 return (EINVAL); 562 } 563 564 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 565 566 if (sfv->sfv_fd == SFV_FD_SELF) { 567 while (sfv_len > 0) { 568 if (buf_left == 0) { 569 tmp = dmp; 570 buf_left = MIN(total_size, maxblk); 571 iov_len = MIN(buf_left, sfv_len); 572 dmp = allocb(buf_left + extra, BPRI_HI); 573 if (dmp == NULL) { 574 freemsg(head); 575 return (ENOMEM); 576 } 577 dmp->b_wptr = dmp->b_rptr = 578 dmp->b_rptr + wroff; 579 tmp->b_cont = dmp; 580 } else { 581 iov_len = MIN(buf_left, sfv_len); 582 } 583 584 aiov.iov_len = iov_len; 585 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 586 auio.uio_loffset = *fileoff; 587 auio.uio_iovcnt = 1; 588 auio.uio_resid = iov_len; 589 auio.uio_iov = &aiov; 590 auio.uio_segflg = UIO_USERSPACE; 591 auio.uio_llimit = curproc->p_fsz_ctl; 592 auio.uio_fmode = fflag; 593 594 buf_left -= iov_len; 595 total_size -= iov_len; 596 sfv_len -= iov_len; 597 sfv_off += iov_len; 598 599 error = uiomove((caddr_t)dmp->b_wptr, 600 iov_len, UIO_WRITE, &auio); 601 if (error != 0) { 602 freemsg(head); 603 return (error); 604 } 605 dmp->b_wptr += iov_len; 606 } 607 } else { 608 file_t *ffp; 609 vnode_t *readvp; 610 611 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 612 freemsg(head); 613 return (EBADF); 614 } 615 616 if ((ffp->f_flag & FREAD) == 0) { 617 releasef(sfv->sfv_fd); 618 freemsg(head); 619 return (EACCES); 620 } 621 622 readvp = ffp->f_vnode; 623 if (readvp->v_type != VREG) { 624 releasef(sfv->sfv_fd); 625 freemsg(head); 626 return (EINVAL); 627 } 628 629 /* 630 * No point reading and writing to same vp, 631 * as long as both are regular files. readvp is not 632 * locked; but since we got it from an open file the 633 * contents will be valid during the time of access. 634 */ 635 636 if (vn_compare(vp, readvp)) { 637 releasef(sfv->sfv_fd); 638 freemsg(head); 639 return (EINVAL); 640 } 641 642 /* 643 * Note: we assume readvp != vp. "vp" is already 644 * locked, and "readvp" must not be. 645 */ 646 647 (void) VOP_RWLOCK(readvp, readflg, NULL); 648 649 /* Same checks as in pread */ 650 if (sfv_off > maxoff) { 651 VOP_RWUNLOCK(readvp, readflg, NULL); 652 releasef(sfv->sfv_fd); 653 freemsg(head); 654 return (EINVAL); 655 } 656 if (sfv_off + sfv_len > maxoff) { 657 total_size -= (sfv_off + sfv_len - maxoff); 658 sfv_len = (ssize_t)((offset_t)maxoff - 659 sfv_off); 660 } 661 662 while (sfv_len > 0) { 663 if (buf_left == 0) { 664 tmp = dmp; 665 buf_left = MIN(total_size, maxblk); 666 iov_len = MIN(buf_left, sfv_len); 667 dmp = allocb(buf_left + extra, BPRI_HI); 668 if (dmp == NULL) { 669 VOP_RWUNLOCK(readvp, readflg, 670 NULL); 671 releasef(sfv->sfv_fd); 672 freemsg(head); 673 return (ENOMEM); 674 } 675 dmp->b_wptr = dmp->b_rptr = 676 dmp->b_rptr + wroff; 677 tmp->b_cont = dmp; 678 } else { 679 iov_len = MIN(buf_left, sfv_len); 680 } 681 aiov.iov_base = (caddr_t)dmp->b_wptr; 682 aiov.iov_len = iov_len; 683 auio.uio_loffset = sfv_off; 684 auio.uio_iov = &aiov; 685 auio.uio_iovcnt = 1; 686 auio.uio_resid = iov_len; 687 auio.uio_segflg = UIO_SYSSPACE; 688 auio.uio_llimit = MAXOFFSET_T; 689 auio.uio_fmode = ffp->f_flag; 690 ioflag = auio.uio_fmode & 691 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 692 693 /* 694 * If read sync is not asked for, 695 * filter sync flags 696 */ 697 if ((ioflag & FRSYNC) == 0) 698 ioflag &= ~(FSYNC|FDSYNC); 699 error = VOP_READ(readvp, &auio, ioflag, 700 fp->f_cred, NULL); 701 if (error != 0) { 702 /* 703 * If we were reading a pipe (currently 704 * not implemented), we may now loose 705 * data. 706 */ 707 VOP_RWUNLOCK(readvp, readflg, NULL); 708 releasef(sfv->sfv_fd); 709 freemsg(head); 710 return (error); 711 } 712 713 /* 714 * Check how much data was really read. 715 * Decrement the 'len' and increment the 716 * 'off' appropriately. 717 */ 718 cnt = iov_len - auio.uio_resid; 719 if (cnt == 0) { 720 VOP_RWUNLOCK(readvp, readflg, NULL); 721 releasef(sfv->sfv_fd); 722 freemsg(head); 723 return (EINVAL); 724 } 725 sfv_len -= cnt; 726 sfv_off += cnt; 727 total_size -= cnt; 728 buf_left -= cnt; 729 730 dmp->b_wptr += cnt; 731 } 732 VOP_RWUNLOCK(readvp, readflg, NULL); 733 releasef(sfv->sfv_fd); 734 } 735 sfv++; 736 } 737 738 ASSERT(total_size == 0); 739 error = kstrwritemp(vp, head, fflag); 740 if (error != 0) { 741 freemsg(head); 742 return (error); 743 } 744 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 745 *count += size; 746 747 return (0); 748 } 749 750 751 int 752 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 753 int copy_cnt, ssize_t *count) 754 { 755 struct vnode *vp; 756 struct uio auio; 757 struct iovec aiov; 758 ushort_t fflag; 759 int ioflag; 760 int i, error; 761 size_t cnt; 762 ssize_t sfv_len; 763 u_offset_t sfv_off; 764 #ifdef _SYSCALL32_IMPL 765 model_t model = get_udatamodel(); 766 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 767 MAXOFF32_T : MAXOFFSET_T; 768 #else 769 const u_offset_t maxoff = MAXOFF32_T; 770 #endif 771 mblk_t *dmp = NULL; 772 char *buf = NULL; 773 size_t extra; 774 int maxblk, wroff, tail_len; 775 struct sonode *so; 776 stdata_t *stp; 777 778 fflag = fp->f_flag; 779 vp = fp->f_vnode; 780 781 if (vp->v_type == VSOCK) { 782 so = VTOSO(vp); 783 stp = vp->v_stream; 784 wroff = (int)stp->sd_wroff; 785 tail_len = (int)stp->sd_tail; 786 maxblk = (int)stp->sd_maxblk; 787 extra = wroff + tail_len; 788 } 789 790 auio.uio_extflg = UIO_COPY_DEFAULT; 791 for (i = 0; i < copy_cnt; i++) { 792 if (ISSIG(curthread, JUSTLOOKING)) 793 return (EINTR); 794 795 /* 796 * Do similar checks as "write" as we are writing 797 * sfv_len bytes into "vp". 798 */ 799 sfv_len = (ssize_t)sfv->sfv_len; 800 801 if (sfv_len == 0) { 802 sfv++; 803 continue; 804 } 805 806 if (vp->v_type == VREG) { 807 if (*fileoff >= curproc->p_fsz_ctl) { 808 mutex_enter(&curproc->p_lock); 809 (void) rctl_action( 810 rctlproc_legacy[RLIMIT_FSIZE], 811 curproc->p_rctls, curproc, RCA_SAFE); 812 mutex_exit(&curproc->p_lock); 813 814 return (EFBIG); 815 } 816 817 if (*fileoff >= maxoff) 818 return (EFBIG); 819 820 if (*fileoff + sfv_len > maxoff) 821 return (EINVAL); 822 } 823 824 /* Check for overflow */ 825 #ifdef _SYSCALL32_IMPL 826 if (model == DATAMODEL_ILP32) { 827 if (((ssize32_t)(*count + sfv_len)) < 0) 828 return (EINVAL); 829 } else 830 #endif 831 if ((*count + sfv_len) < 0) 832 return (EINVAL); 833 834 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 835 836 if (sfv->sfv_fd == SFV_FD_SELF) { 837 aiov.iov_len = sfv_len; 838 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 839 auio.uio_loffset = *fileoff; 840 auio.uio_iovcnt = 1; 841 auio.uio_resid = sfv_len; 842 auio.uio_iov = &aiov; 843 auio.uio_segflg = UIO_USERSPACE; 844 auio.uio_llimit = curproc->p_fsz_ctl; 845 auio.uio_fmode = fflag; 846 847 if (vp->v_type == VSOCK) { 848 849 /* 850 * Optimize for the socket case 851 */ 852 853 dmp = allocb(sfv_len + extra, BPRI_HI); 854 if (dmp == NULL) 855 return (ENOMEM); 856 dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; 857 error = uiomove((caddr_t)dmp->b_wptr, 858 sfv_len, UIO_WRITE, &auio); 859 if (error != 0) { 860 freeb(dmp); 861 return (error); 862 } 863 dmp->b_wptr += sfv_len; 864 error = kstrwritemp(vp, dmp, fflag); 865 if (error != 0) { 866 freeb(dmp); 867 return (error); 868 } 869 ttolwp(curthread)->lwp_ru.ioch += 870 (ulong_t)sfv_len; 871 *count += sfv_len; 872 } else { 873 ioflag = auio.uio_fmode & 874 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 875 while (sfv_len > 0) { 876 error = VOP_WRITE(vp, &auio, ioflag, 877 fp->f_cred, NULL); 878 cnt = sfv_len - auio.uio_resid; 879 sfv_len -= cnt; 880 ttolwp(curthread)->lwp_ru.ioch += 881 (ulong_t)cnt; 882 *fileoff += cnt; 883 *count += cnt; 884 if (error != 0) 885 return (error); 886 } 887 } 888 } else { 889 int segmapit = 0; 890 file_t *ffp; 891 vnode_t *readvp; 892 struct vnode *realvp; 893 size_t size; 894 caddr_t ptr; 895 896 if ((ffp = getf(sfv->sfv_fd)) == NULL) 897 return (EBADF); 898 899 if ((ffp->f_flag & FREAD) == 0) { 900 releasef(sfv->sfv_fd); 901 return (EBADF); 902 } 903 904 readvp = ffp->f_vnode; 905 if (VOP_REALVP(readvp, &realvp, NULL) == 0) 906 readvp = realvp; 907 if (readvp->v_type != VREG) { 908 releasef(sfv->sfv_fd); 909 return (EINVAL); 910 } 911 912 /* 913 * No point reading and writing to same vp, 914 * as long as both are regular files. readvp is not 915 * locked; but since we got it from an open file the 916 * contents will be valid during the time of access. 917 */ 918 if (vn_compare(vp, readvp)) { 919 releasef(sfv->sfv_fd); 920 return (EINVAL); 921 } 922 923 /* 924 * Note: we assume readvp != vp. "vp" is already 925 * locked, and "readvp" must not be. 926 */ 927 (void) VOP_RWLOCK(readvp, readflg, NULL); 928 929 /* Same checks as in pread */ 930 if (sfv_off > maxoff) { 931 VOP_RWUNLOCK(readvp, readflg, NULL); 932 releasef(sfv->sfv_fd); 933 return (EINVAL); 934 } 935 if (sfv_off + sfv_len > maxoff) { 936 sfv_len = (ssize_t)((offset_t)maxoff - 937 sfv_off); 938 } 939 /* Find the native blocksize to transfer data */ 940 size = MIN(vp->v_vfsp->vfs_bsize, 941 readvp->v_vfsp->vfs_bsize); 942 size = sfv_len < size ? sfv_len : size; 943 944 if (vp->v_type != VSOCK) { 945 segmapit = 0; 946 buf = kmem_alloc(size, KM_NOSLEEP); 947 if (buf == NULL) { 948 VOP_RWUNLOCK(readvp, readflg, NULL); 949 releasef(sfv->sfv_fd); 950 return (ENOMEM); 951 } 952 } else { 953 /* 954 * For sockets acting as an SSL proxy, we 955 * need to adjust the size to the maximum 956 * SSL record size set in the stream head. 957 */ 958 if (so->so_kssl_ctx != NULL) 959 size = MIN(size, maxblk); 960 961 if (vn_has_flocks(readvp) || 962 readvp->v_flag & VNOMAP || 963 stp->sd_copyflag & STZCVMUNSAFE) { 964 segmapit = 0; 965 } else if (stp->sd_copyflag & STZCVMSAFE) { 966 segmapit = 1; 967 } else { 968 int on = 1; 969 if (SOP_SETSOCKOPT(VTOSO(vp), 970 SOL_SOCKET, SO_SND_COPYAVOID, 971 &on, sizeof (on)) == 0) 972 segmapit = 1; 973 } 974 } 975 976 if (segmapit) { 977 boolean_t nowait; 978 979 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 980 error = snf_segmap(fp, readvp, sfv_off, 981 (u_offset_t)sfv_len, (ssize_t *)&cnt, 982 nowait); 983 releasef(sfv->sfv_fd); 984 *count += cnt; 985 if (error) 986 return (error); 987 sfv++; 988 continue; 989 } 990 991 while (sfv_len > 0) { 992 size_t iov_len; 993 994 iov_len = MIN(size, sfv_len); 995 996 if (vp->v_type == VSOCK) { 997 dmp = allocb(iov_len + extra, BPRI_HI); 998 if (dmp == NULL) { 999 VOP_RWUNLOCK(readvp, readflg, 1000 NULL); 1001 releasef(sfv->sfv_fd); 1002 return (ENOMEM); 1003 } 1004 dmp->b_wptr = dmp->b_rptr = 1005 dmp->b_rptr + wroff; 1006 ptr = (caddr_t)dmp->b_rptr; 1007 } else { 1008 ptr = buf; 1009 } 1010 1011 aiov.iov_base = ptr; 1012 aiov.iov_len = iov_len; 1013 auio.uio_loffset = sfv_off; 1014 auio.uio_iov = &aiov; 1015 auio.uio_iovcnt = 1; 1016 auio.uio_resid = iov_len; 1017 auio.uio_segflg = UIO_SYSSPACE; 1018 auio.uio_llimit = MAXOFFSET_T; 1019 auio.uio_fmode = ffp->f_flag; 1020 ioflag = auio.uio_fmode & 1021 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1022 1023 /* 1024 * If read sync is not asked for, 1025 * filter sync flags 1026 */ 1027 if ((ioflag & FRSYNC) == 0) 1028 ioflag &= ~(FSYNC|FDSYNC); 1029 error = VOP_READ(readvp, &auio, ioflag, 1030 fp->f_cred, NULL); 1031 if (error != 0) { 1032 /* 1033 * If we were reading a pipe (currently 1034 * not implemented), we may now lose 1035 * data. 1036 */ 1037 if (vp->v_type == VSOCK) 1038 freeb(dmp); 1039 else 1040 kmem_free(buf, size); 1041 VOP_RWUNLOCK(readvp, readflg, NULL); 1042 releasef(sfv->sfv_fd); 1043 return (error); 1044 } 1045 1046 /* 1047 * Check how much data was really read. 1048 * Decrement the 'len' and increment the 1049 * 'off' appropriately. 1050 */ 1051 cnt = iov_len - auio.uio_resid; 1052 if (cnt == 0) { 1053 if (vp->v_type == VSOCK) 1054 freeb(dmp); 1055 else 1056 kmem_free(buf, size); 1057 VOP_RWUNLOCK(readvp, readflg, NULL); 1058 releasef(sfv->sfv_fd); 1059 return (EINVAL); 1060 } 1061 sfv_len -= cnt; 1062 sfv_off += cnt; 1063 1064 if (vp->v_type == VSOCK) { 1065 dmp->b_wptr = dmp->b_rptr + cnt; 1066 1067 error = kstrwritemp(vp, dmp, fflag); 1068 if (error != 0) { 1069 freeb(dmp); 1070 VOP_RWUNLOCK(readvp, readflg, 1071 NULL); 1072 releasef(sfv->sfv_fd); 1073 return (error); 1074 } 1075 1076 ttolwp(curthread)->lwp_ru.ioch += 1077 (ulong_t)cnt; 1078 *count += cnt; 1079 } else { 1080 1081 aiov.iov_base = ptr; 1082 aiov.iov_len = cnt; 1083 auio.uio_loffset = *fileoff; 1084 auio.uio_resid = cnt; 1085 auio.uio_iov = &aiov; 1086 auio.uio_iovcnt = 1; 1087 auio.uio_segflg = UIO_SYSSPACE; 1088 auio.uio_llimit = curproc->p_fsz_ctl; 1089 auio.uio_fmode = fflag; 1090 ioflag = auio.uio_fmode & 1091 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1092 error = VOP_WRITE(vp, &auio, ioflag, 1093 fp->f_cred, NULL); 1094 1095 /* 1096 * Check how much data was written. 1097 * Increment the 'len' and decrement the 1098 * 'off' if all the data was not 1099 * written. 1100 */ 1101 cnt -= auio.uio_resid; 1102 sfv_len += auio.uio_resid; 1103 sfv_off -= auio.uio_resid; 1104 ttolwp(curthread)->lwp_ru.ioch += 1105 (ulong_t)cnt; 1106 *fileoff += cnt; 1107 *count += cnt; 1108 if (error != 0) { 1109 kmem_free(buf, size); 1110 VOP_RWUNLOCK(readvp, readflg, 1111 NULL); 1112 releasef(sfv->sfv_fd); 1113 return (error); 1114 } 1115 } 1116 } 1117 if (buf) { 1118 kmem_free(buf, size); 1119 buf = NULL; 1120 } 1121 VOP_RWUNLOCK(readvp, readflg, NULL); 1122 releasef(sfv->sfv_fd); 1123 } 1124 sfv++; 1125 } 1126 return (0); 1127 } 1128 1129 ssize_t 1130 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1131 size_t *xferred) 1132 { 1133 int error = 0; 1134 int first_vector_error = 0; 1135 file_t *fp; 1136 struct vnode *vp; 1137 struct sonode *so; 1138 u_offset_t fileoff; 1139 int copy_cnt; 1140 const struct sendfilevec *copy_vec; 1141 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1142 ssize_t count = 0; 1143 #ifdef _SYSCALL32_IMPL 1144 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1145 #endif 1146 ssize_t total_size; 1147 int i; 1148 boolean_t is_sock = B_FALSE; 1149 int maxblk = 0; 1150 1151 if (sfvcnt <= 0) 1152 return (set_errno(EINVAL)); 1153 1154 if ((fp = getf(fildes)) == NULL) 1155 return (set_errno(EBADF)); 1156 1157 if (((fp->f_flag) & FWRITE) == 0) { 1158 error = EBADF; 1159 goto err; 1160 } 1161 1162 fileoff = fp->f_offset; 1163 vp = fp->f_vnode; 1164 1165 switch (vp->v_type) { 1166 case VSOCK: 1167 so = VTOSO(vp); 1168 /* sendfile not supported for SCTP */ 1169 if (so->so_protocol == IPPROTO_SCTP) { 1170 error = EPROTONOSUPPORT; 1171 goto err; 1172 } 1173 is_sock = B_TRUE; 1174 switch (so->so_family) { 1175 case AF_INET: 1176 case AF_INET6: 1177 /* 1178 * Make similar checks done in SOP_WRITE(). 1179 */ 1180 if (so->so_state & SS_CANTSENDMORE) { 1181 tsignal(curthread, SIGPIPE); 1182 error = EPIPE; 1183 goto err; 1184 } 1185 if (so->so_type != SOCK_STREAM) { 1186 error = EOPNOTSUPP; 1187 goto err; 1188 } 1189 1190 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1191 (SS_ISCONNECTED|SS_ISBOUND)) { 1192 error = ENOTCONN; 1193 goto err; 1194 } 1195 1196 if ((so->so_state & SS_DIRECT) && 1197 (so->so_priv != NULL) && 1198 (so->so_kssl_ctx == NULL)) { 1199 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1200 } else { 1201 maxblk = (int)vp->v_stream->sd_maxblk; 1202 } 1203 break; 1204 default: 1205 error = EAFNOSUPPORT; 1206 goto err; 1207 } 1208 break; 1209 case VREG: 1210 break; 1211 default: 1212 error = EINVAL; 1213 goto err; 1214 } 1215 1216 switch (opcode) { 1217 case SENDFILEV : 1218 break; 1219 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1220 case SENDFILEV64 : 1221 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1222 (size32_t *)xferred, fildes)); 1223 #endif 1224 default : 1225 error = ENOSYS; 1226 break; 1227 } 1228 1229 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1230 copy_vec = vec; 1231 1232 do { 1233 total_size = 0; 1234 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1235 #ifdef _SYSCALL32_IMPL 1236 /* 32-bit callers need to have their iovec expanded. */ 1237 if (get_udatamodel() == DATAMODEL_ILP32) { 1238 if (copyin(copy_vec, sfv32, 1239 copy_cnt * sizeof (ksendfilevec32_t))) { 1240 error = EFAULT; 1241 break; 1242 } 1243 1244 for (i = 0; i < copy_cnt; i++) { 1245 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1246 sfv[i].sfv_off = 1247 (off_t)(uint32_t)sfv32[i].sfv_off; 1248 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1249 total_size += sfv[i].sfv_len; 1250 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1251 /* 1252 * Individual elements of the vector must not 1253 * wrap or overflow, as later math is signed. 1254 * Equally total_size needs to be checked after 1255 * each vector is added in, to be sure that 1256 * rogue values haven't overflowed the counter. 1257 */ 1258 if (((ssize32_t)sfv[i].sfv_len < 0) || 1259 ((ssize32_t)total_size < 0)) { 1260 /* 1261 * Truncate the vector to send data 1262 * described by elements before the 1263 * error. 1264 */ 1265 copy_cnt = i; 1266 first_vector_error = EINVAL; 1267 /* total_size can't be trusted */ 1268 if ((ssize32_t)total_size < 0) 1269 error = EINVAL; 1270 break; 1271 } 1272 } 1273 /* Nothing to do, process errors */ 1274 if (copy_cnt == 0) 1275 break; 1276 1277 } else { 1278 #endif 1279 if (copyin(copy_vec, sfv, 1280 copy_cnt * sizeof (sendfilevec_t))) { 1281 error = EFAULT; 1282 break; 1283 } 1284 1285 for (i = 0; i < copy_cnt; i++) { 1286 total_size += sfv[i].sfv_len; 1287 /* 1288 * Individual elements of the vector must not 1289 * wrap or overflow, as later math is signed. 1290 * Equally total_size needs to be checked after 1291 * each vector is added in, to be sure that 1292 * rogue values haven't overflowed the counter. 1293 */ 1294 if (((ssize_t)sfv[i].sfv_len < 0) || 1295 (total_size < 0)) { 1296 /* 1297 * Truncate the vector to send data 1298 * described by elements before the 1299 * error. 1300 */ 1301 copy_cnt = i; 1302 first_vector_error = EINVAL; 1303 /* total_size can't be trusted */ 1304 if (total_size < 0) 1305 error = EINVAL; 1306 break; 1307 } 1308 } 1309 /* Nothing to do, process errors */ 1310 if (copy_cnt == 0) 1311 break; 1312 #ifdef _SYSCALL32_IMPL 1313 } 1314 #endif 1315 1316 /* 1317 * The task between deciding to use sendvec_small_chunk 1318 * and sendvec_chunk is dependant on multiple things: 1319 * 1320 * i) latency is important for smaller files. So if the 1321 * data is smaller than 'tcp_slow_start_initial' times 1322 * maxblk, then use sendvec_small_chunk which creates 1323 * maxblk size mblks and chains them together and sends 1324 * them to TCP in one shot. It also leaves 'wroff' size 1325 * space for the headers in each mblk. 1326 * 1327 * ii) for total size bigger than 'tcp_slow_start_initial' 1328 * time maxblk, its probably real file data which is 1329 * dominating. So its better to use sendvec_chunk because 1330 * performance goes to dog if we don't do pagesize reads. 1331 * sendvec_chunk will do pagesize reads and write them 1332 * in pagesize mblks to TCP. 1333 * 1334 * Side Notes: A write to file has not been optimized. 1335 * Future zero copy code will plugin into sendvec_chunk 1336 * only because doing zero copy for files smaller then 1337 * pagesize is useless. 1338 * 1339 * Note, if socket has NL7C enabled then call NL7C's 1340 * senfilev() function to consume the sfv[]. 1341 */ 1342 if (is_sock) { 1343 switch (so->so_family) { 1344 case AF_INET: 1345 case AF_INET6: 1346 if (so->so_nl7c_flags != 0) 1347 error = nl7c_sendfilev(so, &fileoff, 1348 sfv, copy_cnt, &count); 1349 else if ((total_size <= (4 * maxblk)) && 1350 error == 0) 1351 error = sendvec_small_chunk(fp, 1352 &fileoff, sfv, copy_cnt, 1353 total_size, maxblk, &count); 1354 else 1355 error = sendvec_chunk(fp, &fileoff, 1356 sfv, copy_cnt, &count); 1357 break; 1358 } 1359 } else { 1360 ASSERT(vp->v_type == VREG); 1361 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1362 &count); 1363 } 1364 1365 1366 #ifdef _SYSCALL32_IMPL 1367 if (get_udatamodel() == DATAMODEL_ILP32) 1368 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1369 (copy_cnt * sizeof (ksendfilevec32_t))); 1370 else 1371 #endif 1372 copy_vec += copy_cnt; 1373 sfvcnt -= copy_cnt; 1374 1375 /* Process all vector members up to first error */ 1376 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1377 1378 if (vp->v_type == VREG) 1379 fp->f_offset += count; 1380 1381 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1382 1383 #ifdef _SYSCALL32_IMPL 1384 if (get_udatamodel() == DATAMODEL_ILP32) { 1385 ssize32_t count32 = (ssize32_t)count; 1386 if (copyout(&count32, xferred, sizeof (count32))) 1387 error = EFAULT; 1388 releasef(fildes); 1389 if (error != 0) 1390 return (set_errno(error)); 1391 if (first_vector_error != 0) 1392 return (set_errno(first_vector_error)); 1393 return (count32); 1394 } 1395 #endif 1396 if (copyout(&count, xferred, sizeof (count))) 1397 error = EFAULT; 1398 releasef(fildes); 1399 if (error != 0) 1400 return (set_errno(error)); 1401 if (first_vector_error != 0) 1402 return (set_errno(first_vector_error)); 1403 return (count); 1404 err: 1405 ASSERT(error != 0); 1406 releasef(fildes); 1407 return (set_errno(error)); 1408 } 1409