/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
/* swilly code in sys/socketvar.h turns off DEBUG */
#ifdef __lint
#define	DEBUG
#endif

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
    ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
    int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
    boolean_t);

#define	readflg	(V_WRITELOCK_FALSE)
#define	rwflag	(V_WRITELOCK_TRUE)

/*
 * kstrwritemp() has very similar semantics to strwrite(). The main
 * difference is that it obtains its mblks from the caller and does not
 * copy from user buffers to kernel buffers the way strwrite() does.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface, as that interface implies copying.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
	struct stdata *stp;
	struct queue *wqp;
	mblk_t *newmp;
	char waitflag;
	int tempmode;
	int error = 0;
	int done = 0;
	struct sonode *so;
	boolean_t direct;

	ASSERT(vp->v_stream);
	stp = vp->v_stream;

	so = VTOSO(vp);
	direct = (so->so_state & SS_DIRECT);

	/*
	 * This is the sockfs direct fast path. canputnext() need
	 * not be accurate so we don't grab the sd_lock here. If
	 * we get flow-controlled, we grab sd_lock just before the
	 * do..while loop below to emulate what strwrite() does.
	 */
	wqp = stp->sd_wrq;
	if (canputnext(wqp) && direct &&
	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
		return (sostream_direct(so, NULL, mp, CRED()));
	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
		/* Fast check of flags before acquiring the lock */
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);
		if (error != 0) {
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
			}
			return (error);
		}
	}

	waitflag = WRITEWAIT;
	if (stp->sd_flag & OLDNDELAY)
		tempmode = fmode & ~FNDELAY;
	else
		tempmode = fmode;

	mutex_enter(&stp->sd_lock);
	do {
		if (canputnext(wqp)) {
			mutex_exit(&stp->sd_lock);
			if (stp->sd_wputdatafunc != NULL) {
				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
				    NULL, NULL, NULL);
				if (newmp == NULL) {
					/* The caller will free mp */
					return (ECOMM);
				}
				mp = newmp;
			}
			putnext(wqp, mp);
			return (0);
		}
		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
		    &done);
	} while (error == 0 && !done);

	mutex_exit(&stp->sd_lock);
	/*
	 * EAGAIN tells the application to try again. ENOMEM
	 * is returned only if the memory allocation size
	 * exceeds the physical limits of the system. ENOMEM
	 * can't be true here.
	 */
	if (error == ENOMEM)
		error = EAGAIN;
	return (error);
}
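
/*
 * Illustrative usage sketch (not from the original source): the sendfile
 * paths below drive kstrwritemp() roughly like this, allocating an mblk
 * with sd_wroff bytes of headroom for protocol headers, filling it (here
 * from a hypothetical kernel buffer "kbuf"), and freeing it themselves if
 * the write fails, since kstrwritemp() does not consume the mblk on error:
 *
 *	mblk_t *mp = allocb(len + stp->sd_wroff, BPRI_HI);
 *	if (mp == NULL)
 *		return (ENOMEM);
 *	mp->b_wptr = mp->b_rptr = mp->b_rptr + stp->sd_wroff;
 *	bcopy(kbuf, mp->b_wptr, len);
 *	mp->b_wptr += len;
 *	if ((error = kstrwritemp(vp, mp, fflag)) != 0)
 *		freemsg(mp);
 */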

#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64-bit offsets for 32-bit applications, running on either a 64-bit or a
 * 32-bit kernel. A 32-bit application cannot transfer more than 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0)
			continue;

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_SLEEP);

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, rwflag, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		/*
		 * Optimize the regular file over
		 * the socket case.
		 */
		if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
			file_t *rfp;
			vnode_t *rvp;

			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
				error = EBADF;
				break;
			}
			if ((rfp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				error = EBADF;
				break;
			}
			rvp = rfp->f_vnode;
			if (rvp->v_type == VREG) {
				error = sosendfile64(fp, rfp, sfv, &count);
				if (error)
					break;
				copy_vec++;
				sfvcnt--;
				continue;
			}
			releasef(sfv->sfv_fd);
		}
		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, rwflag, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif
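
/*
 * sendvec_small_chunk() handles the small-transfer case: it copies the
 * entire request into a chain of maxblk-sized mblks, each with wroff bytes
 * of headroom for protocol headers, and hands the whole chain to the
 * transport with a single kstrwritemp() call to keep latency low.
 */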
int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	wroff = (int)vp->v_stream->sd_wroff;
	tail_len = (int)vp->v_stream->sd_tail;
	extra = wroff + tail_len;

	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = kstrwritemp(vp, head, fflag);
	if (error != 0) {
		freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}

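/*
 * sendvec_chunk() handles the general case: the destination may be a
 * regular file or a socket, and the source may be user memory
 * (SFV_FD_SELF) or another file. File data is read in chunks of the
 * underlying filesystem block size; for a socket destination the data is
 * sent as it is read, and the segmap-based zero-copy path is used when it
 * is safe to map the source file directly.
 */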
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		stp = vp->v_stream;
		wroff = (int)stp->sd_wroff;
		tail_len = (int)stp->sd_tail;
		maxblk = (int)stp->sd_maxblk;
		extra = wroff + tail_len;
	}

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;

			if (vp->v_type == VSOCK) {

				/*
				 * Optimize for the socket case
				 */
				dmp = allocb(sfv_len + extra, BPRI_HI);
				if (dmp == NULL)
					return (ENOMEM);
				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
				error = uiomove((caddr_t)dmp->b_wptr,
				    sfv_len, UIO_WRITE, &auio);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				dmp->b_wptr += sfv_len;
				error = kstrwritemp(vp, dmp, fflag);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				ttolwp(curthread)->lwp_ru.ioch +=
				    (ulong_t)sfv_len;
				*count += sfv_len;
			} else {
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t *ffp;
			vnode_t *readvp;
			struct vnode *realvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

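			/*
			 * Decide how to move the file data. For a regular
			 * file destination a plain kernel buffer is used.
			 * For a socket destination, if the source file can
			 * be mapped safely (no advisory locks, a mappable
			 * vnode, and a stream marked zero-copy safe or a
			 * transport that accepts SO_SND_COPYAVOID), hand
			 * the work to snf_segmap() below, which transmits
			 * the mapped file pages without an extra copy.
			 */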
			if (vp->v_type != VSOCK) {
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				/*
				 * For sockets acting as an SSL proxy, we
				 * need to adjust the size to the maximum
				 * SSL record size set in the stream head.
				 */
				if (so->so_kssl_ctx != NULL)
					size = MIN(size, maxblk);

				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    stp->sd_copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (stp->sd_copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (SOP_SETSOCKOPT(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on)) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				boolean_t nowait;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
				    nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = kstrwritemp(vp, dmp, fflag);
					if (error != 0) {
						freeb(dmp);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

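/*
 * sendfilev() is the kernel side of the sendfilev(3EXT) and sendfile(3EXT)
 * library interfaces. It validates the destination descriptor and, for
 * sockets, the protocol and connection state, then copies in up to
 * SEND_MAX_CHUNK vector elements at a time and dispatches each batch to
 * nl7c_sendfilev(), sendvec_small_chunk() or sendvec_chunk().
 */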
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error = 0;
	int first_vector_error = 0;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		/* sendfile not supported for SCTP */
		if (so->so_protocol == IPPROTO_SCTP) {
			error = EPROTONOSUPPORT;
			goto err;
		}
		is_sock = B_TRUE;
		switch (so->so_family) {
		case AF_INET:
		case AF_INET6:
			/*
			 * Make checks similar to those done in SOP_WRITE().
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
				goto err;
			}
			if (so->so_type != SOCK_STREAM) {
				error = EOPNOTSUPP;
				goto err;
			}

			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
			    (SS_ISCONNECTED|SS_ISBOUND)) {
				error = ENOTCONN;
				goto err;
			}

			if ((so->so_state & SS_DIRECT) &&
			    (so->so_priv != NULL) &&
			    (so->so_kssl_ctx == NULL)) {
				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
			} else {
				maxblk = (int)vp->v_stream->sd_maxblk;
			}
			break;
		default:
			error = EAFNOSUPPORT;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV:
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64:
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default:
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize32_t)sfv[i].sfv_len < 0) ||
				    ((ssize32_t)total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if ((ssize32_t)total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;

		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize_t)sfv[i].sfv_len < 0) ||
				    (total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if (total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk() and
		 * sendvec_chunk() depends on several things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk, which creates
		 * maxblk-size mblks, chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for a total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data which is
		 * dominating. So it's better to use sendvec_chunk, because
		 * performance goes to the dogs if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plug into sendvec_chunk
		 * only, because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note, if the socket has NL7C enabled then call NL7C's
		 * sendfilev() function to consume the sfv[].
		 */
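		/*
		 * For example (illustrative figures, not from the original
		 * source): with a typical Ethernet-derived TCP MSS of 1460
		 * bytes, maxblk is 1460 on the fast path, so a request of up
		 * to 4 * 1460 = 5840 bytes takes the sendvec_small_chunk()
		 * path and anything larger goes through sendvec_chunk().
		 */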
		if (is_sock) {
			switch (so->so_family) {
			case AF_INET:
			case AF_INET6:
				if (so->so_nl7c_flags != 0)
					error = nl7c_sendfilev(so, &fileoff,
					    sfv, copy_cnt, &count);
				else if ((total_size <= (4 * maxblk)) &&
				    error == 0)
					error = sendvec_small_chunk(fp,
					    &fileoff, sfv, copy_cnt,
					    total_size, maxblk, &count);
				else
					error = sendvec_chunk(fp, &fileoff,
					    sfv, copy_cnt, &count);
				break;
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}

#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32)
			copy_vec = (const struct sendfilevec *)((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;

		/* Process all vector members up to first error */
	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		if (first_vector_error != 0)
			return (set_errno(first_vector_error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	if (first_vector_error != 0)
		return (set_errno(first_vector_error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}
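
/*
 * Illustrative userland view (not part of this file): the vectors consumed
 * above are built by callers of the sendfilev(3EXT) library routine roughly
 * as follows, here sending an in-memory header followed by a file over a
 * connected TCP socket ("sock", "hdr", "hlen", "fd" and "fsize" are
 * hypothetical):
 *
 *	struct sendfilevec vec[2];
 *	size_t xferred;
 *
 *	vec[0].sfv_fd = SFV_FD_SELF;
 *	vec[0].sfv_flag = 0;
 *	vec[0].sfv_off = (off_t)(uintptr_t)hdr;
 *	vec[0].sfv_len = hlen;
 *
 *	vec[1].sfv_fd = fd;
 *	vec[1].sfv_flag = 0;
 *	vec[1].sfv_off = 0;
 *	vec[1].sfv_len = fsize;
 *
 *	if (sendfilev(sock, vec, 2, &xferred) == -1)
 *		perror("sendfilev");
 */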