/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
/* swilly code in sys/socketvar.h turns off DEBUG */
#ifdef __lint
#define	DEBUG
#endif

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
	ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
	int, ssize_t *);

/*
 * kstrwritemp() has semantics very similar to those of strwrite().
 * The main difference is that it obtains its mblks from the caller
 * and does not copy from user buffers to kernel buffers the way
 * strwrite() does.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use
 * the synchronous stream interface, since that would imply copying.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
	struct stdata *stp;
	struct queue *wqp;
	mblk_t *newmp;
	char waitflag;
	int tempmode;
	int error = 0;
	int done = 0;
	struct sonode *so;
	boolean_t direct;

	ASSERT(vp->v_stream);
	stp = vp->v_stream;

	so = VTOSO(vp);
	direct = (so->so_state & SS_DIRECT);

	/*
	 * This is the sockfs direct fast path. canputnext() need
	 * not be accurate so we don't grab the sd_lock here. If
	 * we get flow-controlled, we grab sd_lock just before the
	 * do..while loop below to emulate what strwrite() does.
	 */
	wqp = stp->sd_wrq;
	if (canputnext(wqp) && direct &&
	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
		return (sostream_direct(so, NULL, mp, CRED()));
	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
		/* Fast check of flags before acquiring the lock */
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);
		if (error != 0) {
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
			}
			return (error);
		}
	}

	waitflag = WRITEWAIT;
	if (stp->sd_flag & OLDNDELAY)
		tempmode = fmode & ~FNDELAY;
	else
		tempmode = fmode;

	mutex_enter(&stp->sd_lock);
	do {
		if (canputnext(wqp)) {
			mutex_exit(&stp->sd_lock);
			if (stp->sd_wputdatafunc != NULL) {
				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
				    NULL, NULL, NULL);
				if (newmp == NULL) {
					/* The caller will free mp */
					return (ECOMM);
				}
				mp = newmp;
			}
			putnext(wqp, mp);
			return (0);
		}
		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
		    &done);
	} while (error == 0 && !done);

	mutex_exit(&stp->sd_lock);
	/*
	 * EAGAIN tells the application to try again. ENOMEM
	 * is returned only if the memory allocation size
	 * exceeds the physical limits of the system. ENOMEM
	 * can't be true here.
	 */
	if (error == ENOMEM)
		error = EAGAIN;
	return (error);
}

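/*
 * The user's sendfilevec array is consumed at most SEND_MAX_CHUNK
 * entries at a time: each chunk is copied into an on-stack array
 * before it is processed.
 */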
#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications, running on either a 64 bit
 * kernel or a 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0)
			continue;

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			int readflg = 0;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_SLEEP);

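			/*
			 * Stage the transfer through 'ptr': each pass
			 * reads up to 'size' bytes from readvp and then
			 * writes whatever was actually read out to vp,
			 * advancing sfv_off/sfv_len by the amounts
			 * transferred.
			 */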
			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

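/*
 * sendvec64() implements the SENDFILEV64 opcode for 32-bit callers.
 * It write-locks the destination vnode, copies the ksendfilevec64
 * array in at most SEND_MAX_CHUNK pieces, special-cases a single
 * regular file being sent over a socket (sosendfile64()), and hands
 * everything else to sendvec_chunk64().
 */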
ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	int rwflag;
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;
	int osfvcnt;

	rwflag = 1;
	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, rwflag, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;
	osfvcnt = sfvcnt;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		/*
		 * Optimize the single regular file over
		 * the socket case.
		 */
		if (vp->v_type == VSOCK && osfvcnt == 1 &&
		    sfv->sfv_fd != SFV_FD_SELF) {
			file_t *rfp;
			vnode_t *rvp;

			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
				error = EBADF;
				break;
			}
			if ((rfp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				error = EBADF;
				break;
			}
			rvp = rfp->f_vnode;
			if (rvp->v_type == VREG) {
				error = sosendfile64(fp, rfp, sfv, &count);
				break;
			}
			releasef(sfv->sfv_fd);
		}
		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, rwflag, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif

int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

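	/*
	 * Every mblk in the chain is allocated with sd_wroff bytes of
	 * headroom, so lower layers can prepend their headers without
	 * another allocation, plus sd_tail bytes of slack at the end.
	 */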
	wroff = (int)vp->v_stream->sd_wroff;
	tail_len = (int)vp->v_stream->sd_tail;
	extra = wroff + tail_len;

	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			int readflg = 0;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

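			/*
			 * Read straight into the current mblk at b_wptr;
			 * whenever it fills up, allocate another
			 * maxblk-sized mblk (with the same headroom) and
			 * link it onto the chain.
			 */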
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = kstrwritemp(vp, head, fflag);
	if (error != 0) {
		freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}


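/*
 * sendvec_chunk() is the general sendfilev() worker. For each vector
 * entry it either takes the data from user memory (SFV_FD_SELF) or
 * reads the source file in filesystem-blocksize pieces, and then
 * writes the data out: as mblks handed to kstrwritemp() when the
 * destination is a socket, or via VOP_WRITE() when it is a regular
 * file.
 */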
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		stp = vp->v_stream;
		wroff = (int)stp->sd_wroff;
		tail_len = (int)stp->sd_tail;
		maxblk = (int)stp->sd_maxblk;
		extra = wroff + tail_len;
	}

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;

			if (vp->v_type == VSOCK) {

				/*
				 * Optimize for the socket case
				 */

				dmp = allocb(sfv_len + extra, BPRI_HI);
				if (dmp == NULL)
					return (ENOMEM);
				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
				error = uiomove((caddr_t)dmp->b_wptr,
				    sfv_len, UIO_WRITE, &auio);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				dmp->b_wptr += sfv_len;
				error = kstrwritemp(vp, dmp, fflag);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				ttolwp(curthread)->lwp_ru.ioch +=
				    (ulong_t)sfv_len;
				*count += sfv_len;
			} else {
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			int readflg = 0;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				/*
				 * For sockets acting as an SSL proxy, we
				 * need to adjust the size to the maximum
				 * SSL record size set in the stream head.
				 */
				if (so->so_kssl_ctx != NULL)
					size = MIN(size, maxblk);
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = kstrwritemp(vp, dmp, fflag);
					if (error != 0) {
						freeb(dmp);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

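/*
 * sendfilev() is the system call entry point. It validates the
 * destination descriptor (a connected AF_INET/AF_INET6 stream socket
 * or a regular file), then consumes the user's sendfilevec array in
 * chunks of at most SEND_MAX_CHUNK entries, dispatching each chunk to
 * NL7C, to sendvec_small_chunk() for small transfers, or to
 * sendvec_chunk(). The number of bytes transferred is copied out
 * through 'xferred' and is also the return value.
 */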
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size = 0;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		/* sendfile not supported for SCTP */
		if (so->so_protocol == IPPROTO_SCTP) {
			error = EPROTONOSUPPORT;
			goto err;
		}
		is_sock = B_TRUE;
		switch (so->so_family) {
		case AF_INET:
		case AF_INET6:
			/*
			 * Make checks similar to those done in SOP_WRITE().
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
				goto err;
			}
			if (so->so_type != SOCK_STREAM) {
				error = EOPNOTSUPP;
				goto err;
			}

			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
			    (SS_ISCONNECTED|SS_ISBOUND)) {
				error = ENOTCONN;
				goto err;
			}

			if ((so->so_state & SS_DIRECT) &&
			    (so->so_priv != NULL) &&
			    (so->so_kssl_ctx == NULL)) {
				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
			} else {
				maxblk = (int)vp->v_stream->sd_maxblk;
			}
			break;
		default:
			error = EAFNOSUPPORT;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV :
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64 :
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default :
		error = ENOSYS;
		break;
	}

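	/*
	 * Hold the destination vnode's write lock for the duration of
	 * the transfer; for a regular file f_offset is advanced only
	 * once, after the loop completes.
	 */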
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
			}
		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
			}
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk and sendvec_chunk
		 * depends on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk, which creates
		 * maxblk-size mblks, chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for a total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data that
		 * dominates. So it's better to use sendvec_chunk, because
		 * performance goes to the dogs if we don't do pagesize
		 * reads. sendvec_chunk will do pagesize reads and write
		 * them in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to a file has not been optimized.
		 * Future zero copy code will plug into sendvec_chunk
		 * only, because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note: if the socket has NL7C enabled, call NL7C's
		 * sendfilev() function to consume the sfv[].
		 */
		if (is_sock) {
			switch (so->so_family) {
			case AF_INET:
			case AF_INET6:
				if (so->so_nl7c_flags != 0)
					error = nl7c_sendfilev(so, &fileoff,
					    sfv, copy_cnt, &count);
				else if (total_size <= (4 * maxblk))
					error = sendvec_small_chunk(fp,
					    &fileoff, sfv, copy_cnt,
					    total_size, maxblk, &count);
				else
					error = sendvec_chunk(fp, &fileoff,
					    sfv, copy_cnt, &count);
				break;
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32)
			copy_vec = (const struct sendfilevec *)((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;


	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}