1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/sysmacros.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/esunddi.h> 50 #include <sys/flock.h> 51 #include <sys/modctl.h> 52 #include <sys/cmn_err.h> 53 #include <sys/vmsystm.h> 54 55 #include <sys/socket.h> 56 #include <sys/socketvar.h> 57 /* swilly code in sys/socketvar.h turns off DEBUG */ 58 #ifdef __lint 59 #define DEBUG 60 #endif 61 62 #include <netinet/in.h> 63 #include <sys/sendfile.h> 64 #include <sys/un.h> 65 #include <sys/tihdr.h> 66 #include <sys/atomic.h> 67 68 #include <inet/common.h> 69 
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
    ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
    int, ssize_t *);

/*
 * kstrwritemp() has very similar semantics as that of strwrite().
 * The main difference is it obtains mblks from the caller and also
 * does not do any copy as done in strwrite() from user buffers to
 * kernel buffers.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface as synch. stream interface implies
 * copying.
 *
 * On success (return 0) ownership of 'mp' passes downstream; on a
 * non-zero return the caller still owns 'mp' and must free it.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
    struct stdata *stp;
    struct queue *wqp;
    mblk_t *newmp;
    char waitflag;
    int tempmode;
    int error = 0;
    int done = 0;
    struct sonode *so;
    boolean_t direct;

    ASSERT(vp->v_stream);
    stp = vp->v_stream;

    so = VTOSO(vp);
    direct = (so->so_state & SS_DIRECT);

    /*
     * This is the sockfs direct fast path. canputnext() need
     * not be accurate so we don't grab the sd_lock here. If
     * we get flow-controlled, we grab sd_lock just before the
     * do..while loop below to emulate what strwrite() does.
     */
    wqp = stp->sd_wrq;
    if (canputnext(wqp) && direct &&
        !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
        return (sostream_direct(so, NULL, mp, CRED()));
    } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
        /* Fast check of flags before acquiring the lock */
        mutex_enter(&stp->sd_lock);
        error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
        mutex_exit(&stp->sd_lock);
        if (error != 0) {
            if (!(stp->sd_flag & STPLEX) &&
                (stp->sd_wput_opt & SW_SIGPIPE)) {
                tsignal(curthread, SIGPIPE);
                error = EPIPE;
            }
            return (error);
        }
    }

    waitflag = WRITEWAIT;
    /* OLDNDELAY: emulate old-style blocking despite FNDELAY */
    if (stp->sd_flag & OLDNDELAY)
        tempmode = fmode & ~FNDELAY;
    else
        tempmode = fmode;

    /*
     * Flow-controlled path: wait (strwaitq) under sd_lock until the
     * stream can accept the message, then put it downstream.
     */
    mutex_enter(&stp->sd_lock);
    do {
        if (canputnext(wqp)) {
            mutex_exit(&stp->sd_lock);
            if (stp->sd_wputdatafunc != NULL) {
                newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
                    NULL, NULL, NULL);
                if (newmp == NULL) {
                    /* The caller will free mp */
                    return (ECOMM);
                }
                mp = newmp;
            }
            putnext(wqp, mp);
            return (0);
        }
        error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
            &done);
    } while (error == 0 && !done);

    mutex_exit(&stp->sd_lock);
    /*
     * EAGAIN tells the application to try again. ENOMEM
     * is returned only if the memory allocation size
     * exceeds the physical limits of the system. ENOMEM
     * can't be true here.
     */
    if (error == ENOMEM)
        error = EAGAIN;
    return (error);
}

/* Max sendfilevec entries copied in from userland per chunk */
#define SEND_MAX_CHUNK  16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
    struct vnode *vp;
    ushort_t fflag;
    int ioflag;
    size32_t cnt;
    ssize32_t sfv_len;
    ssize32_t tmpcount;
    u_offset_t sfv_off;
    struct uio auio;
    struct iovec aiov;
    int i, error;

    fflag = fp->f_flag;
    vp = fp->f_vnode;
    for (i = 0; i < copy_cnt; i++) {

        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize32_t)sfv->sfv_len;

        if (sfv_len == 0)
            continue;

        if (sfv_len < 0)
            return (EINVAL);

        if (vp->v_type == VREG) {
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);
                return (EFBIG);
            }

            if (*fileoff >= OFFSET_MAX(fp))
                return (EFBIG);

            if (*fileoff + sfv_len > OFFSET_MAX(fp))
                return (EINVAL);
        }

        /* Reject 32-bit overflow of the running transfer count */
        tmpcount = *count + sfv_len;
        if (tmpcount < 0)
            return (EINVAL);

        sfv_off = sfv->sfv_off;

        auio.uio_extflg = UIO_COPY_DEFAULT;
        if (sfv->sfv_fd == SFV_FD_SELF) {
            /*
             * SFV_FD_SELF: sfv_off is really a user-space address
             * of the data to write.
             */
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;
            ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
            while (sfv_len > 0) {
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);
                cnt = sfv_len - auio.uio_resid;
                sfv_len -= cnt;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0)
                    return (error);
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            int readflg = 0;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (VN_CMP(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /*
             * Same checks as in pread64.
             */
            if (sfv_off > MAXOFFSET_T) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            if (sfv_off + sfv_len > MAXOFFSET_T)
                sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;
            ptr = kmem_alloc(size, KM_SLEEP);

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);
                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                aiov.iov_base = ptr;
                aiov.iov_len = cnt;
                auio.uio_loffset = *fileoff;
                auio.uio_resid = cnt;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);

                /*
                 * Check how much data was written. Increment
                 * the 'len' and decrement the 'off' if all
                 * the data was not written.
                 */
                cnt -= auio.uio_resid;
                sfv_len += auio.uio_resid;
                sfv_off -= auio.uio_resid;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
            kmem_free(ptr, size);
        }
        sfv++;
    }
    return (0);
}

/*
 * sendfilev() entry point for the SENDFILEV64 opcode: copy in the
 * 64-bit vector in SEND_MAX_CHUNK pieces and hand each piece to
 * sendvec_chunk64(); single regular-file-over-socket requests are
 * short-circuited to sosendfile64().
 */
ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
    int rwflag;
    u_offset_t fileoff;
    int copy_cnt;
    const struct ksendfilevec64 *copy_vec;
    struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
    struct vnode *vp;
    int error;
    ssize32_t count = 0;
    int osfvcnt;

    rwflag = 1;
    vp = fp->f_vnode;
    (void) VOP_RWLOCK(vp, rwflag, NULL);

    copy_vec = vec;
    fileoff = fp->f_offset;
    osfvcnt = sfvcnt;

    do {
        copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
        if (copyin(copy_vec, sfv, copy_cnt *
            sizeof (struct ksendfilevec64))) {
            error = EFAULT;
            break;
        }

        /*
         * Optimize the single regular file over
         * the socket case.
440 */ 441 if (vp->v_type == VSOCK && osfvcnt == 1 && 442 sfv->sfv_fd != SFV_FD_SELF) { 443 file_t *rfp; 444 vnode_t *rvp; 445 446 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 447 error = EBADF; 448 break; 449 } 450 if ((rfp->f_flag & FREAD) == 0) { 451 releasef(sfv->sfv_fd); 452 error = EBADF; 453 break; 454 } 455 rvp = rfp->f_vnode; 456 if (rvp->v_type == VREG) { 457 error = sosendfile64(fp, rfp, sfv, &count); 458 break; 459 } 460 releasef(sfv->sfv_fd); 461 } 462 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 463 if (error != 0) 464 break; 465 466 copy_vec += copy_cnt; 467 sfvcnt -= copy_cnt; 468 } while (sfvcnt > 0); 469 470 if (vp->v_type == VREG) 471 fp->f_offset += count; 472 473 VOP_RWUNLOCK(vp, rwflag, NULL); 474 if (copyout(&count, xferred, sizeof (count))) 475 error = EFAULT; 476 releasef(fildes); 477 if (error != 0) 478 return (set_errno(error)); 479 return (count); 480 } 481 #endif 482 483 int 484 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 485 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 486 { 487 struct vnode *vp; 488 struct uio auio; 489 struct iovec aiov; 490 ushort_t fflag; 491 int ioflag; 492 int i, error; 493 size_t cnt; 494 ssize_t sfv_len; 495 u_offset_t sfv_off; 496 #ifdef _SYSCALL32_IMPL 497 model_t model = get_udatamodel(); 498 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 
499 MAXOFF32_T : MAXOFFSET_T; 500 #else 501 const u_offset_t maxoff = MAXOFF32_T; 502 #endif 503 mblk_t *dmp = NULL; 504 int wroff; 505 int buf_left = 0; 506 size_t iov_len; 507 mblk_t *head, *tmp; 508 size_t size = total_size; 509 size_t extra; 510 int tail_len; 511 512 fflag = fp->f_flag; 513 vp = fp->f_vnode; 514 515 ASSERT(vp->v_type == VSOCK); 516 ASSERT(maxblk > 0); 517 518 wroff = (int)vp->v_stream->sd_wroff; 519 tail_len = (int)vp->v_stream->sd_tail; 520 extra = wroff + tail_len; 521 522 buf_left = MIN(total_size, maxblk); 523 head = dmp = allocb(buf_left + extra, BPRI_HI); 524 if (head == NULL) 525 return (ENOMEM); 526 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 527 528 auio.uio_extflg = UIO_COPY_DEFAULT; 529 for (i = 0; i < copy_cnt; i++) { 530 if (ISSIG(curthread, JUSTLOOKING)) 531 return (EINTR); 532 533 /* 534 * Do similar checks as "write" as we are writing 535 * sfv_len bytes into "vp". 536 */ 537 sfv_len = (ssize_t)sfv->sfv_len; 538 539 if (sfv_len == 0) { 540 sfv++; 541 continue; 542 } 543 544 /* Make sure sfv_len is not negative */ 545 #ifdef _SYSCALL32_IMPL 546 if (model == DATAMODEL_ILP32) { 547 if ((ssize32_t)sfv_len < 0) 548 return (EINVAL); 549 } else 550 #endif 551 if (sfv_len < 0) 552 return (EINVAL); 553 554 /* Check for overflow */ 555 #ifdef _SYSCALL32_IMPL 556 if (model == DATAMODEL_ILP32) { 557 if (((ssize32_t)(*count + sfv_len)) < 0) 558 return (EINVAL); 559 } else 560 #endif 561 if ((*count + sfv_len) < 0) 562 return (EINVAL); 563 564 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 565 566 if (sfv->sfv_fd == SFV_FD_SELF) { 567 while (sfv_len > 0) { 568 if (buf_left == 0) { 569 tmp = dmp; 570 buf_left = MIN(total_size, maxblk); 571 iov_len = MIN(buf_left, sfv_len); 572 dmp = allocb(buf_left + extra, BPRI_HI); 573 if (dmp == NULL) { 574 freemsg(head); 575 return (ENOMEM); 576 } 577 dmp->b_wptr = dmp->b_rptr = 578 dmp->b_rptr + wroff; 579 tmp->b_cont = dmp; 580 } else { 581 iov_len = MIN(buf_left, sfv_len); 582 } 583 584 
aiov.iov_len = iov_len; 585 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 586 auio.uio_loffset = *fileoff; 587 auio.uio_iovcnt = 1; 588 auio.uio_resid = iov_len; 589 auio.uio_iov = &aiov; 590 auio.uio_segflg = UIO_USERSPACE; 591 auio.uio_llimit = curproc->p_fsz_ctl; 592 auio.uio_fmode = fflag; 593 594 buf_left -= iov_len; 595 total_size -= iov_len; 596 sfv_len -= iov_len; 597 sfv_off += iov_len; 598 599 error = uiomove((caddr_t)dmp->b_wptr, 600 iov_len, UIO_WRITE, &auio); 601 if (error != 0) { 602 freemsg(head); 603 return (error); 604 } 605 dmp->b_wptr += iov_len; 606 } 607 } else { 608 file_t *ffp; 609 vnode_t *readvp; 610 int readflg = 0; 611 612 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 613 freemsg(head); 614 return (EBADF); 615 } 616 617 if ((ffp->f_flag & FREAD) == 0) { 618 releasef(sfv->sfv_fd); 619 freemsg(head); 620 return (EACCES); 621 } 622 623 readvp = ffp->f_vnode; 624 if (readvp->v_type != VREG) { 625 releasef(sfv->sfv_fd); 626 freemsg(head); 627 return (EINVAL); 628 } 629 630 /* 631 * No point reading and writing to same vp, 632 * as long as both are regular files. readvp is not 633 * locked; but since we got it from an open file the 634 * contents will be valid during the time of access. 635 */ 636 637 if (VN_CMP(vp, readvp)) { 638 releasef(sfv->sfv_fd); 639 freemsg(head); 640 return (EINVAL); 641 } 642 643 /* 644 * Note: we assume readvp != vp. "vp" is already 645 * locked, and "readvp" must not be. 
646 */ 647 648 (void) VOP_RWLOCK(readvp, readflg, NULL); 649 650 /* Same checks as in pread */ 651 if (sfv_off > maxoff) { 652 VOP_RWUNLOCK(readvp, readflg, NULL); 653 releasef(sfv->sfv_fd); 654 freemsg(head); 655 return (EINVAL); 656 } 657 if (sfv_off + sfv_len > maxoff) { 658 total_size -= (sfv_off + sfv_len - maxoff); 659 sfv_len = (ssize_t)((offset_t)maxoff - 660 sfv_off); 661 } 662 663 while (sfv_len > 0) { 664 if (buf_left == 0) { 665 tmp = dmp; 666 buf_left = MIN(total_size, maxblk); 667 iov_len = MIN(buf_left, sfv_len); 668 dmp = allocb(buf_left + extra, BPRI_HI); 669 if (dmp == NULL) { 670 VOP_RWUNLOCK(readvp, readflg, 671 NULL); 672 releasef(sfv->sfv_fd); 673 freemsg(head); 674 return (ENOMEM); 675 } 676 dmp->b_wptr = dmp->b_rptr = 677 dmp->b_rptr + wroff; 678 tmp->b_cont = dmp; 679 } else { 680 iov_len = MIN(buf_left, sfv_len); 681 } 682 aiov.iov_base = (caddr_t)dmp->b_wptr; 683 aiov.iov_len = iov_len; 684 auio.uio_loffset = sfv_off; 685 auio.uio_iov = &aiov; 686 auio.uio_iovcnt = 1; 687 auio.uio_resid = iov_len; 688 auio.uio_segflg = UIO_SYSSPACE; 689 auio.uio_llimit = MAXOFFSET_T; 690 auio.uio_fmode = ffp->f_flag; 691 ioflag = auio.uio_fmode & 692 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 693 694 /* 695 * If read sync is not asked for, 696 * filter sync flags 697 */ 698 if ((ioflag & FRSYNC) == 0) 699 ioflag &= ~(FSYNC|FDSYNC); 700 error = VOP_READ(readvp, &auio, ioflag, 701 fp->f_cred, NULL); 702 if (error != 0) { 703 /* 704 * If we were reading a pipe (currently 705 * not implemented), we may now loose 706 * data. 707 */ 708 VOP_RWUNLOCK(readvp, readflg, NULL); 709 releasef(sfv->sfv_fd); 710 freemsg(head); 711 return (error); 712 } 713 714 /* 715 * Check how much data was really read. 716 * Decrement the 'len' and increment the 717 * 'off' appropriately. 
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    freemsg(head);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;
                total_size -= cnt;
                buf_left -= cnt;

                dmp->b_wptr += cnt;
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
        }
        sfv++;
    }

    ASSERT(total_size == 0);
    /* Hand the whole chain to the transport in one shot */
    error = kstrwritemp(vp, head, fflag);
    if (error != 0) {
        freemsg(head);
        return (error);
    }
    ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
    *count += size;

    return (0);
}


/*
 * General sendfilev() worker: process one copied-in chunk of the
 * vector, reading source data in native-blocksize pieces and writing
 * it to the target vnode (VOP_WRITE for files, kstrwritemp() for
 * sockets, one mblk per piece).
 */
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    char *buf = NULL;
    size_t extra;
    int maxblk, wroff, tail_len;
    struct sonode *so;
    stdata_t *stp;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    /* so/stp/wroff/maxblk/extra are only used on the VSOCK paths below */
    if (vp->v_type == VSOCK) {
        so = VTOSO(vp);
        stp = vp->v_stream;
        wroff = (int)stp->sd_wroff;
        tail_len = (int)stp->sd_tail;
        maxblk = (int)stp->sd_maxblk;
        extra = wroff + tail_len;
    }

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        /* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if ((ssize32_t)sfv_len < 0)
                return (EINVAL);
        } else
#endif
        if (sfv_len < 0)
            return (EINVAL);

        if (vp->v_type == VREG) {
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);

                return (EFBIG);
            }

            if (*fileoff >= maxoff)
                return (EFBIG);

            if (*fileoff + sfv_len > maxoff)
                return (EINVAL);
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0)
                return (EINVAL);
        } else
#endif
        if ((*count + sfv_len) < 0)
            return (EINVAL);

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            /* sfv_off is really a user-space address of the data */
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;

            if (vp->v_type == VSOCK) {

                /*
                 * Optimize for the socket case
                 */

                dmp = allocb(sfv_len + extra, BPRI_HI);
                if (dmp == NULL)
                    return (ENOMEM);
                dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
                error = uiomove((caddr_t)dmp->b_wptr,
                    sfv_len, UIO_WRITE, &auio);
                if (error != 0) {
                    freeb(dmp);
                    return (error);
                }
                dmp->b_wptr += sfv_len;
                error = kstrwritemp(vp, dmp, fflag);
                if (error != 0) {
                    freeb(dmp);
                    return (error);
                }
                ttolwp(curthread)->lwp_ru.ioch +=
                    (ulong_t)sfv_len;
                *count += sfv_len;
            } else {
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                while (sfv_len > 0) {
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);
                    cnt = sfv_len - auio.uio_resid;
                    sfv_len -= cnt;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0)
                        return (error);
                }
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            int readflg = 0;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (VN_CMP(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /* Same checks as in pread */
            if (sfv_off > maxoff) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }
            if (sfv_off + sfv_len > maxoff) {
                sfv_len = (ssize_t)((offset_t)maxoff -
                    sfv_off);
            }
            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;

            if (vp->v_type != VSOCK) {
                buf = kmem_alloc(size, KM_NOSLEEP);
                if (buf == NULL) {
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (ENOMEM);
                }
            } else {
                /*
                 * For sockets acting as an SSL proxy, we
                 * need to adjust the size to the maximum
                 * SSL record size set in the stream head.
                 */
                if (so->so_kssl_ctx != NULL)
                    size = MIN(size, maxblk);
            }

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);

                if (vp->v_type == VSOCK) {
                    /* Read straight into a fresh mblk */
                    dmp = allocb(iov_len + extra, BPRI_HI);
                    if (dmp == NULL) {
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    ptr = (caddr_t)dmp->b_rptr;
                } else {
                    ptr = buf;
                }

                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error != 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                if (vp->v_type == VSOCK) {
                    dmp->b_wptr = dmp->b_rptr + cnt;

                    error = kstrwritemp(vp, dmp, fflag);
                    if (error != 0) {
                        freeb(dmp);
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (error);
                    }

                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *count += cnt;
                } else {

                    aiov.iov_base = ptr;
                    aiov.iov_len = cnt;
                    auio.uio_loffset = *fileoff;
                    auio.uio_resid = cnt;
                    auio.uio_segflg = UIO_SYSSPACE;
                    auio.uio_llimit = curproc->p_fsz_ctl;
                    auio.uio_fmode = fflag;
                    ioflag = auio.uio_fmode &
                        (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);

                    /*
                     * Check how much data was written.
                     * Increment the 'len' and decrement the
                     * 'off' if all the data was not
                     * written.
1076 */ 1077 cnt -= auio.uio_resid; 1078 sfv_len += auio.uio_resid; 1079 sfv_off -= auio.uio_resid; 1080 ttolwp(curthread)->lwp_ru.ioch += 1081 (ulong_t)cnt; 1082 *fileoff += cnt; 1083 *count += cnt; 1084 if (error != 0) { 1085 kmem_free(buf, size); 1086 VOP_RWUNLOCK(readvp, readflg, 1087 NULL); 1088 releasef(sfv->sfv_fd); 1089 return (error); 1090 } 1091 } 1092 } 1093 if (buf) { 1094 kmem_free(buf, size); 1095 buf = NULL; 1096 } 1097 VOP_RWUNLOCK(readvp, readflg, NULL); 1098 releasef(sfv->sfv_fd); 1099 } 1100 sfv++; 1101 } 1102 return (0); 1103 } 1104 1105 ssize_t 1106 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1107 size_t *xferred) 1108 { 1109 int error; 1110 file_t *fp; 1111 struct vnode *vp; 1112 struct sonode *so; 1113 u_offset_t fileoff; 1114 int copy_cnt; 1115 const struct sendfilevec *copy_vec; 1116 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1117 ssize_t count = 0; 1118 #ifdef _SYSCALL32_IMPL 1119 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1120 #endif 1121 ssize_t total_size; 1122 int i; 1123 boolean_t is_sock = B_FALSE; 1124 int maxblk = 0; 1125 1126 if (sfvcnt <= 0) 1127 return (set_errno(EINVAL)); 1128 1129 if ((fp = getf(fildes)) == NULL) 1130 return (set_errno(EBADF)); 1131 1132 if (((fp->f_flag) & FWRITE) == 0) { 1133 error = EBADF; 1134 goto err; 1135 } 1136 1137 fileoff = fp->f_offset; 1138 vp = fp->f_vnode; 1139 1140 switch (vp->v_type) { 1141 case VSOCK: 1142 so = VTOSO(vp); 1143 /* sendfile not supported for SCTP */ 1144 if (so->so_protocol == IPPROTO_SCTP) { 1145 error = EPROTONOSUPPORT; 1146 goto err; 1147 } 1148 is_sock = B_TRUE; 1149 switch (so->so_family) { 1150 case AF_INET: 1151 case AF_INET6: 1152 /* 1153 * Make similar checks done in SOP_WRITE(). 
1154 */ 1155 if (so->so_state & SS_CANTSENDMORE) { 1156 tsignal(curthread, SIGPIPE); 1157 error = EPIPE; 1158 goto err; 1159 } 1160 if (so->so_type != SOCK_STREAM) { 1161 error = EOPNOTSUPP; 1162 goto err; 1163 } 1164 1165 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1166 (SS_ISCONNECTED|SS_ISBOUND)) { 1167 error = ENOTCONN; 1168 goto err; 1169 } 1170 1171 if ((so->so_state & SS_DIRECT) && 1172 (so->so_priv != NULL) && 1173 (so->so_kssl_ctx == NULL)) { 1174 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1175 } else { 1176 maxblk = (int)vp->v_stream->sd_maxblk; 1177 } 1178 break; 1179 default: 1180 error = EAFNOSUPPORT; 1181 goto err; 1182 } 1183 break; 1184 case VREG: 1185 break; 1186 default: 1187 error = EINVAL; 1188 goto err; 1189 } 1190 1191 switch (opcode) { 1192 case SENDFILEV : 1193 break; 1194 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1195 case SENDFILEV64 : 1196 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1197 (size32_t *)xferred, fildes)); 1198 #endif 1199 default : 1200 error = ENOSYS; 1201 break; 1202 } 1203 1204 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1205 copy_vec = vec; 1206 1207 do { 1208 total_size = 0; 1209 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1210 #ifdef _SYSCALL32_IMPL 1211 /* 32-bit callers need to have their iovec expanded. 
*/ 1212 if (get_udatamodel() == DATAMODEL_ILP32) { 1213 if (copyin(copy_vec, sfv32, 1214 copy_cnt * sizeof (ksendfilevec32_t))) { 1215 error = EFAULT; 1216 break; 1217 } 1218 1219 for (i = 0; i < copy_cnt; i++) { 1220 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1221 sfv[i].sfv_off = 1222 (off_t)(uint32_t)sfv32[i].sfv_off; 1223 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1224 total_size += sfv[i].sfv_len; 1225 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1226 } 1227 } else { 1228 #endif 1229 if (copyin(copy_vec, sfv, 1230 copy_cnt * sizeof (sendfilevec_t))) { 1231 error = EFAULT; 1232 break; 1233 } 1234 1235 for (i = 0; i < copy_cnt; i++) { 1236 total_size += sfv[i].sfv_len; 1237 } 1238 #ifdef _SYSCALL32_IMPL 1239 } 1240 #endif 1241 1242 /* 1243 * The task between deciding to use sendvec_small_chunk 1244 * and sendvec_chunk is dependant on multiple things: 1245 * 1246 * i) latency is important for smaller files. So if the 1247 * data is smaller than 'tcp_slow_start_initial' times 1248 * maxblk, then use sendvec_small_chunk which creates 1249 * maxblk size mblks and chains then together and sends 1250 * them to TCP in one shot. It also leaves 'wroff' size 1251 * space for the headers in each mblk. 1252 * 1253 * ii) for total size bigger than 'tcp_slow_start_initial' 1254 * time maxblk, its probably real file data which is 1255 * dominating. So its better to use sendvec_chunk because 1256 * performance goes to dog if we don't do pagesize reads. 1257 * sendvec_chunk will do pagesize reads and write them 1258 * in pagesize mblks to TCP. 1259 * 1260 * Side Notes: A write to file has not been optimized. 1261 * Future zero copy code will plugin into sendvec_chunk 1262 * only because doing zero copy for files smaller then 1263 * pagesize is useless. 1264 * 1265 * Note, if socket has NL7C enabled then call NL7C's 1266 * senfilev() function to consume the sfv[]. 
1267 */ 1268 if (is_sock) { 1269 switch (so->so_family) { 1270 case AF_INET: 1271 case AF_INET6: 1272 if (so->so_nl7c_flags != 0) 1273 error = nl7c_sendfilev(so, &fileoff, 1274 sfv, copy_cnt, &count); 1275 else if (total_size <= (4 * maxblk)) 1276 error = sendvec_small_chunk(fp, 1277 &fileoff, sfv, copy_cnt, 1278 total_size, maxblk, &count); 1279 else 1280 error = sendvec_chunk(fp, &fileoff, 1281 sfv, copy_cnt, &count); 1282 break; 1283 } 1284 } else { 1285 ASSERT(vp->v_type == VREG); 1286 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1287 &count); 1288 } 1289 1290 1291 #ifdef _SYSCALL32_IMPL 1292 if (get_udatamodel() == DATAMODEL_ILP32) 1293 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1294 (copy_cnt * sizeof (ksendfilevec32_t))); 1295 else 1296 #endif 1297 copy_vec += copy_cnt; 1298 sfvcnt -= copy_cnt; 1299 } while (sfvcnt > 0); 1300 1301 if (vp->v_type == VREG) 1302 fp->f_offset += count; 1303 1304 1305 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1306 1307 #ifdef _SYSCALL32_IMPL 1308 if (get_udatamodel() == DATAMODEL_ILP32) { 1309 ssize32_t count32 = (ssize32_t)count; 1310 if (copyout(&count32, xferred, sizeof (count32))) 1311 error = EFAULT; 1312 releasef(fildes); 1313 if (error != 0) 1314 return (set_errno(error)); 1315 return (count32); 1316 } 1317 #endif 1318 if (copyout(&count, xferred, sizeof (count))) 1319 error = EFAULT; 1320 releasef(fildes); 1321 if (error != 0) 1322 return (set_errno(error)); 1323 return (count); 1324 err: 1325 ASSERT(error != 0); 1326 releasef(fildes); 1327 return (set_errno(error)); 1328 } 1329