1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/sysmacros.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/esunddi.h> 50 #include <sys/flock.h> 51 #include <sys/modctl.h> 52 #include <sys/cmn_err.h> 53 #include <sys/vmsystm.h> 54 55 #include <sys/socket.h> 56 #include <sys/socketvar.h> 57 #include <netinet/in.h> 58 #include <sys/sendfile.h> 59 #include <sys/un.h> 60 #include <sys/tihdr.h> 61 #include <sys/atomic.h> 62 63 #include <inet/common.h> 64 #include <inet/ip.h> 65 #include <inet/ip6.h> 66 #include <inet/tcp.h> 67 68 extern int 
sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
    ssize32_t *);
extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
    int);

/*
 * kstrwritemp() has very similar semantics as that of strwrite().
 * The main difference is it obtains mblks from the caller and also
 * does not do any copy as done in strwrite() from user buffers to
 * kernel buffers.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface as synch. stream interface implies
 * copying.
 *
 * Arguments:
 *   vp    - socket vnode; must have a stream head (asserted below).
 *   mp    - caller-allocated mblk chain to be sent downstream.  On
 *           success ownership passes to the stream (putnext) or to
 *           sostream_direct(); on failure the caller still owns mp
 *           and must free it.
 *   fmode - file mode flags; FNDELAY is honored unless OLDNDELAY
 *           semantics are in effect on the stream.
 *
 * Returns 0 on success, or an errno (EPIPE, ECOMM, EAGAIN, ...).
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
    struct stdata *stp;
    struct queue *wqp;
    mblk_t *newmp;
    char waitflag;
    int tempmode;
    int error = 0;
    int done = 0;
    struct sonode *so;
    boolean_t direct;

    ASSERT(vp->v_stream);
    stp = vp->v_stream;

    so = VTOSO(vp);
    direct = (so->so_state & SS_DIRECT);

    /*
     * This is the sockfs direct fast path. canputnext() need
     * not be accurate so we don't grab the sd_lock here. If
     * we get flow-controlled, we grab sd_lock just before the
     * do..while loop below to emulate what strwrite() does.
     */
    wqp = stp->sd_wrq;
    if (canputnext(wqp) && direct &&
        !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
        return (sostream_direct(so, NULL, mp, CRED()));
    } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
        /* Fast check of flags before acquiring the lock */
        mutex_enter(&stp->sd_lock);
        error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
        mutex_exit(&stp->sd_lock);
        if (error != 0) {
            /*
             * For a write-side error on a non-multiplexed stream
             * with SW_SIGPIPE set, deliver SIGPIPE and report
             * EPIPE, matching write(2) semantics on a broken pipe.
             */
            if (!(stp->sd_flag & STPLEX) &&
                (stp->sd_wput_opt & SW_SIGPIPE)) {
                tsignal(curthread, SIGPIPE);
                error = EPIPE;
            }
            return (error);
        }
    }

    waitflag = WRITEWAIT;
    /* Old-style NDELAY streams ignore FNDELAY for writes. */
    if (stp->sd_flag & OLDNDELAY)
        tempmode = fmode & ~FNDELAY;
    else
        tempmode = fmode;

    /*
     * Flow-controlled (or non-direct) path: hold sd_lock and
     * alternate between trying to put the message and sleeping
     * in strwaitq(), as strwrite() does.
     */
    mutex_enter(&stp->sd_lock);
    do {
        if (canputnext(wqp)) {
            mutex_exit(&stp->sd_lock);
            if (stp->sd_wputdatafunc != NULL) {
                /*
                 * Stream head write-side data hook (e.g. for
                 * data filtering); it may substitute a new mblk.
                 */
                newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
                    NULL, NULL, NULL);
                if (newmp == NULL) {
                    /* The caller will free mp */
                    return (ECOMM);
                }
                mp = newmp;
            }
            putnext(wqp, mp);
            return (0);
        }
        error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
            &done);
    } while (error == 0 && !done);

    mutex_exit(&stp->sd_lock);
    /*
     * EAGAIN tells the application to try again. ENOMEM
     * is returned only if the memory allocation size
     * exceeds the physical limits of the system. ENOMEM
     * can't be true here.
     */
    if (error == ENOMEM)
        error = EAGAIN;
    return (error);
}

/* Number of sendfilevec entries copied in from userland per batch. */
#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
/*
 * Process one batch of up to SEND_MAX_CHUNK ksendfilevec64 entries,
 * writing each entry's data into "fp" (already write-locked by the
 * caller).  SFV_FD_SELF entries are written directly from the user
 * address in sfv_off; other entries are read from the named regular
 * file into a kernel bounce buffer and then written out.
 *
 * *fileoff and *count are updated in place with the write offset and
 * total bytes transferred.  Returns 0 or an errno.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
    struct vnode *vp;
    ushort_t fflag;
    int ioflag;
    size32_t cnt;
    ssize32_t sfv_len;
    ssize32_t tmpcount;
    u_offset_t sfv_off;
    struct uio auio;
    struct iovec aiov;
    int i, error;

    fflag = fp->f_flag;
    vp = fp->f_vnode;
    for (i = 0; i < copy_cnt; i++) {

        /* Allow pending signals to interrupt a long transfer. */
        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize32_t)sfv->sfv_len;

        if (sfv_len == 0)
            continue;

        if (sfv_len < 0)
            return (EINVAL);

        if (vp->v_type == VREG) {
            /* Enforce RLIMIT_FSIZE before growing the file. */
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);
                return (EFBIG);
            }

            if (*fileoff >= OFFSET_MAX(fp))
                return (EFBIG);

            if (*fileoff + sfv_len > OFFSET_MAX(fp))
                return (EINVAL);
        }

        /* Guard against signed overflow of the running byte count. */
        tmpcount = *count + sfv_len;
        if (tmpcount < 0)
            return (EINVAL);

        sfv_off = sfv->sfv_off;

        auio.uio_extflg = UIO_COPY_DEFAULT;
        if (sfv->sfv_fd == SFV_FD_SELF) {
            /*
             * SFV_FD_SELF: sfv_off is a user virtual address;
             * write directly from user space.
             */
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;
            ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
            while (sfv_len > 0) {
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);
                /* Account for a possible partial write. */
                cnt = sfv_len - auio.uio_resid;
                sfv_len -= cnt;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0)
                    return (error);
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            int readflg = 0;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (VN_CMP(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /*
             * Same checks as in pread64.
             */
            if (sfv_off > MAXOFFSET_T) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /* Clamp the transfer so it ends at MAXOFFSET_T. */
            if (sfv_off + sfv_len > MAXOFFSET_T)
                sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;
            ptr = kmem_alloc(size, KM_SLEEP);

            /*
             * Copy loop: read up to "size" bytes from readvp
             * into the bounce buffer, then write them to vp.
             * Every error path below must free ptr, drop the
             * read lock and releasef() in that order.
             */
            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);
                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                /* Reuse auio/aiov to write what was read. */
                aiov.iov_base = ptr;
                aiov.iov_len = cnt;
                auio.uio_loffset = *fileoff;
                auio.uio_resid = cnt;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);

                /*
                 * Check how much data was written. Increment
                 * the 'len' and decrement the 'off' if all
                 * the data was not written.
                 */
                cnt -= auio.uio_resid;
                sfv_len += auio.uio_resid;
                sfv_off -= auio.uio_resid;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
            kmem_free(ptr, size);
        }
        sfv++;
    }
    return (0);
}

/*
 * SENDFILEV64 entry point for 32-bit callers: copies in batches of
 * ksendfilevec64 entries and dispatches to sendvec_chunk64(), with a
 * special-cased fast path (sosendfile64) for the common "one regular
 * file to a socket" case.  Writes the transferred byte count back to
 * the user's *xferred and returns the count, or sets errno.
 */
ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
    int rwflag;
    u_offset_t fileoff;
    int copy_cnt;
    const struct ksendfilevec64 *copy_vec;
    struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
    struct vnode *vp;
    int error;
    ssize32_t count = 0;
    int osfvcnt;

    rwflag = 1;
    vp = fp->f_vnode;
    /* Serialize writers on the destination vnode. */
    (void) VOP_RWLOCK(vp, rwflag, NULL);

    copy_vec = vec;
    fileoff = fp->f_offset;
    osfvcnt = sfvcnt;

    do {
        copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
        if (copyin(copy_vec, sfv, copy_cnt *
            sizeof (struct ksendfilevec64))) {
            error = EFAULT;
            break;
        }

        /*
         * Optimize the single regular file over
         * the socket case.
         */
        if (vp->v_type == VSOCK && osfvcnt == 1 &&
            sfv->sfv_fd != SFV_FD_SELF) {
            file_t *rfp;
            vnode_t *rvp;

            if ((rfp = getf(sfv->sfv_fd)) == NULL) {
                error = EBADF;
                break;
            }
            if ((rfp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                error = EBADF;
                break;
            }
            rvp = rfp->f_vnode;
            if (rvp->v_type == VREG) {
                /*
                 * NOTE(review): presumably sosendfile64()
                 * takes over rfp and does the releasef()
                 * itself on this path -- confirm against
                 * its implementation.
                 */
                error = sosendfile64(fp, rfp, sfv, &count);
                break;
            }
            releasef(sfv->sfv_fd);
        }
        error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
        if (error != 0)
            break;

        copy_vec += copy_cnt;
        sfvcnt -= copy_cnt;
    } while (sfvcnt > 0);

    /* For regular files, advance the file offset by what was sent. */
    if (vp->v_type == VREG)
        fp->f_offset += count;

    VOP_RWUNLOCK(vp, rwflag, NULL);
    if (copyout(&count, xferred, sizeof (count)))
        error = EFAULT;
    releasef(fildes);
    if (error != 0)
        return (set_errno(error));
    return (count);
}
#endif

/*
 * Latency-optimized path for small total transfers to a socket: the
 * whole batch (total_size bytes) is packed into a chain of maxblk-size
 * mblks (with sd_wroff headroom and sd_tail tailroom reserved in each)
 * and handed to the transport in a single kstrwritemp() call.
 *
 * Caller guarantees vp is a socket (asserted) and that total_size is
 * the sum of the copy_cnt entries' lengths (asserted consumed to 0).
 *
 * NOTE(review): the early returns for EINTR and the negative-length /
 * overflow EINVAL checks below return without freemsg(head), which
 * looks like it leaks the already-allocated mblk chain -- confirm and
 * fix upstream if so.
 */
int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    int wroff;
    int buf_left = 0;
    size_t iov_len;
    mblk_t *head, *tmp;
    size_t size = total_size;
    size_t extra;
    int tail_len;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    ASSERT(vp->v_type == VSOCK);
    ASSERT(maxblk > 0);

    /* Reserve stream-head headroom and tailroom in each mblk. */
    wroff = (int)vp->v_stream->sd_wroff;
    tail_len = (int)vp->v_stream->sd_tail;
    extra = wroff + tail_len;

    buf_left = MIN(total_size, maxblk);
    head = dmp = allocb(buf_left + extra, BPRI_HI);
    if (head == NULL)
        return (ENOMEM);
    head->b_wptr = head->b_rptr = head->b_rptr + wroff;

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        /* NOTE(review): returns without freemsg(head) -- leak? */
        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        /* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if ((ssize32_t)sfv_len < 0)
                return (EINVAL);
        } else
#endif
        if (sfv_len < 0)
            return (EINVAL);

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0)
                return (EINVAL);
        } else
#endif
        if ((*count + sfv_len) < 0)
            return (EINVAL);

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            /*
             * User data: uiomove() it from the user address in
             * sfv_off into the mblk chain, allocating a new
             * mblk each time the current one fills up.
             */
            while (sfv_len > 0) {
                if (buf_left == 0) {
                    tmp = dmp;
                    buf_left = MIN(total_size, maxblk);
                    iov_len = MIN(buf_left, sfv_len);
                    dmp = allocb(buf_left + extra, BPRI_HI);
                    if (dmp == NULL) {
                        freemsg(head);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    tmp->b_cont = dmp;
                } else {
                    iov_len = MIN(buf_left, sfv_len);
                }

                aiov.iov_len = iov_len;
                aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
                auio.uio_loffset = *fileoff;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_iov = &aiov;
                auio.uio_segflg = UIO_USERSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;

                buf_left -= iov_len;
                total_size -= iov_len;
                sfv_len -= iov_len;
                sfv_off += iov_len;

                error = uiomove((caddr_t)dmp->b_wptr,
                    iov_len, UIO_WRITE, &auio);
                if (error != 0) {
                    freemsg(head);
                    return (error);
                }
                dmp->b_wptr += iov_len;
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            int readflg = 0;

            if ((ffp = getf(sfv->sfv_fd)) == NULL) {
                freemsg(head);
                return (EBADF);
            }

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EACCES);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */

            if (VN_CMP(vp, readvp)) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */

            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /* Same checks as in pread */
            if (sfv_off > maxoff) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }
            if (sfv_off + sfv_len > maxoff) {
                sfv_len = (ssize_t)((offset_t)maxoff -
                    sfv_off);
            }

            /*
             * File data: VOP_READ() it straight into the mblk
             * chain, growing the chain as needed.
             */
            while (sfv_len > 0) {
                if (buf_left == 0) {
                    tmp = dmp;
                    buf_left = MIN(total_size, maxblk);
                    iov_len = MIN(buf_left, sfv_len);
                    dmp = allocb(buf_left + extra, BPRI_HI);
                    if (dmp == NULL) {
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        freemsg(head);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    tmp->b_cont = dmp;
                } else {
                    iov_len = MIN(buf_left, sfv_len);
                }
                aiov.iov_base = (caddr_t)dmp->b_wptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error != 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    freemsg(head);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /* Premature EOF: vec described more
                     * data than the file holds. */
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    freemsg(head);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;
                total_size -= cnt;
                buf_left -= cnt;

                dmp->b_wptr += cnt;
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
        }
        sfv++;
    }

    /* The whole batch must now be packed into the chain. */
    ASSERT(total_size == 0);
    error = kstrwritemp(vp, head, fflag);
    if (error != 0) {
        freemsg(head);
        return (error);
    }
    ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
    *count += size;

    return (0);
}


/*
 * General-purpose batch processor for sendfilev(): handles both socket
 * and regular-file destinations, SFV_FD_SELF (user memory) and
 * file-descriptor source entries.  For sockets, data is sent one mblk
 * at a time via kstrwritemp(); for files it goes through VOP_WRITE().
 * Updates *fileoff and *count in place; returns 0 or an errno.
 */
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    char *buf = NULL;
    size_t extra;
    int maxblk, wroff, tail_len;
    struct sonode *so;
    stdata_t *stp;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    /* Socket destinations need stream-head geometry for allocb(). */
    if (vp->v_type == VSOCK) {
        so = VTOSO(vp);
        stp = vp->v_stream;
        wroff = (int)stp->sd_wroff;
        tail_len = (int)stp->sd_tail;
        maxblk = (int)stp->sd_maxblk;
        extra = wroff + tail_len;
    }

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do similar checks as "write" as we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        /* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if ((ssize32_t)sfv_len < 0)
                return (EINVAL);
        } else
#endif
        if (sfv_len < 0)
            return (EINVAL);

        if (vp->v_type == VREG) {
            /* Enforce RLIMIT_FSIZE before growing the file. */
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);

                return (EFBIG);
            }

            if (*fileoff >= maxoff)
                return (EFBIG);

            if (*fileoff + sfv_len > maxoff)
                return (EINVAL);
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0)
                return (EINVAL);
        } else
#endif
        if ((*count + sfv_len) < 0)
            return (EINVAL);

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;

            if (vp->v_type == VSOCK) {

                /*
                 * Optimize for the socket case
                 */

                dmp = allocb(sfv_len + extra, BPRI_HI);
                if (dmp == NULL)
                    return (ENOMEM);
                dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
                error = uiomove((caddr_t)dmp->b_wptr,
                    sfv_len, UIO_WRITE, &auio);
                if (error != 0) {
                    freeb(dmp);
                    return (error);
                }
                dmp->b_wptr += sfv_len;
                error = kstrwritemp(vp, dmp, fflag);
                if (error != 0) {
                    freeb(dmp);
                    return (error);
                }
                ttolwp(curthread)->lwp_ru.ioch +=
                    (ulong_t)sfv_len;
                *count += sfv_len;
            } else {
                /* Regular file: plain VOP_WRITE() loop. */
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                while (sfv_len > 0) {
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);
                    cnt = sfv_len - auio.uio_resid;
                    sfv_len -= cnt;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0)
                        return (error);
                }
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            int readflg = 0;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (VN_CMP(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /* Same checks as in pread */
            if (sfv_off > maxoff) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }
            if (sfv_off + sfv_len > maxoff) {
                sfv_len = (ssize_t)((offset_t)maxoff -
                    sfv_off);
            }
            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;

            if (vp->v_type != VSOCK) {
                /* File destination: use a bounce buffer. */
                buf = kmem_alloc(size, KM_NOSLEEP);
                if (buf == NULL) {
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (ENOMEM);
                }
            } else {
                /*
                 * For sockets acting as an SSL proxy, we
                 * need to adjust the size to the maximum
                 * SSL record size set in the stream head.
                 */
                if (so->so_kssl_ctx != NULL)
                    size = MIN(size, maxblk);
            }

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);

                if (vp->v_type == VSOCK) {
                    /*
                     * Socket destination: read straight
                     * into a freshly allocated mblk.
                     */
                    dmp = allocb(iov_len + extra, BPRI_HI);
                    if (dmp == NULL) {
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    ptr = (caddr_t)dmp->b_rptr;
                } else {
                    ptr = buf;
                }

                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error != 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /* Premature EOF on the source file. */
                    if (vp->v_type == VSOCK)
                        freeb(dmp);
                    else
                        kmem_free(buf, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                if (vp->v_type == VSOCK) {
                    dmp->b_wptr = dmp->b_rptr + cnt;

                    error = kstrwritemp(vp, dmp, fflag);
                    if (error != 0) {
                        freeb(dmp);
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (error);
                    }

                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *count += cnt;
                } else {

                    /* File destination: write the bounce
                     * buffer contents out via VOP_WRITE. */
                    aiov.iov_base = ptr;
                    aiov.iov_len = cnt;
                    auio.uio_loffset = *fileoff;
                    auio.uio_resid = cnt;
                    auio.uio_segflg = UIO_SYSSPACE;
                    auio.uio_llimit = curproc->p_fsz_ctl;
                    auio.uio_fmode = fflag;
                    ioflag = auio.uio_fmode &
                        (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);

                    /*
                     * Check how much data was written.
                     * Increment the 'len' and decrement the
                     * 'off' if all the data was not
                     * written.
                     */
                    cnt -= auio.uio_resid;
                    sfv_len += auio.uio_resid;
                    sfv_off -= auio.uio_resid;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0) {
                        /*
                         * NOTE(review): unlike the
                         * error paths above, this one
                         * does not kmem_free(buf) --
                         * relies on "buf" being freed
                         * after the loop; but we
                         * return here, so confirm this
                         * is not a leak.
                         */
                        VOP_RWUNLOCK(readvp, readflg,
                            NULL);
                        releasef(sfv->sfv_fd);
                        return (error);
                    }
                }
            }
            if (buf) {
                kmem_free(buf, size);
                buf = NULL;
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
        }
        sfv++;
    }
    return (0);
}

/*
 * sendfilev(SENDFILEV/SENDFILEV64) system call handler.
 *
 * Validates the destination descriptor (regular file or connected
 * AF_INET/AF_INET6 SOCK_STREAM socket), then copies the user's vector
 * in batches of SEND_MAX_CHUNK entries and dispatches each batch to
 * sendvec_small_chunk() (small socket transfers) or sendvec_chunk().
 * On success returns the number of bytes transferred (also copied out
 * to *xferred); on failure returns -1 with errno set.
 */
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
    int error;
    file_t *fp;
    struct vnode *vp;
    struct sonode *so;
    u_offset_t fileoff;
    int copy_cnt;
    const struct sendfilevec *copy_vec;
    struct sendfilevec sfv[SEND_MAX_CHUNK];
    ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
    struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
    ssize_t total_size = 0;
    int i;
    boolean_t is_sock = B_FALSE;
    int maxblk = 0;

    if (sfvcnt <= 0)
        return (set_errno(EINVAL));

    if ((fp = getf(fildes)) == NULL)
        return (set_errno(EBADF));

    if (((fp->f_flag) & FWRITE) == 0) {
        error = EBADF;
        goto err;
    }

    fileoff = fp->f_offset;
    vp = fp->f_vnode;

    switch (vp->v_type) {
    case VSOCK:
        so = VTOSO(vp);
        /* sendfile not supported for SCTP */
        if (so->so_protocol == IPPROTO_SCTP) {
            error = EPROTONOSUPPORT;
            goto err;
        }
        is_sock = B_TRUE;
        switch (so->so_family) {
        case AF_INET:
        case AF_INET6:
            /*
             * Make similar checks done in SOP_WRITE().
             */
            if (so->so_state & SS_CANTSENDMORE) {
                tsignal(curthread, SIGPIPE);
                error = EPIPE;
                goto err;
            }
            if (so->so_type != SOCK_STREAM) {
                error = EOPNOTSUPP;
                goto err;
            }

            if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
                (SS_ISCONNECTED|SS_ISBOUND)) {
                error = ENOTCONN;
                goto err;
            }

            /*
             * For the direct (non-SSL) TCP fast path, size
             * mblks to the connection's MSS; otherwise use
             * the stream head's maximum block size.
             */
            if ((so->so_state & SS_DIRECT) &&
                (so->so_priv != NULL) &&
                (so->so_kssl_ctx == NULL)) {
                maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
            } else {
                maxblk = (int)vp->v_stream->sd_maxblk;
            }
            break;
        default:
            error = EAFNOSUPPORT;
            goto err;
        }
        break;
    case VREG:
        break;
    default:
        error = EINVAL;
        goto err;
    }

    switch (opcode) {
    case SENDFILEV :
        break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
    case SENDFILEV64 :
        return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
            (size32_t *)xferred, fildes));
#endif
    default :
        /*
         * NOTE(review): ENOSYS is set here but execution falls
         * through to the transfer loop below, where "error" is
         * overwritten -- looks like this should goto err;
         * confirm intended behavior for unknown opcodes.
         */
        error = ENOSYS;
        break;
    }

    (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
    copy_vec = vec;

    do {
        copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
        /* 32-bit callers need to have their iovec expanded. */
        if (get_udatamodel() == DATAMODEL_ILP32) {
            if (copyin(copy_vec, sfv32,
                copy_cnt * sizeof (ksendfilevec32_t))) {
                error = EFAULT;
                break;
            }

            /* Widen each 32-bit entry into the native sfv[]. */
            for (i = 0; i < copy_cnt; i++) {
                sfv[i].sfv_fd = sfv32[i].sfv_fd;
                sfv[i].sfv_off =
                    (off_t)(uint32_t)sfv32[i].sfv_off;
                sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
                total_size += sfv[i].sfv_len;
                sfv[i].sfv_flag = sfv32[i].sfv_flag;
            }
        } else {
#endif
        if (copyin(copy_vec, sfv,
            copy_cnt * sizeof (sendfilevec_t))) {
            error = EFAULT;
            break;
        }

        for (i = 0; i < copy_cnt; i++) {
            total_size += sfv[i].sfv_len;
        }
#ifdef _SYSCALL32_IMPL
        }
#endif

        /*
         * The choice between sendvec_small_chunk
         * and sendvec_chunk is dependent on multiple things:
         *
         * i) latency is important for smaller files. So if the
         * data is smaller than 'tcp_slow_start_initial' times
         * maxblk, then use sendvec_small_chunk which creates
         * maxblk size mblks and chains them together and sends
         * them to TCP in one shot. It also leaves 'wroff' size
         * space for the headers in each mblk.
         *
         * ii) for total size bigger than 'tcp_slow_start_initial'
         * times maxblk, it's probably real file data which is
         * dominating. So it's better to use sendvec_chunk because
         * performance suffers badly if we don't do pagesize reads.
         * sendvec_chunk will do pagesize reads and write them
         * in pagesize mblks to TCP.
         *
         * Side Notes: A write to file has not been optimized.
         * Future zero copy code will plug into sendvec_chunk
         * only because doing zero copy for files smaller than
         * pagesize is useless.
         *
         * Note, if socket has NL7C enabled then call NL7C's
         * sendfilev() function to give NL7C a chance to copy
         * the vec for caching, then continue processing as
         * normal.
         */
        if (is_sock) {
            switch (so->so_family) {
            case AF_INET:
            case AF_INET6:
                if (so->so_nl7c_flags != 0) {
                    nl7c_sendfilev(so, fileoff,
                        sfv, copy_cnt);
                }
                if (total_size <= (4 * maxblk))
                    error = sendvec_small_chunk(fp,
                        &fileoff, sfv, copy_cnt,
                        total_size, maxblk, &count);
                else
                    error = sendvec_chunk(fp, &fileoff,
                        sfv, copy_cnt, &count);
                break;
            }
        } else {
            ASSERT(vp->v_type == VREG);
            error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
                &count);
        }


#ifdef _SYSCALL32_IMPL
        if (get_udatamodel() == DATAMODEL_ILP32)
            copy_vec = (const struct sendfilevec *)((char *)copy_vec +
                (copy_cnt * sizeof (ksendfilevec32_t)));
        else
#endif
            copy_vec += copy_cnt;
        sfvcnt -= copy_cnt;
    } while (sfvcnt > 0);

    /* For regular files, advance the file offset by what was sent. */
    if (vp->v_type == VREG)
        fp->f_offset += count;


    VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
    if (get_udatamodel() == DATAMODEL_ILP32) {
        ssize32_t count32 = (ssize32_t)count;
        if (copyout(&count32, xferred, sizeof (count32)))
            error = EFAULT;
        releasef(fildes);
        if (error != 0)
            return (set_errno(error));
        return (count32);
    }
#endif
    if (copyout(&count, xferred, sizeof (count)))
        error = EFAULT;
    releasef(fildes);
    if (error != 0)
        return (set_errno(error));
    return (count);
err:
    ASSERT(error != 0);
    releasef(fildes);
    return (set_errno(error));
}