1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/sysmacros.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/sunddi.h> 50 #include <sys/esunddi.h> 51 #include <sys/flock.h> 52 #include <sys/modctl.h> 53 #include <sys/cmn_err.h> 54 #include <sys/vmsystm.h> 55 56 #include <sys/socket.h> 57 #include <sys/socketvar.h> 58 /* swilly code in sys/socketvar.h turns off DEBUG */ 59 #ifdef __lint 60 #define DEBUG 61 #endif 62 63 #include <netinet/in.h> 64 #include <sys/sendfile.h> 65 #include <sys/un.h> 66 #include <sys/tihdr.h> 67 #include <sys/atomic.h> 68 69 
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
	ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
	int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, uint_t,
	ssize_t *, boolean_t);

/* Shorthand for the reader/writer lock modes used with VOP_RWLOCK below. */
#define	readflg	(V_WRITELOCK_FALSE)
#define	rwflag	(V_WRITELOCK_TRUE)

/*
 * kstrwritemp() has very similar semantics as that of strwrite().
 * The main difference is it obtains mblks from the caller and also
 * does not do any copy as done in strwrite() from user buffers to
 * kernel buffers.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface as synch. stream interface implies
 * copying.
 *
 * On success the mblk chain "mp" is consumed (handed to the stream or to
 * sostream_direct); on failure it is NOT freed — the caller retains
 * ownership and must free it.  Returns 0 or an errno value.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
	struct stdata *stp;
	struct queue *wqp;
	mblk_t *newmp;
	char waitflag;
	int tempmode;
	int error = 0;
	int done = 0;
	struct sonode *so;
	boolean_t direct;

	ASSERT(vp->v_stream);
	stp = vp->v_stream;

	so = VTOSO(vp);
	direct = (so->so_state & SS_DIRECT);

	/*
	 * This is the sockfs direct fast path. canputnext() need
	 * not be accurate so we don't grab the sd_lock here. If
	 * we get flow-controlled, we grab sd_lock just before the
	 * do..while loop below to emulate what strwrite() does.
	 */
	wqp = stp->sd_wrq;
	if (canputnext(wqp) && direct &&
	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
		return (sostream_direct(so, NULL, mp, CRED()));
	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
		/* Fast check of flags before acquiring the lock */
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);
		if (error != 0) {
			/*
			 * Emulate write(2) semantics: deliver SIGPIPE
			 * when writing to a stream that has hung up,
			 * unless it is a plumbed (STPLEX) stream.
			 */
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
			}
			return (error);
		}
	}

	waitflag = WRITEWAIT;
	/* OLDNDELAY: ignore FNDELAY for backward-compatible blocking. */
	if (stp->sd_flag & OLDNDELAY)
		tempmode = fmode & ~FNDELAY;
	else
		tempmode = fmode;

	/* Flow-controlled: wait for room, as strwrite() would. */
	mutex_enter(&stp->sd_lock);
	do {
		if (canputnext(wqp)) {
			mutex_exit(&stp->sd_lock);
			if (stp->sd_wputdatafunc != NULL) {
				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
				    NULL, NULL, NULL);
				if (newmp == NULL) {
					/* The caller will free mp */
					return (ECOMM);
				}
				mp = newmp;
			}
			putnext(wqp, mp);
			return (0);
		}
		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
		    &done);
	} while (error == 0 && !done);

	mutex_exit(&stp->sd_lock);
	/*
	 * EAGAIN tells the application to try again. ENOMEM
	 * is returned only if the memory allocation size
	 * exceeds the physical limits of the system. ENOMEM
	 * can't be true here.
	 */
	if (error == ENOMEM)
		error = EAGAIN;
	return (error);
}

#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
184 */ 185 int 186 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 187 int copy_cnt, ssize32_t *count) 188 { 189 struct vnode *vp; 190 ushort_t fflag; 191 int ioflag; 192 size32_t cnt; 193 ssize32_t sfv_len; 194 ssize32_t tmpcount; 195 u_offset_t sfv_off; 196 struct uio auio; 197 struct iovec aiov; 198 int i, error; 199 200 fflag = fp->f_flag; 201 vp = fp->f_vnode; 202 for (i = 0; i < copy_cnt; i++) { 203 204 if (ISSIG(curthread, JUSTLOOKING)) 205 return (EINTR); 206 207 /* 208 * Do similar checks as "write" as we are writing 209 * sfv_len bytes into "vp". 210 */ 211 sfv_len = (ssize32_t)sfv->sfv_len; 212 213 if (sfv_len == 0) 214 continue; 215 216 if (sfv_len < 0) 217 return (EINVAL); 218 219 if (vp->v_type == VREG) { 220 if (*fileoff >= curproc->p_fsz_ctl) { 221 mutex_enter(&curproc->p_lock); 222 (void) rctl_action( 223 rctlproc_legacy[RLIMIT_FSIZE], 224 curproc->p_rctls, curproc, RCA_SAFE); 225 mutex_exit(&curproc->p_lock); 226 return (EFBIG); 227 } 228 229 if (*fileoff >= OFFSET_MAX(fp)) 230 return (EFBIG); 231 232 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 233 return (EINVAL); 234 } 235 236 tmpcount = *count + sfv_len; 237 if (tmpcount < 0) 238 return (EINVAL); 239 240 sfv_off = sfv->sfv_off; 241 242 auio.uio_extflg = UIO_COPY_DEFAULT; 243 if (sfv->sfv_fd == SFV_FD_SELF) { 244 aiov.iov_len = sfv_len; 245 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 246 auio.uio_loffset = *fileoff; 247 auio.uio_iovcnt = 1; 248 auio.uio_resid = sfv_len; 249 auio.uio_iov = &aiov; 250 auio.uio_segflg = UIO_USERSPACE; 251 auio.uio_llimit = curproc->p_fsz_ctl; 252 auio.uio_fmode = fflag; 253 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 254 while (sfv_len > 0) { 255 error = VOP_WRITE(vp, &auio, ioflag, 256 fp->f_cred, NULL); 257 cnt = sfv_len - auio.uio_resid; 258 sfv_len -= cnt; 259 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 260 if (vp->v_type == VREG) 261 *fileoff += cnt; 262 *count += cnt; 263 if (error != 0) 264 return (error); 265 } 
266 } else { 267 file_t *ffp; 268 vnode_t *readvp; 269 size_t size; 270 caddr_t ptr; 271 272 if ((ffp = getf(sfv->sfv_fd)) == NULL) 273 return (EBADF); 274 275 if ((ffp->f_flag & FREAD) == 0) { 276 releasef(sfv->sfv_fd); 277 return (EBADF); 278 } 279 280 readvp = ffp->f_vnode; 281 if (readvp->v_type != VREG) { 282 releasef(sfv->sfv_fd); 283 return (EINVAL); 284 } 285 286 /* 287 * No point reading and writing to same vp, 288 * as long as both are regular files. readvp is not 289 * locked; but since we got it from an open file the 290 * contents will be valid during the time of access. 291 */ 292 if (VN_CMP(vp, readvp)) { 293 releasef(sfv->sfv_fd); 294 return (EINVAL); 295 } 296 297 /* 298 * Note: we assume readvp != vp. "vp" is already 299 * locked, and "readvp" must not be. 300 */ 301 (void) VOP_RWLOCK(readvp, readflg, NULL); 302 303 /* 304 * Same checks as in pread64. 305 */ 306 if (sfv_off > MAXOFFSET_T) { 307 VOP_RWUNLOCK(readvp, readflg, NULL); 308 releasef(sfv->sfv_fd); 309 return (EINVAL); 310 } 311 312 if (sfv_off + sfv_len > MAXOFFSET_T) 313 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 314 315 /* Find the native blocksize to transfer data */ 316 size = MIN(vp->v_vfsp->vfs_bsize, 317 readvp->v_vfsp->vfs_bsize); 318 size = sfv_len < size ? 
sfv_len : size; 319 ptr = kmem_alloc(size, KM_SLEEP); 320 321 while (sfv_len > 0) { 322 size_t iov_len; 323 324 iov_len = MIN(size, sfv_len); 325 aiov.iov_base = ptr; 326 aiov.iov_len = iov_len; 327 auio.uio_loffset = sfv_off; 328 auio.uio_iov = &aiov; 329 auio.uio_iovcnt = 1; 330 auio.uio_resid = iov_len; 331 auio.uio_segflg = UIO_SYSSPACE; 332 auio.uio_llimit = MAXOFFSET_T; 333 auio.uio_fmode = ffp->f_flag; 334 ioflag = auio.uio_fmode & 335 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 336 337 /* 338 * If read sync is not asked for, 339 * filter sync flags 340 */ 341 if ((ioflag & FRSYNC) == 0) 342 ioflag &= ~(FSYNC|FDSYNC); 343 error = VOP_READ(readvp, &auio, ioflag, 344 fp->f_cred, NULL); 345 if (error) { 346 kmem_free(ptr, size); 347 VOP_RWUNLOCK(readvp, readflg, NULL); 348 releasef(sfv->sfv_fd); 349 return (error); 350 } 351 352 /* 353 * Check how must data was really read. 354 * Decrement the 'len' and increment the 355 * 'off' appropriately. 356 */ 357 cnt = iov_len - auio.uio_resid; 358 if (cnt == 0) { 359 /* 360 * If we were reading a pipe (currently 361 * not implemented), we may now lose 362 * data. 363 */ 364 kmem_free(ptr, size); 365 VOP_RWUNLOCK(readvp, readflg, NULL); 366 releasef(sfv->sfv_fd); 367 return (EINVAL); 368 } 369 sfv_len -= cnt; 370 sfv_off += cnt; 371 372 aiov.iov_base = ptr; 373 aiov.iov_len = cnt; 374 auio.uio_loffset = *fileoff; 375 auio.uio_resid = cnt; 376 auio.uio_segflg = UIO_SYSSPACE; 377 auio.uio_llimit = curproc->p_fsz_ctl; 378 auio.uio_fmode = fflag; 379 ioflag = auio.uio_fmode & 380 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 381 error = VOP_WRITE(vp, &auio, ioflag, 382 fp->f_cred, NULL); 383 384 /* 385 * Check how much data was written. Increment 386 * the 'len' and decrement the 'off' if all 387 * the data was not written. 
388 */ 389 cnt -= auio.uio_resid; 390 sfv_len += auio.uio_resid; 391 sfv_off -= auio.uio_resid; 392 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 393 if (vp->v_type == VREG) 394 *fileoff += cnt; 395 *count += cnt; 396 if (error != 0) { 397 kmem_free(ptr, size); 398 VOP_RWUNLOCK(readvp, readflg, NULL); 399 releasef(sfv->sfv_fd); 400 return (error); 401 } 402 } 403 VOP_RWUNLOCK(readvp, readflg, NULL); 404 releasef(sfv->sfv_fd); 405 kmem_free(ptr, size); 406 } 407 sfv++; 408 } 409 return (0); 410 } 411 412 ssize32_t 413 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 414 size32_t *xferred, int fildes) 415 { 416 u_offset_t fileoff; 417 int copy_cnt; 418 const struct ksendfilevec64 *copy_vec; 419 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 420 struct vnode *vp; 421 int error; 422 ssize32_t count = 0; 423 424 vp = fp->f_vnode; 425 (void) VOP_RWLOCK(vp, rwflag, NULL); 426 427 copy_vec = vec; 428 fileoff = fp->f_offset; 429 430 do { 431 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 432 if (copyin(copy_vec, sfv, copy_cnt * 433 sizeof (struct ksendfilevec64))) { 434 error = EFAULT; 435 break; 436 } 437 438 /* 439 * Optimize the regular file over 440 * the socket case. 
441 */ 442 if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) { 443 file_t *rfp; 444 vnode_t *rvp; 445 446 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 447 error = EBADF; 448 break; 449 } 450 if ((rfp->f_flag & FREAD) == 0) { 451 releasef(sfv->sfv_fd); 452 error = EBADF; 453 break; 454 } 455 rvp = rfp->f_vnode; 456 if (rvp->v_type == VREG) { 457 error = sosendfile64(fp, rfp, sfv, &count); 458 if (error) 459 break; 460 copy_vec++; 461 sfvcnt--; 462 continue; 463 } 464 releasef(sfv->sfv_fd); 465 } 466 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 467 if (error != 0) 468 break; 469 470 copy_vec += copy_cnt; 471 sfvcnt -= copy_cnt; 472 } while (sfvcnt > 0); 473 474 if (vp->v_type == VREG) 475 fp->f_offset += count; 476 477 VOP_RWUNLOCK(vp, rwflag, NULL); 478 if (copyout(&count, xferred, sizeof (count))) 479 error = EFAULT; 480 releasef(fildes); 481 if (error != 0) 482 return (set_errno(error)); 483 return (count); 484 } 485 #endif 486 487 int 488 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 489 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 490 { 491 struct vnode *vp; 492 struct uio auio; 493 struct iovec aiov; 494 ushort_t fflag; 495 int ioflag; 496 int i, error; 497 size_t cnt; 498 ssize_t sfv_len; 499 u_offset_t sfv_off; 500 #ifdef _SYSCALL32_IMPL 501 model_t model = get_udatamodel(); 502 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 
503 MAXOFF32_T : MAXOFFSET_T; 504 #else 505 const u_offset_t maxoff = MAXOFF32_T; 506 #endif 507 mblk_t *dmp = NULL; 508 int wroff; 509 int buf_left = 0; 510 size_t iov_len; 511 mblk_t *head, *tmp; 512 size_t size = total_size; 513 size_t extra; 514 int tail_len; 515 516 fflag = fp->f_flag; 517 vp = fp->f_vnode; 518 519 ASSERT(vp->v_type == VSOCK); 520 ASSERT(maxblk > 0); 521 522 wroff = (int)vp->v_stream->sd_wroff; 523 tail_len = (int)vp->v_stream->sd_tail; 524 extra = wroff + tail_len; 525 526 buf_left = MIN(total_size, maxblk); 527 head = dmp = allocb(buf_left + extra, BPRI_HI); 528 if (head == NULL) 529 return (ENOMEM); 530 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 531 532 auio.uio_extflg = UIO_COPY_DEFAULT; 533 for (i = 0; i < copy_cnt; i++) { 534 if (ISSIG(curthread, JUSTLOOKING)) 535 return (EINTR); 536 537 /* 538 * Do similar checks as "write" as we are writing 539 * sfv_len bytes into "vp". 540 */ 541 sfv_len = (ssize_t)sfv->sfv_len; 542 543 if (sfv_len == 0) { 544 sfv++; 545 continue; 546 } 547 548 /* Make sure sfv_len is not negative */ 549 #ifdef _SYSCALL32_IMPL 550 if (model == DATAMODEL_ILP32) { 551 if ((ssize32_t)sfv_len < 0) 552 return (EINVAL); 553 } else 554 #endif 555 if (sfv_len < 0) 556 return (EINVAL); 557 558 /* Check for overflow */ 559 #ifdef _SYSCALL32_IMPL 560 if (model == DATAMODEL_ILP32) { 561 if (((ssize32_t)(*count + sfv_len)) < 0) 562 return (EINVAL); 563 } else 564 #endif 565 if ((*count + sfv_len) < 0) 566 return (EINVAL); 567 568 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 569 570 if (sfv->sfv_fd == SFV_FD_SELF) { 571 while (sfv_len > 0) { 572 if (buf_left == 0) { 573 tmp = dmp; 574 buf_left = MIN(total_size, maxblk); 575 iov_len = MIN(buf_left, sfv_len); 576 dmp = allocb(buf_left + extra, BPRI_HI); 577 if (dmp == NULL) { 578 freemsg(head); 579 return (ENOMEM); 580 } 581 dmp->b_wptr = dmp->b_rptr = 582 dmp->b_rptr + wroff; 583 tmp->b_cont = dmp; 584 } else { 585 iov_len = MIN(buf_left, sfv_len); 586 } 587 588 
aiov.iov_len = iov_len; 589 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 590 auio.uio_loffset = *fileoff; 591 auio.uio_iovcnt = 1; 592 auio.uio_resid = iov_len; 593 auio.uio_iov = &aiov; 594 auio.uio_segflg = UIO_USERSPACE; 595 auio.uio_llimit = curproc->p_fsz_ctl; 596 auio.uio_fmode = fflag; 597 598 buf_left -= iov_len; 599 total_size -= iov_len; 600 sfv_len -= iov_len; 601 sfv_off += iov_len; 602 603 error = uiomove((caddr_t)dmp->b_wptr, 604 iov_len, UIO_WRITE, &auio); 605 if (error != 0) { 606 freemsg(head); 607 return (error); 608 } 609 dmp->b_wptr += iov_len; 610 } 611 } else { 612 file_t *ffp; 613 vnode_t *readvp; 614 615 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 616 freemsg(head); 617 return (EBADF); 618 } 619 620 if ((ffp->f_flag & FREAD) == 0) { 621 releasef(sfv->sfv_fd); 622 freemsg(head); 623 return (EACCES); 624 } 625 626 readvp = ffp->f_vnode; 627 if (readvp->v_type != VREG) { 628 releasef(sfv->sfv_fd); 629 freemsg(head); 630 return (EINVAL); 631 } 632 633 /* 634 * No point reading and writing to same vp, 635 * as long as both are regular files. readvp is not 636 * locked; but since we got it from an open file the 637 * contents will be valid during the time of access. 638 */ 639 640 if (VN_CMP(vp, readvp)) { 641 releasef(sfv->sfv_fd); 642 freemsg(head); 643 return (EINVAL); 644 } 645 646 /* 647 * Note: we assume readvp != vp. "vp" is already 648 * locked, and "readvp" must not be. 
649 */ 650 651 (void) VOP_RWLOCK(readvp, readflg, NULL); 652 653 /* Same checks as in pread */ 654 if (sfv_off > maxoff) { 655 VOP_RWUNLOCK(readvp, readflg, NULL); 656 releasef(sfv->sfv_fd); 657 freemsg(head); 658 return (EINVAL); 659 } 660 if (sfv_off + sfv_len > maxoff) { 661 total_size -= (sfv_off + sfv_len - maxoff); 662 sfv_len = (ssize_t)((offset_t)maxoff - 663 sfv_off); 664 } 665 666 while (sfv_len > 0) { 667 if (buf_left == 0) { 668 tmp = dmp; 669 buf_left = MIN(total_size, maxblk); 670 iov_len = MIN(buf_left, sfv_len); 671 dmp = allocb(buf_left + extra, BPRI_HI); 672 if (dmp == NULL) { 673 VOP_RWUNLOCK(readvp, readflg, 674 NULL); 675 releasef(sfv->sfv_fd); 676 freemsg(head); 677 return (ENOMEM); 678 } 679 dmp->b_wptr = dmp->b_rptr = 680 dmp->b_rptr + wroff; 681 tmp->b_cont = dmp; 682 } else { 683 iov_len = MIN(buf_left, sfv_len); 684 } 685 aiov.iov_base = (caddr_t)dmp->b_wptr; 686 aiov.iov_len = iov_len; 687 auio.uio_loffset = sfv_off; 688 auio.uio_iov = &aiov; 689 auio.uio_iovcnt = 1; 690 auio.uio_resid = iov_len; 691 auio.uio_segflg = UIO_SYSSPACE; 692 auio.uio_llimit = MAXOFFSET_T; 693 auio.uio_fmode = ffp->f_flag; 694 ioflag = auio.uio_fmode & 695 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 696 697 /* 698 * If read sync is not asked for, 699 * filter sync flags 700 */ 701 if ((ioflag & FRSYNC) == 0) 702 ioflag &= ~(FSYNC|FDSYNC); 703 error = VOP_READ(readvp, &auio, ioflag, 704 fp->f_cred, NULL); 705 if (error != 0) { 706 /* 707 * If we were reading a pipe (currently 708 * not implemented), we may now loose 709 * data. 710 */ 711 VOP_RWUNLOCK(readvp, readflg, NULL); 712 releasef(sfv->sfv_fd); 713 freemsg(head); 714 return (error); 715 } 716 717 /* 718 * Check how much data was really read. 719 * Decrement the 'len' and increment the 720 * 'off' appropriately. 
721 */ 722 cnt = iov_len - auio.uio_resid; 723 if (cnt == 0) { 724 VOP_RWUNLOCK(readvp, readflg, NULL); 725 releasef(sfv->sfv_fd); 726 freemsg(head); 727 return (EINVAL); 728 } 729 sfv_len -= cnt; 730 sfv_off += cnt; 731 total_size -= cnt; 732 buf_left -= cnt; 733 734 dmp->b_wptr += cnt; 735 } 736 VOP_RWUNLOCK(readvp, readflg, NULL); 737 releasef(sfv->sfv_fd); 738 } 739 sfv++; 740 } 741 742 ASSERT(total_size == 0); 743 error = kstrwritemp(vp, head, fflag); 744 if (error != 0) { 745 freemsg(head); 746 return (error); 747 } 748 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 749 *count += size; 750 751 return (0); 752 } 753 754 755 int 756 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 757 int copy_cnt, ssize_t *count) 758 { 759 struct vnode *vp; 760 struct uio auio; 761 struct iovec aiov; 762 ushort_t fflag; 763 int ioflag; 764 int i, error; 765 size_t cnt; 766 ssize_t sfv_len; 767 u_offset_t sfv_off; 768 #ifdef _SYSCALL32_IMPL 769 model_t model = get_udatamodel(); 770 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 771 MAXOFF32_T : MAXOFFSET_T; 772 #else 773 const u_offset_t maxoff = MAXOFF32_T; 774 #endif 775 mblk_t *dmp = NULL; 776 char *buf = NULL; 777 size_t extra; 778 int maxblk, wroff, tail_len; 779 struct sonode *so; 780 stdata_t *stp; 781 782 fflag = fp->f_flag; 783 vp = fp->f_vnode; 784 785 if (vp->v_type == VSOCK) { 786 so = VTOSO(vp); 787 stp = vp->v_stream; 788 wroff = (int)stp->sd_wroff; 789 tail_len = (int)stp->sd_tail; 790 maxblk = (int)stp->sd_maxblk; 791 extra = wroff + tail_len; 792 } 793 794 auio.uio_extflg = UIO_COPY_DEFAULT; 795 for (i = 0; i < copy_cnt; i++) { 796 if (ISSIG(curthread, JUSTLOOKING)) 797 return (EINTR); 798 799 /* 800 * Do similar checks as "write" as we are writing 801 * sfv_len bytes into "vp". 
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			/* Enforce the process file-size resource control. */
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			/*
			 * SFV_FD_SELF: sfv_off is really a user-space
			 * buffer address to copy from.
			 */
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;

			if (vp->v_type == VSOCK) {

				/*
				 * Optimize for the socket case
				 */

				dmp = allocb(sfv_len + extra, BPRI_HI);
				if (dmp == NULL)
					return (ENOMEM);
				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
				error = uiomove((caddr_t)dmp->b_wptr,
				    sfv_len, UIO_WRITE, &auio);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				dmp->b_wptr += sfv_len;
				error = kstrwritemp(vp, dmp, fflag);
				if (error != 0) {
					/* Not consumed on failure; free it. */
					freeb(dmp);
					return (error);
				}
				ttolwp(curthread)->lwp_ru.ioch +=
				    (ulong_t)sfv_len;
				*count += sfv_len;
			} else {
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit;
			file_t	*ffp;
			vnode_t	*readvp;
			size_t	size;
			caddr_t	ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				/*
				 * For sockets acting as an SSL proxy, we
				 * need to adjust the size to the maximum
				 * SSL record size set in the stream head.
				 */
				if (so->so_kssl_ctx != NULL)
					size = MIN(size, maxblk);

				/*
				 * Decide whether the zero-copy (segmap)
				 * path may be used for this stream.
				 */
				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    stp->sd_copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (stp->sd_copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (SOP_SETSOCKOPT(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on)) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				/* Zero-copy: map file pages straight in. */
				boolean_t nowait;
				uint_t maxpsz;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				maxpsz = stp->sd_qn_maxpsz;
				if (maxpsz == INFPSZ)
					maxpsz = maxphys;
				maxpsz = roundup(maxpsz, MAXBSIZE);
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, maxpsz,
				    (ssize_t *)&cnt, nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			while (sfv_len > 0) {
				size_t	iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = kstrwritemp(vp, dmp, fflag);
					if (error != 0) {
						freeb(dmp);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

/*
 * sendfilev(): system-call entry point.  Validates the destination file
 * descriptor (regular file or connected AF_INET/AF_INET6 stream socket),
 * then consumes the user's sendfilevec array in SEND_MAX_CHUNK pieces,
 * dispatching each piece to nl7c_sendfilev(), sendvec_small_chunk() or
 * sendvec_chunk().  Returns bytes transferred, or -1 with errno set.
 */
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		/* sendfile not supported for SCTP */
		if (so->so_protocol == IPPROTO_SCTP) {
			error = EPROTONOSUPPORT;
			goto err;
		}
		is_sock = B_TRUE;
		switch (so->so_family) {
		case AF_INET:
		case AF_INET6:
			/*
			 * Make similar checks done in SOP_WRITE().
1192 */ 1193 if (so->so_state & SS_CANTSENDMORE) { 1194 tsignal(curthread, SIGPIPE); 1195 error = EPIPE; 1196 goto err; 1197 } 1198 if (so->so_type != SOCK_STREAM) { 1199 error = EOPNOTSUPP; 1200 goto err; 1201 } 1202 1203 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1204 (SS_ISCONNECTED|SS_ISBOUND)) { 1205 error = ENOTCONN; 1206 goto err; 1207 } 1208 1209 if ((so->so_state & SS_DIRECT) && 1210 (so->so_priv != NULL) && 1211 (so->so_kssl_ctx == NULL)) { 1212 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1213 } else { 1214 maxblk = (int)vp->v_stream->sd_maxblk; 1215 } 1216 break; 1217 default: 1218 error = EAFNOSUPPORT; 1219 goto err; 1220 } 1221 break; 1222 case VREG: 1223 break; 1224 default: 1225 error = EINVAL; 1226 goto err; 1227 } 1228 1229 switch (opcode) { 1230 case SENDFILEV : 1231 break; 1232 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1233 case SENDFILEV64 : 1234 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1235 (size32_t *)xferred, fildes)); 1236 #endif 1237 default : 1238 error = ENOSYS; 1239 break; 1240 } 1241 1242 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1243 copy_vec = vec; 1244 1245 do { 1246 total_size = 0; 1247 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1248 #ifdef _SYSCALL32_IMPL 1249 /* 32-bit callers need to have their iovec expanded. 
*/ 1250 if (get_udatamodel() == DATAMODEL_ILP32) { 1251 if (copyin(copy_vec, sfv32, 1252 copy_cnt * sizeof (ksendfilevec32_t))) { 1253 error = EFAULT; 1254 break; 1255 } 1256 1257 for (i = 0; i < copy_cnt; i++) { 1258 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1259 sfv[i].sfv_off = 1260 (off_t)(uint32_t)sfv32[i].sfv_off; 1261 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1262 total_size += sfv[i].sfv_len; 1263 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1264 } 1265 } else { 1266 #endif 1267 if (copyin(copy_vec, sfv, 1268 copy_cnt * sizeof (sendfilevec_t))) { 1269 error = EFAULT; 1270 break; 1271 } 1272 1273 for (i = 0; i < copy_cnt; i++) { 1274 total_size += sfv[i].sfv_len; 1275 } 1276 #ifdef _SYSCALL32_IMPL 1277 } 1278 #endif 1279 1280 /* 1281 * The task between deciding to use sendvec_small_chunk 1282 * and sendvec_chunk is dependant on multiple things: 1283 * 1284 * i) latency is important for smaller files. So if the 1285 * data is smaller than 'tcp_slow_start_initial' times 1286 * maxblk, then use sendvec_small_chunk which creates 1287 * maxblk size mblks and chains then together and sends 1288 * them to TCP in one shot. It also leaves 'wroff' size 1289 * space for the headers in each mblk. 1290 * 1291 * ii) for total size bigger than 'tcp_slow_start_initial' 1292 * time maxblk, its probably real file data which is 1293 * dominating. So its better to use sendvec_chunk because 1294 * performance goes to dog if we don't do pagesize reads. 1295 * sendvec_chunk will do pagesize reads and write them 1296 * in pagesize mblks to TCP. 1297 * 1298 * Side Notes: A write to file has not been optimized. 1299 * Future zero copy code will plugin into sendvec_chunk 1300 * only because doing zero copy for files smaller then 1301 * pagesize is useless. 1302 * 1303 * Note, if socket has NL7C enabled then call NL7C's 1304 * senfilev() function to consume the sfv[]. 
1305 */ 1306 if (is_sock) { 1307 switch (so->so_family) { 1308 case AF_INET: 1309 case AF_INET6: 1310 if (so->so_nl7c_flags != 0) 1311 error = nl7c_sendfilev(so, &fileoff, 1312 sfv, copy_cnt, &count); 1313 else if (total_size <= (4 * maxblk)) 1314 error = sendvec_small_chunk(fp, 1315 &fileoff, sfv, copy_cnt, 1316 total_size, maxblk, &count); 1317 else 1318 error = sendvec_chunk(fp, &fileoff, 1319 sfv, copy_cnt, &count); 1320 break; 1321 } 1322 } else { 1323 ASSERT(vp->v_type == VREG); 1324 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1325 &count); 1326 } 1327 1328 1329 #ifdef _SYSCALL32_IMPL 1330 if (get_udatamodel() == DATAMODEL_ILP32) 1331 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1332 (copy_cnt * sizeof (ksendfilevec32_t))); 1333 else 1334 #endif 1335 copy_vec += copy_cnt; 1336 sfvcnt -= copy_cnt; 1337 } while (sfvcnt > 0); 1338 1339 if (vp->v_type == VREG) 1340 fp->f_offset += count; 1341 1342 1343 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1344 1345 #ifdef _SYSCALL32_IMPL 1346 if (get_udatamodel() == DATAMODEL_ILP32) { 1347 ssize32_t count32 = (ssize32_t)count; 1348 if (copyout(&count32, xferred, sizeof (count32))) 1349 error = EFAULT; 1350 releasef(fildes); 1351 if (error != 0) 1352 return (set_errno(error)); 1353 return (count32); 1354 } 1355 #endif 1356 if (copyout(&count, xferred, sizeof (count))) 1357 error = EFAULT; 1358 releasef(fildes); 1359 if (error != 0) 1360 return (set_errno(error)); 1361 return (count); 1362 err: 1363 ASSERT(error != 0); 1364 releasef(fildes); 1365 return (set_errno(error)); 1366 } 1367