/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
/* swilly code in sys/socketvar.h turns off DEBUG */
#ifdef __lint
#define	DEBUG
#endif

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
	ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
	int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, uint_t,
	ssize_t *, boolean_t);

#define	readflg	(V_WRITELOCK_FALSE)
#define	rwflag	(V_WRITELOCK_TRUE)

/*
 * kstrwritemp() has semantics very similar to those of strwrite().
 * The main differences are that it obtains its mblks from the caller
 * and does not copy from user buffers into kernel buffers the way
 * strwrite() does.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface, since that interface implies copying.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
	struct stdata *stp;
	struct queue *wqp;
	mblk_t *newmp;
	char waitflag;
	int tempmode;
	int error = 0;
	int done = 0;
	struct sonode *so;
	boolean_t direct;

	ASSERT(vp->v_stream);
	stp = vp->v_stream;

	so = VTOSO(vp);
	direct = (so->so_state & SS_DIRECT);

	/*
	 * This is the sockfs direct fast path. canputnext() need
	 * not be accurate so we don't grab the sd_lock here. If
	 * we get flow-controlled, we grab sd_lock just before the
	 * do..while loop below to emulate what strwrite() does.
	 */
	wqp = stp->sd_wrq;
	if (canputnext(wqp) && direct &&
	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
		return (sostream_direct(so, NULL, mp, CRED()));
	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
		/* Fast check of flags before acquiring the lock */
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);
		if (error != 0) {
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
			}
			return (error);
		}
	}

	waitflag = WRITEWAIT;
	if (stp->sd_flag & OLDNDELAY)
		tempmode = fmode & ~FNDELAY;
	else
		tempmode = fmode;

	mutex_enter(&stp->sd_lock);
	do {
		if (canputnext(wqp)) {
			mutex_exit(&stp->sd_lock);
			if (stp->sd_wputdatafunc != NULL) {
				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
				    NULL, NULL, NULL);
				if (newmp == NULL) {
					/* The caller will free mp */
					return (ECOMM);
				}
				mp = newmp;
			}
			putnext(wqp, mp);
			return (0);
		}
		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
		    &done);
	} while (error == 0 && !done);

	mutex_exit(&stp->sd_lock);
	/*
	 * EAGAIN tells the application to try again. ENOMEM
	 * is returned only if the memory allocation size
	 * exceeds the physical limits of the system. ENOMEM
	 * can't be true here.
	 */
	if (error == ENOMEM)
		error = EAGAIN;
	return (error);
}

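/*
 * Maximum number of sendfilevec entries copied in from userland and
 * processed per iteration of the chunking loops below.
 */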
#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications, running on either a 64 bit
 * or a 32 bit kernel. For 32 bit apps, we can't transfer more than
 * 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0)
			continue;

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_SLEEP);

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

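/*
 * Handler for the SENDFILEV64 opcode (32-bit callers using 64-bit
 * offsets): processes the ksendfilevec64 vector in SEND_MAX_CHUNK-sized
 * batches, letting sosendfile64() handle the common regular-file-to-socket
 * case and falling back to sendvec_chunk64() otherwise.
 */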
ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, rwflag, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		/*
		 * Optimize the regular file over
		 * the socket case.
		 */
		if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
			file_t *rfp;
			vnode_t *rvp;

			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
				error = EBADF;
				break;
			}
			if ((rfp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				error = EBADF;
				break;
			}
			rvp = rfp->f_vnode;
			if (rvp->v_type == VREG) {
				error = sosendfile64(fp, rfp, sfv, &count);
				if (error)
					break;
				copy_vec++;
				sfvcnt--;
				continue;
			}
			releasef(sfv->sfv_fd);
		}
		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, rwflag, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif

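/*
 * Small-request path for sockets: copies the entire request into a
 * chain of maxblk-sized mblks (leaving sd_wroff worth of headroom in
 * each) and hands the whole chain to the transport with a single
 * kstrwritemp() call, which keeps latency low for small transfers.
 */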
int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	wroff = (int)vp->v_stream->sd_wroff;
	tail_len = (int)vp->v_stream->sd_tail;
	extra = wroff + tail_len;

	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = kstrwritemp(vp, head, fflag);
	if (error != 0) {
		freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}

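/*
 * General path: data described by SFV_FD_SELF entries is copied
 * straight from userland, while file-backed entries are read in
 * filesystem-blocksize pieces and either written to a regular file
 * or wrapped in mblks and passed to the socket via kstrwritemp().
 * When it is safe to do so, file-to-socket transfers are handed off
 * to snf_segmap() to avoid the copy entirely.
 */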
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		stp = vp->v_stream;
		wroff = (int)stp->sd_wroff;
		tail_len = (int)stp->sd_tail;
		maxblk = (int)stp->sd_maxblk;
		extra = wroff + tail_len;
	}

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;

			if (vp->v_type == VSOCK) {

				/*
				 * Optimize for the socket case
				 */

				dmp = allocb(sfv_len + extra, BPRI_HI);
				if (dmp == NULL)
					return (ENOMEM);
				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
				error = uiomove((caddr_t)dmp->b_wptr,
				    sfv_len, UIO_WRITE, &auio);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				dmp->b_wptr += sfv_len;
				error = kstrwritemp(vp, dmp, fflag);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				ttolwp(curthread)->lwp_ru.ioch +=
				    (ulong_t)sfv_len;
				*count += sfv_len;
			} else {
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t *ffp;
			vnode_t *readvp;
			struct vnode *realvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				/*
				 * For sockets acting as an SSL proxy, we
				 * need to adjust the size to the maximum
				 * SSL record size set in the stream head.
				 */
				if (so->so_kssl_ctx != NULL)
					size = MIN(size, maxblk);

				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    stp->sd_copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (stp->sd_copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (SOP_SETSOCKOPT(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on)) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				boolean_t nowait;
				uint_t maxpsz;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				maxpsz = stp->sd_qn_maxpsz;
				if (maxpsz == INFPSZ)
					maxpsz = maxphys;
				maxpsz = roundup(maxpsz, MAXBSIZE);
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, maxpsz,
				    (ssize_t *)&cnt, nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = kstrwritemp(vp, dmp, fflag);
					if (error != 0) {
						freeb(dmp);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

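/*
 * sendfilev() system call entry point. Validates the destination
 * descriptor (a regular file or a connected AF_INET/AF_INET6 stream
 * socket), then consumes the user's sendfilevec in SEND_MAX_CHUNK
 * batches, dispatching each batch to the small-chunk or regular
 * chunk path as described in the comment inside the loop below.
 */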
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		/* sendfile not supported for SCTP */
		if (so->so_protocol == IPPROTO_SCTP) {
			error = EPROTONOSUPPORT;
			goto err;
		}
		is_sock = B_TRUE;
		switch (so->so_family) {
		case AF_INET:
		case AF_INET6:
			/*
			 * Make checks similar to those done in SOP_WRITE().
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
				goto err;
			}
			if (so->so_type != SOCK_STREAM) {
				error = EOPNOTSUPP;
				goto err;
			}

			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
			    (SS_ISCONNECTED|SS_ISBOUND)) {
				error = ENOTCONN;
				goto err;
			}

			if ((so->so_state & SS_DIRECT) &&
			    (so->so_priv != NULL) &&
			    (so->so_kssl_ctx == NULL)) {
				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
			} else {
				maxblk = (int)vp->v_stream->sd_maxblk;
			}
			break;
		default:
			error = EAFNOSUPPORT;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV :
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64 :
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default :
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
			}
		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
			}
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk and
		 * sendvec_chunk depends on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk which creates
		 * maxblk size mblks, chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for a total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data which is
		 * dominating. So it's better to use sendvec_chunk because
		 * performance suffers badly if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plug into sendvec_chunk
		 * only, because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note, if the socket has NL7C enabled then call NL7C's
		 * sendfilev() function to consume the sfv[].
		 */
		if (is_sock) {
			switch (so->so_family) {
			case AF_INET:
			case AF_INET6:
				if (so->so_nl7c_flags != 0)
					error = nl7c_sendfilev(so, &fileoff,
					    sfv, copy_cnt, &count);
				else if (total_size <= (4 * maxblk))
					error = sendvec_small_chunk(fp,
					    &fileoff, sfv, copy_cnt,
					    total_size, maxblk, &count);
				else
					error = sendvec_chunk(fp, &fileoff,
					    sfv, copy_cnt, &count);
				break;
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32)
			copy_vec = (const struct sendfilevec *)
			    ((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;


	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}