/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
    ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
    int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
    boolean_t);

#define	readflg	(V_WRITELOCK_FALSE)
#define	rwflag	(V_WRITELOCK_TRUE)

/*
 * kstrwritemp() has semantics very similar to those of strwrite().
 * The main difference is that it obtains its mblks from the caller and
 * does not copy from user buffers to kernel buffers the way strwrite()
 * does.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use the
 * synchronous stream interface, as the synchronous stream interface
 * implies copying.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
	struct stdata *stp;
	struct queue *wqp;
	mblk_t *newmp;
	char waitflag;
	int tempmode;
	int error = 0;
	int done = 0;
	struct sonode *so;
	boolean_t direct;

	ASSERT(vp->v_stream);
	stp = vp->v_stream;

	so = VTOSO(vp);
	direct = (so->so_state & SS_DIRECT);

	/*
	 * This is the sockfs direct fast path. canputnext() need
	 * not be accurate so we don't grab the sd_lock here. If
	 * we get flow-controlled, we grab sd_lock just before the
	 * do..while loop below to emulate what strwrite() does.
	 */
	wqp = stp->sd_wrq;
	if (canputnext(wqp) && direct &&
	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
		return (sostream_direct(so, NULL, mp, CRED()));
	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
		/* Fast check of flags before acquiring the lock */
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);
		if (error != 0) {
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
			}
			return (error);
		}
	}

	waitflag = WRITEWAIT;
	if (stp->sd_flag & OLDNDELAY)
		tempmode = fmode & ~FNDELAY;
	else
		tempmode = fmode;

	mutex_enter(&stp->sd_lock);
	do {
		if (canputnext(wqp)) {
			mutex_exit(&stp->sd_lock);
			if (stp->sd_wputdatafunc != NULL) {
				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
				    NULL, NULL, NULL);
				if (newmp == NULL) {
					/* The caller will free mp */
					return (ECOMM);
				}
				mp = newmp;
			}
			putnext(wqp, mp);
			return (0);
		}
		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
		    &done);
	} while (error == 0 && !done);

	mutex_exit(&stp->sd_lock);
	/*
	 * EAGAIN tells the application to try again. ENOMEM
	 * is returned only if the memory allocation size
	 * exceeds the physical limits of the system. ENOMEM
	 * can't be true here.
	 */
	if (error == ENOMEM)
		error = EAGAIN;
	return (error);
}
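
/*
 * Example: a minimal sketch of one way a caller can hand kernel-resident
 * data to kstrwritemp(), mirroring what sendvec_chunk() below does for
 * SFV_FD_SELF data on a socket.  The caller allocates an mblk with the
 * stream head's write offset left as headroom, copies the payload behind
 * b_wptr and passes the message down; on failure the message is still
 * owned by the caller and must be freed.  Illustrative only; the helper
 * name and the bcopy() fill are assumptions, not part of this file.
 */
#if 0	/* illustrative sketch, not compiled */
static int
example_send_mblk(vnode_t *vp, const void *payload, size_t len, ushort_t fmode)
{
	stdata_t *stp = vp->v_stream;
	int wroff = (int)stp->sd_wroff;
	mblk_t *mp;
	int error;

	mp = allocb(len + wroff, BPRI_HI);
	if (mp == NULL)
		return (ENOMEM);
	mp->b_rptr += wroff;			/* leave header space */
	mp->b_wptr = mp->b_rptr;
	bcopy(payload, mp->b_wptr, len);	/* fill the payload */
	mp->b_wptr += len;

	error = kstrwritemp(vp, mp, fmode);
	if (error != 0)
		freeb(mp);			/* still ours on failure */
	return (error);
}
#endif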

#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64-bit offsets for 32-bit applications running on either a 64-bit
 * or a 32-bit kernel. For 32-bit apps, we can't transfer more than
 * 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0)
			continue;

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_SLEEP);

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, rwflag, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		/*
		 * Optimize the regular file over
		 * the socket case.
		 */
		if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
			file_t *rfp;
			vnode_t *rvp;

			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
				error = EBADF;
				break;
			}
			if ((rfp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				error = EBADF;
				break;
			}
			rvp = rfp->f_vnode;
			if (rvp->v_type == VREG) {
				error = sosendfile64(fp, rfp, sfv, &count);
				if (error)
					break;
				copy_vec++;
				sfvcnt--;
				continue;
			}
			releasef(sfv->sfv_fd);
		}
		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, rwflag, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif
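
/*
 * For reference, 32-bit large-file applications reach the SENDFILEV64
 * opcode handled above through the transitional sendfilev64() library
 * interface, which takes a vector of sendfilevec64 entries with 64-bit
 * offsets; apart from the wider sfv_off field its use mirrors sendfilev().
 * A minimal userland sketch (illustrative only; the descriptor and length
 * names are assumptions):
 */
#if 0	/* illustrative userland sketch, not compiled here */
	struct sendfilevec64 vec64[1];
	size_t xferred = 0;

	vec64[0].sfv_fd = file_fd;	/* regular file to transmit */
	vec64[0].sfv_flag = 0;
	vec64[0].sfv_off = 0LL;		/* 64-bit file offset */
	vec64[0].sfv_len = file_len;
	(void) sendfilev64(sock_fd, vec64, 1, &xferred);
#endif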

int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	wroff = (int)vp->v_stream->sd_wroff;
	tail_len = (int)vp->v_stream->sd_tail;
	extra = wroff + tail_len;

	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = kstrwritemp(vp, head, fflag);
	if (error != 0) {
		freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}

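/*
 * On success, sendvec_small_chunk() hands kstrwritemp() a single message
 * built of b_cont-linked blocks, each holding at most maxblk bytes of
 * payload plus sd_wroff bytes of headroom and sd_tail bytes of tailroom,
 * so the transport can add its headers without another allocation.  The
 * ASSERT(total_size == 0) above documents the invariant that the chain
 * carries exactly the bytes described by the vector.
 */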

int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		stp = vp->v_stream;
		wroff = (int)stp->sd_wroff;
		tail_len = (int)stp->sd_tail;
		maxblk = (int)stp->sd_maxblk;
		extra = wroff + tail_len;
	}

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;

			if (vp->v_type == VSOCK) {

				/*
				 * Optimize for the socket case
				 */

				dmp = allocb(sfv_len + extra, BPRI_HI);
				if (dmp == NULL)
					return (ENOMEM);
				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
				error = uiomove((caddr_t)dmp->b_wptr,
				    sfv_len, UIO_WRITE, &auio);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				dmp->b_wptr += sfv_len;
				error = kstrwritemp(vp, dmp, fflag);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				ttolwp(curthread)->lwp_ru.ioch +=
				    (ulong_t)sfv_len;
				*count += sfv_len;
			} else {
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t *ffp;
			vnode_t *readvp;
			struct vnode *realvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				/*
				 * For sockets acting as an SSL proxy, we
				 * need to adjust the size to the maximum
				 * SSL record size set in the stream head.
				 */
				if (so->so_kssl_ctx != NULL)
					size = MIN(size, maxblk);

				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    stp->sd_copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (stp->sd_copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (SOP_SETSOCKOPT(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on)) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				boolean_t nowait;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
				    nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = kstrwritemp(vp, dmp, fflag);
					if (error != 0) {
						freeb(dmp);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

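/*
 * Note on cleanup in sendvec_chunk() and sendvec_small_chunk(): once the
 * read-side file has been looked up with getf(), every error return first
 * drops the read vnode's rwlock (if held), calls releasef(), and frees any
 * mblk or temporary buffer still owned, so sendfilev() below only has to
 * propagate the error and report the bytes already accumulated in *count.
 */
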
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error = 0;
	int first_vector_error = 0;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		/* sendfile not supported for SCTP */
		if (so->so_protocol == IPPROTO_SCTP) {
			error = EPROTONOSUPPORT;
			goto err;
		}
		is_sock = B_TRUE;
		switch (so->so_family) {
		case AF_INET:
		case AF_INET6:
			/*
			 * Make checks similar to those done in SOP_WRITE().
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
				goto err;
			}
			if (so->so_type != SOCK_STREAM) {
				error = EOPNOTSUPP;
				goto err;
			}

			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
			    (SS_ISCONNECTED|SS_ISBOUND)) {
				error = ENOTCONN;
				goto err;
			}

			if ((so->so_state & SS_DIRECT) &&
			    (so->so_priv != NULL) &&
			    (so->so_kssl_ctx == NULL)) {
				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
			} else {
				maxblk = (int)vp->v_stream->sd_maxblk;
			}
			break;
		default:
			error = EAFNOSUPPORT;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV:
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64:
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default:
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize32_t)sfv[i].sfv_len < 0) ||
				    ((ssize32_t)total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if ((ssize32_t)total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;

		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize_t)sfv[i].sfv_len < 0) ||
				    (total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if (total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk and sendvec_chunk
		 * depends on several things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk which creates
		 * maxblk size mblks and chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data which is
		 * dominating. So it's better to use sendvec_chunk because
		 * performance goes to the dogs if we don't do pagesize
		 * reads. sendvec_chunk will do pagesize reads and write
		 * them in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plug into sendvec_chunk
		 * only because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note, if the socket has NL7C enabled then call NL7C's
		 * sendfilev() function to consume the sfv[].
		 */
		if (is_sock) {
			switch (so->so_family) {
			case AF_INET:
			case AF_INET6:
				if (so->so_nl7c_flags != 0)
					error = nl7c_sendfilev(so, &fileoff,
					    sfv, copy_cnt, &count);
				else if ((total_size <= (4 * maxblk)) &&
				    error == 0)
					error = sendvec_small_chunk(fp,
					    &fileoff, sfv, copy_cnt,
					    total_size, maxblk, &count);
				else
					error = sendvec_chunk(fp, &fileoff,
					    sfv, copy_cnt, &count);
				break;
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32)
			copy_vec = (const struct sendfilevec *)((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;

		/* Process all vector members up to first error */
	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		if (first_vector_error != 0)
			return (set_errno(first_vector_error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	if (first_vector_error != 0)
		return (set_errno(first_vector_error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}
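
/*
 * For reference, the common consumer of this path is an application sending
 * a small header followed by file contents over a connected TCP socket,
 * roughly as sketched below (userland; the helper and descriptor names are
 * illustrative only).  The SFV_FD_SELF entry carries the header from the
 * caller's address space, which sendvec_small_chunk() or sendvec_chunk()
 * above copies into mblks, while the file entry is read through VOP_READ()
 * or mapped via snf_segmap().
 */
#if 0	/* illustrative userland sketch, not compiled here */
#include <sys/sendfile.h>

static ssize_t
send_header_and_file(int sock_fd, const char *hdr, size_t hdrlen,
    int file_fd, off_t off, size_t filelen)
{
	struct sendfilevec vec[2];
	size_t xferred = 0;

	vec[0].sfv_fd = SFV_FD_SELF;		/* header from memory */
	vec[0].sfv_flag = 0;
	vec[0].sfv_off = (off_t)(uintptr_t)hdr;
	vec[0].sfv_len = hdrlen;

	vec[1].sfv_fd = file_fd;		/* file contents */
	vec[1].sfv_flag = 0;
	vec[1].sfv_off = off;
	vec[1].sfv_len = filelen;

	return (sendfilev(sock_fd, vec, 2, &xferred));
}
#endif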