/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
/* swilly code in sys/socketvar.h turns off DEBUG */
#ifdef __lint
#define DEBUG
#endif

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
    ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
    int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, uint_t,
    ssize_t *, boolean_t);

#define readflg (V_WRITELOCK_FALSE)
#define rwflag  (V_WRITELOCK_TRUE)

/*
 * kstrwritemp() has semantics very similar to those of strwrite().
 * The main difference is that it obtains its mblks from the caller and
 * does not copy from user buffers into kernel buffers the way
 * strwrite() does.
 *
 * Currently, this routine is used by sendfile to send data allocated
 * within the kernel without any copying. This interface does not use
 * the synchronous stream interface, since that would imply copying.
 */
int
kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
{
    struct stdata *stp;
    struct queue *wqp;
    mblk_t *newmp;
    char waitflag;
    int tempmode;
    int error = 0;
    int done = 0;
    struct sonode *so;
    boolean_t direct;

    ASSERT(vp->v_stream);
    stp = vp->v_stream;

    so = VTOSO(vp);
    direct = (so->so_state & SS_DIRECT);

    /*
     * This is the sockfs direct fast path. canputnext() need
     * not be accurate so we don't grab the sd_lock here. If
     * we get flow-controlled, we grab sd_lock just before the
     * do..while loop below to emulate what strwrite() does.
     */
    wqp = stp->sd_wrq;
    if (canputnext(wqp) && direct &&
        !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
        return (sostream_direct(so, NULL, mp, CRED()));
    } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
        /* Fast check of flags before acquiring the lock */
        mutex_enter(&stp->sd_lock);
        error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
        mutex_exit(&stp->sd_lock);
        if (error != 0) {
            if (!(stp->sd_flag & STPLEX) &&
                (stp->sd_wput_opt & SW_SIGPIPE)) {
                tsignal(curthread, SIGPIPE);
                error = EPIPE;
            }
            return (error);
        }
    }

    waitflag = WRITEWAIT;
    if (stp->sd_flag & OLDNDELAY)
        tempmode = fmode & ~FNDELAY;
    else
        tempmode = fmode;

    mutex_enter(&stp->sd_lock);
    do {
        if (canputnext(wqp)) {
            mutex_exit(&stp->sd_lock);
            if (stp->sd_wputdatafunc != NULL) {
                newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
                    NULL, NULL, NULL);
                if (newmp == NULL) {
                    /* The caller will free mp */
                    return (ECOMM);
                }
                mp = newmp;
            }
            putnext(wqp, mp);
            return (0);
        }
        error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
            &done);
    } while (error == 0 && !done);

    mutex_exit(&stp->sd_lock);
    /*
     * EAGAIN tells the application to try again. ENOMEM
     * is returned only if the memory allocation size
     * exceeds the physical limits of the system. ENOMEM
     * can't be true here.
     */
    if (error == ENOMEM)
        error = EAGAIN;
    return (error);
}
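
/*
 * Typical use of kstrwritemp() by the sendfile paths below (an
 * illustrative sketch, not a separate code path): the caller allocates
 * an mblk with room for the stream head write offset, fills it, and
 * hands it off; on failure the caller still owns the message.
 *
 *	mblk_t *mp = allocb(len + wroff + tail_len, BPRI_HI);
 *	mp->b_rptr += wroff;
 *	mp->b_wptr = mp->b_rptr;
 *	...copy or uiomove() len bytes at mp->b_wptr...
 *	mp->b_wptr += len;
 *	if ((error = kstrwritemp(vp, mp, fflag)) != 0)
 *		freeb(mp);
 */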

#define SEND_MAX_CHUNK  16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64-bit offsets for 32-bit applications, running on either a 64-bit
 * or a 32-bit kernel. For 32-bit apps, we can't transfer more than
 * 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
    struct vnode *vp;
    ushort_t fflag;
    int ioflag;
    size32_t cnt;
    ssize32_t sfv_len;
    ssize32_t tmpcount;
    u_offset_t sfv_off;
    struct uio auio;
    struct iovec aiov;
    int i, error;

    fflag = fp->f_flag;
    vp = fp->f_vnode;
    for (i = 0; i < copy_cnt; i++) {

        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do the same checks as "write", since we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize32_t)sfv->sfv_len;

        if (sfv_len == 0)
            continue;

        if (sfv_len < 0)
            return (EINVAL);

        if (vp->v_type == VREG) {
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);
                return (EFBIG);
            }

            if (*fileoff >= OFFSET_MAX(fp))
                return (EFBIG);

            if (*fileoff + sfv_len > OFFSET_MAX(fp))
                return (EINVAL);
        }

        tmpcount = *count + sfv_len;
        if (tmpcount < 0)
            return (EINVAL);

        sfv_off = sfv->sfv_off;

        auio.uio_extflg = UIO_COPY_DEFAULT;
        if (sfv->sfv_fd == SFV_FD_SELF) {
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;
            ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
            while (sfv_len > 0) {
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);
                cnt = sfv_len - auio.uio_resid;
                sfv_len -= cnt;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0)
                    return (error);
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */
            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
             */
            (void) VOP_RWLOCK(readvp, readflg, NULL);

            /*
             * Same checks as in pread64.
             */
            if (sfv_off > MAXOFFSET_T) {
                VOP_RWUNLOCK(readvp, readflg, NULL);
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            if (sfv_off + sfv_len > MAXOFFSET_T)
                sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

            /* Find the native blocksize to transfer data */
            size = MIN(vp->v_vfsp->vfs_bsize,
                readvp->v_vfsp->vfs_bsize);
            size = sfv_len < size ? sfv_len : size;
            ptr = kmem_alloc(size, KM_SLEEP);

            while (sfv_len > 0) {
                size_t iov_len;

                iov_len = MIN(size, sfv_len);
                aiov.iov_base = ptr;
                aiov.iov_len = iov_len;
                auio.uio_loffset = sfv_off;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = MAXOFFSET_T;
                auio.uio_fmode = ffp->f_flag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

                /*
                 * If read sync is not asked for,
                 * filter sync flags
                 */
                if ((ioflag & FRSYNC) == 0)
                    ioflag &= ~(FSYNC|FDSYNC);
                error = VOP_READ(readvp, &auio, ioflag,
                    fp->f_cred, NULL);
                if (error) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }

                /*
                 * Check how much data was really read.
                 * Decrement the 'len' and increment the
                 * 'off' appropriately.
                 */
                cnt = iov_len - auio.uio_resid;
                if (cnt == 0) {
                    /*
                     * If we were reading a pipe (currently
                     * not implemented), we may now lose
                     * data.
                     */
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (EINVAL);
                }
                sfv_len -= cnt;
                sfv_off += cnt;

                aiov.iov_base = ptr;
                aiov.iov_len = cnt;
                auio.uio_loffset = *fileoff;
                auio.uio_resid = cnt;
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                error = VOP_WRITE(vp, &auio, ioflag,
                    fp->f_cred, NULL);

                /*
                 * Check how much data was written. Increment
                 * the 'len' and decrement the 'off' if all
                 * the data was not written.
                 */
                cnt -= auio.uio_resid;
                sfv_len += auio.uio_resid;
                sfv_off -= auio.uio_resid;
                ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
                if (vp->v_type == VREG)
                    *fileoff += cnt;
                *count += cnt;
                if (error != 0) {
                    kmem_free(ptr, size);
                    VOP_RWUNLOCK(readvp, readflg, NULL);
                    releasef(sfv->sfv_fd);
                    return (error);
                }
            }
            VOP_RWUNLOCK(readvp, readflg, NULL);
            releasef(sfv->sfv_fd);
            kmem_free(ptr, size);
        }
        sfv++;
    }
    return (0);
}

ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
    u_offset_t fileoff;
    int copy_cnt;
    const struct ksendfilevec64 *copy_vec;
    struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
    struct vnode *vp;
    int error;
    ssize32_t count = 0;

    vp = fp->f_vnode;
    (void) VOP_RWLOCK(vp, rwflag, NULL);

    copy_vec = vec;
    fileoff = fp->f_offset;

    do {
        copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
        if (copyin(copy_vec, sfv, copy_cnt *
            sizeof (struct ksendfilevec64))) {
            error = EFAULT;
            break;
        }

        /*
         * Optimize the regular file over
         * the socket case.
         */
        if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
            file_t *rfp;
            vnode_t *rvp;

            if ((rfp = getf(sfv->sfv_fd)) == NULL) {
                error = EBADF;
                break;
            }
            if ((rfp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                error = EBADF;
                break;
            }
            rvp = rfp->f_vnode;
            if (rvp->v_type == VREG) {
                error = sosendfile64(fp, rfp, sfv, &count);
                if (error)
                    break;
                copy_vec++;
                sfvcnt--;
                continue;
            }
            releasef(sfv->sfv_fd);
        }
        error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
        if (error != 0)
            break;

        copy_vec += copy_cnt;
        sfvcnt -= copy_cnt;
    } while (sfvcnt > 0);

    if (vp->v_type == VREG)
        fp->f_offset += count;

    VOP_RWUNLOCK(vp, rwflag, NULL);
    if (copyout(&count, xferred, sizeof (count)))
        error = EFAULT;
    releasef(fildes);
    if (error != 0)
        return (set_errno(error));
    return (count);
}
#endif

int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    int wroff;
    int buf_left = 0;
    size_t iov_len;
    mblk_t *head, *tmp;
    size_t size = total_size;
    size_t extra;
    int tail_len;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    ASSERT(vp->v_type == VSOCK);
    ASSERT(maxblk > 0);

    wroff = (int)vp->v_stream->sd_wroff;
    tail_len = (int)vp->v_stream->sd_tail;
    extra = wroff + tail_len;

    buf_left = MIN(total_size, maxblk);
    head = dmp = allocb(buf_left + extra, BPRI_HI);
    if (head == NULL)
        return (ENOMEM);
    head->b_wptr = head->b_rptr = head->b_rptr + wroff;

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING)) {
            freemsg(head);
            return (EINTR);
        }

        /*
         * Do the same checks as "write", since we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0) {
                freemsg(head);
                return (EINVAL);
            }
        } else
#endif
        if ((*count + sfv_len) < 0) {
            freemsg(head);
            return (EINVAL);
        }

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            while (sfv_len > 0) {
                if (buf_left == 0) {
                    tmp = dmp;
                    buf_left = MIN(total_size, maxblk);
                    iov_len = MIN(buf_left, sfv_len);
                    dmp = allocb(buf_left + extra, BPRI_HI);
                    if (dmp == NULL) {
                        freemsg(head);
                        return (ENOMEM);
                    }
                    dmp->b_wptr = dmp->b_rptr =
                        dmp->b_rptr + wroff;
                    tmp->b_cont = dmp;
                } else {
                    iov_len = MIN(buf_left, sfv_len);
                }

                aiov.iov_len = iov_len;
                aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
                auio.uio_loffset = *fileoff;
                auio.uio_iovcnt = 1;
                auio.uio_resid = iov_len;
                auio.uio_iov = &aiov;
                auio.uio_segflg = UIO_USERSPACE;
                auio.uio_llimit = curproc->p_fsz_ctl;
                auio.uio_fmode = fflag;

                buf_left -= iov_len;
                total_size -= iov_len;
                sfv_len -= iov_len;
                sfv_off += iov_len;

                error = uiomove((caddr_t)dmp->b_wptr,
                    iov_len, UIO_WRITE, &auio);
                if (error != 0) {
                    freemsg(head);
                    return (error);
                }
                dmp->b_wptr += iov_len;
            }
        } else {
            file_t *ffp;
            vnode_t *readvp;

            if ((ffp = getf(sfv->sfv_fd)) == NULL) {
                freemsg(head);
                return (EBADF);
            }

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EACCES);
            }

            readvp = ffp->f_vnode;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
             */

            if (vn_compare(vp, readvp)) {
                releasef(sfv->sfv_fd);
                freemsg(head);
                return (EINVAL);
            }

            /*
             * Note: we assume readvp != vp. "vp" is already
             * locked, and "readvp" must not be.
645 */ 646 647 (void) VOP_RWLOCK(readvp, readflg, NULL); 648 649 /* Same checks as in pread */ 650 if (sfv_off > maxoff) { 651 VOP_RWUNLOCK(readvp, readflg, NULL); 652 releasef(sfv->sfv_fd); 653 freemsg(head); 654 return (EINVAL); 655 } 656 if (sfv_off + sfv_len > maxoff) { 657 total_size -= (sfv_off + sfv_len - maxoff); 658 sfv_len = (ssize_t)((offset_t)maxoff - 659 sfv_off); 660 } 661 662 while (sfv_len > 0) { 663 if (buf_left == 0) { 664 tmp = dmp; 665 buf_left = MIN(total_size, maxblk); 666 iov_len = MIN(buf_left, sfv_len); 667 dmp = allocb(buf_left + extra, BPRI_HI); 668 if (dmp == NULL) { 669 VOP_RWUNLOCK(readvp, readflg, 670 NULL); 671 releasef(sfv->sfv_fd); 672 freemsg(head); 673 return (ENOMEM); 674 } 675 dmp->b_wptr = dmp->b_rptr = 676 dmp->b_rptr + wroff; 677 tmp->b_cont = dmp; 678 } else { 679 iov_len = MIN(buf_left, sfv_len); 680 } 681 aiov.iov_base = (caddr_t)dmp->b_wptr; 682 aiov.iov_len = iov_len; 683 auio.uio_loffset = sfv_off; 684 auio.uio_iov = &aiov; 685 auio.uio_iovcnt = 1; 686 auio.uio_resid = iov_len; 687 auio.uio_segflg = UIO_SYSSPACE; 688 auio.uio_llimit = MAXOFFSET_T; 689 auio.uio_fmode = ffp->f_flag; 690 ioflag = auio.uio_fmode & 691 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 692 693 /* 694 * If read sync is not asked for, 695 * filter sync flags 696 */ 697 if ((ioflag & FRSYNC) == 0) 698 ioflag &= ~(FSYNC|FDSYNC); 699 error = VOP_READ(readvp, &auio, ioflag, 700 fp->f_cred, NULL); 701 if (error != 0) { 702 /* 703 * If we were reading a pipe (currently 704 * not implemented), we may now loose 705 * data. 706 */ 707 VOP_RWUNLOCK(readvp, readflg, NULL); 708 releasef(sfv->sfv_fd); 709 freemsg(head); 710 return (error); 711 } 712 713 /* 714 * Check how much data was really read. 715 * Decrement the 'len' and increment the 716 * 'off' appropriately. 717 */ 718 cnt = iov_len - auio.uio_resid; 719 if (cnt == 0) { 720 VOP_RWUNLOCK(readvp, readflg, NULL); 721 releasef(sfv->sfv_fd); 722 freemsg(head); 723 return (EINVAL); 724 } 725 sfv_len -= cnt; 726 sfv_off += cnt; 727 total_size -= cnt; 728 buf_left -= cnt; 729 730 dmp->b_wptr += cnt; 731 } 732 VOP_RWUNLOCK(readvp, readflg, NULL); 733 releasef(sfv->sfv_fd); 734 } 735 sfv++; 736 } 737 738 ASSERT(total_size == 0); 739 error = kstrwritemp(vp, head, fflag); 740 if (error != 0) { 741 freemsg(head); 742 return (error); 743 } 744 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 745 *count += size; 746 747 return (0); 748 } 749 750 751 int 752 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 753 int copy_cnt, ssize_t *count) 754 { 755 struct vnode *vp; 756 struct uio auio; 757 struct iovec aiov; 758 ushort_t fflag; 759 int ioflag; 760 int i, error; 761 size_t cnt; 762 ssize_t sfv_len; 763 u_offset_t sfv_off; 764 #ifdef _SYSCALL32_IMPL 765 model_t model = get_udatamodel(); 766 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 

int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
    struct vnode *vp;
    struct uio auio;
    struct iovec aiov;
    ushort_t fflag;
    int ioflag;
    int i, error;
    size_t cnt;
    ssize_t sfv_len;
    u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
    model_t model = get_udatamodel();
    u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
        MAXOFF32_T : MAXOFFSET_T;
#else
    const u_offset_t maxoff = MAXOFF32_T;
#endif
    mblk_t *dmp = NULL;
    char *buf = NULL;
    size_t extra;
    int maxblk, wroff, tail_len;
    struct sonode *so;
    stdata_t *stp;

    fflag = fp->f_flag;
    vp = fp->f_vnode;

    if (vp->v_type == VSOCK) {
        so = VTOSO(vp);
        stp = vp->v_stream;
        wroff = (int)stp->sd_wroff;
        tail_len = (int)stp->sd_tail;
        maxblk = (int)stp->sd_maxblk;
        extra = wroff + tail_len;
    }

    auio.uio_extflg = UIO_COPY_DEFAULT;
    for (i = 0; i < copy_cnt; i++) {
        if (ISSIG(curthread, JUSTLOOKING))
            return (EINTR);

        /*
         * Do the same checks as "write", since we are writing
         * sfv_len bytes into "vp".
         */
        sfv_len = (ssize_t)sfv->sfv_len;

        if (sfv_len == 0) {
            sfv++;
            continue;
        }

        if (vp->v_type == VREG) {
            if (*fileoff >= curproc->p_fsz_ctl) {
                mutex_enter(&curproc->p_lock);
                (void) rctl_action(
                    rctlproc_legacy[RLIMIT_FSIZE],
                    curproc->p_rctls, curproc, RCA_SAFE);
                mutex_exit(&curproc->p_lock);

                return (EFBIG);
            }

            if (*fileoff >= maxoff)
                return (EFBIG);

            if (*fileoff + sfv_len > maxoff)
                return (EINVAL);
        }

        /* Check for overflow */
#ifdef _SYSCALL32_IMPL
        if (model == DATAMODEL_ILP32) {
            if (((ssize32_t)(*count + sfv_len)) < 0)
                return (EINVAL);
        } else
#endif
        if ((*count + sfv_len) < 0)
            return (EINVAL);

        sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

        if (sfv->sfv_fd == SFV_FD_SELF) {
            aiov.iov_len = sfv_len;
            aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
            auio.uio_loffset = *fileoff;
            auio.uio_iovcnt = 1;
            auio.uio_resid = sfv_len;
            auio.uio_iov = &aiov;
            auio.uio_segflg = UIO_USERSPACE;
            auio.uio_llimit = curproc->p_fsz_ctl;
            auio.uio_fmode = fflag;

            if (vp->v_type == VSOCK) {

                /*
                 * Optimize for the socket case
                 */

                dmp = allocb(sfv_len + extra, BPRI_HI);
                if (dmp == NULL)
                    return (ENOMEM);
                dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
                error = uiomove((caddr_t)dmp->b_wptr,
                    sfv_len, UIO_WRITE, &auio);
                if (error != 0) {
                    freeb(dmp);
                    return (error);
                }
                dmp->b_wptr += sfv_len;
                error = kstrwritemp(vp, dmp, fflag);
                if (error != 0) {
                    freeb(dmp);
                    return (error);
                }
                ttolwp(curthread)->lwp_ru.ioch +=
                    (ulong_t)sfv_len;
                *count += sfv_len;
            } else {
                ioflag = auio.uio_fmode &
                    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
                while (sfv_len > 0) {
                    error = VOP_WRITE(vp, &auio, ioflag,
                        fp->f_cred, NULL);
                    cnt = sfv_len - auio.uio_resid;
                    sfv_len -= cnt;
                    ttolwp(curthread)->lwp_ru.ioch +=
                        (ulong_t)cnt;
                    *fileoff += cnt;
                    *count += cnt;
                    if (error != 0)
                        return (error);
                }
            }
        } else {
            int segmapit = 0;
            file_t *ffp;
            vnode_t *readvp;
            struct vnode *realvp;
            size_t size;
            caddr_t ptr;

            if ((ffp = getf(sfv->sfv_fd)) == NULL)
                return (EBADF);

            if ((ffp->f_flag & FREAD) == 0) {
                releasef(sfv->sfv_fd);
                return (EBADF);
            }

            readvp = ffp->f_vnode;
            if (VOP_REALVP(readvp, &realvp, NULL) == 0)
                readvp = realvp;
            if (readvp->v_type != VREG) {
                releasef(sfv->sfv_fd);
                return (EINVAL);
            }

            /*
             * No point reading and writing to same vp,
             * as long as both are regular files. readvp is not
             * locked; but since we got it from an open file the
             * contents will be valid during the time of access.
917 */ 918 if (vn_compare(vp, readvp)) { 919 releasef(sfv->sfv_fd); 920 return (EINVAL); 921 } 922 923 /* 924 * Note: we assume readvp != vp. "vp" is already 925 * locked, and "readvp" must not be. 926 */ 927 (void) VOP_RWLOCK(readvp, readflg, NULL); 928 929 /* Same checks as in pread */ 930 if (sfv_off > maxoff) { 931 VOP_RWUNLOCK(readvp, readflg, NULL); 932 releasef(sfv->sfv_fd); 933 return (EINVAL); 934 } 935 if (sfv_off + sfv_len > maxoff) { 936 sfv_len = (ssize_t)((offset_t)maxoff - 937 sfv_off); 938 } 939 /* Find the native blocksize to transfer data */ 940 size = MIN(vp->v_vfsp->vfs_bsize, 941 readvp->v_vfsp->vfs_bsize); 942 size = sfv_len < size ? sfv_len : size; 943 944 if (vp->v_type != VSOCK) { 945 segmapit = 0; 946 buf = kmem_alloc(size, KM_NOSLEEP); 947 if (buf == NULL) { 948 VOP_RWUNLOCK(readvp, readflg, NULL); 949 releasef(sfv->sfv_fd); 950 return (ENOMEM); 951 } 952 } else { 953 /* 954 * For sockets acting as an SSL proxy, we 955 * need to adjust the size to the maximum 956 * SSL record size set in the stream head. 957 */ 958 if (so->so_kssl_ctx != NULL) 959 size = MIN(size, maxblk); 960 961 if (vn_has_flocks(readvp) || 962 readvp->v_flag & VNOMAP || 963 stp->sd_copyflag & STZCVMUNSAFE) { 964 segmapit = 0; 965 } else if (stp->sd_copyflag & STZCVMSAFE) { 966 segmapit = 1; 967 } else { 968 int on = 1; 969 if (SOP_SETSOCKOPT(VTOSO(vp), 970 SOL_SOCKET, SO_SND_COPYAVOID, 971 &on, sizeof (on)) == 0) 972 segmapit = 1; 973 } 974 } 975 976 if (segmapit) { 977 boolean_t nowait; 978 uint_t maxpsz; 979 980 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 981 maxpsz = stp->sd_qn_maxpsz; 982 if (maxpsz == INFPSZ) 983 maxpsz = maxphys; 984 maxpsz = roundup(maxpsz, MAXBSIZE); 985 error = snf_segmap(fp, readvp, sfv_off, 986 (u_offset_t)sfv_len, maxpsz, 987 (ssize_t *)&cnt, nowait); 988 releasef(sfv->sfv_fd); 989 *count += cnt; 990 if (error) 991 return (error); 992 sfv++; 993 continue; 994 } 995 996 while (sfv_len > 0) { 997 size_t iov_len; 998 999 iov_len = MIN(size, sfv_len); 1000 1001 if (vp->v_type == VSOCK) { 1002 dmp = allocb(iov_len + extra, BPRI_HI); 1003 if (dmp == NULL) { 1004 VOP_RWUNLOCK(readvp, readflg, 1005 NULL); 1006 releasef(sfv->sfv_fd); 1007 return (ENOMEM); 1008 } 1009 dmp->b_wptr = dmp->b_rptr = 1010 dmp->b_rptr + wroff; 1011 ptr = (caddr_t)dmp->b_rptr; 1012 } else { 1013 ptr = buf; 1014 } 1015 1016 aiov.iov_base = ptr; 1017 aiov.iov_len = iov_len; 1018 auio.uio_loffset = sfv_off; 1019 auio.uio_iov = &aiov; 1020 auio.uio_iovcnt = 1; 1021 auio.uio_resid = iov_len; 1022 auio.uio_segflg = UIO_SYSSPACE; 1023 auio.uio_llimit = MAXOFFSET_T; 1024 auio.uio_fmode = ffp->f_flag; 1025 ioflag = auio.uio_fmode & 1026 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1027 1028 /* 1029 * If read sync is not asked for, 1030 * filter sync flags 1031 */ 1032 if ((ioflag & FRSYNC) == 0) 1033 ioflag &= ~(FSYNC|FDSYNC); 1034 error = VOP_READ(readvp, &auio, ioflag, 1035 fp->f_cred, NULL); 1036 if (error != 0) { 1037 /* 1038 * If we were reading a pipe (currently 1039 * not implemented), we may now lose 1040 * data. 1041 */ 1042 if (vp->v_type == VSOCK) 1043 freeb(dmp); 1044 else 1045 kmem_free(buf, size); 1046 VOP_RWUNLOCK(readvp, readflg, NULL); 1047 releasef(sfv->sfv_fd); 1048 return (error); 1049 } 1050 1051 /* 1052 * Check how much data was really read. 1053 * Decrement the 'len' and increment the 1054 * 'off' appropriately. 
1055 */ 1056 cnt = iov_len - auio.uio_resid; 1057 if (cnt == 0) { 1058 if (vp->v_type == VSOCK) 1059 freeb(dmp); 1060 else 1061 kmem_free(buf, size); 1062 VOP_RWUNLOCK(readvp, readflg, NULL); 1063 releasef(sfv->sfv_fd); 1064 return (EINVAL); 1065 } 1066 sfv_len -= cnt; 1067 sfv_off += cnt; 1068 1069 if (vp->v_type == VSOCK) { 1070 dmp->b_wptr = dmp->b_rptr + cnt; 1071 1072 error = kstrwritemp(vp, dmp, fflag); 1073 if (error != 0) { 1074 freeb(dmp); 1075 VOP_RWUNLOCK(readvp, readflg, 1076 NULL); 1077 releasef(sfv->sfv_fd); 1078 return (error); 1079 } 1080 1081 ttolwp(curthread)->lwp_ru.ioch += 1082 (ulong_t)cnt; 1083 *count += cnt; 1084 } else { 1085 1086 aiov.iov_base = ptr; 1087 aiov.iov_len = cnt; 1088 auio.uio_loffset = *fileoff; 1089 auio.uio_resid = cnt; 1090 auio.uio_segflg = UIO_SYSSPACE; 1091 auio.uio_llimit = curproc->p_fsz_ctl; 1092 auio.uio_fmode = fflag; 1093 ioflag = auio.uio_fmode & 1094 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1095 error = VOP_WRITE(vp, &auio, ioflag, 1096 fp->f_cred, NULL); 1097 1098 /* 1099 * Check how much data was written. 1100 * Increment the 'len' and decrement the 1101 * 'off' if all the data was not 1102 * written. 1103 */ 1104 cnt -= auio.uio_resid; 1105 sfv_len += auio.uio_resid; 1106 sfv_off -= auio.uio_resid; 1107 ttolwp(curthread)->lwp_ru.ioch += 1108 (ulong_t)cnt; 1109 *fileoff += cnt; 1110 *count += cnt; 1111 if (error != 0) { 1112 kmem_free(buf, size); 1113 VOP_RWUNLOCK(readvp, readflg, 1114 NULL); 1115 releasef(sfv->sfv_fd); 1116 return (error); 1117 } 1118 } 1119 } 1120 if (buf) { 1121 kmem_free(buf, size); 1122 buf = NULL; 1123 } 1124 VOP_RWUNLOCK(readvp, readflg, NULL); 1125 releasef(sfv->sfv_fd); 1126 } 1127 sfv++; 1128 } 1129 return (0); 1130 } 1131 1132 ssize_t 1133 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1134 size_t *xferred) 1135 { 1136 int error = 0; 1137 int first_vector_error = 0; 1138 file_t *fp; 1139 struct vnode *vp; 1140 struct sonode *so; 1141 u_offset_t fileoff; 1142 int copy_cnt; 1143 const struct sendfilevec *copy_vec; 1144 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1145 ssize_t count = 0; 1146 #ifdef _SYSCALL32_IMPL 1147 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1148 #endif 1149 ssize_t total_size; 1150 int i; 1151 boolean_t is_sock = B_FALSE; 1152 int maxblk = 0; 1153 1154 if (sfvcnt <= 0) 1155 return (set_errno(EINVAL)); 1156 1157 if ((fp = getf(fildes)) == NULL) 1158 return (set_errno(EBADF)); 1159 1160 if (((fp->f_flag) & FWRITE) == 0) { 1161 error = EBADF; 1162 goto err; 1163 } 1164 1165 fileoff = fp->f_offset; 1166 vp = fp->f_vnode; 1167 1168 switch (vp->v_type) { 1169 case VSOCK: 1170 so = VTOSO(vp); 1171 /* sendfile not supported for SCTP */ 1172 if (so->so_protocol == IPPROTO_SCTP) { 1173 error = EPROTONOSUPPORT; 1174 goto err; 1175 } 1176 is_sock = B_TRUE; 1177 switch (so->so_family) { 1178 case AF_INET: 1179 case AF_INET6: 1180 /* 1181 * Make similar checks done in SOP_WRITE(). 
1182 */ 1183 if (so->so_state & SS_CANTSENDMORE) { 1184 tsignal(curthread, SIGPIPE); 1185 error = EPIPE; 1186 goto err; 1187 } 1188 if (so->so_type != SOCK_STREAM) { 1189 error = EOPNOTSUPP; 1190 goto err; 1191 } 1192 1193 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1194 (SS_ISCONNECTED|SS_ISBOUND)) { 1195 error = ENOTCONN; 1196 goto err; 1197 } 1198 1199 if ((so->so_state & SS_DIRECT) && 1200 (so->so_priv != NULL) && 1201 (so->so_kssl_ctx == NULL)) { 1202 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1203 } else { 1204 maxblk = (int)vp->v_stream->sd_maxblk; 1205 } 1206 break; 1207 default: 1208 error = EAFNOSUPPORT; 1209 goto err; 1210 } 1211 break; 1212 case VREG: 1213 break; 1214 default: 1215 error = EINVAL; 1216 goto err; 1217 } 1218 1219 switch (opcode) { 1220 case SENDFILEV : 1221 break; 1222 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1223 case SENDFILEV64 : 1224 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1225 (size32_t *)xferred, fildes)); 1226 #endif 1227 default : 1228 error = ENOSYS; 1229 break; 1230 } 1231 1232 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1233 copy_vec = vec; 1234 1235 do { 1236 total_size = 0; 1237 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1238 #ifdef _SYSCALL32_IMPL 1239 /* 32-bit callers need to have their iovec expanded. */ 1240 if (get_udatamodel() == DATAMODEL_ILP32) { 1241 if (copyin(copy_vec, sfv32, 1242 copy_cnt * sizeof (ksendfilevec32_t))) { 1243 error = EFAULT; 1244 break; 1245 } 1246 1247 for (i = 0; i < copy_cnt; i++) { 1248 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1249 sfv[i].sfv_off = 1250 (off_t)(uint32_t)sfv32[i].sfv_off; 1251 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1252 total_size += sfv[i].sfv_len; 1253 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1254 /* 1255 * Individual elements of the vector must not 1256 * wrap or overflow, as later math is signed. 1257 * Equally total_size needs to be checked after 1258 * each vector is added in, to be sure that 1259 * rogue values haven't overflowed the counter. 1260 */ 1261 if (((ssize32_t)sfv[i].sfv_len < 0) || 1262 ((ssize32_t)total_size < 0)) { 1263 /* 1264 * Truncate the vector to send data 1265 * described by elements before the 1266 * error. 1267 */ 1268 copy_cnt = i; 1269 first_vector_error = EINVAL; 1270 /* total_size can't be trusted */ 1271 if ((ssize32_t)total_size < 0) 1272 error = EINVAL; 1273 break; 1274 } 1275 } 1276 /* Nothing to do, process errors */ 1277 if (copy_cnt == 0) 1278 break; 1279 1280 } else { 1281 #endif 1282 if (copyin(copy_vec, sfv, 1283 copy_cnt * sizeof (sendfilevec_t))) { 1284 error = EFAULT; 1285 break; 1286 } 1287 1288 for (i = 0; i < copy_cnt; i++) { 1289 total_size += sfv[i].sfv_len; 1290 /* 1291 * Individual elements of the vector must not 1292 * wrap or overflow, as later math is signed. 1293 * Equally total_size needs to be checked after 1294 * each vector is added in, to be sure that 1295 * rogue values haven't overflowed the counter. 1296 */ 1297 if (((ssize_t)sfv[i].sfv_len < 0) || 1298 (total_size < 0)) { 1299 /* 1300 * Truncate the vector to send data 1301 * described by elements before the 1302 * error. 
1303 */ 1304 copy_cnt = i; 1305 first_vector_error = EINVAL; 1306 /* total_size can't be trusted */ 1307 if (total_size < 0) 1308 error = EINVAL; 1309 break; 1310 } 1311 } 1312 /* Nothing to do, process errors */ 1313 if (copy_cnt == 0) 1314 break; 1315 #ifdef _SYSCALL32_IMPL 1316 } 1317 #endif 1318 1319 /* 1320 * The task between deciding to use sendvec_small_chunk 1321 * and sendvec_chunk is dependant on multiple things: 1322 * 1323 * i) latency is important for smaller files. So if the 1324 * data is smaller than 'tcp_slow_start_initial' times 1325 * maxblk, then use sendvec_small_chunk which creates 1326 * maxblk size mblks and chains then together and sends 1327 * them to TCP in one shot. It also leaves 'wroff' size 1328 * space for the headers in each mblk. 1329 * 1330 * ii) for total size bigger than 'tcp_slow_start_initial' 1331 * time maxblk, its probably real file data which is 1332 * dominating. So its better to use sendvec_chunk because 1333 * performance goes to dog if we don't do pagesize reads. 1334 * sendvec_chunk will do pagesize reads and write them 1335 * in pagesize mblks to TCP. 1336 * 1337 * Side Notes: A write to file has not been optimized. 1338 * Future zero copy code will plugin into sendvec_chunk 1339 * only because doing zero copy for files smaller then 1340 * pagesize is useless. 1341 * 1342 * Note, if socket has NL7C enabled then call NL7C's 1343 * senfilev() function to consume the sfv[]. 1344 */ 1345 if (is_sock) { 1346 switch (so->so_family) { 1347 case AF_INET: 1348 case AF_INET6: 1349 if (so->so_nl7c_flags != 0) 1350 error = nl7c_sendfilev(so, &fileoff, 1351 sfv, copy_cnt, &count); 1352 else if ((total_size <= (4 * maxblk)) && 1353 error == 0) 1354 error = sendvec_small_chunk(fp, 1355 &fileoff, sfv, copy_cnt, 1356 total_size, maxblk, &count); 1357 else 1358 error = sendvec_chunk(fp, &fileoff, 1359 sfv, copy_cnt, &count); 1360 break; 1361 } 1362 } else { 1363 ASSERT(vp->v_type == VREG); 1364 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1365 &count); 1366 } 1367 1368 1369 #ifdef _SYSCALL32_IMPL 1370 if (get_udatamodel() == DATAMODEL_ILP32) 1371 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1372 (copy_cnt * sizeof (ksendfilevec32_t))); 1373 else 1374 #endif 1375 copy_vec += copy_cnt; 1376 sfvcnt -= copy_cnt; 1377 1378 /* Process all vector members up to first error */ 1379 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1380 1381 if (vp->v_type == VREG) 1382 fp->f_offset += count; 1383 1384 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1385 1386 #ifdef _SYSCALL32_IMPL 1387 if (get_udatamodel() == DATAMODEL_ILP32) { 1388 ssize32_t count32 = (ssize32_t)count; 1389 if (copyout(&count32, xferred, sizeof (count32))) 1390 error = EFAULT; 1391 releasef(fildes); 1392 if (error != 0) 1393 return (set_errno(error)); 1394 if (first_vector_error != 0) 1395 return (set_errno(first_vector_error)); 1396 return (count32); 1397 } 1398 #endif 1399 if (copyout(&count, xferred, sizeof (count))) 1400 error = EFAULT; 1401 releasef(fildes); 1402 if (error != 0) 1403 return (set_errno(error)); 1404 if (first_vector_error != 0) 1405 return (set_errno(first_vector_error)); 1406 return (count); 1407 err: 1408 ASSERT(error != 0); 1409 releasef(fildes); 1410 return (set_errno(error)); 1411 } 1412