1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/sysmacros.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/esunddi.h> 50 #include <sys/flock.h> 51 #include <sys/modctl.h> 52 #include <sys/cmn_err.h> 53 #include <sys/vmsystm.h> 54 55 #include <sys/socket.h> 56 #include <sys/socketvar.h> 57 #include <netinet/in.h> 58 #include <sys/sendfile.h> 59 #include <sys/un.h> 60 #include <inet/nca/ncadoorhdr.h> 61 #include <inet/nca/ncaio.h> 62 #include <sys/tihdr.h> 63 #include <sys/atomic.h> 64 65 #include <inet/common.h> 66 #include <inet/ip.h> 67 #include <inet/ip6.h> 68 #include <inet/tcp.h> 69 70 extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *); 71 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 72 ssize32_t *); 73 extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *, 74 int); 75 76 /* 77 * kstrwritemp() has very similar semantics as that of strwrite(). 78 * The main difference is it obtains mblks from the caller and also 79 * does not do any copy as done in strwrite() from user buffers to 80 * kernel buffers. 81 * 82 * Currently, this routine is used by sendfile to send data allocated 83 * within the kernel without any copying. This interface does not use the 84 * synchronous stream interface as synch. stream interface implies 85 * copying. 86 */ 87 int 88 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) 89 { 90 struct stdata *stp; 91 struct queue *wqp; 92 mblk_t *newmp; 93 char waitflag; 94 int tempmode; 95 int error = 0; 96 int done = 0; 97 struct sonode *so; 98 boolean_t direct; 99 100 ASSERT(vp->v_stream); 101 stp = vp->v_stream; 102 103 so = VTOSO(vp); 104 direct = (so->so_state & SS_DIRECT); 105 106 /* 107 * This is the sockfs direct fast path. canputnext() need 108 * not be accurate so we don't grab the sd_lock here. If 109 * we get flow-controlled, we grab sd_lock just before the 110 * do..while loop below to emulate what strwrite() does. 111 */ 112 wqp = stp->sd_wrq; 113 if (canputnext(wqp) && direct && 114 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) { 115 return (sostream_direct(so, NULL, mp, CRED())); 116 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { 117 /* Fast check of flags before acquiring the lock */ 118 mutex_enter(&stp->sd_lock); 119 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0); 120 mutex_exit(&stp->sd_lock); 121 if (error != 0) { 122 if (!(stp->sd_flag & STPLEX) && 123 (stp->sd_wput_opt & SW_SIGPIPE)) { 124 tsignal(curthread, SIGPIPE); 125 error = EPIPE; 126 } 127 return (error); 128 } 129 } 130 131 waitflag = WRITEWAIT; 132 if (stp->sd_flag & OLDNDELAY) 133 tempmode = fmode & ~FNDELAY; 134 else 135 tempmode = fmode; 136 137 mutex_enter(&stp->sd_lock); 138 do { 139 if (canputnext(wqp)) { 140 mutex_exit(&stp->sd_lock); 141 if (stp->sd_wputdatafunc != NULL) { 142 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL, 143 NULL, NULL, NULL); 144 if (newmp == NULL) { 145 /* The caller will free mp */ 146 return (ECOMM); 147 } 148 mp = newmp; 149 } 150 putnext(wqp, mp); 151 return (0); 152 } 153 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, 154 &done); 155 } while (error == 0 && !done); 156 157 mutex_exit(&stp->sd_lock); 158 /* 159 * EAGAIN tells the application to try again. ENOMEM 160 * is returned only if the memory allocation size 161 * exceeds the physical limits of the system. ENOMEM 162 * can't be true here. 163 */ 164 if (error == ENOMEM) 165 error = EAGAIN; 166 return (error); 167 } 168 169 #define SEND_MAX_CHUNK 16 170 171 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 172 /* 173 * 64 bit offsets for 32 bit applications only running either on 174 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 175 * more than 2GB of data. 176 */ 177 int 178 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 179 int copy_cnt, ssize32_t *count) 180 { 181 struct vnode *vp; 182 ushort_t fflag; 183 int ioflag; 184 size32_t cnt; 185 ssize32_t sfv_len; 186 ssize32_t tmpcount; 187 u_offset_t sfv_off; 188 struct uio auio; 189 struct iovec aiov; 190 int i, error; 191 192 fflag = fp->f_flag; 193 vp = fp->f_vnode; 194 for (i = 0; i < copy_cnt; i++) { 195 196 if (ISSIG(curthread, JUSTLOOKING)) 197 return (EINTR); 198 199 /* 200 * Do similar checks as "write" as we are writing 201 * sfv_len bytes into "vp". 202 */ 203 sfv_len = (ssize32_t)sfv->sfv_len; 204 205 if (sfv_len == 0) 206 continue; 207 208 if (sfv_len < 0) 209 return (EINVAL); 210 211 if (vp->v_type == VREG) { 212 if (*fileoff >= curproc->p_fsz_ctl) { 213 mutex_enter(&curproc->p_lock); 214 (void) rctl_action( 215 rctlproc_legacy[RLIMIT_FSIZE], 216 curproc->p_rctls, curproc, RCA_SAFE); 217 mutex_exit(&curproc->p_lock); 218 return (EFBIG); 219 } 220 221 if (*fileoff >= OFFSET_MAX(fp)) 222 return (EFBIG); 223 224 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 225 return (EINVAL); 226 } 227 228 tmpcount = *count + sfv_len; 229 if (tmpcount < 0) 230 return (EINVAL); 231 232 sfv_off = sfv->sfv_off; 233 234 auio.uio_extflg = UIO_COPY_DEFAULT; 235 if (sfv->sfv_fd == SFV_FD_SELF) { 236 aiov.iov_len = sfv_len; 237 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 238 auio.uio_loffset = *fileoff; 239 auio.uio_iovcnt = 1; 240 auio.uio_resid = sfv_len; 241 auio.uio_iov = &aiov; 242 auio.uio_segflg = UIO_USERSPACE; 243 auio.uio_llimit = curproc->p_fsz_ctl; 244 auio.uio_fmode = fflag; 245 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 246 while (sfv_len > 0) { 247 error = VOP_WRITE(vp, &auio, ioflag, 248 fp->f_cred, NULL); 249 cnt = sfv_len - auio.uio_resid; 250 sfv_len -= cnt; 251 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 252 if (vp->v_type == VREG) 253 *fileoff += cnt; 254 *count += cnt; 255 if (error != 0) 256 return (error); 257 } 258 } else { 259 file_t *ffp; 260 vnode_t *readvp; 261 int readflg = 0; 262 size_t size; 263 caddr_t ptr; 264 265 if ((ffp = getf(sfv->sfv_fd)) == NULL) 266 return (EBADF); 267 268 if ((ffp->f_flag & FREAD) == 0) { 269 releasef(sfv->sfv_fd); 270 return (EBADF); 271 } 272 273 readvp = ffp->f_vnode; 274 if (readvp->v_type != VREG) { 275 releasef(sfv->sfv_fd); 276 return (EINVAL); 277 } 278 279 /* 280 * No point reading and writing to same vp, 281 * as long as both are regular files. readvp is not 282 * locked; but since we got it from an open file the 283 * contents will be valid during the time of access. 284 */ 285 if (VN_CMP(vp, readvp)) { 286 releasef(sfv->sfv_fd); 287 return (EINVAL); 288 } 289 290 /* 291 * Note: we assume readvp != vp. "vp" is already 292 * locked, and "readvp" must not be. 293 */ 294 (void) VOP_RWLOCK(readvp, readflg, NULL); 295 296 /* 297 * Same checks as in pread64. 298 */ 299 if (sfv_off > MAXOFFSET_T) { 300 VOP_RWUNLOCK(readvp, readflg, NULL); 301 releasef(sfv->sfv_fd); 302 return (EINVAL); 303 } 304 305 if (sfv_off + sfv_len > MAXOFFSET_T) 306 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 307 308 /* Find the native blocksize to transfer data */ 309 size = MIN(vp->v_vfsp->vfs_bsize, 310 readvp->v_vfsp->vfs_bsize); 311 size = sfv_len < size ? sfv_len : size; 312 ptr = kmem_alloc(size, KM_SLEEP); 313 314 while (sfv_len > 0) { 315 size_t iov_len; 316 317 iov_len = MIN(size, sfv_len); 318 aiov.iov_base = ptr; 319 aiov.iov_len = iov_len; 320 auio.uio_loffset = sfv_off; 321 auio.uio_iov = &aiov; 322 auio.uio_iovcnt = 1; 323 auio.uio_resid = iov_len; 324 auio.uio_segflg = UIO_SYSSPACE; 325 auio.uio_llimit = MAXOFFSET_T; 326 auio.uio_fmode = ffp->f_flag; 327 ioflag = auio.uio_fmode & 328 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 329 330 /* 331 * If read sync is not asked for, 332 * filter sync flags 333 */ 334 if ((ioflag & FRSYNC) == 0) 335 ioflag &= ~(FSYNC|FDSYNC); 336 error = VOP_READ(readvp, &auio, ioflag, 337 fp->f_cred, NULL); 338 if (error) { 339 kmem_free(ptr, size); 340 VOP_RWUNLOCK(readvp, readflg, NULL); 341 releasef(sfv->sfv_fd); 342 return (error); 343 } 344 345 /* 346 * Check how must data was really read. 347 * Decrement the 'len' and increment the 348 * 'off' appropriately. 349 */ 350 cnt = iov_len - auio.uio_resid; 351 if (cnt == 0) { 352 /* 353 * If we were reading a pipe (currently 354 * not implemented), we may now lose 355 * data. 356 */ 357 kmem_free(ptr, size); 358 VOP_RWUNLOCK(readvp, readflg, NULL); 359 releasef(sfv->sfv_fd); 360 return (EINVAL); 361 } 362 sfv_len -= cnt; 363 sfv_off += cnt; 364 365 aiov.iov_base = ptr; 366 aiov.iov_len = cnt; 367 auio.uio_loffset = *fileoff; 368 auio.uio_resid = cnt; 369 auio.uio_segflg = UIO_SYSSPACE; 370 auio.uio_llimit = curproc->p_fsz_ctl; 371 auio.uio_fmode = fflag; 372 ioflag = auio.uio_fmode & 373 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 374 error = VOP_WRITE(vp, &auio, ioflag, 375 fp->f_cred, NULL); 376 377 /* 378 * Check how much data was written. Increment 379 * the 'len' and decrement the 'off' if all 380 * the data was not written. 381 */ 382 cnt -= auio.uio_resid; 383 sfv_len += auio.uio_resid; 384 sfv_off -= auio.uio_resid; 385 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 386 if (vp->v_type == VREG) 387 *fileoff += cnt; 388 *count += cnt; 389 if (error != 0) { 390 kmem_free(ptr, size); 391 VOP_RWUNLOCK(readvp, readflg, NULL); 392 releasef(sfv->sfv_fd); 393 return (error); 394 } 395 } 396 VOP_RWUNLOCK(readvp, readflg, NULL); 397 releasef(sfv->sfv_fd); 398 kmem_free(ptr, size); 399 } 400 sfv++; 401 } 402 return (0); 403 } 404 405 ssize32_t 406 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 407 size32_t *xferred, int fildes) 408 { 409 int rwflag; 410 u_offset_t fileoff; 411 int copy_cnt; 412 const struct ksendfilevec64 *copy_vec; 413 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 414 struct vnode *vp; 415 int error; 416 ssize32_t count = 0; 417 int osfvcnt; 418 419 rwflag = 1; 420 vp = fp->f_vnode; 421 (void) VOP_RWLOCK(vp, rwflag, NULL); 422 423 copy_vec = vec; 424 fileoff = fp->f_offset; 425 osfvcnt = sfvcnt; 426 427 do { 428 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 429 if (copyin(copy_vec, sfv, copy_cnt * 430 sizeof (struct ksendfilevec64))) { 431 error = EFAULT; 432 break; 433 } 434 435 /* 436 * Optimize the single regular file over 437 * the socket case. 438 */ 439 if (vp->v_type == VSOCK && osfvcnt == 1 && 440 sfv->sfv_fd != SFV_FD_SELF) { 441 file_t *rfp; 442 vnode_t *rvp; 443 444 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 445 error = EBADF; 446 break; 447 } 448 if ((rfp->f_flag & FREAD) == 0) { 449 releasef(sfv->sfv_fd); 450 error = EBADF; 451 break; 452 } 453 rvp = rfp->f_vnode; 454 if (rvp->v_type == VREG) { 455 error = sosendfile64(fp, rfp, sfv, &count); 456 break; 457 } 458 releasef(sfv->sfv_fd); 459 } 460 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 461 if (error != 0) 462 break; 463 464 copy_vec += copy_cnt; 465 sfvcnt -= copy_cnt; 466 } while (sfvcnt > 0); 467 468 if (vp->v_type == VREG) 469 fp->f_offset += count; 470 471 VOP_RWUNLOCK(vp, rwflag, NULL); 472 if (copyout(&count, xferred, sizeof (count))) 473 error = EFAULT; 474 releasef(fildes); 475 if (error != 0) 476 return (set_errno(error)); 477 return (count); 478 } 479 #endif 480 481 int 482 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 483 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 484 { 485 struct vnode *vp; 486 struct uio auio; 487 struct iovec aiov; 488 ushort_t fflag; 489 int ioflag; 490 int i, error; 491 size_t cnt; 492 ssize_t sfv_len; 493 u_offset_t sfv_off; 494 #ifdef _SYSCALL32_IMPL 495 model_t model = get_udatamodel(); 496 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 497 MAXOFF32_T : MAXOFFSET_T; 498 #else 499 const u_offset_t maxoff = MAXOFF32_T; 500 #endif 501 mblk_t *dmp = NULL; 502 int wroff; 503 int buf_left = 0; 504 size_t iov_len; 505 mblk_t *head, *tmp; 506 size_t size = total_size; 507 size_t extra; 508 int tail_len; 509 510 fflag = fp->f_flag; 511 vp = fp->f_vnode; 512 513 ASSERT(vp->v_type == VSOCK); 514 ASSERT(maxblk > 0); 515 516 wroff = (int)vp->v_stream->sd_wroff; 517 tail_len = (int)vp->v_stream->sd_tail; 518 extra = wroff + tail_len; 519 520 buf_left = MIN(total_size, maxblk); 521 head = dmp = allocb(buf_left + extra, BPRI_HI); 522 if (head == NULL) 523 return (ENOMEM); 524 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 525 526 auio.uio_extflg = UIO_COPY_DEFAULT; 527 for (i = 0; i < copy_cnt; i++) { 528 if (ISSIG(curthread, JUSTLOOKING)) 529 return (EINTR); 530 531 /* 532 * Do similar checks as "write" as we are writing 533 * sfv_len bytes into "vp". 534 */ 535 sfv_len = (ssize_t)sfv->sfv_len; 536 537 if (sfv_len == 0) { 538 sfv++; 539 continue; 540 } 541 542 /* Make sure sfv_len is not negative */ 543 #ifdef _SYSCALL32_IMPL 544 if (model == DATAMODEL_ILP32) { 545 if ((ssize32_t)sfv_len < 0) 546 return (EINVAL); 547 } else 548 #endif 549 if (sfv_len < 0) 550 return (EINVAL); 551 552 /* Check for overflow */ 553 #ifdef _SYSCALL32_IMPL 554 if (model == DATAMODEL_ILP32) { 555 if (((ssize32_t)(*count + sfv_len)) < 0) 556 return (EINVAL); 557 } else 558 #endif 559 if ((*count + sfv_len) < 0) 560 return (EINVAL); 561 562 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 563 564 if (sfv->sfv_fd == SFV_FD_SELF) { 565 while (sfv_len > 0) { 566 if (buf_left == 0) { 567 tmp = dmp; 568 buf_left = MIN(total_size, maxblk); 569 iov_len = MIN(buf_left, sfv_len); 570 dmp = allocb(buf_left + extra, BPRI_HI); 571 if (dmp == NULL) { 572 freemsg(head); 573 return (ENOMEM); 574 } 575 dmp->b_wptr = dmp->b_rptr = 576 dmp->b_rptr + wroff; 577 tmp->b_cont = dmp; 578 } else { 579 iov_len = MIN(buf_left, sfv_len); 580 } 581 582 aiov.iov_len = iov_len; 583 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 584 auio.uio_loffset = *fileoff; 585 auio.uio_iovcnt = 1; 586 auio.uio_resid = iov_len; 587 auio.uio_iov = &aiov; 588 auio.uio_segflg = UIO_USERSPACE; 589 auio.uio_llimit = curproc->p_fsz_ctl; 590 auio.uio_fmode = fflag; 591 592 buf_left -= iov_len; 593 total_size -= iov_len; 594 sfv_len -= iov_len; 595 sfv_off += iov_len; 596 597 error = uiomove((caddr_t)dmp->b_wptr, 598 iov_len, UIO_WRITE, &auio); 599 if (error != 0) { 600 freemsg(head); 601 return (error); 602 } 603 dmp->b_wptr += iov_len; 604 } 605 } else { 606 file_t *ffp; 607 vnode_t *readvp; 608 int readflg = 0; 609 610 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 611 freemsg(head); 612 return (EBADF); 613 } 614 615 if ((ffp->f_flag & FREAD) == 0) { 616 releasef(sfv->sfv_fd); 617 freemsg(head); 618 return (EACCES); 619 } 620 621 readvp = ffp->f_vnode; 622 if (readvp->v_type != VREG) { 623 releasef(sfv->sfv_fd); 624 freemsg(head); 625 return (EINVAL); 626 } 627 628 /* 629 * No point reading and writing to same vp, 630 * as long as both are regular files. readvp is not 631 * locked; but since we got it from an open file the 632 * contents will be valid during the time of access. 633 */ 634 635 if (VN_CMP(vp, readvp)) { 636 releasef(sfv->sfv_fd); 637 freemsg(head); 638 return (EINVAL); 639 } 640 641 /* 642 * Note: we assume readvp != vp. "vp" is already 643 * locked, and "readvp" must not be. 644 */ 645 646 (void) VOP_RWLOCK(readvp, readflg, NULL); 647 648 /* Same checks as in pread */ 649 if (sfv_off > maxoff) { 650 VOP_RWUNLOCK(readvp, readflg, NULL); 651 releasef(sfv->sfv_fd); 652 freemsg(head); 653 return (EINVAL); 654 } 655 if (sfv_off + sfv_len > maxoff) { 656 sfv_len = (ssize_t)((offset_t)maxoff - 657 sfv_off); 658 } 659 660 while (sfv_len > 0) { 661 if (buf_left == 0) { 662 tmp = dmp; 663 buf_left = MIN(total_size, maxblk); 664 iov_len = MIN(buf_left, sfv_len); 665 dmp = allocb(buf_left + extra, BPRI_HI); 666 if (dmp == NULL) { 667 VOP_RWUNLOCK(readvp, readflg, 668 NULL); 669 releasef(sfv->sfv_fd); 670 freemsg(head); 671 return (ENOMEM); 672 } 673 dmp->b_wptr = dmp->b_rptr = 674 dmp->b_rptr + wroff; 675 tmp->b_cont = dmp; 676 } else { 677 iov_len = MIN(buf_left, sfv_len); 678 } 679 aiov.iov_base = (caddr_t)dmp->b_wptr; 680 aiov.iov_len = iov_len; 681 auio.uio_loffset = sfv_off; 682 auio.uio_iov = &aiov; 683 auio.uio_iovcnt = 1; 684 auio.uio_resid = iov_len; 685 auio.uio_segflg = UIO_SYSSPACE; 686 auio.uio_llimit = MAXOFFSET_T; 687 auio.uio_fmode = ffp->f_flag; 688 ioflag = auio.uio_fmode & 689 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 690 691 /* 692 * If read sync is not asked for, 693 * filter sync flags 694 */ 695 if ((ioflag & FRSYNC) == 0) 696 ioflag &= ~(FSYNC|FDSYNC); 697 error = VOP_READ(readvp, &auio, ioflag, 698 fp->f_cred, NULL); 699 if (error != 0) { 700 /* 701 * If we were reading a pipe (currently 702 * not implemented), we may now loose 703 * data. 704 */ 705 VOP_RWUNLOCK(readvp, readflg, NULL); 706 releasef(sfv->sfv_fd); 707 freemsg(head); 708 return (error); 709 } 710 711 /* 712 * Check how much data was really read. 713 * Decrement the 'len' and increment the 714 * 'off' appropriately. 715 */ 716 cnt = iov_len - auio.uio_resid; 717 if (cnt == 0) { 718 VOP_RWUNLOCK(readvp, readflg, NULL); 719 releasef(sfv->sfv_fd); 720 freemsg(head); 721 return (EINVAL); 722 } 723 sfv_len -= cnt; 724 sfv_off += cnt; 725 total_size -= cnt; 726 buf_left -= cnt; 727 728 dmp->b_wptr += cnt; 729 } 730 VOP_RWUNLOCK(readvp, readflg, NULL); 731 releasef(sfv->sfv_fd); 732 } 733 sfv++; 734 } 735 736 ASSERT(total_size == 0); 737 error = kstrwritemp(vp, head, fflag); 738 if (error != 0) { 739 freemsg(head); 740 return (error); 741 } 742 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 743 *count += size; 744 745 return (0); 746 } 747 748 749 int 750 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 751 int copy_cnt, ssize_t *count) 752 { 753 struct vnode *vp; 754 struct uio auio; 755 struct iovec aiov; 756 ushort_t fflag; 757 int ioflag; 758 int i, error; 759 size_t cnt; 760 ssize_t sfv_len; 761 u_offset_t sfv_off; 762 #ifdef _SYSCALL32_IMPL 763 model_t model = get_udatamodel(); 764 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 765 MAXOFF32_T : MAXOFFSET_T; 766 #else 767 const u_offset_t maxoff = MAXOFF32_T; 768 #endif 769 mblk_t *dmp = NULL; 770 char *buf = NULL; 771 size_t extra; 772 int maxblk, wroff, tail_len; 773 struct sonode *so; 774 stdata_t *stp; 775 776 fflag = fp->f_flag; 777 vp = fp->f_vnode; 778 779 if (vp->v_type == VSOCK) { 780 so = VTOSO(vp); 781 stp = vp->v_stream; 782 wroff = (int)stp->sd_wroff; 783 tail_len = (int)stp->sd_tail; 784 maxblk = (int)stp->sd_maxblk; 785 extra = wroff + tail_len; 786 } 787 788 auio.uio_extflg = UIO_COPY_DEFAULT; 789 for (i = 0; i < copy_cnt; i++) { 790 if (ISSIG(curthread, JUSTLOOKING)) 791 return (EINTR); 792 793 /* 794 * Do similar checks as "write" as we are writing 795 * sfv_len bytes into "vp". 796 */ 797 sfv_len = (ssize_t)sfv->sfv_len; 798 799 if (sfv_len == 0) { 800 sfv++; 801 continue; 802 } 803 804 /* Make sure sfv_len is not negative */ 805 #ifdef _SYSCALL32_IMPL 806 if (model == DATAMODEL_ILP32) { 807 if ((ssize32_t)sfv_len < 0) 808 return (EINVAL); 809 } else 810 #endif 811 if (sfv_len < 0) 812 return (EINVAL); 813 814 if (vp->v_type == VREG) { 815 if (*fileoff >= curproc->p_fsz_ctl) { 816 mutex_enter(&curproc->p_lock); 817 (void) rctl_action( 818 rctlproc_legacy[RLIMIT_FSIZE], 819 curproc->p_rctls, curproc, RCA_SAFE); 820 mutex_exit(&curproc->p_lock); 821 822 return (EFBIG); 823 } 824 825 if (*fileoff >= maxoff) 826 return (EFBIG); 827 828 if (*fileoff + sfv_len > maxoff) 829 return (EINVAL); 830 } 831 832 /* Check for overflow */ 833 #ifdef _SYSCALL32_IMPL 834 if (model == DATAMODEL_ILP32) { 835 if (((ssize32_t)(*count + sfv_len)) < 0) 836 return (EINVAL); 837 } else 838 #endif 839 if ((*count + sfv_len) < 0) 840 return (EINVAL); 841 842 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 843 844 if (sfv->sfv_fd == SFV_FD_SELF) { 845 aiov.iov_len = sfv_len; 846 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 847 auio.uio_loffset = *fileoff; 848 auio.uio_iovcnt = 1; 849 auio.uio_resid = sfv_len; 850 auio.uio_iov = &aiov; 851 auio.uio_segflg = UIO_USERSPACE; 852 auio.uio_llimit = curproc->p_fsz_ctl; 853 auio.uio_fmode = fflag; 854 855 if (vp->v_type == VSOCK) { 856 857 /* 858 * Optimize for the socket case 859 */ 860 861 dmp = allocb(sfv_len + extra, BPRI_HI); 862 if (dmp == NULL) 863 return (ENOMEM); 864 dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; 865 error = uiomove((caddr_t)dmp->b_wptr, 866 sfv_len, UIO_WRITE, &auio); 867 if (error != 0) { 868 freeb(dmp); 869 return (error); 870 } 871 dmp->b_wptr += sfv_len; 872 error = kstrwritemp(vp, dmp, fflag); 873 if (error != 0) { 874 freeb(dmp); 875 return (error); 876 } 877 ttolwp(curthread)->lwp_ru.ioch += 878 (ulong_t)sfv_len; 879 *count += sfv_len; 880 } else { 881 ioflag = auio.uio_fmode & 882 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 883 while (sfv_len > 0) { 884 error = VOP_WRITE(vp, &auio, ioflag, 885 fp->f_cred, NULL); 886 cnt = sfv_len - auio.uio_resid; 887 sfv_len -= cnt; 888 ttolwp(curthread)->lwp_ru.ioch += 889 (ulong_t)cnt; 890 *fileoff += cnt; 891 *count += cnt; 892 if (error != 0) 893 return (error); 894 } 895 } 896 } else { 897 file_t *ffp; 898 vnode_t *readvp; 899 int readflg = 0; 900 size_t size; 901 caddr_t ptr; 902 903 if ((ffp = getf(sfv->sfv_fd)) == NULL) 904 return (EBADF); 905 906 if ((ffp->f_flag & FREAD) == 0) { 907 releasef(sfv->sfv_fd); 908 return (EBADF); 909 } 910 911 readvp = ffp->f_vnode; 912 if (readvp->v_type != VREG) { 913 releasef(sfv->sfv_fd); 914 return (EINVAL); 915 } 916 917 /* 918 * No point reading and writing to same vp, 919 * as long as both are regular files. readvp is not 920 * locked; but since we got it from an open file the 921 * contents will be valid during the time of access. 922 */ 923 if (VN_CMP(vp, readvp)) { 924 releasef(sfv->sfv_fd); 925 return (EINVAL); 926 } 927 928 /* 929 * Note: we assume readvp != vp. "vp" is already 930 * locked, and "readvp" must not be. 931 */ 932 (void) VOP_RWLOCK(readvp, readflg, NULL); 933 934 /* Same checks as in pread */ 935 if (sfv_off > maxoff) { 936 VOP_RWUNLOCK(readvp, readflg, NULL); 937 releasef(sfv->sfv_fd); 938 return (EINVAL); 939 } 940 if (sfv_off + sfv_len > maxoff) { 941 sfv_len = (ssize_t)((offset_t)maxoff - 942 sfv_off); 943 } 944 /* Find the native blocksize to transfer data */ 945 size = MIN(vp->v_vfsp->vfs_bsize, 946 readvp->v_vfsp->vfs_bsize); 947 size = sfv_len < size ? sfv_len : size; 948 949 if (vp->v_type != VSOCK) { 950 buf = kmem_alloc(size, KM_NOSLEEP); 951 if (buf == NULL) { 952 VOP_RWUNLOCK(readvp, readflg, NULL); 953 releasef(sfv->sfv_fd); 954 return (ENOMEM); 955 } 956 } else { 957 /* 958 * For sockets acting as an SSL proxy, we 959 * need to adjust the size to the maximum 960 * SSL record size set in the stream head. 961 */ 962 if (so->so_kssl_ctx != NULL) 963 size = MIN(size, maxblk); 964 } 965 966 while (sfv_len > 0) { 967 size_t iov_len; 968 969 iov_len = MIN(size, sfv_len); 970 971 if (vp->v_type == VSOCK) { 972 dmp = allocb(iov_len + extra, BPRI_HI); 973 if (dmp == NULL) { 974 VOP_RWUNLOCK(readvp, readflg, 975 NULL); 976 releasef(sfv->sfv_fd); 977 return (ENOMEM); 978 } 979 dmp->b_wptr = dmp->b_rptr = 980 dmp->b_rptr + wroff; 981 ptr = (caddr_t)dmp->b_rptr; 982 } else { 983 ptr = buf; 984 } 985 986 aiov.iov_base = ptr; 987 aiov.iov_len = iov_len; 988 auio.uio_loffset = sfv_off; 989 auio.uio_iov = &aiov; 990 auio.uio_iovcnt = 1; 991 auio.uio_resid = iov_len; 992 auio.uio_segflg = UIO_SYSSPACE; 993 auio.uio_llimit = MAXOFFSET_T; 994 auio.uio_fmode = ffp->f_flag; 995 ioflag = auio.uio_fmode & 996 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 997 998 /* 999 * If read sync is not asked for, 1000 * filter sync flags 1001 */ 1002 if ((ioflag & FRSYNC) == 0) 1003 ioflag &= ~(FSYNC|FDSYNC); 1004 error = VOP_READ(readvp, &auio, ioflag, 1005 fp->f_cred, NULL); 1006 if (error != 0) { 1007 /* 1008 * If we were reading a pipe (currently 1009 * not implemented), we may now lose 1010 * data. 1011 */ 1012 if (vp->v_type == VSOCK) 1013 freeb(dmp); 1014 else 1015 kmem_free(buf, size); 1016 VOP_RWUNLOCK(readvp, readflg, NULL); 1017 releasef(sfv->sfv_fd); 1018 return (error); 1019 } 1020 1021 /* 1022 * Check how much data was really read. 1023 * Decrement the 'len' and increment the 1024 * 'off' appropriately. 1025 */ 1026 cnt = iov_len - auio.uio_resid; 1027 if (cnt == 0) { 1028 if (vp->v_type == VSOCK) 1029 freeb(dmp); 1030 else 1031 kmem_free(buf, size); 1032 VOP_RWUNLOCK(readvp, readflg, NULL); 1033 releasef(sfv->sfv_fd); 1034 return (EINVAL); 1035 } 1036 sfv_len -= cnt; 1037 sfv_off += cnt; 1038 1039 if (vp->v_type == VSOCK) { 1040 dmp->b_wptr = dmp->b_rptr + cnt; 1041 1042 error = kstrwritemp(vp, dmp, fflag); 1043 if (error != 0) { 1044 freeb(dmp); 1045 VOP_RWUNLOCK(readvp, readflg, 1046 NULL); 1047 releasef(sfv->sfv_fd); 1048 return (error); 1049 } 1050 1051 ttolwp(curthread)->lwp_ru.ioch += 1052 (ulong_t)cnt; 1053 *count += cnt; 1054 } else { 1055 1056 aiov.iov_base = ptr; 1057 aiov.iov_len = cnt; 1058 auio.uio_loffset = *fileoff; 1059 auio.uio_resid = cnt; 1060 auio.uio_segflg = UIO_SYSSPACE; 1061 auio.uio_llimit = curproc->p_fsz_ctl; 1062 auio.uio_fmode = fflag; 1063 ioflag = auio.uio_fmode & 1064 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1065 error = VOP_WRITE(vp, &auio, ioflag, 1066 fp->f_cred, NULL); 1067 1068 /* 1069 * Check how much data was written. 1070 * Increment the 'len' and decrement the 1071 * 'off' if all the data was not 1072 * written. 1073 */ 1074 cnt -= auio.uio_resid; 1075 sfv_len += auio.uio_resid; 1076 sfv_off -= auio.uio_resid; 1077 ttolwp(curthread)->lwp_ru.ioch += 1078 (ulong_t)cnt; 1079 *fileoff += cnt; 1080 *count += cnt; 1081 if (error != 0) { 1082 VOP_RWUNLOCK(readvp, readflg, 1083 NULL); 1084 releasef(sfv->sfv_fd); 1085 return (error); 1086 } 1087 } 1088 } 1089 if (buf) { 1090 kmem_free(buf, size); 1091 buf = NULL; 1092 } 1093 VOP_RWUNLOCK(readvp, readflg, NULL); 1094 releasef(sfv->sfv_fd); 1095 } 1096 sfv++; 1097 } 1098 return (0); 1099 } 1100 1101 ssize_t 1102 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1103 size_t *xferred) 1104 { 1105 int error; 1106 file_t *fp; 1107 struct vnode *vp; 1108 struct sonode *so; 1109 u_offset_t fileoff; 1110 int copy_cnt; 1111 const struct sendfilevec *copy_vec; 1112 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1113 ssize_t count = 0; 1114 #ifdef _SYSCALL32_IMPL 1115 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1116 #endif 1117 ssize_t total_size = 0; 1118 int i; 1119 boolean_t is_sock = B_FALSE; 1120 int maxblk = 0; 1121 1122 if (sfvcnt <= 0) 1123 return (set_errno(EINVAL)); 1124 1125 if ((fp = getf(fildes)) == NULL) 1126 return (set_errno(EBADF)); 1127 1128 if (((fp->f_flag) & FWRITE) == 0) { 1129 error = EBADF; 1130 goto err; 1131 } 1132 1133 fileoff = fp->f_offset; 1134 vp = fp->f_vnode; 1135 1136 switch (vp->v_type) { 1137 case VSOCK: 1138 so = VTOSO(vp); 1139 /* sendfile not supported for SCTP */ 1140 if (so->so_protocol == IPPROTO_SCTP) { 1141 error = EPROTONOSUPPORT; 1142 goto err; 1143 } 1144 is_sock = B_TRUE; 1145 switch (so->so_family) { 1146 case AF_NCA: 1147 case AF_INET: 1148 case AF_INET6: 1149 /* 1150 * Make similar checks done in SOP_WRITE(). 1151 */ 1152 if (so->so_state & SS_CANTSENDMORE) { 1153 tsignal(curthread, SIGPIPE); 1154 error = EPIPE; 1155 goto err; 1156 } 1157 if (so->so_type != SOCK_STREAM) { 1158 error = EOPNOTSUPP; 1159 goto err; 1160 } 1161 1162 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1163 (SS_ISCONNECTED|SS_ISBOUND)) { 1164 error = ENOTCONN; 1165 goto err; 1166 } 1167 1168 if ((so->so_state & SS_DIRECT) && 1169 (so->so_priv != NULL) && 1170 (so->so_kssl_ctx == NULL)) { 1171 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1172 } else { 1173 maxblk = (int)vp->v_stream->sd_maxblk; 1174 } 1175 break; 1176 default: 1177 error = EAFNOSUPPORT; 1178 goto err; 1179 } 1180 break; 1181 case VREG: 1182 break; 1183 default: 1184 error = EINVAL; 1185 goto err; 1186 } 1187 1188 switch (opcode) { 1189 case SENDFILEV : 1190 break; 1191 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1192 case SENDFILEV64 : 1193 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1194 (size32_t *)xferred, fildes)); 1195 #endif 1196 default : 1197 error = ENOSYS; 1198 break; 1199 } 1200 1201 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1202 copy_vec = vec; 1203 1204 do { 1205 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1206 #ifdef _SYSCALL32_IMPL 1207 /* 32-bit callers need to have their iovec expanded. */ 1208 if (get_udatamodel() == DATAMODEL_ILP32) { 1209 if (copyin(copy_vec, sfv32, 1210 copy_cnt * sizeof (ksendfilevec32_t))) { 1211 error = EFAULT; 1212 break; 1213 } 1214 1215 for (i = 0; i < copy_cnt; i++) { 1216 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1217 sfv[i].sfv_off = 1218 (off_t)(uint32_t)sfv32[i].sfv_off; 1219 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1220 total_size += sfv[i].sfv_len; 1221 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1222 } 1223 } else { 1224 #endif 1225 if (copyin(copy_vec, sfv, 1226 copy_cnt * sizeof (sendfilevec_t))) { 1227 error = EFAULT; 1228 break; 1229 } 1230 1231 for (i = 0; i < copy_cnt; i++) { 1232 total_size += sfv[i].sfv_len; 1233 } 1234 #ifdef _SYSCALL32_IMPL 1235 } 1236 #endif 1237 1238 /* 1239 * The task between deciding to use sendvec_small_chunk 1240 * and sendvec_chunk is dependant on multiple things: 1241 * 1242 * i) latency is important for smaller files. So if the 1243 * data is smaller than 'tcp_slow_start_initial' times 1244 * maxblk, then use sendvec_small_chunk which creates 1245 * maxblk size mblks and chains then together and sends 1246 * them to TCP in one shot. It also leaves 'wroff' size 1247 * space for the headers in each mblk. 1248 * 1249 * ii) for total size bigger than 'tcp_slow_start_initial' 1250 * time maxblk, its probably real file data which is 1251 * dominating. So its better to use sendvec_chunk because 1252 * performance goes to dog if we don't do pagesize reads. 1253 * sendvec_chunk will do pagesize reads and write them 1254 * in pagesize mblks to TCP. 1255 * 1256 * Side Notes: A write to file has not been optimized. 1257 * Future zero copy code will plugin into sendvec_chunk 1258 * only because doing zero copy for files smaller then 1259 * pagesize is useless. 1260 * 1261 * Note, if socket has NL7C enabled then call NL7C's 1262 * senfilev() function to give NL7C a chance to copy 1263 * the vec for caching, then continue processing as 1264 * normal. 1265 */ 1266 if (is_sock) { 1267 switch (so->so_family) { 1268 case AF_INET: 1269 case AF_INET6: 1270 if (so->so_nl7c_flags != 0) { 1271 nl7c_sendfilev(so, fileoff, 1272 sfv, copy_cnt); 1273 } 1274 if (total_size <= (4 * maxblk)) 1275 error = sendvec_small_chunk(fp, 1276 &fileoff, sfv, copy_cnt, 1277 total_size, maxblk, &count); 1278 else 1279 error = sendvec_chunk(fp, &fileoff, 1280 sfv, copy_cnt, &count); 1281 break; 1282 case AF_NCA: 1283 error = nca_sendfilev(fp, sfv, copy_cnt, 1284 &count); 1285 break; 1286 } 1287 } else { 1288 ASSERT(vp->v_type == VREG); 1289 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1290 &count); 1291 } 1292 1293 1294 #ifdef _SYSCALL32_IMPL 1295 if (get_udatamodel() == DATAMODEL_ILP32) 1296 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1297 (copy_cnt * sizeof (ksendfilevec32_t))); 1298 else 1299 #endif 1300 copy_vec += copy_cnt; 1301 sfvcnt -= copy_cnt; 1302 } while (sfvcnt > 0); 1303 1304 if (vp->v_type == VREG) 1305 fp->f_offset += count; 1306 1307 1308 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1309 1310 #ifdef _SYSCALL32_IMPL 1311 if (get_udatamodel() == DATAMODEL_ILP32) { 1312 ssize32_t count32 = (ssize32_t)count; 1313 if (copyout(&count32, xferred, sizeof (count32))) 1314 error = EFAULT; 1315 releasef(fildes); 1316 if (error != 0) 1317 return (set_errno(error)); 1318 return (count32); 1319 } 1320 #endif 1321 if (copyout(&count, xferred, sizeof (count))) 1322 error = EFAULT; 1323 releasef(fildes); 1324 if (error != 0) 1325 return (set_errno(error)); 1326 return (count); 1327 err: 1328 ASSERT(error != 0); 1329 releasef(fildes); 1330 return (set_errno(error)); 1331 } 1332