1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/sysmacros.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/esunddi.h> 50 #include <sys/flock.h> 51 #include <sys/modctl.h> 52 #include <sys/cmn_err.h> 53 #include <sys/vmsystm.h> 54 55 #include <sys/socket.h> 56 #include <sys/socketvar.h> 57 #include <netinet/in.h> 58 #include <sys/sendfile.h> 59 #include <sys/un.h> 60 #include <inet/nca/ncadoorhdr.h> 61 #include <inet/nca/ncaio.h> 62 #include <sys/tihdr.h> 63 #include <sys/atomic.h> 64 65 #include <inet/common.h> 66 #include <inet/ip.h> 67 #include <inet/ip6.h> 68 #include <inet/tcp.h> 69 70 extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *); 71 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 72 ssize32_t *); 73 extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *, 74 int); 75 76 /* 77 * kstrwritemp() has very similar semantics as that of strwrite(). 78 * The main difference is it obtains mblks from the caller and also 79 * does not do any copy as done in strwrite() from user buffers to 80 * kernel buffers. 81 * 82 * Currently, this routine is used by sendfile to send data allocated 83 * within the kernel without any copying. This interface does not use the 84 * synchronous stream interface as synch. stream interface implies 85 * copying. 86 */ 87 int 88 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) 89 { 90 struct stdata *stp; 91 struct queue *wqp; 92 char waitflag; 93 int tempmode; 94 int error = 0; 95 int done = 0; 96 struct sonode *so; 97 boolean_t direct; 98 99 ASSERT(vp->v_stream); 100 stp = vp->v_stream; 101 102 so = VTOSO(vp); 103 direct = (so->so_state & SS_DIRECT); 104 105 /* 106 * This is the sockfs direct fast path. canputnext() need 107 * not be accurate so we don't grab the sd_lock here. If 108 * we get flow-controlled, we grab sd_lock just before the 109 * do..while loop below to emulate what strwrite() does. 110 */ 111 wqp = stp->sd_wrq; 112 if (canputnext(wqp) && direct && 113 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) { 114 return (sostream_direct(so, NULL, mp, CRED())); 115 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { 116 /* Fast check of flags before acquiring the lock */ 117 mutex_enter(&stp->sd_lock); 118 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0); 119 mutex_exit(&stp->sd_lock); 120 if (error != 0) { 121 if (!(stp->sd_flag & STPLEX) && 122 (stp->sd_wput_opt & SW_SIGPIPE)) { 123 tsignal(curthread, SIGPIPE); 124 error = EPIPE; 125 } 126 return (error); 127 } 128 } 129 130 waitflag = WRITEWAIT; 131 if (stp->sd_flag & OLDNDELAY) 132 tempmode = fmode & ~FNDELAY; 133 else 134 tempmode = fmode; 135 136 mutex_enter(&stp->sd_lock); 137 do { 138 if (canputnext(wqp)) { 139 mutex_exit(&stp->sd_lock); 140 putnext(wqp, mp); 141 return (0); 142 } 143 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, 144 &done); 145 } while (error == 0 && !done); 146 147 mutex_exit(&stp->sd_lock); 148 /* 149 * EAGAIN tells the application to try again. ENOMEM 150 * is returned only if the memory allocation size 151 * exceeds the physical limits of the system. ENOMEM 152 * can't be true here. 153 */ 154 if (error == ENOMEM) 155 error = EAGAIN; 156 return (error); 157 } 158 159 #define SEND_MAX_CHUNK 16 160 161 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 162 /* 163 * 64 bit offsets for 32 bit applications only running either on 164 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 165 * more than 2GB of data. 166 */ 167 int 168 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 169 int copy_cnt, ssize32_t *count) 170 { 171 struct vnode *vp; 172 ushort_t fflag; 173 int ioflag; 174 size32_t cnt; 175 ssize32_t sfv_len; 176 ssize32_t tmpcount; 177 u_offset_t sfv_off; 178 struct uio auio; 179 struct iovec aiov; 180 int i, error; 181 182 fflag = fp->f_flag; 183 vp = fp->f_vnode; 184 for (i = 0; i < copy_cnt; i++) { 185 186 if (ISSIG(curthread, JUSTLOOKING)) 187 return (EINTR); 188 189 /* 190 * Do similar checks as "write" as we are writing 191 * sfv_len bytes into "vp". 192 */ 193 sfv_len = (ssize32_t)sfv->sfv_len; 194 195 if (sfv_len == 0) 196 continue; 197 198 if (sfv_len < 0) 199 return (EINVAL); 200 201 if (vp->v_type == VREG) { 202 if (*fileoff >= curproc->p_fsz_ctl) { 203 mutex_enter(&curproc->p_lock); 204 (void) rctl_action( 205 rctlproc_legacy[RLIMIT_FSIZE], 206 curproc->p_rctls, curproc, RCA_SAFE); 207 mutex_exit(&curproc->p_lock); 208 return (EFBIG); 209 } 210 211 if (*fileoff >= OFFSET_MAX(fp)) 212 return (EFBIG); 213 214 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 215 return (EINVAL); 216 } 217 218 tmpcount = *count + sfv_len; 219 if (tmpcount < 0) 220 return (EINVAL); 221 222 sfv_off = sfv->sfv_off; 223 224 auio.uio_extflg = UIO_COPY_DEFAULT; 225 if (sfv->sfv_fd == SFV_FD_SELF) { 226 aiov.iov_len = sfv_len; 227 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 228 auio.uio_loffset = *fileoff; 229 auio.uio_iovcnt = 1; 230 auio.uio_resid = sfv_len; 231 auio.uio_iov = &aiov; 232 auio.uio_segflg = UIO_USERSPACE; 233 auio.uio_llimit = curproc->p_fsz_ctl; 234 auio.uio_fmode = fflag; 235 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 236 while (sfv_len > 0) { 237 error = VOP_WRITE(vp, &auio, ioflag, 238 fp->f_cred, NULL); 239 cnt = sfv_len - auio.uio_resid; 240 sfv_len -= cnt; 241 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 242 if (vp->v_type == VREG) 243 *fileoff += cnt; 244 *count += cnt; 245 if (error != 0) 246 return (error); 247 } 248 } else { 249 file_t *ffp; 250 vnode_t *readvp; 251 int readflg = 0; 252 size_t size; 253 caddr_t ptr; 254 255 if ((ffp = getf(sfv->sfv_fd)) == NULL) 256 return (EBADF); 257 258 if ((ffp->f_flag & FREAD) == 0) { 259 releasef(sfv->sfv_fd); 260 return (EBADF); 261 } 262 263 readvp = ffp->f_vnode; 264 if (readvp->v_type != VREG) { 265 releasef(sfv->sfv_fd); 266 return (EINVAL); 267 } 268 269 /* 270 * No point reading and writing to same vp, 271 * as long as both are regular files. readvp is not 272 * locked; but since we got it from an open file the 273 * contents will be valid during the time of access. 274 */ 275 if (VN_CMP(vp, readvp)) { 276 releasef(sfv->sfv_fd); 277 return (EINVAL); 278 } 279 280 /* 281 * Note: we assume readvp != vp. "vp" is already 282 * locked, and "readvp" must not be. 283 */ 284 (void) VOP_RWLOCK(readvp, readflg, NULL); 285 286 /* 287 * Same checks as in pread64. 288 */ 289 if (sfv_off > MAXOFFSET_T) { 290 VOP_RWUNLOCK(readvp, readflg, NULL); 291 releasef(sfv->sfv_fd); 292 return (EINVAL); 293 } 294 295 if (sfv_off + sfv_len > MAXOFFSET_T) 296 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 297 298 /* Find the native blocksize to transfer data */ 299 size = MIN(vp->v_vfsp->vfs_bsize, 300 readvp->v_vfsp->vfs_bsize); 301 size = sfv_len < size ? sfv_len : size; 302 ptr = kmem_alloc(size, KM_SLEEP); 303 304 while (sfv_len > 0) { 305 size_t iov_len; 306 307 iov_len = MIN(size, sfv_len); 308 aiov.iov_base = ptr; 309 aiov.iov_len = iov_len; 310 auio.uio_loffset = sfv_off; 311 auio.uio_iov = &aiov; 312 auio.uio_iovcnt = 1; 313 auio.uio_resid = iov_len; 314 auio.uio_segflg = UIO_SYSSPACE; 315 auio.uio_llimit = MAXOFFSET_T; 316 auio.uio_fmode = ffp->f_flag; 317 ioflag = auio.uio_fmode & 318 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 319 320 /* 321 * If read sync is not asked for, 322 * filter sync flags 323 */ 324 if ((ioflag & FRSYNC) == 0) 325 ioflag &= ~(FSYNC|FDSYNC); 326 error = VOP_READ(readvp, &auio, ioflag, 327 fp->f_cred, NULL); 328 if (error) { 329 kmem_free(ptr, size); 330 VOP_RWUNLOCK(readvp, readflg, NULL); 331 releasef(sfv->sfv_fd); 332 return (error); 333 } 334 335 /* 336 * Check how must data was really read. 337 * Decrement the 'len' and increment the 338 * 'off' appropriately. 339 */ 340 cnt = iov_len - auio.uio_resid; 341 if (cnt == 0) { 342 /* 343 * If we were reading a pipe (currently 344 * not implemented), we may now lose 345 * data. 346 */ 347 kmem_free(ptr, size); 348 VOP_RWUNLOCK(readvp, readflg, NULL); 349 releasef(sfv->sfv_fd); 350 return (EINVAL); 351 } 352 sfv_len -= cnt; 353 sfv_off += cnt; 354 355 aiov.iov_base = ptr; 356 aiov.iov_len = cnt; 357 auio.uio_loffset = *fileoff; 358 auio.uio_resid = cnt; 359 auio.uio_segflg = UIO_SYSSPACE; 360 auio.uio_llimit = curproc->p_fsz_ctl; 361 auio.uio_fmode = fflag; 362 ioflag = auio.uio_fmode & 363 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 364 error = VOP_WRITE(vp, &auio, ioflag, 365 fp->f_cred, NULL); 366 367 /* 368 * Check how much data was written. Increment 369 * the 'len' and decrement the 'off' if all 370 * the data was not written. 371 */ 372 cnt -= auio.uio_resid; 373 sfv_len += auio.uio_resid; 374 sfv_off -= auio.uio_resid; 375 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 376 if (vp->v_type == VREG) 377 *fileoff += cnt; 378 *count += cnt; 379 if (error != 0) { 380 kmem_free(ptr, size); 381 VOP_RWUNLOCK(readvp, readflg, NULL); 382 releasef(sfv->sfv_fd); 383 return (error); 384 } 385 } 386 VOP_RWUNLOCK(readvp, readflg, NULL); 387 releasef(sfv->sfv_fd); 388 kmem_free(ptr, size); 389 } 390 sfv++; 391 } 392 return (0); 393 } 394 395 ssize32_t 396 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 397 size32_t *xferred, int fildes) 398 { 399 int rwflag; 400 u_offset_t fileoff; 401 int copy_cnt; 402 const struct ksendfilevec64 *copy_vec; 403 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 404 struct vnode *vp; 405 int error; 406 ssize32_t count = 0; 407 int osfvcnt; 408 409 rwflag = 1; 410 vp = fp->f_vnode; 411 (void) VOP_RWLOCK(vp, rwflag, NULL); 412 413 copy_vec = vec; 414 fileoff = fp->f_offset; 415 osfvcnt = sfvcnt; 416 417 do { 418 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 419 if (copyin(copy_vec, sfv, copy_cnt * 420 sizeof (struct ksendfilevec64))) { 421 error = EFAULT; 422 break; 423 } 424 425 /* 426 * Optimize the single regular file over 427 * the socket case. 428 */ 429 if (vp->v_type == VSOCK && osfvcnt == 1 && 430 sfv->sfv_fd != SFV_FD_SELF) { 431 file_t *rfp; 432 vnode_t *rvp; 433 434 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 435 error = EBADF; 436 break; 437 } 438 if ((rfp->f_flag & FREAD) == 0) { 439 releasef(sfv->sfv_fd); 440 error = EBADF; 441 break; 442 } 443 rvp = rfp->f_vnode; 444 if (rvp->v_type == VREG) { 445 error = sosendfile64(fp, rfp, sfv, &count); 446 break; 447 } 448 releasef(sfv->sfv_fd); 449 } 450 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 451 if (error != 0) 452 break; 453 454 copy_vec += copy_cnt; 455 sfvcnt -= copy_cnt; 456 } while (sfvcnt > 0); 457 458 if (vp->v_type == VREG) 459 fp->f_offset += count; 460 461 VOP_RWUNLOCK(vp, rwflag, NULL); 462 if (copyout(&count, xferred, sizeof (count))) 463 error = EFAULT; 464 releasef(fildes); 465 if (error != 0) 466 return (set_errno(error)); 467 return (count); 468 } 469 #endif 470 471 int 472 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 473 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 474 { 475 struct vnode *vp; 476 struct uio auio; 477 struct iovec aiov; 478 ushort_t fflag; 479 int ioflag; 480 int i, error; 481 size_t cnt; 482 ssize_t sfv_len; 483 u_offset_t sfv_off; 484 #ifdef _SYSCALL32_IMPL 485 model_t model = get_udatamodel(); 486 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 487 MAXOFF32_T : MAXOFFSET_T; 488 #else 489 const u_offset_t maxoff = MAXOFF32_T; 490 #endif 491 mblk_t *dmp = NULL; 492 int wroff; 493 int buf_left = 0; 494 size_t iov_len; 495 mblk_t *head, *tmp; 496 size_t size = total_size; 497 498 fflag = fp->f_flag; 499 vp = fp->f_vnode; 500 501 ASSERT(vp->v_type == VSOCK); 502 ASSERT(maxblk > 0); 503 504 wroff = (int)vp->v_stream->sd_wroff; 505 buf_left = MIN(total_size, maxblk); 506 head = dmp = allocb(buf_left + wroff, BPRI_HI); 507 if (head == NULL) 508 return (ENOMEM); 509 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 510 511 auio.uio_extflg = UIO_COPY_DEFAULT; 512 for (i = 0; i < copy_cnt; i++) { 513 if (ISSIG(curthread, JUSTLOOKING)) 514 return (EINTR); 515 516 /* 517 * Do similar checks as "write" as we are writing 518 * sfv_len bytes into "vp". 519 */ 520 sfv_len = (ssize_t)sfv->sfv_len; 521 522 if (sfv_len == 0) { 523 sfv++; 524 continue; 525 } 526 527 /* Make sure sfv_len is not negative */ 528 #ifdef _SYSCALL32_IMPL 529 if (model == DATAMODEL_ILP32) { 530 if ((ssize32_t)sfv_len < 0) 531 return (EINVAL); 532 } else 533 #endif 534 if (sfv_len < 0) 535 return (EINVAL); 536 537 /* Check for overflow */ 538 #ifdef _SYSCALL32_IMPL 539 if (model == DATAMODEL_ILP32) { 540 if (((ssize32_t)(*count + sfv_len)) < 0) 541 return (EINVAL); 542 } else 543 #endif 544 if ((*count + sfv_len) < 0) 545 return (EINVAL); 546 547 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 548 549 if (sfv->sfv_fd == SFV_FD_SELF) { 550 while (sfv_len > 0) { 551 if (buf_left == 0) { 552 tmp = dmp; 553 buf_left = MIN(total_size, maxblk); 554 iov_len = MIN(buf_left, sfv_len); 555 dmp = allocb(buf_left + wroff, BPRI_HI); 556 if (dmp == NULL) { 557 freemsg(head); 558 return (ENOMEM); 559 } 560 dmp->b_wptr = dmp->b_rptr = 561 dmp->b_rptr + wroff; 562 tmp->b_cont = dmp; 563 } else { 564 iov_len = MIN(buf_left, sfv_len); 565 } 566 567 aiov.iov_len = iov_len; 568 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 569 auio.uio_loffset = *fileoff; 570 auio.uio_iovcnt = 1; 571 auio.uio_resid = iov_len; 572 auio.uio_iov = &aiov; 573 auio.uio_segflg = UIO_USERSPACE; 574 auio.uio_llimit = curproc->p_fsz_ctl; 575 auio.uio_fmode = fflag; 576 577 buf_left -= iov_len; 578 total_size -= iov_len; 579 sfv_len -= iov_len; 580 sfv_off += iov_len; 581 582 error = uiomove((caddr_t)dmp->b_wptr, 583 iov_len, UIO_WRITE, &auio); 584 if (error != 0) { 585 freemsg(head); 586 return (error); 587 } 588 dmp->b_wptr += iov_len; 589 } 590 } else { 591 file_t *ffp; 592 vnode_t *readvp; 593 int readflg = 0; 594 595 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 596 freemsg(head); 597 return (EBADF); 598 } 599 600 if ((ffp->f_flag & FREAD) == 0) { 601 releasef(sfv->sfv_fd); 602 freemsg(head); 603 return (EACCES); 604 } 605 606 readvp = ffp->f_vnode; 607 if (readvp->v_type != VREG) { 608 releasef(sfv->sfv_fd); 609 freemsg(head); 610 return (EINVAL); 611 } 612 613 /* 614 * No point reading and writing to same vp, 615 * as long as both are regular files. readvp is not 616 * locked; but since we got it from an open file the 617 * contents will be valid during the time of access. 618 */ 619 620 if (VN_CMP(vp, readvp)) { 621 releasef(sfv->sfv_fd); 622 freemsg(head); 623 return (EINVAL); 624 } 625 626 /* 627 * Note: we assume readvp != vp. "vp" is already 628 * locked, and "readvp" must not be. 629 */ 630 631 (void) VOP_RWLOCK(readvp, readflg, NULL); 632 633 /* Same checks as in pread */ 634 if (sfv_off > maxoff) { 635 VOP_RWUNLOCK(readvp, readflg, NULL); 636 releasef(sfv->sfv_fd); 637 freemsg(head); 638 return (EINVAL); 639 } 640 if (sfv_off + sfv_len > maxoff) { 641 sfv_len = (ssize_t)((offset_t)maxoff - 642 sfv_off); 643 } 644 645 while (sfv_len > 0) { 646 if (buf_left == 0) { 647 tmp = dmp; 648 buf_left = MIN(total_size, maxblk); 649 iov_len = MIN(buf_left, sfv_len); 650 dmp = allocb(buf_left + wroff, BPRI_HI); 651 if (dmp == NULL) { 652 VOP_RWUNLOCK(readvp, readflg, 653 NULL); 654 releasef(sfv->sfv_fd); 655 freemsg(head); 656 return (ENOMEM); 657 } 658 dmp->b_wptr = dmp->b_rptr = 659 dmp->b_rptr + wroff; 660 tmp->b_cont = dmp; 661 } else { 662 iov_len = MIN(buf_left, sfv_len); 663 } 664 aiov.iov_base = (caddr_t)dmp->b_wptr; 665 aiov.iov_len = iov_len; 666 auio.uio_loffset = sfv_off; 667 auio.uio_iov = &aiov; 668 auio.uio_iovcnt = 1; 669 auio.uio_resid = iov_len; 670 auio.uio_segflg = UIO_SYSSPACE; 671 auio.uio_llimit = MAXOFFSET_T; 672 auio.uio_fmode = ffp->f_flag; 673 ioflag = auio.uio_fmode & 674 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 675 676 /* 677 * If read sync is not asked for, 678 * filter sync flags 679 */ 680 if ((ioflag & FRSYNC) == 0) 681 ioflag &= ~(FSYNC|FDSYNC); 682 error = VOP_READ(readvp, &auio, ioflag, 683 fp->f_cred, NULL); 684 if (error != 0) { 685 /* 686 * If we were reading a pipe (currently 687 * not implemented), we may now loose 688 * data. 689 */ 690 VOP_RWUNLOCK(readvp, readflg, NULL); 691 releasef(sfv->sfv_fd); 692 freemsg(head); 693 return (error); 694 } 695 696 /* 697 * Check how much data was really read. 698 * Decrement the 'len' and increment the 699 * 'off' appropriately. 700 */ 701 cnt = iov_len - auio.uio_resid; 702 if (cnt == 0) { 703 VOP_RWUNLOCK(readvp, readflg, NULL); 704 releasef(sfv->sfv_fd); 705 freemsg(head); 706 return (EINVAL); 707 } 708 sfv_len -= cnt; 709 sfv_off += cnt; 710 total_size -= cnt; 711 buf_left -= cnt; 712 713 dmp->b_wptr += cnt; 714 } 715 VOP_RWUNLOCK(readvp, readflg, NULL); 716 releasef(sfv->sfv_fd); 717 } 718 sfv++; 719 } 720 721 ASSERT(total_size == 0); 722 error = kstrwritemp(vp, head, fflag); 723 if (error != 0) { 724 freemsg(head); 725 return (error); 726 } 727 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 728 *count += size; 729 730 return (0); 731 } 732 733 734 int 735 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 736 int copy_cnt, ssize_t *count) 737 { 738 struct vnode *vp; 739 struct uio auio; 740 struct iovec aiov; 741 ushort_t fflag; 742 int ioflag; 743 int i, error; 744 size_t cnt; 745 ssize_t sfv_len; 746 u_offset_t sfv_off; 747 #ifdef _SYSCALL32_IMPL 748 model_t model = get_udatamodel(); 749 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 750 MAXOFF32_T : MAXOFFSET_T; 751 #else 752 const u_offset_t maxoff = MAXOFF32_T; 753 #endif 754 mblk_t *dmp = NULL; 755 char *buf = NULL; 756 757 fflag = fp->f_flag; 758 vp = fp->f_vnode; 759 760 auio.uio_extflg = UIO_COPY_DEFAULT; 761 for (i = 0; i < copy_cnt; i++) { 762 if (ISSIG(curthread, JUSTLOOKING)) 763 return (EINTR); 764 765 /* 766 * Do similar checks as "write" as we are writing 767 * sfv_len bytes into "vp". 768 */ 769 sfv_len = (ssize_t)sfv->sfv_len; 770 771 if (sfv_len == 0) { 772 sfv++; 773 continue; 774 } 775 776 /* Make sure sfv_len is not negative */ 777 #ifdef _SYSCALL32_IMPL 778 if (model == DATAMODEL_ILP32) { 779 if ((ssize32_t)sfv_len < 0) 780 return (EINVAL); 781 } else 782 #endif 783 if (sfv_len < 0) 784 return (EINVAL); 785 786 if (vp->v_type == VREG) { 787 if (*fileoff >= curproc->p_fsz_ctl) { 788 mutex_enter(&curproc->p_lock); 789 (void) rctl_action( 790 rctlproc_legacy[RLIMIT_FSIZE], 791 curproc->p_rctls, curproc, RCA_SAFE); 792 mutex_exit(&curproc->p_lock); 793 794 return (EFBIG); 795 } 796 797 if (*fileoff >= maxoff) 798 return (EFBIG); 799 800 if (*fileoff + sfv_len > maxoff) 801 return (EINVAL); 802 } 803 804 /* Check for overflow */ 805 #ifdef _SYSCALL32_IMPL 806 if (model == DATAMODEL_ILP32) { 807 if (((ssize32_t)(*count + sfv_len)) < 0) 808 return (EINVAL); 809 } else 810 #endif 811 if ((*count + sfv_len) < 0) 812 return (EINVAL); 813 814 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 815 816 if (sfv->sfv_fd == SFV_FD_SELF) { 817 aiov.iov_len = sfv_len; 818 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 819 auio.uio_loffset = *fileoff; 820 auio.uio_iovcnt = 1; 821 auio.uio_resid = sfv_len; 822 auio.uio_iov = &aiov; 823 auio.uio_segflg = UIO_USERSPACE; 824 auio.uio_llimit = curproc->p_fsz_ctl; 825 auio.uio_fmode = fflag; 826 827 if (vp->v_type == VSOCK) { 828 829 /* 830 * Optimize for the socket case 831 */ 832 int wroff = (int)vp->v_stream->sd_wroff; 833 834 dmp = allocb(sfv_len + wroff, BPRI_HI); 835 if (dmp == NULL) 836 return (ENOMEM); 837 dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; 838 error = uiomove((caddr_t)dmp->b_wptr, 839 sfv_len, UIO_WRITE, &auio); 840 if (error != 0) { 841 freeb(dmp); 842 return (error); 843 } 844 dmp->b_wptr += sfv_len; 845 error = kstrwritemp(vp, dmp, fflag); 846 if (error != 0) { 847 freeb(dmp); 848 return (error); 849 } 850 ttolwp(curthread)->lwp_ru.ioch += 851 (ulong_t)sfv_len; 852 *count += sfv_len; 853 } else { 854 ioflag = auio.uio_fmode & 855 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 856 while (sfv_len > 0) { 857 error = VOP_WRITE(vp, &auio, ioflag, 858 fp->f_cred, NULL); 859 cnt = sfv_len - auio.uio_resid; 860 sfv_len -= cnt; 861 ttolwp(curthread)->lwp_ru.ioch += 862 (ulong_t)cnt; 863 *fileoff += cnt; 864 *count += cnt; 865 if (error != 0) 866 return (error); 867 } 868 } 869 } else { 870 file_t *ffp; 871 vnode_t *readvp; 872 int readflg = 0; 873 size_t size; 874 caddr_t ptr; 875 876 if ((ffp = getf(sfv->sfv_fd)) == NULL) 877 return (EBADF); 878 879 if ((ffp->f_flag & FREAD) == 0) { 880 releasef(sfv->sfv_fd); 881 return (EBADF); 882 } 883 884 readvp = ffp->f_vnode; 885 if (readvp->v_type != VREG) { 886 releasef(sfv->sfv_fd); 887 return (EINVAL); 888 } 889 890 /* 891 * No point reading and writing to same vp, 892 * as long as both are regular files. readvp is not 893 * locked; but since we got it from an open file the 894 * contents will be valid during the time of access. 895 */ 896 if (VN_CMP(vp, readvp)) { 897 releasef(sfv->sfv_fd); 898 return (EINVAL); 899 } 900 901 /* 902 * Note: we assume readvp != vp. "vp" is already 903 * locked, and "readvp" must not be. 904 */ 905 (void) VOP_RWLOCK(readvp, readflg, NULL); 906 907 /* Same checks as in pread */ 908 if (sfv_off > maxoff) { 909 VOP_RWUNLOCK(readvp, readflg, NULL); 910 releasef(sfv->sfv_fd); 911 return (EINVAL); 912 } 913 if (sfv_off + sfv_len > maxoff) { 914 sfv_len = (ssize_t)((offset_t)maxoff - 915 sfv_off); 916 } 917 /* Find the native blocksize to transfer data */ 918 size = MIN(vp->v_vfsp->vfs_bsize, 919 readvp->v_vfsp->vfs_bsize); 920 size = sfv_len < size ? sfv_len : size; 921 922 if (vp->v_type != VSOCK) { 923 buf = kmem_alloc(size, KM_NOSLEEP); 924 if (buf == NULL) { 925 VOP_RWUNLOCK(readvp, readflg, NULL); 926 releasef(sfv->sfv_fd); 927 return (ENOMEM); 928 } 929 } 930 931 while (sfv_len > 0) { 932 size_t iov_len; 933 934 iov_len = MIN(size, sfv_len); 935 936 if (vp->v_type == VSOCK) { 937 dmp = allocb(iov_len, BPRI_HI); 938 if (dmp == NULL) { 939 VOP_RWUNLOCK(readvp, readflg, 940 NULL); 941 releasef(sfv->sfv_fd); 942 return (ENOMEM); 943 } 944 ptr = (caddr_t)dmp->b_rptr; 945 } else { 946 ptr = buf; 947 } 948 949 aiov.iov_base = ptr; 950 aiov.iov_len = iov_len; 951 auio.uio_loffset = sfv_off; 952 auio.uio_iov = &aiov; 953 auio.uio_iovcnt = 1; 954 auio.uio_resid = iov_len; 955 auio.uio_segflg = UIO_SYSSPACE; 956 auio.uio_llimit = MAXOFFSET_T; 957 auio.uio_fmode = ffp->f_flag; 958 ioflag = auio.uio_fmode & 959 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 960 961 /* 962 * If read sync is not asked for, 963 * filter sync flags 964 */ 965 if ((ioflag & FRSYNC) == 0) 966 ioflag &= ~(FSYNC|FDSYNC); 967 error = VOP_READ(readvp, &auio, ioflag, 968 fp->f_cred, NULL); 969 if (error != 0) { 970 /* 971 * If we were reading a pipe (currently 972 * not implemented), we may now lose 973 * data. 974 */ 975 if (vp->v_type == VSOCK) 976 freeb(dmp); 977 else 978 kmem_free(buf, size); 979 VOP_RWUNLOCK(readvp, readflg, NULL); 980 releasef(sfv->sfv_fd); 981 return (error); 982 } 983 984 /* 985 * Check how much data was really read. 986 * Decrement the 'len' and increment the 987 * 'off' appropriately. 988 */ 989 cnt = iov_len - auio.uio_resid; 990 if (cnt == 0) { 991 if (vp->v_type == VSOCK) 992 freeb(dmp); 993 else 994 kmem_free(buf, size); 995 VOP_RWUNLOCK(readvp, readflg, NULL); 996 releasef(sfv->sfv_fd); 997 return (EINVAL); 998 } 999 sfv_len -= cnt; 1000 sfv_off += cnt; 1001 1002 if (vp->v_type == VSOCK) { 1003 dmp->b_wptr = dmp->b_rptr + cnt; 1004 1005 error = kstrwritemp(vp, dmp, fflag); 1006 if (error != 0) { 1007 freeb(dmp); 1008 VOP_RWUNLOCK(readvp, readflg, 1009 NULL); 1010 releasef(sfv->sfv_fd); 1011 return (error); 1012 } 1013 1014 ttolwp(curthread)->lwp_ru.ioch += 1015 (ulong_t)cnt; 1016 *count += cnt; 1017 } else { 1018 1019 aiov.iov_base = ptr; 1020 aiov.iov_len = cnt; 1021 auio.uio_loffset = *fileoff; 1022 auio.uio_resid = cnt; 1023 auio.uio_segflg = UIO_SYSSPACE; 1024 auio.uio_llimit = curproc->p_fsz_ctl; 1025 auio.uio_fmode = fflag; 1026 ioflag = auio.uio_fmode & 1027 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1028 error = VOP_WRITE(vp, &auio, ioflag, 1029 fp->f_cred, NULL); 1030 1031 /* 1032 * Check how much data was written. 1033 * Increment the 'len' and decrement the 1034 * 'off' if all the data was not 1035 * written. 1036 */ 1037 cnt -= auio.uio_resid; 1038 sfv_len += auio.uio_resid; 1039 sfv_off -= auio.uio_resid; 1040 ttolwp(curthread)->lwp_ru.ioch += 1041 (ulong_t)cnt; 1042 *fileoff += cnt; 1043 *count += cnt; 1044 if (error != 0) { 1045 VOP_RWUNLOCK(readvp, readflg, 1046 NULL); 1047 releasef(sfv->sfv_fd); 1048 return (error); 1049 } 1050 } 1051 } 1052 if (buf) { 1053 kmem_free(buf, size); 1054 buf = NULL; 1055 } 1056 VOP_RWUNLOCK(readvp, readflg, NULL); 1057 releasef(sfv->sfv_fd); 1058 } 1059 sfv++; 1060 } 1061 return (0); 1062 } 1063 1064 ssize_t 1065 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1066 size_t *xferred) 1067 { 1068 int error; 1069 file_t *fp; 1070 struct vnode *vp; 1071 struct sonode *so; 1072 u_offset_t fileoff; 1073 int copy_cnt; 1074 const struct sendfilevec *copy_vec; 1075 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1076 ssize_t count = 0; 1077 #ifdef _SYSCALL32_IMPL 1078 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1079 #endif 1080 ssize_t total_size = 0; 1081 int i; 1082 boolean_t is_sock = B_FALSE; 1083 int maxblk = 0; 1084 1085 if (sfvcnt <= 0) 1086 return (set_errno(EINVAL)); 1087 1088 if ((fp = getf(fildes)) == NULL) 1089 return (set_errno(EBADF)); 1090 1091 if (((fp->f_flag) & FWRITE) == 0) { 1092 error = EBADF; 1093 goto err; 1094 } 1095 1096 fileoff = fp->f_offset; 1097 vp = fp->f_vnode; 1098 1099 switch (vp->v_type) { 1100 case VSOCK: 1101 so = VTOSO(vp); 1102 /* sendfile not supported for SCTP */ 1103 if (so->so_protocol == IPPROTO_SCTP) { 1104 error = EPROTONOSUPPORT; 1105 goto err; 1106 } 1107 is_sock = B_TRUE; 1108 switch (so->so_family) { 1109 case AF_NCA: 1110 case AF_INET: 1111 case AF_INET6: 1112 /* 1113 * Make similar checks done in SOP_WRITE(). 1114 */ 1115 if (so->so_state & SS_CANTSENDMORE) { 1116 tsignal(curthread, SIGPIPE); 1117 error = EPIPE; 1118 goto err; 1119 } 1120 if (so->so_type != SOCK_STREAM) { 1121 error = EOPNOTSUPP; 1122 goto err; 1123 } 1124 1125 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1126 (SS_ISCONNECTED|SS_ISBOUND)) { 1127 error = ENOTCONN; 1128 goto err; 1129 } 1130 1131 if ((so->so_state & SS_DIRECT) && 1132 (so->so_priv != NULL)) { 1133 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1134 } else { 1135 maxblk = (int)vp->v_stream->sd_maxblk; 1136 } 1137 break; 1138 default: 1139 error = EAFNOSUPPORT; 1140 goto err; 1141 } 1142 break; 1143 case VREG: 1144 break; 1145 default: 1146 error = EINVAL; 1147 goto err; 1148 } 1149 1150 switch (opcode) { 1151 case SENDFILEV : 1152 break; 1153 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1154 case SENDFILEV64 : 1155 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1156 (size32_t *)xferred, fildes)); 1157 #endif 1158 default : 1159 error = ENOSYS; 1160 break; 1161 } 1162 1163 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1164 copy_vec = vec; 1165 1166 do { 1167 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1168 #ifdef _SYSCALL32_IMPL 1169 /* 32-bit callers need to have their iovec expanded. */ 1170 if (get_udatamodel() == DATAMODEL_ILP32) { 1171 if (copyin(copy_vec, sfv32, 1172 copy_cnt * sizeof (ksendfilevec32_t))) { 1173 error = EFAULT; 1174 break; 1175 } 1176 1177 for (i = 0; i < copy_cnt; i++) { 1178 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1179 sfv[i].sfv_off = 1180 (off_t)(uint32_t)sfv32[i].sfv_off; 1181 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1182 total_size += sfv[i].sfv_len; 1183 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1184 } 1185 } else { 1186 #endif 1187 if (copyin(copy_vec, sfv, 1188 copy_cnt * sizeof (sendfilevec_t))) { 1189 error = EFAULT; 1190 break; 1191 } 1192 1193 for (i = 0; i < copy_cnt; i++) { 1194 total_size += sfv[i].sfv_len; 1195 } 1196 #ifdef _SYSCALL32_IMPL 1197 } 1198 #endif 1199 1200 /* 1201 * The task between deciding to use sendvec_small_chunk 1202 * and sendvec_chunk is dependant on multiple things: 1203 * 1204 * i) latency is important for smaller files. So if the 1205 * data is smaller than 'tcp_slow_start_initial' times 1206 * maxblk, then use sendvec_small_chunk which creates 1207 * maxblk size mblks and chains then together and sends 1208 * them to TCP in one shot. It also leaves 'wroff' size 1209 * space for the headers in each mblk. 1210 * 1211 * ii) for total size bigger than 'tcp_slow_start_initial' 1212 * time maxblk, its probably real file data which is 1213 * dominating. So its better to use sendvec_chunk because 1214 * performance goes to dog if we don't do pagesize reads. 1215 * sendvec_chunk will do pagesize reads and write them 1216 * in pagesize mblks to TCP. 1217 * 1218 * Side Notes: A write to file has not been optimized. 1219 * Future zero copy code will plugin into sendvec_chunk 1220 * only because doing zero copy for files smaller then 1221 * pagesize is useless. 1222 * 1223 * Note, if socket has NL7C enabled then call NL7C's 1224 * senfilev() function to give NL7C a chance to copy 1225 * the vec for caching, then continue processing as 1226 * normal. 1227 */ 1228 if (is_sock) { 1229 switch (so->so_family) { 1230 case AF_INET: 1231 case AF_INET6: 1232 if (so->so_nl7c_flags != 0) { 1233 nl7c_sendfilev(so, fileoff, 1234 sfv, copy_cnt); 1235 } 1236 if (total_size <= (4 * maxblk)) 1237 error = sendvec_small_chunk(fp, 1238 &fileoff, sfv, copy_cnt, 1239 total_size, maxblk, &count); 1240 else 1241 error = sendvec_chunk(fp, &fileoff, 1242 sfv, copy_cnt, &count); 1243 break; 1244 case AF_NCA: 1245 error = nca_sendfilev(fp, sfv, copy_cnt, 1246 &count); 1247 break; 1248 } 1249 } else { 1250 ASSERT(vp->v_type == VREG); 1251 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1252 &count); 1253 } 1254 1255 1256 #ifdef _SYSCALL32_IMPL 1257 if (get_udatamodel() == DATAMODEL_ILP32) 1258 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1259 (copy_cnt * sizeof (ksendfilevec32_t))); 1260 else 1261 #endif 1262 copy_vec += copy_cnt; 1263 sfvcnt -= copy_cnt; 1264 } while (sfvcnt > 0); 1265 1266 if (vp->v_type == VREG) 1267 fp->f_offset += count; 1268 1269 1270 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1271 1272 #ifdef _SYSCALL32_IMPL 1273 if (get_udatamodel() == DATAMODEL_ILP32) { 1274 ssize32_t count32 = (ssize32_t)count; 1275 if (copyout(&count32, xferred, sizeof (count32))) 1276 error = EFAULT; 1277 releasef(fildes); 1278 if (error != 0) 1279 return (set_errno(error)); 1280 return (count32); 1281 } 1282 #endif 1283 if (copyout(&count, xferred, sizeof (count))) 1284 error = EFAULT; 1285 releasef(fildes); 1286 if (error != 0) 1287 return (set_errno(error)); 1288 return (count); 1289 err: 1290 ASSERT(error != 0); 1291 releasef(fildes); 1292 return (set_errno(error)); 1293 } 1294