/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <inet/nca/ncadoorhdr.h>
#include <inet/nca/ncaio.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *);
extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
	ssize32_t *);
extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
	int);

#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0)
			continue;

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			int readflg = 0;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ?
			    sfv_len : size;
			ptr = kmem_alloc(size, KM_SLEEP);

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	int rwflag;
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;
	int osfvcnt;

	rwflag = 1;
	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, rwflag, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;
	osfvcnt = sfvcnt;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		/*
		 * Optimize the single regular file over
		 * the socket case.
		 */
		if (vp->v_type == VSOCK && osfvcnt == 1 &&
		    sfv->sfv_fd != SFV_FD_SELF) {
			file_t *rfp;
			vnode_t *rvp;

			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
				error = EBADF;
				break;
			}
			if ((rfp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				error = EBADF;
				break;
			}
			rvp = rfp->f_vnode;
			if (rvp->v_type == VREG) {
				error = sosendfile64(fp, rfp, sfv, &count);
				break;
			}
			releasef(sfv->sfv_fd);
		}
		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, rwflag, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif

int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	wroff = (int)vp->v_stream->sd_wroff;
	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + wroff, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + wroff, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			int readflg = 0;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + wroff, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = kstrwritemp(vp, head, fflag);
	if (error != 0) {
		freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}


int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Make sure sfv_len is not negative */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if ((ssize32_t)sfv_len < 0)
				return (EINVAL);
		} else
#endif
		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;

			if (vp->v_type == VSOCK) {

				/*
				 * Optimize for the socket case
				 */
				int wroff = (int)vp->v_stream->sd_wroff;

				dmp = allocb(sfv_len + wroff, BPRI_HI);
				if (dmp == NULL)
					return (ENOMEM);
				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
				error = uiomove((caddr_t)dmp->b_wptr,
				    sfv_len, UIO_WRITE, &auio);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				dmp->b_wptr += sfv_len;
				error = kstrwritemp(vp, dmp, fflag);
				if (error != 0) {
					freeb(dmp);
					return (error);
				}
				ttolwp(curthread)->lwp_ru.ioch +=
				    (ulong_t)sfv_len;
				*count += sfv_len;
			} else {
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			int readflg = 0;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (VN_CMP(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			(void) VOP_RWLOCK(readvp, readflg, NULL);

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, readflg, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, readflg, NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = kstrwritemp(vp, dmp, fflag);
					if (error != 0) {
						freeb(dmp);
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						VOP_RWUNLOCK(readvp, readflg,
						    NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, readflg, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size = 0;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		/* sendfile not supported for SCTP */
		if (so->so_protocol == IPPROTO_SCTP) {
			error = EPROTONOSUPPORT;
			goto err;
		}
		is_sock = B_TRUE;
		switch (so->so_family) {
		case AF_NCA:
		case AF_INET:
		case AF_INET6:
			/*
			 * Make similar checks done in SOP_WRITE().
			 */
			if (so->so_state & SS_CANTSENDMORE) {
				tsignal(curthread, SIGPIPE);
				error = EPIPE;
				goto err;
			}
			if (so->so_type != SOCK_STREAM) {
				error = EOPNOTSUPP;
				goto err;
			}

			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
			    (SS_ISCONNECTED|SS_ISBOUND)) {
				error = ENOTCONN;
				goto err;
			}

			if ((so->so_state & SS_TCP_FAST_ACCEPT) &&
			    (so->so_priv != NULL)) {
				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
			} else {
				maxblk = (int)vp->v_stream->sd_maxblk;
			}
			break;
		default:
			error = EAFNOSUPPORT;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV:
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64:
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default:
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/*
		 * 32-bit callers need to have their iovec expanded.
		 */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
			}
		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
			}
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk and sendvec_chunk
		 * depends on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk which creates
		 * maxblk size mblks, chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data which is
		 * dominating. So it's better to use sendvec_chunk because
		 * performance suffers badly if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plug into sendvec_chunk
		 * only because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note, if socket has NL7C enabled then call NL7C's
		 * sendfilev() function to give NL7C a chance to copy
		 * the vec for caching, then continue processing as
		 * normal.
		 */
		if (is_sock) {
			switch (so->so_family) {
			case AF_INET:
			case AF_INET6:
				if (so->so_nl7c_flags != 0) {
					nl7c_sendfilev(so, fileoff,
					    sfv, copy_cnt);
				}
				if (total_size <= (4 * maxblk))
					error = sendvec_small_chunk(fp,
					    &fileoff, sfv, copy_cnt,
					    total_size, maxblk, &count);
				else
					error = sendvec_chunk(fp, &fileoff,
					    sfv, copy_cnt, &count);
				break;
			case AF_NCA:
				error = nca_sendfilev(fp, sfv, copy_cnt,
				    &count);
				break;
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32)
			copy_vec = (const struct sendfilevec *)
			    ((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;


	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}
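
/*
 * Illustrative note: userland normally reaches this syscall through the
 * sendfilev(3EXT) library wrapper, handing in the same sendfilevec entries
 * that the chunking routines above iterate over (SFV_FD_SELF entries carry
 * an in-memory address in sfv_off; file-backed entries carry a descriptor
 * and a file offset).  The fragment below is only a minimal sketch of such
 * a caller; the file name, descriptor handling, and error handling are
 * assumptions for illustration, and this fragment is not built as part of
 * the kernel.
 *
 *	#include <sys/sendfile.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	// Send an in-memory header followed by the first 64KB of a file
 *	// over an already-connected TCP socket "sock".
 *	static int
 *	send_with_header(int sock, const char *hdr, size_t hdrlen)
 *	{
 *		struct sendfilevec vec[2];
 *		size_t xferred = 0;
 *		int fd = open("datafile", O_RDONLY);
 *
 *		if (fd < 0)
 *			return (-1);
 *		vec[0].sfv_fd = SFV_FD_SELF;	// data comes from memory
 *		vec[0].sfv_flag = 0;
 *		vec[0].sfv_off = (off_t)(uintptr_t)hdr;	// address of data
 *		vec[0].sfv_len = hdrlen;
 *		vec[1].sfv_fd = fd;		// data comes from the file
 *		vec[1].sfv_flag = 0;
 *		vec[1].sfv_off = 0;		// file offset
 *		vec[1].sfv_len = 64 * 1024;
 *		if (sendfilev(sock, vec, 2, &xferred) == -1) {
 *			(void) close(fd);
 *			return (-1);
 *		}
 *		(void) close(fd);
 *		return (0);
 *	}
 */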