1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/t_lock.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/cred.h> 36 #include <sys/kmem.h> 37 #include <sys/sysmacros.h> 38 #include <sys/vfs.h> 39 #include <sys/vnode.h> 40 #include <sys/debug.h> 41 #include <sys/errno.h> 42 #include <sys/time.h> 43 #include <sys/file.h> 44 #include <sys/open.h> 45 #include <sys/user.h> 46 #include <sys/termios.h> 47 #include <sys/stream.h> 48 #include <sys/strsubr.h> 49 #include <sys/esunddi.h> 50 #include <sys/flock.h> 51 #include <sys/modctl.h> 52 #include <sys/cmn_err.h> 53 #include <sys/vmsystm.h> 54 55 #include <sys/socket.h> 56 #include <sys/socketvar.h> 57 #include <netinet/in.h> 58 #include <sys/sendfile.h> 59 #include <sys/un.h> 60 #include <inet/nca/ncadoorhdr.h> 61 #include <inet/nca/ncaio.h> 62 #include <sys/tihdr.h> 63 #include <sys/atomic.h> 64 65 #include <inet/common.h> 66 #include <inet/ip.h> 67 #include <inet/ip6.h> 68 #include <inet/tcp.h> 69 70 extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *); 71 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 72 ssize32_t *); 73 extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *, 74 int); 75 76 #define SEND_MAX_CHUNK 16 77 78 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 79 /* 80 * 64 bit offsets for 32 bit applications only running either on 81 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 82 * more than 2GB of data. 83 */ 84 int 85 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 86 int copy_cnt, ssize32_t *count) 87 { 88 struct vnode *vp; 89 ushort_t fflag; 90 int ioflag; 91 size32_t cnt; 92 ssize32_t sfv_len; 93 ssize32_t tmpcount; 94 u_offset_t sfv_off; 95 struct uio auio; 96 struct iovec aiov; 97 int i, error; 98 99 fflag = fp->f_flag; 100 vp = fp->f_vnode; 101 for (i = 0; i < copy_cnt; i++) { 102 103 if (ISSIG(curthread, JUSTLOOKING)) 104 return (EINTR); 105 106 /* 107 * Do similar checks as "write" as we are writing 108 * sfv_len bytes into "vp". 109 */ 110 sfv_len = (ssize32_t)sfv->sfv_len; 111 112 if (sfv_len == 0) 113 continue; 114 115 if (sfv_len < 0) 116 return (EINVAL); 117 118 if (vp->v_type == VREG) { 119 if (*fileoff >= curproc->p_fsz_ctl) { 120 mutex_enter(&curproc->p_lock); 121 (void) rctl_action( 122 rctlproc_legacy[RLIMIT_FSIZE], 123 curproc->p_rctls, curproc, RCA_SAFE); 124 mutex_exit(&curproc->p_lock); 125 return (EFBIG); 126 } 127 128 if (*fileoff >= OFFSET_MAX(fp)) 129 return (EFBIG); 130 131 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 132 return (EINVAL); 133 } 134 135 tmpcount = *count + sfv_len; 136 if (tmpcount < 0) 137 return (EINVAL); 138 139 sfv_off = sfv->sfv_off; 140 141 auio.uio_extflg = UIO_COPY_DEFAULT; 142 if (sfv->sfv_fd == SFV_FD_SELF) { 143 aiov.iov_len = sfv_len; 144 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 145 auio.uio_loffset = *fileoff; 146 auio.uio_iovcnt = 1; 147 auio.uio_resid = sfv_len; 148 auio.uio_iov = &aiov; 149 auio.uio_segflg = UIO_USERSPACE; 150 auio.uio_llimit = curproc->p_fsz_ctl; 151 auio.uio_fmode = fflag; 152 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 153 while (sfv_len > 0) { 154 error = VOP_WRITE(vp, &auio, ioflag, 155 fp->f_cred, NULL); 156 cnt = sfv_len - auio.uio_resid; 157 sfv_len -= cnt; 158 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 159 if (vp->v_type == VREG) 160 *fileoff += cnt; 161 *count += cnt; 162 if (error != 0) 163 return (error); 164 } 165 } else { 166 file_t *ffp; 167 vnode_t *readvp; 168 int readflg = 0; 169 size_t size; 170 caddr_t ptr; 171 172 if ((ffp = getf(sfv->sfv_fd)) == NULL) 173 return (EBADF); 174 175 if ((ffp->f_flag & FREAD) == 0) { 176 releasef(sfv->sfv_fd); 177 return (EBADF); 178 } 179 180 readvp = ffp->f_vnode; 181 if (readvp->v_type != VREG) { 182 releasef(sfv->sfv_fd); 183 return (EINVAL); 184 } 185 186 /* 187 * No point reading and writing to same vp, 188 * as long as both are regular files. readvp is not 189 * locked; but since we got it from an open file the 190 * contents will be valid during the time of access. 191 */ 192 if (VN_CMP(vp, readvp)) { 193 releasef(sfv->sfv_fd); 194 return (EINVAL); 195 } 196 197 /* 198 * Note: we assume readvp != vp. "vp" is already 199 * locked, and "readvp" must not be. 200 */ 201 (void) VOP_RWLOCK(readvp, readflg, NULL); 202 203 /* 204 * Same checks as in pread64. 205 */ 206 if (sfv_off > MAXOFFSET_T) { 207 VOP_RWUNLOCK(readvp, readflg, NULL); 208 releasef(sfv->sfv_fd); 209 return (EINVAL); 210 } 211 212 if (sfv_off + sfv_len > MAXOFFSET_T) 213 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 214 215 /* Find the native blocksize to transfer data */ 216 size = MIN(vp->v_vfsp->vfs_bsize, 217 readvp->v_vfsp->vfs_bsize); 218 size = sfv_len < size ? sfv_len : size; 219 ptr = kmem_alloc(size, KM_SLEEP); 220 221 while (sfv_len > 0) { 222 size_t iov_len; 223 224 iov_len = MIN(size, sfv_len); 225 aiov.iov_base = ptr; 226 aiov.iov_len = iov_len; 227 auio.uio_loffset = sfv_off; 228 auio.uio_iov = &aiov; 229 auio.uio_iovcnt = 1; 230 auio.uio_resid = iov_len; 231 auio.uio_segflg = UIO_SYSSPACE; 232 auio.uio_llimit = MAXOFFSET_T; 233 auio.uio_fmode = ffp->f_flag; 234 ioflag = auio.uio_fmode & 235 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 236 237 /* 238 * If read sync is not asked for, 239 * filter sync flags 240 */ 241 if ((ioflag & FRSYNC) == 0) 242 ioflag &= ~(FSYNC|FDSYNC); 243 error = VOP_READ(readvp, &auio, ioflag, 244 fp->f_cred, NULL); 245 if (error) { 246 kmem_free(ptr, size); 247 VOP_RWUNLOCK(readvp, readflg, NULL); 248 releasef(sfv->sfv_fd); 249 return (error); 250 } 251 252 /* 253 * Check how must data was really read. 254 * Decrement the 'len' and increment the 255 * 'off' appropriately. 256 */ 257 cnt = iov_len - auio.uio_resid; 258 if (cnt == 0) { 259 /* 260 * If we were reading a pipe (currently 261 * not implemented), we may now lose 262 * data. 263 */ 264 kmem_free(ptr, size); 265 VOP_RWUNLOCK(readvp, readflg, NULL); 266 releasef(sfv->sfv_fd); 267 return (EINVAL); 268 } 269 sfv_len -= cnt; 270 sfv_off += cnt; 271 272 aiov.iov_base = ptr; 273 aiov.iov_len = cnt; 274 auio.uio_loffset = *fileoff; 275 auio.uio_resid = cnt; 276 auio.uio_segflg = UIO_SYSSPACE; 277 auio.uio_llimit = curproc->p_fsz_ctl; 278 auio.uio_fmode = fflag; 279 ioflag = auio.uio_fmode & 280 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 281 error = VOP_WRITE(vp, &auio, ioflag, 282 fp->f_cred, NULL); 283 284 /* 285 * Check how much data was written. Increment 286 * the 'len' and decrement the 'off' if all 287 * the data was not written. 288 */ 289 cnt -= auio.uio_resid; 290 sfv_len += auio.uio_resid; 291 sfv_off -= auio.uio_resid; 292 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 293 if (vp->v_type == VREG) 294 *fileoff += cnt; 295 *count += cnt; 296 if (error != 0) { 297 kmem_free(ptr, size); 298 VOP_RWUNLOCK(readvp, readflg, NULL); 299 releasef(sfv->sfv_fd); 300 return (error); 301 } 302 } 303 VOP_RWUNLOCK(readvp, readflg, NULL); 304 releasef(sfv->sfv_fd); 305 kmem_free(ptr, size); 306 } 307 sfv++; 308 } 309 return (0); 310 } 311 312 ssize32_t 313 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 314 size32_t *xferred, int fildes) 315 { 316 int rwflag; 317 u_offset_t fileoff; 318 int copy_cnt; 319 const struct ksendfilevec64 *copy_vec; 320 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 321 struct vnode *vp; 322 int error; 323 ssize32_t count = 0; 324 int osfvcnt; 325 326 rwflag = 1; 327 vp = fp->f_vnode; 328 (void) VOP_RWLOCK(vp, rwflag, NULL); 329 330 copy_vec = vec; 331 fileoff = fp->f_offset; 332 osfvcnt = sfvcnt; 333 334 do { 335 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 336 if (copyin(copy_vec, sfv, copy_cnt * 337 sizeof (struct ksendfilevec64))) { 338 error = EFAULT; 339 break; 340 } 341 342 /* 343 * Optimize the single regular file over 344 * the socket case. 345 */ 346 if (vp->v_type == VSOCK && osfvcnt == 1 && 347 sfv->sfv_fd != SFV_FD_SELF) { 348 file_t *rfp; 349 vnode_t *rvp; 350 351 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 352 error = EBADF; 353 break; 354 } 355 if ((rfp->f_flag & FREAD) == 0) { 356 releasef(sfv->sfv_fd); 357 error = EBADF; 358 break; 359 } 360 rvp = rfp->f_vnode; 361 if (rvp->v_type == VREG) { 362 error = sosendfile64(fp, rfp, sfv, &count); 363 break; 364 } 365 releasef(sfv->sfv_fd); 366 } 367 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 368 if (error != 0) 369 break; 370 371 copy_vec += copy_cnt; 372 sfvcnt -= copy_cnt; 373 } while (sfvcnt > 0); 374 375 if (vp->v_type == VREG) 376 fp->f_offset += count; 377 378 VOP_RWUNLOCK(vp, rwflag, NULL); 379 if (copyout(&count, xferred, sizeof (count))) 380 error = EFAULT; 381 releasef(fildes); 382 if (error != 0) 383 return (set_errno(error)); 384 return (count); 385 } 386 #endif 387 388 int 389 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 390 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 391 { 392 struct vnode *vp; 393 struct uio auio; 394 struct iovec aiov; 395 ushort_t fflag; 396 int ioflag; 397 int i, error; 398 size_t cnt; 399 ssize_t sfv_len; 400 u_offset_t sfv_off; 401 #ifdef _SYSCALL32_IMPL 402 model_t model = get_udatamodel(); 403 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 404 MAXOFF32_T : MAXOFFSET_T; 405 #else 406 const u_offset_t maxoff = MAXOFF32_T; 407 #endif 408 mblk_t *dmp = NULL; 409 int wroff; 410 int buf_left = 0; 411 size_t iov_len; 412 mblk_t *head, *tmp; 413 size_t size = total_size; 414 415 fflag = fp->f_flag; 416 vp = fp->f_vnode; 417 418 ASSERT(vp->v_type == VSOCK); 419 ASSERT(maxblk > 0); 420 421 wroff = (int)vp->v_stream->sd_wroff; 422 buf_left = MIN(total_size, maxblk); 423 head = dmp = allocb(buf_left + wroff, BPRI_HI); 424 if (head == NULL) 425 return (ENOMEM); 426 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 427 428 auio.uio_extflg = UIO_COPY_DEFAULT; 429 for (i = 0; i < copy_cnt; i++) { 430 if (ISSIG(curthread, JUSTLOOKING)) 431 return (EINTR); 432 433 /* 434 * Do similar checks as "write" as we are writing 435 * sfv_len bytes into "vp". 436 */ 437 sfv_len = (ssize_t)sfv->sfv_len; 438 439 if (sfv_len == 0) { 440 sfv++; 441 continue; 442 } 443 444 /* Make sure sfv_len is not negative */ 445 #ifdef _SYSCALL32_IMPL 446 if (model == DATAMODEL_ILP32) { 447 if ((ssize32_t)sfv_len < 0) 448 return (EINVAL); 449 } else 450 #endif 451 if (sfv_len < 0) 452 return (EINVAL); 453 454 /* Check for overflow */ 455 #ifdef _SYSCALL32_IMPL 456 if (model == DATAMODEL_ILP32) { 457 if (((ssize32_t)(*count + sfv_len)) < 0) 458 return (EINVAL); 459 } else 460 #endif 461 if ((*count + sfv_len) < 0) 462 return (EINVAL); 463 464 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 465 466 if (sfv->sfv_fd == SFV_FD_SELF) { 467 while (sfv_len > 0) { 468 if (buf_left == 0) { 469 tmp = dmp; 470 buf_left = MIN(total_size, maxblk); 471 iov_len = MIN(buf_left, sfv_len); 472 dmp = allocb(buf_left + wroff, BPRI_HI); 473 if (dmp == NULL) { 474 freemsg(head); 475 return (ENOMEM); 476 } 477 dmp->b_wptr = dmp->b_rptr = 478 dmp->b_rptr + wroff; 479 tmp->b_cont = dmp; 480 } else { 481 iov_len = MIN(buf_left, sfv_len); 482 } 483 484 aiov.iov_len = iov_len; 485 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 486 auio.uio_loffset = *fileoff; 487 auio.uio_iovcnt = 1; 488 auio.uio_resid = iov_len; 489 auio.uio_iov = &aiov; 490 auio.uio_segflg = UIO_USERSPACE; 491 auio.uio_llimit = curproc->p_fsz_ctl; 492 auio.uio_fmode = fflag; 493 494 buf_left -= iov_len; 495 total_size -= iov_len; 496 sfv_len -= iov_len; 497 sfv_off += iov_len; 498 499 error = uiomove((caddr_t)dmp->b_wptr, 500 iov_len, UIO_WRITE, &auio); 501 if (error != 0) { 502 freemsg(head); 503 return (error); 504 } 505 dmp->b_wptr += iov_len; 506 } 507 } else { 508 file_t *ffp; 509 vnode_t *readvp; 510 int readflg = 0; 511 512 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 513 freemsg(head); 514 return (EBADF); 515 } 516 517 if ((ffp->f_flag & FREAD) == 0) { 518 releasef(sfv->sfv_fd); 519 freemsg(head); 520 return (EACCES); 521 } 522 523 readvp = ffp->f_vnode; 524 if (readvp->v_type != VREG) { 525 releasef(sfv->sfv_fd); 526 freemsg(head); 527 return (EINVAL); 528 } 529 530 /* 531 * No point reading and writing to same vp, 532 * as long as both are regular files. readvp is not 533 * locked; but since we got it from an open file the 534 * contents will be valid during the time of access. 535 */ 536 537 if (VN_CMP(vp, readvp)) { 538 releasef(sfv->sfv_fd); 539 freemsg(head); 540 return (EINVAL); 541 } 542 543 /* 544 * Note: we assume readvp != vp. "vp" is already 545 * locked, and "readvp" must not be. 546 */ 547 548 (void) VOP_RWLOCK(readvp, readflg, NULL); 549 550 /* Same checks as in pread */ 551 if (sfv_off > maxoff) { 552 VOP_RWUNLOCK(readvp, readflg, NULL); 553 releasef(sfv->sfv_fd); 554 freemsg(head); 555 return (EINVAL); 556 } 557 if (sfv_off + sfv_len > maxoff) { 558 sfv_len = (ssize_t)((offset_t)maxoff - 559 sfv_off); 560 } 561 562 while (sfv_len > 0) { 563 if (buf_left == 0) { 564 tmp = dmp; 565 buf_left = MIN(total_size, maxblk); 566 iov_len = MIN(buf_left, sfv_len); 567 dmp = allocb(buf_left + wroff, BPRI_HI); 568 if (dmp == NULL) { 569 VOP_RWUNLOCK(readvp, readflg, 570 NULL); 571 releasef(sfv->sfv_fd); 572 freemsg(head); 573 return (ENOMEM); 574 } 575 dmp->b_wptr = dmp->b_rptr = 576 dmp->b_rptr + wroff; 577 tmp->b_cont = dmp; 578 } else { 579 iov_len = MIN(buf_left, sfv_len); 580 } 581 aiov.iov_base = (caddr_t)dmp->b_wptr; 582 aiov.iov_len = iov_len; 583 auio.uio_loffset = sfv_off; 584 auio.uio_iov = &aiov; 585 auio.uio_iovcnt = 1; 586 auio.uio_resid = iov_len; 587 auio.uio_segflg = UIO_SYSSPACE; 588 auio.uio_llimit = MAXOFFSET_T; 589 auio.uio_fmode = ffp->f_flag; 590 ioflag = auio.uio_fmode & 591 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 592 593 /* 594 * If read sync is not asked for, 595 * filter sync flags 596 */ 597 if ((ioflag & FRSYNC) == 0) 598 ioflag &= ~(FSYNC|FDSYNC); 599 error = VOP_READ(readvp, &auio, ioflag, 600 fp->f_cred, NULL); 601 if (error != 0) { 602 /* 603 * If we were reading a pipe (currently 604 * not implemented), we may now loose 605 * data. 606 */ 607 VOP_RWUNLOCK(readvp, readflg, NULL); 608 releasef(sfv->sfv_fd); 609 freemsg(head); 610 return (error); 611 } 612 613 /* 614 * Check how much data was really read. 615 * Decrement the 'len' and increment the 616 * 'off' appropriately. 617 */ 618 cnt = iov_len - auio.uio_resid; 619 if (cnt == 0) { 620 VOP_RWUNLOCK(readvp, readflg, NULL); 621 releasef(sfv->sfv_fd); 622 freemsg(head); 623 return (EINVAL); 624 } 625 sfv_len -= cnt; 626 sfv_off += cnt; 627 total_size -= cnt; 628 buf_left -= cnt; 629 630 dmp->b_wptr += cnt; 631 } 632 VOP_RWUNLOCK(readvp, readflg, NULL); 633 releasef(sfv->sfv_fd); 634 } 635 sfv++; 636 } 637 638 ASSERT(total_size == 0); 639 error = kstrwritemp(vp, head, fflag); 640 if (error != 0) { 641 freemsg(head); 642 return (error); 643 } 644 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 645 *count += size; 646 647 return (0); 648 } 649 650 651 int 652 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 653 int copy_cnt, ssize_t *count) 654 { 655 struct vnode *vp; 656 struct uio auio; 657 struct iovec aiov; 658 ushort_t fflag; 659 int ioflag; 660 int i, error; 661 size_t cnt; 662 ssize_t sfv_len; 663 u_offset_t sfv_off; 664 #ifdef _SYSCALL32_IMPL 665 model_t model = get_udatamodel(); 666 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 667 MAXOFF32_T : MAXOFFSET_T; 668 #else 669 const u_offset_t maxoff = MAXOFF32_T; 670 #endif 671 mblk_t *dmp; 672 673 fflag = fp->f_flag; 674 vp = fp->f_vnode; 675 676 auio.uio_extflg = UIO_COPY_DEFAULT; 677 for (i = 0; i < copy_cnt; i++) { 678 if (ISSIG(curthread, JUSTLOOKING)) 679 return (EINTR); 680 681 /* 682 * Do similar checks as "write" as we are writing 683 * sfv_len bytes into "vp". 684 */ 685 sfv_len = (ssize_t)sfv->sfv_len; 686 687 if (sfv_len == 0) { 688 sfv++; 689 continue; 690 } 691 692 /* Make sure sfv_len is not negative */ 693 #ifdef _SYSCALL32_IMPL 694 if (model == DATAMODEL_ILP32) { 695 if ((ssize32_t)sfv_len < 0) 696 return (EINVAL); 697 } else 698 #endif 699 if (sfv_len < 0) 700 return (EINVAL); 701 702 if (vp->v_type == VREG) { 703 if (*fileoff >= curproc->p_fsz_ctl) { 704 mutex_enter(&curproc->p_lock); 705 (void) rctl_action( 706 rctlproc_legacy[RLIMIT_FSIZE], 707 curproc->p_rctls, curproc, RCA_SAFE); 708 mutex_exit(&curproc->p_lock); 709 710 return (EFBIG); 711 } 712 713 if (*fileoff >= maxoff) 714 return (EFBIG); 715 716 if (*fileoff + sfv_len > maxoff) 717 return (EINVAL); 718 } 719 720 /* Check for overflow */ 721 #ifdef _SYSCALL32_IMPL 722 if (model == DATAMODEL_ILP32) { 723 if (((ssize32_t)(*count + sfv_len)) < 0) 724 return (EINVAL); 725 } else 726 #endif 727 if ((*count + sfv_len) < 0) 728 return (EINVAL); 729 730 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 731 732 if (sfv->sfv_fd == SFV_FD_SELF) { 733 aiov.iov_len = sfv_len; 734 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 735 auio.uio_loffset = *fileoff; 736 auio.uio_iovcnt = 1; 737 auio.uio_resid = sfv_len; 738 auio.uio_iov = &aiov; 739 auio.uio_segflg = UIO_USERSPACE; 740 auio.uio_llimit = curproc->p_fsz_ctl; 741 auio.uio_fmode = fflag; 742 743 if (vp->v_type == VSOCK) { 744 745 /* 746 * Optimize for the socket case 747 */ 748 int wroff = (int)vp->v_stream->sd_wroff; 749 750 dmp = allocb(sfv_len + wroff, BPRI_HI); 751 if (dmp == NULL) 752 return (ENOMEM); 753 dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff; 754 error = uiomove((caddr_t)dmp->b_wptr, 755 sfv_len, UIO_WRITE, &auio); 756 if (error != 0) { 757 freeb(dmp); 758 return (error); 759 } 760 dmp->b_wptr += sfv_len; 761 error = kstrwritemp(vp, dmp, fflag); 762 if (error != 0) { 763 freeb(dmp); 764 return (error); 765 } 766 ttolwp(curthread)->lwp_ru.ioch += 767 (ulong_t)sfv_len; 768 *count += sfv_len; 769 } else { 770 ioflag = auio.uio_fmode & 771 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 772 while (sfv_len > 0) { 773 error = VOP_WRITE(vp, &auio, ioflag, 774 fp->f_cred, NULL); 775 cnt = sfv_len - auio.uio_resid; 776 sfv_len -= cnt; 777 ttolwp(curthread)->lwp_ru.ioch += 778 (ulong_t)cnt; 779 *fileoff += cnt; 780 *count += cnt; 781 if (error != 0) 782 return (error); 783 } 784 } 785 } else { 786 file_t *ffp; 787 vnode_t *readvp; 788 int readflg = 0; 789 size_t size; 790 caddr_t ptr; 791 792 if ((ffp = getf(sfv->sfv_fd)) == NULL) 793 return (EBADF); 794 795 if ((ffp->f_flag & FREAD) == 0) { 796 releasef(sfv->sfv_fd); 797 return (EBADF); 798 } 799 800 readvp = ffp->f_vnode; 801 if (readvp->v_type != VREG) { 802 releasef(sfv->sfv_fd); 803 return (EINVAL); 804 } 805 806 /* 807 * No point reading and writing to same vp, 808 * as long as both are regular files. readvp is not 809 * locked; but since we got it from an open file the 810 * contents will be valid during the time of access. 811 */ 812 if (VN_CMP(vp, readvp)) { 813 releasef(sfv->sfv_fd); 814 return (EINVAL); 815 } 816 817 /* 818 * Note: we assume readvp != vp. "vp" is already 819 * locked, and "readvp" must not be. 820 */ 821 (void) VOP_RWLOCK(readvp, readflg, NULL); 822 823 /* Same checks as in pread */ 824 if (sfv_off > maxoff) { 825 VOP_RWUNLOCK(readvp, readflg, NULL); 826 releasef(sfv->sfv_fd); 827 return (EINVAL); 828 } 829 if (sfv_off + sfv_len > maxoff) { 830 sfv_len = (ssize_t)((offset_t)maxoff - 831 sfv_off); 832 } 833 /* Find the native blocksize to transfer data */ 834 size = MIN(vp->v_vfsp->vfs_bsize, 835 readvp->v_vfsp->vfs_bsize); 836 size = sfv_len < size ? sfv_len : size; 837 838 while (sfv_len > 0) { 839 size_t iov_len; 840 841 iov_len = MIN(size, sfv_len); 842 843 dmp = allocb(iov_len, BPRI_HI); 844 if (dmp == NULL) { 845 VOP_RWUNLOCK(readvp, readflg, NULL); 846 releasef(sfv->sfv_fd); 847 return (ENOMEM); 848 } 849 ptr = (caddr_t)dmp->b_rptr; 850 851 aiov.iov_base = ptr; 852 aiov.iov_len = iov_len; 853 auio.uio_loffset = sfv_off; 854 auio.uio_iov = &aiov; 855 auio.uio_iovcnt = 1; 856 auio.uio_resid = iov_len; 857 auio.uio_segflg = UIO_SYSSPACE; 858 auio.uio_llimit = MAXOFFSET_T; 859 auio.uio_fmode = ffp->f_flag; 860 ioflag = auio.uio_fmode & 861 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 862 863 /* 864 * If read sync is not asked for, 865 * filter sync flags 866 */ 867 if ((ioflag & FRSYNC) == 0) 868 ioflag &= ~(FSYNC|FDSYNC); 869 error = VOP_READ(readvp, &auio, ioflag, 870 fp->f_cred, NULL); 871 if (error != 0) { 872 /* 873 * If we were reading a pipe (currently 874 * not implemented), we may now lose 875 * data. 876 */ 877 freeb(dmp); 878 VOP_RWUNLOCK(readvp, readflg, NULL); 879 releasef(sfv->sfv_fd); 880 return (error); 881 } 882 883 /* 884 * Check how much data was really read. 885 * Decrement the 'len' and increment the 886 * 'off' appropriately. 887 */ 888 cnt = iov_len - auio.uio_resid; 889 if (cnt == 0) { 890 freeb(dmp); 891 VOP_RWUNLOCK(readvp, readflg, NULL); 892 releasef(sfv->sfv_fd); 893 return (EINVAL); 894 } 895 sfv_len -= cnt; 896 sfv_off += cnt; 897 898 if (vp->v_type == VSOCK) { 899 dmp->b_wptr = dmp->b_rptr + cnt; 900 901 error = kstrwritemp(vp, dmp, fflag); 902 if (error != 0) { 903 freeb(dmp); 904 VOP_RWUNLOCK(readvp, readflg, 905 NULL); 906 releasef(sfv->sfv_fd); 907 return (error); 908 } 909 910 ttolwp(curthread)->lwp_ru.ioch += 911 (ulong_t)cnt; 912 *count += cnt; 913 } else { 914 915 aiov.iov_base = ptr; 916 aiov.iov_len = cnt; 917 auio.uio_loffset = *fileoff; 918 auio.uio_resid = cnt; 919 auio.uio_segflg = UIO_SYSSPACE; 920 auio.uio_llimit = curproc->p_fsz_ctl; 921 auio.uio_fmode = fflag; 922 ioflag = auio.uio_fmode & 923 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 924 error = VOP_WRITE(vp, &auio, ioflag, 925 fp->f_cred, NULL); 926 927 /* 928 * Check how much data was written. 929 * Increment the 'len' and decrement the 930 * 'off' if all the data was not 931 * written. 932 */ 933 cnt -= auio.uio_resid; 934 sfv_len += auio.uio_resid; 935 sfv_off -= auio.uio_resid; 936 ttolwp(curthread)->lwp_ru.ioch += 937 (ulong_t)cnt; 938 *fileoff += cnt; 939 *count += cnt; 940 freeb(dmp); 941 if (error != 0) { 942 VOP_RWUNLOCK(readvp, readflg, 943 NULL); 944 releasef(sfv->sfv_fd); 945 return (error); 946 } 947 } 948 } 949 VOP_RWUNLOCK(readvp, readflg, NULL); 950 releasef(sfv->sfv_fd); 951 } 952 sfv++; 953 } 954 return (0); 955 } 956 957 ssize_t 958 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 959 size_t *xferred) 960 { 961 int error; 962 file_t *fp; 963 struct vnode *vp; 964 struct sonode *so; 965 u_offset_t fileoff; 966 int copy_cnt; 967 const struct sendfilevec *copy_vec; 968 struct sendfilevec sfv[SEND_MAX_CHUNK]; 969 ssize_t count = 0; 970 #ifdef _SYSCALL32_IMPL 971 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 972 #endif 973 ssize_t total_size = 0; 974 int i; 975 boolean_t is_sock = B_FALSE; 976 int maxblk = 0; 977 978 if (sfvcnt <= 0) 979 return (set_errno(EINVAL)); 980 981 if ((fp = getf(fildes)) == NULL) 982 return (set_errno(EBADF)); 983 984 if (((fp->f_flag) & FWRITE) == 0) { 985 error = EBADF; 986 goto err; 987 } 988 989 fileoff = fp->f_offset; 990 vp = fp->f_vnode; 991 992 switch (vp->v_type) { 993 case VSOCK: 994 so = VTOSO(vp); 995 /* sendfile not supported for SCTP */ 996 if (so->so_protocol == IPPROTO_SCTP) { 997 error = EPROTONOSUPPORT; 998 goto err; 999 } 1000 is_sock = B_TRUE; 1001 switch (so->so_family) { 1002 case AF_NCA: 1003 case AF_INET: 1004 case AF_INET6: 1005 /* 1006 * Make similar checks done in SOP_WRITE(). 1007 */ 1008 if (so->so_state & SS_CANTSENDMORE) { 1009 tsignal(curthread, SIGPIPE); 1010 error = EPIPE; 1011 goto err; 1012 } 1013 if (so->so_type != SOCK_STREAM) { 1014 error = EOPNOTSUPP; 1015 goto err; 1016 } 1017 1018 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != 1019 (SS_ISCONNECTED|SS_ISBOUND)) { 1020 error = ENOTCONN; 1021 goto err; 1022 } 1023 1024 if ((so->so_state & SS_TCP_FAST_ACCEPT) && 1025 (so->so_priv != NULL)) { 1026 maxblk = ((tcp_t *)so->so_priv)->tcp_mss; 1027 } else { 1028 maxblk = (int)vp->v_stream->sd_maxblk; 1029 } 1030 break; 1031 default: 1032 error = EAFNOSUPPORT; 1033 goto err; 1034 } 1035 break; 1036 case VREG: 1037 break; 1038 default: 1039 error = EINVAL; 1040 goto err; 1041 } 1042 1043 switch (opcode) { 1044 case SENDFILEV : 1045 break; 1046 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1047 case SENDFILEV64 : 1048 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1049 (size32_t *)xferred, fildes)); 1050 #endif 1051 default : 1052 error = ENOSYS; 1053 break; 1054 } 1055 1056 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1057 copy_vec = vec; 1058 1059 do { 1060 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1061 #ifdef _SYSCALL32_IMPL 1062 /* 32-bit callers need to have their iovec expanded. */ 1063 if (get_udatamodel() == DATAMODEL_ILP32) { 1064 if (copyin(copy_vec, sfv32, 1065 copy_cnt * sizeof (ksendfilevec32_t))) { 1066 error = EFAULT; 1067 break; 1068 } 1069 1070 for (i = 0; i < copy_cnt; i++) { 1071 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1072 sfv[i].sfv_off = 1073 (off_t)(uint32_t)sfv32[i].sfv_off; 1074 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1075 total_size += sfv[i].sfv_len; 1076 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1077 } 1078 } else { 1079 #endif 1080 if (copyin(copy_vec, sfv, 1081 copy_cnt * sizeof (sendfilevec_t))) { 1082 error = EFAULT; 1083 break; 1084 } 1085 1086 for (i = 0; i < copy_cnt; i++) { 1087 total_size += sfv[i].sfv_len; 1088 } 1089 #ifdef _SYSCALL32_IMPL 1090 } 1091 #endif 1092 1093 /* 1094 * The task between deciding to use sendvec_small_chunk 1095 * and sendvec_chunk is dependant on multiple things: 1096 * 1097 * i) latency is important for smaller files. So if the 1098 * data is smaller than 'tcp_slow_start_initial' times 1099 * maxblk, then use sendvec_small_chunk which creates 1100 * maxblk size mblks and chains then together and sends 1101 * them to TCP in one shot. It also leaves 'wroff' size 1102 * space for the headers in each mblk. 1103 * 1104 * ii) for total size bigger than 'tcp_slow_start_initial' 1105 * time maxblk, its probably real file data which is 1106 * dominating. So its better to use sendvec_chunk because 1107 * performance goes to dog if we don't do pagesize reads. 1108 * sendvec_chunk will do pagesize reads and write them 1109 * in pagesize mblks to TCP. 1110 * 1111 * Side Notes: A write to file has not been optimized. 1112 * Future zero copy code will plugin into sendvec_chunk 1113 * only because doing zero copy for files smaller then 1114 * pagesize is useless. 1115 * 1116 * Note, if socket has NL7C enabled then call NL7C's 1117 * senfilev() function to give NL7C a chance to copy 1118 * the vec for caching, then continue processing as 1119 * normal. 1120 */ 1121 if (is_sock) { 1122 switch (so->so_family) { 1123 case AF_INET: 1124 case AF_INET6: 1125 if (so->so_nl7c_flags != 0) { 1126 nl7c_sendfilev(so, fileoff, 1127 sfv, copy_cnt); 1128 } 1129 if (total_size <= (4 * maxblk)) 1130 error = sendvec_small_chunk(fp, 1131 &fileoff, sfv, copy_cnt, 1132 total_size, maxblk, &count); 1133 else 1134 error = sendvec_chunk(fp, &fileoff, 1135 sfv, copy_cnt, &count); 1136 break; 1137 case AF_NCA: 1138 error = nca_sendfilev(fp, sfv, copy_cnt, 1139 &count); 1140 break; 1141 } 1142 } else { 1143 ASSERT(vp->v_type == VREG); 1144 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1145 &count); 1146 } 1147 1148 1149 #ifdef _SYSCALL32_IMPL 1150 if (get_udatamodel() == DATAMODEL_ILP32) 1151 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1152 (copy_cnt * sizeof (ksendfilevec32_t))); 1153 else 1154 #endif 1155 copy_vec += copy_cnt; 1156 sfvcnt -= copy_cnt; 1157 } while (sfvcnt > 0); 1158 1159 if (vp->v_type == VREG) 1160 fp->f_offset += count; 1161 1162 1163 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1164 1165 #ifdef _SYSCALL32_IMPL 1166 if (get_udatamodel() == DATAMODEL_ILP32) { 1167 ssize32_t count32 = (ssize32_t)count; 1168 if (copyout(&count32, xferred, sizeof (count32))) 1169 error = EFAULT; 1170 releasef(fildes); 1171 if (error != 0) 1172 return (set_errno(error)); 1173 return (count32); 1174 } 1175 #endif 1176 if (copyout(&count, xferred, sizeof (count))) 1177 error = EFAULT; 1178 releasef(fildes); 1179 if (error != 0) 1180 return (set_errno(error)); 1181 return (count); 1182 err: 1183 ASSERT(error != 0); 1184 releasef(fildes); 1185 return (set_errno(error)); 1186 } 1187