1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/debug.h> 39 #include <sys/errno.h> 40 #include <sys/time.h> 41 #include <sys/file.h> 42 #include <sys/open.h> 43 #include <sys/user.h> 44 #include <sys/termios.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/sunddi.h> 48 #include <sys/esunddi.h> 49 #include <sys/flock.h> 50 #include <sys/modctl.h> 51 #include <sys/cmn_err.h> 52 #include <sys/vmsystm.h> 53 54 #include <sys/socket.h> 55 #include <sys/socketvar.h> 56 #include <fs/sockfs/sockcommon.h> 57 #include <fs/sockfs/socktpi.h> 58 59 #include <netinet/in.h> 60 #include <sys/sendfile.h> 61 #include <sys/un.h> 62 #include <sys/tihdr.h> 63 #include <sys/atomic.h> 64 65 #include <inet/common.h> 66 #include <inet/ip.h> 67 #include <inet/ip6.h> 68 #include <inet/tcp.h> 69 70 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 71 ssize32_t *); 72 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, 73 int, ssize_t *); 74 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, 75 boolean_t); 76 extern sotpi_info_t *sotpi_sototpi(struct sonode *); 77 78 #define readflg (V_WRITELOCK_FALSE) 79 #define rwflag (V_WRITELOCK_TRUE) 80 81 #define SEND_MAX_CHUNK 16 82 83 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 84 /* 85 * 64 bit offsets for 32 bit applications only running either on 86 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 87 * more than 2GB of data. 88 */ 89 int 90 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 91 int copy_cnt, ssize32_t *count) 92 { 93 struct vnode *vp; 94 ushort_t fflag; 95 int ioflag; 96 size32_t cnt; 97 ssize32_t sfv_len; 98 ssize32_t tmpcount; 99 u_offset_t sfv_off; 100 struct uio auio; 101 struct iovec aiov; 102 int i, error; 103 104 fflag = fp->f_flag; 105 vp = fp->f_vnode; 106 for (i = 0; i < copy_cnt; i++) { 107 108 if (ISSIG(curthread, JUSTLOOKING)) 109 return (EINTR); 110 111 /* 112 * Do similar checks as "write" as we are writing 113 * sfv_len bytes into "vp". 114 */ 115 sfv_len = (ssize32_t)sfv->sfv_len; 116 117 if (sfv_len == 0) { 118 sfv++; 119 continue; 120 } 121 122 if (sfv_len < 0) 123 return (EINVAL); 124 125 if (vp->v_type == VREG) { 126 if (*fileoff >= curproc->p_fsz_ctl) { 127 mutex_enter(&curproc->p_lock); 128 (void) rctl_action( 129 rctlproc_legacy[RLIMIT_FSIZE], 130 curproc->p_rctls, curproc, RCA_SAFE); 131 mutex_exit(&curproc->p_lock); 132 return (EFBIG); 133 } 134 135 if (*fileoff >= OFFSET_MAX(fp)) 136 return (EFBIG); 137 138 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 139 return (EINVAL); 140 } 141 142 tmpcount = *count + sfv_len; 143 if (tmpcount < 0) 144 return (EINVAL); 145 146 sfv_off = sfv->sfv_off; 147 148 auio.uio_extflg = UIO_COPY_DEFAULT; 149 if (sfv->sfv_fd == SFV_FD_SELF) { 150 aiov.iov_len = sfv_len; 151 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 152 auio.uio_loffset = *fileoff; 153 auio.uio_iovcnt = 1; 154 auio.uio_resid = sfv_len; 155 auio.uio_iov = &aiov; 156 auio.uio_segflg = UIO_USERSPACE; 157 auio.uio_llimit = curproc->p_fsz_ctl; 158 auio.uio_fmode = fflag; 159 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 160 while (sfv_len > 0) { 161 error = VOP_WRITE(vp, &auio, ioflag, 162 fp->f_cred, NULL); 163 cnt = sfv_len - auio.uio_resid; 164 sfv_len -= cnt; 165 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 166 if (vp->v_type == VREG) 167 *fileoff += cnt; 168 *count += cnt; 169 if (error != 0) 170 return (error); 171 } 172 } else { 173 file_t *ffp; 174 vnode_t *readvp; 175 size_t size; 176 caddr_t ptr; 177 178 if ((ffp = getf(sfv->sfv_fd)) == NULL) 179 return (EBADF); 180 181 if ((ffp->f_flag & FREAD) == 0) { 182 releasef(sfv->sfv_fd); 183 return (EBADF); 184 } 185 186 readvp = ffp->f_vnode; 187 if (readvp->v_type != VREG) { 188 releasef(sfv->sfv_fd); 189 return (EINVAL); 190 } 191 192 /* 193 * No point reading and writing to same vp, 194 * as long as both are regular files. readvp is not 195 * locked; but since we got it from an open file the 196 * contents will be valid during the time of access. 197 */ 198 if (vn_compare(vp, readvp)) { 199 releasef(sfv->sfv_fd); 200 return (EINVAL); 201 } 202 203 /* 204 * Note: we assume readvp != vp. "vp" is already 205 * locked, and "readvp" must not be. 206 */ 207 (void) VOP_RWLOCK(readvp, readflg, NULL); 208 209 /* 210 * Same checks as in pread64. 211 */ 212 if (sfv_off > MAXOFFSET_T) { 213 VOP_RWUNLOCK(readvp, readflg, NULL); 214 releasef(sfv->sfv_fd); 215 return (EINVAL); 216 } 217 218 if (sfv_off + sfv_len > MAXOFFSET_T) 219 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 220 221 /* Find the native blocksize to transfer data */ 222 size = MIN(vp->v_vfsp->vfs_bsize, 223 readvp->v_vfsp->vfs_bsize); 224 size = sfv_len < size ? sfv_len : size; 225 ptr = kmem_alloc(size, KM_SLEEP); 226 227 while (sfv_len > 0) { 228 size_t iov_len; 229 230 iov_len = MIN(size, sfv_len); 231 aiov.iov_base = ptr; 232 aiov.iov_len = iov_len; 233 auio.uio_loffset = sfv_off; 234 auio.uio_iov = &aiov; 235 auio.uio_iovcnt = 1; 236 auio.uio_resid = iov_len; 237 auio.uio_segflg = UIO_SYSSPACE; 238 auio.uio_llimit = MAXOFFSET_T; 239 auio.uio_fmode = ffp->f_flag; 240 ioflag = auio.uio_fmode & 241 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 242 243 /* 244 * If read sync is not asked for, 245 * filter sync flags 246 */ 247 if ((ioflag & FRSYNC) == 0) 248 ioflag &= ~(FSYNC|FDSYNC); 249 error = VOP_READ(readvp, &auio, ioflag, 250 fp->f_cred, NULL); 251 if (error) { 252 kmem_free(ptr, size); 253 VOP_RWUNLOCK(readvp, readflg, NULL); 254 releasef(sfv->sfv_fd); 255 return (error); 256 } 257 258 /* 259 * Check how must data was really read. 260 * Decrement the 'len' and increment the 261 * 'off' appropriately. 262 */ 263 cnt = iov_len - auio.uio_resid; 264 if (cnt == 0) { 265 /* 266 * If we were reading a pipe (currently 267 * not implemented), we may now lose 268 * data. 269 */ 270 kmem_free(ptr, size); 271 VOP_RWUNLOCK(readvp, readflg, NULL); 272 releasef(sfv->sfv_fd); 273 return (EINVAL); 274 } 275 sfv_len -= cnt; 276 sfv_off += cnt; 277 278 aiov.iov_base = ptr; 279 aiov.iov_len = cnt; 280 auio.uio_loffset = *fileoff; 281 auio.uio_iov = &aiov; 282 auio.uio_iovcnt = 1; 283 auio.uio_resid = cnt; 284 auio.uio_segflg = UIO_SYSSPACE; 285 auio.uio_llimit = curproc->p_fsz_ctl; 286 auio.uio_fmode = fflag; 287 ioflag = auio.uio_fmode & 288 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 289 error = VOP_WRITE(vp, &auio, ioflag, 290 fp->f_cred, NULL); 291 292 /* 293 * Check how much data was written. Increment 294 * the 'len' and decrement the 'off' if all 295 * the data was not written. 296 */ 297 cnt -= auio.uio_resid; 298 sfv_len += auio.uio_resid; 299 sfv_off -= auio.uio_resid; 300 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 301 if (vp->v_type == VREG) 302 *fileoff += cnt; 303 *count += cnt; 304 if (error != 0) { 305 kmem_free(ptr, size); 306 VOP_RWUNLOCK(readvp, readflg, NULL); 307 releasef(sfv->sfv_fd); 308 return (error); 309 } 310 } 311 VOP_RWUNLOCK(readvp, readflg, NULL); 312 releasef(sfv->sfv_fd); 313 kmem_free(ptr, size); 314 } 315 sfv++; 316 } 317 return (0); 318 } 319 320 ssize32_t 321 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 322 size32_t *xferred, int fildes) 323 { 324 u_offset_t fileoff; 325 int copy_cnt; 326 const struct ksendfilevec64 *copy_vec; 327 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 328 struct vnode *vp; 329 int error; 330 ssize32_t count = 0; 331 332 vp = fp->f_vnode; 333 (void) VOP_RWLOCK(vp, rwflag, NULL); 334 335 copy_vec = vec; 336 fileoff = fp->f_offset; 337 338 do { 339 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 340 if (copyin(copy_vec, sfv, copy_cnt * 341 sizeof (struct ksendfilevec64))) { 342 error = EFAULT; 343 break; 344 } 345 346 /* 347 * Optimize the regular file over 348 * the socket case. 349 */ 350 if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) { 351 file_t *rfp; 352 vnode_t *rvp; 353 354 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 355 error = EBADF; 356 break; 357 } 358 if ((rfp->f_flag & FREAD) == 0) { 359 releasef(sfv->sfv_fd); 360 error = EBADF; 361 break; 362 } 363 rvp = rfp->f_vnode; 364 if (rvp->v_type == VREG) { 365 error = sosendfile64(fp, rfp, sfv, &count); 366 if (error) 367 break; 368 copy_vec++; 369 sfvcnt--; 370 continue; 371 } 372 releasef(sfv->sfv_fd); 373 } 374 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 375 if (error != 0) 376 break; 377 378 copy_vec += copy_cnt; 379 sfvcnt -= copy_cnt; 380 } while (sfvcnt > 0); 381 382 if (vp->v_type == VREG) 383 fp->f_offset += count; 384 385 VOP_RWUNLOCK(vp, rwflag, NULL); 386 if (copyout(&count, xferred, sizeof (count))) 387 error = EFAULT; 388 releasef(fildes); 389 if (error != 0) 390 return (set_errno(error)); 391 return (count); 392 } 393 #endif 394 395 int 396 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 397 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 398 { 399 struct vnode *vp; 400 struct uio auio; 401 struct iovec aiov; 402 ushort_t fflag; 403 int ioflag; 404 int i, error; 405 size_t cnt; 406 ssize_t sfv_len; 407 u_offset_t sfv_off; 408 #ifdef _SYSCALL32_IMPL 409 model_t model = get_udatamodel(); 410 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 411 MAXOFF32_T : MAXOFFSET_T; 412 #else 413 const u_offset_t maxoff = MAXOFF32_T; 414 #endif 415 mblk_t *dmp = NULL; 416 int wroff; 417 int buf_left = 0; 418 size_t iov_len; 419 mblk_t *head, *tmp; 420 size_t size = total_size; 421 size_t extra; 422 int tail_len; 423 struct nmsghdr msg; 424 425 fflag = fp->f_flag; 426 vp = fp->f_vnode; 427 428 ASSERT(vp->v_type == VSOCK); 429 ASSERT(maxblk > 0); 430 431 /* If nothing to send, return */ 432 if (total_size == 0) 433 return (0); 434 435 if (vp->v_stream != NULL) { 436 wroff = (int)vp->v_stream->sd_wroff; 437 tail_len = (int)vp->v_stream->sd_tail; 438 } else { 439 struct sonode *so; 440 441 so = VTOSO(vp); 442 wroff = so->so_proto_props.sopp_wroff; 443 tail_len = so->so_proto_props.sopp_tail; 444 } 445 446 extra = wroff + tail_len; 447 448 buf_left = MIN(total_size, maxblk); 449 head = dmp = allocb(buf_left + extra, BPRI_HI); 450 if (head == NULL) 451 return (ENOMEM); 452 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 453 bzero(&msg, sizeof (msg)); 454 455 auio.uio_extflg = UIO_COPY_DEFAULT; 456 for (i = 0; i < copy_cnt; i++) { 457 if (ISSIG(curthread, JUSTLOOKING)) { 458 freemsg(head); 459 return (EINTR); 460 } 461 462 /* 463 * Do similar checks as "write" as we are writing 464 * sfv_len bytes into "vp". 465 */ 466 sfv_len = (ssize_t)sfv->sfv_len; 467 468 if (sfv_len == 0) { 469 sfv++; 470 continue; 471 } 472 473 /* Check for overflow */ 474 #ifdef _SYSCALL32_IMPL 475 if (model == DATAMODEL_ILP32) { 476 if (((ssize32_t)(*count + sfv_len)) < 0) { 477 freemsg(head); 478 return (EINVAL); 479 } 480 } else 481 #endif 482 if ((*count + sfv_len) < 0) { 483 freemsg(head); 484 return (EINVAL); 485 } 486 487 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 488 489 if (sfv->sfv_fd == SFV_FD_SELF) { 490 while (sfv_len > 0) { 491 if (buf_left == 0) { 492 tmp = dmp; 493 buf_left = MIN(total_size, maxblk); 494 iov_len = MIN(buf_left, sfv_len); 495 dmp = allocb(buf_left + extra, BPRI_HI); 496 if (dmp == NULL) { 497 freemsg(head); 498 return (ENOMEM); 499 } 500 dmp->b_wptr = dmp->b_rptr = 501 dmp->b_rptr + wroff; 502 tmp->b_cont = dmp; 503 } else { 504 iov_len = MIN(buf_left, sfv_len); 505 } 506 507 aiov.iov_len = iov_len; 508 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 509 auio.uio_loffset = *fileoff; 510 auio.uio_iovcnt = 1; 511 auio.uio_resid = iov_len; 512 auio.uio_iov = &aiov; 513 auio.uio_segflg = UIO_USERSPACE; 514 auio.uio_llimit = curproc->p_fsz_ctl; 515 auio.uio_fmode = fflag; 516 517 buf_left -= iov_len; 518 total_size -= iov_len; 519 sfv_len -= iov_len; 520 sfv_off += iov_len; 521 522 error = uiomove((caddr_t)dmp->b_wptr, 523 iov_len, UIO_WRITE, &auio); 524 if (error != 0) { 525 freemsg(head); 526 return (error); 527 } 528 dmp->b_wptr += iov_len; 529 } 530 } else { 531 file_t *ffp; 532 vnode_t *readvp; 533 534 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 535 freemsg(head); 536 return (EBADF); 537 } 538 539 if ((ffp->f_flag & FREAD) == 0) { 540 releasef(sfv->sfv_fd); 541 freemsg(head); 542 return (EACCES); 543 } 544 545 readvp = ffp->f_vnode; 546 if (readvp->v_type != VREG) { 547 releasef(sfv->sfv_fd); 548 freemsg(head); 549 return (EINVAL); 550 } 551 552 /* 553 * No point reading and writing to same vp, 554 * as long as both are regular files. readvp is not 555 * locked; but since we got it from an open file the 556 * contents will be valid during the time of access. 557 */ 558 559 if (vn_compare(vp, readvp)) { 560 releasef(sfv->sfv_fd); 561 freemsg(head); 562 return (EINVAL); 563 } 564 565 /* 566 * Note: we assume readvp != vp. "vp" is already 567 * locked, and "readvp" must not be. 568 */ 569 570 (void) VOP_RWLOCK(readvp, readflg, NULL); 571 572 /* Same checks as in pread */ 573 if (sfv_off > maxoff) { 574 VOP_RWUNLOCK(readvp, readflg, NULL); 575 releasef(sfv->sfv_fd); 576 freemsg(head); 577 return (EINVAL); 578 } 579 if (sfv_off + sfv_len > maxoff) { 580 total_size -= (sfv_off + sfv_len - maxoff); 581 sfv_len = (ssize_t)((offset_t)maxoff - 582 sfv_off); 583 } 584 585 while (sfv_len > 0) { 586 if (buf_left == 0) { 587 tmp = dmp; 588 buf_left = MIN(total_size, maxblk); 589 iov_len = MIN(buf_left, sfv_len); 590 dmp = allocb(buf_left + extra, BPRI_HI); 591 if (dmp == NULL) { 592 VOP_RWUNLOCK(readvp, readflg, 593 NULL); 594 releasef(sfv->sfv_fd); 595 freemsg(head); 596 return (ENOMEM); 597 } 598 dmp->b_wptr = dmp->b_rptr = 599 dmp->b_rptr + wroff; 600 tmp->b_cont = dmp; 601 } else { 602 iov_len = MIN(buf_left, sfv_len); 603 } 604 aiov.iov_base = (caddr_t)dmp->b_wptr; 605 aiov.iov_len = iov_len; 606 auio.uio_loffset = sfv_off; 607 auio.uio_iov = &aiov; 608 auio.uio_iovcnt = 1; 609 auio.uio_resid = iov_len; 610 auio.uio_segflg = UIO_SYSSPACE; 611 auio.uio_llimit = MAXOFFSET_T; 612 auio.uio_fmode = ffp->f_flag; 613 ioflag = auio.uio_fmode & 614 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 615 616 /* 617 * If read sync is not asked for, 618 * filter sync flags 619 */ 620 if ((ioflag & FRSYNC) == 0) 621 ioflag &= ~(FSYNC|FDSYNC); 622 error = VOP_READ(readvp, &auio, ioflag, 623 fp->f_cred, NULL); 624 if (error != 0) { 625 /* 626 * If we were reading a pipe (currently 627 * not implemented), we may now loose 628 * data. 629 */ 630 VOP_RWUNLOCK(readvp, readflg, NULL); 631 releasef(sfv->sfv_fd); 632 freemsg(head); 633 return (error); 634 } 635 636 /* 637 * Check how much data was really read. 638 * Decrement the 'len' and increment the 639 * 'off' appropriately. 640 */ 641 cnt = iov_len - auio.uio_resid; 642 if (cnt == 0) { 643 VOP_RWUNLOCK(readvp, readflg, NULL); 644 releasef(sfv->sfv_fd); 645 freemsg(head); 646 return (EINVAL); 647 } 648 sfv_len -= cnt; 649 sfv_off += cnt; 650 total_size -= cnt; 651 buf_left -= cnt; 652 653 dmp->b_wptr += cnt; 654 } 655 VOP_RWUNLOCK(readvp, readflg, NULL); 656 releasef(sfv->sfv_fd); 657 } 658 sfv++; 659 } 660 661 ASSERT(total_size == 0); 662 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); 663 if (error != 0) { 664 if (head != NULL) 665 freemsg(head); 666 return (error); 667 } 668 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 669 *count += size; 670 671 return (0); 672 } 673 674 675 int 676 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 677 int copy_cnt, ssize_t *count) 678 { 679 struct vnode *vp; 680 struct uio auio; 681 struct iovec aiov; 682 ushort_t fflag; 683 int ioflag; 684 int i, error; 685 size_t cnt; 686 ssize_t sfv_len; 687 u_offset_t sfv_off; 688 #ifdef _SYSCALL32_IMPL 689 model_t model = get_udatamodel(); 690 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 691 MAXOFF32_T : MAXOFFSET_T; 692 #else 693 const u_offset_t maxoff = MAXOFF32_T; 694 #endif 695 mblk_t *dmp = NULL; 696 char *buf = NULL; 697 size_t extra; 698 int maxblk, wroff, tail_len; 699 struct sonode *so; 700 stdata_t *stp; 701 struct nmsghdr msg; 702 703 fflag = fp->f_flag; 704 vp = fp->f_vnode; 705 706 if (vp->v_type == VSOCK) { 707 so = VTOSO(vp); 708 if (vp->v_stream != NULL) { 709 stp = vp->v_stream; 710 wroff = (int)stp->sd_wroff; 711 tail_len = (int)stp->sd_tail; 712 maxblk = (int)stp->sd_maxblk; 713 } else { 714 stp = NULL; 715 wroff = so->so_proto_props.sopp_wroff; 716 tail_len = so->so_proto_props.sopp_tail; 717 maxblk = so->so_proto_props.sopp_maxblk; 718 } 719 extra = wroff + tail_len; 720 } 721 722 bzero(&msg, sizeof (msg)); 723 auio.uio_extflg = UIO_COPY_DEFAULT; 724 for (i = 0; i < copy_cnt; i++) { 725 if (ISSIG(curthread, JUSTLOOKING)) 726 return (EINTR); 727 728 /* 729 * Do similar checks as "write" as we are writing 730 * sfv_len bytes into "vp". 731 */ 732 sfv_len = (ssize_t)sfv->sfv_len; 733 734 if (sfv_len == 0) { 735 sfv++; 736 continue; 737 } 738 739 if (vp->v_type == VREG) { 740 if (*fileoff >= curproc->p_fsz_ctl) { 741 mutex_enter(&curproc->p_lock); 742 (void) rctl_action( 743 rctlproc_legacy[RLIMIT_FSIZE], 744 curproc->p_rctls, curproc, RCA_SAFE); 745 mutex_exit(&curproc->p_lock); 746 747 return (EFBIG); 748 } 749 750 if (*fileoff >= maxoff) 751 return (EFBIG); 752 753 if (*fileoff + sfv_len > maxoff) 754 return (EINVAL); 755 } 756 757 /* Check for overflow */ 758 #ifdef _SYSCALL32_IMPL 759 if (model == DATAMODEL_ILP32) { 760 if (((ssize32_t)(*count + sfv_len)) < 0) 761 return (EINVAL); 762 } else 763 #endif 764 if ((*count + sfv_len) < 0) 765 return (EINVAL); 766 767 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 768 769 if (sfv->sfv_fd == SFV_FD_SELF) { 770 if (vp->v_type == VSOCK) { 771 while (sfv_len > 0) { 772 size_t iov_len; 773 774 iov_len = sfv_len; 775 if (!SOCK_IS_NONSTR(so) && 776 SOTOTPI(so)->sti_kssl_ctx != NULL) 777 iov_len = MIN(iov_len, maxblk); 778 779 aiov.iov_len = iov_len; 780 aiov.iov_base = 781 (caddr_t)(uintptr_t)sfv_off; 782 783 auio.uio_iov = &aiov; 784 auio.uio_iovcnt = 1; 785 auio.uio_loffset = *fileoff; 786 auio.uio_segflg = UIO_USERSPACE; 787 auio.uio_fmode = fflag; 788 auio.uio_llimit = curproc->p_fsz_ctl; 789 auio.uio_resid = iov_len; 790 791 dmp = allocb(iov_len + extra, BPRI_HI); 792 if (dmp == NULL) 793 return (ENOMEM); 794 dmp->b_wptr = dmp->b_rptr = 795 dmp->b_rptr + wroff; 796 error = uiomove((caddr_t)dmp->b_wptr, 797 iov_len, UIO_WRITE, &auio); 798 if (error != 0) { 799 freeb(dmp); 800 return (error); 801 } 802 dmp->b_wptr += iov_len; 803 error = socket_sendmblk(VTOSO(vp), 804 &msg, fflag, CRED(), &dmp); 805 806 if (error != 0) { 807 if (dmp != NULL) 808 freeb(dmp); 809 return (error); 810 } 811 ttolwp(curthread)->lwp_ru.ioch += 812 (ulong_t)iov_len; 813 *count += iov_len; 814 sfv_len -= iov_len; 815 sfv_off += iov_len; 816 } 817 } else { 818 ttolwp(curthread)->lwp_ru.ioch += 819 (ulong_t)sfv_len; 820 *count += sfv_len; 821 aiov.iov_len = sfv_len; 822 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 823 824 auio.uio_iov = &aiov; 825 auio.uio_iovcnt = 1; 826 auio.uio_loffset = *fileoff; 827 auio.uio_segflg = UIO_USERSPACE; 828 auio.uio_fmode = fflag; 829 auio.uio_llimit = curproc->p_fsz_ctl; 830 auio.uio_resid = sfv_len; 831 832 ioflag = auio.uio_fmode & 833 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 834 while (sfv_len > 0) { 835 error = VOP_WRITE(vp, &auio, ioflag, 836 fp->f_cred, NULL); 837 cnt = sfv_len - auio.uio_resid; 838 sfv_len -= cnt; 839 ttolwp(curthread)->lwp_ru.ioch += 840 (ulong_t)cnt; 841 *fileoff += cnt; 842 *count += cnt; 843 if (error != 0) 844 return (error); 845 } 846 } 847 } else { 848 int segmapit = 0; 849 file_t *ffp; 850 vnode_t *readvp; 851 struct vnode *realvp; 852 size_t size; 853 caddr_t ptr; 854 855 if ((ffp = getf(sfv->sfv_fd)) == NULL) 856 return (EBADF); 857 858 if ((ffp->f_flag & FREAD) == 0) { 859 releasef(sfv->sfv_fd); 860 return (EBADF); 861 } 862 863 readvp = ffp->f_vnode; 864 if (VOP_REALVP(readvp, &realvp, NULL) == 0) 865 readvp = realvp; 866 if (readvp->v_type != VREG) { 867 releasef(sfv->sfv_fd); 868 return (EINVAL); 869 } 870 871 /* 872 * No point reading and writing to same vp, 873 * as long as both are regular files. readvp is not 874 * locked; but since we got it from an open file the 875 * contents will be valid during the time of access. 876 */ 877 if (vn_compare(vp, readvp)) { 878 releasef(sfv->sfv_fd); 879 return (EINVAL); 880 } 881 882 /* 883 * Note: we assume readvp != vp. "vp" is already 884 * locked, and "readvp" must not be. 885 */ 886 (void) VOP_RWLOCK(readvp, readflg, NULL); 887 888 /* Same checks as in pread */ 889 if (sfv_off > maxoff) { 890 VOP_RWUNLOCK(readvp, readflg, NULL); 891 releasef(sfv->sfv_fd); 892 return (EINVAL); 893 } 894 if (sfv_off + sfv_len > maxoff) { 895 sfv_len = (ssize_t)((offset_t)maxoff - 896 sfv_off); 897 } 898 /* Find the native blocksize to transfer data */ 899 size = MIN(vp->v_vfsp->vfs_bsize, 900 readvp->v_vfsp->vfs_bsize); 901 size = sfv_len < size ? sfv_len : size; 902 903 if (vp->v_type != VSOCK) { 904 segmapit = 0; 905 buf = kmem_alloc(size, KM_NOSLEEP); 906 if (buf == NULL) { 907 VOP_RWUNLOCK(readvp, readflg, NULL); 908 releasef(sfv->sfv_fd); 909 return (ENOMEM); 910 } 911 } else { 912 uint_t copyflag; 913 914 copyflag = stp != NULL ? stp->sd_copyflag : 915 so->so_proto_props.sopp_zcopyflag; 916 /* 917 * For sockets acting as an SSL proxy, we 918 * need to adjust the size to the maximum 919 * SSL record size set in the stream head. 920 */ 921 if (!SOCK_IS_NONSTR(so) && 922 _SOTOTPI(so)->sti_kssl_ctx != NULL) 923 size = MIN(size, maxblk); 924 925 if (vn_has_flocks(readvp) || 926 readvp->v_flag & VNOMAP || 927 copyflag & STZCVMUNSAFE) { 928 segmapit = 0; 929 } else if (copyflag & STZCVMSAFE) { 930 segmapit = 1; 931 } else { 932 int on = 1; 933 if (socket_setsockopt(VTOSO(vp), 934 SOL_SOCKET, SO_SND_COPYAVOID, 935 &on, sizeof (on), CRED()) == 0) 936 segmapit = 1; 937 } 938 } 939 940 if (segmapit) { 941 boolean_t nowait; 942 943 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 944 error = snf_segmap(fp, readvp, sfv_off, 945 (u_offset_t)sfv_len, (ssize_t *)&cnt, 946 nowait); 947 releasef(sfv->sfv_fd); 948 *count += cnt; 949 if (error) 950 return (error); 951 sfv++; 952 continue; 953 } 954 955 while (sfv_len > 0) { 956 size_t iov_len; 957 958 iov_len = MIN(size, sfv_len); 959 960 if (vp->v_type == VSOCK) { 961 dmp = allocb(iov_len + extra, BPRI_HI); 962 if (dmp == NULL) { 963 VOP_RWUNLOCK(readvp, readflg, 964 NULL); 965 releasef(sfv->sfv_fd); 966 return (ENOMEM); 967 } 968 dmp->b_wptr = dmp->b_rptr = 969 dmp->b_rptr + wroff; 970 ptr = (caddr_t)dmp->b_rptr; 971 } else { 972 ptr = buf; 973 } 974 975 aiov.iov_base = ptr; 976 aiov.iov_len = iov_len; 977 auio.uio_loffset = sfv_off; 978 auio.uio_iov = &aiov; 979 auio.uio_iovcnt = 1; 980 auio.uio_resid = iov_len; 981 auio.uio_segflg = UIO_SYSSPACE; 982 auio.uio_llimit = MAXOFFSET_T; 983 auio.uio_fmode = ffp->f_flag; 984 ioflag = auio.uio_fmode & 985 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 986 987 /* 988 * If read sync is not asked for, 989 * filter sync flags 990 */ 991 if ((ioflag & FRSYNC) == 0) 992 ioflag &= ~(FSYNC|FDSYNC); 993 error = VOP_READ(readvp, &auio, ioflag, 994 fp->f_cred, NULL); 995 if (error != 0) { 996 /* 997 * If we were reading a pipe (currently 998 * not implemented), we may now lose 999 * data. 1000 */ 1001 if (vp->v_type == VSOCK) 1002 freeb(dmp); 1003 else 1004 kmem_free(buf, size); 1005 VOP_RWUNLOCK(readvp, readflg, NULL); 1006 releasef(sfv->sfv_fd); 1007 return (error); 1008 } 1009 1010 /* 1011 * Check how much data was really read. 1012 * Decrement the 'len' and increment the 1013 * 'off' appropriately. 1014 */ 1015 cnt = iov_len - auio.uio_resid; 1016 if (cnt == 0) { 1017 if (vp->v_type == VSOCK) 1018 freeb(dmp); 1019 else 1020 kmem_free(buf, size); 1021 VOP_RWUNLOCK(readvp, readflg, NULL); 1022 releasef(sfv->sfv_fd); 1023 return (EINVAL); 1024 } 1025 sfv_len -= cnt; 1026 sfv_off += cnt; 1027 1028 if (vp->v_type == VSOCK) { 1029 dmp->b_wptr = dmp->b_rptr + cnt; 1030 1031 error = socket_sendmblk(VTOSO(vp), 1032 &msg, fflag, CRED(), &dmp); 1033 1034 if (error != 0) { 1035 if (dmp != NULL) 1036 freeb(dmp); 1037 VOP_RWUNLOCK(readvp, readflg, 1038 NULL); 1039 releasef(sfv->sfv_fd); 1040 return (error); 1041 } 1042 1043 ttolwp(curthread)->lwp_ru.ioch += 1044 (ulong_t)cnt; 1045 *count += cnt; 1046 } else { 1047 1048 aiov.iov_base = ptr; 1049 aiov.iov_len = cnt; 1050 auio.uio_loffset = *fileoff; 1051 auio.uio_resid = cnt; 1052 auio.uio_iov = &aiov; 1053 auio.uio_iovcnt = 1; 1054 auio.uio_segflg = UIO_SYSSPACE; 1055 auio.uio_llimit = curproc->p_fsz_ctl; 1056 auio.uio_fmode = fflag; 1057 ioflag = auio.uio_fmode & 1058 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1059 error = VOP_WRITE(vp, &auio, ioflag, 1060 fp->f_cred, NULL); 1061 1062 /* 1063 * Check how much data was written. 1064 * Increment the 'len' and decrement the 1065 * 'off' if all the data was not 1066 * written. 1067 */ 1068 cnt -= auio.uio_resid; 1069 sfv_len += auio.uio_resid; 1070 sfv_off -= auio.uio_resid; 1071 ttolwp(curthread)->lwp_ru.ioch += 1072 (ulong_t)cnt; 1073 *fileoff += cnt; 1074 *count += cnt; 1075 if (error != 0) { 1076 kmem_free(buf, size); 1077 VOP_RWUNLOCK(readvp, readflg, 1078 NULL); 1079 releasef(sfv->sfv_fd); 1080 return (error); 1081 } 1082 } 1083 } 1084 if (buf) { 1085 kmem_free(buf, size); 1086 buf = NULL; 1087 } 1088 VOP_RWUNLOCK(readvp, readflg, NULL); 1089 releasef(sfv->sfv_fd); 1090 } 1091 sfv++; 1092 } 1093 return (0); 1094 } 1095 1096 ssize_t 1097 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1098 size_t *xferred) 1099 { 1100 int error = 0; 1101 int first_vector_error = 0; 1102 file_t *fp; 1103 struct vnode *vp; 1104 struct sonode *so; 1105 u_offset_t fileoff; 1106 int copy_cnt; 1107 const struct sendfilevec *copy_vec; 1108 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1109 ssize_t count = 0; 1110 #ifdef _SYSCALL32_IMPL 1111 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1112 #endif 1113 ssize_t total_size; 1114 int i; 1115 boolean_t is_sock = B_FALSE; 1116 int maxblk = 0; 1117 1118 if (sfvcnt <= 0) 1119 return (set_errno(EINVAL)); 1120 1121 if ((fp = getf(fildes)) == NULL) 1122 return (set_errno(EBADF)); 1123 1124 if (((fp->f_flag) & FWRITE) == 0) { 1125 error = EBADF; 1126 goto err; 1127 } 1128 1129 fileoff = fp->f_offset; 1130 vp = fp->f_vnode; 1131 1132 switch (vp->v_type) { 1133 case VSOCK: 1134 so = VTOSO(vp); 1135 is_sock = B_TRUE; 1136 if (SOCK_IS_NONSTR(so)) { 1137 maxblk = so->so_proto_props.sopp_maxblk; 1138 } else { 1139 maxblk = (int)vp->v_stream->sd_maxblk; 1140 } 1141 break; 1142 case VREG: 1143 break; 1144 default: 1145 error = EINVAL; 1146 goto err; 1147 } 1148 1149 switch (opcode) { 1150 case SENDFILEV : 1151 break; 1152 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1153 case SENDFILEV64 : 1154 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1155 (size32_t *)xferred, fildes)); 1156 #endif 1157 default : 1158 error = ENOSYS; 1159 break; 1160 } 1161 1162 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1163 copy_vec = vec; 1164 1165 do { 1166 total_size = 0; 1167 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1168 #ifdef _SYSCALL32_IMPL 1169 /* 32-bit callers need to have their iovec expanded. */ 1170 if (get_udatamodel() == DATAMODEL_ILP32) { 1171 if (copyin(copy_vec, sfv32, 1172 copy_cnt * sizeof (ksendfilevec32_t))) { 1173 error = EFAULT; 1174 break; 1175 } 1176 1177 for (i = 0; i < copy_cnt; i++) { 1178 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1179 sfv[i].sfv_off = 1180 (off_t)(uint32_t)sfv32[i].sfv_off; 1181 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1182 total_size += sfv[i].sfv_len; 1183 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1184 /* 1185 * Individual elements of the vector must not 1186 * wrap or overflow, as later math is signed. 1187 * Equally total_size needs to be checked after 1188 * each vector is added in, to be sure that 1189 * rogue values haven't overflowed the counter. 1190 */ 1191 if (((ssize32_t)sfv[i].sfv_len < 0) || 1192 ((ssize32_t)total_size < 0)) { 1193 /* 1194 * Truncate the vector to send data 1195 * described by elements before the 1196 * error. 1197 */ 1198 copy_cnt = i; 1199 first_vector_error = EINVAL; 1200 /* total_size can't be trusted */ 1201 if ((ssize32_t)total_size < 0) 1202 error = EINVAL; 1203 break; 1204 } 1205 } 1206 /* Nothing to do, process errors */ 1207 if (copy_cnt == 0) 1208 break; 1209 1210 } else { 1211 #endif 1212 if (copyin(copy_vec, sfv, 1213 copy_cnt * sizeof (sendfilevec_t))) { 1214 error = EFAULT; 1215 break; 1216 } 1217 1218 for (i = 0; i < copy_cnt; i++) { 1219 total_size += sfv[i].sfv_len; 1220 /* 1221 * Individual elements of the vector must not 1222 * wrap or overflow, as later math is signed. 1223 * Equally total_size needs to be checked after 1224 * each vector is added in, to be sure that 1225 * rogue values haven't overflowed the counter. 1226 */ 1227 if (((ssize_t)sfv[i].sfv_len < 0) || 1228 (total_size < 0)) { 1229 /* 1230 * Truncate the vector to send data 1231 * described by elements before the 1232 * error. 1233 */ 1234 copy_cnt = i; 1235 first_vector_error = EINVAL; 1236 /* total_size can't be trusted */ 1237 if (total_size < 0) 1238 error = EINVAL; 1239 break; 1240 } 1241 } 1242 /* Nothing to do, process errors */ 1243 if (copy_cnt == 0) 1244 break; 1245 #ifdef _SYSCALL32_IMPL 1246 } 1247 #endif 1248 1249 /* 1250 * The task between deciding to use sendvec_small_chunk 1251 * and sendvec_chunk is dependant on multiple things: 1252 * 1253 * i) latency is important for smaller files. So if the 1254 * data is smaller than 'tcp_slow_start_initial' times 1255 * maxblk, then use sendvec_small_chunk which creates 1256 * maxblk size mblks and chains them together and sends 1257 * them to TCP in one shot. It also leaves 'wroff' size 1258 * space for the headers in each mblk. 1259 * 1260 * ii) for total size bigger than 'tcp_slow_start_initial' 1261 * time maxblk, its probably real file data which is 1262 * dominating. So its better to use sendvec_chunk because 1263 * performance goes to dog if we don't do pagesize reads. 1264 * sendvec_chunk will do pagesize reads and write them 1265 * in pagesize mblks to TCP. 1266 * 1267 * Side Notes: A write to file has not been optimized. 1268 * Future zero copy code will plugin into sendvec_chunk 1269 * only because doing zero copy for files smaller then 1270 * pagesize is useless. 1271 * 1272 * Note, if socket has NL7C enabled then call NL7C's 1273 * senfilev() function to consume the sfv[]. 1274 */ 1275 if (is_sock) { 1276 if (!SOCK_IS_NONSTR(so) && 1277 _SOTOTPI(so)->sti_nl7c_flags != 0) { 1278 error = nl7c_sendfilev(so, &fileoff, 1279 sfv, copy_cnt, &count); 1280 } else if ((total_size <= (4 * maxblk)) && 1281 error == 0) { 1282 error = sendvec_small_chunk(fp, 1283 &fileoff, sfv, copy_cnt, 1284 total_size, maxblk, &count); 1285 } else { 1286 error = sendvec_chunk(fp, &fileoff, 1287 sfv, copy_cnt, &count); 1288 } 1289 } else { 1290 ASSERT(vp->v_type == VREG); 1291 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1292 &count); 1293 } 1294 1295 1296 #ifdef _SYSCALL32_IMPL 1297 if (get_udatamodel() == DATAMODEL_ILP32) 1298 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1299 (copy_cnt * sizeof (ksendfilevec32_t))); 1300 else 1301 #endif 1302 copy_vec += copy_cnt; 1303 sfvcnt -= copy_cnt; 1304 1305 /* Process all vector members up to first error */ 1306 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1307 1308 if (vp->v_type == VREG) 1309 fp->f_offset += count; 1310 1311 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1312 1313 #ifdef _SYSCALL32_IMPL 1314 if (get_udatamodel() == DATAMODEL_ILP32) { 1315 ssize32_t count32 = (ssize32_t)count; 1316 if (copyout(&count32, xferred, sizeof (count32))) 1317 error = EFAULT; 1318 releasef(fildes); 1319 if (error != 0) 1320 return (set_errno(error)); 1321 if (first_vector_error != 0) 1322 return (set_errno(first_vector_error)); 1323 return (count32); 1324 } 1325 #endif 1326 if (copyout(&count, xferred, sizeof (count))) 1327 error = EFAULT; 1328 releasef(fildes); 1329 if (error != 0) 1330 return (set_errno(error)); 1331 if (first_vector_error != 0) 1332 return (set_errno(first_vector_error)); 1333 return (count); 1334 err: 1335 ASSERT(error != 0); 1336 releasef(fildes); 1337 return (set_errno(error)); 1338 } 1339