1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/debug.h> 39 #include <sys/errno.h> 40 #include <sys/time.h> 41 #include <sys/file.h> 42 #include <sys/open.h> 43 #include <sys/user.h> 44 #include <sys/termios.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/sunddi.h> 48 #include <sys/esunddi.h> 49 #include <sys/flock.h> 50 #include <sys/modctl.h> 51 #include <sys/cmn_err.h> 52 #include <sys/vmsystm.h> 53 54 #include <sys/socket.h> 55 #include <sys/socketvar.h> 56 #include <fs/sockfs/sockcommon.h> 57 #include <fs/sockfs/socktpi.h> 58 59 #include <netinet/in.h> 60 #include <sys/sendfile.h> 61 #include <sys/un.h> 62 #include <sys/tihdr.h> 63 #include <sys/atomic.h> 64 65 #include <inet/common.h> 66 #include <inet/ip.h> 67 #include <inet/ip6.h> 68 #include <inet/tcp.h> 69 70 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 71 ssize32_t *); 72 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, 73 int, ssize_t *); 74 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, 75 boolean_t); 76 extern sotpi_info_t *sotpi_sototpi(struct sonode *); 77 78 #define readflg (V_WRITELOCK_FALSE) 79 #define rwflag (V_WRITELOCK_TRUE) 80 81 #define SEND_MAX_CHUNK 16 82 83 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 84 /* 85 * 64 bit offsets for 32 bit applications only running either on 86 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 87 * more than 2GB of data. 88 */ 89 int 90 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 91 int copy_cnt, ssize32_t *count) 92 { 93 struct vnode *vp; 94 ushort_t fflag; 95 int ioflag; 96 size32_t cnt; 97 ssize32_t sfv_len; 98 ssize32_t tmpcount; 99 u_offset_t sfv_off; 100 struct uio auio; 101 struct iovec aiov; 102 int i, error; 103 104 fflag = fp->f_flag; 105 vp = fp->f_vnode; 106 for (i = 0; i < copy_cnt; i++) { 107 108 if (ISSIG(curthread, JUSTLOOKING)) 109 return (EINTR); 110 111 /* 112 * Do similar checks as "write" as we are writing 113 * sfv_len bytes into "vp". 114 */ 115 sfv_len = (ssize32_t)sfv->sfv_len; 116 117 if (sfv_len == 0) { 118 sfv++; 119 continue; 120 } 121 122 if (sfv_len < 0) 123 return (EINVAL); 124 125 if (vp->v_type == VREG) { 126 if (*fileoff >= curproc->p_fsz_ctl) { 127 mutex_enter(&curproc->p_lock); 128 (void) rctl_action( 129 rctlproc_legacy[RLIMIT_FSIZE], 130 curproc->p_rctls, curproc, RCA_SAFE); 131 mutex_exit(&curproc->p_lock); 132 return (EFBIG); 133 } 134 135 if (*fileoff >= OFFSET_MAX(fp)) 136 return (EFBIG); 137 138 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 139 return (EINVAL); 140 } 141 142 tmpcount = *count + sfv_len; 143 if (tmpcount < 0) 144 return (EINVAL); 145 146 sfv_off = sfv->sfv_off; 147 148 auio.uio_extflg = UIO_COPY_DEFAULT; 149 if (sfv->sfv_fd == SFV_FD_SELF) { 150 aiov.iov_len = sfv_len; 151 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 152 auio.uio_loffset = *fileoff; 153 auio.uio_iovcnt = 1; 154 auio.uio_resid = sfv_len; 155 auio.uio_iov = &aiov; 156 auio.uio_segflg = UIO_USERSPACE; 157 auio.uio_llimit = curproc->p_fsz_ctl; 158 auio.uio_fmode = fflag; 159 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 160 while (sfv_len > 0) { 161 error = VOP_WRITE(vp, &auio, ioflag, 162 fp->f_cred, NULL); 163 cnt = sfv_len - auio.uio_resid; 164 sfv_len -= cnt; 165 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 166 if (vp->v_type == VREG) 167 *fileoff += cnt; 168 *count += cnt; 169 if (error != 0) 170 return (error); 171 } 172 } else { 173 file_t *ffp; 174 vnode_t *readvp; 175 size_t size; 176 caddr_t ptr; 177 178 if ((ffp = getf(sfv->sfv_fd)) == NULL) 179 return (EBADF); 180 181 if ((ffp->f_flag & FREAD) == 0) { 182 releasef(sfv->sfv_fd); 183 return (EBADF); 184 } 185 186 readvp = ffp->f_vnode; 187 if (readvp->v_type != VREG) { 188 releasef(sfv->sfv_fd); 189 return (EINVAL); 190 } 191 192 /* 193 * No point reading and writing to same vp, 194 * as long as both are regular files. readvp is not 195 * locked; but since we got it from an open file the 196 * contents will be valid during the time of access. 197 */ 198 if (vn_compare(vp, readvp)) { 199 releasef(sfv->sfv_fd); 200 return (EINVAL); 201 } 202 203 /* 204 * Note: we assume readvp != vp. "vp" is already 205 * locked, and "readvp" must not be. 206 */ 207 (void) VOP_RWLOCK(readvp, readflg, NULL); 208 209 /* 210 * Same checks as in pread64. 211 */ 212 if (sfv_off > MAXOFFSET_T) { 213 VOP_RWUNLOCK(readvp, readflg, NULL); 214 releasef(sfv->sfv_fd); 215 return (EINVAL); 216 } 217 218 if (sfv_off + sfv_len > MAXOFFSET_T) 219 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 220 221 /* Find the native blocksize to transfer data */ 222 size = MIN(vp->v_vfsp->vfs_bsize, 223 readvp->v_vfsp->vfs_bsize); 224 size = sfv_len < size ? sfv_len : size; 225 ptr = kmem_alloc(size, KM_SLEEP); 226 227 while (sfv_len > 0) { 228 size_t iov_len; 229 230 iov_len = MIN(size, sfv_len); 231 aiov.iov_base = ptr; 232 aiov.iov_len = iov_len; 233 auio.uio_loffset = sfv_off; 234 auio.uio_iov = &aiov; 235 auio.uio_iovcnt = 1; 236 auio.uio_resid = iov_len; 237 auio.uio_segflg = UIO_SYSSPACE; 238 auio.uio_llimit = MAXOFFSET_T; 239 auio.uio_fmode = ffp->f_flag; 240 ioflag = auio.uio_fmode & 241 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 242 243 /* 244 * If read sync is not asked for, 245 * filter sync flags 246 */ 247 if ((ioflag & FRSYNC) == 0) 248 ioflag &= ~(FSYNC|FDSYNC); 249 error = VOP_READ(readvp, &auio, ioflag, 250 fp->f_cred, NULL); 251 if (error) { 252 kmem_free(ptr, size); 253 VOP_RWUNLOCK(readvp, readflg, NULL); 254 releasef(sfv->sfv_fd); 255 return (error); 256 } 257 258 /* 259 * Check how must data was really read. 260 * Decrement the 'len' and increment the 261 * 'off' appropriately. 262 */ 263 cnt = iov_len - auio.uio_resid; 264 if (cnt == 0) { 265 /* 266 * If we were reading a pipe (currently 267 * not implemented), we may now lose 268 * data. 269 */ 270 kmem_free(ptr, size); 271 VOP_RWUNLOCK(readvp, readflg, NULL); 272 releasef(sfv->sfv_fd); 273 return (EINVAL); 274 } 275 sfv_len -= cnt; 276 sfv_off += cnt; 277 278 aiov.iov_base = ptr; 279 aiov.iov_len = cnt; 280 auio.uio_loffset = *fileoff; 281 auio.uio_iov = &aiov; 282 auio.uio_iovcnt = 1; 283 auio.uio_resid = cnt; 284 auio.uio_segflg = UIO_SYSSPACE; 285 auio.uio_llimit = curproc->p_fsz_ctl; 286 auio.uio_fmode = fflag; 287 ioflag = auio.uio_fmode & 288 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 289 error = VOP_WRITE(vp, &auio, ioflag, 290 fp->f_cred, NULL); 291 292 /* 293 * Check how much data was written. Increment 294 * the 'len' and decrement the 'off' if all 295 * the data was not written. 296 */ 297 cnt -= auio.uio_resid; 298 sfv_len += auio.uio_resid; 299 sfv_off -= auio.uio_resid; 300 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 301 if (vp->v_type == VREG) 302 *fileoff += cnt; 303 *count += cnt; 304 if (error != 0) { 305 kmem_free(ptr, size); 306 VOP_RWUNLOCK(readvp, readflg, NULL); 307 releasef(sfv->sfv_fd); 308 return (error); 309 } 310 } 311 VOP_RWUNLOCK(readvp, readflg, NULL); 312 releasef(sfv->sfv_fd); 313 kmem_free(ptr, size); 314 } 315 sfv++; 316 } 317 return (0); 318 } 319 320 ssize32_t 321 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 322 size32_t *xferred, int fildes) 323 { 324 u_offset_t fileoff; 325 int copy_cnt; 326 const struct ksendfilevec64 *copy_vec; 327 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 328 struct vnode *vp; 329 int error; 330 ssize32_t count = 0; 331 332 vp = fp->f_vnode; 333 (void) VOP_RWLOCK(vp, rwflag, NULL); 334 335 copy_vec = vec; 336 fileoff = fp->f_offset; 337 338 do { 339 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 340 if (copyin(copy_vec, sfv, copy_cnt * 341 sizeof (struct ksendfilevec64))) { 342 error = EFAULT; 343 break; 344 } 345 346 /* 347 * Optimize the regular file over 348 * the socket case. 349 */ 350 if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) { 351 file_t *rfp; 352 vnode_t *rvp; 353 354 if ((rfp = getf(sfv->sfv_fd)) == NULL) { 355 error = EBADF; 356 break; 357 } 358 if ((rfp->f_flag & FREAD) == 0) { 359 releasef(sfv->sfv_fd); 360 error = EBADF; 361 break; 362 } 363 rvp = rfp->f_vnode; 364 if (rvp->v_type == VREG) { 365 error = sosendfile64(fp, rfp, sfv, &count); 366 if (error) 367 break; 368 copy_vec++; 369 sfvcnt--; 370 continue; 371 } 372 releasef(sfv->sfv_fd); 373 } 374 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 375 if (error != 0) 376 break; 377 378 copy_vec += copy_cnt; 379 sfvcnt -= copy_cnt; 380 } while (sfvcnt > 0); 381 382 if (vp->v_type == VREG) 383 fp->f_offset += count; 384 385 VOP_RWUNLOCK(vp, rwflag, NULL); 386 if (copyout(&count, xferred, sizeof (count))) 387 error = EFAULT; 388 releasef(fildes); 389 if (error != 0) 390 return (set_errno(error)); 391 return (count); 392 } 393 #endif 394 395 int 396 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 397 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 398 { 399 struct vnode *vp; 400 struct uio auio; 401 struct iovec aiov; 402 ushort_t fflag; 403 int ioflag; 404 int i, error; 405 size_t cnt; 406 ssize_t sfv_len; 407 u_offset_t sfv_off; 408 #ifdef _SYSCALL32_IMPL 409 model_t model = get_udatamodel(); 410 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 411 MAXOFF32_T : MAXOFFSET_T; 412 #else 413 const u_offset_t maxoff = MAXOFF32_T; 414 #endif 415 mblk_t *dmp = NULL; 416 int wroff; 417 int buf_left = 0; 418 size_t iov_len; 419 mblk_t *head, *tmp; 420 size_t size = total_size; 421 size_t extra; 422 int tail_len; 423 struct nmsghdr msg; 424 425 fflag = fp->f_flag; 426 vp = fp->f_vnode; 427 428 ASSERT(vp->v_type == VSOCK); 429 ASSERT(maxblk > 0); 430 431 /* If nothing to send, return */ 432 if (total_size == 0) 433 return (0); 434 435 if (vp->v_stream != NULL) { 436 wroff = (int)vp->v_stream->sd_wroff; 437 tail_len = (int)vp->v_stream->sd_tail; 438 } else { 439 struct sonode *so; 440 441 so = VTOSO(vp); 442 wroff = so->so_proto_props.sopp_wroff; 443 tail_len = so->so_proto_props.sopp_tail; 444 } 445 446 extra = wroff + tail_len; 447 448 buf_left = MIN(total_size, maxblk); 449 head = dmp = allocb(buf_left + extra, BPRI_HI); 450 if (head == NULL) 451 return (ENOMEM); 452 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 453 bzero(&msg, sizeof (msg)); 454 455 auio.uio_extflg = UIO_COPY_DEFAULT; 456 for (i = 0; i < copy_cnt; i++) { 457 if (ISSIG(curthread, JUSTLOOKING)) { 458 freemsg(head); 459 return (EINTR); 460 } 461 462 /* 463 * Do similar checks as "write" as we are writing 464 * sfv_len bytes into "vp". 465 */ 466 sfv_len = (ssize_t)sfv->sfv_len; 467 468 if (sfv_len == 0) { 469 sfv++; 470 continue; 471 } 472 473 /* Check for overflow */ 474 #ifdef _SYSCALL32_IMPL 475 if (model == DATAMODEL_ILP32) { 476 if (((ssize32_t)(*count + sfv_len)) < 0) { 477 freemsg(head); 478 return (EINVAL); 479 } 480 } else 481 #endif 482 if ((*count + sfv_len) < 0) { 483 freemsg(head); 484 return (EINVAL); 485 } 486 487 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 488 489 if (sfv->sfv_fd == SFV_FD_SELF) { 490 while (sfv_len > 0) { 491 if (buf_left == 0) { 492 tmp = dmp; 493 buf_left = MIN(total_size, maxblk); 494 iov_len = MIN(buf_left, sfv_len); 495 dmp = allocb(buf_left + extra, BPRI_HI); 496 if (dmp == NULL) { 497 freemsg(head); 498 return (ENOMEM); 499 } 500 dmp->b_wptr = dmp->b_rptr = 501 dmp->b_rptr + wroff; 502 tmp->b_cont = dmp; 503 } else { 504 iov_len = MIN(buf_left, sfv_len); 505 } 506 507 aiov.iov_len = iov_len; 508 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 509 auio.uio_loffset = *fileoff; 510 auio.uio_iovcnt = 1; 511 auio.uio_resid = iov_len; 512 auio.uio_iov = &aiov; 513 auio.uio_segflg = UIO_USERSPACE; 514 auio.uio_llimit = curproc->p_fsz_ctl; 515 auio.uio_fmode = fflag; 516 517 buf_left -= iov_len; 518 total_size -= iov_len; 519 sfv_len -= iov_len; 520 sfv_off += iov_len; 521 522 error = uiomove((caddr_t)dmp->b_wptr, 523 iov_len, UIO_WRITE, &auio); 524 if (error != 0) { 525 freemsg(head); 526 return (error); 527 } 528 dmp->b_wptr += iov_len; 529 } 530 } else { 531 file_t *ffp; 532 vnode_t *readvp; 533 534 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 535 freemsg(head); 536 return (EBADF); 537 } 538 539 if ((ffp->f_flag & FREAD) == 0) { 540 releasef(sfv->sfv_fd); 541 freemsg(head); 542 return (EACCES); 543 } 544 545 readvp = ffp->f_vnode; 546 if (readvp->v_type != VREG) { 547 releasef(sfv->sfv_fd); 548 freemsg(head); 549 return (EINVAL); 550 } 551 552 /* 553 * No point reading and writing to same vp, 554 * as long as both are regular files. readvp is not 555 * locked; but since we got it from an open file the 556 * contents will be valid during the time of access. 557 */ 558 559 if (vn_compare(vp, readvp)) { 560 releasef(sfv->sfv_fd); 561 freemsg(head); 562 return (EINVAL); 563 } 564 565 /* 566 * Note: we assume readvp != vp. "vp" is already 567 * locked, and "readvp" must not be. 568 */ 569 570 (void) VOP_RWLOCK(readvp, readflg, NULL); 571 572 /* Same checks as in pread */ 573 if (sfv_off > maxoff) { 574 VOP_RWUNLOCK(readvp, readflg, NULL); 575 releasef(sfv->sfv_fd); 576 freemsg(head); 577 return (EINVAL); 578 } 579 if (sfv_off + sfv_len > maxoff) { 580 total_size -= (sfv_off + sfv_len - maxoff); 581 sfv_len = (ssize_t)((offset_t)maxoff - 582 sfv_off); 583 } 584 585 while (sfv_len > 0) { 586 if (buf_left == 0) { 587 tmp = dmp; 588 buf_left = MIN(total_size, maxblk); 589 iov_len = MIN(buf_left, sfv_len); 590 dmp = allocb(buf_left + extra, BPRI_HI); 591 if (dmp == NULL) { 592 VOP_RWUNLOCK(readvp, readflg, 593 NULL); 594 releasef(sfv->sfv_fd); 595 freemsg(head); 596 return (ENOMEM); 597 } 598 dmp->b_wptr = dmp->b_rptr = 599 dmp->b_rptr + wroff; 600 tmp->b_cont = dmp; 601 } else { 602 iov_len = MIN(buf_left, sfv_len); 603 } 604 aiov.iov_base = (caddr_t)dmp->b_wptr; 605 aiov.iov_len = iov_len; 606 auio.uio_loffset = sfv_off; 607 auio.uio_iov = &aiov; 608 auio.uio_iovcnt = 1; 609 auio.uio_resid = iov_len; 610 auio.uio_segflg = UIO_SYSSPACE; 611 auio.uio_llimit = MAXOFFSET_T; 612 auio.uio_fmode = ffp->f_flag; 613 ioflag = auio.uio_fmode & 614 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 615 616 /* 617 * If read sync is not asked for, 618 * filter sync flags 619 */ 620 if ((ioflag & FRSYNC) == 0) 621 ioflag &= ~(FSYNC|FDSYNC); 622 error = VOP_READ(readvp, &auio, ioflag, 623 fp->f_cred, NULL); 624 if (error != 0) { 625 /* 626 * If we were reading a pipe (currently 627 * not implemented), we may now loose 628 * data. 629 */ 630 VOP_RWUNLOCK(readvp, readflg, NULL); 631 releasef(sfv->sfv_fd); 632 freemsg(head); 633 return (error); 634 } 635 636 /* 637 * Check how much data was really read. 638 * Decrement the 'len' and increment the 639 * 'off' appropriately. 640 */ 641 cnt = iov_len - auio.uio_resid; 642 if (cnt == 0) { 643 VOP_RWUNLOCK(readvp, readflg, NULL); 644 releasef(sfv->sfv_fd); 645 freemsg(head); 646 return (EINVAL); 647 } 648 sfv_len -= cnt; 649 sfv_off += cnt; 650 total_size -= cnt; 651 buf_left -= cnt; 652 653 dmp->b_wptr += cnt; 654 } 655 VOP_RWUNLOCK(readvp, readflg, NULL); 656 releasef(sfv->sfv_fd); 657 } 658 sfv++; 659 } 660 661 ASSERT(total_size == 0); 662 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); 663 if (error != 0) { 664 if (head != NULL) 665 freemsg(head); 666 return (error); 667 } 668 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 669 *count += size; 670 671 return (0); 672 } 673 674 675 int 676 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 677 int copy_cnt, ssize_t *count) 678 { 679 struct vnode *vp; 680 struct uio auio; 681 struct iovec aiov; 682 ushort_t fflag; 683 int ioflag; 684 int i, error; 685 size_t cnt; 686 ssize_t sfv_len; 687 u_offset_t sfv_off; 688 #ifdef _SYSCALL32_IMPL 689 model_t model = get_udatamodel(); 690 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 691 MAXOFF32_T : MAXOFFSET_T; 692 #else 693 const u_offset_t maxoff = MAXOFF32_T; 694 #endif 695 mblk_t *dmp = NULL; 696 char *buf = NULL; 697 size_t extra; 698 int maxblk, wroff, tail_len; 699 struct sonode *so; 700 stdata_t *stp; 701 struct nmsghdr msg; 702 703 fflag = fp->f_flag; 704 vp = fp->f_vnode; 705 706 if (vp->v_type == VSOCK) { 707 so = VTOSO(vp); 708 if (vp->v_stream != NULL) { 709 stp = vp->v_stream; 710 wroff = (int)stp->sd_wroff; 711 tail_len = (int)stp->sd_tail; 712 maxblk = (int)stp->sd_maxblk; 713 } else { 714 stp = NULL; 715 wroff = so->so_proto_props.sopp_wroff; 716 tail_len = so->so_proto_props.sopp_tail; 717 maxblk = so->so_proto_props.sopp_maxblk; 718 } 719 extra = wroff + tail_len; 720 } 721 722 bzero(&msg, sizeof (msg)); 723 auio.uio_extflg = UIO_COPY_DEFAULT; 724 for (i = 0; i < copy_cnt; i++) { 725 if (ISSIG(curthread, JUSTLOOKING)) 726 return (EINTR); 727 728 /* 729 * Do similar checks as "write" as we are writing 730 * sfv_len bytes into "vp". 731 */ 732 sfv_len = (ssize_t)sfv->sfv_len; 733 734 if (sfv_len == 0) { 735 sfv++; 736 continue; 737 } 738 739 if (vp->v_type == VREG) { 740 if (*fileoff >= curproc->p_fsz_ctl) { 741 mutex_enter(&curproc->p_lock); 742 (void) rctl_action( 743 rctlproc_legacy[RLIMIT_FSIZE], 744 curproc->p_rctls, curproc, RCA_SAFE); 745 mutex_exit(&curproc->p_lock); 746 747 return (EFBIG); 748 } 749 750 if (*fileoff >= maxoff) 751 return (EFBIG); 752 753 if (*fileoff + sfv_len > maxoff) 754 return (EINVAL); 755 } 756 757 /* Check for overflow */ 758 #ifdef _SYSCALL32_IMPL 759 if (model == DATAMODEL_ILP32) { 760 if (((ssize32_t)(*count + sfv_len)) < 0) 761 return (EINVAL); 762 } else 763 #endif 764 if ((*count + sfv_len) < 0) 765 return (EINVAL); 766 767 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 768 769 if (sfv->sfv_fd == SFV_FD_SELF) { 770 if (vp->v_type == VSOCK) { 771 while (sfv_len > 0) { 772 size_t iov_len; 773 774 iov_len = sfv_len; 775 if (!SOCK_IS_NONSTR(so) && 776 SOTOTPI(so)->sti_kssl_ctx != NULL) 777 iov_len = MIN(iov_len, maxblk); 778 779 aiov.iov_len = iov_len; 780 aiov.iov_base = 781 (caddr_t)(uintptr_t)sfv_off; 782 783 auio.uio_iov = &aiov; 784 auio.uio_iovcnt = 1; 785 auio.uio_loffset = *fileoff; 786 auio.uio_segflg = UIO_USERSPACE; 787 auio.uio_fmode = fflag; 788 auio.uio_llimit = curproc->p_fsz_ctl; 789 auio.uio_resid = iov_len; 790 791 dmp = allocb(iov_len + extra, BPRI_HI); 792 if (dmp == NULL) 793 return (ENOMEM); 794 dmp->b_wptr = dmp->b_rptr = 795 dmp->b_rptr + wroff; 796 error = uiomove((caddr_t)dmp->b_wptr, 797 iov_len, UIO_WRITE, &auio); 798 if (error != 0) { 799 freeb(dmp); 800 return (error); 801 } 802 dmp->b_wptr += iov_len; 803 error = socket_sendmblk(VTOSO(vp), 804 &msg, fflag, CRED(), &dmp); 805 806 if (error != 0) { 807 if (dmp != NULL) 808 freeb(dmp); 809 return (error); 810 } 811 ttolwp(curthread)->lwp_ru.ioch += 812 (ulong_t)iov_len; 813 *count += iov_len; 814 sfv_len -= iov_len; 815 sfv_off += iov_len; 816 } 817 } else { 818 aiov.iov_len = sfv_len; 819 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 820 821 auio.uio_iov = &aiov; 822 auio.uio_iovcnt = 1; 823 auio.uio_loffset = *fileoff; 824 auio.uio_segflg = UIO_USERSPACE; 825 auio.uio_fmode = fflag; 826 auio.uio_llimit = curproc->p_fsz_ctl; 827 auio.uio_resid = sfv_len; 828 829 ioflag = auio.uio_fmode & 830 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 831 while (sfv_len > 0) { 832 error = VOP_WRITE(vp, &auio, ioflag, 833 fp->f_cred, NULL); 834 cnt = sfv_len - auio.uio_resid; 835 sfv_len -= cnt; 836 ttolwp(curthread)->lwp_ru.ioch += 837 (ulong_t)cnt; 838 *fileoff += cnt; 839 *count += cnt; 840 if (error != 0) 841 return (error); 842 } 843 } 844 } else { 845 int segmapit = 0; 846 file_t *ffp; 847 vnode_t *readvp; 848 struct vnode *realvp; 849 size_t size; 850 caddr_t ptr; 851 852 if ((ffp = getf(sfv->sfv_fd)) == NULL) 853 return (EBADF); 854 855 if ((ffp->f_flag & FREAD) == 0) { 856 releasef(sfv->sfv_fd); 857 return (EBADF); 858 } 859 860 readvp = ffp->f_vnode; 861 if (VOP_REALVP(readvp, &realvp, NULL) == 0) 862 readvp = realvp; 863 if (readvp->v_type != VREG) { 864 releasef(sfv->sfv_fd); 865 return (EINVAL); 866 } 867 868 /* 869 * No point reading and writing to same vp, 870 * as long as both are regular files. readvp is not 871 * locked; but since we got it from an open file the 872 * contents will be valid during the time of access. 873 */ 874 if (vn_compare(vp, readvp)) { 875 releasef(sfv->sfv_fd); 876 return (EINVAL); 877 } 878 879 /* 880 * Note: we assume readvp != vp. "vp" is already 881 * locked, and "readvp" must not be. 882 */ 883 (void) VOP_RWLOCK(readvp, readflg, NULL); 884 885 /* Same checks as in pread */ 886 if (sfv_off > maxoff) { 887 VOP_RWUNLOCK(readvp, readflg, NULL); 888 releasef(sfv->sfv_fd); 889 return (EINVAL); 890 } 891 if (sfv_off + sfv_len > maxoff) { 892 sfv_len = (ssize_t)((offset_t)maxoff - 893 sfv_off); 894 } 895 /* Find the native blocksize to transfer data */ 896 size = MIN(vp->v_vfsp->vfs_bsize, 897 readvp->v_vfsp->vfs_bsize); 898 size = sfv_len < size ? sfv_len : size; 899 900 if (vp->v_type != VSOCK) { 901 segmapit = 0; 902 buf = kmem_alloc(size, KM_NOSLEEP); 903 if (buf == NULL) { 904 VOP_RWUNLOCK(readvp, readflg, NULL); 905 releasef(sfv->sfv_fd); 906 return (ENOMEM); 907 } 908 } else { 909 uint_t copyflag; 910 911 copyflag = stp != NULL ? stp->sd_copyflag : 912 so->so_proto_props.sopp_zcopyflag; 913 /* 914 * For sockets acting as an SSL proxy, we 915 * need to adjust the size to the maximum 916 * SSL record size set in the stream head. 917 */ 918 if (!SOCK_IS_NONSTR(so) && 919 _SOTOTPI(so)->sti_kssl_ctx != NULL) 920 size = MIN(size, maxblk); 921 922 if (vn_has_flocks(readvp) || 923 readvp->v_flag & VNOMAP || 924 copyflag & STZCVMUNSAFE) { 925 segmapit = 0; 926 } else if (copyflag & STZCVMSAFE) { 927 segmapit = 1; 928 } else { 929 int on = 1; 930 if (socket_setsockopt(VTOSO(vp), 931 SOL_SOCKET, SO_SND_COPYAVOID, 932 &on, sizeof (on), CRED()) == 0) 933 segmapit = 1; 934 } 935 } 936 937 if (segmapit) { 938 boolean_t nowait; 939 940 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 941 error = snf_segmap(fp, readvp, sfv_off, 942 (u_offset_t)sfv_len, (ssize_t *)&cnt, 943 nowait); 944 releasef(sfv->sfv_fd); 945 *count += cnt; 946 if (error) 947 return (error); 948 sfv++; 949 continue; 950 } 951 952 while (sfv_len > 0) { 953 size_t iov_len; 954 955 iov_len = MIN(size, sfv_len); 956 957 if (vp->v_type == VSOCK) { 958 dmp = allocb(iov_len + extra, BPRI_HI); 959 if (dmp == NULL) { 960 VOP_RWUNLOCK(readvp, readflg, 961 NULL); 962 releasef(sfv->sfv_fd); 963 return (ENOMEM); 964 } 965 dmp->b_wptr = dmp->b_rptr = 966 dmp->b_rptr + wroff; 967 ptr = (caddr_t)dmp->b_rptr; 968 } else { 969 ptr = buf; 970 } 971 972 aiov.iov_base = ptr; 973 aiov.iov_len = iov_len; 974 auio.uio_loffset = sfv_off; 975 auio.uio_iov = &aiov; 976 auio.uio_iovcnt = 1; 977 auio.uio_resid = iov_len; 978 auio.uio_segflg = UIO_SYSSPACE; 979 auio.uio_llimit = MAXOFFSET_T; 980 auio.uio_fmode = ffp->f_flag; 981 ioflag = auio.uio_fmode & 982 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 983 984 /* 985 * If read sync is not asked for, 986 * filter sync flags 987 */ 988 if ((ioflag & FRSYNC) == 0) 989 ioflag &= ~(FSYNC|FDSYNC); 990 error = VOP_READ(readvp, &auio, ioflag, 991 fp->f_cred, NULL); 992 if (error != 0) { 993 /* 994 * If we were reading a pipe (currently 995 * not implemented), we may now lose 996 * data. 997 */ 998 if (vp->v_type == VSOCK) 999 freeb(dmp); 1000 else 1001 kmem_free(buf, size); 1002 VOP_RWUNLOCK(readvp, readflg, NULL); 1003 releasef(sfv->sfv_fd); 1004 return (error); 1005 } 1006 1007 /* 1008 * Check how much data was really read. 1009 * Decrement the 'len' and increment the 1010 * 'off' appropriately. 1011 */ 1012 cnt = iov_len - auio.uio_resid; 1013 if (cnt == 0) { 1014 if (vp->v_type == VSOCK) 1015 freeb(dmp); 1016 else 1017 kmem_free(buf, size); 1018 VOP_RWUNLOCK(readvp, readflg, NULL); 1019 releasef(sfv->sfv_fd); 1020 return (EINVAL); 1021 } 1022 sfv_len -= cnt; 1023 sfv_off += cnt; 1024 1025 if (vp->v_type == VSOCK) { 1026 dmp->b_wptr = dmp->b_rptr + cnt; 1027 1028 error = socket_sendmblk(VTOSO(vp), 1029 &msg, fflag, CRED(), &dmp); 1030 1031 if (error != 0) { 1032 if (dmp != NULL) 1033 freeb(dmp); 1034 VOP_RWUNLOCK(readvp, readflg, 1035 NULL); 1036 releasef(sfv->sfv_fd); 1037 return (error); 1038 } 1039 1040 ttolwp(curthread)->lwp_ru.ioch += 1041 (ulong_t)cnt; 1042 *count += cnt; 1043 } else { 1044 1045 aiov.iov_base = ptr; 1046 aiov.iov_len = cnt; 1047 auio.uio_loffset = *fileoff; 1048 auio.uio_resid = cnt; 1049 auio.uio_iov = &aiov; 1050 auio.uio_iovcnt = 1; 1051 auio.uio_segflg = UIO_SYSSPACE; 1052 auio.uio_llimit = curproc->p_fsz_ctl; 1053 auio.uio_fmode = fflag; 1054 ioflag = auio.uio_fmode & 1055 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1056 error = VOP_WRITE(vp, &auio, ioflag, 1057 fp->f_cred, NULL); 1058 1059 /* 1060 * Check how much data was written. 1061 * Increment the 'len' and decrement the 1062 * 'off' if all the data was not 1063 * written. 1064 */ 1065 cnt -= auio.uio_resid; 1066 sfv_len += auio.uio_resid; 1067 sfv_off -= auio.uio_resid; 1068 ttolwp(curthread)->lwp_ru.ioch += 1069 (ulong_t)cnt; 1070 *fileoff += cnt; 1071 *count += cnt; 1072 if (error != 0) { 1073 kmem_free(buf, size); 1074 VOP_RWUNLOCK(readvp, readflg, 1075 NULL); 1076 releasef(sfv->sfv_fd); 1077 return (error); 1078 } 1079 } 1080 } 1081 if (buf) { 1082 kmem_free(buf, size); 1083 buf = NULL; 1084 } 1085 VOP_RWUNLOCK(readvp, readflg, NULL); 1086 releasef(sfv->sfv_fd); 1087 } 1088 sfv++; 1089 } 1090 return (0); 1091 } 1092 1093 ssize_t 1094 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1095 size_t *xferred) 1096 { 1097 int error = 0; 1098 int first_vector_error = 0; 1099 file_t *fp; 1100 struct vnode *vp; 1101 struct sonode *so; 1102 u_offset_t fileoff; 1103 int copy_cnt; 1104 const struct sendfilevec *copy_vec; 1105 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1106 ssize_t count = 0; 1107 #ifdef _SYSCALL32_IMPL 1108 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1109 #endif 1110 ssize_t total_size; 1111 int i; 1112 boolean_t is_sock = B_FALSE; 1113 int maxblk = 0; 1114 1115 if (sfvcnt <= 0) 1116 return (set_errno(EINVAL)); 1117 1118 if ((fp = getf(fildes)) == NULL) 1119 return (set_errno(EBADF)); 1120 1121 if (((fp->f_flag) & FWRITE) == 0) { 1122 error = EBADF; 1123 goto err; 1124 } 1125 1126 fileoff = fp->f_offset; 1127 vp = fp->f_vnode; 1128 1129 switch (vp->v_type) { 1130 case VSOCK: 1131 so = VTOSO(vp); 1132 is_sock = B_TRUE; 1133 if (SOCK_IS_NONSTR(so)) { 1134 maxblk = so->so_proto_props.sopp_maxblk; 1135 } else { 1136 maxblk = (int)vp->v_stream->sd_maxblk; 1137 } 1138 break; 1139 case VREG: 1140 break; 1141 default: 1142 error = EINVAL; 1143 goto err; 1144 } 1145 1146 switch (opcode) { 1147 case SENDFILEV : 1148 break; 1149 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1150 case SENDFILEV64 : 1151 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1152 (size32_t *)xferred, fildes)); 1153 #endif 1154 default : 1155 error = ENOSYS; 1156 break; 1157 } 1158 1159 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1160 copy_vec = vec; 1161 1162 do { 1163 total_size = 0; 1164 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1165 #ifdef _SYSCALL32_IMPL 1166 /* 32-bit callers need to have their iovec expanded. */ 1167 if (get_udatamodel() == DATAMODEL_ILP32) { 1168 if (copyin(copy_vec, sfv32, 1169 copy_cnt * sizeof (ksendfilevec32_t))) { 1170 error = EFAULT; 1171 break; 1172 } 1173 1174 for (i = 0; i < copy_cnt; i++) { 1175 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1176 sfv[i].sfv_off = 1177 (off_t)(uint32_t)sfv32[i].sfv_off; 1178 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1179 total_size += sfv[i].sfv_len; 1180 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1181 /* 1182 * Individual elements of the vector must not 1183 * wrap or overflow, as later math is signed. 1184 * Equally total_size needs to be checked after 1185 * each vector is added in, to be sure that 1186 * rogue values haven't overflowed the counter. 1187 */ 1188 if (((ssize32_t)sfv[i].sfv_len < 0) || 1189 ((ssize32_t)total_size < 0)) { 1190 /* 1191 * Truncate the vector to send data 1192 * described by elements before the 1193 * error. 1194 */ 1195 copy_cnt = i; 1196 first_vector_error = EINVAL; 1197 /* total_size can't be trusted */ 1198 if ((ssize32_t)total_size < 0) 1199 error = EINVAL; 1200 break; 1201 } 1202 } 1203 /* Nothing to do, process errors */ 1204 if (copy_cnt == 0) 1205 break; 1206 1207 } else { 1208 #endif 1209 if (copyin(copy_vec, sfv, 1210 copy_cnt * sizeof (sendfilevec_t))) { 1211 error = EFAULT; 1212 break; 1213 } 1214 1215 for (i = 0; i < copy_cnt; i++) { 1216 total_size += sfv[i].sfv_len; 1217 /* 1218 * Individual elements of the vector must not 1219 * wrap or overflow, as later math is signed. 1220 * Equally total_size needs to be checked after 1221 * each vector is added in, to be sure that 1222 * rogue values haven't overflowed the counter. 1223 */ 1224 if (((ssize_t)sfv[i].sfv_len < 0) || 1225 (total_size < 0)) { 1226 /* 1227 * Truncate the vector to send data 1228 * described by elements before the 1229 * error. 1230 */ 1231 copy_cnt = i; 1232 first_vector_error = EINVAL; 1233 /* total_size can't be trusted */ 1234 if (total_size < 0) 1235 error = EINVAL; 1236 break; 1237 } 1238 } 1239 /* Nothing to do, process errors */ 1240 if (copy_cnt == 0) 1241 break; 1242 #ifdef _SYSCALL32_IMPL 1243 } 1244 #endif 1245 1246 /* 1247 * The task between deciding to use sendvec_small_chunk 1248 * and sendvec_chunk is dependant on multiple things: 1249 * 1250 * i) latency is important for smaller files. So if the 1251 * data is smaller than 'tcp_slow_start_initial' times 1252 * maxblk, then use sendvec_small_chunk which creates 1253 * maxblk size mblks and chains them together and sends 1254 * them to TCP in one shot. It also leaves 'wroff' size 1255 * space for the headers in each mblk. 1256 * 1257 * ii) for total size bigger than 'tcp_slow_start_initial' 1258 * time maxblk, its probably real file data which is 1259 * dominating. So its better to use sendvec_chunk because 1260 * performance goes to dog if we don't do pagesize reads. 1261 * sendvec_chunk will do pagesize reads and write them 1262 * in pagesize mblks to TCP. 1263 * 1264 * Side Notes: A write to file has not been optimized. 1265 * Future zero copy code will plugin into sendvec_chunk 1266 * only because doing zero copy for files smaller then 1267 * pagesize is useless. 1268 * 1269 * Note, if socket has NL7C enabled then call NL7C's 1270 * senfilev() function to consume the sfv[]. 1271 */ 1272 if (is_sock) { 1273 if (!SOCK_IS_NONSTR(so) && 1274 _SOTOTPI(so)->sti_nl7c_flags != 0) { 1275 error = nl7c_sendfilev(so, &fileoff, 1276 sfv, copy_cnt, &count); 1277 } else if ((total_size <= (4 * maxblk)) && 1278 error == 0) { 1279 error = sendvec_small_chunk(fp, 1280 &fileoff, sfv, copy_cnt, 1281 total_size, maxblk, &count); 1282 } else { 1283 error = sendvec_chunk(fp, &fileoff, 1284 sfv, copy_cnt, &count); 1285 } 1286 } else { 1287 ASSERT(vp->v_type == VREG); 1288 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1289 &count); 1290 } 1291 1292 1293 #ifdef _SYSCALL32_IMPL 1294 if (get_udatamodel() == DATAMODEL_ILP32) 1295 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1296 (copy_cnt * sizeof (ksendfilevec32_t))); 1297 else 1298 #endif 1299 copy_vec += copy_cnt; 1300 sfvcnt -= copy_cnt; 1301 1302 /* Process all vector members up to first error */ 1303 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1304 1305 if (vp->v_type == VREG) 1306 fp->f_offset += count; 1307 1308 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1309 1310 #ifdef _SYSCALL32_IMPL 1311 if (get_udatamodel() == DATAMODEL_ILP32) { 1312 ssize32_t count32 = (ssize32_t)count; 1313 if (copyout(&count32, xferred, sizeof (count32))) 1314 error = EFAULT; 1315 releasef(fildes); 1316 if (error != 0) 1317 return (set_errno(error)); 1318 if (first_vector_error != 0) 1319 return (set_errno(first_vector_error)); 1320 return (count32); 1321 } 1322 #endif 1323 if (copyout(&count, xferred, sizeof (count))) 1324 error = EFAULT; 1325 releasef(fildes); 1326 if (error != 0) 1327 return (set_errno(error)); 1328 if (first_vector_error != 0) 1329 return (set_errno(first_vector_error)); 1330 return (count); 1331 err: 1332 ASSERT(error != 0); 1333 releasef(fildes); 1334 return (set_errno(error)); 1335 } 1336