/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
    ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
    int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
    boolean_t);
extern sotpi_info_t *sotpi_sototpi(struct sonode *);

#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications running on either a 64 bit
 * kernel or a 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t	*ffp;
			vnode_t	*readvp;
			size_t	size;
			caddr_t	ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Optimize the regular file over
			 * the socket case.
			 */
			if (vp->v_type == VSOCK) {
				error = sosendfile64(fp, ffp, sfv, count);
				if (error)
					return (error);
				sfv++;
				continue;
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_NOSLEEP);
			if (ptr == NULL) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (ENOMEM);
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif

int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;
	struct nmsghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	/* If nothing to send, return */
	if (total_size == 0)
		return (0);

	if (vp->v_stream != NULL) {
		wroff = (int)vp->v_stream->sd_wroff;
		tail_len = (int)vp->v_stream->sd_tail;
	} else {
		struct sonode *so;

		so = VTOSO(vp);
		wroff = so->so_proto_props.sopp_wroff;
		tail_len = so->so_proto_props.sopp_tail;
	}

	extra = wroff + tail_len;

	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
	bzero(&msg, sizeof (msg));

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do similar checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t	*ffp;
			vnode_t	*readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
	if (error != 0) {
		if (head != NULL)
			freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}


int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;
	struct nmsghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		if (vp->v_stream != NULL) {
			stp = vp->v_stream;
			wroff = (int)stp->sd_wroff;
			tail_len = (int)stp->sd_tail;
			maxblk = (int)stp->sd_maxblk;
		} else {
			stp = NULL;
			wroff = so->so_proto_props.sopp_wroff;
			tail_len = so->so_proto_props.sopp_tail;
			maxblk = so->so_proto_props.sopp_maxblk;
		}
		extra = wroff + tail_len;
	}

	bzero(&msg, sizeof (msg));
	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write", since we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			if (vp->v_type == VSOCK) {
				while (sfv_len > 0) {
					size_t iov_len;

					iov_len = sfv_len;
					if (!SOCK_IS_NONSTR(so) &&
					    SOTOTPI(so)->sti_kssl_ctx != NULL)
						iov_len = MIN(iov_len, maxblk);

					aiov.iov_len = iov_len;
					aiov.iov_base =
					    (caddr_t)(uintptr_t)sfv_off;

					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_loffset = *fileoff;
					auio.uio_segflg = UIO_USERSPACE;
					auio.uio_fmode = fflag;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_resid = iov_len;

					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL)
						return (ENOMEM);
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					error = uiomove((caddr_t)dmp->b_wptr,
					    iov_len, UIO_WRITE, &auio);
					if (error != 0) {
						freeb(dmp);
						return (error);
					}
					dmp->b_wptr += iov_len;
					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						return (error);
					}
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)iov_len;
					*count += iov_len;
					sfv_len -= iov_len;
					sfv_off += iov_len;
				}
			} else {
				aiov.iov_len = sfv_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;

				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_loffset = *fileoff;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_fmode = fflag;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_resid = sfv_len;

				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t	*ffp;
			vnode_t	*readvp;
			struct vnode *realvp;
			size_t	size;
			caddr_t	ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				uint_t copyflag;

				copyflag = stp != NULL ? stp->sd_copyflag :
				    so->so_proto_props.sopp_zcopyflag;
				/*
				 * For sockets acting as an SSL proxy, we
				 * need to adjust the size to the maximum
				 * SSL record size set in the stream head.
				 */
				if (!SOCK_IS_NONSTR(so) &&
				    _SOTOTPI(so)->sti_kssl_ctx != NULL)
					size = MIN(size, maxblk);

				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (socket_setsockopt(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on), CRED()) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				boolean_t nowait;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
				    nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error = 0;
	int first_vector_error = 0;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		is_sock = B_TRUE;
		if (SOCK_IS_NONSTR(so)) {
			maxblk = so->so_proto_props.sopp_maxblk;
		} else {
			maxblk = (int)vp->v_stream->sd_maxblk;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV :
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64 :
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default :
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize32_t)sfv[i].sfv_len < 0) ||
				    ((ssize32_t)total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if ((ssize32_t)total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;

		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize_t)sfv[i].sfv_len < 0) ||
				    (total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if (total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk and
		 * sendvec_chunk depends on several things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk which creates
		 * maxblk size mblks and chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data which is
		 * dominating. So it's better to use sendvec_chunk because
		 * performance suffers badly if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plug into sendvec_chunk
		 * only because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note, if the socket has NL7C enabled then call NL7C's
		 * sendfilev() function to consume the sfv[].
		 */
		if (is_sock) {
			if (!SOCK_IS_NONSTR(so) &&
			    _SOTOTPI(so)->sti_nl7c_flags != 0) {
				error = nl7c_sendfilev(so, &fileoff,
				    sfv, copy_cnt, &count);
			} else if ((total_size <= (4 * maxblk)) &&
			    error == 0) {
				error = sendvec_small_chunk(fp,
				    &fileoff, sfv, copy_cnt,
				    total_size, maxblk, &count);
			} else {
				error = sendvec_chunk(fp, &fileoff,
				    sfv, copy_cnt, &count);
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32)
			copy_vec = (const struct sendfilevec *)((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;

		/* Process all vector members up to first error */
	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		if (first_vector_error != 0)
			return (set_errno(first_vector_error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	if (first_vector_error != 0)
		return (set_errno(first_vector_error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}
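
/*
 * Illustrative userland sketch (not part of this kernel source): one way an
 * application might drive the sendfilev() entry point above through the
 * sendfilev(3EXT) interface, sending an in-memory header (SFV_FD_SELF)
 * followed by the contents of a regular file over a connected socket.
 * The identifiers sock_fd, file_fd, hdr, hdr_len and file_len are
 * hypothetical placeholders, not names used in this file.
 *
 *	#include <sys/sendfile.h>
 *
 *	struct sendfilevec vec[2];
 *	size_t xferred = 0;
 *	ssize_t nbytes;
 *
 *	vec[0].sfv_fd = SFV_FD_SELF;		// header taken from memory
 *	vec[0].sfv_flag = 0;
 *	vec[0].sfv_off = (off_t)hdr;		// address serves as the offset
 *	vec[0].sfv_len = hdr_len;
 *
 *	vec[1].sfv_fd = file_fd;		// body read from a regular file
 *	vec[1].sfv_flag = 0;
 *	vec[1].sfv_off = 0;
 *	vec[1].sfv_len = file_len;
 *
 *	nbytes = sendfilev(sock_fd, vec, 2, &xferred);
 *	// On failure nbytes is -1 with errno set, and xferred still reports
 *	// how many bytes were actually transferred before the error.
 */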