/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/user.h>
#include <sys/termios.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
#include <sys/modctl.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>

#include <sys/socket.h>
#include <sys/socketvar.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>

#include <netinet/in.h>
#include <sys/sendfile.h>
#include <sys/un.h>
#include <sys/tihdr.h>
#include <sys/atomic.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
	ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
	int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
	boolean_t);
extern sotpi_info_t *sotpi_sototpi(struct sonode *);

#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64-bit offsets for 32-bit applications, running on either a 64-bit or a
 * 32-bit kernel.  A 32-bit application cannot transfer more than 2GB of
 * data per call.
 */
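/*
 * sendvec_chunk64() walks up to SEND_MAX_CHUNK ksendfilevec64 entries and
 * writes each one into "vp".  An entry whose sfv_fd is SFV_FD_SELF is
 * copied straight from the caller's address space; any other entry names a
 * regular file, whose contents are staged through a kernel buffer (or handed
 * to sosendfile64() when "vp" is a socket) and written out.  *fileoff and
 * *count are advanced as data is transferred.
 */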
int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Optimize the regular file over
			 * the socket case.
			 */
			if (vp->v_type == VSOCK) {
				error = sosendfile64(fp, ffp, sfv,
				    (ssize32_t *)&cnt);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_NOSLEEP);
			if (ptr == NULL) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (ENOMEM);
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

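/*
 * sendvec64() drives sendvec_chunk64() for the SENDFILEV64 opcode: it takes
 * the destination write lock, copies the caller's vector in SEND_MAX_CHUNK
 * pieces, and reports the number of bytes transferred through *xferred.
 */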
ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif

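/*
 * sendvec_small_chunk() handles the latency-sensitive case of a small total
 * transfer to a socket.  Every vector element is gathered into a single
 * chain of maxblk-sized mblks (each with wroff bytes of head room and
 * tail_len bytes of tail room reserved), and the whole chain is then pushed
 * to the transport with one socket_sendmblk() call.
 */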
int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;
	struct nmsghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	/* If nothing to send, return */
	if (total_size == 0)
		return (0);

	if (vp->v_stream != NULL) {
		wroff = (int)vp->v_stream->sd_wroff;
		tail_len = (int)vp->v_stream->sd_tail;
	} else {
		struct sonode *so;

		so = VTOSO(vp);
		wroff = so->so_proto_props.sopp_wroff;
		tail_len = so->so_proto_props.sopp_tail;
	}

	extra = wroff + tail_len;

	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
	bzero(&msg, sizeof (msg));

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	ASSERT(total_size == 0);
	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
	if (error != 0) {
		if (head != NULL)
			freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}

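/*
 * sendvec_chunk() is the general path for both socket and regular-file
 * destinations.  Socket data is sent one mblk at a time (or mapped with
 * snf_segmap() when zero-copy is safe for the source file); regular-file
 * data is staged through a kernel buffer sized to the smaller of the two
 * file systems' block sizes and written with VOP_WRITE().
 */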
int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;
	struct nmsghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		if (vp->v_stream != NULL) {
			stp = vp->v_stream;
			wroff = (int)stp->sd_wroff;
			tail_len = (int)stp->sd_tail;
			maxblk = (int)stp->sd_maxblk;
		} else {
			stp = NULL;
			wroff = so->so_proto_props.sopp_wroff;
			tail_len = so->so_proto_props.sopp_tail;
			maxblk = so->so_proto_props.sopp_maxblk;
		}
		extra = wroff + tail_len;
	}

	bzero(&msg, sizeof (msg));
	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (vp->v_type == VREG) {
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			if (vp->v_type == VSOCK) {
				while (sfv_len > 0) {
					size_t iov_len;

					iov_len = sfv_len;
					if (!SOCK_IS_NONSTR(so) &&
					    SOTOTPI(so)->sti_kssl_ctx != NULL)
						iov_len = MIN(iov_len, maxblk);

					aiov.iov_len = iov_len;
					aiov.iov_base =
					    (caddr_t)(uintptr_t)sfv_off;

					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_loffset = *fileoff;
					auio.uio_segflg = UIO_USERSPACE;
					auio.uio_fmode = fflag;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_resid = iov_len;

					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL)
						return (ENOMEM);
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					error = uiomove((caddr_t)dmp->b_wptr,
					    iov_len, UIO_WRITE, &auio);
					if (error != 0) {
						freeb(dmp);
						return (error);
					}
					dmp->b_wptr += iov_len;
					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						return (error);
					}
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)iov_len;
					*count += iov_len;
					sfv_len -= iov_len;
					sfv_off += iov_len;
				}
			} else {
				aiov.iov_len = sfv_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;

				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_loffset = *fileoff;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_fmode = fflag;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_resid = sfv_len;

				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t *ffp;
			vnode_t *readvp;
			struct vnode *realvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				uint_t copyflag;

				copyflag = stp != NULL ? stp->sd_copyflag :
				    so->so_proto_props.sopp_zcopyflag;
				/*
				 * For sockets acting as an SSL proxy, we
				 * need to adjust the size to the maximum
				 * SSL record size set in the stream head.
				 */
				if (!SOCK_IS_NONSTR(so) &&
				    _SOTOTPI(so)->sti_kssl_ctx != NULL)
					size = MIN(size, maxblk);

				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (socket_setsockopt(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on), CRED()) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				boolean_t nowait;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
				    nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement
					 * the 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

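/*
 * sendfilev() is the system call entry point (opcode SENDFILEV, or
 * SENDFILEV64 for 32-bit callers).  It validates the destination descriptor,
 * copies the caller's vector in SEND_MAX_CHUNK pieces and dispatches each
 * piece to nl7c_sendfilev(), sendvec_small_chunk() or sendvec_chunk(), as
 * described by the block comment inside the main loop below.
 *
 * For illustration only, a minimal sketch of a userland caller of the
 * corresponding sendfilev(3EXT) library interface; hdr_buf, hdr_len,
 * file_fd, file_len and sock_fd are hypothetical:
 *
 *	struct sendfilevec vec[2];
 *	size_t xferred;
 *	ssize_t n;
 *
 *	vec[0].sfv_fd = SFV_FD_SELF;	(data taken from user memory)
 *	vec[0].sfv_flag = 0;
 *	vec[0].sfv_off = (off_t)(uintptr_t)hdr_buf;
 *	vec[0].sfv_len = hdr_len;
 *
 *	vec[1].sfv_fd = file_fd;	(data taken from a regular file)
 *	vec[1].sfv_flag = 0;
 *	vec[1].sfv_off = 0;
 *	vec[1].sfv_len = file_len;
 *
 *	n = sendfilev(sock_fd, vec, 2, &xferred);
 */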
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error = 0;
	int first_vector_error = 0;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		is_sock = B_TRUE;
		if (SOCK_IS_NONSTR(so)) {
			maxblk = so->so_proto_props.sopp_maxblk;
		} else {
			maxblk = (int)vp->v_stream->sd_maxblk;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV:
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64:
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default:
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize32_t)sfv[i].sfv_len < 0) ||
				    ((ssize32_t)total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if ((ssize32_t)total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;

		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize_t)sfv[i].sfv_len < 0) ||
				    (total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if (total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk and sendvec_chunk
		 * depends on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk which creates
		 * maxblk size mblks and chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for total size bigger than 'tcp_slow_start_initial'
		 * times maxblk, it's probably real file data which is
		 * dominating. So it's better to use sendvec_chunk because
		 * performance goes to the dogs if we don't do pagesize
		 * reads. sendvec_chunk will do pagesize reads and write
		 * them in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plug into sendvec_chunk
		 * only because doing zero copy for files smaller than
		 * pagesize is useless.
		 *
		 * Note, if socket has NL7C enabled then call NL7C's
		 * sendfilev() function to consume the sfv[].
		 */
		if (is_sock) {
			if (!SOCK_IS_NONSTR(so) &&
			    _SOTOTPI(so)->sti_nl7c_flags != 0) {
				error = nl7c_sendfilev(so, &fileoff,
				    sfv, copy_cnt, &count);
			} else if ((total_size <= (4 * maxblk)) &&
			    error == 0) {
				error = sendvec_small_chunk(fp,
				    &fileoff, sfv, copy_cnt,
				    total_size, maxblk, &count);
			} else {
				error = sendvec_chunk(fp, &fileoff,
				    sfv, copy_cnt, &count);
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}

#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32)
			copy_vec = (const struct sendfilevec *)
			    ((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;

		/* Process all vector members up to first error */
	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		if (first_vector_error != 0)
			return (set_errno(first_vector_error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	if (first_vector_error != 0)
		return (set_errno(first_vector_error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}