1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/t_lock.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/buf.h> 31 #include <sys/conf.h> 32 #include <sys/cred.h> 33 #include <sys/kmem.h> 34 #include <sys/sysmacros.h> 35 #include <sys/vfs.h> 36 #include <sys/vnode.h> 37 #include <sys/debug.h> 38 #include <sys/errno.h> 39 #include <sys/time.h> 40 #include <sys/file.h> 41 #include <sys/open.h> 42 #include <sys/user.h> 43 #include <sys/termios.h> 44 #include <sys/stream.h> 45 #include <sys/strsubr.h> 46 #include <sys/sunddi.h> 47 #include <sys/esunddi.h> 48 #include <sys/flock.h> 49 #include <sys/modctl.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vmsystm.h> 52 53 #include <sys/socket.h> 54 #include <sys/socketvar.h> 55 #include <fs/sockfs/sockcommon.h> 56 #include <fs/sockfs/socktpi.h> 57 58 #include <netinet/in.h> 59 #include <sys/sendfile.h> 60 #include <sys/un.h> 61 #include <sys/tihdr.h> 62 #include <sys/atomic.h> 63 64 #include <inet/common.h> 65 #include <inet/ip.h> 66 #include <inet/ip6.h> 67 #include <inet/tcp.h> 68 69 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 70 ssize32_t *); 71 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, 72 int, ssize_t *); 73 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, 74 boolean_t); 75 extern sotpi_info_t *sotpi_sototpi(struct sonode *); 76 77 #define SEND_MAX_CHUNK 16 78 79 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 80 /* 81 * 64 bit offsets for 32 bit applications only running either on 82 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 83 * more than 2GB of data. 84 */ 85 int 86 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 87 int copy_cnt, ssize32_t *count) 88 { 89 struct vnode *vp; 90 ushort_t fflag; 91 int ioflag; 92 size32_t cnt; 93 ssize32_t sfv_len; 94 ssize32_t tmpcount; 95 u_offset_t sfv_off; 96 struct uio auio; 97 struct iovec aiov; 98 int i, error; 99 100 fflag = fp->f_flag; 101 vp = fp->f_vnode; 102 for (i = 0; i < copy_cnt; i++) { 103 104 if (ISSIG(curthread, JUSTLOOKING)) 105 return (EINTR); 106 107 /* 108 * Do similar checks as "write" as we are writing 109 * sfv_len bytes into "vp". 110 */ 111 sfv_len = (ssize32_t)sfv->sfv_len; 112 113 if (sfv_len == 0) { 114 sfv++; 115 continue; 116 } 117 118 if (sfv_len < 0) 119 return (EINVAL); 120 121 if (vp->v_type == VREG) { 122 if (*fileoff >= curproc->p_fsz_ctl) { 123 mutex_enter(&curproc->p_lock); 124 (void) rctl_action( 125 rctlproc_legacy[RLIMIT_FSIZE], 126 curproc->p_rctls, curproc, RCA_SAFE); 127 mutex_exit(&curproc->p_lock); 128 return (EFBIG); 129 } 130 131 if (*fileoff >= OFFSET_MAX(fp)) 132 return (EFBIG); 133 134 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 135 return (EINVAL); 136 } 137 138 tmpcount = *count + sfv_len; 139 if (tmpcount < 0) 140 return (EINVAL); 141 142 sfv_off = sfv->sfv_off; 143 144 auio.uio_extflg = UIO_COPY_DEFAULT; 145 if (sfv->sfv_fd == SFV_FD_SELF) { 146 aiov.iov_len = sfv_len; 147 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 148 auio.uio_loffset = *fileoff; 149 auio.uio_iovcnt = 1; 150 auio.uio_resid = sfv_len; 151 auio.uio_iov = &aiov; 152 auio.uio_segflg = UIO_USERSPACE; 153 auio.uio_llimit = curproc->p_fsz_ctl; 154 auio.uio_fmode = fflag; 155 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 156 while (sfv_len > 0) { 157 error = VOP_WRITE(vp, &auio, ioflag, 158 fp->f_cred, NULL); 159 cnt = sfv_len - auio.uio_resid; 160 sfv_len -= cnt; 161 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 162 if (vp->v_type == VREG) 163 *fileoff += cnt; 164 *count += cnt; 165 if (error != 0) 166 return (error); 167 } 168 } else { 169 file_t *ffp; 170 vnode_t *readvp; 171 size_t size; 172 caddr_t ptr; 173 174 if ((ffp = getf(sfv->sfv_fd)) == NULL) 175 return (EBADF); 176 177 if ((ffp->f_flag & FREAD) == 0) { 178 releasef(sfv->sfv_fd); 179 return (EBADF); 180 } 181 182 readvp = ffp->f_vnode; 183 if (readvp->v_type != VREG) { 184 releasef(sfv->sfv_fd); 185 return (EINVAL); 186 } 187 188 /* 189 * No point reading and writing to same vp, 190 * as long as both are regular files. readvp is not 191 * locked; but since we got it from an open file the 192 * contents will be valid during the time of access. 193 */ 194 if (vn_compare(vp, readvp)) { 195 releasef(sfv->sfv_fd); 196 return (EINVAL); 197 } 198 199 /* 200 * Optimize the regular file over 201 * the socket case. 202 */ 203 if (vp->v_type == VSOCK) { 204 error = sosendfile64(fp, ffp, sfv, 205 (ssize32_t *)&cnt); 206 *count += cnt; 207 if (error) 208 return (error); 209 sfv++; 210 continue; 211 } 212 213 /* 214 * Note: we assume readvp != vp. "vp" is already 215 * locked, and "readvp" must not be. 216 */ 217 if (readvp < vp) { 218 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 219 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 220 NULL); 221 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 222 } else { 223 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 224 NULL); 225 } 226 227 /* 228 * Same checks as in pread64. 229 */ 230 if (sfv_off > MAXOFFSET_T) { 231 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 232 releasef(sfv->sfv_fd); 233 return (EINVAL); 234 } 235 236 if (sfv_off + sfv_len > MAXOFFSET_T) 237 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 238 239 /* Find the native blocksize to transfer data */ 240 size = MIN(vp->v_vfsp->vfs_bsize, 241 readvp->v_vfsp->vfs_bsize); 242 size = sfv_len < size ? sfv_len : size; 243 ptr = kmem_alloc(size, KM_NOSLEEP); 244 if (ptr == NULL) { 245 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 246 releasef(sfv->sfv_fd); 247 return (ENOMEM); 248 } 249 250 while (sfv_len > 0) { 251 size_t iov_len; 252 253 iov_len = MIN(size, sfv_len); 254 aiov.iov_base = ptr; 255 aiov.iov_len = iov_len; 256 auio.uio_loffset = sfv_off; 257 auio.uio_iov = &aiov; 258 auio.uio_iovcnt = 1; 259 auio.uio_resid = iov_len; 260 auio.uio_segflg = UIO_SYSSPACE; 261 auio.uio_llimit = MAXOFFSET_T; 262 auio.uio_fmode = ffp->f_flag; 263 ioflag = auio.uio_fmode & 264 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 265 266 /* 267 * If read sync is not asked for, 268 * filter sync flags 269 */ 270 if ((ioflag & FRSYNC) == 0) 271 ioflag &= ~(FSYNC|FDSYNC); 272 error = VOP_READ(readvp, &auio, ioflag, 273 fp->f_cred, NULL); 274 if (error) { 275 kmem_free(ptr, size); 276 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 277 NULL); 278 releasef(sfv->sfv_fd); 279 return (error); 280 } 281 282 /* 283 * Check how must data was really read. 284 * Decrement the 'len' and increment the 285 * 'off' appropriately. 286 */ 287 cnt = iov_len - auio.uio_resid; 288 if (cnt == 0) { 289 /* 290 * If we were reading a pipe (currently 291 * not implemented), we may now lose 292 * data. 293 */ 294 kmem_free(ptr, size); 295 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 296 NULL); 297 releasef(sfv->sfv_fd); 298 return (EINVAL); 299 } 300 sfv_len -= cnt; 301 sfv_off += cnt; 302 303 aiov.iov_base = ptr; 304 aiov.iov_len = cnt; 305 auio.uio_loffset = *fileoff; 306 auio.uio_iov = &aiov; 307 auio.uio_iovcnt = 1; 308 auio.uio_resid = cnt; 309 auio.uio_segflg = UIO_SYSSPACE; 310 auio.uio_llimit = curproc->p_fsz_ctl; 311 auio.uio_fmode = fflag; 312 ioflag = auio.uio_fmode & 313 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 314 error = VOP_WRITE(vp, &auio, ioflag, 315 fp->f_cred, NULL); 316 317 /* 318 * Check how much data was written. Increment 319 * the 'len' and decrement the 'off' if all 320 * the data was not written. 321 */ 322 cnt -= auio.uio_resid; 323 sfv_len += auio.uio_resid; 324 sfv_off -= auio.uio_resid; 325 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 326 if (vp->v_type == VREG) 327 *fileoff += cnt; 328 *count += cnt; 329 if (error != 0) { 330 kmem_free(ptr, size); 331 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 332 NULL); 333 releasef(sfv->sfv_fd); 334 return (error); 335 } 336 } 337 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 338 releasef(sfv->sfv_fd); 339 kmem_free(ptr, size); 340 } 341 sfv++; 342 } 343 return (0); 344 } 345 346 ssize32_t 347 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 348 size32_t *xferred, int fildes) 349 { 350 u_offset_t fileoff; 351 int copy_cnt; 352 const struct ksendfilevec64 *copy_vec; 353 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 354 struct vnode *vp; 355 int error; 356 ssize32_t count = 0; 357 358 vp = fp->f_vnode; 359 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 360 361 copy_vec = vec; 362 fileoff = fp->f_offset; 363 364 do { 365 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 366 if (copyin(copy_vec, sfv, copy_cnt * 367 sizeof (struct ksendfilevec64))) { 368 error = EFAULT; 369 break; 370 } 371 372 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 373 if (error != 0) 374 break; 375 376 copy_vec += copy_cnt; 377 sfvcnt -= copy_cnt; 378 } while (sfvcnt > 0); 379 380 if (vp->v_type == VREG) 381 fp->f_offset += count; 382 383 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 384 if (copyout(&count, xferred, sizeof (count))) 385 error = EFAULT; 386 releasef(fildes); 387 if (error != 0) 388 return (set_errno(error)); 389 return (count); 390 } 391 #endif 392 393 int 394 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 395 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 396 { 397 struct vnode *vp; 398 struct uio auio; 399 struct iovec aiov; 400 ushort_t fflag; 401 int ioflag; 402 int i, error; 403 size_t cnt; 404 ssize_t sfv_len; 405 u_offset_t sfv_off; 406 #ifdef _SYSCALL32_IMPL 407 model_t model = get_udatamodel(); 408 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 409 MAXOFF32_T : MAXOFFSET_T; 410 #else 411 const u_offset_t maxoff = MAXOFF32_T; 412 #endif 413 mblk_t *dmp = NULL; 414 int wroff; 415 int buf_left = 0; 416 size_t iov_len; 417 mblk_t *head, *tmp; 418 size_t size = total_size; 419 size_t extra; 420 int tail_len; 421 struct nmsghdr msg; 422 423 fflag = fp->f_flag; 424 vp = fp->f_vnode; 425 426 ASSERT(vp->v_type == VSOCK); 427 ASSERT(maxblk > 0); 428 429 /* If nothing to send, return */ 430 if (total_size == 0) 431 return (0); 432 433 if (vp->v_stream != NULL) { 434 wroff = (int)vp->v_stream->sd_wroff; 435 tail_len = (int)vp->v_stream->sd_tail; 436 } else { 437 struct sonode *so; 438 439 so = VTOSO(vp); 440 wroff = so->so_proto_props.sopp_wroff; 441 tail_len = so->so_proto_props.sopp_tail; 442 } 443 444 extra = wroff + tail_len; 445 446 buf_left = MIN(total_size, maxblk); 447 head = dmp = allocb(buf_left + extra, BPRI_HI); 448 if (head == NULL) 449 return (ENOMEM); 450 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 451 bzero(&msg, sizeof (msg)); 452 453 auio.uio_extflg = UIO_COPY_DEFAULT; 454 for (i = 0; i < copy_cnt; i++) { 455 if (ISSIG(curthread, JUSTLOOKING)) { 456 freemsg(head); 457 return (EINTR); 458 } 459 460 /* 461 * Do similar checks as "write" as we are writing 462 * sfv_len bytes into "vp". 463 */ 464 sfv_len = (ssize_t)sfv->sfv_len; 465 466 if (sfv_len == 0) { 467 sfv++; 468 continue; 469 } 470 471 /* Check for overflow */ 472 #ifdef _SYSCALL32_IMPL 473 if (model == DATAMODEL_ILP32) { 474 if (((ssize32_t)(*count + sfv_len)) < 0) { 475 freemsg(head); 476 return (EINVAL); 477 } 478 } else 479 #endif 480 if ((*count + sfv_len) < 0) { 481 freemsg(head); 482 return (EINVAL); 483 } 484 485 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 486 487 if (sfv->sfv_fd == SFV_FD_SELF) { 488 while (sfv_len > 0) { 489 if (buf_left == 0) { 490 tmp = dmp; 491 buf_left = MIN(total_size, maxblk); 492 iov_len = MIN(buf_left, sfv_len); 493 dmp = allocb(buf_left + extra, BPRI_HI); 494 if (dmp == NULL) { 495 freemsg(head); 496 return (ENOMEM); 497 } 498 dmp->b_wptr = dmp->b_rptr = 499 dmp->b_rptr + wroff; 500 tmp->b_cont = dmp; 501 } else { 502 iov_len = MIN(buf_left, sfv_len); 503 } 504 505 aiov.iov_len = iov_len; 506 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 507 auio.uio_loffset = *fileoff; 508 auio.uio_iovcnt = 1; 509 auio.uio_resid = iov_len; 510 auio.uio_iov = &aiov; 511 auio.uio_segflg = UIO_USERSPACE; 512 auio.uio_llimit = curproc->p_fsz_ctl; 513 auio.uio_fmode = fflag; 514 515 buf_left -= iov_len; 516 total_size -= iov_len; 517 sfv_len -= iov_len; 518 sfv_off += iov_len; 519 520 error = uiomove((caddr_t)dmp->b_wptr, 521 iov_len, UIO_WRITE, &auio); 522 if (error != 0) { 523 freemsg(head); 524 return (error); 525 } 526 dmp->b_wptr += iov_len; 527 } 528 } else { 529 file_t *ffp; 530 vnode_t *readvp; 531 532 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 533 freemsg(head); 534 return (EBADF); 535 } 536 537 if ((ffp->f_flag & FREAD) == 0) { 538 releasef(sfv->sfv_fd); 539 freemsg(head); 540 return (EACCES); 541 } 542 543 readvp = ffp->f_vnode; 544 if (readvp->v_type != VREG) { 545 releasef(sfv->sfv_fd); 546 freemsg(head); 547 return (EINVAL); 548 } 549 550 /* 551 * No point reading and writing to same vp, 552 * as long as both are regular files. readvp is not 553 * locked; but since we got it from an open file the 554 * contents will be valid during the time of access. 555 */ 556 557 if (vn_compare(vp, readvp)) { 558 releasef(sfv->sfv_fd); 559 freemsg(head); 560 return (EINVAL); 561 } 562 563 /* 564 * Note: we assume readvp != vp. "vp" is already 565 * locked, and "readvp" must not be. 566 */ 567 568 if (readvp < vp) { 569 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 570 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 571 NULL); 572 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 573 } else { 574 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 575 NULL); 576 } 577 578 /* Same checks as in pread */ 579 if (sfv_off > maxoff) { 580 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 581 releasef(sfv->sfv_fd); 582 freemsg(head); 583 return (EINVAL); 584 } 585 if (sfv_off + sfv_len > maxoff) { 586 total_size -= (sfv_off + sfv_len - maxoff); 587 sfv_len = (ssize_t)((offset_t)maxoff - 588 sfv_off); 589 } 590 591 while (sfv_len > 0) { 592 if (buf_left == 0) { 593 tmp = dmp; 594 buf_left = MIN(total_size, maxblk); 595 iov_len = MIN(buf_left, sfv_len); 596 dmp = allocb(buf_left + extra, BPRI_HI); 597 if (dmp == NULL) { 598 VOP_RWUNLOCK(readvp, 599 V_WRITELOCK_FALSE, NULL); 600 releasef(sfv->sfv_fd); 601 freemsg(head); 602 return (ENOMEM); 603 } 604 dmp->b_wptr = dmp->b_rptr = 605 dmp->b_rptr + wroff; 606 tmp->b_cont = dmp; 607 } else { 608 iov_len = MIN(buf_left, sfv_len); 609 } 610 aiov.iov_base = (caddr_t)dmp->b_wptr; 611 aiov.iov_len = iov_len; 612 auio.uio_loffset = sfv_off; 613 auio.uio_iov = &aiov; 614 auio.uio_iovcnt = 1; 615 auio.uio_resid = iov_len; 616 auio.uio_segflg = UIO_SYSSPACE; 617 auio.uio_llimit = MAXOFFSET_T; 618 auio.uio_fmode = ffp->f_flag; 619 ioflag = auio.uio_fmode & 620 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 621 622 /* 623 * If read sync is not asked for, 624 * filter sync flags 625 */ 626 if ((ioflag & FRSYNC) == 0) 627 ioflag &= ~(FSYNC|FDSYNC); 628 error = VOP_READ(readvp, &auio, ioflag, 629 fp->f_cred, NULL); 630 if (error != 0) { 631 /* 632 * If we were reading a pipe (currently 633 * not implemented), we may now loose 634 * data. 635 */ 636 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 637 NULL); 638 releasef(sfv->sfv_fd); 639 freemsg(head); 640 return (error); 641 } 642 643 /* 644 * Check how much data was really read. 645 * Decrement the 'len' and increment the 646 * 'off' appropriately. 647 */ 648 cnt = iov_len - auio.uio_resid; 649 if (cnt == 0) { 650 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 651 NULL); 652 releasef(sfv->sfv_fd); 653 freemsg(head); 654 return (EINVAL); 655 } 656 sfv_len -= cnt; 657 sfv_off += cnt; 658 total_size -= cnt; 659 buf_left -= cnt; 660 661 dmp->b_wptr += cnt; 662 } 663 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 664 releasef(sfv->sfv_fd); 665 } 666 sfv++; 667 } 668 669 ASSERT(total_size == 0); 670 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); 671 if (error != 0) { 672 if (head != NULL) 673 freemsg(head); 674 return (error); 675 } 676 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 677 *count += size; 678 679 return (0); 680 } 681 682 683 int 684 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 685 int copy_cnt, ssize_t *count) 686 { 687 struct vnode *vp; 688 struct uio auio; 689 struct iovec aiov; 690 ushort_t fflag; 691 int ioflag; 692 int i, error; 693 size_t cnt; 694 ssize_t sfv_len; 695 u_offset_t sfv_off; 696 #ifdef _SYSCALL32_IMPL 697 model_t model = get_udatamodel(); 698 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 699 MAXOFF32_T : MAXOFFSET_T; 700 #else 701 const u_offset_t maxoff = MAXOFF32_T; 702 #endif 703 mblk_t *dmp = NULL; 704 char *buf = NULL; 705 size_t extra; 706 int maxblk, wroff, tail_len; 707 struct sonode *so; 708 stdata_t *stp; 709 struct nmsghdr msg; 710 711 fflag = fp->f_flag; 712 vp = fp->f_vnode; 713 714 if (vp->v_type == VSOCK) { 715 so = VTOSO(vp); 716 if (vp->v_stream != NULL) { 717 stp = vp->v_stream; 718 wroff = (int)stp->sd_wroff; 719 tail_len = (int)stp->sd_tail; 720 maxblk = (int)stp->sd_maxblk; 721 } else { 722 stp = NULL; 723 wroff = so->so_proto_props.sopp_wroff; 724 tail_len = so->so_proto_props.sopp_tail; 725 maxblk = so->so_proto_props.sopp_maxblk; 726 } 727 extra = wroff + tail_len; 728 } 729 730 bzero(&msg, sizeof (msg)); 731 auio.uio_extflg = UIO_COPY_DEFAULT; 732 for (i = 0; i < copy_cnt; i++) { 733 if (ISSIG(curthread, JUSTLOOKING)) 734 return (EINTR); 735 736 /* 737 * Do similar checks as "write" as we are writing 738 * sfv_len bytes into "vp". 739 */ 740 sfv_len = (ssize_t)sfv->sfv_len; 741 742 if (sfv_len == 0) { 743 sfv++; 744 continue; 745 } 746 747 if (vp->v_type == VREG) { 748 if (*fileoff >= curproc->p_fsz_ctl) { 749 mutex_enter(&curproc->p_lock); 750 (void) rctl_action( 751 rctlproc_legacy[RLIMIT_FSIZE], 752 curproc->p_rctls, curproc, RCA_SAFE); 753 mutex_exit(&curproc->p_lock); 754 755 return (EFBIG); 756 } 757 758 if (*fileoff >= maxoff) 759 return (EFBIG); 760 761 if (*fileoff + sfv_len > maxoff) 762 return (EINVAL); 763 } 764 765 /* Check for overflow */ 766 #ifdef _SYSCALL32_IMPL 767 if (model == DATAMODEL_ILP32) { 768 if (((ssize32_t)(*count + sfv_len)) < 0) 769 return (EINVAL); 770 } else 771 #endif 772 if ((*count + sfv_len) < 0) 773 return (EINVAL); 774 775 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 776 777 if (sfv->sfv_fd == SFV_FD_SELF) { 778 if (vp->v_type == VSOCK) { 779 while (sfv_len > 0) { 780 size_t iov_len; 781 782 iov_len = sfv_len; 783 /* 784 * Socket filters can limit the mblk 785 * size, so limit reads to maxblk if 786 * there are filters present. 787 */ 788 if ((!SOCK_IS_NONSTR(so) && 789 _SOTOTPI(so)->sti_kssl_ctx 790 != NULL) || 791 (so->so_filter_active > 0 && 792 maxblk != INFPSZ)) 793 iov_len = MIN(iov_len, maxblk); 794 795 aiov.iov_len = iov_len; 796 aiov.iov_base = 797 (caddr_t)(uintptr_t)sfv_off; 798 799 auio.uio_iov = &aiov; 800 auio.uio_iovcnt = 1; 801 auio.uio_loffset = *fileoff; 802 auio.uio_segflg = UIO_USERSPACE; 803 auio.uio_fmode = fflag; 804 auio.uio_llimit = curproc->p_fsz_ctl; 805 auio.uio_resid = iov_len; 806 807 dmp = allocb(iov_len + extra, BPRI_HI); 808 if (dmp == NULL) 809 return (ENOMEM); 810 dmp->b_wptr = dmp->b_rptr = 811 dmp->b_rptr + wroff; 812 error = uiomove((caddr_t)dmp->b_wptr, 813 iov_len, UIO_WRITE, &auio); 814 if (error != 0) { 815 freeb(dmp); 816 return (error); 817 } 818 dmp->b_wptr += iov_len; 819 error = socket_sendmblk(VTOSO(vp), 820 &msg, fflag, CRED(), &dmp); 821 822 if (error != 0) { 823 if (dmp != NULL) 824 freeb(dmp); 825 return (error); 826 } 827 ttolwp(curthread)->lwp_ru.ioch += 828 (ulong_t)iov_len; 829 *count += iov_len; 830 sfv_len -= iov_len; 831 sfv_off += iov_len; 832 } 833 } else { 834 aiov.iov_len = sfv_len; 835 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 836 837 auio.uio_iov = &aiov; 838 auio.uio_iovcnt = 1; 839 auio.uio_loffset = *fileoff; 840 auio.uio_segflg = UIO_USERSPACE; 841 auio.uio_fmode = fflag; 842 auio.uio_llimit = curproc->p_fsz_ctl; 843 auio.uio_resid = sfv_len; 844 845 ioflag = auio.uio_fmode & 846 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 847 while (sfv_len > 0) { 848 error = VOP_WRITE(vp, &auio, ioflag, 849 fp->f_cred, NULL); 850 cnt = sfv_len - auio.uio_resid; 851 sfv_len -= cnt; 852 ttolwp(curthread)->lwp_ru.ioch += 853 (ulong_t)cnt; 854 *fileoff += cnt; 855 *count += cnt; 856 if (error != 0) 857 return (error); 858 } 859 } 860 } else { 861 int segmapit = 0; 862 file_t *ffp; 863 vnode_t *readvp; 864 struct vnode *realvp; 865 size_t size; 866 caddr_t ptr; 867 868 if ((ffp = getf(sfv->sfv_fd)) == NULL) 869 return (EBADF); 870 871 if ((ffp->f_flag & FREAD) == 0) { 872 releasef(sfv->sfv_fd); 873 return (EBADF); 874 } 875 876 readvp = ffp->f_vnode; 877 if (VOP_REALVP(readvp, &realvp, NULL) == 0) 878 readvp = realvp; 879 if (readvp->v_type != VREG) { 880 releasef(sfv->sfv_fd); 881 return (EINVAL); 882 } 883 884 /* 885 * No point reading and writing to same vp, 886 * as long as both are regular files. readvp is not 887 * locked; but since we got it from an open file the 888 * contents will be valid during the time of access. 889 */ 890 if (vn_compare(vp, readvp)) { 891 releasef(sfv->sfv_fd); 892 return (EINVAL); 893 } 894 895 /* 896 * Note: we assume readvp != vp. "vp" is already 897 * locked, and "readvp" must not be. 898 */ 899 if (readvp < vp) { 900 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 901 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 902 NULL); 903 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 904 } else { 905 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 906 NULL); 907 } 908 909 /* Same checks as in pread */ 910 if (sfv_off > maxoff) { 911 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 912 releasef(sfv->sfv_fd); 913 return (EINVAL); 914 } 915 if (sfv_off + sfv_len > maxoff) { 916 sfv_len = (ssize_t)((offset_t)maxoff - 917 sfv_off); 918 } 919 /* Find the native blocksize to transfer data */ 920 size = MIN(vp->v_vfsp->vfs_bsize, 921 readvp->v_vfsp->vfs_bsize); 922 size = sfv_len < size ? sfv_len : size; 923 924 if (vp->v_type != VSOCK) { 925 segmapit = 0; 926 buf = kmem_alloc(size, KM_NOSLEEP); 927 if (buf == NULL) { 928 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 929 NULL); 930 releasef(sfv->sfv_fd); 931 return (ENOMEM); 932 } 933 } else { 934 uint_t copyflag; 935 936 copyflag = stp != NULL ? stp->sd_copyflag : 937 so->so_proto_props.sopp_zcopyflag; 938 939 /* 940 * Socket filters can limit the mblk size, 941 * so limit reads to maxblk if there are 942 * filters present. 943 */ 944 if ((!SOCK_IS_NONSTR(so) && 945 _SOTOTPI(so)->sti_kssl_ctx != NULL) || 946 (so->so_filter_active > 0 && 947 maxblk != INFPSZ)) 948 size = MIN(size, maxblk); 949 950 if (vn_has_flocks(readvp) || 951 readvp->v_flag & VNOMAP || 952 copyflag & STZCVMUNSAFE) { 953 segmapit = 0; 954 } else if (copyflag & STZCVMSAFE) { 955 segmapit = 1; 956 } else { 957 int on = 1; 958 if (socket_setsockopt(VTOSO(vp), 959 SOL_SOCKET, SO_SND_COPYAVOID, 960 &on, sizeof (on), CRED()) == 0) 961 segmapit = 1; 962 } 963 } 964 965 if (segmapit) { 966 boolean_t nowait; 967 968 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 969 error = snf_segmap(fp, readvp, sfv_off, 970 (u_offset_t)sfv_len, (ssize_t *)&cnt, 971 nowait); 972 releasef(sfv->sfv_fd); 973 *count += cnt; 974 if (error) 975 return (error); 976 sfv++; 977 continue; 978 } 979 980 while (sfv_len > 0) { 981 size_t iov_len; 982 983 iov_len = MIN(size, sfv_len); 984 985 if (vp->v_type == VSOCK) { 986 dmp = allocb(iov_len + extra, BPRI_HI); 987 if (dmp == NULL) { 988 VOP_RWUNLOCK(readvp, 989 V_WRITELOCK_FALSE, NULL); 990 releasef(sfv->sfv_fd); 991 return (ENOMEM); 992 } 993 dmp->b_wptr = dmp->b_rptr = 994 dmp->b_rptr + wroff; 995 ptr = (caddr_t)dmp->b_rptr; 996 } else { 997 ptr = buf; 998 } 999 1000 aiov.iov_base = ptr; 1001 aiov.iov_len = iov_len; 1002 auio.uio_loffset = sfv_off; 1003 auio.uio_iov = &aiov; 1004 auio.uio_iovcnt = 1; 1005 auio.uio_resid = iov_len; 1006 auio.uio_segflg = UIO_SYSSPACE; 1007 auio.uio_llimit = MAXOFFSET_T; 1008 auio.uio_fmode = ffp->f_flag; 1009 ioflag = auio.uio_fmode & 1010 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1011 1012 /* 1013 * If read sync is not asked for, 1014 * filter sync flags 1015 */ 1016 if ((ioflag & FRSYNC) == 0) 1017 ioflag &= ~(FSYNC|FDSYNC); 1018 error = VOP_READ(readvp, &auio, ioflag, 1019 fp->f_cred, NULL); 1020 if (error != 0) { 1021 /* 1022 * If we were reading a pipe (currently 1023 * not implemented), we may now lose 1024 * data. 1025 */ 1026 if (vp->v_type == VSOCK) 1027 freeb(dmp); 1028 else 1029 kmem_free(buf, size); 1030 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1031 NULL); 1032 releasef(sfv->sfv_fd); 1033 return (error); 1034 } 1035 1036 /* 1037 * Check how much data was really read. 1038 * Decrement the 'len' and increment the 1039 * 'off' appropriately. 1040 */ 1041 cnt = iov_len - auio.uio_resid; 1042 if (cnt == 0) { 1043 if (vp->v_type == VSOCK) 1044 freeb(dmp); 1045 else 1046 kmem_free(buf, size); 1047 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1048 NULL); 1049 releasef(sfv->sfv_fd); 1050 return (EINVAL); 1051 } 1052 sfv_len -= cnt; 1053 sfv_off += cnt; 1054 1055 if (vp->v_type == VSOCK) { 1056 dmp->b_wptr = dmp->b_rptr + cnt; 1057 1058 error = socket_sendmblk(VTOSO(vp), 1059 &msg, fflag, CRED(), &dmp); 1060 1061 if (error != 0) { 1062 if (dmp != NULL) 1063 freeb(dmp); 1064 VOP_RWUNLOCK(readvp, 1065 V_WRITELOCK_FALSE, NULL); 1066 releasef(sfv->sfv_fd); 1067 return (error); 1068 } 1069 1070 ttolwp(curthread)->lwp_ru.ioch += 1071 (ulong_t)cnt; 1072 *count += cnt; 1073 } else { 1074 1075 aiov.iov_base = ptr; 1076 aiov.iov_len = cnt; 1077 auio.uio_loffset = *fileoff; 1078 auio.uio_resid = cnt; 1079 auio.uio_iov = &aiov; 1080 auio.uio_iovcnt = 1; 1081 auio.uio_segflg = UIO_SYSSPACE; 1082 auio.uio_llimit = curproc->p_fsz_ctl; 1083 auio.uio_fmode = fflag; 1084 ioflag = auio.uio_fmode & 1085 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1086 error = VOP_WRITE(vp, &auio, ioflag, 1087 fp->f_cred, NULL); 1088 1089 /* 1090 * Check how much data was written. 1091 * Increment the 'len' and decrement the 1092 * 'off' if all the data was not 1093 * written. 1094 */ 1095 cnt -= auio.uio_resid; 1096 sfv_len += auio.uio_resid; 1097 sfv_off -= auio.uio_resid; 1098 ttolwp(curthread)->lwp_ru.ioch += 1099 (ulong_t)cnt; 1100 *fileoff += cnt; 1101 *count += cnt; 1102 if (error != 0) { 1103 kmem_free(buf, size); 1104 VOP_RWUNLOCK(readvp, 1105 V_WRITELOCK_FALSE, NULL); 1106 releasef(sfv->sfv_fd); 1107 return (error); 1108 } 1109 } 1110 } 1111 if (buf) { 1112 kmem_free(buf, size); 1113 buf = NULL; 1114 } 1115 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 1116 releasef(sfv->sfv_fd); 1117 } 1118 sfv++; 1119 } 1120 return (0); 1121 } 1122 1123 ssize_t 1124 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1125 size_t *xferred) 1126 { 1127 int error = 0; 1128 int first_vector_error = 0; 1129 file_t *fp; 1130 struct vnode *vp; 1131 struct sonode *so; 1132 u_offset_t fileoff; 1133 int copy_cnt; 1134 const struct sendfilevec *copy_vec; 1135 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1136 ssize_t count = 0; 1137 #ifdef _SYSCALL32_IMPL 1138 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1139 #endif 1140 ssize_t total_size; 1141 int i; 1142 boolean_t is_sock = B_FALSE; 1143 int maxblk = 0; 1144 1145 if (sfvcnt <= 0) 1146 return (set_errno(EINVAL)); 1147 1148 if ((fp = getf(fildes)) == NULL) 1149 return (set_errno(EBADF)); 1150 1151 if (((fp->f_flag) & FWRITE) == 0) { 1152 error = EBADF; 1153 goto err; 1154 } 1155 1156 fileoff = fp->f_offset; 1157 vp = fp->f_vnode; 1158 1159 switch (vp->v_type) { 1160 case VSOCK: 1161 so = VTOSO(vp); 1162 is_sock = B_TRUE; 1163 if (SOCK_IS_NONSTR(so)) { 1164 maxblk = so->so_proto_props.sopp_maxblk; 1165 } else { 1166 maxblk = (int)vp->v_stream->sd_maxblk; 1167 } 1168 break; 1169 case VREG: 1170 break; 1171 default: 1172 error = EINVAL; 1173 goto err; 1174 } 1175 1176 switch (opcode) { 1177 case SENDFILEV : 1178 break; 1179 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1180 case SENDFILEV64 : 1181 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1182 (size32_t *)xferred, fildes)); 1183 #endif 1184 default : 1185 error = ENOSYS; 1186 break; 1187 } 1188 1189 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1190 copy_vec = vec; 1191 1192 do { 1193 total_size = 0; 1194 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1195 #ifdef _SYSCALL32_IMPL 1196 /* 32-bit callers need to have their iovec expanded. */ 1197 if (get_udatamodel() == DATAMODEL_ILP32) { 1198 if (copyin(copy_vec, sfv32, 1199 copy_cnt * sizeof (ksendfilevec32_t))) { 1200 error = EFAULT; 1201 break; 1202 } 1203 1204 for (i = 0; i < copy_cnt; i++) { 1205 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1206 sfv[i].sfv_off = 1207 (off_t)(uint32_t)sfv32[i].sfv_off; 1208 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1209 total_size += sfv[i].sfv_len; 1210 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1211 /* 1212 * Individual elements of the vector must not 1213 * wrap or overflow, as later math is signed. 1214 * Equally total_size needs to be checked after 1215 * each vector is added in, to be sure that 1216 * rogue values haven't overflowed the counter. 1217 */ 1218 if (((ssize32_t)sfv[i].sfv_len < 0) || 1219 ((ssize32_t)total_size < 0)) { 1220 /* 1221 * Truncate the vector to send data 1222 * described by elements before the 1223 * error. 1224 */ 1225 copy_cnt = i; 1226 first_vector_error = EINVAL; 1227 /* total_size can't be trusted */ 1228 if ((ssize32_t)total_size < 0) 1229 error = EINVAL; 1230 break; 1231 } 1232 } 1233 /* Nothing to do, process errors */ 1234 if (copy_cnt == 0) 1235 break; 1236 1237 } else { 1238 #endif 1239 if (copyin(copy_vec, sfv, 1240 copy_cnt * sizeof (sendfilevec_t))) { 1241 error = EFAULT; 1242 break; 1243 } 1244 1245 for (i = 0; i < copy_cnt; i++) { 1246 total_size += sfv[i].sfv_len; 1247 /* 1248 * Individual elements of the vector must not 1249 * wrap or overflow, as later math is signed. 1250 * Equally total_size needs to be checked after 1251 * each vector is added in, to be sure that 1252 * rogue values haven't overflowed the counter. 1253 */ 1254 if (((ssize_t)sfv[i].sfv_len < 0) || 1255 (total_size < 0)) { 1256 /* 1257 * Truncate the vector to send data 1258 * described by elements before the 1259 * error. 1260 */ 1261 copy_cnt = i; 1262 first_vector_error = EINVAL; 1263 /* total_size can't be trusted */ 1264 if (total_size < 0) 1265 error = EINVAL; 1266 break; 1267 } 1268 } 1269 /* Nothing to do, process errors */ 1270 if (copy_cnt == 0) 1271 break; 1272 #ifdef _SYSCALL32_IMPL 1273 } 1274 #endif 1275 1276 /* 1277 * The task between deciding to use sendvec_small_chunk 1278 * and sendvec_chunk is dependant on multiple things: 1279 * 1280 * i) latency is important for smaller files. So if the 1281 * data is smaller than 'tcp_slow_start_initial' times 1282 * maxblk, then use sendvec_small_chunk which creates 1283 * maxblk size mblks and chains them together and sends 1284 * them to TCP in one shot. It also leaves 'wroff' size 1285 * space for the headers in each mblk. 1286 * 1287 * ii) for total size bigger than 'tcp_slow_start_initial' 1288 * time maxblk, its probably real file data which is 1289 * dominating. So its better to use sendvec_chunk because 1290 * performance goes to dog if we don't do pagesize reads. 1291 * sendvec_chunk will do pagesize reads and write them 1292 * in pagesize mblks to TCP. 1293 * 1294 * Side Notes: A write to file has not been optimized. 1295 * Future zero copy code will plugin into sendvec_chunk 1296 * only because doing zero copy for files smaller then 1297 * pagesize is useless. 1298 * 1299 * Note, if socket has NL7C enabled then call NL7C's 1300 * senfilev() function to consume the sfv[]. 1301 */ 1302 if (is_sock) { 1303 if (!SOCK_IS_NONSTR(so) && 1304 _SOTOTPI(so)->sti_nl7c_flags != 0) { 1305 error = nl7c_sendfilev(so, &fileoff, 1306 sfv, copy_cnt, &count); 1307 } else if ((total_size <= (4 * maxblk)) && 1308 error == 0) { 1309 error = sendvec_small_chunk(fp, 1310 &fileoff, sfv, copy_cnt, 1311 total_size, maxblk, &count); 1312 } else { 1313 error = sendvec_chunk(fp, &fileoff, 1314 sfv, copy_cnt, &count); 1315 } 1316 } else { 1317 ASSERT(vp->v_type == VREG); 1318 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1319 &count); 1320 } 1321 1322 1323 #ifdef _SYSCALL32_IMPL 1324 if (get_udatamodel() == DATAMODEL_ILP32) 1325 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1326 (copy_cnt * sizeof (ksendfilevec32_t))); 1327 else 1328 #endif 1329 copy_vec += copy_cnt; 1330 sfvcnt -= copy_cnt; 1331 1332 /* Process all vector members up to first error */ 1333 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1334 1335 if (vp->v_type == VREG) 1336 fp->f_offset += count; 1337 1338 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1339 1340 #ifdef _SYSCALL32_IMPL 1341 if (get_udatamodel() == DATAMODEL_ILP32) { 1342 ssize32_t count32 = (ssize32_t)count; 1343 if (copyout(&count32, xferred, sizeof (count32))) 1344 error = EFAULT; 1345 releasef(fildes); 1346 if (error != 0) 1347 return (set_errno(error)); 1348 if (first_vector_error != 0) 1349 return (set_errno(first_vector_error)); 1350 return (count32); 1351 } 1352 #endif 1353 if (copyout(&count, xferred, sizeof (count))) 1354 error = EFAULT; 1355 releasef(fildes); 1356 if (error != 0) 1357 return (set_errno(error)); 1358 if (first_vector_error != 0) 1359 return (set_errno(first_vector_error)); 1360 return (count); 1361 err: 1362 ASSERT(error != 0); 1363 releasef(fildes); 1364 return (set_errno(error)); 1365 } 1366