1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/t_lock.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/buf.h> 31 #include <sys/conf.h> 32 #include <sys/cred.h> 33 #include <sys/kmem.h> 34 #include <sys/sysmacros.h> 35 #include <sys/vfs.h> 36 #include <sys/vnode.h> 37 #include <sys/debug.h> 38 #include <sys/errno.h> 39 #include <sys/time.h> 40 #include <sys/file.h> 41 #include <sys/open.h> 42 #include <sys/user.h> 43 #include <sys/termios.h> 44 #include <sys/stream.h> 45 #include <sys/strsubr.h> 46 #include <sys/sunddi.h> 47 #include <sys/esunddi.h> 48 #include <sys/flock.h> 49 #include <sys/modctl.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vmsystm.h> 52 53 #include <sys/socket.h> 54 #include <sys/socketvar.h> 55 #include <fs/sockfs/sockcommon.h> 56 #include <fs/sockfs/socktpi.h> 57 58 #include <netinet/in.h> 59 #include <sys/sendfile.h> 60 #include <sys/un.h> 61 #include <sys/tihdr.h> 62 #include <sys/atomic.h> 63 64 #include <inet/common.h> 65 #include <inet/ip.h> 66 #include <inet/ip6.h> 67 #include <inet/tcp.h> 68 69 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 70 ssize32_t *); 71 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, 72 int, ssize_t *); 73 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, 74 boolean_t); 75 extern sotpi_info_t *sotpi_sototpi(struct sonode *); 76 77 #define SEND_MAX_CHUNK 16 78 79 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 80 /* 81 * 64 bit offsets for 32 bit applications only running either on 82 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 83 * more than 2GB of data. 84 */ 85 int 86 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 87 int copy_cnt, ssize32_t *count) 88 { 89 struct vnode *vp; 90 ushort_t fflag; 91 int ioflag; 92 size32_t cnt; 93 ssize32_t sfv_len; 94 ssize32_t tmpcount; 95 u_offset_t sfv_off; 96 struct uio auio; 97 struct iovec aiov; 98 int i, error; 99 100 fflag = fp->f_flag; 101 vp = fp->f_vnode; 102 for (i = 0; i < copy_cnt; i++) { 103 104 if (ISSIG(curthread, JUSTLOOKING)) 105 return (EINTR); 106 107 /* 108 * Do similar checks as "write" as we are writing 109 * sfv_len bytes into "vp". 110 */ 111 sfv_len = (ssize32_t)sfv->sfv_len; 112 113 if (sfv_len == 0) { 114 sfv++; 115 continue; 116 } 117 118 if (sfv_len < 0) 119 return (EINVAL); 120 121 if (vp->v_type == VREG) { 122 if (*fileoff >= curproc->p_fsz_ctl) { 123 mutex_enter(&curproc->p_lock); 124 (void) rctl_action( 125 rctlproc_legacy[RLIMIT_FSIZE], 126 curproc->p_rctls, curproc, RCA_SAFE); 127 mutex_exit(&curproc->p_lock); 128 return (EFBIG); 129 } 130 131 if (*fileoff >= OFFSET_MAX(fp)) 132 return (EFBIG); 133 134 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 135 return (EINVAL); 136 } 137 138 tmpcount = *count + sfv_len; 139 if (tmpcount < 0) 140 return (EINVAL); 141 142 sfv_off = sfv->sfv_off; 143 144 auio.uio_extflg = UIO_COPY_DEFAULT; 145 if (sfv->sfv_fd == SFV_FD_SELF) { 146 aiov.iov_len = sfv_len; 147 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 148 auio.uio_loffset = *fileoff; 149 auio.uio_iovcnt = 1; 150 auio.uio_resid = sfv_len; 151 auio.uio_iov = &aiov; 152 auio.uio_segflg = UIO_USERSPACE; 153 auio.uio_llimit = curproc->p_fsz_ctl; 154 auio.uio_fmode = fflag; 155 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 156 while (sfv_len > 0) { 157 error = VOP_WRITE(vp, &auio, ioflag, 158 fp->f_cred, NULL); 159 cnt = sfv_len - auio.uio_resid; 160 sfv_len -= cnt; 161 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 162 if (vp->v_type == VREG) 163 *fileoff += cnt; 164 *count += cnt; 165 if (error != 0) 166 return (error); 167 } 168 } else { 169 file_t *ffp; 170 vnode_t *readvp; 171 size_t size; 172 caddr_t ptr; 173 174 if ((ffp = getf(sfv->sfv_fd)) == NULL) 175 return (EBADF); 176 177 if ((ffp->f_flag & FREAD) == 0) { 178 releasef(sfv->sfv_fd); 179 return (EBADF); 180 } 181 182 readvp = ffp->f_vnode; 183 if (readvp->v_type != VREG) { 184 releasef(sfv->sfv_fd); 185 return (EINVAL); 186 } 187 188 /* 189 * No point reading and writing to same vp, 190 * as long as both are regular files. readvp is not 191 * locked; but since we got it from an open file the 192 * contents will be valid during the time of access. 193 */ 194 if (vn_compare(vp, readvp)) { 195 releasef(sfv->sfv_fd); 196 return (EINVAL); 197 } 198 199 /* 200 * Optimize the regular file over 201 * the socket case. 202 */ 203 if (vp->v_type == VSOCK) { 204 error = sosendfile64(fp, ffp, sfv, 205 (ssize32_t *)&cnt); 206 *count += cnt; 207 if (error) 208 return (error); 209 sfv++; 210 continue; 211 } 212 213 /* 214 * Note: we assume readvp != vp. "vp" is already 215 * locked, and "readvp" must not be. 216 */ 217 if (readvp < vp) { 218 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 219 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 220 NULL); 221 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 222 } else { 223 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 224 NULL); 225 } 226 227 /* 228 * Same checks as in pread64. 229 */ 230 if (sfv_off > MAXOFFSET_T) { 231 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 232 releasef(sfv->sfv_fd); 233 return (EINVAL); 234 } 235 236 if (sfv_off + sfv_len > MAXOFFSET_T) 237 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 238 239 /* Find the native blocksize to transfer data */ 240 size = MIN(vp->v_vfsp->vfs_bsize, 241 readvp->v_vfsp->vfs_bsize); 242 size = sfv_len < size ? sfv_len : size; 243 ptr = kmem_alloc(size, KM_NOSLEEP); 244 if (ptr == NULL) { 245 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 246 releasef(sfv->sfv_fd); 247 return (ENOMEM); 248 } 249 250 while (sfv_len > 0) { 251 size_t iov_len; 252 253 iov_len = MIN(size, sfv_len); 254 aiov.iov_base = ptr; 255 aiov.iov_len = iov_len; 256 auio.uio_loffset = sfv_off; 257 auio.uio_iov = &aiov; 258 auio.uio_iovcnt = 1; 259 auio.uio_resid = iov_len; 260 auio.uio_segflg = UIO_SYSSPACE; 261 auio.uio_llimit = MAXOFFSET_T; 262 auio.uio_fmode = ffp->f_flag; 263 ioflag = auio.uio_fmode & 264 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 265 266 /* 267 * If read sync is not asked for, 268 * filter sync flags 269 */ 270 if ((ioflag & FRSYNC) == 0) 271 ioflag &= ~(FSYNC|FDSYNC); 272 error = VOP_READ(readvp, &auio, ioflag, 273 fp->f_cred, NULL); 274 if (error) { 275 kmem_free(ptr, size); 276 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 277 NULL); 278 releasef(sfv->sfv_fd); 279 return (error); 280 } 281 282 /* 283 * Check how must data was really read. 284 * Decrement the 'len' and increment the 285 * 'off' appropriately. 286 */ 287 cnt = iov_len - auio.uio_resid; 288 if (cnt == 0) { 289 /* 290 * If we were reading a pipe (currently 291 * not implemented), we may now lose 292 * data. 293 */ 294 kmem_free(ptr, size); 295 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 296 NULL); 297 releasef(sfv->sfv_fd); 298 return (EINVAL); 299 } 300 sfv_len -= cnt; 301 sfv_off += cnt; 302 303 aiov.iov_base = ptr; 304 aiov.iov_len = cnt; 305 auio.uio_loffset = *fileoff; 306 auio.uio_iov = &aiov; 307 auio.uio_iovcnt = 1; 308 auio.uio_resid = cnt; 309 auio.uio_segflg = UIO_SYSSPACE; 310 auio.uio_llimit = curproc->p_fsz_ctl; 311 auio.uio_fmode = fflag; 312 ioflag = auio.uio_fmode & 313 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 314 error = VOP_WRITE(vp, &auio, ioflag, 315 fp->f_cred, NULL); 316 317 /* 318 * Check how much data was written. Increment 319 * the 'len' and decrement the 'off' if all 320 * the data was not written. 321 */ 322 cnt -= auio.uio_resid; 323 sfv_len += auio.uio_resid; 324 sfv_off -= auio.uio_resid; 325 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 326 if (vp->v_type == VREG) 327 *fileoff += cnt; 328 *count += cnt; 329 if (error != 0) { 330 kmem_free(ptr, size); 331 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 332 NULL); 333 releasef(sfv->sfv_fd); 334 return (error); 335 } 336 } 337 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 338 releasef(sfv->sfv_fd); 339 kmem_free(ptr, size); 340 } 341 sfv++; 342 } 343 return (0); 344 } 345 346 ssize32_t 347 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 348 size32_t *xferred, int fildes) 349 { 350 u_offset_t fileoff; 351 int copy_cnt; 352 const struct ksendfilevec64 *copy_vec; 353 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 354 struct vnode *vp; 355 int error; 356 ssize32_t count = 0; 357 358 vp = fp->f_vnode; 359 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 360 361 copy_vec = vec; 362 fileoff = fp->f_offset; 363 364 do { 365 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 366 if (copyin(copy_vec, sfv, copy_cnt * 367 sizeof (struct ksendfilevec64))) { 368 error = EFAULT; 369 break; 370 } 371 372 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 373 if (error != 0) 374 break; 375 376 copy_vec += copy_cnt; 377 sfvcnt -= copy_cnt; 378 } while (sfvcnt > 0); 379 380 if (vp->v_type == VREG) 381 fp->f_offset += count; 382 383 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 384 if (copyout(&count, xferred, sizeof (count))) 385 error = EFAULT; 386 releasef(fildes); 387 if (error != 0) 388 return (set_errno(error)); 389 return (count); 390 } 391 #endif 392 393 int 394 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 395 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 396 { 397 struct vnode *vp; 398 struct uio auio; 399 struct iovec aiov; 400 ushort_t fflag; 401 int ioflag; 402 int i, error; 403 size_t cnt; 404 ssize_t sfv_len; 405 u_offset_t sfv_off; 406 #ifdef _SYSCALL32_IMPL 407 model_t model = get_udatamodel(); 408 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 409 MAXOFF32_T : MAXOFFSET_T; 410 #else 411 const u_offset_t maxoff = MAXOFF32_T; 412 #endif 413 mblk_t *dmp = NULL; 414 int wroff; 415 int buf_left = 0; 416 size_t iov_len; 417 mblk_t *head, *tmp; 418 size_t size = total_size; 419 size_t extra; 420 int tail_len; 421 struct nmsghdr msg; 422 423 fflag = fp->f_flag; 424 vp = fp->f_vnode; 425 426 ASSERT(vp->v_type == VSOCK); 427 ASSERT(maxblk > 0); 428 429 /* If nothing to send, return */ 430 if (total_size == 0) 431 return (0); 432 433 if (vp->v_stream != NULL) { 434 wroff = (int)vp->v_stream->sd_wroff; 435 tail_len = (int)vp->v_stream->sd_tail; 436 } else { 437 struct sonode *so; 438 439 so = VTOSO(vp); 440 wroff = so->so_proto_props.sopp_wroff; 441 tail_len = so->so_proto_props.sopp_tail; 442 } 443 444 extra = wroff + tail_len; 445 446 buf_left = MIN(total_size, maxblk); 447 head = dmp = allocb(buf_left + extra, BPRI_HI); 448 if (head == NULL) 449 return (ENOMEM); 450 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 451 bzero(&msg, sizeof (msg)); 452 453 auio.uio_extflg = UIO_COPY_DEFAULT; 454 for (i = 0; i < copy_cnt; i++) { 455 if (ISSIG(curthread, JUSTLOOKING)) { 456 freemsg(head); 457 return (EINTR); 458 } 459 460 /* 461 * Do similar checks as "write" as we are writing 462 * sfv_len bytes into "vp". 463 */ 464 sfv_len = (ssize_t)sfv->sfv_len; 465 466 if (sfv_len == 0) { 467 sfv++; 468 continue; 469 } 470 471 /* Check for overflow */ 472 #ifdef _SYSCALL32_IMPL 473 if (model == DATAMODEL_ILP32) { 474 if (((ssize32_t)(*count + sfv_len)) < 0) { 475 freemsg(head); 476 return (EINVAL); 477 } 478 } else 479 #endif 480 if ((*count + sfv_len) < 0) { 481 freemsg(head); 482 return (EINVAL); 483 } 484 485 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 486 487 if (sfv->sfv_fd == SFV_FD_SELF) { 488 while (sfv_len > 0) { 489 if (buf_left == 0) { 490 tmp = dmp; 491 buf_left = MIN(total_size, maxblk); 492 iov_len = MIN(buf_left, sfv_len); 493 dmp = allocb(buf_left + extra, BPRI_HI); 494 if (dmp == NULL) { 495 freemsg(head); 496 return (ENOMEM); 497 } 498 dmp->b_wptr = dmp->b_rptr = 499 dmp->b_rptr + wroff; 500 tmp->b_cont = dmp; 501 } else { 502 iov_len = MIN(buf_left, sfv_len); 503 } 504 505 aiov.iov_len = iov_len; 506 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 507 auio.uio_loffset = *fileoff; 508 auio.uio_iovcnt = 1; 509 auio.uio_resid = iov_len; 510 auio.uio_iov = &aiov; 511 auio.uio_segflg = UIO_USERSPACE; 512 auio.uio_llimit = curproc->p_fsz_ctl; 513 auio.uio_fmode = fflag; 514 515 buf_left -= iov_len; 516 total_size -= iov_len; 517 sfv_len -= iov_len; 518 sfv_off += iov_len; 519 520 error = uiomove((caddr_t)dmp->b_wptr, 521 iov_len, UIO_WRITE, &auio); 522 if (error != 0) { 523 freemsg(head); 524 return (error); 525 } 526 dmp->b_wptr += iov_len; 527 } 528 } else { 529 file_t *ffp; 530 vnode_t *readvp; 531 532 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 533 freemsg(head); 534 return (EBADF); 535 } 536 537 if ((ffp->f_flag & FREAD) == 0) { 538 releasef(sfv->sfv_fd); 539 freemsg(head); 540 return (EACCES); 541 } 542 543 readvp = ffp->f_vnode; 544 if (readvp->v_type != VREG) { 545 releasef(sfv->sfv_fd); 546 freemsg(head); 547 return (EINVAL); 548 } 549 550 /* 551 * No point reading and writing to same vp, 552 * as long as both are regular files. readvp is not 553 * locked; but since we got it from an open file the 554 * contents will be valid during the time of access. 555 */ 556 557 if (vn_compare(vp, readvp)) { 558 releasef(sfv->sfv_fd); 559 freemsg(head); 560 return (EINVAL); 561 } 562 563 /* 564 * Note: we assume readvp != vp. "vp" is already 565 * locked, and "readvp" must not be. 566 */ 567 568 if (readvp < vp) { 569 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 570 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 571 NULL); 572 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 573 } else { 574 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 575 NULL); 576 } 577 578 /* Same checks as in pread */ 579 if (sfv_off > maxoff) { 580 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 581 releasef(sfv->sfv_fd); 582 freemsg(head); 583 return (EINVAL); 584 } 585 if (sfv_off + sfv_len > maxoff) { 586 total_size -= (sfv_off + sfv_len - maxoff); 587 sfv_len = (ssize_t)((offset_t)maxoff - 588 sfv_off); 589 } 590 591 while (sfv_len > 0) { 592 if (buf_left == 0) { 593 tmp = dmp; 594 buf_left = MIN(total_size, maxblk); 595 iov_len = MIN(buf_left, sfv_len); 596 dmp = allocb(buf_left + extra, BPRI_HI); 597 if (dmp == NULL) { 598 VOP_RWUNLOCK(readvp, 599 V_WRITELOCK_FALSE, NULL); 600 releasef(sfv->sfv_fd); 601 freemsg(head); 602 return (ENOMEM); 603 } 604 dmp->b_wptr = dmp->b_rptr = 605 dmp->b_rptr + wroff; 606 tmp->b_cont = dmp; 607 } else { 608 iov_len = MIN(buf_left, sfv_len); 609 } 610 aiov.iov_base = (caddr_t)dmp->b_wptr; 611 aiov.iov_len = iov_len; 612 auio.uio_loffset = sfv_off; 613 auio.uio_iov = &aiov; 614 auio.uio_iovcnt = 1; 615 auio.uio_resid = iov_len; 616 auio.uio_segflg = UIO_SYSSPACE; 617 auio.uio_llimit = MAXOFFSET_T; 618 auio.uio_fmode = ffp->f_flag; 619 ioflag = auio.uio_fmode & 620 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 621 622 /* 623 * If read sync is not asked for, 624 * filter sync flags 625 */ 626 if ((ioflag & FRSYNC) == 0) 627 ioflag &= ~(FSYNC|FDSYNC); 628 error = VOP_READ(readvp, &auio, ioflag, 629 fp->f_cred, NULL); 630 if (error != 0) { 631 /* 632 * If we were reading a pipe (currently 633 * not implemented), we may now loose 634 * data. 635 */ 636 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 637 NULL); 638 releasef(sfv->sfv_fd); 639 freemsg(head); 640 return (error); 641 } 642 643 /* 644 * Check how much data was really read. 645 * Decrement the 'len' and increment the 646 * 'off' appropriately. 647 */ 648 cnt = iov_len - auio.uio_resid; 649 if (cnt == 0) { 650 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 651 NULL); 652 releasef(sfv->sfv_fd); 653 freemsg(head); 654 return (EINVAL); 655 } 656 sfv_len -= cnt; 657 sfv_off += cnt; 658 total_size -= cnt; 659 buf_left -= cnt; 660 661 dmp->b_wptr += cnt; 662 } 663 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 664 releasef(sfv->sfv_fd); 665 } 666 sfv++; 667 } 668 669 ASSERT(total_size == 0); 670 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); 671 if (error != 0) { 672 if (head != NULL) 673 freemsg(head); 674 return (error); 675 } 676 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 677 *count += size; 678 679 return (0); 680 } 681 682 683 int 684 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 685 int copy_cnt, ssize_t *count) 686 { 687 struct vnode *vp; 688 struct uio auio; 689 struct iovec aiov; 690 ushort_t fflag; 691 int ioflag; 692 int i, error; 693 size_t cnt; 694 ssize_t sfv_len; 695 u_offset_t sfv_off; 696 #ifdef _SYSCALL32_IMPL 697 model_t model = get_udatamodel(); 698 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 699 MAXOFF32_T : MAXOFFSET_T; 700 #else 701 const u_offset_t maxoff = MAXOFF32_T; 702 #endif 703 mblk_t *dmp = NULL; 704 char *buf = NULL; 705 size_t extra; 706 int maxblk, wroff, tail_len; 707 struct sonode *so; 708 stdata_t *stp; 709 struct nmsghdr msg; 710 711 fflag = fp->f_flag; 712 vp = fp->f_vnode; 713 714 if (vp->v_type == VSOCK) { 715 so = VTOSO(vp); 716 if (vp->v_stream != NULL) { 717 stp = vp->v_stream; 718 wroff = (int)stp->sd_wroff; 719 tail_len = (int)stp->sd_tail; 720 maxblk = (int)stp->sd_maxblk; 721 } else { 722 stp = NULL; 723 wroff = so->so_proto_props.sopp_wroff; 724 tail_len = so->so_proto_props.sopp_tail; 725 maxblk = so->so_proto_props.sopp_maxblk; 726 } 727 extra = wroff + tail_len; 728 } 729 730 bzero(&msg, sizeof (msg)); 731 auio.uio_extflg = UIO_COPY_DEFAULT; 732 for (i = 0; i < copy_cnt; i++) { 733 if (ISSIG(curthread, JUSTLOOKING)) 734 return (EINTR); 735 736 /* 737 * Do similar checks as "write" as we are writing 738 * sfv_len bytes into "vp". 739 */ 740 sfv_len = (ssize_t)sfv->sfv_len; 741 742 if (sfv_len == 0) { 743 sfv++; 744 continue; 745 } 746 747 if (vp->v_type == VREG) { 748 if (*fileoff >= curproc->p_fsz_ctl) { 749 mutex_enter(&curproc->p_lock); 750 (void) rctl_action( 751 rctlproc_legacy[RLIMIT_FSIZE], 752 curproc->p_rctls, curproc, RCA_SAFE); 753 mutex_exit(&curproc->p_lock); 754 755 return (EFBIG); 756 } 757 758 if (*fileoff >= maxoff) 759 return (EFBIG); 760 761 if (*fileoff + sfv_len > maxoff) 762 return (EINVAL); 763 } 764 765 /* Check for overflow */ 766 #ifdef _SYSCALL32_IMPL 767 if (model == DATAMODEL_ILP32) { 768 if (((ssize32_t)(*count + sfv_len)) < 0) 769 return (EINVAL); 770 } else 771 #endif 772 if ((*count + sfv_len) < 0) 773 return (EINVAL); 774 775 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 776 777 if (sfv->sfv_fd == SFV_FD_SELF) { 778 if (vp->v_type == VSOCK) { 779 while (sfv_len > 0) { 780 size_t iov_len; 781 782 iov_len = sfv_len; 783 /* 784 * Socket filters can limit the mblk 785 * size, so limit reads to maxblk if 786 * there are filters present. 787 */ 788 if (so->so_filter_active > 0 && 789 maxblk != INFPSZ) 790 iov_len = MIN(iov_len, maxblk); 791 792 aiov.iov_len = iov_len; 793 aiov.iov_base = 794 (caddr_t)(uintptr_t)sfv_off; 795 796 auio.uio_iov = &aiov; 797 auio.uio_iovcnt = 1; 798 auio.uio_loffset = *fileoff; 799 auio.uio_segflg = UIO_USERSPACE; 800 auio.uio_fmode = fflag; 801 auio.uio_llimit = curproc->p_fsz_ctl; 802 auio.uio_resid = iov_len; 803 804 dmp = allocb(iov_len + extra, BPRI_HI); 805 if (dmp == NULL) 806 return (ENOMEM); 807 dmp->b_wptr = dmp->b_rptr = 808 dmp->b_rptr + wroff; 809 error = uiomove((caddr_t)dmp->b_wptr, 810 iov_len, UIO_WRITE, &auio); 811 if (error != 0) { 812 freeb(dmp); 813 return (error); 814 } 815 dmp->b_wptr += iov_len; 816 error = socket_sendmblk(VTOSO(vp), 817 &msg, fflag, CRED(), &dmp); 818 819 if (error != 0) { 820 if (dmp != NULL) 821 freeb(dmp); 822 return (error); 823 } 824 ttolwp(curthread)->lwp_ru.ioch += 825 (ulong_t)iov_len; 826 *count += iov_len; 827 sfv_len -= iov_len; 828 sfv_off += iov_len; 829 } 830 } else { 831 aiov.iov_len = sfv_len; 832 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 833 834 auio.uio_iov = &aiov; 835 auio.uio_iovcnt = 1; 836 auio.uio_loffset = *fileoff; 837 auio.uio_segflg = UIO_USERSPACE; 838 auio.uio_fmode = fflag; 839 auio.uio_llimit = curproc->p_fsz_ctl; 840 auio.uio_resid = sfv_len; 841 842 ioflag = auio.uio_fmode & 843 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 844 while (sfv_len > 0) { 845 error = VOP_WRITE(vp, &auio, ioflag, 846 fp->f_cred, NULL); 847 cnt = sfv_len - auio.uio_resid; 848 sfv_len -= cnt; 849 ttolwp(curthread)->lwp_ru.ioch += 850 (ulong_t)cnt; 851 *fileoff += cnt; 852 *count += cnt; 853 if (error != 0) 854 return (error); 855 } 856 } 857 } else { 858 int segmapit = 0; 859 file_t *ffp; 860 vnode_t *readvp; 861 struct vnode *realvp; 862 size_t size; 863 caddr_t ptr; 864 865 if ((ffp = getf(sfv->sfv_fd)) == NULL) 866 return (EBADF); 867 868 if ((ffp->f_flag & FREAD) == 0) { 869 releasef(sfv->sfv_fd); 870 return (EBADF); 871 } 872 873 readvp = ffp->f_vnode; 874 if (VOP_REALVP(readvp, &realvp, NULL) == 0) 875 readvp = realvp; 876 if (readvp->v_type != VREG) { 877 releasef(sfv->sfv_fd); 878 return (EINVAL); 879 } 880 881 /* 882 * No point reading and writing to same vp, 883 * as long as both are regular files. readvp is not 884 * locked; but since we got it from an open file the 885 * contents will be valid during the time of access. 886 */ 887 if (vn_compare(vp, readvp)) { 888 releasef(sfv->sfv_fd); 889 return (EINVAL); 890 } 891 892 /* 893 * Note: we assume readvp != vp. "vp" is already 894 * locked, and "readvp" must not be. 895 */ 896 if (readvp < vp) { 897 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 898 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 899 NULL); 900 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 901 } else { 902 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 903 NULL); 904 } 905 906 /* Same checks as in pread */ 907 if (sfv_off > maxoff) { 908 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 909 releasef(sfv->sfv_fd); 910 return (EINVAL); 911 } 912 if (sfv_off + sfv_len > maxoff) { 913 sfv_len = (ssize_t)((offset_t)maxoff - 914 sfv_off); 915 } 916 /* Find the native blocksize to transfer data */ 917 size = MIN(vp->v_vfsp->vfs_bsize, 918 readvp->v_vfsp->vfs_bsize); 919 size = sfv_len < size ? sfv_len : size; 920 921 if (vp->v_type != VSOCK) { 922 segmapit = 0; 923 buf = kmem_alloc(size, KM_NOSLEEP); 924 if (buf == NULL) { 925 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 926 NULL); 927 releasef(sfv->sfv_fd); 928 return (ENOMEM); 929 } 930 } else { 931 uint_t copyflag; 932 933 copyflag = stp != NULL ? stp->sd_copyflag : 934 so->so_proto_props.sopp_zcopyflag; 935 936 /* 937 * Socket filters can limit the mblk size, 938 * so limit reads to maxblk if there are 939 * filters present. 940 */ 941 if (so->so_filter_active > 0 && 942 maxblk != INFPSZ) 943 size = MIN(size, maxblk); 944 945 if (vn_has_flocks(readvp) || 946 readvp->v_flag & VNOMAP || 947 copyflag & STZCVMUNSAFE) { 948 segmapit = 0; 949 } else if (copyflag & STZCVMSAFE) { 950 segmapit = 1; 951 } else { 952 int on = 1; 953 if (socket_setsockopt(VTOSO(vp), 954 SOL_SOCKET, SO_SND_COPYAVOID, 955 &on, sizeof (on), CRED()) == 0) 956 segmapit = 1; 957 } 958 } 959 960 if (segmapit) { 961 boolean_t nowait; 962 963 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 964 error = snf_segmap(fp, readvp, sfv_off, 965 (u_offset_t)sfv_len, (ssize_t *)&cnt, 966 nowait); 967 releasef(sfv->sfv_fd); 968 *count += cnt; 969 if (error) 970 return (error); 971 sfv++; 972 continue; 973 } 974 975 while (sfv_len > 0) { 976 size_t iov_len; 977 978 iov_len = MIN(size, sfv_len); 979 980 if (vp->v_type == VSOCK) { 981 dmp = allocb(iov_len + extra, BPRI_HI); 982 if (dmp == NULL) { 983 VOP_RWUNLOCK(readvp, 984 V_WRITELOCK_FALSE, NULL); 985 releasef(sfv->sfv_fd); 986 return (ENOMEM); 987 } 988 dmp->b_wptr = dmp->b_rptr = 989 dmp->b_rptr + wroff; 990 ptr = (caddr_t)dmp->b_rptr; 991 } else { 992 ptr = buf; 993 } 994 995 aiov.iov_base = ptr; 996 aiov.iov_len = iov_len; 997 auio.uio_loffset = sfv_off; 998 auio.uio_iov = &aiov; 999 auio.uio_iovcnt = 1; 1000 auio.uio_resid = iov_len; 1001 auio.uio_segflg = UIO_SYSSPACE; 1002 auio.uio_llimit = MAXOFFSET_T; 1003 auio.uio_fmode = ffp->f_flag; 1004 ioflag = auio.uio_fmode & 1005 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1006 1007 /* 1008 * If read sync is not asked for, 1009 * filter sync flags 1010 */ 1011 if ((ioflag & FRSYNC) == 0) 1012 ioflag &= ~(FSYNC|FDSYNC); 1013 error = VOP_READ(readvp, &auio, ioflag, 1014 fp->f_cred, NULL); 1015 if (error != 0) { 1016 /* 1017 * If we were reading a pipe (currently 1018 * not implemented), we may now lose 1019 * data. 1020 */ 1021 if (vp->v_type == VSOCK) 1022 freeb(dmp); 1023 else 1024 kmem_free(buf, size); 1025 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1026 NULL); 1027 releasef(sfv->sfv_fd); 1028 return (error); 1029 } 1030 1031 /* 1032 * Check how much data was really read. 1033 * Decrement the 'len' and increment the 1034 * 'off' appropriately. 1035 */ 1036 cnt = iov_len - auio.uio_resid; 1037 if (cnt == 0) { 1038 if (vp->v_type == VSOCK) 1039 freeb(dmp); 1040 else 1041 kmem_free(buf, size); 1042 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1043 NULL); 1044 releasef(sfv->sfv_fd); 1045 return (EINVAL); 1046 } 1047 sfv_len -= cnt; 1048 sfv_off += cnt; 1049 1050 if (vp->v_type == VSOCK) { 1051 dmp->b_wptr = dmp->b_rptr + cnt; 1052 1053 error = socket_sendmblk(VTOSO(vp), 1054 &msg, fflag, CRED(), &dmp); 1055 1056 if (error != 0) { 1057 if (dmp != NULL) 1058 freeb(dmp); 1059 VOP_RWUNLOCK(readvp, 1060 V_WRITELOCK_FALSE, NULL); 1061 releasef(sfv->sfv_fd); 1062 return (error); 1063 } 1064 1065 ttolwp(curthread)->lwp_ru.ioch += 1066 (ulong_t)cnt; 1067 *count += cnt; 1068 } else { 1069 1070 aiov.iov_base = ptr; 1071 aiov.iov_len = cnt; 1072 auio.uio_loffset = *fileoff; 1073 auio.uio_resid = cnt; 1074 auio.uio_iov = &aiov; 1075 auio.uio_iovcnt = 1; 1076 auio.uio_segflg = UIO_SYSSPACE; 1077 auio.uio_llimit = curproc->p_fsz_ctl; 1078 auio.uio_fmode = fflag; 1079 ioflag = auio.uio_fmode & 1080 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1081 error = VOP_WRITE(vp, &auio, ioflag, 1082 fp->f_cred, NULL); 1083 1084 /* 1085 * Check how much data was written. 1086 * Increment the 'len' and decrement the 1087 * 'off' if all the data was not 1088 * written. 1089 */ 1090 cnt -= auio.uio_resid; 1091 sfv_len += auio.uio_resid; 1092 sfv_off -= auio.uio_resid; 1093 ttolwp(curthread)->lwp_ru.ioch += 1094 (ulong_t)cnt; 1095 *fileoff += cnt; 1096 *count += cnt; 1097 if (error != 0) { 1098 kmem_free(buf, size); 1099 VOP_RWUNLOCK(readvp, 1100 V_WRITELOCK_FALSE, NULL); 1101 releasef(sfv->sfv_fd); 1102 return (error); 1103 } 1104 } 1105 } 1106 if (buf) { 1107 kmem_free(buf, size); 1108 buf = NULL; 1109 } 1110 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 1111 releasef(sfv->sfv_fd); 1112 } 1113 sfv++; 1114 } 1115 return (0); 1116 } 1117 1118 ssize_t 1119 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1120 size_t *xferred) 1121 { 1122 int error = 0; 1123 int first_vector_error = 0; 1124 file_t *fp; 1125 struct vnode *vp; 1126 struct sonode *so; 1127 u_offset_t fileoff; 1128 int copy_cnt; 1129 const struct sendfilevec *copy_vec; 1130 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1131 ssize_t count = 0; 1132 #ifdef _SYSCALL32_IMPL 1133 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1134 #endif 1135 ssize_t total_size; 1136 int i; 1137 boolean_t is_sock = B_FALSE; 1138 int maxblk = 0; 1139 1140 if (sfvcnt <= 0) 1141 return (set_errno(EINVAL)); 1142 1143 if ((fp = getf(fildes)) == NULL) 1144 return (set_errno(EBADF)); 1145 1146 if (((fp->f_flag) & FWRITE) == 0) { 1147 error = EBADF; 1148 goto err; 1149 } 1150 1151 fileoff = fp->f_offset; 1152 vp = fp->f_vnode; 1153 1154 switch (vp->v_type) { 1155 case VSOCK: 1156 so = VTOSO(vp); 1157 is_sock = B_TRUE; 1158 if (SOCK_IS_NONSTR(so)) { 1159 maxblk = so->so_proto_props.sopp_maxblk; 1160 } else { 1161 maxblk = (int)vp->v_stream->sd_maxblk; 1162 } 1163 break; 1164 case VREG: 1165 break; 1166 default: 1167 error = EINVAL; 1168 goto err; 1169 } 1170 1171 switch (opcode) { 1172 case SENDFILEV : 1173 break; 1174 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1175 case SENDFILEV64 : 1176 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1177 (size32_t *)xferred, fildes)); 1178 #endif 1179 default : 1180 error = ENOSYS; 1181 break; 1182 } 1183 1184 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1185 copy_vec = vec; 1186 1187 do { 1188 total_size = 0; 1189 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1190 #ifdef _SYSCALL32_IMPL 1191 /* 32-bit callers need to have their iovec expanded. */ 1192 if (get_udatamodel() == DATAMODEL_ILP32) { 1193 if (copyin(copy_vec, sfv32, 1194 copy_cnt * sizeof (ksendfilevec32_t))) { 1195 error = EFAULT; 1196 break; 1197 } 1198 1199 for (i = 0; i < copy_cnt; i++) { 1200 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1201 sfv[i].sfv_off = 1202 (off_t)(uint32_t)sfv32[i].sfv_off; 1203 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1204 total_size += sfv[i].sfv_len; 1205 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1206 /* 1207 * Individual elements of the vector must not 1208 * wrap or overflow, as later math is signed. 1209 * Equally total_size needs to be checked after 1210 * each vector is added in, to be sure that 1211 * rogue values haven't overflowed the counter. 1212 */ 1213 if (((ssize32_t)sfv[i].sfv_len < 0) || 1214 ((ssize32_t)total_size < 0)) { 1215 /* 1216 * Truncate the vector to send data 1217 * described by elements before the 1218 * error. 1219 */ 1220 copy_cnt = i; 1221 first_vector_error = EINVAL; 1222 /* total_size can't be trusted */ 1223 if ((ssize32_t)total_size < 0) 1224 error = EINVAL; 1225 break; 1226 } 1227 } 1228 /* Nothing to do, process errors */ 1229 if (copy_cnt == 0) 1230 break; 1231 1232 } else { 1233 #endif 1234 if (copyin(copy_vec, sfv, 1235 copy_cnt * sizeof (sendfilevec_t))) { 1236 error = EFAULT; 1237 break; 1238 } 1239 1240 for (i = 0; i < copy_cnt; i++) { 1241 total_size += sfv[i].sfv_len; 1242 /* 1243 * Individual elements of the vector must not 1244 * wrap or overflow, as later math is signed. 1245 * Equally total_size needs to be checked after 1246 * each vector is added in, to be sure that 1247 * rogue values haven't overflowed the counter. 1248 */ 1249 if (((ssize_t)sfv[i].sfv_len < 0) || 1250 (total_size < 0)) { 1251 /* 1252 * Truncate the vector to send data 1253 * described by elements before the 1254 * error. 1255 */ 1256 copy_cnt = i; 1257 first_vector_error = EINVAL; 1258 /* total_size can't be trusted */ 1259 if (total_size < 0) 1260 error = EINVAL; 1261 break; 1262 } 1263 } 1264 /* Nothing to do, process errors */ 1265 if (copy_cnt == 0) 1266 break; 1267 #ifdef _SYSCALL32_IMPL 1268 } 1269 #endif 1270 1271 /* 1272 * The task between deciding to use sendvec_small_chunk 1273 * and sendvec_chunk is dependant on multiple things: 1274 * 1275 * i) latency is important for smaller files. So if the 1276 * data is smaller than 'tcp_slow_start_initial' times 1277 * maxblk, then use sendvec_small_chunk which creates 1278 * maxblk size mblks and chains them together and sends 1279 * them to TCP in one shot. It also leaves 'wroff' size 1280 * space for the headers in each mblk. 1281 * 1282 * ii) for total size bigger than 'tcp_slow_start_initial' 1283 * time maxblk, its probably real file data which is 1284 * dominating. So its better to use sendvec_chunk because 1285 * performance goes to dog if we don't do pagesize reads. 1286 * sendvec_chunk will do pagesize reads and write them 1287 * in pagesize mblks to TCP. 1288 * 1289 * Side Notes: A write to file has not been optimized. 1290 * Future zero copy code will plugin into sendvec_chunk 1291 * only because doing zero copy for files smaller then 1292 * pagesize is useless. 1293 * 1294 * Note, if socket has NL7C enabled then call NL7C's 1295 * senfilev() function to consume the sfv[]. 1296 */ 1297 if (is_sock) { 1298 if (!SOCK_IS_NONSTR(so) && 1299 _SOTOTPI(so)->sti_nl7c_flags != 0) { 1300 error = nl7c_sendfilev(so, &fileoff, 1301 sfv, copy_cnt, &count); 1302 } else if ((total_size <= (4 * maxblk)) && 1303 error == 0) { 1304 error = sendvec_small_chunk(fp, 1305 &fileoff, sfv, copy_cnt, 1306 total_size, maxblk, &count); 1307 } else { 1308 error = sendvec_chunk(fp, &fileoff, 1309 sfv, copy_cnt, &count); 1310 } 1311 } else { 1312 ASSERT(vp->v_type == VREG); 1313 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1314 &count); 1315 } 1316 1317 1318 #ifdef _SYSCALL32_IMPL 1319 if (get_udatamodel() == DATAMODEL_ILP32) { 1320 copy_vec = (const struct sendfilevec *) 1321 ((char *)copy_vec + 1322 (copy_cnt * sizeof (ksendfilevec32_t))); 1323 } else 1324 #endif 1325 copy_vec += copy_cnt; 1326 sfvcnt -= copy_cnt; 1327 1328 /* Process all vector members up to first error */ 1329 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1330 1331 if (vp->v_type == VREG) 1332 fp->f_offset += count; 1333 1334 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1335 1336 #ifdef _SYSCALL32_IMPL 1337 if (get_udatamodel() == DATAMODEL_ILP32) { 1338 ssize32_t count32 = (ssize32_t)count; 1339 if (copyout(&count32, xferred, sizeof (count32))) 1340 error = EFAULT; 1341 releasef(fildes); 1342 if (error != 0) 1343 return (set_errno(error)); 1344 if (first_vector_error != 0) 1345 return (set_errno(first_vector_error)); 1346 return (count32); 1347 } 1348 #endif 1349 if (copyout(&count, xferred, sizeof (count))) 1350 error = EFAULT; 1351 releasef(fildes); 1352 if (error != 0) 1353 return (set_errno(error)); 1354 if (first_vector_error != 0) 1355 return (set_errno(first_vector_error)); 1356 return (count); 1357 err: 1358 ASSERT(error != 0); 1359 releasef(fildes); 1360 return (set_errno(error)); 1361 } 1362