1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/t_lock.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/buf.h> 31 #include <sys/conf.h> 32 #include <sys/cred.h> 33 #include <sys/kmem.h> 34 #include <sys/sysmacros.h> 35 #include <sys/vfs.h> 36 #include <sys/vnode.h> 37 #include <sys/debug.h> 38 #include <sys/errno.h> 39 #include <sys/time.h> 40 #include <sys/file.h> 41 #include <sys/open.h> 42 #include <sys/user.h> 43 #include <sys/termios.h> 44 #include <sys/stream.h> 45 #include <sys/strsubr.h> 46 #include <sys/sunddi.h> 47 #include <sys/esunddi.h> 48 #include <sys/flock.h> 49 #include <sys/modctl.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vmsystm.h> 52 53 #include <sys/socket.h> 54 #include <sys/socketvar.h> 55 #include <fs/sockfs/sockcommon.h> 56 #include <fs/sockfs/socktpi.h> 57 58 #include <netinet/in.h> 59 #include <sys/sendfile.h> 60 #include <sys/un.h> 61 #include <sys/tihdr.h> 62 #include <sys/atomic.h> 63 64 #include <inet/common.h> 65 #include <inet/ip.h> 66 #include <inet/ip6.h> 67 #include <inet/tcp.h> 68 69 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 70 ssize32_t *); 71 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, 72 int, ssize_t *); 73 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, 74 boolean_t); 75 extern sotpi_info_t *sotpi_sototpi(struct sonode *); 76 77 #define SEND_MAX_CHUNK 16 78 79 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 80 /* 81 * 64 bit offsets for 32 bit applications only running either on 82 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 83 * more than 2GB of data. 84 */ 85 int 86 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 87 int copy_cnt, ssize32_t *count) 88 { 89 struct vnode *vp; 90 ushort_t fflag; 91 int ioflag; 92 size32_t cnt; 93 ssize32_t sfv_len; 94 ssize32_t tmpcount; 95 u_offset_t sfv_off; 96 struct uio auio; 97 struct iovec aiov; 98 int i, error; 99 100 fflag = fp->f_flag; 101 vp = fp->f_vnode; 102 for (i = 0; i < copy_cnt; i++) { 103 104 if (ISSIG(curthread, JUSTLOOKING)) 105 return (EINTR); 106 107 /* 108 * Do similar checks as "write" as we are writing 109 * sfv_len bytes into "vp". 110 */ 111 sfv_len = (ssize32_t)sfv->sfv_len; 112 113 if (sfv_len == 0) { 114 sfv++; 115 continue; 116 } 117 118 if (sfv_len < 0) 119 return (EINVAL); 120 121 if (vp->v_type == VREG) { 122 if (*fileoff >= curproc->p_fsz_ctl) { 123 mutex_enter(&curproc->p_lock); 124 (void) rctl_action( 125 rctlproc_legacy[RLIMIT_FSIZE], 126 curproc->p_rctls, curproc, RCA_SAFE); 127 mutex_exit(&curproc->p_lock); 128 return (EFBIG); 129 } 130 131 if (*fileoff >= OFFSET_MAX(fp)) 132 return (EFBIG); 133 134 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 135 return (EINVAL); 136 } 137 138 tmpcount = *count + sfv_len; 139 if (tmpcount < 0) 140 return (EINVAL); 141 142 sfv_off = sfv->sfv_off; 143 144 auio.uio_extflg = UIO_COPY_DEFAULT; 145 if (sfv->sfv_fd == SFV_FD_SELF) { 146 aiov.iov_len = sfv_len; 147 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 148 auio.uio_loffset = *fileoff; 149 auio.uio_iovcnt = 1; 150 auio.uio_resid = sfv_len; 151 auio.uio_iov = &aiov; 152 auio.uio_segflg = UIO_USERSPACE; 153 auio.uio_llimit = curproc->p_fsz_ctl; 154 auio.uio_fmode = fflag; 155 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 156 while (sfv_len > 0) { 157 error = VOP_WRITE(vp, &auio, ioflag, 158 fp->f_cred, NULL); 159 cnt = sfv_len - auio.uio_resid; 160 sfv_len -= cnt; 161 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 162 if (vp->v_type == VREG) 163 *fileoff += cnt; 164 *count += cnt; 165 if (error != 0) 166 return (error); 167 } 168 } else { 169 file_t *ffp; 170 vnode_t *readvp; 171 size_t size; 172 caddr_t ptr; 173 174 if ((ffp = getf(sfv->sfv_fd)) == NULL) 175 return (EBADF); 176 177 if ((ffp->f_flag & FREAD) == 0) { 178 releasef(sfv->sfv_fd); 179 return (EBADF); 180 } 181 182 readvp = ffp->f_vnode; 183 if (readvp->v_type != VREG) { 184 releasef(sfv->sfv_fd); 185 return (EINVAL); 186 } 187 188 /* 189 * No point reading and writing to same vp, 190 * as long as both are regular files. readvp is not 191 * locked; but since we got it from an open file the 192 * contents will be valid during the time of access. 193 */ 194 if (vn_compare(vp, readvp)) { 195 releasef(sfv->sfv_fd); 196 return (EINVAL); 197 } 198 199 /* 200 * Optimize the regular file over 201 * the socket case. 202 */ 203 if (vp->v_type == VSOCK) { 204 error = sosendfile64(fp, ffp, sfv, 205 (ssize32_t *)&cnt); 206 *count += cnt; 207 if (error) 208 return (error); 209 sfv++; 210 continue; 211 } 212 213 /* 214 * Note: we assume readvp != vp. "vp" is already 215 * locked, and "readvp" must not be. 216 */ 217 if (readvp < vp) { 218 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 219 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 220 NULL); 221 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 222 } else { 223 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 224 NULL); 225 } 226 227 /* 228 * Same checks as in pread64. 229 */ 230 if (sfv_off > MAXOFFSET_T) { 231 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 232 releasef(sfv->sfv_fd); 233 return (EINVAL); 234 } 235 236 if (sfv_off + sfv_len > MAXOFFSET_T) 237 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 238 239 /* Find the native blocksize to transfer data */ 240 size = MIN(vp->v_vfsp->vfs_bsize, 241 readvp->v_vfsp->vfs_bsize); 242 size = sfv_len < size ? sfv_len : size; 243 ptr = kmem_alloc(size, KM_NOSLEEP); 244 if (ptr == NULL) { 245 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 246 releasef(sfv->sfv_fd); 247 return (ENOMEM); 248 } 249 250 while (sfv_len > 0) { 251 size_t iov_len; 252 253 iov_len = MIN(size, sfv_len); 254 aiov.iov_base = ptr; 255 aiov.iov_len = iov_len; 256 auio.uio_loffset = sfv_off; 257 auio.uio_iov = &aiov; 258 auio.uio_iovcnt = 1; 259 auio.uio_resid = iov_len; 260 auio.uio_segflg = UIO_SYSSPACE; 261 auio.uio_llimit = MAXOFFSET_T; 262 auio.uio_fmode = ffp->f_flag; 263 ioflag = auio.uio_fmode & 264 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 265 266 /* 267 * If read sync is not asked for, 268 * filter sync flags 269 */ 270 if ((ioflag & FRSYNC) == 0) 271 ioflag &= ~(FSYNC|FDSYNC); 272 error = VOP_READ(readvp, &auio, ioflag, 273 fp->f_cred, NULL); 274 if (error) { 275 kmem_free(ptr, size); 276 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 277 NULL); 278 releasef(sfv->sfv_fd); 279 return (error); 280 } 281 282 /* 283 * Check how must data was really read. 284 * Decrement the 'len' and increment the 285 * 'off' appropriately. 286 */ 287 cnt = iov_len - auio.uio_resid; 288 if (cnt == 0) { 289 /* 290 * If we were reading a pipe (currently 291 * not implemented), we may now lose 292 * data. 293 */ 294 kmem_free(ptr, size); 295 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 296 NULL); 297 releasef(sfv->sfv_fd); 298 return (EINVAL); 299 } 300 sfv_len -= cnt; 301 sfv_off += cnt; 302 303 aiov.iov_base = ptr; 304 aiov.iov_len = cnt; 305 auio.uio_loffset = *fileoff; 306 auio.uio_iov = &aiov; 307 auio.uio_iovcnt = 1; 308 auio.uio_resid = cnt; 309 auio.uio_segflg = UIO_SYSSPACE; 310 auio.uio_llimit = curproc->p_fsz_ctl; 311 auio.uio_fmode = fflag; 312 ioflag = auio.uio_fmode & 313 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 314 error = VOP_WRITE(vp, &auio, ioflag, 315 fp->f_cred, NULL); 316 317 /* 318 * Check how much data was written. Increment 319 * the 'len' and decrement the 'off' if all 320 * the data was not written. 321 */ 322 cnt -= auio.uio_resid; 323 sfv_len += auio.uio_resid; 324 sfv_off -= auio.uio_resid; 325 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 326 if (vp->v_type == VREG) 327 *fileoff += cnt; 328 *count += cnt; 329 if (error != 0) { 330 kmem_free(ptr, size); 331 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 332 NULL); 333 releasef(sfv->sfv_fd); 334 return (error); 335 } 336 } 337 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 338 releasef(sfv->sfv_fd); 339 kmem_free(ptr, size); 340 } 341 sfv++; 342 } 343 return (0); 344 } 345 346 ssize32_t 347 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 348 size32_t *xferred, int fildes) 349 { 350 u_offset_t fileoff; 351 int copy_cnt; 352 const struct ksendfilevec64 *copy_vec; 353 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 354 struct vnode *vp; 355 int error; 356 ssize32_t count = 0; 357 358 vp = fp->f_vnode; 359 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 360 361 copy_vec = vec; 362 fileoff = fp->f_offset; 363 364 do { 365 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 366 if (copyin(copy_vec, sfv, copy_cnt * 367 sizeof (struct ksendfilevec64))) { 368 error = EFAULT; 369 break; 370 } 371 372 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 373 if (error != 0) 374 break; 375 376 copy_vec += copy_cnt; 377 sfvcnt -= copy_cnt; 378 } while (sfvcnt > 0); 379 380 if (vp->v_type == VREG) 381 fp->f_offset += count; 382 383 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 384 if (copyout(&count, xferred, sizeof (count))) 385 error = EFAULT; 386 releasef(fildes); 387 if (error != 0) 388 return (set_errno(error)); 389 return (count); 390 } 391 #endif 392 393 int 394 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 395 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 396 { 397 struct vnode *vp; 398 struct uio auio; 399 struct iovec aiov; 400 ushort_t fflag; 401 int ioflag; 402 int i, error; 403 size_t cnt; 404 ssize_t sfv_len; 405 u_offset_t sfv_off; 406 #ifdef _SYSCALL32_IMPL 407 model_t model = get_udatamodel(); 408 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 409 MAXOFF32_T : MAXOFFSET_T; 410 #else 411 const u_offset_t maxoff = MAXOFF32_T; 412 #endif 413 mblk_t *dmp = NULL; 414 int wroff; 415 int buf_left = 0; 416 size_t iov_len; 417 mblk_t *head, *tmp; 418 size_t size = total_size; 419 size_t extra; 420 int tail_len; 421 struct nmsghdr msg; 422 423 fflag = fp->f_flag; 424 vp = fp->f_vnode; 425 426 ASSERT(vp->v_type == VSOCK); 427 ASSERT(maxblk > 0); 428 429 /* If nothing to send, return */ 430 if (total_size == 0) 431 return (0); 432 433 if (vp->v_stream != NULL) { 434 wroff = (int)vp->v_stream->sd_wroff; 435 tail_len = (int)vp->v_stream->sd_tail; 436 } else { 437 struct sonode *so; 438 439 so = VTOSO(vp); 440 wroff = so->so_proto_props.sopp_wroff; 441 tail_len = so->so_proto_props.sopp_tail; 442 } 443 444 extra = wroff + tail_len; 445 446 buf_left = MIN(total_size, maxblk); 447 head = dmp = allocb(buf_left + extra, BPRI_HI); 448 if (head == NULL) 449 return (ENOMEM); 450 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 451 bzero(&msg, sizeof (msg)); 452 453 auio.uio_extflg = UIO_COPY_DEFAULT; 454 for (i = 0; i < copy_cnt; i++) { 455 if (ISSIG(curthread, JUSTLOOKING)) { 456 freemsg(head); 457 return (EINTR); 458 } 459 460 /* 461 * Do similar checks as "write" as we are writing 462 * sfv_len bytes into "vp". 463 */ 464 sfv_len = (ssize_t)sfv->sfv_len; 465 466 if (sfv_len == 0) { 467 sfv++; 468 continue; 469 } 470 471 /* Check for overflow */ 472 #ifdef _SYSCALL32_IMPL 473 if (model == DATAMODEL_ILP32) { 474 if (((ssize32_t)(*count + sfv_len)) < 0) { 475 freemsg(head); 476 return (EINVAL); 477 } 478 } else 479 #endif 480 if ((*count + sfv_len) < 0) { 481 freemsg(head); 482 return (EINVAL); 483 } 484 485 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 486 487 if (sfv->sfv_fd == SFV_FD_SELF) { 488 while (sfv_len > 0) { 489 if (buf_left == 0) { 490 tmp = dmp; 491 buf_left = MIN(total_size, maxblk); 492 iov_len = MIN(buf_left, sfv_len); 493 dmp = allocb(buf_left + extra, BPRI_HI); 494 if (dmp == NULL) { 495 freemsg(head); 496 return (ENOMEM); 497 } 498 dmp->b_wptr = dmp->b_rptr = 499 dmp->b_rptr + wroff; 500 tmp->b_cont = dmp; 501 } else { 502 iov_len = MIN(buf_left, sfv_len); 503 } 504 505 aiov.iov_len = iov_len; 506 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 507 auio.uio_loffset = *fileoff; 508 auio.uio_iovcnt = 1; 509 auio.uio_resid = iov_len; 510 auio.uio_iov = &aiov; 511 auio.uio_segflg = UIO_USERSPACE; 512 auio.uio_llimit = curproc->p_fsz_ctl; 513 auio.uio_fmode = fflag; 514 515 buf_left -= iov_len; 516 total_size -= iov_len; 517 sfv_len -= iov_len; 518 sfv_off += iov_len; 519 520 error = uiomove((caddr_t)dmp->b_wptr, 521 iov_len, UIO_WRITE, &auio); 522 if (error != 0) { 523 freemsg(head); 524 return (error); 525 } 526 dmp->b_wptr += iov_len; 527 } 528 } else { 529 file_t *ffp; 530 vnode_t *readvp; 531 532 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 533 freemsg(head); 534 return (EBADF); 535 } 536 537 if ((ffp->f_flag & FREAD) == 0) { 538 releasef(sfv->sfv_fd); 539 freemsg(head); 540 return (EACCES); 541 } 542 543 readvp = ffp->f_vnode; 544 if (readvp->v_type != VREG) { 545 releasef(sfv->sfv_fd); 546 freemsg(head); 547 return (EINVAL); 548 } 549 550 /* 551 * No point reading and writing to same vp, 552 * as long as both are regular files. readvp is not 553 * locked; but since we got it from an open file the 554 * contents will be valid during the time of access. 555 */ 556 557 if (vn_compare(vp, readvp)) { 558 releasef(sfv->sfv_fd); 559 freemsg(head); 560 return (EINVAL); 561 } 562 563 /* 564 * Note: we assume readvp != vp. "vp" is already 565 * locked, and "readvp" must not be. 566 */ 567 568 if (readvp < vp) { 569 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 570 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 571 NULL); 572 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 573 } else { 574 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 575 NULL); 576 } 577 578 /* Same checks as in pread */ 579 if (sfv_off > maxoff) { 580 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 581 releasef(sfv->sfv_fd); 582 freemsg(head); 583 return (EINVAL); 584 } 585 if (sfv_off + sfv_len > maxoff) { 586 total_size -= (sfv_off + sfv_len - maxoff); 587 sfv_len = (ssize_t)((offset_t)maxoff - 588 sfv_off); 589 } 590 591 while (sfv_len > 0) { 592 if (buf_left == 0) { 593 tmp = dmp; 594 buf_left = MIN(total_size, maxblk); 595 iov_len = MIN(buf_left, sfv_len); 596 dmp = allocb(buf_left + extra, BPRI_HI); 597 if (dmp == NULL) { 598 VOP_RWUNLOCK(readvp, 599 V_WRITELOCK_FALSE, NULL); 600 releasef(sfv->sfv_fd); 601 freemsg(head); 602 return (ENOMEM); 603 } 604 dmp->b_wptr = dmp->b_rptr = 605 dmp->b_rptr + wroff; 606 tmp->b_cont = dmp; 607 } else { 608 iov_len = MIN(buf_left, sfv_len); 609 } 610 aiov.iov_base = (caddr_t)dmp->b_wptr; 611 aiov.iov_len = iov_len; 612 auio.uio_loffset = sfv_off; 613 auio.uio_iov = &aiov; 614 auio.uio_iovcnt = 1; 615 auio.uio_resid = iov_len; 616 auio.uio_segflg = UIO_SYSSPACE; 617 auio.uio_llimit = MAXOFFSET_T; 618 auio.uio_fmode = ffp->f_flag; 619 ioflag = auio.uio_fmode & 620 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 621 622 /* 623 * If read sync is not asked for, 624 * filter sync flags 625 */ 626 if ((ioflag & FRSYNC) == 0) 627 ioflag &= ~(FSYNC|FDSYNC); 628 error = VOP_READ(readvp, &auio, ioflag, 629 fp->f_cred, NULL); 630 if (error != 0) { 631 /* 632 * If we were reading a pipe (currently 633 * not implemented), we may now loose 634 * data. 635 */ 636 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 637 NULL); 638 releasef(sfv->sfv_fd); 639 freemsg(head); 640 return (error); 641 } 642 643 /* 644 * Check how much data was really read. 645 * Decrement the 'len' and increment the 646 * 'off' appropriately. 647 */ 648 cnt = iov_len - auio.uio_resid; 649 if (cnt == 0) { 650 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 651 NULL); 652 releasef(sfv->sfv_fd); 653 freemsg(head); 654 return (EINVAL); 655 } 656 sfv_len -= cnt; 657 sfv_off += cnt; 658 total_size -= cnt; 659 buf_left -= cnt; 660 661 dmp->b_wptr += cnt; 662 } 663 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 664 releasef(sfv->sfv_fd); 665 } 666 sfv++; 667 } 668 669 ASSERT(total_size == 0); 670 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); 671 if (error != 0) { 672 if (head != NULL) 673 freemsg(head); 674 return (error); 675 } 676 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 677 *count += size; 678 679 return (0); 680 } 681 682 683 int 684 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 685 int copy_cnt, ssize_t *count) 686 { 687 struct vnode *vp; 688 struct uio auio; 689 struct iovec aiov; 690 ushort_t fflag; 691 int ioflag; 692 int i, error; 693 size_t cnt; 694 ssize_t sfv_len; 695 u_offset_t sfv_off; 696 #ifdef _SYSCALL32_IMPL 697 model_t model = get_udatamodel(); 698 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 699 MAXOFF32_T : MAXOFFSET_T; 700 #else 701 const u_offset_t maxoff = MAXOFF32_T; 702 #endif 703 mblk_t *dmp = NULL; 704 char *buf = NULL; 705 size_t extra; 706 int maxblk, wroff, tail_len; 707 struct sonode *so; 708 stdata_t *stp; 709 struct nmsghdr msg; 710 711 fflag = fp->f_flag; 712 vp = fp->f_vnode; 713 714 if (vp->v_type == VSOCK) { 715 so = VTOSO(vp); 716 if (vp->v_stream != NULL) { 717 stp = vp->v_stream; 718 wroff = (int)stp->sd_wroff; 719 tail_len = (int)stp->sd_tail; 720 maxblk = (int)stp->sd_maxblk; 721 } else { 722 stp = NULL; 723 wroff = so->so_proto_props.sopp_wroff; 724 tail_len = so->so_proto_props.sopp_tail; 725 maxblk = so->so_proto_props.sopp_maxblk; 726 } 727 extra = wroff + tail_len; 728 } 729 730 bzero(&msg, sizeof (msg)); 731 auio.uio_extflg = UIO_COPY_DEFAULT; 732 for (i = 0; i < copy_cnt; i++) { 733 if (ISSIG(curthread, JUSTLOOKING)) 734 return (EINTR); 735 736 /* 737 * Do similar checks as "write" as we are writing 738 * sfv_len bytes into "vp". 739 */ 740 sfv_len = (ssize_t)sfv->sfv_len; 741 742 if (sfv_len == 0) { 743 sfv++; 744 continue; 745 } 746 747 if (vp->v_type == VREG) { 748 if (*fileoff >= curproc->p_fsz_ctl) { 749 mutex_enter(&curproc->p_lock); 750 (void) rctl_action( 751 rctlproc_legacy[RLIMIT_FSIZE], 752 curproc->p_rctls, curproc, RCA_SAFE); 753 mutex_exit(&curproc->p_lock); 754 755 return (EFBIG); 756 } 757 758 if (*fileoff >= maxoff) 759 return (EFBIG); 760 761 if (*fileoff + sfv_len > maxoff) 762 return (EINVAL); 763 } 764 765 /* Check for overflow */ 766 #ifdef _SYSCALL32_IMPL 767 if (model == DATAMODEL_ILP32) { 768 if (((ssize32_t)(*count + sfv_len)) < 0) 769 return (EINVAL); 770 } else 771 #endif 772 if ((*count + sfv_len) < 0) 773 return (EINVAL); 774 775 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 776 777 if (sfv->sfv_fd == SFV_FD_SELF) { 778 if (vp->v_type == VSOCK) { 779 while (sfv_len > 0) { 780 size_t iov_len; 781 782 iov_len = sfv_len; 783 /* 784 * Socket filters can limit the mblk 785 * size, so limit reads to maxblk if 786 * there are filters present. 787 */ 788 if (so->so_filter_active > 0 && 789 maxblk != INFPSZ) 790 iov_len = MIN(iov_len, maxblk); 791 792 aiov.iov_len = iov_len; 793 aiov.iov_base = 794 (caddr_t)(uintptr_t)sfv_off; 795 796 auio.uio_iov = &aiov; 797 auio.uio_iovcnt = 1; 798 auio.uio_loffset = *fileoff; 799 auio.uio_segflg = UIO_USERSPACE; 800 auio.uio_fmode = fflag; 801 auio.uio_llimit = curproc->p_fsz_ctl; 802 auio.uio_resid = iov_len; 803 804 dmp = allocb(iov_len + extra, BPRI_HI); 805 if (dmp == NULL) 806 return (ENOMEM); 807 dmp->b_wptr = dmp->b_rptr = 808 dmp->b_rptr + wroff; 809 error = uiomove((caddr_t)dmp->b_wptr, 810 iov_len, UIO_WRITE, &auio); 811 if (error != 0) { 812 freeb(dmp); 813 return (error); 814 } 815 dmp->b_wptr += iov_len; 816 error = socket_sendmblk(VTOSO(vp), 817 &msg, fflag, CRED(), &dmp); 818 819 if (error != 0) { 820 if (dmp != NULL) 821 freeb(dmp); 822 return (error); 823 } 824 ttolwp(curthread)->lwp_ru.ioch += 825 (ulong_t)iov_len; 826 *count += iov_len; 827 sfv_len -= iov_len; 828 sfv_off += iov_len; 829 } 830 } else { 831 aiov.iov_len = sfv_len; 832 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 833 834 auio.uio_iov = &aiov; 835 auio.uio_iovcnt = 1; 836 auio.uio_loffset = *fileoff; 837 auio.uio_segflg = UIO_USERSPACE; 838 auio.uio_fmode = fflag; 839 auio.uio_llimit = curproc->p_fsz_ctl; 840 auio.uio_resid = sfv_len; 841 842 ioflag = auio.uio_fmode & 843 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 844 while (sfv_len > 0) { 845 error = VOP_WRITE(vp, &auio, ioflag, 846 fp->f_cred, NULL); 847 cnt = sfv_len - auio.uio_resid; 848 sfv_len -= cnt; 849 ttolwp(curthread)->lwp_ru.ioch += 850 (ulong_t)cnt; 851 *fileoff += cnt; 852 *count += cnt; 853 if (error != 0) 854 return (error); 855 } 856 } 857 } else { 858 int segmapit = 0; 859 file_t *ffp; 860 vnode_t *readvp; 861 struct vnode *realvp; 862 size_t size; 863 caddr_t ptr; 864 865 if ((ffp = getf(sfv->sfv_fd)) == NULL) 866 return (EBADF); 867 868 if ((ffp->f_flag & FREAD) == 0) { 869 releasef(sfv->sfv_fd); 870 return (EBADF); 871 } 872 873 readvp = ffp->f_vnode; 874 if (VOP_REALVP(readvp, &realvp, NULL) == 0) 875 readvp = realvp; 876 if (readvp->v_type != VREG) { 877 releasef(sfv->sfv_fd); 878 return (EINVAL); 879 } 880 881 /* 882 * No point reading and writing to same vp, 883 * as long as both are regular files. readvp is not 884 * locked; but since we got it from an open file the 885 * contents will be valid during the time of access. 886 */ 887 if (vn_compare(vp, readvp)) { 888 releasef(sfv->sfv_fd); 889 return (EINVAL); 890 } 891 892 /* 893 * Note: we assume readvp != vp. "vp" is already 894 * locked, and "readvp" must not be. 895 */ 896 if (readvp < vp) { 897 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 898 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 899 NULL); 900 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 901 } else { 902 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 903 NULL); 904 } 905 906 /* Same checks as in pread */ 907 if (sfv_off > maxoff) { 908 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 909 releasef(sfv->sfv_fd); 910 return (EINVAL); 911 } 912 if (sfv_off + sfv_len > maxoff) { 913 sfv_len = (ssize_t)((offset_t)maxoff - 914 sfv_off); 915 } 916 /* Find the native blocksize to transfer data */ 917 size = MIN(vp->v_vfsp->vfs_bsize, 918 readvp->v_vfsp->vfs_bsize); 919 size = sfv_len < size ? sfv_len : size; 920 921 if (vp->v_type != VSOCK) { 922 segmapit = 0; 923 buf = kmem_alloc(size, KM_NOSLEEP); 924 if (buf == NULL) { 925 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 926 NULL); 927 releasef(sfv->sfv_fd); 928 return (ENOMEM); 929 } 930 } else { 931 uint_t copyflag; 932 933 copyflag = stp != NULL ? stp->sd_copyflag : 934 so->so_proto_props.sopp_zcopyflag; 935 936 /* 937 * Socket filters can limit the mblk size, 938 * so limit reads to maxblk if there are 939 * filters present. 940 */ 941 if (so->so_filter_active > 0 && 942 maxblk != INFPSZ) 943 size = MIN(size, maxblk); 944 945 if (vn_has_flocks(readvp) || 946 readvp->v_flag & VNOMAP || 947 copyflag & STZCVMUNSAFE) { 948 segmapit = 0; 949 } else if (copyflag & STZCVMSAFE) { 950 segmapit = 1; 951 } else { 952 int on = 1; 953 if (socket_setsockopt(VTOSO(vp), 954 SOL_SOCKET, SO_SND_COPYAVOID, 955 &on, sizeof (on), CRED()) == 0) 956 segmapit = 1; 957 } 958 } 959 960 if (segmapit) { 961 struct vattr va; 962 boolean_t nowait; 963 964 va.va_mask = AT_SIZE; 965 error = VOP_GETATTR(readvp, &va, 0, kcred, 966 NULL); 967 if (error != 0 || sfv_off >= va.va_size) { 968 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 969 NULL); 970 releasef(sfv->sfv_fd); 971 return (error); 972 } 973 /* Read as much as possible. */ 974 if (sfv_off + sfv_len > va.va_size) 975 sfv_len = va.va_size - sfv_off; 976 977 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 978 error = snf_segmap(fp, readvp, sfv_off, 979 (u_offset_t)sfv_len, (ssize_t *)&cnt, 980 nowait); 981 releasef(sfv->sfv_fd); 982 *count += cnt; 983 if (error) 984 return (error); 985 sfv++; 986 continue; 987 } 988 989 while (sfv_len > 0) { 990 size_t iov_len; 991 992 iov_len = MIN(size, sfv_len); 993 994 if (vp->v_type == VSOCK) { 995 dmp = allocb(iov_len + extra, BPRI_HI); 996 if (dmp == NULL) { 997 VOP_RWUNLOCK(readvp, 998 V_WRITELOCK_FALSE, NULL); 999 releasef(sfv->sfv_fd); 1000 return (ENOMEM); 1001 } 1002 dmp->b_wptr = dmp->b_rptr = 1003 dmp->b_rptr + wroff; 1004 ptr = (caddr_t)dmp->b_rptr; 1005 } else { 1006 ptr = buf; 1007 } 1008 1009 aiov.iov_base = ptr; 1010 aiov.iov_len = iov_len; 1011 auio.uio_loffset = sfv_off; 1012 auio.uio_iov = &aiov; 1013 auio.uio_iovcnt = 1; 1014 auio.uio_resid = iov_len; 1015 auio.uio_segflg = UIO_SYSSPACE; 1016 auio.uio_llimit = MAXOFFSET_T; 1017 auio.uio_fmode = ffp->f_flag; 1018 ioflag = auio.uio_fmode & 1019 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1020 1021 /* 1022 * If read sync is not asked for, 1023 * filter sync flags 1024 */ 1025 if ((ioflag & FRSYNC) == 0) 1026 ioflag &= ~(FSYNC|FDSYNC); 1027 error = VOP_READ(readvp, &auio, ioflag, 1028 fp->f_cred, NULL); 1029 if (error != 0) { 1030 /* 1031 * If we were reading a pipe (currently 1032 * not implemented), we may now lose 1033 * data. 1034 */ 1035 if (vp->v_type == VSOCK) 1036 freeb(dmp); 1037 else 1038 kmem_free(buf, size); 1039 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1040 NULL); 1041 releasef(sfv->sfv_fd); 1042 return (error); 1043 } 1044 1045 /* 1046 * Check how much data was really read. 1047 * Decrement the 'len' and increment the 1048 * 'off' appropriately. 1049 */ 1050 cnt = iov_len - auio.uio_resid; 1051 if (cnt == 0) { 1052 if (vp->v_type == VSOCK) 1053 freeb(dmp); 1054 else 1055 kmem_free(buf, size); 1056 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1057 NULL); 1058 releasef(sfv->sfv_fd); 1059 return (EINVAL); 1060 } 1061 sfv_len -= cnt; 1062 sfv_off += cnt; 1063 1064 if (vp->v_type == VSOCK) { 1065 dmp->b_wptr = dmp->b_rptr + cnt; 1066 1067 error = socket_sendmblk(VTOSO(vp), 1068 &msg, fflag, CRED(), &dmp); 1069 1070 if (error != 0) { 1071 if (dmp != NULL) 1072 freeb(dmp); 1073 VOP_RWUNLOCK(readvp, 1074 V_WRITELOCK_FALSE, NULL); 1075 releasef(sfv->sfv_fd); 1076 return (error); 1077 } 1078 1079 ttolwp(curthread)->lwp_ru.ioch += 1080 (ulong_t)cnt; 1081 *count += cnt; 1082 } else { 1083 1084 aiov.iov_base = ptr; 1085 aiov.iov_len = cnt; 1086 auio.uio_loffset = *fileoff; 1087 auio.uio_resid = cnt; 1088 auio.uio_iov = &aiov; 1089 auio.uio_iovcnt = 1; 1090 auio.uio_segflg = UIO_SYSSPACE; 1091 auio.uio_llimit = curproc->p_fsz_ctl; 1092 auio.uio_fmode = fflag; 1093 ioflag = auio.uio_fmode & 1094 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1095 error = VOP_WRITE(vp, &auio, ioflag, 1096 fp->f_cred, NULL); 1097 1098 /* 1099 * Check how much data was written. 1100 * Increment the 'len' and decrement the 1101 * 'off' if all the data was not 1102 * written. 1103 */ 1104 cnt -= auio.uio_resid; 1105 sfv_len += auio.uio_resid; 1106 sfv_off -= auio.uio_resid; 1107 ttolwp(curthread)->lwp_ru.ioch += 1108 (ulong_t)cnt; 1109 *fileoff += cnt; 1110 *count += cnt; 1111 if (error != 0) { 1112 kmem_free(buf, size); 1113 VOP_RWUNLOCK(readvp, 1114 V_WRITELOCK_FALSE, NULL); 1115 releasef(sfv->sfv_fd); 1116 return (error); 1117 } 1118 } 1119 } 1120 if (buf) { 1121 kmem_free(buf, size); 1122 buf = NULL; 1123 } 1124 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 1125 releasef(sfv->sfv_fd); 1126 } 1127 sfv++; 1128 } 1129 return (0); 1130 } 1131 1132 ssize_t 1133 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1134 size_t *xferred) 1135 { 1136 int error = 0; 1137 int first_vector_error = 0; 1138 file_t *fp; 1139 struct vnode *vp; 1140 struct sonode *so; 1141 u_offset_t fileoff; 1142 int copy_cnt; 1143 const struct sendfilevec *copy_vec; 1144 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1145 ssize_t count = 0; 1146 #ifdef _SYSCALL32_IMPL 1147 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1148 #endif 1149 ssize_t total_size; 1150 int i; 1151 boolean_t is_sock = B_FALSE; 1152 int maxblk = 0; 1153 1154 if (sfvcnt <= 0) 1155 return (set_errno(EINVAL)); 1156 1157 if ((fp = getf(fildes)) == NULL) 1158 return (set_errno(EBADF)); 1159 1160 if (((fp->f_flag) & FWRITE) == 0) { 1161 error = EBADF; 1162 goto err; 1163 } 1164 1165 fileoff = fp->f_offset; 1166 vp = fp->f_vnode; 1167 1168 switch (vp->v_type) { 1169 case VSOCK: 1170 so = VTOSO(vp); 1171 is_sock = B_TRUE; 1172 if (SOCK_IS_NONSTR(so)) { 1173 maxblk = so->so_proto_props.sopp_maxblk; 1174 } else { 1175 maxblk = (int)vp->v_stream->sd_maxblk; 1176 } 1177 break; 1178 case VREG: 1179 break; 1180 default: 1181 error = EINVAL; 1182 goto err; 1183 } 1184 1185 switch (opcode) { 1186 case SENDFILEV : 1187 break; 1188 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1189 case SENDFILEV64 : 1190 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1191 (size32_t *)xferred, fildes)); 1192 #endif 1193 default : 1194 error = ENOSYS; 1195 break; 1196 } 1197 1198 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1199 copy_vec = vec; 1200 1201 do { 1202 total_size = 0; 1203 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1204 #ifdef _SYSCALL32_IMPL 1205 /* 32-bit callers need to have their iovec expanded. */ 1206 if (get_udatamodel() == DATAMODEL_ILP32) { 1207 if (copyin(copy_vec, sfv32, 1208 copy_cnt * sizeof (ksendfilevec32_t))) { 1209 error = EFAULT; 1210 break; 1211 } 1212 1213 for (i = 0; i < copy_cnt; i++) { 1214 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1215 sfv[i].sfv_off = 1216 (off_t)(uint32_t)sfv32[i].sfv_off; 1217 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1218 total_size += sfv[i].sfv_len; 1219 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1220 /* 1221 * Individual elements of the vector must not 1222 * wrap or overflow, as later math is signed. 1223 * Equally total_size needs to be checked after 1224 * each vector is added in, to be sure that 1225 * rogue values haven't overflowed the counter. 1226 */ 1227 if (((ssize32_t)sfv[i].sfv_len < 0) || 1228 ((ssize32_t)total_size < 0)) { 1229 /* 1230 * Truncate the vector to send data 1231 * described by elements before the 1232 * error. 1233 */ 1234 copy_cnt = i; 1235 first_vector_error = EINVAL; 1236 /* total_size can't be trusted */ 1237 if ((ssize32_t)total_size < 0) 1238 error = EINVAL; 1239 break; 1240 } 1241 } 1242 /* Nothing to do, process errors */ 1243 if (copy_cnt == 0) 1244 break; 1245 1246 } else { 1247 #endif 1248 if (copyin(copy_vec, sfv, 1249 copy_cnt * sizeof (sendfilevec_t))) { 1250 error = EFAULT; 1251 break; 1252 } 1253 1254 for (i = 0; i < copy_cnt; i++) { 1255 total_size += sfv[i].sfv_len; 1256 /* 1257 * Individual elements of the vector must not 1258 * wrap or overflow, as later math is signed. 1259 * Equally total_size needs to be checked after 1260 * each vector is added in, to be sure that 1261 * rogue values haven't overflowed the counter. 1262 */ 1263 if (((ssize_t)sfv[i].sfv_len < 0) || 1264 (total_size < 0)) { 1265 /* 1266 * Truncate the vector to send data 1267 * described by elements before the 1268 * error. 1269 */ 1270 copy_cnt = i; 1271 first_vector_error = EINVAL; 1272 /* total_size can't be trusted */ 1273 if (total_size < 0) 1274 error = EINVAL; 1275 break; 1276 } 1277 } 1278 /* Nothing to do, process errors */ 1279 if (copy_cnt == 0) 1280 break; 1281 #ifdef _SYSCALL32_IMPL 1282 } 1283 #endif 1284 1285 /* 1286 * The task between deciding to use sendvec_small_chunk 1287 * and sendvec_chunk is dependant on multiple things: 1288 * 1289 * i) latency is important for smaller files. So if the 1290 * data is smaller than 'tcp_slow_start_initial' times 1291 * maxblk, then use sendvec_small_chunk which creates 1292 * maxblk size mblks and chains them together and sends 1293 * them to TCP in one shot. It also leaves 'wroff' size 1294 * space for the headers in each mblk. 1295 * 1296 * ii) for total size bigger than 'tcp_slow_start_initial' 1297 * time maxblk, its probably real file data which is 1298 * dominating. So its better to use sendvec_chunk because 1299 * performance goes to dog if we don't do pagesize reads. 1300 * sendvec_chunk will do pagesize reads and write them 1301 * in pagesize mblks to TCP. 1302 * 1303 * Side Notes: A write to file has not been optimized. 1304 * Future zero copy code will plugin into sendvec_chunk 1305 * only because doing zero copy for files smaller then 1306 * pagesize is useless. 1307 * 1308 * Note, if socket has NL7C enabled then call NL7C's 1309 * senfilev() function to consume the sfv[]. 1310 */ 1311 if (is_sock) { 1312 if (!SOCK_IS_NONSTR(so) && 1313 _SOTOTPI(so)->sti_nl7c_flags != 0) { 1314 error = nl7c_sendfilev(so, &fileoff, 1315 sfv, copy_cnt, &count); 1316 } else if ((total_size <= (4 * maxblk)) && 1317 error == 0) { 1318 error = sendvec_small_chunk(fp, 1319 &fileoff, sfv, copy_cnt, 1320 total_size, maxblk, &count); 1321 } else { 1322 error = sendvec_chunk(fp, &fileoff, 1323 sfv, copy_cnt, &count); 1324 } 1325 } else { 1326 ASSERT(vp->v_type == VREG); 1327 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1328 &count); 1329 } 1330 1331 1332 #ifdef _SYSCALL32_IMPL 1333 if (get_udatamodel() == DATAMODEL_ILP32) 1334 copy_vec = (const struct sendfilevec *)((char *)copy_vec + 1335 (copy_cnt * sizeof (ksendfilevec32_t))); 1336 else 1337 #endif 1338 copy_vec += copy_cnt; 1339 sfvcnt -= copy_cnt; 1340 1341 /* Process all vector members up to first error */ 1342 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1343 1344 if (vp->v_type == VREG) 1345 fp->f_offset += count; 1346 1347 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1348 1349 #ifdef _SYSCALL32_IMPL 1350 if (get_udatamodel() == DATAMODEL_ILP32) { 1351 ssize32_t count32 = (ssize32_t)count; 1352 if (copyout(&count32, xferred, sizeof (count32))) 1353 error = EFAULT; 1354 releasef(fildes); 1355 if (error != 0) 1356 return (set_errno(error)); 1357 if (first_vector_error != 0) 1358 return (set_errno(first_vector_error)); 1359 return (count32); 1360 } 1361 #endif 1362 if (copyout(&count, xferred, sizeof (count))) 1363 error = EFAULT; 1364 releasef(fildes); 1365 if (error != 0) 1366 return (set_errno(error)); 1367 if (first_vector_error != 0) 1368 return (set_errno(first_vector_error)); 1369 return (count); 1370 err: 1371 ASSERT(error != 0); 1372 releasef(fildes); 1373 return (set_errno(error)); 1374 } 1375