1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/t_lock.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/buf.h> 31 #include <sys/conf.h> 32 #include <sys/cred.h> 33 #include <sys/kmem.h> 34 #include <sys/sysmacros.h> 35 #include <sys/vfs.h> 36 #include <sys/vnode.h> 37 #include <sys/debug.h> 38 #include <sys/errno.h> 39 #include <sys/time.h> 40 #include <sys/file.h> 41 #include <sys/open.h> 42 #include <sys/user.h> 43 #include <sys/termios.h> 44 #include <sys/stream.h> 45 #include <sys/strsubr.h> 46 #include <sys/sunddi.h> 47 #include <sys/esunddi.h> 48 #include <sys/flock.h> 49 #include <sys/modctl.h> 50 #include <sys/cmn_err.h> 51 #include <sys/vmsystm.h> 52 53 #include <sys/socket.h> 54 #include <sys/socketvar.h> 55 #include <fs/sockfs/sockcommon.h> 56 #include <fs/sockfs/socktpi.h> 57 58 #include <netinet/in.h> 59 #include <sys/sendfile.h> 60 #include <sys/un.h> 61 #include <sys/tihdr.h> 62 #include <sys/atomic.h> 63 64 #include <inet/common.h> 65 #include <inet/ip.h> 66 #include <inet/ip6.h> 67 #include <inet/tcp.h> 68 69 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, 70 ssize32_t *); 71 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, 72 int, ssize_t *); 73 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, 74 boolean_t); 75 extern sotpi_info_t *sotpi_sototpi(struct sonode *); 76 77 #define SEND_MAX_CHUNK 16 78 79 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 80 /* 81 * 64 bit offsets for 32 bit applications only running either on 82 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer 83 * more than 2GB of data. 84 */ 85 static int 86 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, 87 int copy_cnt, ssize32_t *count) 88 { 89 struct vnode *vp; 90 ushort_t fflag; 91 int ioflag; 92 size32_t cnt; 93 ssize32_t sfv_len; 94 ssize32_t tmpcount; 95 u_offset_t sfv_off; 96 struct uio auio; 97 struct iovec aiov; 98 int i, error; 99 100 fflag = fp->f_flag; 101 vp = fp->f_vnode; 102 for (i = 0; i < copy_cnt; i++) { 103 104 if (ISSIG(curthread, JUSTLOOKING)) 105 return (EINTR); 106 107 /* 108 * Do similar checks as "write" as we are writing 109 * sfv_len bytes into "vp". 110 */ 111 sfv_len = (ssize32_t)sfv->sfv_len; 112 113 if (sfv_len == 0) { 114 sfv++; 115 continue; 116 } 117 118 if (sfv_len < 0) 119 return (EINVAL); 120 121 if (vp->v_type == VREG) { 122 if (*fileoff >= curproc->p_fsz_ctl) { 123 mutex_enter(&curproc->p_lock); 124 (void) rctl_action( 125 rctlproc_legacy[RLIMIT_FSIZE], 126 curproc->p_rctls, curproc, RCA_SAFE); 127 mutex_exit(&curproc->p_lock); 128 return (EFBIG); 129 } 130 131 if (*fileoff >= OFFSET_MAX(fp)) 132 return (EFBIG); 133 134 if (*fileoff + sfv_len > OFFSET_MAX(fp)) 135 return (EINVAL); 136 } 137 138 tmpcount = *count + sfv_len; 139 if (tmpcount < 0) 140 return (EINVAL); 141 142 sfv_off = sfv->sfv_off; 143 144 auio.uio_extflg = UIO_COPY_DEFAULT; 145 if (sfv->sfv_fd == SFV_FD_SELF) { 146 aiov.iov_len = sfv_len; 147 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 148 auio.uio_loffset = *fileoff; 149 auio.uio_iovcnt = 1; 150 auio.uio_resid = sfv_len; 151 auio.uio_iov = &aiov; 152 auio.uio_segflg = UIO_USERSPACE; 153 auio.uio_llimit = curproc->p_fsz_ctl; 154 auio.uio_fmode = fflag; 155 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); 156 while (sfv_len > 0) { 157 error = VOP_WRITE(vp, &auio, ioflag, 158 fp->f_cred, NULL); 159 cnt = sfv_len - auio.uio_resid; 160 sfv_len -= cnt; 161 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 162 if (vp->v_type == VREG) 163 *fileoff += cnt; 164 *count += cnt; 165 if (error != 0) 166 return (error); 167 } 168 } else { 169 file_t *ffp; 170 vnode_t *readvp; 171 size_t size; 172 caddr_t ptr; 173 174 if ((ffp = getf(sfv->sfv_fd)) == NULL) 175 return (EBADF); 176 177 if ((ffp->f_flag & FREAD) == 0) { 178 releasef(sfv->sfv_fd); 179 return (EBADF); 180 } 181 182 readvp = ffp->f_vnode; 183 if (readvp->v_type != VREG) { 184 releasef(sfv->sfv_fd); 185 return (EINVAL); 186 } 187 188 /* 189 * No point reading and writing to same vp, 190 * as long as both are regular files. readvp is not 191 * locked; but since we got it from an open file the 192 * contents will be valid during the time of access. 193 */ 194 if (vn_compare(vp, readvp)) { 195 releasef(sfv->sfv_fd); 196 return (EINVAL); 197 } 198 199 /* 200 * Optimize the regular file over 201 * the socket case. 202 */ 203 if (vp->v_type == VSOCK) { 204 error = sosendfile64(fp, ffp, sfv, 205 (ssize32_t *)&cnt); 206 *count += cnt; 207 if (error) 208 return (error); 209 sfv++; 210 continue; 211 } 212 213 /* 214 * Note: we assume readvp != vp. "vp" is already 215 * locked, and "readvp" must not be. 216 */ 217 if (readvp < vp) { 218 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 219 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 220 NULL); 221 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 222 } else { 223 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 224 NULL); 225 } 226 227 /* 228 * Same checks as in pread64. 229 */ 230 if (sfv_off > MAXOFFSET_T) { 231 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 232 releasef(sfv->sfv_fd); 233 return (EINVAL); 234 } 235 236 if (sfv_off + sfv_len > MAXOFFSET_T) 237 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off); 238 239 /* Find the native blocksize to transfer data */ 240 size = MIN(vp->v_vfsp->vfs_bsize, 241 readvp->v_vfsp->vfs_bsize); 242 size = sfv_len < size ? sfv_len : size; 243 ptr = kmem_alloc(size, KM_NOSLEEP); 244 if (ptr == NULL) { 245 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 246 releasef(sfv->sfv_fd); 247 return (ENOMEM); 248 } 249 250 while (sfv_len > 0) { 251 size_t iov_len; 252 253 iov_len = MIN(size, sfv_len); 254 aiov.iov_base = ptr; 255 aiov.iov_len = iov_len; 256 auio.uio_loffset = sfv_off; 257 auio.uio_iov = &aiov; 258 auio.uio_iovcnt = 1; 259 auio.uio_resid = iov_len; 260 auio.uio_segflg = UIO_SYSSPACE; 261 auio.uio_llimit = MAXOFFSET_T; 262 auio.uio_fmode = ffp->f_flag; 263 ioflag = auio.uio_fmode & 264 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 265 266 /* 267 * If read sync is not asked for, 268 * filter sync flags 269 */ 270 if ((ioflag & FRSYNC) == 0) 271 ioflag &= ~(FSYNC|FDSYNC); 272 error = VOP_READ(readvp, &auio, ioflag, 273 fp->f_cred, NULL); 274 if (error) { 275 kmem_free(ptr, size); 276 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 277 NULL); 278 releasef(sfv->sfv_fd); 279 return (error); 280 } 281 282 /* 283 * Check how must data was really read. 284 * Decrement the 'len' and increment the 285 * 'off' appropriately. 286 */ 287 cnt = iov_len - auio.uio_resid; 288 if (cnt == 0) { 289 /* 290 * If we were reading a pipe (currently 291 * not implemented), we may now lose 292 * data. 293 */ 294 kmem_free(ptr, size); 295 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 296 NULL); 297 releasef(sfv->sfv_fd); 298 return (EINVAL); 299 } 300 sfv_len -= cnt; 301 sfv_off += cnt; 302 303 aiov.iov_base = ptr; 304 aiov.iov_len = cnt; 305 auio.uio_loffset = *fileoff; 306 auio.uio_iov = &aiov; 307 auio.uio_iovcnt = 1; 308 auio.uio_resid = cnt; 309 auio.uio_segflg = UIO_SYSSPACE; 310 auio.uio_llimit = curproc->p_fsz_ctl; 311 auio.uio_fmode = fflag; 312 ioflag = auio.uio_fmode & 313 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 314 error = VOP_WRITE(vp, &auio, ioflag, 315 fp->f_cred, NULL); 316 317 /* 318 * Check how much data was written. Increment 319 * the 'len' and decrement the 'off' if all 320 * the data was not written. 321 */ 322 cnt -= auio.uio_resid; 323 sfv_len += auio.uio_resid; 324 sfv_off -= auio.uio_resid; 325 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt; 326 if (vp->v_type == VREG) 327 *fileoff += cnt; 328 *count += cnt; 329 if (error != 0) { 330 kmem_free(ptr, size); 331 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 332 NULL); 333 releasef(sfv->sfv_fd); 334 return (error); 335 } 336 } 337 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 338 releasef(sfv->sfv_fd); 339 kmem_free(ptr, size); 340 } 341 sfv++; 342 } 343 return (0); 344 } 345 346 static ssize32_t 347 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, 348 size32_t *xferred, int fildes) 349 { 350 u_offset_t fileoff; 351 int copy_cnt; 352 const struct ksendfilevec64 *copy_vec; 353 struct ksendfilevec64 sfv[SEND_MAX_CHUNK]; 354 struct vnode *vp; 355 int error; 356 ssize32_t count = 0; 357 358 vp = fp->f_vnode; 359 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 360 361 copy_vec = vec; 362 fileoff = fp->f_offset; 363 364 do { 365 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 366 if (copyin(copy_vec, sfv, copy_cnt * 367 sizeof (struct ksendfilevec64))) { 368 error = EFAULT; 369 break; 370 } 371 372 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count); 373 if (error != 0) 374 break; 375 376 copy_vec += copy_cnt; 377 sfvcnt -= copy_cnt; 378 } while (sfvcnt > 0); 379 380 if (vp->v_type == VREG) 381 fp->f_offset += count; 382 383 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 384 if (copyout(&count, xferred, sizeof (count))) 385 error = EFAULT; 386 releasef(fildes); 387 if (error != 0) 388 return (set_errno(error)); 389 return (count); 390 } 391 #endif 392 393 static int 394 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 395 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) 396 { 397 struct vnode *vp; 398 struct uio auio; 399 struct iovec aiov; 400 ushort_t fflag; 401 int ioflag; 402 int i, error; 403 size_t cnt; 404 ssize_t sfv_len; 405 u_offset_t sfv_off; 406 #ifdef _SYSCALL32_IMPL 407 model_t model = get_udatamodel(); 408 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 409 MAXOFF32_T : MAXOFFSET_T; 410 #else 411 const u_offset_t maxoff = MAXOFF32_T; 412 #endif 413 mblk_t *dmp = NULL; 414 int wroff; 415 int buf_left = 0; 416 size_t iov_len; 417 mblk_t *head, *tmp; 418 size_t size = total_size; 419 size_t extra; 420 int tail_len; 421 struct nmsghdr msg; 422 423 fflag = fp->f_flag; 424 vp = fp->f_vnode; 425 426 ASSERT(vp->v_type == VSOCK); 427 ASSERT(maxblk > 0); 428 429 /* If nothing to send, return */ 430 if (total_size == 0) 431 return (0); 432 433 if (vp->v_stream != NULL) { 434 wroff = (int)vp->v_stream->sd_wroff; 435 tail_len = (int)vp->v_stream->sd_tail; 436 } else { 437 struct sonode *so; 438 439 so = VTOSO(vp); 440 wroff = so->so_proto_props.sopp_wroff; 441 tail_len = so->so_proto_props.sopp_tail; 442 } 443 444 extra = wroff + tail_len; 445 446 buf_left = MIN(total_size, maxblk); 447 head = dmp = allocb(buf_left + extra, BPRI_HI); 448 if (head == NULL) 449 return (ENOMEM); 450 head->b_wptr = head->b_rptr = head->b_rptr + wroff; 451 bzero(&msg, sizeof (msg)); 452 453 auio.uio_extflg = UIO_COPY_DEFAULT; 454 for (i = 0; i < copy_cnt; i++) { 455 if (ISSIG(curthread, JUSTLOOKING)) { 456 freemsg(head); 457 return (EINTR); 458 } 459 460 /* 461 * Do similar checks as "write" as we are writing 462 * sfv_len bytes into "vp". 463 */ 464 sfv_len = (ssize_t)sfv->sfv_len; 465 466 if (sfv_len == 0) { 467 sfv++; 468 continue; 469 } 470 471 /* Check for overflow */ 472 #ifdef _SYSCALL32_IMPL 473 if (model == DATAMODEL_ILP32) { 474 if (((ssize32_t)(*count + sfv_len)) < 0) { 475 freemsg(head); 476 return (EINVAL); 477 } 478 } else 479 #endif 480 if ((*count + sfv_len) < 0) { 481 freemsg(head); 482 return (EINVAL); 483 } 484 485 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 486 487 if (sfv->sfv_fd == SFV_FD_SELF) { 488 while (sfv_len > 0) { 489 if (buf_left == 0) { 490 tmp = dmp; 491 buf_left = MIN(total_size, maxblk); 492 iov_len = MIN(buf_left, sfv_len); 493 dmp = allocb(buf_left + extra, BPRI_HI); 494 if (dmp == NULL) { 495 freemsg(head); 496 return (ENOMEM); 497 } 498 dmp->b_wptr = dmp->b_rptr = 499 dmp->b_rptr + wroff; 500 tmp->b_cont = dmp; 501 } else { 502 iov_len = MIN(buf_left, sfv_len); 503 } 504 505 aiov.iov_len = iov_len; 506 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 507 auio.uio_loffset = *fileoff; 508 auio.uio_iovcnt = 1; 509 auio.uio_resid = iov_len; 510 auio.uio_iov = &aiov; 511 auio.uio_segflg = UIO_USERSPACE; 512 auio.uio_llimit = curproc->p_fsz_ctl; 513 auio.uio_fmode = fflag; 514 515 buf_left -= iov_len; 516 total_size -= iov_len; 517 sfv_len -= iov_len; 518 sfv_off += iov_len; 519 520 error = uiomove((caddr_t)dmp->b_wptr, 521 iov_len, UIO_WRITE, &auio); 522 if (error != 0) { 523 freemsg(head); 524 return (error); 525 } 526 dmp->b_wptr += iov_len; 527 } 528 } else { 529 file_t *ffp; 530 vnode_t *readvp; 531 532 if ((ffp = getf(sfv->sfv_fd)) == NULL) { 533 freemsg(head); 534 return (EBADF); 535 } 536 537 if ((ffp->f_flag & FREAD) == 0) { 538 releasef(sfv->sfv_fd); 539 freemsg(head); 540 return (EACCES); 541 } 542 543 readvp = ffp->f_vnode; 544 if (readvp->v_type != VREG) { 545 releasef(sfv->sfv_fd); 546 freemsg(head); 547 return (EINVAL); 548 } 549 550 /* 551 * No point reading and writing to same vp, 552 * as long as both are regular files. readvp is not 553 * locked; but since we got it from an open file the 554 * contents will be valid during the time of access. 555 */ 556 557 if (vn_compare(vp, readvp)) { 558 releasef(sfv->sfv_fd); 559 freemsg(head); 560 return (EINVAL); 561 } 562 563 /* 564 * Note: we assume readvp != vp. "vp" is already 565 * locked, and "readvp" must not be. 566 */ 567 568 if (readvp < vp) { 569 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 570 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 571 NULL); 572 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 573 } else { 574 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 575 NULL); 576 } 577 578 /* Same checks as in pread */ 579 if (sfv_off > maxoff) { 580 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 581 releasef(sfv->sfv_fd); 582 freemsg(head); 583 return (EINVAL); 584 } 585 if (sfv_off + sfv_len > maxoff) { 586 total_size -= (sfv_off + sfv_len - maxoff); 587 sfv_len = (ssize_t)((offset_t)maxoff - 588 sfv_off); 589 } 590 591 while (sfv_len > 0) { 592 if (buf_left == 0) { 593 tmp = dmp; 594 buf_left = MIN(total_size, maxblk); 595 iov_len = MIN(buf_left, sfv_len); 596 dmp = allocb(buf_left + extra, BPRI_HI); 597 if (dmp == NULL) { 598 VOP_RWUNLOCK(readvp, 599 V_WRITELOCK_FALSE, NULL); 600 releasef(sfv->sfv_fd); 601 freemsg(head); 602 return (ENOMEM); 603 } 604 dmp->b_wptr = dmp->b_rptr = 605 dmp->b_rptr + wroff; 606 tmp->b_cont = dmp; 607 } else { 608 iov_len = MIN(buf_left, sfv_len); 609 } 610 aiov.iov_base = (caddr_t)dmp->b_wptr; 611 aiov.iov_len = iov_len; 612 auio.uio_loffset = sfv_off; 613 auio.uio_iov = &aiov; 614 auio.uio_iovcnt = 1; 615 auio.uio_resid = iov_len; 616 auio.uio_segflg = UIO_SYSSPACE; 617 auio.uio_llimit = MAXOFFSET_T; 618 auio.uio_fmode = ffp->f_flag; 619 ioflag = auio.uio_fmode & 620 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 621 622 /* 623 * If read sync is not asked for, 624 * filter sync flags 625 */ 626 if ((ioflag & FRSYNC) == 0) 627 ioflag &= ~(FSYNC|FDSYNC); 628 error = VOP_READ(readvp, &auio, ioflag, 629 fp->f_cred, NULL); 630 if (error != 0) { 631 /* 632 * If we were reading a pipe (currently 633 * not implemented), we may now loose 634 * data. 635 */ 636 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 637 NULL); 638 releasef(sfv->sfv_fd); 639 freemsg(head); 640 return (error); 641 } 642 643 /* 644 * Check how much data was really read. 645 * Decrement the 'len' and increment the 646 * 'off' appropriately. 647 */ 648 cnt = iov_len - auio.uio_resid; 649 if (cnt == 0) { 650 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 651 NULL); 652 releasef(sfv->sfv_fd); 653 freemsg(head); 654 return (EINVAL); 655 } 656 sfv_len -= cnt; 657 sfv_off += cnt; 658 total_size -= cnt; 659 buf_left -= cnt; 660 661 dmp->b_wptr += cnt; 662 } 663 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 664 releasef(sfv->sfv_fd); 665 } 666 sfv++; 667 } 668 669 ASSERT(total_size == 0); 670 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); 671 if (error != 0) { 672 if (head != NULL) 673 freemsg(head); 674 return (error); 675 } 676 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; 677 *count += size; 678 679 return (0); 680 } 681 682 683 static int 684 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, 685 int copy_cnt, ssize_t *count) 686 { 687 struct vnode *vp; 688 struct uio auio; 689 struct iovec aiov; 690 ushort_t fflag; 691 int ioflag; 692 int i, error; 693 size_t cnt; 694 ssize_t sfv_len; 695 u_offset_t sfv_off; 696 #ifdef _SYSCALL32_IMPL 697 model_t model = get_udatamodel(); 698 u_offset_t maxoff = (model == DATAMODEL_ILP32) ? 699 MAXOFF32_T : MAXOFFSET_T; 700 #else 701 const u_offset_t maxoff = MAXOFF32_T; 702 #endif 703 mblk_t *dmp = NULL; 704 char *buf = NULL; 705 size_t extra = 0; 706 int maxblk, wroff, tail_len; 707 struct sonode *so; 708 stdata_t *stp; 709 struct nmsghdr msg; 710 711 maxblk = 0; 712 wroff = 0; 713 fflag = fp->f_flag; 714 vp = fp->f_vnode; 715 so = NULL; 716 stp = NULL; 717 718 if (vp->v_type == VSOCK) { 719 so = VTOSO(vp); 720 if (vp->v_stream != NULL) { 721 stp = vp->v_stream; 722 wroff = (int)stp->sd_wroff; 723 tail_len = (int)stp->sd_tail; 724 maxblk = (int)stp->sd_maxblk; 725 } else { 726 stp = NULL; 727 wroff = so->so_proto_props.sopp_wroff; 728 tail_len = so->so_proto_props.sopp_tail; 729 maxblk = so->so_proto_props.sopp_maxblk; 730 } 731 extra = wroff + tail_len; 732 } 733 734 bzero(&msg, sizeof (msg)); 735 auio.uio_extflg = UIO_COPY_DEFAULT; 736 for (i = 0; i < copy_cnt; i++) { 737 if (ISSIG(curthread, JUSTLOOKING)) 738 return (EINTR); 739 740 /* 741 * Do similar checks as "write" as we are writing 742 * sfv_len bytes into "vp". 743 */ 744 sfv_len = (ssize_t)sfv->sfv_len; 745 746 if (sfv_len == 0) { 747 sfv++; 748 continue; 749 } 750 751 if (vp->v_type == VREG) { 752 if (*fileoff >= curproc->p_fsz_ctl) { 753 mutex_enter(&curproc->p_lock); 754 (void) rctl_action( 755 rctlproc_legacy[RLIMIT_FSIZE], 756 curproc->p_rctls, curproc, RCA_SAFE); 757 mutex_exit(&curproc->p_lock); 758 759 return (EFBIG); 760 } 761 762 if (*fileoff >= maxoff) 763 return (EFBIG); 764 765 if (*fileoff + sfv_len > maxoff) 766 return (EINVAL); 767 } 768 769 /* Check for overflow */ 770 #ifdef _SYSCALL32_IMPL 771 if (model == DATAMODEL_ILP32) { 772 if (((ssize32_t)(*count + sfv_len)) < 0) 773 return (EINVAL); 774 } else 775 #endif 776 if ((*count + sfv_len) < 0) 777 return (EINVAL); 778 779 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off; 780 781 if (sfv->sfv_fd == SFV_FD_SELF) { 782 if (vp->v_type == VSOCK) { 783 while (sfv_len > 0) { 784 size_t iov_len; 785 786 iov_len = sfv_len; 787 /* 788 * Socket filters can limit the mblk 789 * size, so limit reads to maxblk if 790 * there are filters present. 791 */ 792 if (so->so_filter_active > 0 && 793 maxblk != INFPSZ) 794 iov_len = MIN(iov_len, maxblk); 795 796 aiov.iov_len = iov_len; 797 aiov.iov_base = 798 (caddr_t)(uintptr_t)sfv_off; 799 800 auio.uio_iov = &aiov; 801 auio.uio_iovcnt = 1; 802 auio.uio_loffset = *fileoff; 803 auio.uio_segflg = UIO_USERSPACE; 804 auio.uio_fmode = fflag; 805 auio.uio_llimit = curproc->p_fsz_ctl; 806 auio.uio_resid = iov_len; 807 808 dmp = allocb(iov_len + extra, BPRI_HI); 809 if (dmp == NULL) 810 return (ENOMEM); 811 dmp->b_wptr = dmp->b_rptr = 812 dmp->b_rptr + wroff; 813 error = uiomove((caddr_t)dmp->b_wptr, 814 iov_len, UIO_WRITE, &auio); 815 if (error != 0) { 816 freeb(dmp); 817 return (error); 818 } 819 dmp->b_wptr += iov_len; 820 error = socket_sendmblk(VTOSO(vp), 821 &msg, fflag, CRED(), &dmp); 822 823 if (error != 0) { 824 if (dmp != NULL) 825 freeb(dmp); 826 return (error); 827 } 828 ttolwp(curthread)->lwp_ru.ioch += 829 (ulong_t)iov_len; 830 *count += iov_len; 831 sfv_len -= iov_len; 832 sfv_off += iov_len; 833 } 834 } else { 835 aiov.iov_len = sfv_len; 836 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; 837 838 auio.uio_iov = &aiov; 839 auio.uio_iovcnt = 1; 840 auio.uio_loffset = *fileoff; 841 auio.uio_segflg = UIO_USERSPACE; 842 auio.uio_fmode = fflag; 843 auio.uio_llimit = curproc->p_fsz_ctl; 844 auio.uio_resid = sfv_len; 845 846 ioflag = auio.uio_fmode & 847 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 848 while (sfv_len > 0) { 849 error = VOP_WRITE(vp, &auio, ioflag, 850 fp->f_cred, NULL); 851 cnt = sfv_len - auio.uio_resid; 852 sfv_len -= cnt; 853 ttolwp(curthread)->lwp_ru.ioch += 854 (ulong_t)cnt; 855 *fileoff += cnt; 856 *count += cnt; 857 if (error != 0) 858 return (error); 859 } 860 } 861 } else { 862 int segmapit = 0; 863 file_t *ffp; 864 vnode_t *readvp; 865 struct vnode *realvp; 866 size_t size; 867 caddr_t ptr; 868 869 if ((ffp = getf(sfv->sfv_fd)) == NULL) 870 return (EBADF); 871 872 if ((ffp->f_flag & FREAD) == 0) { 873 releasef(sfv->sfv_fd); 874 return (EBADF); 875 } 876 877 readvp = ffp->f_vnode; 878 if (VOP_REALVP(readvp, &realvp, NULL) == 0) 879 readvp = realvp; 880 if (readvp->v_type != VREG) { 881 releasef(sfv->sfv_fd); 882 return (EINVAL); 883 } 884 885 /* 886 * No point reading and writing to same vp, 887 * as long as both are regular files. readvp is not 888 * locked; but since we got it from an open file the 889 * contents will be valid during the time of access. 890 */ 891 if (vn_compare(vp, readvp)) { 892 releasef(sfv->sfv_fd); 893 return (EINVAL); 894 } 895 896 /* 897 * Note: we assume readvp != vp. "vp" is already 898 * locked, and "readvp" must not be. 899 */ 900 if (readvp < vp) { 901 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 902 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 903 NULL); 904 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 905 } else { 906 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE, 907 NULL); 908 } 909 910 /* Same checks as in pread */ 911 if (sfv_off > maxoff) { 912 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 913 releasef(sfv->sfv_fd); 914 return (EINVAL); 915 } 916 if (sfv_off + sfv_len > maxoff) { 917 sfv_len = (ssize_t)((offset_t)maxoff - 918 sfv_off); 919 } 920 /* Find the native blocksize to transfer data */ 921 size = MIN(vp->v_vfsp->vfs_bsize, 922 readvp->v_vfsp->vfs_bsize); 923 size = sfv_len < size ? sfv_len : size; 924 925 if (vp->v_type != VSOCK) { 926 segmapit = 0; 927 buf = kmem_alloc(size, KM_NOSLEEP); 928 if (buf == NULL) { 929 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 930 NULL); 931 releasef(sfv->sfv_fd); 932 return (ENOMEM); 933 } 934 } else { 935 uint_t copyflag; 936 937 copyflag = stp != NULL ? stp->sd_copyflag : 938 so->so_proto_props.sopp_zcopyflag; 939 940 /* 941 * Socket filters can limit the mblk size, 942 * so limit reads to maxblk if there are 943 * filters present. 944 */ 945 if (so->so_filter_active > 0 && 946 maxblk != INFPSZ) 947 size = MIN(size, maxblk); 948 949 if (vn_has_flocks(readvp) || 950 readvp->v_flag & VNOMAP || 951 copyflag & STZCVMUNSAFE) { 952 segmapit = 0; 953 } else if (copyflag & STZCVMSAFE) { 954 segmapit = 1; 955 } else { 956 int on = 1; 957 if (socket_setsockopt(VTOSO(vp), 958 SOL_SOCKET, SO_SND_COPYAVOID, 959 &on, sizeof (on), CRED()) == 0) 960 segmapit = 1; 961 } 962 } 963 964 if (segmapit) { 965 struct vattr va; 966 boolean_t nowait; 967 968 va.va_mask = AT_SIZE; 969 error = VOP_GETATTR(readvp, &va, 0, kcred, 970 NULL); 971 if (error != 0 || sfv_off >= va.va_size) { 972 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 973 NULL); 974 releasef(sfv->sfv_fd); 975 return (error); 976 } 977 /* Read as much as possible. */ 978 if (sfv_off + sfv_len > va.va_size) 979 sfv_len = va.va_size - sfv_off; 980 981 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0; 982 error = snf_segmap(fp, readvp, sfv_off, 983 (u_offset_t)sfv_len, (ssize_t *)&cnt, 984 nowait); 985 releasef(sfv->sfv_fd); 986 *count += cnt; 987 if (error) 988 return (error); 989 sfv++; 990 continue; 991 } 992 993 while (sfv_len > 0) { 994 size_t iov_len; 995 996 iov_len = MIN(size, sfv_len); 997 998 if (vp->v_type == VSOCK) { 999 dmp = allocb(iov_len + extra, BPRI_HI); 1000 if (dmp == NULL) { 1001 VOP_RWUNLOCK(readvp, 1002 V_WRITELOCK_FALSE, NULL); 1003 releasef(sfv->sfv_fd); 1004 return (ENOMEM); 1005 } 1006 dmp->b_wptr = dmp->b_rptr = 1007 dmp->b_rptr + wroff; 1008 ptr = (caddr_t)dmp->b_rptr; 1009 } else { 1010 ptr = buf; 1011 } 1012 1013 aiov.iov_base = ptr; 1014 aiov.iov_len = iov_len; 1015 auio.uio_loffset = sfv_off; 1016 auio.uio_iov = &aiov; 1017 auio.uio_iovcnt = 1; 1018 auio.uio_resid = iov_len; 1019 auio.uio_segflg = UIO_SYSSPACE; 1020 auio.uio_llimit = MAXOFFSET_T; 1021 auio.uio_fmode = ffp->f_flag; 1022 ioflag = auio.uio_fmode & 1023 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1024 1025 /* 1026 * If read sync is not asked for, 1027 * filter sync flags 1028 */ 1029 if ((ioflag & FRSYNC) == 0) 1030 ioflag &= ~(FSYNC|FDSYNC); 1031 error = VOP_READ(readvp, &auio, ioflag, 1032 fp->f_cred, NULL); 1033 if (error != 0) { 1034 /* 1035 * If we were reading a pipe (currently 1036 * not implemented), we may now lose 1037 * data. 1038 */ 1039 if (vp->v_type == VSOCK) 1040 freeb(dmp); 1041 else 1042 kmem_free(buf, size); 1043 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1044 NULL); 1045 releasef(sfv->sfv_fd); 1046 return (error); 1047 } 1048 1049 /* 1050 * Check how much data was really read. 1051 * Decrement the 'len' and increment the 1052 * 'off' appropriately. 1053 */ 1054 cnt = iov_len - auio.uio_resid; 1055 if (cnt == 0) { 1056 if (vp->v_type == VSOCK) 1057 freeb(dmp); 1058 else 1059 kmem_free(buf, size); 1060 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, 1061 NULL); 1062 releasef(sfv->sfv_fd); 1063 return (EINVAL); 1064 } 1065 sfv_len -= cnt; 1066 sfv_off += cnt; 1067 1068 if (vp->v_type == VSOCK) { 1069 dmp->b_wptr = dmp->b_rptr + cnt; 1070 1071 error = socket_sendmblk(VTOSO(vp), 1072 &msg, fflag, CRED(), &dmp); 1073 1074 if (error != 0) { 1075 if (dmp != NULL) 1076 freeb(dmp); 1077 VOP_RWUNLOCK(readvp, 1078 V_WRITELOCK_FALSE, NULL); 1079 releasef(sfv->sfv_fd); 1080 return (error); 1081 } 1082 1083 ttolwp(curthread)->lwp_ru.ioch += 1084 (ulong_t)cnt; 1085 *count += cnt; 1086 } else { 1087 1088 aiov.iov_base = ptr; 1089 aiov.iov_len = cnt; 1090 auio.uio_loffset = *fileoff; 1091 auio.uio_resid = cnt; 1092 auio.uio_iov = &aiov; 1093 auio.uio_iovcnt = 1; 1094 auio.uio_segflg = UIO_SYSSPACE; 1095 auio.uio_llimit = curproc->p_fsz_ctl; 1096 auio.uio_fmode = fflag; 1097 ioflag = auio.uio_fmode & 1098 (FAPPEND|FSYNC|FDSYNC|FRSYNC); 1099 error = VOP_WRITE(vp, &auio, ioflag, 1100 fp->f_cred, NULL); 1101 1102 /* 1103 * Check how much data was written. 1104 * Increment the 'len' and decrement the 1105 * 'off' if all the data was not 1106 * written. 1107 */ 1108 cnt -= auio.uio_resid; 1109 sfv_len += auio.uio_resid; 1110 sfv_off -= auio.uio_resid; 1111 ttolwp(curthread)->lwp_ru.ioch += 1112 (ulong_t)cnt; 1113 *fileoff += cnt; 1114 *count += cnt; 1115 if (error != 0) { 1116 kmem_free(buf, size); 1117 VOP_RWUNLOCK(readvp, 1118 V_WRITELOCK_FALSE, NULL); 1119 releasef(sfv->sfv_fd); 1120 return (error); 1121 } 1122 } 1123 } 1124 if (buf) { 1125 kmem_free(buf, size); 1126 buf = NULL; 1127 } 1128 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL); 1129 releasef(sfv->sfv_fd); 1130 } 1131 sfv++; 1132 } 1133 return (0); 1134 } 1135 1136 ssize_t 1137 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, 1138 size_t *xferred) 1139 { 1140 int error = 0; 1141 int first_vector_error = 0; 1142 file_t *fp; 1143 struct vnode *vp; 1144 struct sonode *so = NULL; 1145 u_offset_t fileoff; 1146 int copy_cnt; 1147 const struct sendfilevec *copy_vec; 1148 struct sendfilevec sfv[SEND_MAX_CHUNK]; 1149 ssize_t count = 0; 1150 #ifdef _SYSCALL32_IMPL 1151 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK]; 1152 #endif 1153 ssize_t total_size; 1154 int i; 1155 boolean_t is_sock = B_FALSE; 1156 int maxblk = 0; 1157 1158 if (sfvcnt <= 0) 1159 return (set_errno(EINVAL)); 1160 1161 if ((fp = getf(fildes)) == NULL) 1162 return (set_errno(EBADF)); 1163 1164 if (((fp->f_flag) & FWRITE) == 0) { 1165 error = EBADF; 1166 goto err; 1167 } 1168 1169 fileoff = fp->f_offset; 1170 vp = fp->f_vnode; 1171 1172 switch (vp->v_type) { 1173 case VSOCK: 1174 so = VTOSO(vp); 1175 is_sock = B_TRUE; 1176 if (SOCK_IS_NONSTR(so)) { 1177 maxblk = so->so_proto_props.sopp_maxblk; 1178 } else { 1179 maxblk = (int)vp->v_stream->sd_maxblk; 1180 } 1181 1182 /* 1183 * We need to make sure that the socket that we're sending on 1184 * supports sendfile behavior. sockfs doesn't know that the APIs 1185 * we want to use are coming from sendfile, so we can't rely on 1186 * it to check for us. 1187 */ 1188 if ((so->so_mode & SM_SENDFILESUPP) == 0) { 1189 error = EOPNOTSUPP; 1190 goto err; 1191 } 1192 break; 1193 case VREG: 1194 break; 1195 default: 1196 error = EINVAL; 1197 goto err; 1198 } 1199 1200 switch (opcode) { 1201 case SENDFILEV : 1202 break; 1203 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) 1204 case SENDFILEV64 : 1205 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt, 1206 (size32_t *)xferred, fildes)); 1207 #endif 1208 default : 1209 error = ENOSYS; 1210 break; 1211 } 1212 1213 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL); 1214 copy_vec = vec; 1215 1216 do { 1217 total_size = 0; 1218 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK); 1219 #ifdef _SYSCALL32_IMPL 1220 /* 32-bit callers need to have their iovec expanded. */ 1221 if (get_udatamodel() == DATAMODEL_ILP32) { 1222 if (copyin(copy_vec, sfv32, 1223 copy_cnt * sizeof (ksendfilevec32_t))) { 1224 error = EFAULT; 1225 break; 1226 } 1227 1228 for (i = 0; i < copy_cnt; i++) { 1229 sfv[i].sfv_fd = sfv32[i].sfv_fd; 1230 sfv[i].sfv_off = 1231 (off_t)(uint32_t)sfv32[i].sfv_off; 1232 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len; 1233 total_size += sfv[i].sfv_len; 1234 sfv[i].sfv_flag = sfv32[i].sfv_flag; 1235 /* 1236 * Individual elements of the vector must not 1237 * wrap or overflow, as later math is signed. 1238 * Equally total_size needs to be checked after 1239 * each vector is added in, to be sure that 1240 * rogue values haven't overflowed the counter. 1241 */ 1242 if (((ssize32_t)sfv[i].sfv_len < 0) || 1243 ((ssize32_t)total_size < 0)) { 1244 /* 1245 * Truncate the vector to send data 1246 * described by elements before the 1247 * error. 1248 */ 1249 copy_cnt = i; 1250 first_vector_error = EINVAL; 1251 /* total_size can't be trusted */ 1252 if ((ssize32_t)total_size < 0) 1253 error = EINVAL; 1254 break; 1255 } 1256 } 1257 /* Nothing to do, process errors */ 1258 if (copy_cnt == 0) 1259 break; 1260 1261 } else { 1262 #endif 1263 if (copyin(copy_vec, sfv, 1264 copy_cnt * sizeof (sendfilevec_t))) { 1265 error = EFAULT; 1266 break; 1267 } 1268 1269 for (i = 0; i < copy_cnt; i++) { 1270 total_size += sfv[i].sfv_len; 1271 /* 1272 * Individual elements of the vector must not 1273 * wrap or overflow, as later math is signed. 1274 * Equally total_size needs to be checked after 1275 * each vector is added in, to be sure that 1276 * rogue values haven't overflowed the counter. 1277 */ 1278 if (((ssize_t)sfv[i].sfv_len < 0) || 1279 (total_size < 0)) { 1280 /* 1281 * Truncate the vector to send data 1282 * described by elements before the 1283 * error. 1284 */ 1285 copy_cnt = i; 1286 first_vector_error = EINVAL; 1287 /* total_size can't be trusted */ 1288 if (total_size < 0) 1289 error = EINVAL; 1290 break; 1291 } 1292 } 1293 /* Nothing to do, process errors */ 1294 if (copy_cnt == 0) 1295 break; 1296 #ifdef _SYSCALL32_IMPL 1297 } 1298 #endif 1299 1300 /* 1301 * The task between deciding to use sendvec_small_chunk 1302 * and sendvec_chunk is dependant on multiple things: 1303 * 1304 * i) latency is important for smaller files. So if the 1305 * data is smaller than 'tcp_slow_start_initial' times 1306 * maxblk, then use sendvec_small_chunk which creates 1307 * maxblk size mblks and chains them together and sends 1308 * them to TCP in one shot. It also leaves 'wroff' size 1309 * space for the headers in each mblk. 1310 * 1311 * ii) for total size bigger than 'tcp_slow_start_initial' 1312 * time maxblk, its probably real file data which is 1313 * dominating. So its better to use sendvec_chunk because 1314 * performance goes to dog if we don't do pagesize reads. 1315 * sendvec_chunk will do pagesize reads and write them 1316 * in pagesize mblks to TCP. 1317 * 1318 * Side Notes: A write to file has not been optimized. 1319 * Future zero copy code will plugin into sendvec_chunk 1320 * only because doing zero copy for files smaller then 1321 * pagesize is useless. 1322 * 1323 * Note, if socket has NL7C enabled then call NL7C's 1324 * senfilev() function to consume the sfv[]. 1325 */ 1326 if (is_sock) { 1327 if (!SOCK_IS_NONSTR(so) && 1328 _SOTOTPI(so)->sti_nl7c_flags != 0) { 1329 error = nl7c_sendfilev(so, &fileoff, 1330 sfv, copy_cnt, &count); 1331 } else if ((total_size <= (4 * maxblk)) && 1332 error == 0) { 1333 error = sendvec_small_chunk(fp, 1334 &fileoff, sfv, copy_cnt, 1335 total_size, maxblk, &count); 1336 } else { 1337 error = sendvec_chunk(fp, &fileoff, 1338 sfv, copy_cnt, &count); 1339 } 1340 } else { 1341 ASSERT(vp->v_type == VREG); 1342 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt, 1343 &count); 1344 } 1345 1346 1347 #ifdef _SYSCALL32_IMPL 1348 if (get_udatamodel() == DATAMODEL_ILP32) { 1349 copy_vec = (const struct sendfilevec *) 1350 ((char *)copy_vec + 1351 (copy_cnt * sizeof (ksendfilevec32_t))); 1352 } else 1353 #endif 1354 copy_vec += copy_cnt; 1355 sfvcnt -= copy_cnt; 1356 1357 /* Process all vector members up to first error */ 1358 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0); 1359 1360 if (vp->v_type == VREG) 1361 fp->f_offset += count; 1362 1363 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL); 1364 1365 #ifdef _SYSCALL32_IMPL 1366 if (get_udatamodel() == DATAMODEL_ILP32) { 1367 ssize32_t count32 = (ssize32_t)count; 1368 if (copyout(&count32, xferred, sizeof (count32))) 1369 error = EFAULT; 1370 releasef(fildes); 1371 if (error != 0) 1372 return (set_errno(error)); 1373 if (first_vector_error != 0) 1374 return (set_errno(first_vector_error)); 1375 return (count32); 1376 } 1377 #endif 1378 if (copyout(&count, xferred, sizeof (count))) 1379 error = EFAULT; 1380 releasef(fildes); 1381 if (error != 0) 1382 return (set_errno(error)); 1383 if (first_vector_error != 0) 1384 return (set_errno(first_vector_error)); 1385 return (count); 1386 err: 1387 ASSERT(error != 0); 1388 releasef(fildes); 1389 return (set_errno(error)); 1390 } 1391