1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2022 Garrett D'Amore 25 */ 26 27 #include <sys/types.h> 28 #include <sys/t_lock.h> 29 #include <sys/param.h> 30 #include <sys/systm.h> 31 #include <sys/buf.h> 32 #include <sys/conf.h> 33 #include <sys/cred.h> 34 #include <sys/kmem.h> 35 #include <sys/sysmacros.h> 36 #include <sys/vfs.h> 37 #include <sys/vnode.h> 38 #include <sys/debug.h> 39 #include <sys/errno.h> 40 #include <sys/time.h> 41 #include <sys/file.h> 42 #include <sys/open.h> 43 #include <sys/user.h> 44 #include <sys/termios.h> 45 #include <sys/stream.h> 46 #include <sys/strsubr.h> 47 #include <sys/sunddi.h> 48 #include <sys/esunddi.h> 49 #include <sys/flock.h> 50 #include <sys/modctl.h> 51 #include <sys/cmn_err.h> 52 #include <sys/vmsystm.h> 53 54 #include <sys/socket.h> 55 #include <sys/socketvar.h> 56 #include <fs/sockfs/sockcommon.h> 57 #include <fs/sockfs/socktpi.h> 58 59 #include <netinet/in.h> 60 #include <sys/sendfile.h> 61 #include <sys/un.h> 62 #include <sys/tihdr.h> 63 #include <sys/atomic.h> 64 65 #include <inet/common.h> 66 #include <inet/ip.h> 67 
#include <inet/ip6.h>
#include <inet/tcp.h>

extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
	ssize32_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
	boolean_t);
extern sotpi_info_t *sotpi_sototpi(struct sonode *);

/* Maximum number of sendfilevec entries copied in from userland per pass. */
#define	SEND_MAX_CHUNK	16

#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * 64 bit offsets for 32 bit applications only running either on
 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
 * more than 2GB of data.
 */
/*
 * Process up to copy_cnt entries of an already-copied-in ksendfilevec64
 * array, writing each entry's data into "vp" (fp's vnode, which the caller
 * holds write-locked).  SFV_FD_SELF entries are written directly from user
 * memory; other entries name a regular-file descriptor whose contents are
 * read into a temporary kernel buffer and then written out.  *fileoff and
 * *count are advanced by the bytes transferred.  Returns 0 or an errno;
 * on a partial-failure return, *count reflects what was transferred.
 */
static int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		/* Bail out early if a signal is pending against us. */
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			/*
			 * Enforce the process file-size resource control
			 * before writing past the limit (same behavior as
			 * write(2): deliver the RLIMIT_FSIZE action).
			 */
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		/* Reject vectors whose running total would overflow. */
		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			/*
			 * SFV_FD_SELF: sfv_off is really a user virtual
			 * address; write the bytes straight from user space.
			 */
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				/* Account for whatever was written. */
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t	*ffp;
			vnode_t	*readvp;
			size_t	size;
			caddr_t	ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Optimize the regular file over
			 * the socket case.
			 *
			 * NOTE(review): this path hands ffp to
			 * sosendfile64() without a local releasef();
			 * presumably sosendfile64() releases the fd on
			 * all paths — confirm against its definition.
			 */
			if (vp->v_type == VSOCK) {
				error = sosendfile64(fp, ffp, sfv,
				    (ssize32_t *)&cnt);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				/*
				 * Acquire in address order to keep a
				 * consistent lock ordering: drop vp, take
				 * readvp, then retake vp.
				 */
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			ptr = kmem_alloc(size, KM_NOSLEEP);
			if (ptr == NULL) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (ENOMEM);
			}

			/* Copy loop: read into ptr, then write out. */
			while (sfv_len > 0) {
				size_t	iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}

/*
 * SENDFILEV64 driver: copy the user's ksendfilevec64 array in at most
 * SEND_MAX_CHUNK entries at a time and feed each batch to
 * sendvec_chunk64() while holding fp's vnode write-locked.  On the way
 * out, advance f_offset for regular files, copy the byte count out to
 * *xferred, and release fildes.  Returns the count, or sets errno and
 * returns -1 via set_errno() on failure.
 */
static ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	u_offset_t	fileoff;
	int		copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
#endif

/*
 * Small-transfer socket path: build a chain of maxblk-sized mblks (each
 * with 'wroff' headroom and 'tail_len' tailroom reserved for protocol
 * headers/trailers), fill them from the vector entries — either directly
 * from user memory (SFV_FD_SELF) or by VOP_READ from a regular file —
 * and hand the whole chain to the socket in one socket_sendmblk() call.
 * The caller guarantees vp is a write-locked VSOCK vnode and that
 * total_size is the exact sum of the vector lengths.  On success the
 * full total_size is accounted into *count; any error frees the chain
 * and transfers nothing.
 */
static int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t	iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;
	struct nmsghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	/* If nothing to send, return */
	if (total_size == 0)
		return (0);

	/*
	 * Pick up the write offset and tail reservation from the stream
	 * head for STREAMS sockets, or from the protocol properties for
	 * non-STREAMS sockets.
	 */
	if (vp->v_stream != NULL) {
		wroff = (int)vp->v_stream->sd_wroff;
		tail_len = (int)vp->v_stream->sd_tail;
	} else {
		struct sonode *so;

		so = VTOSO(vp);
		wroff = so->so_proto_props.sopp_wroff;
		tail_len = so->so_proto_props.sopp_tail;
	}

	extra = wroff + tail_len;

	/* Allocate the first mblk of the chain, leaving wroff headroom. */
	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
	bzero(&msg, sizeof (msg));

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			/*
			 * SFV_FD_SELF: sfv_off is a user virtual address;
			 * uiomove() the bytes into the mblk chain, growing
			 * the chain a maxblk-sized mblk at a time.
			 */
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t	*ffp;
			vnode_t	*readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */

			if (readvp < vp) {
				/*
				 * Keep lock acquisition in address order:
				 * drop vp, take readvp, retake vp.
				 */
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				/* Clamp to maxoff; keep total_size honest. */
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			/* VOP_READ file data directly into the mblk chain. */
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/* Short read (e.g. past EOF). */
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	/* The whole request must now be staged in the chain. */
	ASSERT(total_size == 0);
	/*
	 * Send the complete chain in one shot; socket_sendmblk() consumes
	 * the chain on success and may leave it for us to free on error.
	 */
	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
	if (error != 0) {
		if (head != NULL)
			freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}


/*
 * General chunk worker for both socket and regular-file targets.
 * For each vector entry: SFV_FD_SELF data is written straight from user
 * memory (built into per-iteration mblks for sockets); file-to-socket
 * transfers may use the zero-copy segmap path (snf_segmap) when the
 * source file and socket permit it, otherwise data is staged through a
 * blocksize kernel buffer with a VOP_READ/VOP_WRITE (or socket_sendmblk)
 * loop.  *fileoff and *count are advanced by the bytes transferred.
 * Returns 0 or an errno.
 */
static int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t	extra = 0;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;
	struct nmsghdr msg;

	maxblk = 0;
	wroff = 0;
	fflag = fp->f_flag;
	vp = fp->f_vnode;
	so = NULL;
	stp = NULL;

	/*
	 * For sockets, pick up the mblk sizing parameters from the stream
	 * head (STREAMS sockets) or the protocol properties (non-STREAMS).
	 */
	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		if (vp->v_stream != NULL) {
			stp = vp->v_stream;
			wroff = (int)stp->sd_wroff;
			tail_len = (int)stp->sd_tail;
			maxblk = (int)stp->sd_maxblk;
		} else {
			stp = NULL;
			wroff = so->so_proto_props.sopp_wroff;
			tail_len = so->so_proto_props.sopp_tail;
			maxblk = so->so_proto_props.sopp_maxblk;
		}
		extra = wroff + tail_len;
	}

	bzero(&msg, sizeof (msg));
	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (vp->v_type == VREG) {
			/* Same RLIMIT_FSIZE enforcement as write(2). */
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			if (vp->v_type == VSOCK) {
				/*
				 * User memory to socket: copy each piece
				 * into a freshly allocated mblk and send it
				 * immediately.
				 */
				while (sfv_len > 0) {
					size_t iov_len;

					iov_len = sfv_len;
					/*
					 * Socket filters can limit the mblk
					 * size, so limit reads to maxblk if
					 * there are filters present.
					 */
					if (so->so_filter_active > 0 &&
					    maxblk != INFPSZ)
						iov_len = MIN(iov_len, maxblk);

					aiov.iov_len = iov_len;
					aiov.iov_base =
					    (caddr_t)(uintptr_t)sfv_off;

					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_loffset = *fileoff;
					auio.uio_segflg = UIO_USERSPACE;
					auio.uio_fmode = fflag;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_resid = iov_len;

					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL)
						return (ENOMEM);
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					error = uiomove((caddr_t)dmp->b_wptr,
					    iov_len, UIO_WRITE, &auio);
					if (error != 0) {
						freeb(dmp);
						return (error);
					}
					dmp->b_wptr += iov_len;
					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						return (error);
					}
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)iov_len;
					*count += iov_len;
					sfv_len -= iov_len;
					sfv_off += iov_len;
				}
			} else {
				/* User memory to regular file. */
				aiov.iov_len = sfv_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;

				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_loffset = *fileoff;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_fmode = fflag;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_resid = sfv_len;

				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t	*ffp;
			vnode_t	*readvp;
			struct vnode *realvp;
			size_t	size;
			caddr_t	ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			/* See through layered file systems (e.g. lofs). */
			readvp = ffp->f_vnode;
			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 */
			if (readvp < vp) {
				/*
				 * Address-ordered lock acquisition: drop vp,
				 * take readvp, retake vp.
				 */
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				/* File target: stage through a bounce buf. */
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				uint_t copyflag;

				copyflag = stp != NULL ? stp->sd_copyflag :
				    so->so_proto_props.sopp_zcopyflag;

				/*
				 * Socket filters can limit the mblk size,
				 * so limit reads to maxblk if there are
				 * filters present.
				 */
				if (so->so_filter_active > 0 &&
				    maxblk != INFPSZ)
					size = MIN(size, maxblk);

				/*
				 * Decide whether the zero-copy segmap path
				 * can be used: ruled out by file locks,
				 * unmappable vnodes, or a VM-unsafe stream;
				 * otherwise enable copy-avoidance on the
				 * socket and try it.
				 */
				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					int on = 1;
					if (socket_setsockopt(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on), CRED()) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				struct vattr va;
				boolean_t nowait;

				va.va_mask = AT_SIZE;
				error = VOP_GETATTR(readvp, &va, 0, kcred,
				    NULL);
				if (error != 0 || sfv_off >= va.va_size) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
				/* Read as much as possible. */
				if (sfv_off + sfv_len > va.va_size)
					sfv_len = va.va_size - sfv_off;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
				    nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			/* Copy loop: read into ptr, write/send out. */
			while (sfv_len > 0) {
				size_t	iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/* Short read (e.g. past EOF). */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}

/*
 * sendfilev() system call entry point.
 *
 * Validates fildes (must be writable; a VREG file or a VSOCK socket that
 * advertises SM_SENDFILESUPP), dispatches SENDFILEV64 to sendvec64(),
 * then copies the user's vector in SEND_MAX_CHUNK-sized batches.  Each
 * batch is validated against signed overflow (a bad element truncates
 * the batch and records first_vector_error) and handed to either
 * sendvec_small_chunk() or sendvec_chunk() — see the heuristic comment
 * in the loop body.  The total byte count is both returned and copied
 * out to *xferred (32-bit for ILP32 callers).  Errors are reported via
 * set_errno(); bytes already transferred before an error are still
 * reflected in *xferred and f_offset.
 */
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error = 0;
	int first_vector_error = 0;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so = NULL;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		is_sock = B_TRUE;
		if (SOCK_IS_NONSTR(so)) {
			maxblk = so->so_proto_props.sopp_maxblk;
		} else {
			maxblk = (int)vp->v_stream->sd_maxblk;
		}

		/*
		 * We need to make sure that the socket that we're sending on
		 * supports sendfile behavior. sockfs doesn't know that the APIs
		 * we want to use are coming from sendfile, so we can't rely on
		 * it to check for us.
		 */
		if ((so->so_mode & SM_SENDFILESUPP) == 0) {
			error = EOPNOTSUPP;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV :
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64 :
		/* sendvec64() releases fildes and sets errno itself. */
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default :
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize32_t)sfv[i].sfv_len < 0) ||
				    ((ssize32_t)total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if ((ssize32_t)total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;

		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize_t)sfv[i].sfv_len < 0) ||
				    (total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if (total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The choice between sendvec_small_chunk
		 * and sendvec_chunk is dependent on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk which creates
		 * maxblk size mblks and chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for total size bigger than 'tcp_slow_start_initial'
		 * time maxblk, its probably real file data which is
		 * dominating. So its better to use sendvec_chunk because
		 * performance suffers if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plugin into sendvec_chunk
		 * only because doing zero copy for files smaller than
		 * pagesize is useless.
		 */
		if (is_sock) {
			if ((total_size <= (4 * maxblk)) &&
			    error == 0) {
				error = sendvec_small_chunk(fp,
				    &fileoff, sfv, copy_cnt,
				    total_size, maxblk, &count);
			} else {
				error = sendvec_chunk(fp, &fileoff,
				    sfv, copy_cnt, &count);
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32) {
			copy_vec = (const struct sendfilevec *)
			    ((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		} else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;

	/* Process all vector members up to first error */
	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		if (first_vector_error != 0)
			return (set_errno(first_vector_error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	if (first_vector_error != 0)
		return (set_errno(first_vector_error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}