1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2022 Garrett D'Amore
25 */
26
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/user.h>
44 #include <sys/termios.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/sunddi.h>
48 #include <sys/esunddi.h>
49 #include <sys/flock.h>
50 #include <sys/modctl.h>
51 #include <sys/cmn_err.h>
52 #include <sys/vmsystm.h>
53
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <fs/sockfs/sockcommon.h>
57 #include <fs/sockfs/socktpi.h>
58
59 #include <netinet/in.h>
60 #include <sys/sendfile.h>
61 #include <sys/un.h>
62 #include <sys/tihdr.h>
63 #include <sys/atomic.h>
64
65 #include <inet/common.h>
66 #include <inet/ip.h>
67 #include <inet/ip6.h>
68 #include <inet/tcp.h>
69
70 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
71 ssize32_t *);
72 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
73 boolean_t);
74 extern sotpi_info_t *sotpi_sototpi(struct sonode *);
75
76 #define SEND_MAX_CHUNK 16
77
78 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
79 /*
80 * 64 bit offsets for 32 bit applications only running either on
81 * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
82 * more than 2GB of data.
83 */
/*
 * Process one batch of up to SEND_MAX_CHUNK ksendfilevec64 entries against
 * the target vnode underlying "fp" (which the caller, sendvec64(), has
 * already write-locked with VOP_RWLOCK).  Each vector entry either carries
 * user-space data directly (sfv_fd == SFV_FD_SELF) or names another open
 * regular file whose contents are copied to "fp".
 *
 * *fileoff and *count are advanced as bytes are written, so partial
 * progress is visible to the caller even when an errno value is returned.
 * Returns 0 on success or an errno value.
 */
static int
sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    int copy_cnt, ssize32_t *count)
{
	struct vnode *vp;
	ushort_t fflag;
	int ioflag;
	size32_t cnt;
	ssize32_t sfv_len;
	ssize32_t tmpcount;
	u_offset_t sfv_off;
	struct uio auio;
	struct iovec aiov;
	int i, error;

	fflag = fp->f_flag;
	vp = fp->f_vnode;
	for (i = 0; i < copy_cnt; i++) {

		/* Abort promptly if a signal is pending for this thread. */
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize32_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (sfv_len < 0)
			return (EINVAL);

		if (vp->v_type == VREG) {
			/*
			 * Enforce the process file-size resource control,
			 * delivering the customary RLIMIT_FSIZE action
			 * (e.g. SIGXFSZ) before failing with EFBIG.
			 */
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);
				return (EFBIG);
			}

			if (*fileoff >= OFFSET_MAX(fp))
				return (EFBIG);

			if (*fileoff + sfv_len > OFFSET_MAX(fp))
				return (EINVAL);
		}

		/* Reject a running total that would overflow ssize32_t. */
		tmpcount = *count + sfv_len;
		if (tmpcount < 0)
			return (EINVAL);

		sfv_off = sfv->sfv_off;

		auio.uio_extflg = UIO_COPY_DEFAULT;
		if (sfv->sfv_fd == SFV_FD_SELF) {
			/*
			 * SFV_FD_SELF: sfv_off is really a user-space
			 * address; write those bytes to vp directly.
			 */
			aiov.iov_len = sfv_len;
			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
			auio.uio_loffset = *fileoff;
			auio.uio_iovcnt = 1;
			auio.uio_resid = sfv_len;
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_USERSPACE;
			auio.uio_llimit = curproc->p_fsz_ctl;
			auio.uio_fmode = fflag;
			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
			while (sfv_len > 0) {
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);
				/* Account whatever was written, even on error. */
				cnt = sfv_len - auio.uio_resid;
				sfv_len -= cnt;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0)
					return (error);
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Optimize the regular file over
			 * the socket case.
			 */
			if (vp->v_type == VSOCK) {
				error = sosendfile64(fp, ffp, sfv,
				    (ssize32_t *)&cnt);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 *
			 * Acquire the two rwlocks in address order to avoid
			 * deadlock with another thread locking the same pair
			 * in the opposite order.
			 */
			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/*
			 * Same checks as in pread64.
			 */
			if (sfv_off > MAXOFFSET_T) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			if (sfv_off + sfv_len > MAXOFFSET_T)
				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;
			/* KM_NOSLEEP: fail with ENOMEM rather than block here. */
			ptr = kmem_alloc(size, KM_NOSLEEP);
			if (ptr == NULL) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (ENOMEM);
			}

			/* Copy loop: read up to "size" bytes, then write them. */
			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);
				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				aiov.iov_base = ptr;
				aiov.iov_len = cnt;
				auio.uio_loffset = *fileoff;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = cnt;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				error = VOP_WRITE(vp, &auio, ioflag,
				    fp->f_cred, NULL);

				/*
				 * Check how much data was written. Increment
				 * the 'len' and decrement the 'off' if all
				 * the data was not written.
				 */
				cnt -= auio.uio_resid;
				sfv_len += auio.uio_resid;
				sfv_off -= auio.uio_resid;
				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
				if (vp->v_type == VREG)
					*fileoff += cnt;
				*count += cnt;
				if (error != 0) {
					kmem_free(ptr, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
			kmem_free(ptr, size);
		}
		sfv++;
	}
	return (0);
}
344
/*
 * SENDFILEV64 path: 64-bit file offsets for 32-bit callers (up to 2GB of
 * data per call, since the count is a ssize32_t).  Copies the user's
 * ksendfilevec64 array in SEND_MAX_CHUNK-sized batches and hands each
 * batch to sendvec_chunk64() with the target vnode write-locked.
 *
 * On success returns the byte count transferred; on failure returns -1 via
 * set_errno().  *xferred is updated with the partial count either way.
 * Note this function consumes the caller's hold on "fildes" (releasef).
 */
static ssize32_t
sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    size32_t *xferred, int fildes)
{
	u_offset_t fileoff;
	int copy_cnt;
	const struct ksendfilevec64 *copy_vec;
	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
	struct vnode *vp;
	int error;
	ssize32_t count = 0;

	vp = fp->f_vnode;
	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);

	copy_vec = vec;
	fileoff = fp->f_offset;

	do {
		/* Copy in at most SEND_MAX_CHUNK vector entries at a time. */
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
		if (copyin(copy_vec, sfv, copy_cnt *
		    sizeof (struct ksendfilevec64))) {
			error = EFAULT;
			break;
		}

		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
		if (error != 0)
			break;

		copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;
	} while (sfvcnt > 0);

	/* Advance the file offset by what was actually written. */
	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	if (error != 0)
		return (set_errno(error));
	return (count);
}
390 #endif
391
/*
 * Latency-optimized path for small transfers to a socket: instead of
 * issuing one send per vector entry, gather the entire total_size bytes
 * into a chain of maxblk-sized mblks (each with "wroff" bytes of headroom
 * and "tail_len" of tailroom reserved for protocol headers/trailers) and
 * push the whole chain to the transport with a single socket_sendmblk().
 *
 * The caller (sendfilev) holds the socket vnode write-locked and has
 * verified vp->v_type == VSOCK.  On success *count is advanced by "size"
 * (the original total_size); on error the mblk chain is freed and nothing
 * is added to *count.  Returns 0 or an errno value.
 */
static int
sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	/* Offsets from 32-bit callers are capped at MAXOFF32_T. */
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	int wroff;
	int buf_left = 0;
	size_t iov_len;
	mblk_t *head, *tmp;
	size_t size = total_size;
	size_t extra;
	int tail_len;
	struct nmsghdr msg;

	fflag = fp->f_flag;
	vp = fp->f_vnode;

	ASSERT(vp->v_type == VSOCK);
	ASSERT(maxblk > 0);

	/* If nothing to send, return */
	if (total_size == 0)
		return (0);

	/*
	 * Fetch the transport's preferred write offset and tail length,
	 * from the stream head for STREAMS sockets or from the protocol
	 * properties for non-STREAMS sockets.
	 */
	if (vp->v_stream != NULL) {
		wroff = (int)vp->v_stream->sd_wroff;
		tail_len = (int)vp->v_stream->sd_tail;
	} else {
		struct sonode *so;

		so = VTOSO(vp);
		wroff = so->so_proto_props.sopp_wroff;
		tail_len = so->so_proto_props.sopp_tail;
	}

	extra = wroff + tail_len;

	/* Allocate the first mblk of the chain, leaving wroff headroom. */
	buf_left = MIN(total_size, maxblk);
	head = dmp = allocb(buf_left + extra, BPRI_HI);
	if (head == NULL)
		return (ENOMEM);
	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
	bzero(&msg, sizeof (msg));

	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			freemsg(head);
			return (EINTR);
		}

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0) {
				freemsg(head);
				return (EINVAL);
			}
		} else
#endif
		if ((*count + sfv_len) < 0) {
			freemsg(head);
			return (EINVAL);
		}

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			/*
			 * SFV_FD_SELF: sfv_off is a user address; copy the
			 * bytes straight into the mblk chain, growing the
			 * chain whenever the current mblk fills up.
			 */
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}

				aiov.iov_len = iov_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
				auio.uio_loffset = *fileoff;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_iov = &aiov;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_fmode = fflag;

				buf_left -= iov_len;
				total_size -= iov_len;
				sfv_len -= iov_len;
				sfv_off += iov_len;

				error = uiomove((caddr_t)dmp->b_wptr,
				    iov_len, UIO_WRITE, &auio);
				if (error != 0) {
					freemsg(head);
					return (error);
				}
				dmp->b_wptr += iov_len;
			}
		} else {
			file_t *ffp;
			vnode_t *readvp;

			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
				freemsg(head);
				return (EBADF);
			}

			/*
			 * NOTE(review): this path returns EACCES for a
			 * non-readable fd while sendvec_chunk{,64} return
			 * EBADF for the same condition — longstanding
			 * inconsistency, preserved here.
			 */
			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EACCES);
			}

			readvp = ffp->f_vnode;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */

			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 *
			 * Take the two locks in address order to avoid
			 * deadlocking against another thread locking the
			 * same pair.
			 */

			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				freemsg(head);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				/* Clamp, and shrink total_size to match. */
				total_size -= (sfv_off + sfv_len - maxoff);
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}

			/* Read file data directly into the mblk chain. */
			while (sfv_len > 0) {
				if (buf_left == 0) {
					tmp = dmp;
					buf_left = MIN(total_size, maxblk);
					iov_len = MIN(buf_left, sfv_len);
					dmp = allocb(buf_left + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						freemsg(head);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					tmp->b_cont = dmp;
				} else {
					iov_len = MIN(buf_left, sfv_len);
				}
				aiov.iov_base = (caddr_t)dmp->b_wptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/* Unexpected EOF mid-vector. */
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					freemsg(head);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;
				total_size -= cnt;
				buf_left -= cnt;

				dmp->b_wptr += cnt;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}

	/* Every byte promised must now be in the chain. */
	ASSERT(total_size == 0);
	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
	if (error != 0) {
		/* socket_sendmblk may consume the chain; free any residue. */
		if (head != NULL)
			freemsg(head);
		return (error);
	}
	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
	*count += size;

	return (0);
}
680
681
/*
 * General-purpose path: process one batch of up to SEND_MAX_CHUNK
 * sendfilevec entries against "fp", which may be a regular file or a
 * socket (caller holds the target vnode write-locked).  User data
 * (SFV_FD_SELF) is copied either into mblks for a socket or written
 * via VOP_WRITE for a file.  File-to-socket transfers additionally try
 * the zero-copy snf_segmap() path when the transport advertises it is
 * VM-safe (or accepts SO_SND_COPYAVOID); otherwise data is shuttled
 * through a native-blocksize bounce buffer.
 *
 * *fileoff and *count are advanced as bytes move, so partial progress is
 * preserved on error.  Returns 0 or an errno value.
 */
static int
sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    int copy_cnt, ssize_t *count)
{
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	ushort_t fflag;
	int ioflag;
	int i, error;
	size_t cnt;
	ssize_t sfv_len;
	u_offset_t sfv_off;
#ifdef _SYSCALL32_IMPL
	/* Offsets from 32-bit callers are capped at MAXOFF32_T. */
	model_t model = get_udatamodel();
	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
	    MAXOFF32_T : MAXOFFSET_T;
#else
	const u_offset_t maxoff = MAXOFF32_T;
#endif
	mblk_t *dmp = NULL;
	char *buf = NULL;
	size_t extra = 0;
	int maxblk, wroff, tail_len;
	struct sonode *so;
	stdata_t *stp;
	struct nmsghdr msg;

	maxblk = 0;
	wroff = 0;
	fflag = fp->f_flag;
	vp = fp->f_vnode;
	so = NULL;
	stp = NULL;

	/*
	 * For sockets, fetch the transport's mblk sizing parameters from
	 * the stream head (STREAMS sockets) or the protocol properties.
	 */
	if (vp->v_type == VSOCK) {
		so = VTOSO(vp);
		if (vp->v_stream != NULL) {
			stp = vp->v_stream;
			wroff = (int)stp->sd_wroff;
			tail_len = (int)stp->sd_tail;
			maxblk = (int)stp->sd_maxblk;
		} else {
			stp = NULL;
			wroff = so->so_proto_props.sopp_wroff;
			tail_len = so->so_proto_props.sopp_tail;
			maxblk = so->so_proto_props.sopp_maxblk;
		}
		extra = wroff + tail_len;
	}

	bzero(&msg, sizeof (msg));
	auio.uio_extflg = UIO_COPY_DEFAULT;
	for (i = 0; i < copy_cnt; i++) {
		if (ISSIG(curthread, JUSTLOOKING))
			return (EINTR);

		/*
		 * Do similar checks as "write" as we are writing
		 * sfv_len bytes into "vp".
		 */
		sfv_len = (ssize_t)sfv->sfv_len;

		if (sfv_len == 0) {
			sfv++;
			continue;
		}

		if (vp->v_type == VREG) {
			/*
			 * Enforce the process file-size resource control,
			 * delivering the customary RLIMIT_FSIZE action
			 * before failing with EFBIG.
			 */
			if (*fileoff >= curproc->p_fsz_ctl) {
				mutex_enter(&curproc->p_lock);
				(void) rctl_action(
				    rctlproc_legacy[RLIMIT_FSIZE],
				    curproc->p_rctls, curproc, RCA_SAFE);
				mutex_exit(&curproc->p_lock);

				return (EFBIG);
			}

			if (*fileoff >= maxoff)
				return (EFBIG);

			if (*fileoff + sfv_len > maxoff)
				return (EINVAL);
		}

		/* Check for overflow */
#ifdef _SYSCALL32_IMPL
		if (model == DATAMODEL_ILP32) {
			if (((ssize32_t)(*count + sfv_len)) < 0)
				return (EINVAL);
		} else
#endif
		if ((*count + sfv_len) < 0)
			return (EINVAL);

		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;

		if (sfv->sfv_fd == SFV_FD_SELF) {
			/* sfv_off is a user address holding the data. */
			if (vp->v_type == VSOCK) {
				while (sfv_len > 0) {
					size_t iov_len;

					iov_len = sfv_len;
					/*
					 * Socket filters can limit the mblk
					 * size, so limit reads to maxblk if
					 * there are filters present.
					 */
					if (so->so_filter_active > 0 &&
					    maxblk != INFPSZ)
						iov_len = MIN(iov_len, maxblk);

					aiov.iov_len = iov_len;
					aiov.iov_base =
					    (caddr_t)(uintptr_t)sfv_off;

					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_loffset = *fileoff;
					auio.uio_segflg = UIO_USERSPACE;
					auio.uio_fmode = fflag;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_resid = iov_len;

					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL)
						return (ENOMEM);
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					error = uiomove((caddr_t)dmp->b_wptr,
					    iov_len, UIO_WRITE, &auio);
					if (error != 0) {
						freeb(dmp);
						return (error);
					}
					dmp->b_wptr += iov_len;
					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						/*
						 * socket_sendmblk may consume
						 * dmp; free only what's left.
						 */
						if (dmp != NULL)
							freeb(dmp);
						return (error);
					}
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)iov_len;
					*count += iov_len;
					sfv_len -= iov_len;
					sfv_off += iov_len;
				}
			} else {
				/* Regular-file target: plain VOP_WRITE. */
				aiov.iov_len = sfv_len;
				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;

				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_loffset = *fileoff;
				auio.uio_segflg = UIO_USERSPACE;
				auio.uio_fmode = fflag;
				auio.uio_llimit = curproc->p_fsz_ctl;
				auio.uio_resid = sfv_len;

				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
				while (sfv_len > 0) {
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);
					cnt = sfv_len - auio.uio_resid;
					sfv_len -= cnt;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0)
						return (error);
				}
			}
		} else {
			int segmapit = 0;
			file_t *ffp;
			vnode_t *readvp;
			struct vnode *realvp;
			size_t size;
			caddr_t ptr;

			if ((ffp = getf(sfv->sfv_fd)) == NULL)
				return (EBADF);

			if ((ffp->f_flag & FREAD) == 0) {
				releasef(sfv->sfv_fd);
				return (EBADF);
			}

			/* See through layered filesystems (e.g. lofs). */
			readvp = ffp->f_vnode;
			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
				readvp = realvp;
			if (readvp->v_type != VREG) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * No point reading and writing to same vp,
			 * as long as both are regular files. readvp is not
			 * locked; but since we got it from an open file the
			 * contents will be valid during the time of access.
			 */
			if (vn_compare(vp, readvp)) {
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}

			/*
			 * Note: we assume readvp != vp. "vp" is already
			 * locked, and "readvp" must not be.
			 *
			 * Take the two locks in address order to avoid
			 * deadlocking against another thread locking the
			 * same pair.
			 */
			if (readvp < vp) {
				VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
				(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
			} else {
				(void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
				    NULL);
			}

			/* Same checks as in pread */
			if (sfv_off > maxoff) {
				VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
				releasef(sfv->sfv_fd);
				return (EINVAL);
			}
			if (sfv_off + sfv_len > maxoff) {
				sfv_len = (ssize_t)((offset_t)maxoff -
				    sfv_off);
			}
			/* Find the native blocksize to transfer data */
			size = MIN(vp->v_vfsp->vfs_bsize,
			    readvp->v_vfsp->vfs_bsize);
			size = sfv_len < size ? sfv_len : size;

			if (vp->v_type != VSOCK) {
				/* File-to-file: need a bounce buffer. */
				segmapit = 0;
				buf = kmem_alloc(size, KM_NOSLEEP);
				if (buf == NULL) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (ENOMEM);
				}
			} else {
				/*
				 * File-to-socket: decide whether the
				 * zero-copy segmap path is usable.
				 */
				uint_t copyflag;

				copyflag = stp != NULL ? stp->sd_copyflag :
				    so->so_proto_props.sopp_zcopyflag;

				/*
				 * Socket filters can limit the mblk size,
				 * so limit reads to maxblk if there are
				 * filters present.
				 */
				if (so->so_filter_active > 0 &&
				    maxblk != INFPSZ)
					size = MIN(size, maxblk);

				if (vn_has_flocks(readvp) ||
				    readvp->v_flag & VNOMAP ||
				    copyflag & STZCVMUNSAFE) {
					segmapit = 0;
				} else if (copyflag & STZCVMSAFE) {
					segmapit = 1;
				} else {
					/* Ask transport to avoid copies. */
					int on = 1;
					if (socket_setsockopt(VTOSO(vp),
					    SOL_SOCKET, SO_SND_COPYAVOID,
					    &on, sizeof (on), CRED()) == 0)
						segmapit = 1;
				}
			}

			if (segmapit) {
				/*
				 * Zero-copy: map the source file pages and
				 * hand them to the transport directly.
				 */
				struct vattr va;
				boolean_t nowait;

				va.va_mask = AT_SIZE;
				error = VOP_GETATTR(readvp, &va, 0, kcred,
				    NULL);
				if (error != 0 || sfv_off >= va.va_size) {
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}
				/* Read as much as possible. */
				if (sfv_off + sfv_len > va.va_size)
					sfv_len = va.va_size - sfv_off;

				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
				error = snf_segmap(fp, readvp, sfv_off,
				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
				    nowait);
				releasef(sfv->sfv_fd);
				*count += cnt;
				if (error)
					return (error);
				sfv++;
				continue;
			}

			/* Bounce-buffer copy loop: read then send/write. */
			while (sfv_len > 0) {
				size_t iov_len;

				iov_len = MIN(size, sfv_len);

				if (vp->v_type == VSOCK) {
					dmp = allocb(iov_len + extra, BPRI_HI);
					if (dmp == NULL) {
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (ENOMEM);
					}
					dmp->b_wptr = dmp->b_rptr =
					    dmp->b_rptr + wroff;
					ptr = (caddr_t)dmp->b_rptr;
				} else {
					ptr = buf;
				}

				aiov.iov_base = ptr;
				aiov.iov_len = iov_len;
				auio.uio_loffset = sfv_off;
				auio.uio_iov = &aiov;
				auio.uio_iovcnt = 1;
				auio.uio_resid = iov_len;
				auio.uio_segflg = UIO_SYSSPACE;
				auio.uio_llimit = MAXOFFSET_T;
				auio.uio_fmode = ffp->f_flag;
				ioflag = auio.uio_fmode &
				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);

				/*
				 * If read sync is not asked for,
				 * filter sync flags
				 */
				if ((ioflag & FRSYNC) == 0)
					ioflag &= ~(FSYNC|FDSYNC);
				error = VOP_READ(readvp, &auio, ioflag,
				    fp->f_cred, NULL);
				if (error != 0) {
					/*
					 * If we were reading a pipe (currently
					 * not implemented), we may now lose
					 * data.
					 */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (error);
				}

				/*
				 * Check how much data was really read.
				 * Decrement the 'len' and increment the
				 * 'off' appropriately.
				 */
				cnt = iov_len - auio.uio_resid;
				if (cnt == 0) {
					/* Unexpected EOF mid-vector. */
					if (vp->v_type == VSOCK)
						freeb(dmp);
					else
						kmem_free(buf, size);
					VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
					    NULL);
					releasef(sfv->sfv_fd);
					return (EINVAL);
				}
				sfv_len -= cnt;
				sfv_off += cnt;

				if (vp->v_type == VSOCK) {
					dmp->b_wptr = dmp->b_rptr + cnt;

					error = socket_sendmblk(VTOSO(vp),
					    &msg, fflag, CRED(), &dmp);

					if (error != 0) {
						if (dmp != NULL)
							freeb(dmp);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}

					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*count += cnt;
				} else {

					aiov.iov_base = ptr;
					aiov.iov_len = cnt;
					auio.uio_loffset = *fileoff;
					auio.uio_resid = cnt;
					auio.uio_iov = &aiov;
					auio.uio_iovcnt = 1;
					auio.uio_segflg = UIO_SYSSPACE;
					auio.uio_llimit = curproc->p_fsz_ctl;
					auio.uio_fmode = fflag;
					ioflag = auio.uio_fmode &
					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
					error = VOP_WRITE(vp, &auio, ioflag,
					    fp->f_cred, NULL);

					/*
					 * Check how much data was written.
					 * Increment the 'len' and decrement the
					 * 'off' if all the data was not
					 * written.
					 */
					cnt -= auio.uio_resid;
					sfv_len += auio.uio_resid;
					sfv_off -= auio.uio_resid;
					ttolwp(curthread)->lwp_ru.ioch +=
					    (ulong_t)cnt;
					*fileoff += cnt;
					*count += cnt;
					if (error != 0) {
						kmem_free(buf, size);
						VOP_RWUNLOCK(readvp,
						    V_WRITELOCK_FALSE, NULL);
						releasef(sfv->sfv_fd);
						return (error);
					}
				}
			}
			if (buf) {
				kmem_free(buf, size);
				buf = NULL;
			}
			VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
			releasef(sfv->sfv_fd);
		}
		sfv++;
	}
	return (0);
}
1134
/*
 * sendfilev(3EXT) system call entry point: transfer the data described by
 * "vec" (sfvcnt entries) to file descriptor "fildes", which must be open
 * for writing and refer to a regular file or a connected socket that
 * supports sendfile (SM_SENDFILESUPP).
 *
 * The vector is consumed in SEND_MAX_CHUNK batches.  Per-batch totals are
 * validated so that individual lengths and the running total never go
 * negative; a bad element truncates processing at that element
 * (first_vector_error) so data before it is still sent.  For sockets,
 * small totals (<= 4 * maxblk) go through sendvec_small_chunk() to
 * minimize latency; everything else goes through sendvec_chunk().
 *
 * Returns bytes transferred, or -1 via set_errno().  *xferred always
 * receives the (possibly partial) byte count.
 */
ssize_t
sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
    size_t *xferred)
{
	int error = 0;
	int first_vector_error = 0;
	file_t *fp;
	struct vnode *vp;
	struct sonode *so = NULL;
	u_offset_t fileoff;
	int copy_cnt;
	const struct sendfilevec *copy_vec;
	struct sendfilevec sfv[SEND_MAX_CHUNK];
	ssize_t count = 0;
#ifdef _SYSCALL32_IMPL
	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
#endif
	ssize_t total_size;
	int i;
	boolean_t is_sock = B_FALSE;
	int maxblk = 0;

	if (sfvcnt <= 0)
		return (set_errno(EINVAL));

	if ((fp = getf(fildes)) == NULL)
		return (set_errno(EBADF));

	if (((fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto err;
	}

	fileoff = fp->f_offset;
	vp = fp->f_vnode;

	switch (vp->v_type) {
	case VSOCK:
		so = VTOSO(vp);
		is_sock = B_TRUE;
		if (SOCK_IS_NONSTR(so)) {
			maxblk = so->so_proto_props.sopp_maxblk;
		} else {
			maxblk = (int)vp->v_stream->sd_maxblk;
		}

		/*
		 * We need to make sure that the socket that we're sending on
		 * supports sendfile behavior. sockfs doesn't know that the APIs
		 * we want to use are coming from sendfile, so we can't rely on
		 * it to check for us.
		 */
		if ((so->so_mode & SM_SENDFILESUPP) == 0) {
			error = EOPNOTSUPP;
			goto err;
		}
		break;
	case VREG:
		break;
	default:
		error = EINVAL;
		goto err;
	}

	switch (opcode) {
	case SENDFILEV :
		break;
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
	case SENDFILEV64 :
		/* sendvec64() consumes our hold on fildes and sets errno. */
		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
		    (size32_t *)xferred, fildes));
#endif
	default :
		/*
		 * NOTE(review): this sets error = ENOSYS but falls through
		 * into the transfer loop rather than jumping to err; the
		 * chunk helpers below can then overwrite "error".  Looks
		 * questionable — confirm against intended behavior before
		 * changing.
		 */
		error = ENOSYS;
		break;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
	copy_vec = vec;

	do {
		total_size = 0;
		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
#ifdef _SYSCALL32_IMPL
		/* 32-bit callers need to have their iovec expanded. */
		if (get_udatamodel() == DATAMODEL_ILP32) {
			if (copyin(copy_vec, sfv32,
			    copy_cnt * sizeof (ksendfilevec32_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				sfv[i].sfv_fd = sfv32[i].sfv_fd;
				sfv[i].sfv_off =
				    (off_t)(uint32_t)sfv32[i].sfv_off;
				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
				total_size += sfv[i].sfv_len;
				sfv[i].sfv_flag = sfv32[i].sfv_flag;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize32_t)sfv[i].sfv_len < 0) ||
				    ((ssize32_t)total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if ((ssize32_t)total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;

		} else {
#endif
			if (copyin(copy_vec, sfv,
			    copy_cnt * sizeof (sendfilevec_t))) {
				error = EFAULT;
				break;
			}

			for (i = 0; i < copy_cnt; i++) {
				total_size += sfv[i].sfv_len;
				/*
				 * Individual elements of the vector must not
				 * wrap or overflow, as later math is signed.
				 * Equally total_size needs to be checked after
				 * each vector is added in, to be sure that
				 * rogue values haven't overflowed the counter.
				 */
				if (((ssize_t)sfv[i].sfv_len < 0) ||
				    (total_size < 0)) {
					/*
					 * Truncate the vector to send data
					 * described by elements before the
					 * error.
					 */
					copy_cnt = i;
					first_vector_error = EINVAL;
					/* total_size can't be trusted */
					if (total_size < 0)
						error = EINVAL;
					break;
				}
			}
			/* Nothing to do, process errors */
			if (copy_cnt == 0)
				break;
#ifdef _SYSCALL32_IMPL
		}
#endif

		/*
		 * The task between deciding to use sendvec_small_chunk
		 * and sendvec_chunk is dependant on multiple things:
		 *
		 * i) latency is important for smaller files. So if the
		 * data is smaller than 'tcp_slow_start_initial' times
		 * maxblk, then use sendvec_small_chunk which creates
		 * maxblk size mblks and chains them together and sends
		 * them to TCP in one shot. It also leaves 'wroff' size
		 * space for the headers in each mblk.
		 *
		 * ii) for total size bigger than 'tcp_slow_start_initial'
		 * time maxblk, its probably real file data which is
		 * dominating. So its better to use sendvec_chunk because
		 * performance goes to dog if we don't do pagesize reads.
		 * sendvec_chunk will do pagesize reads and write them
		 * in pagesize mblks to TCP.
		 *
		 * Side Notes: A write to file has not been optimized.
		 * Future zero copy code will plugin into sendvec_chunk
		 * only because doing zero copy for files smaller then
		 * pagesize is useless.
		 */
		if (is_sock) {
			if ((total_size <= (4 * maxblk)) &&
			    error == 0) {
				error = sendvec_small_chunk(fp,
				    &fileoff, sfv, copy_cnt,
				    total_size, maxblk, &count);
			} else {
				error = sendvec_chunk(fp, &fileoff,
				    sfv, copy_cnt, &count);
			}
		} else {
			ASSERT(vp->v_type == VREG);
			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
			    &count);
		}


#ifdef _SYSCALL32_IMPL
		if (get_udatamodel() == DATAMODEL_ILP32) {
			/* Advance over the 32-bit-sized entries. */
			copy_vec = (const struct sendfilevec *)
			    ((char *)copy_vec +
			    (copy_cnt * sizeof (ksendfilevec32_t)));
		} else
#endif
			copy_vec += copy_cnt;
		sfvcnt -= copy_cnt;

	/* Process all vector members up to first error */
	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);

	if (vp->v_type == VREG)
		fp->f_offset += count;

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);

#ifdef _SYSCALL32_IMPL
	if (get_udatamodel() == DATAMODEL_ILP32) {
		/* 32-bit caller: copy out a 32-bit transfer count. */
		ssize32_t count32 = (ssize32_t)count;
		if (copyout(&count32, xferred, sizeof (count32)))
			error = EFAULT;
		releasef(fildes);
		if (error != 0)
			return (set_errno(error));
		if (first_vector_error != 0)
			return (set_errno(first_vector_error));
		return (count32);
	}
#endif
	if (copyout(&count, xferred, sizeof (count)))
		error = EFAULT;
	releasef(fildes);
	/* "error" takes precedence over a truncated-vector error. */
	if (error != 0)
		return (set_errno(error));
	if (first_vector_error != 0)
		return (set_errno(first_vector_error));
	return (count);
err:
	ASSERT(error != 0);
	releasef(fildes);
	return (set_errno(error));
}
1383