1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/t_lock.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/buf.h>
31 #include <sys/conf.h>
32 #include <sys/cred.h>
33 #include <sys/kmem.h>
34 #include <sys/sysmacros.h>
35 #include <sys/vfs.h>
36 #include <sys/vnode.h>
37 #include <sys/debug.h>
38 #include <sys/errno.h>
39 #include <sys/time.h>
40 #include <sys/file.h>
41 #include <sys/open.h>
42 #include <sys/user.h>
43 #include <sys/termios.h>
44 #include <sys/stream.h>
45 #include <sys/strsubr.h>
46 #include <sys/sunddi.h>
47 #include <sys/esunddi.h>
48 #include <sys/flock.h>
49 #include <sys/modctl.h>
50 #include <sys/cmn_err.h>
51 #include <sys/vmsystm.h>
52
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <fs/sockfs/sockcommon.h>
56 #include <fs/sockfs/socktpi.h>
57
58 #include <netinet/in.h>
59 #include <sys/sendfile.h>
60 #include <sys/un.h>
61 #include <sys/tihdr.h>
62 #include <sys/atomic.h>
63
64 #include <inet/common.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/tcp.h>
68
69 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
70 ssize32_t *);
71 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
72 int, ssize_t *);
73 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
74 boolean_t);
75 extern sotpi_info_t *sotpi_sototpi(struct sonode *);
76
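/*
 * Number of sendfilevec entries copied in from userland and processed
 * per batch; larger vectors are handled in successive chunks.
 */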
77 #define SEND_MAX_CHUNK 16
78
79 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
80 /*
 81  * 64 bit offsets for 32 bit applications running on either a
 82  * 64 bit or a 32 bit kernel. For 32 bit apps, we can't transfer
 83  * more than 2GB of data.
84 */
85 int
 86 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
87 int copy_cnt, ssize32_t *count)
88 {
89 struct vnode *vp;
90 ushort_t fflag;
91 int ioflag;
92 size32_t cnt;
93 ssize32_t sfv_len;
94 ssize32_t tmpcount;
95 u_offset_t sfv_off;
96 struct uio auio;
97 struct iovec aiov;
98 int i, error;
99
100 fflag = fp->f_flag;
101 vp = fp->f_vnode;
102 for (i = 0; i < copy_cnt; i++) {
103
104 if (ISSIG(curthread, JUSTLOOKING))
105 return (EINTR);
106
107 /*
108 * Do similar checks as "write" as we are writing
109 * sfv_len bytes into "vp".
110 */
111 sfv_len = (ssize32_t)sfv->sfv_len;
112
113 if (sfv_len == 0) {
114 sfv++;
115 continue;
116 }
117
118 if (sfv_len < 0)
119 return (EINVAL);
120
121 if (vp->v_type == VREG) {
122 if (*fileoff >= curproc->p_fsz_ctl) {
123 mutex_enter(&curproc->p_lock);
124 (void) rctl_action(
125 rctlproc_legacy[RLIMIT_FSIZE],
126 curproc->p_rctls, curproc, RCA_SAFE);
127 mutex_exit(&curproc->p_lock);
128 return (EFBIG);
129 }
130
131 if (*fileoff >= OFFSET_MAX(fp))
132 return (EFBIG);
133
134 if (*fileoff + sfv_len > OFFSET_MAX(fp))
135 return (EINVAL);
136 }
137
138 tmpcount = *count + sfv_len;
139 if (tmpcount < 0)
140 return (EINVAL);
141
142 sfv_off = sfv->sfv_off;
143
144 auio.uio_extflg = UIO_COPY_DEFAULT;
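		/*
		 * SFV_FD_SELF means sfv_off is a user address: copy the
		 * caller's own buffer straight into "vp".  Otherwise the
		 * data comes from the file referenced by sfv_fd.
		 */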
145 if (sfv->sfv_fd == SFV_FD_SELF) {
146 aiov.iov_len = sfv_len;
147 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
148 auio.uio_loffset = *fileoff;
149 auio.uio_iovcnt = 1;
150 auio.uio_resid = sfv_len;
151 auio.uio_iov = &aiov;
152 auio.uio_segflg = UIO_USERSPACE;
153 auio.uio_llimit = curproc->p_fsz_ctl;
154 auio.uio_fmode = fflag;
155 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
156 while (sfv_len > 0) {
157 error = VOP_WRITE(vp, &auio, ioflag,
158 fp->f_cred, NULL);
159 cnt = sfv_len - auio.uio_resid;
160 sfv_len -= cnt;
161 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
162 if (vp->v_type == VREG)
163 *fileoff += cnt;
164 *count += cnt;
165 if (error != 0)
166 return (error);
167 }
168 } else {
169 file_t *ffp;
170 vnode_t *readvp;
171 size_t size;
172 caddr_t ptr;
173
174 if ((ffp = getf(sfv->sfv_fd)) == NULL)
175 return (EBADF);
176
177 if ((ffp->f_flag & FREAD) == 0) {
178 releasef(sfv->sfv_fd);
179 return (EBADF);
180 }
181
182 readvp = ffp->f_vnode;
183 if (readvp->v_type != VREG) {
184 releasef(sfv->sfv_fd);
185 return (EINVAL);
186 }
187
188 /*
189 * No point reading and writing to same vp,
190 * as long as both are regular files. readvp is not
191 * locked; but since we got it from an open file the
192 * contents will be valid during the time of access.
193 */
194 if (vn_compare(vp, readvp)) {
195 releasef(sfv->sfv_fd);
196 return (EINVAL);
197 }
198
199 /*
 200			 * Optimize the case of sending a regular
 201			 * file over a socket.
202 */
203 if (vp->v_type == VSOCK) {
204 error = sosendfile64(fp, ffp, sfv,
205 (ssize32_t *)&cnt);
206 *count += cnt;
207 if (error)
208 return (error);
209 sfv++;
210 continue;
211 }
212
213 /*
214 * Note: we assume readvp != vp. "vp" is already
215 * locked, and "readvp" must not be.
216 */
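			/*
			 * Take the two vnode locks in address order so that
			 * concurrent callers acquire them consistently.
			 */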
217 if (readvp < vp) {
218 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
219 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
220 NULL);
221 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
222 } else {
223 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
224 NULL);
225 }
226
227 /*
228 * Same checks as in pread64.
229 */
230 if (sfv_off > MAXOFFSET_T) {
231 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
232 releasef(sfv->sfv_fd);
233 return (EINVAL);
234 }
235
236 if (sfv_off + sfv_len > MAXOFFSET_T)
237 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
238
239 /* Find the native blocksize to transfer data */
240 size = MIN(vp->v_vfsp->vfs_bsize,
241 readvp->v_vfsp->vfs_bsize);
242 size = sfv_len < size ? sfv_len : size;
243 ptr = kmem_alloc(size, KM_NOSLEEP);
244 if (ptr == NULL) {
245 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
246 releasef(sfv->sfv_fd);
247 return (ENOMEM);
248 }
249
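			/*
			 * Copy loop: read up to "size" bytes from readvp
			 * into the staging buffer, then write them to vp,
			 * backing off sfv_len/sfv_off for any short write.
			 */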
250 while (sfv_len > 0) {
251 size_t iov_len;
252
253 iov_len = MIN(size, sfv_len);
254 aiov.iov_base = ptr;
255 aiov.iov_len = iov_len;
256 auio.uio_loffset = sfv_off;
257 auio.uio_iov = &aiov;
258 auio.uio_iovcnt = 1;
259 auio.uio_resid = iov_len;
260 auio.uio_segflg = UIO_SYSSPACE;
261 auio.uio_llimit = MAXOFFSET_T;
262 auio.uio_fmode = ffp->f_flag;
263 ioflag = auio.uio_fmode &
264 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
265
266 /*
267 * If read sync is not asked for,
268 * filter sync flags
269 */
270 if ((ioflag & FRSYNC) == 0)
271 ioflag &= ~(FSYNC|FDSYNC);
272 error = VOP_READ(readvp, &auio, ioflag,
273 fp->f_cred, NULL);
274 if (error) {
275 kmem_free(ptr, size);
276 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
277 NULL);
278 releasef(sfv->sfv_fd);
279 return (error);
280 }
281
282 /*
 283				 * Check how much data was really read.
284 * Decrement the 'len' and increment the
285 * 'off' appropriately.
286 */
287 cnt = iov_len - auio.uio_resid;
288 if (cnt == 0) {
289 /*
290 * If we were reading a pipe (currently
291 * not implemented), we may now lose
292 * data.
293 */
294 kmem_free(ptr, size);
295 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
296 NULL);
297 releasef(sfv->sfv_fd);
298 return (EINVAL);
299 }
300 sfv_len -= cnt;
301 sfv_off += cnt;
302
303 aiov.iov_base = ptr;
304 aiov.iov_len = cnt;
305 auio.uio_loffset = *fileoff;
306 auio.uio_iov = &aiov;
307 auio.uio_iovcnt = 1;
308 auio.uio_resid = cnt;
309 auio.uio_segflg = UIO_SYSSPACE;
310 auio.uio_llimit = curproc->p_fsz_ctl;
311 auio.uio_fmode = fflag;
312 ioflag = auio.uio_fmode &
313 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
314 error = VOP_WRITE(vp, &auio, ioflag,
315 fp->f_cred, NULL);
316
317 /*
318 * Check how much data was written. Increment
319 * the 'len' and decrement the 'off' if all
320 * the data was not written.
321 */
322 cnt -= auio.uio_resid;
323 sfv_len += auio.uio_resid;
324 sfv_off -= auio.uio_resid;
325 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
326 if (vp->v_type == VREG)
327 *fileoff += cnt;
328 *count += cnt;
329 if (error != 0) {
330 kmem_free(ptr, size);
331 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
332 NULL);
333 releasef(sfv->sfv_fd);
334 return (error);
335 }
336 }
337 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
338 releasef(sfv->sfv_fd);
339 kmem_free(ptr, size);
340 }
341 sfv++;
342 }
343 return (0);
344 }
345
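/*
 * sendvec64() is the SENDFILEV64 entry point: it copies the caller's
 * ksendfilevec64 array in from userland in SEND_MAX_CHUNK batches and
 * feeds each batch to sendvec_chunk64() while "vp" is held write-locked.
 */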
346 ssize32_t
 347 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
348 size32_t *xferred, int fildes)
349 {
350 u_offset_t fileoff;
351 int copy_cnt;
352 const struct ksendfilevec64 *copy_vec;
353 struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
354 struct vnode *vp;
355 int error;
356 ssize32_t count = 0;
357
358 vp = fp->f_vnode;
359 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
360
361 copy_vec = vec;
362 fileoff = fp->f_offset;
363
364 do {
365 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
366 if (copyin(copy_vec, sfv, copy_cnt *
367 sizeof (struct ksendfilevec64))) {
368 error = EFAULT;
369 break;
370 }
371
372 error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
373 if (error != 0)
374 break;
375
376 copy_vec += copy_cnt;
377 sfvcnt -= copy_cnt;
378 } while (sfvcnt > 0);
379
380 if (vp->v_type == VREG)
381 fp->f_offset += count;
382
383 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
384 if (copyout(&count, xferred, sizeof (count)))
385 error = EFAULT;
386 releasef(fildes);
387 if (error != 0)
388 return (set_errno(error));
389 return (count);
390 }
391 #endif
392
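/*
 * sendvec_small_chunk() handles small transfers to a socket: the entire
 * payload is gathered into a chain of maxblk-sized mblks (with wroff and
 * tail room reserved for the protocol) and sent with a single
 * socket_sendmblk() call.
 */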
393 int
 394 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
395 int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
396 {
397 struct vnode *vp;
398 struct uio auio;
399 struct iovec aiov;
400 ushort_t fflag;
401 int ioflag;
402 int i, error;
403 size_t cnt;
404 ssize_t sfv_len;
405 u_offset_t sfv_off;
406 #ifdef _SYSCALL32_IMPL
407 model_t model = get_udatamodel();
408 u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
409 MAXOFF32_T : MAXOFFSET_T;
410 #else
411 const u_offset_t maxoff = MAXOFF32_T;
412 #endif
413 mblk_t *dmp = NULL;
414 int wroff;
415 int buf_left = 0;
416 size_t iov_len;
417 mblk_t *head, *tmp;
418 size_t size = total_size;
419 size_t extra;
420 int tail_len;
421 struct nmsghdr msg;
422
423 fflag = fp->f_flag;
424 vp = fp->f_vnode;
425
426 ASSERT(vp->v_type == VSOCK);
427 ASSERT(maxblk > 0);
428
429 /* If nothing to send, return */
430 if (total_size == 0)
431 return (0);
432
433 if (vp->v_stream != NULL) {
434 wroff = (int)vp->v_stream->sd_wroff;
435 tail_len = (int)vp->v_stream->sd_tail;
436 } else {
437 struct sonode *so;
438
439 so = VTOSO(vp);
440 wroff = so->so_proto_props.sopp_wroff;
441 tail_len = so->so_proto_props.sopp_tail;
442 }
443
444 extra = wroff + tail_len;
445
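	/*
	 * Each mblk reserves "extra" bytes (write offset plus tail room)
	 * so the transport can add headers without reallocating.
	 */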
446 buf_left = MIN(total_size, maxblk);
447 head = dmp = allocb(buf_left + extra, BPRI_HI);
448 if (head == NULL)
449 return (ENOMEM);
450 head->b_wptr = head->b_rptr = head->b_rptr + wroff;
451 bzero(&msg, sizeof (msg));
452
453 auio.uio_extflg = UIO_COPY_DEFAULT;
454 for (i = 0; i < copy_cnt; i++) {
455 if (ISSIG(curthread, JUSTLOOKING)) {
456 freemsg(head);
457 return (EINTR);
458 }
459
460 /*
461 * Do similar checks as "write" as we are writing
462 * sfv_len bytes into "vp".
463 */
464 sfv_len = (ssize_t)sfv->sfv_len;
465
466 if (sfv_len == 0) {
467 sfv++;
468 continue;
469 }
470
471 /* Check for overflow */
472 #ifdef _SYSCALL32_IMPL
473 if (model == DATAMODEL_ILP32) {
474 if (((ssize32_t)(*count + sfv_len)) < 0) {
475 freemsg(head);
476 return (EINVAL);
477 }
478 } else
479 #endif
480 if ((*count + sfv_len) < 0) {
481 freemsg(head);
482 return (EINVAL);
483 }
484
485 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
486
487 if (sfv->sfv_fd == SFV_FD_SELF) {
488 while (sfv_len > 0) {
489 if (buf_left == 0) {
490 tmp = dmp;
491 buf_left = MIN(total_size, maxblk);
492 iov_len = MIN(buf_left, sfv_len);
493 dmp = allocb(buf_left + extra, BPRI_HI);
494 if (dmp == NULL) {
495 freemsg(head);
496 return (ENOMEM);
497 }
498 dmp->b_wptr = dmp->b_rptr =
499 dmp->b_rptr + wroff;
500 tmp->b_cont = dmp;
501 } else {
502 iov_len = MIN(buf_left, sfv_len);
503 }
504
505 aiov.iov_len = iov_len;
506 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
507 auio.uio_loffset = *fileoff;
508 auio.uio_iovcnt = 1;
509 auio.uio_resid = iov_len;
510 auio.uio_iov = &aiov;
511 auio.uio_segflg = UIO_USERSPACE;
512 auio.uio_llimit = curproc->p_fsz_ctl;
513 auio.uio_fmode = fflag;
514
515 buf_left -= iov_len;
516 total_size -= iov_len;
517 sfv_len -= iov_len;
518 sfv_off += iov_len;
519
520 error = uiomove((caddr_t)dmp->b_wptr,
521 iov_len, UIO_WRITE, &auio);
522 if (error != 0) {
523 freemsg(head);
524 return (error);
525 }
526 dmp->b_wptr += iov_len;
527 }
528 } else {
529 file_t *ffp;
530 vnode_t *readvp;
531
532 if ((ffp = getf(sfv->sfv_fd)) == NULL) {
533 freemsg(head);
534 return (EBADF);
535 }
536
537 if ((ffp->f_flag & FREAD) == 0) {
538 releasef(sfv->sfv_fd);
539 freemsg(head);
540 return (EACCES);
541 }
542
543 readvp = ffp->f_vnode;
544 if (readvp->v_type != VREG) {
545 releasef(sfv->sfv_fd);
546 freemsg(head);
547 return (EINVAL);
548 }
549
550 /*
551 * No point reading and writing to same vp,
552 * as long as both are regular files. readvp is not
553 * locked; but since we got it from an open file the
554 * contents will be valid during the time of access.
555 */
556
557 if (vn_compare(vp, readvp)) {
558 releasef(sfv->sfv_fd);
559 freemsg(head);
560 return (EINVAL);
561 }
562
563 /*
564 * Note: we assume readvp != vp. "vp" is already
565 * locked, and "readvp" must not be.
566 */
567
568 if (readvp < vp) {
569 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
570 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
571 NULL);
572 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
573 } else {
574 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
575 NULL);
576 }
577
578 /* Same checks as in pread */
579 if (sfv_off > maxoff) {
580 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
581 releasef(sfv->sfv_fd);
582 freemsg(head);
583 return (EINVAL);
584 }
585 if (sfv_off + sfv_len > maxoff) {
586 total_size -= (sfv_off + sfv_len - maxoff);
587 sfv_len = (ssize_t)((offset_t)maxoff -
588 sfv_off);
589 }
590
591 while (sfv_len > 0) {
592 if (buf_left == 0) {
593 tmp = dmp;
594 buf_left = MIN(total_size, maxblk);
595 iov_len = MIN(buf_left, sfv_len);
596 dmp = allocb(buf_left + extra, BPRI_HI);
597 if (dmp == NULL) {
598 VOP_RWUNLOCK(readvp,
599 V_WRITELOCK_FALSE, NULL);
600 releasef(sfv->sfv_fd);
601 freemsg(head);
602 return (ENOMEM);
603 }
604 dmp->b_wptr = dmp->b_rptr =
605 dmp->b_rptr + wroff;
606 tmp->b_cont = dmp;
607 } else {
608 iov_len = MIN(buf_left, sfv_len);
609 }
610 aiov.iov_base = (caddr_t)dmp->b_wptr;
611 aiov.iov_len = iov_len;
612 auio.uio_loffset = sfv_off;
613 auio.uio_iov = &aiov;
614 auio.uio_iovcnt = 1;
615 auio.uio_resid = iov_len;
616 auio.uio_segflg = UIO_SYSSPACE;
617 auio.uio_llimit = MAXOFFSET_T;
618 auio.uio_fmode = ffp->f_flag;
619 ioflag = auio.uio_fmode &
620 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
621
622 /*
623 * If read sync is not asked for,
624 * filter sync flags
625 */
626 if ((ioflag & FRSYNC) == 0)
627 ioflag &= ~(FSYNC|FDSYNC);
628 error = VOP_READ(readvp, &auio, ioflag,
629 fp->f_cred, NULL);
630 if (error != 0) {
631 /*
632 * If we were reading a pipe (currently
 633					 * not implemented), we may now lose
634 * data.
635 */
636 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
637 NULL);
638 releasef(sfv->sfv_fd);
639 freemsg(head);
640 return (error);
641 }
642
643 /*
644 * Check how much data was really read.
645 * Decrement the 'len' and increment the
646 * 'off' appropriately.
647 */
648 cnt = iov_len - auio.uio_resid;
649 if (cnt == 0) {
650 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
651 NULL);
652 releasef(sfv->sfv_fd);
653 freemsg(head);
654 return (EINVAL);
655 }
656 sfv_len -= cnt;
657 sfv_off += cnt;
658 total_size -= cnt;
659 buf_left -= cnt;
660
661 dmp->b_wptr += cnt;
662 }
663 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
664 releasef(sfv->sfv_fd);
665 }
666 sfv++;
667 }
668
669 ASSERT(total_size == 0);
670 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
671 if (error != 0) {
672 if (head != NULL)
673 freemsg(head);
674 return (error);
675 }
676 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
677 *count += size;
678
679 return (0);
680 }
681
682
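/*
 * sendvec_chunk() is the general path: data destined for a socket is sent
 * one mblk at a time (or via the snf_segmap() zero-copy path when that is
 * safe), while data destined for a regular file goes through ordinary
 * VOP_READ/VOP_WRITE copies in filesystem-blocksize chunks.
 */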
683 int
 684 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
685 int copy_cnt, ssize_t *count)
686 {
687 struct vnode *vp;
688 struct uio auio;
689 struct iovec aiov;
690 ushort_t fflag;
691 int ioflag;
692 int i, error;
693 size_t cnt;
694 ssize_t sfv_len;
695 u_offset_t sfv_off;
696 #ifdef _SYSCALL32_IMPL
697 model_t model = get_udatamodel();
698 u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
699 MAXOFF32_T : MAXOFFSET_T;
700 #else
701 const u_offset_t maxoff = MAXOFF32_T;
702 #endif
703 mblk_t *dmp = NULL;
704 char *buf = NULL;
705 size_t extra;
706 int maxblk, wroff, tail_len;
707 struct sonode *so;
708 stdata_t *stp;
709 struct nmsghdr msg;
710
711 fflag = fp->f_flag;
712 vp = fp->f_vnode;
713
714 if (vp->v_type == VSOCK) {
715 so = VTOSO(vp);
716 if (vp->v_stream != NULL) {
717 stp = vp->v_stream;
718 wroff = (int)stp->sd_wroff;
719 tail_len = (int)stp->sd_tail;
720 maxblk = (int)stp->sd_maxblk;
721 } else {
722 stp = NULL;
723 wroff = so->so_proto_props.sopp_wroff;
724 tail_len = so->so_proto_props.sopp_tail;
725 maxblk = so->so_proto_props.sopp_maxblk;
726 }
727 extra = wroff + tail_len;
728 }
729
730 bzero(&msg, sizeof (msg));
731 auio.uio_extflg = UIO_COPY_DEFAULT;
732 for (i = 0; i < copy_cnt; i++) {
733 if (ISSIG(curthread, JUSTLOOKING))
734 return (EINTR);
735
736 /*
737 * Do similar checks as "write" as we are writing
738 * sfv_len bytes into "vp".
739 */
740 sfv_len = (ssize_t)sfv->sfv_len;
741
742 if (sfv_len == 0) {
743 sfv++;
744 continue;
745 }
746
747 if (vp->v_type == VREG) {
748 if (*fileoff >= curproc->p_fsz_ctl) {
749 mutex_enter(&curproc->p_lock);
750 (void) rctl_action(
751 rctlproc_legacy[RLIMIT_FSIZE],
752 curproc->p_rctls, curproc, RCA_SAFE);
753 mutex_exit(&curproc->p_lock);
754
755 return (EFBIG);
756 }
757
758 if (*fileoff >= maxoff)
759 return (EFBIG);
760
761 if (*fileoff + sfv_len > maxoff)
762 return (EINVAL);
763 }
764
765 /* Check for overflow */
766 #ifdef _SYSCALL32_IMPL
767 if (model == DATAMODEL_ILP32) {
768 if (((ssize32_t)(*count + sfv_len)) < 0)
769 return (EINVAL);
770 } else
771 #endif
772 if ((*count + sfv_len) < 0)
773 return (EINVAL);
774
775 sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
776
777 if (sfv->sfv_fd == SFV_FD_SELF) {
778 if (vp->v_type == VSOCK) {
779 while (sfv_len > 0) {
780 size_t iov_len;
781
782 iov_len = sfv_len;
783 /*
784 * Socket filters can limit the mblk
785 * size, so limit reads to maxblk if
786 * there are filters present.
787 */
788 if (so->so_filter_active > 0 &&
789 maxblk != INFPSZ)
790 iov_len = MIN(iov_len, maxblk);
791
792 aiov.iov_len = iov_len;
793 aiov.iov_base =
794 (caddr_t)(uintptr_t)sfv_off;
795
796 auio.uio_iov = &aiov;
797 auio.uio_iovcnt = 1;
798 auio.uio_loffset = *fileoff;
799 auio.uio_segflg = UIO_USERSPACE;
800 auio.uio_fmode = fflag;
801 auio.uio_llimit = curproc->p_fsz_ctl;
802 auio.uio_resid = iov_len;
803
804 dmp = allocb(iov_len + extra, BPRI_HI);
805 if (dmp == NULL)
806 return (ENOMEM);
807 dmp->b_wptr = dmp->b_rptr =
808 dmp->b_rptr + wroff;
809 error = uiomove((caddr_t)dmp->b_wptr,
810 iov_len, UIO_WRITE, &auio);
811 if (error != 0) {
812 freeb(dmp);
813 return (error);
814 }
815 dmp->b_wptr += iov_len;
816 error = socket_sendmblk(VTOSO(vp),
817 &msg, fflag, CRED(), &dmp);
818
819 if (error != 0) {
820 if (dmp != NULL)
821 freeb(dmp);
822 return (error);
823 }
824 ttolwp(curthread)->lwp_ru.ioch +=
825 (ulong_t)iov_len;
826 *count += iov_len;
827 sfv_len -= iov_len;
828 sfv_off += iov_len;
829 }
830 } else {
831 aiov.iov_len = sfv_len;
832 aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
833
834 auio.uio_iov = &aiov;
835 auio.uio_iovcnt = 1;
836 auio.uio_loffset = *fileoff;
837 auio.uio_segflg = UIO_USERSPACE;
838 auio.uio_fmode = fflag;
839 auio.uio_llimit = curproc->p_fsz_ctl;
840 auio.uio_resid = sfv_len;
841
842 ioflag = auio.uio_fmode &
843 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
844 while (sfv_len > 0) {
845 error = VOP_WRITE(vp, &auio, ioflag,
846 fp->f_cred, NULL);
847 cnt = sfv_len - auio.uio_resid;
848 sfv_len -= cnt;
849 ttolwp(curthread)->lwp_ru.ioch +=
850 (ulong_t)cnt;
851 *fileoff += cnt;
852 *count += cnt;
853 if (error != 0)
854 return (error);
855 }
856 }
857 } else {
858 int segmapit = 0;
859 file_t *ffp;
860 vnode_t *readvp;
861 struct vnode *realvp;
862 size_t size;
863 caddr_t ptr;
864
865 if ((ffp = getf(sfv->sfv_fd)) == NULL)
866 return (EBADF);
867
868 if ((ffp->f_flag & FREAD) == 0) {
869 releasef(sfv->sfv_fd);
870 return (EBADF);
871 }
872
873 readvp = ffp->f_vnode;
874 if (VOP_REALVP(readvp, &realvp, NULL) == 0)
875 readvp = realvp;
876 if (readvp->v_type != VREG) {
877 releasef(sfv->sfv_fd);
878 return (EINVAL);
879 }
880
881 /*
882 * No point reading and writing to same vp,
883 * as long as both are regular files. readvp is not
884 * locked; but since we got it from an open file the
885 * contents will be valid during the time of access.
886 */
887 if (vn_compare(vp, readvp)) {
888 releasef(sfv->sfv_fd);
889 return (EINVAL);
890 }
891
892 /*
893 * Note: we assume readvp != vp. "vp" is already
894 * locked, and "readvp" must not be.
895 */
896 if (readvp < vp) {
897 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
898 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
899 NULL);
900 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
901 } else {
902 (void) VOP_RWLOCK(readvp, V_WRITELOCK_FALSE,
903 NULL);
904 }
905
906 /* Same checks as in pread */
907 if (sfv_off > maxoff) {
908 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
909 releasef(sfv->sfv_fd);
910 return (EINVAL);
911 }
912 if (sfv_off + sfv_len > maxoff) {
913 sfv_len = (ssize_t)((offset_t)maxoff -
914 sfv_off);
915 }
916 /* Find the native blocksize to transfer data */
917 size = MIN(vp->v_vfsp->vfs_bsize,
918 readvp->v_vfsp->vfs_bsize);
919 size = sfv_len < size ? sfv_len : size;
920
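			/*
			 * Decide whether the zero-copy segmap path can be
			 * used: only for sockets, and only if the source
			 * file has no locks, supports mapping, and the
			 * protocol allows (or can be asked to allow)
			 * zero-copy transmission.
			 */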
921 if (vp->v_type != VSOCK) {
922 segmapit = 0;
923 buf = kmem_alloc(size, KM_NOSLEEP);
924 if (buf == NULL) {
925 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
926 NULL);
927 releasef(sfv->sfv_fd);
928 return (ENOMEM);
929 }
930 } else {
931 uint_t copyflag;
932
933 copyflag = stp != NULL ? stp->sd_copyflag :
934 so->so_proto_props.sopp_zcopyflag;
935
936 /*
937 * Socket filters can limit the mblk size,
938 * so limit reads to maxblk if there are
939 * filters present.
940 */
941 if (so->so_filter_active > 0 &&
942 maxblk != INFPSZ)
943 size = MIN(size, maxblk);
944
945 if (vn_has_flocks(readvp) ||
946 readvp->v_flag & VNOMAP ||
947 copyflag & STZCVMUNSAFE) {
948 segmapit = 0;
949 } else if (copyflag & STZCVMSAFE) {
950 segmapit = 1;
951 } else {
952 int on = 1;
953 if (socket_setsockopt(VTOSO(vp),
954 SOL_SOCKET, SO_SND_COPYAVOID,
955 &on, sizeof (on), CRED()) == 0)
956 segmapit = 1;
957 }
958 }
959
960 if (segmapit) {
961 struct vattr va;
962 boolean_t nowait;
963
964 va.va_mask = AT_SIZE;
965 error = VOP_GETATTR(readvp, &va, 0, kcred,
966 NULL);
967 if (error != 0 || sfv_off >= va.va_size) {
968 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
969 NULL);
970 releasef(sfv->sfv_fd);
971 return (error);
972 }
973 /* Read as much as possible. */
974 if (sfv_off + sfv_len > va.va_size)
975 sfv_len = va.va_size - sfv_off;
976
977 nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
978 error = snf_segmap(fp, readvp, sfv_off,
979 (u_offset_t)sfv_len, (ssize_t *)&cnt,
980 nowait);
981 releasef(sfv->sfv_fd);
982 *count += cnt;
983 if (error)
984 return (error);
985 sfv++;
986 continue;
987 }
988
989 while (sfv_len > 0) {
990 size_t iov_len;
991
992 iov_len = MIN(size, sfv_len);
993
994 if (vp->v_type == VSOCK) {
995 dmp = allocb(iov_len + extra, BPRI_HI);
996 if (dmp == NULL) {
997 VOP_RWUNLOCK(readvp,
998 V_WRITELOCK_FALSE, NULL);
999 releasef(sfv->sfv_fd);
1000 return (ENOMEM);
1001 }
1002 dmp->b_wptr = dmp->b_rptr =
1003 dmp->b_rptr + wroff;
1004 ptr = (caddr_t)dmp->b_rptr;
1005 } else {
1006 ptr = buf;
1007 }
1008
1009 aiov.iov_base = ptr;
1010 aiov.iov_len = iov_len;
1011 auio.uio_loffset = sfv_off;
1012 auio.uio_iov = &aiov;
1013 auio.uio_iovcnt = 1;
1014 auio.uio_resid = iov_len;
1015 auio.uio_segflg = UIO_SYSSPACE;
1016 auio.uio_llimit = MAXOFFSET_T;
1017 auio.uio_fmode = ffp->f_flag;
1018 ioflag = auio.uio_fmode &
1019 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1020
1021 /*
1022 * If read sync is not asked for,
1023 * filter sync flags
1024 */
1025 if ((ioflag & FRSYNC) == 0)
1026 ioflag &= ~(FSYNC|FDSYNC);
1027 error = VOP_READ(readvp, &auio, ioflag,
1028 fp->f_cred, NULL);
1029 if (error != 0) {
1030 /*
1031 * If we were reading a pipe (currently
1032 * not implemented), we may now lose
1033 * data.
1034 */
1035 if (vp->v_type == VSOCK)
1036 freeb(dmp);
1037 else
1038 kmem_free(buf, size);
1039 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
1040 NULL);
1041 releasef(sfv->sfv_fd);
1042 return (error);
1043 }
1044
1045 /*
1046 * Check how much data was really read.
1047 * Decrement the 'len' and increment the
1048 * 'off' appropriately.
1049 */
1050 cnt = iov_len - auio.uio_resid;
1051 if (cnt == 0) {
1052 if (vp->v_type == VSOCK)
1053 freeb(dmp);
1054 else
1055 kmem_free(buf, size);
1056 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE,
1057 NULL);
1058 releasef(sfv->sfv_fd);
1059 return (EINVAL);
1060 }
1061 sfv_len -= cnt;
1062 sfv_off += cnt;
1063
1064 if (vp->v_type == VSOCK) {
1065 dmp->b_wptr = dmp->b_rptr + cnt;
1066
1067 error = socket_sendmblk(VTOSO(vp),
1068 &msg, fflag, CRED(), &dmp);
1069
1070 if (error != 0) {
1071 if (dmp != NULL)
1072 freeb(dmp);
1073 VOP_RWUNLOCK(readvp,
1074 V_WRITELOCK_FALSE, NULL);
1075 releasef(sfv->sfv_fd);
1076 return (error);
1077 }
1078
1079 ttolwp(curthread)->lwp_ru.ioch +=
1080 (ulong_t)cnt;
1081 *count += cnt;
1082 } else {
1083
1084 aiov.iov_base = ptr;
1085 aiov.iov_len = cnt;
1086 auio.uio_loffset = *fileoff;
1087 auio.uio_resid = cnt;
1088 auio.uio_iov = &aiov;
1089 auio.uio_iovcnt = 1;
1090 auio.uio_segflg = UIO_SYSSPACE;
1091 auio.uio_llimit = curproc->p_fsz_ctl;
1092 auio.uio_fmode = fflag;
1093 ioflag = auio.uio_fmode &
1094 (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1095 error = VOP_WRITE(vp, &auio, ioflag,
1096 fp->f_cred, NULL);
1097
1098 /*
1099 * Check how much data was written.
1100 * Increment the 'len' and decrement the
1101 * 'off' if all the data was not
1102 * written.
1103 */
1104 cnt -= auio.uio_resid;
1105 sfv_len += auio.uio_resid;
1106 sfv_off -= auio.uio_resid;
1107 ttolwp(curthread)->lwp_ru.ioch +=
1108 (ulong_t)cnt;
1109 *fileoff += cnt;
1110 *count += cnt;
1111 if (error != 0) {
1112 kmem_free(buf, size);
1113 VOP_RWUNLOCK(readvp,
1114 V_WRITELOCK_FALSE, NULL);
1115 releasef(sfv->sfv_fd);
1116 return (error);
1117 }
1118 }
1119 }
1120 if (buf) {
1121 kmem_free(buf, size);
1122 buf = NULL;
1123 }
1124 VOP_RWUNLOCK(readvp, V_WRITELOCK_FALSE, NULL);
1125 releasef(sfv->sfv_fd);
1126 }
1127 sfv++;
1128 }
1129 return (0);
1130 }
1131
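/*
 * sendfilev() is the system call handler.  It validates the destination
 * descriptor, copies the sendfilevec array in from userland in
 * SEND_MAX_CHUNK batches (expanding 32-bit vectors as needed), and then
 * dispatches each batch to nl7c_sendfilev(), sendvec_small_chunk() or
 * sendvec_chunk() as described in the comment below.
 */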
1132 ssize_t
 1133 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1134 size_t *xferred)
1135 {
1136 int error = 0;
1137 int first_vector_error = 0;
1138 file_t *fp;
1139 struct vnode *vp;
1140 struct sonode *so;
1141 u_offset_t fileoff;
1142 int copy_cnt;
1143 const struct sendfilevec *copy_vec;
1144 struct sendfilevec sfv[SEND_MAX_CHUNK];
1145 ssize_t count = 0;
1146 #ifdef _SYSCALL32_IMPL
1147 struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1148 #endif
1149 ssize_t total_size;
1150 int i;
1151 boolean_t is_sock = B_FALSE;
1152 int maxblk = 0;
1153
1154 if (sfvcnt <= 0)
1155 return (set_errno(EINVAL));
1156
1157 if ((fp = getf(fildes)) == NULL)
1158 return (set_errno(EBADF));
1159
1160 if (((fp->f_flag) & FWRITE) == 0) {
1161 error = EBADF;
1162 goto err;
1163 }
1164
1165 fileoff = fp->f_offset;
1166 vp = fp->f_vnode;
1167
1168 switch (vp->v_type) {
1169 case VSOCK:
1170 so = VTOSO(vp);
1171 is_sock = B_TRUE;
1172 if (SOCK_IS_NONSTR(so)) {
1173 maxblk = so->so_proto_props.sopp_maxblk;
1174 } else {
1175 maxblk = (int)vp->v_stream->sd_maxblk;
1176 }
1177 break;
1178 case VREG:
1179 break;
1180 default:
1181 error = EINVAL;
1182 goto err;
1183 }
1184
1185 switch (opcode) {
1186 case SENDFILEV :
1187 break;
1188 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1189 case SENDFILEV64 :
1190 return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1191 (size32_t *)xferred, fildes));
1192 #endif
1193 default :
1194 error = ENOSYS;
1195 break;
1196 }
1197
1198 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1199 copy_vec = vec;
1200
1201 do {
1202 total_size = 0;
1203 copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1204 #ifdef _SYSCALL32_IMPL
 1205		/* 32-bit callers need to have their sendfilevec expanded. */
1206 if (get_udatamodel() == DATAMODEL_ILP32) {
1207 if (copyin(copy_vec, sfv32,
1208 copy_cnt * sizeof (ksendfilevec32_t))) {
1209 error = EFAULT;
1210 break;
1211 }
1212
1213 for (i = 0; i < copy_cnt; i++) {
1214 sfv[i].sfv_fd = sfv32[i].sfv_fd;
1215 sfv[i].sfv_off =
1216 (off_t)(uint32_t)sfv32[i].sfv_off;
1217 sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1218 total_size += sfv[i].sfv_len;
1219 sfv[i].sfv_flag = sfv32[i].sfv_flag;
1220 /*
1221 * Individual elements of the vector must not
1222 * wrap or overflow, as later math is signed.
1223 * Equally total_size needs to be checked after
1224 * each vector is added in, to be sure that
1225 * rogue values haven't overflowed the counter.
1226 */
1227 if (((ssize32_t)sfv[i].sfv_len < 0) ||
1228 ((ssize32_t)total_size < 0)) {
1229 /*
1230 * Truncate the vector to send data
1231 * described by elements before the
1232 * error.
1233 */
1234 copy_cnt = i;
1235 first_vector_error = EINVAL;
1236 /* total_size can't be trusted */
1237 if ((ssize32_t)total_size < 0)
1238 error = EINVAL;
1239 break;
1240 }
1241 }
1242 /* Nothing to do, process errors */
1243 if (copy_cnt == 0)
1244 break;
1245
1246 } else {
1247 #endif
1248 if (copyin(copy_vec, sfv,
1249 copy_cnt * sizeof (sendfilevec_t))) {
1250 error = EFAULT;
1251 break;
1252 }
1253
1254 for (i = 0; i < copy_cnt; i++) {
1255 total_size += sfv[i].sfv_len;
1256 /*
1257 * Individual elements of the vector must not
1258 * wrap or overflow, as later math is signed.
1259 * Equally total_size needs to be checked after
1260 * each vector is added in, to be sure that
1261 * rogue values haven't overflowed the counter.
1262 */
1263 if (((ssize_t)sfv[i].sfv_len < 0) ||
1264 (total_size < 0)) {
1265 /*
1266 * Truncate the vector to send data
1267 * described by elements before the
1268 * error.
1269 */
1270 copy_cnt = i;
1271 first_vector_error = EINVAL;
1272 /* total_size can't be trusted */
1273 if (total_size < 0)
1274 error = EINVAL;
1275 break;
1276 }
1277 }
1278 /* Nothing to do, process errors */
1279 if (copy_cnt == 0)
1280 break;
1281 #ifdef _SYSCALL32_IMPL
1282 }
1283 #endif
1284
1285 /*
 1286		 * The choice between sendvec_small_chunk and
 1287		 * sendvec_chunk depends on multiple things:
1288 *
1289 * i) latency is important for smaller files. So if the
1290 * data is smaller than 'tcp_slow_start_initial' times
1291 * maxblk, then use sendvec_small_chunk which creates
1292 * maxblk size mblks and chains them together and sends
1293 * them to TCP in one shot. It also leaves 'wroff' size
1294 * space for the headers in each mblk.
1295 *
1296 * ii) for total size bigger than 'tcp_slow_start_initial'
 1297		 * times maxblk, it's probably real file data which is
 1298		 * dominating. So it's better to use sendvec_chunk because
 1299		 * performance goes to the dogs if we don't do pagesize reads.
1300 * sendvec_chunk will do pagesize reads and write them
1301 * in pagesize mblks to TCP.
1302 *
 1303		 * Side Notes: A write to a file has not been optimized.
 1304		 * Future zero copy code will plug into sendvec_chunk
 1305		 * only because doing zero copy for files smaller than
1306 * pagesize is useless.
1307 *
 1308		 * Note, if the socket has NL7C enabled then call NL7C's
 1309		 * sendfilev() function to consume the sfv[].
1310 */
1311 if (is_sock) {
1312 if (!SOCK_IS_NONSTR(so) &&
1313 _SOTOTPI(so)->sti_nl7c_flags != 0) {
1314 error = nl7c_sendfilev(so, &fileoff,
1315 sfv, copy_cnt, &count);
1316 } else if ((total_size <= (4 * maxblk)) &&
1317 error == 0) {
1318 error = sendvec_small_chunk(fp,
1319 &fileoff, sfv, copy_cnt,
1320 total_size, maxblk, &count);
1321 } else {
1322 error = sendvec_chunk(fp, &fileoff,
1323 sfv, copy_cnt, &count);
1324 }
1325 } else {
1326 ASSERT(vp->v_type == VREG);
1327 error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1328 &count);
1329 }
1330
1331
1332 #ifdef _SYSCALL32_IMPL
1333 if (get_udatamodel() == DATAMODEL_ILP32)
1334 copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1335 (copy_cnt * sizeof (ksendfilevec32_t)));
1336 else
1337 #endif
1338 copy_vec += copy_cnt;
1339 sfvcnt -= copy_cnt;
1340
1341 /* Process all vector members up to first error */
1342 } while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);
1343
1344 if (vp->v_type == VREG)
1345 fp->f_offset += count;
1346
1347 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1348
1349 #ifdef _SYSCALL32_IMPL
1350 if (get_udatamodel() == DATAMODEL_ILP32) {
1351 ssize32_t count32 = (ssize32_t)count;
1352 if (copyout(&count32, xferred, sizeof (count32)))
1353 error = EFAULT;
1354 releasef(fildes);
1355 if (error != 0)
1356 return (set_errno(error));
1357 if (first_vector_error != 0)
1358 return (set_errno(first_vector_error));
1359 return (count32);
1360 }
1361 #endif
1362 if (copyout(&count, xferred, sizeof (count)))
1363 error = EFAULT;
1364 releasef(fildes);
1365 if (error != 0)
1366 return (set_errno(error));
1367 if (first_vector_error != 0)
1368 return (set_errno(first_vector_error));
1369 return (count);
1370 err:
1371 ASSERT(error != 0);
1372 releasef(fildes);
1373 return (set_errno(error));
1374 }
1375