xref: /illumos-gate/usr/src/uts/common/syscall/sendfile.c (revision 3d393ee6c37fa10ac512ed6d36109ad616dc7c1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/user.h>
44 #include <sys/termios.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/sunddi.h>
48 #include <sys/esunddi.h>
49 #include <sys/flock.h>
50 #include <sys/modctl.h>
51 #include <sys/cmn_err.h>
52 #include <sys/vmsystm.h>
53 
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <fs/sockfs/sockcommon.h>
57 #include <fs/sockfs/socktpi.h>
58 
59 #include <netinet/in.h>
60 #include <sys/sendfile.h>
61 #include <sys/un.h>
62 #include <sys/tihdr.h>
63 #include <sys/atomic.h>
64 
65 #include <inet/common.h>
66 #include <inet/ip.h>
67 #include <inet/ip6.h>
68 #include <inet/tcp.h>
69 
/*
 * Routines implemented outside this file.  In the code below,
 * sosendfile64() is called by sendvec64() for the regular-file-to-socket
 * case and snf_segmap() is called by sendvec_chunk() when its "segmapit"
 * flag is set.
 */
extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
		ssize32_t *);
extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
		int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
		boolean_t);
extern sotpi_info_t *sotpi_sototpi(struct sonode *);

/*
 * Lock modes handed to VOP_RWLOCK()/VOP_RWUNLOCK(): readflg is used on
 * the vnodes that data is read from, rwflag on the vnode that sendvec64()
 * writes to.
 */
#define	readflg	(V_WRITELOCK_FALSE)
#define	rwflag	(V_WRITELOCK_TRUE)

/*
 * Maximum number of vector entries copied in from the caller and
 * processed per pass; it also sizes the on-stack vector arrays.
 */
#define	SEND_MAX_CHUNK	16
82 
83 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
84 /*
85  * 64 bit offsets for 32 bit applications only running either on
86  * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
87  * more than 2GB of data.
88  */
89 int
90 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
91     int copy_cnt, ssize32_t *count)
92 {
93 	struct vnode *vp;
94 	ushort_t fflag;
95 	int ioflag;
96 	size32_t cnt;
97 	ssize32_t sfv_len;
98 	ssize32_t tmpcount;
99 	u_offset_t sfv_off;
100 	struct uio auio;
101 	struct iovec aiov;
102 	int i, error;
103 
104 	fflag = fp->f_flag;
105 	vp = fp->f_vnode;
106 	for (i = 0; i < copy_cnt; i++) {
107 
108 		if (ISSIG(curthread, JUSTLOOKING))
109 			return (EINTR);
110 
111 		/*
112 		 * Do similar checks as "write" as we are writing
113 		 * sfv_len bytes into "vp".
114 		 */
115 		sfv_len = (ssize32_t)sfv->sfv_len;
116 
117 		if (sfv_len == 0) {
118 			sfv++;
119 			continue;
120 		}
121 
122 		if (sfv_len < 0)
123 			return (EINVAL);
124 
125 		if (vp->v_type == VREG) {
126 			if (*fileoff >= curproc->p_fsz_ctl) {
127 				mutex_enter(&curproc->p_lock);
128 				(void) rctl_action(
129 				    rctlproc_legacy[RLIMIT_FSIZE],
130 				    curproc->p_rctls, curproc, RCA_SAFE);
131 				mutex_exit(&curproc->p_lock);
132 				return (EFBIG);
133 			}
134 
135 			if (*fileoff >= OFFSET_MAX(fp))
136 				return (EFBIG);
137 
138 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
139 				return (EINVAL);
140 		}
141 
142 		tmpcount = *count + sfv_len;
143 		if (tmpcount < 0)
144 			return (EINVAL);
145 
146 		sfv_off = sfv->sfv_off;
147 
148 		auio.uio_extflg = UIO_COPY_DEFAULT;
149 		if (sfv->sfv_fd == SFV_FD_SELF) {
150 			aiov.iov_len = sfv_len;
151 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
152 			auio.uio_loffset = *fileoff;
153 			auio.uio_iovcnt = 1;
154 			auio.uio_resid = sfv_len;
155 			auio.uio_iov = &aiov;
156 			auio.uio_segflg = UIO_USERSPACE;
157 			auio.uio_llimit = curproc->p_fsz_ctl;
158 			auio.uio_fmode = fflag;
159 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
160 			while (sfv_len > 0) {
161 				error = VOP_WRITE(vp, &auio, ioflag,
162 				    fp->f_cred, NULL);
163 				cnt = sfv_len - auio.uio_resid;
164 				sfv_len -= cnt;
165 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
166 				if (vp->v_type == VREG)
167 					*fileoff += cnt;
168 				*count += cnt;
169 				if (error != 0)
170 					return (error);
171 			}
172 		} else {
173 			file_t	*ffp;
174 			vnode_t	*readvp;
175 			size_t	size;
176 			caddr_t	ptr;
177 
178 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
179 				return (EBADF);
180 
181 			if ((ffp->f_flag & FREAD) == 0) {
182 				releasef(sfv->sfv_fd);
183 				return (EBADF);
184 			}
185 
186 			readvp = ffp->f_vnode;
187 			if (readvp->v_type != VREG) {
188 				releasef(sfv->sfv_fd);
189 				return (EINVAL);
190 			}
191 
192 			/*
193 			 * No point reading and writing to same vp,
194 			 * as long as both are regular files. readvp is not
195 			 * locked; but since we got it from an open file the
196 			 * contents will be valid during the time of access.
197 			 */
198 			if (vn_compare(vp, readvp)) {
199 				releasef(sfv->sfv_fd);
200 				return (EINVAL);
201 			}
202 
203 			/*
204 			 * Note: we assume readvp != vp. "vp" is already
205 			 * locked, and "readvp" must not be.
206 			 */
207 			(void) VOP_RWLOCK(readvp, readflg, NULL);
208 
209 			/*
210 			 * Same checks as in pread64.
211 			 */
212 			if (sfv_off > MAXOFFSET_T) {
213 				VOP_RWUNLOCK(readvp, readflg, NULL);
214 				releasef(sfv->sfv_fd);
215 				return (EINVAL);
216 			}
217 
218 			if (sfv_off + sfv_len > MAXOFFSET_T)
219 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
220 
221 			/* Find the native blocksize to transfer data */
222 			size = MIN(vp->v_vfsp->vfs_bsize,
223 			    readvp->v_vfsp->vfs_bsize);
224 			size = sfv_len < size ? sfv_len : size;
225 			ptr = kmem_alloc(size, KM_SLEEP);
226 
227 			while (sfv_len > 0) {
228 				size_t	iov_len;
229 
230 				iov_len = MIN(size, sfv_len);
231 				aiov.iov_base = ptr;
232 				aiov.iov_len = iov_len;
233 				auio.uio_loffset = sfv_off;
234 				auio.uio_iov = &aiov;
235 				auio.uio_iovcnt = 1;
236 				auio.uio_resid = iov_len;
237 				auio.uio_segflg = UIO_SYSSPACE;
238 				auio.uio_llimit = MAXOFFSET_T;
239 				auio.uio_fmode = ffp->f_flag;
240 				ioflag = auio.uio_fmode &
241 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
242 
243 				/*
244 				 * If read sync is not asked for,
245 				 * filter sync flags
246 				 */
247 				if ((ioflag & FRSYNC) == 0)
248 					ioflag &= ~(FSYNC|FDSYNC);
249 				error = VOP_READ(readvp, &auio, ioflag,
250 				    fp->f_cred, NULL);
251 				if (error) {
252 					kmem_free(ptr, size);
253 					VOP_RWUNLOCK(readvp, readflg, NULL);
254 					releasef(sfv->sfv_fd);
255 					return (error);
256 				}
257 
258 				/*
259 				 * Check how must data was really read.
260 				 * Decrement the 'len' and increment the
261 				 * 'off' appropriately.
262 				 */
263 				cnt = iov_len - auio.uio_resid;
264 				if (cnt == 0) {
265 					/*
266 					 * If we were reading a pipe (currently
267 					 * not implemented), we may now lose
268 					 * data.
269 					 */
270 					kmem_free(ptr, size);
271 					VOP_RWUNLOCK(readvp, readflg, NULL);
272 					releasef(sfv->sfv_fd);
273 					return (EINVAL);
274 				}
275 				sfv_len -= cnt;
276 				sfv_off += cnt;
277 
278 				aiov.iov_base = ptr;
279 				aiov.iov_len = cnt;
280 				auio.uio_loffset = *fileoff;
281 				auio.uio_iov = &aiov;
282 				auio.uio_iovcnt = 1;
283 				auio.uio_resid = cnt;
284 				auio.uio_segflg = UIO_SYSSPACE;
285 				auio.uio_llimit = curproc->p_fsz_ctl;
286 				auio.uio_fmode = fflag;
287 				ioflag = auio.uio_fmode &
288 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
289 				error = VOP_WRITE(vp, &auio, ioflag,
290 				    fp->f_cred, NULL);
291 
292 				/*
293 				 * Check how much data was written. Increment
294 				 * the 'len' and decrement the 'off' if all
295 				 * the data was not written.
296 				 */
297 				cnt -= auio.uio_resid;
298 				sfv_len += auio.uio_resid;
299 				sfv_off -= auio.uio_resid;
300 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
301 				if (vp->v_type == VREG)
302 					*fileoff += cnt;
303 				*count += cnt;
304 				if (error != 0) {
305 					kmem_free(ptr, size);
306 					VOP_RWUNLOCK(readvp, readflg, NULL);
307 					releasef(sfv->sfv_fd);
308 					return (error);
309 				}
310 			}
311 			VOP_RWUNLOCK(readvp, readflg, NULL);
312 			releasef(sfv->sfv_fd);
313 			kmem_free(ptr, size);
314 		}
315 		sfv++;
316 	}
317 	return (0);
318 }
319 
320 ssize32_t
321 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
322 	size32_t *xferred, int fildes)
323 {
324 	u_offset_t		fileoff;
325 	int			copy_cnt;
326 	const struct ksendfilevec64 *copy_vec;
327 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
328 	struct vnode *vp;
329 	int error;
330 	ssize32_t count = 0;
331 
332 	vp = fp->f_vnode;
333 	(void) VOP_RWLOCK(vp, rwflag, NULL);
334 
335 	copy_vec = vec;
336 	fileoff = fp->f_offset;
337 
338 	do {
339 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
340 		if (copyin(copy_vec, sfv, copy_cnt *
341 		    sizeof (struct ksendfilevec64))) {
342 			error = EFAULT;
343 			break;
344 		}
345 
346 		/*
347 		 * Optimize the regular file over
348 		 * the socket case.
349 		 */
350 		if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
351 			file_t *rfp;
352 			vnode_t *rvp;
353 
354 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
355 				error = EBADF;
356 				break;
357 			}
358 			if ((rfp->f_flag & FREAD) == 0) {
359 				releasef(sfv->sfv_fd);
360 				error = EBADF;
361 				break;
362 			}
363 			rvp = rfp->f_vnode;
364 			if (rvp->v_type == VREG) {
365 				error = sosendfile64(fp, rfp, sfv, &count);
366 				if (error)
367 					break;
368 				copy_vec++;
369 				sfvcnt--;
370 				continue;
371 			}
372 			releasef(sfv->sfv_fd);
373 		}
374 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
375 		if (error != 0)
376 			break;
377 
378 		copy_vec += copy_cnt;
379 		sfvcnt -= copy_cnt;
380 	} while (sfvcnt > 0);
381 
382 	if (vp->v_type == VREG)
383 		fp->f_offset += count;
384 
385 	VOP_RWUNLOCK(vp, rwflag, NULL);
386 	if (copyout(&count, xferred, sizeof (count)))
387 		error = EFAULT;
388 	releasef(fildes);
389 	if (error != 0)
390 		return (set_errno(error));
391 	return (count);
392 }
393 #endif
394 
395 int
396 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
397     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
398 {
399 	struct vnode *vp;
400 	struct uio auio;
401 	struct iovec aiov;
402 	ushort_t fflag;
403 	int ioflag;
404 	int i, error;
405 	size_t cnt;
406 	ssize_t sfv_len;
407 	u_offset_t sfv_off;
408 #ifdef _SYSCALL32_IMPL
409 	model_t model = get_udatamodel();
410 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
411 	    MAXOFF32_T : MAXOFFSET_T;
412 #else
413 	const u_offset_t maxoff = MAXOFF32_T;
414 #endif
415 	mblk_t *dmp = NULL;
416 	int wroff;
417 	int buf_left = 0;
418 	size_t	iov_len;
419 	mblk_t  *head, *tmp;
420 	size_t  size = total_size;
421 	size_t  extra;
422 	int tail_len;
423 	struct nmsghdr msg;
424 
425 	fflag = fp->f_flag;
426 	vp = fp->f_vnode;
427 
428 	ASSERT(vp->v_type == VSOCK);
429 	ASSERT(maxblk > 0);
430 
431 	/* If nothing to send, return */
432 	if (total_size == 0)
433 		return (0);
434 
435 	if (vp->v_stream != NULL) {
436 		wroff = (int)vp->v_stream->sd_wroff;
437 		tail_len = (int)vp->v_stream->sd_tail;
438 	} else {
439 		struct sonode *so;
440 
441 		so = VTOSO(vp);
442 		wroff = so->so_proto_props.sopp_wroff;
443 		tail_len = so->so_proto_props.sopp_tail;
444 	}
445 
446 	extra = wroff + tail_len;
447 
448 	buf_left = MIN(total_size, maxblk);
449 	head = dmp = allocb(buf_left + extra, BPRI_HI);
450 	if (head == NULL)
451 		return (ENOMEM);
452 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
453 	bzero(&msg, sizeof (msg));
454 
455 	auio.uio_extflg = UIO_COPY_DEFAULT;
456 	for (i = 0; i < copy_cnt; i++) {
457 		if (ISSIG(curthread, JUSTLOOKING)) {
458 			freemsg(head);
459 			return (EINTR);
460 		}
461 
462 		/*
463 		 * Do similar checks as "write" as we are writing
464 		 * sfv_len bytes into "vp".
465 		 */
466 		sfv_len = (ssize_t)sfv->sfv_len;
467 
468 		if (sfv_len == 0) {
469 			sfv++;
470 			continue;
471 		}
472 
473 		/* Check for overflow */
474 #ifdef _SYSCALL32_IMPL
475 		if (model == DATAMODEL_ILP32) {
476 			if (((ssize32_t)(*count + sfv_len)) < 0) {
477 				freemsg(head);
478 				return (EINVAL);
479 			}
480 		} else
481 #endif
482 		if ((*count + sfv_len) < 0) {
483 			freemsg(head);
484 			return (EINVAL);
485 		}
486 
487 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
488 
489 		if (sfv->sfv_fd == SFV_FD_SELF) {
490 			while (sfv_len > 0) {
491 				if (buf_left == 0) {
492 					tmp = dmp;
493 					buf_left = MIN(total_size, maxblk);
494 					iov_len = MIN(buf_left, sfv_len);
495 					dmp = allocb(buf_left + extra, BPRI_HI);
496 					if (dmp == NULL) {
497 						freemsg(head);
498 						return (ENOMEM);
499 					}
500 					dmp->b_wptr = dmp->b_rptr =
501 					    dmp->b_rptr + wroff;
502 					tmp->b_cont = dmp;
503 				} else {
504 					iov_len = MIN(buf_left, sfv_len);
505 				}
506 
507 				aiov.iov_len = iov_len;
508 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
509 				auio.uio_loffset = *fileoff;
510 				auio.uio_iovcnt = 1;
511 				auio.uio_resid = iov_len;
512 				auio.uio_iov = &aiov;
513 				auio.uio_segflg = UIO_USERSPACE;
514 				auio.uio_llimit = curproc->p_fsz_ctl;
515 				auio.uio_fmode = fflag;
516 
517 				buf_left -= iov_len;
518 				total_size -= iov_len;
519 				sfv_len -= iov_len;
520 				sfv_off += iov_len;
521 
522 				error = uiomove((caddr_t)dmp->b_wptr,
523 				    iov_len, UIO_WRITE, &auio);
524 				if (error != 0) {
525 					freemsg(head);
526 					return (error);
527 				}
528 				dmp->b_wptr += iov_len;
529 			}
530 		} else {
531 			file_t	*ffp;
532 			vnode_t	*readvp;
533 
534 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
535 				freemsg(head);
536 				return (EBADF);
537 			}
538 
539 			if ((ffp->f_flag & FREAD) == 0) {
540 				releasef(sfv->sfv_fd);
541 				freemsg(head);
542 				return (EACCES);
543 			}
544 
545 			readvp = ffp->f_vnode;
546 			if (readvp->v_type != VREG) {
547 				releasef(sfv->sfv_fd);
548 				freemsg(head);
549 				return (EINVAL);
550 			}
551 
552 			/*
553 			 * No point reading and writing to same vp,
554 			 * as long as both are regular files. readvp is not
555 			 * locked; but since we got it from an open file the
556 			 * contents will be valid during the time of access.
557 			 */
558 
559 			if (vn_compare(vp, readvp)) {
560 				releasef(sfv->sfv_fd);
561 				freemsg(head);
562 				return (EINVAL);
563 			}
564 
565 			/*
566 			 * Note: we assume readvp != vp. "vp" is already
567 			 * locked, and "readvp" must not be.
568 			 */
569 
570 			(void) VOP_RWLOCK(readvp, readflg, NULL);
571 
572 			/* Same checks as in pread */
573 			if (sfv_off > maxoff) {
574 				VOP_RWUNLOCK(readvp, readflg, NULL);
575 				releasef(sfv->sfv_fd);
576 				freemsg(head);
577 				return (EINVAL);
578 			}
579 			if (sfv_off + sfv_len > maxoff) {
580 				total_size -= (sfv_off + sfv_len - maxoff);
581 				sfv_len = (ssize_t)((offset_t)maxoff -
582 				    sfv_off);
583 			}
584 
585 			while (sfv_len > 0) {
586 				if (buf_left == 0) {
587 					tmp = dmp;
588 					buf_left = MIN(total_size, maxblk);
589 					iov_len = MIN(buf_left, sfv_len);
590 					dmp = allocb(buf_left + extra, BPRI_HI);
591 					if (dmp == NULL) {
592 						VOP_RWUNLOCK(readvp, readflg,
593 						    NULL);
594 						releasef(sfv->sfv_fd);
595 						freemsg(head);
596 						return (ENOMEM);
597 					}
598 					dmp->b_wptr = dmp->b_rptr =
599 					    dmp->b_rptr + wroff;
600 					tmp->b_cont = dmp;
601 				} else {
602 					iov_len = MIN(buf_left, sfv_len);
603 				}
604 				aiov.iov_base = (caddr_t)dmp->b_wptr;
605 				aiov.iov_len = iov_len;
606 				auio.uio_loffset = sfv_off;
607 				auio.uio_iov = &aiov;
608 				auio.uio_iovcnt = 1;
609 				auio.uio_resid = iov_len;
610 				auio.uio_segflg = UIO_SYSSPACE;
611 				auio.uio_llimit = MAXOFFSET_T;
612 				auio.uio_fmode = ffp->f_flag;
613 				ioflag = auio.uio_fmode &
614 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
615 
616 				/*
617 				 * If read sync is not asked for,
618 				 * filter sync flags
619 				 */
620 				if ((ioflag & FRSYNC) == 0)
621 					ioflag &= ~(FSYNC|FDSYNC);
622 				error = VOP_READ(readvp, &auio, ioflag,
623 				    fp->f_cred, NULL);
624 				if (error != 0) {
625 					/*
626 					 * If we were reading a pipe (currently
627 					 * not implemented), we may now loose
628 					 * data.
629 					 */
630 					VOP_RWUNLOCK(readvp, readflg, NULL);
631 					releasef(sfv->sfv_fd);
632 					freemsg(head);
633 					return (error);
634 				}
635 
636 				/*
637 				 * Check how much data was really read.
638 				 * Decrement the 'len' and increment the
639 				 * 'off' appropriately.
640 				 */
641 				cnt = iov_len - auio.uio_resid;
642 				if (cnt == 0) {
643 					VOP_RWUNLOCK(readvp, readflg, NULL);
644 					releasef(sfv->sfv_fd);
645 					freemsg(head);
646 					return (EINVAL);
647 				}
648 				sfv_len -= cnt;
649 				sfv_off += cnt;
650 				total_size -= cnt;
651 				buf_left -= cnt;
652 
653 				dmp->b_wptr += cnt;
654 			}
655 			VOP_RWUNLOCK(readvp, readflg, NULL);
656 			releasef(sfv->sfv_fd);
657 		}
658 		sfv++;
659 	}
660 
661 	ASSERT(total_size == 0);
662 	error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
663 	if (error != 0) {
664 		if (head != NULL)
665 			freemsg(head);
666 		return (error);
667 	}
668 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
669 	*count += size;
670 
671 	return (0);
672 }
673 
674 
675 int
676 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
677     int copy_cnt, ssize_t *count)
678 {
679 	struct vnode *vp;
680 	struct uio auio;
681 	struct iovec aiov;
682 	ushort_t fflag;
683 	int ioflag;
684 	int i, error;
685 	size_t cnt;
686 	ssize_t sfv_len;
687 	u_offset_t sfv_off;
688 #ifdef _SYSCALL32_IMPL
689 	model_t model = get_udatamodel();
690 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
691 	    MAXOFF32_T : MAXOFFSET_T;
692 #else
693 	const u_offset_t maxoff = MAXOFF32_T;
694 #endif
695 	mblk_t	*dmp = NULL;
696 	char	*buf = NULL;
697 	size_t  extra;
698 	int maxblk, wroff, tail_len;
699 	struct sonode *so;
700 	stdata_t *stp;
701 	struct nmsghdr msg;
702 
703 	fflag = fp->f_flag;
704 	vp = fp->f_vnode;
705 
706 	if (vp->v_type == VSOCK) {
707 		so = VTOSO(vp);
708 		if (vp->v_stream != NULL) {
709 			stp = vp->v_stream;
710 			wroff = (int)stp->sd_wroff;
711 			tail_len = (int)stp->sd_tail;
712 			maxblk = (int)stp->sd_maxblk;
713 		} else {
714 			stp = NULL;
715 			wroff = so->so_proto_props.sopp_wroff;
716 			tail_len = so->so_proto_props.sopp_tail;
717 			maxblk = so->so_proto_props.sopp_maxblk;
718 		}
719 		extra = wroff + tail_len;
720 	}
721 
722 	bzero(&msg, sizeof (msg));
723 	auio.uio_extflg = UIO_COPY_DEFAULT;
724 	for (i = 0; i < copy_cnt; i++) {
725 		if (ISSIG(curthread, JUSTLOOKING))
726 			return (EINTR);
727 
728 		/*
729 		 * Do similar checks as "write" as we are writing
730 		 * sfv_len bytes into "vp".
731 		 */
732 		sfv_len = (ssize_t)sfv->sfv_len;
733 
734 		if (sfv_len == 0) {
735 			sfv++;
736 			continue;
737 		}
738 
739 		if (vp->v_type == VREG) {
740 			if (*fileoff >= curproc->p_fsz_ctl) {
741 				mutex_enter(&curproc->p_lock);
742 				(void) rctl_action(
743 				    rctlproc_legacy[RLIMIT_FSIZE],
744 				    curproc->p_rctls, curproc, RCA_SAFE);
745 				mutex_exit(&curproc->p_lock);
746 
747 				return (EFBIG);
748 			}
749 
750 			if (*fileoff >= maxoff)
751 				return (EFBIG);
752 
753 			if (*fileoff + sfv_len > maxoff)
754 				return (EINVAL);
755 		}
756 
757 		/* Check for overflow */
758 #ifdef _SYSCALL32_IMPL
759 		if (model == DATAMODEL_ILP32) {
760 			if (((ssize32_t)(*count + sfv_len)) < 0)
761 				return (EINVAL);
762 		} else
763 #endif
764 		if ((*count + sfv_len) < 0)
765 			return (EINVAL);
766 
767 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
768 
769 		if (sfv->sfv_fd == SFV_FD_SELF) {
770 			if (vp->v_type == VSOCK) {
771 				while (sfv_len > 0) {
772 					size_t iov_len;
773 
774 					iov_len = sfv_len;
775 					if (!SOCK_IS_NONSTR(so) &&
776 					    SOTOTPI(so)->sti_kssl_ctx != NULL)
777 						iov_len = MIN(iov_len, maxblk);
778 
779 					aiov.iov_len = iov_len;
780 					aiov.iov_base =
781 					    (caddr_t)(uintptr_t)sfv_off;
782 
783 					auio.uio_iov = &aiov;
784 					auio.uio_iovcnt = 1;
785 					auio.uio_loffset = *fileoff;
786 					auio.uio_segflg = UIO_USERSPACE;
787 					auio.uio_fmode = fflag;
788 					auio.uio_llimit = curproc->p_fsz_ctl;
789 					auio.uio_resid = iov_len;
790 
791 					dmp = allocb(iov_len + extra, BPRI_HI);
792 					if (dmp == NULL)
793 						return (ENOMEM);
794 					dmp->b_wptr = dmp->b_rptr =
795 					    dmp->b_rptr + wroff;
796 					error = uiomove((caddr_t)dmp->b_wptr,
797 					    iov_len, UIO_WRITE, &auio);
798 					if (error != 0) {
799 						freeb(dmp);
800 						return (error);
801 					}
802 					dmp->b_wptr += iov_len;
803 					error = socket_sendmblk(VTOSO(vp),
804 					    &msg, fflag, CRED(), &dmp);
805 
806 					if (error != 0) {
807 						if (dmp != NULL)
808 							freeb(dmp);
809 						return (error);
810 					}
811 					ttolwp(curthread)->lwp_ru.ioch +=
812 					    (ulong_t)iov_len;
813 					*count += iov_len;
814 					sfv_len -= iov_len;
815 					sfv_off += iov_len;
816 				}
817 			} else {
818 				aiov.iov_len = sfv_len;
819 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
820 
821 				auio.uio_iov = &aiov;
822 				auio.uio_iovcnt = 1;
823 				auio.uio_loffset = *fileoff;
824 				auio.uio_segflg = UIO_USERSPACE;
825 				auio.uio_fmode = fflag;
826 				auio.uio_llimit = curproc->p_fsz_ctl;
827 				auio.uio_resid = sfv_len;
828 
829 				ioflag = auio.uio_fmode &
830 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
831 				while (sfv_len > 0) {
832 					error = VOP_WRITE(vp, &auio, ioflag,
833 					    fp->f_cred, NULL);
834 					cnt = sfv_len - auio.uio_resid;
835 					sfv_len -= cnt;
836 					ttolwp(curthread)->lwp_ru.ioch +=
837 					    (ulong_t)cnt;
838 					*fileoff += cnt;
839 					*count += cnt;
840 					if (error != 0)
841 						return (error);
842 				}
843 			}
844 		} else {
845 			int segmapit = 0;
846 			file_t	*ffp;
847 			vnode_t	*readvp;
848 			struct vnode *realvp;
849 			size_t	size;
850 			caddr_t	ptr;
851 
852 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
853 				return (EBADF);
854 
855 			if ((ffp->f_flag & FREAD) == 0) {
856 				releasef(sfv->sfv_fd);
857 				return (EBADF);
858 			}
859 
860 			readvp = ffp->f_vnode;
861 			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
862 				readvp = realvp;
863 			if (readvp->v_type != VREG) {
864 				releasef(sfv->sfv_fd);
865 				return (EINVAL);
866 			}
867 
868 			/*
869 			 * No point reading and writing to same vp,
870 			 * as long as both are regular files. readvp is not
871 			 * locked; but since we got it from an open file the
872 			 * contents will be valid during the time of access.
873 			 */
874 			if (vn_compare(vp, readvp)) {
875 				releasef(sfv->sfv_fd);
876 				return (EINVAL);
877 			}
878 
879 			/*
880 			 * Note: we assume readvp != vp. "vp" is already
881 			 * locked, and "readvp" must not be.
882 			 */
883 			(void) VOP_RWLOCK(readvp, readflg, NULL);
884 
885 			/* Same checks as in pread */
886 			if (sfv_off > maxoff) {
887 				VOP_RWUNLOCK(readvp, readflg, NULL);
888 				releasef(sfv->sfv_fd);
889 				return (EINVAL);
890 			}
891 			if (sfv_off + sfv_len > maxoff) {
892 				sfv_len = (ssize_t)((offset_t)maxoff -
893 				    sfv_off);
894 			}
895 			/* Find the native blocksize to transfer data */
896 			size = MIN(vp->v_vfsp->vfs_bsize,
897 			    readvp->v_vfsp->vfs_bsize);
898 			size = sfv_len < size ? sfv_len : size;
899 
900 			if (vp->v_type != VSOCK) {
901 				segmapit = 0;
902 				buf = kmem_alloc(size, KM_NOSLEEP);
903 				if (buf == NULL) {
904 					VOP_RWUNLOCK(readvp, readflg, NULL);
905 					releasef(sfv->sfv_fd);
906 					return (ENOMEM);
907 				}
908 			} else {
909 				uint_t	copyflag;
910 
911 				copyflag = stp != NULL ? stp->sd_copyflag :
912 				    so->so_proto_props.sopp_zcopyflag;
913 				/*
914 				 * For sockets acting as an SSL proxy, we
915 				 * need to adjust the size to the maximum
916 				 * SSL record size set in the stream head.
917 				 */
918 				if (!SOCK_IS_NONSTR(so) &&
919 				    _SOTOTPI(so)->sti_kssl_ctx != NULL)
920 					size = MIN(size, maxblk);
921 
922 				if (vn_has_flocks(readvp) ||
923 				    readvp->v_flag & VNOMAP ||
924 				    copyflag & STZCVMUNSAFE) {
925 					segmapit = 0;
926 				} else if (copyflag & STZCVMSAFE) {
927 					segmapit = 1;
928 				} else {
929 					int on = 1;
930 					if (socket_setsockopt(VTOSO(vp),
931 					    SOL_SOCKET, SO_SND_COPYAVOID,
932 					    &on, sizeof (on), CRED()) == 0)
933 					segmapit = 1;
934 				}
935 			}
936 
937 			if (segmapit) {
938 				boolean_t nowait;
939 
940 				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
941 				error = snf_segmap(fp, readvp, sfv_off,
942 				    (u_offset_t)sfv_len, (ssize_t *)&cnt,
943 				    nowait);
944 				releasef(sfv->sfv_fd);
945 				*count += cnt;
946 				if (error)
947 					return (error);
948 				sfv++;
949 				continue;
950 			}
951 
952 			while (sfv_len > 0) {
953 				size_t	iov_len;
954 
955 				iov_len = MIN(size, sfv_len);
956 
957 				if (vp->v_type == VSOCK) {
958 					dmp = allocb(iov_len + extra, BPRI_HI);
959 					if (dmp == NULL) {
960 						VOP_RWUNLOCK(readvp, readflg,
961 						    NULL);
962 						releasef(sfv->sfv_fd);
963 						return (ENOMEM);
964 					}
965 					dmp->b_wptr = dmp->b_rptr =
966 					    dmp->b_rptr + wroff;
967 					ptr = (caddr_t)dmp->b_rptr;
968 				} else {
969 					ptr = buf;
970 				}
971 
972 				aiov.iov_base = ptr;
973 				aiov.iov_len = iov_len;
974 				auio.uio_loffset = sfv_off;
975 				auio.uio_iov = &aiov;
976 				auio.uio_iovcnt = 1;
977 				auio.uio_resid = iov_len;
978 				auio.uio_segflg = UIO_SYSSPACE;
979 				auio.uio_llimit = MAXOFFSET_T;
980 				auio.uio_fmode = ffp->f_flag;
981 				ioflag = auio.uio_fmode &
982 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
983 
984 				/*
985 				 * If read sync is not asked for,
986 				 * filter sync flags
987 				 */
988 				if ((ioflag & FRSYNC) == 0)
989 					ioflag &= ~(FSYNC|FDSYNC);
990 				error = VOP_READ(readvp, &auio, ioflag,
991 				    fp->f_cred, NULL);
992 				if (error != 0) {
993 					/*
994 					 * If we were reading a pipe (currently
995 					 * not implemented), we may now lose
996 					 * data.
997 					 */
998 					if (vp->v_type == VSOCK)
999 						freeb(dmp);
1000 					else
1001 						kmem_free(buf, size);
1002 					VOP_RWUNLOCK(readvp, readflg, NULL);
1003 					releasef(sfv->sfv_fd);
1004 					return (error);
1005 				}
1006 
1007 				/*
1008 				 * Check how much data was really read.
1009 				 * Decrement the 'len' and increment the
1010 				 * 'off' appropriately.
1011 				 */
1012 				cnt = iov_len - auio.uio_resid;
1013 				if (cnt == 0) {
1014 					if (vp->v_type == VSOCK)
1015 						freeb(dmp);
1016 					else
1017 						kmem_free(buf, size);
1018 					VOP_RWUNLOCK(readvp, readflg, NULL);
1019 					releasef(sfv->sfv_fd);
1020 					return (EINVAL);
1021 				}
1022 				sfv_len -= cnt;
1023 				sfv_off += cnt;
1024 
1025 				if (vp->v_type == VSOCK) {
1026 					dmp->b_wptr = dmp->b_rptr + cnt;
1027 
1028 					error = socket_sendmblk(VTOSO(vp),
1029 					    &msg, fflag, CRED(), &dmp);
1030 
1031 					if (error != 0) {
1032 						if (dmp != NULL)
1033 							freeb(dmp);
1034 						VOP_RWUNLOCK(readvp, readflg,
1035 						    NULL);
1036 						releasef(sfv->sfv_fd);
1037 						return (error);
1038 					}
1039 
1040 					ttolwp(curthread)->lwp_ru.ioch +=
1041 					    (ulong_t)cnt;
1042 					*count += cnt;
1043 				} else {
1044 
1045 					aiov.iov_base = ptr;
1046 					aiov.iov_len = cnt;
1047 					auio.uio_loffset = *fileoff;
1048 					auio.uio_resid = cnt;
1049 					auio.uio_iov = &aiov;
1050 					auio.uio_iovcnt = 1;
1051 					auio.uio_segflg = UIO_SYSSPACE;
1052 					auio.uio_llimit = curproc->p_fsz_ctl;
1053 					auio.uio_fmode = fflag;
1054 					ioflag = auio.uio_fmode &
1055 					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1056 					error = VOP_WRITE(vp, &auio, ioflag,
1057 					    fp->f_cred, NULL);
1058 
1059 					/*
1060 					 * Check how much data was written.
1061 					 * Increment the 'len' and decrement the
1062 					 * 'off' if all the data was not
1063 					 * written.
1064 					 */
1065 					cnt -= auio.uio_resid;
1066 					sfv_len += auio.uio_resid;
1067 					sfv_off -= auio.uio_resid;
1068 					ttolwp(curthread)->lwp_ru.ioch +=
1069 					    (ulong_t)cnt;
1070 					*fileoff += cnt;
1071 					*count += cnt;
1072 					if (error != 0) {
1073 						kmem_free(buf, size);
1074 						VOP_RWUNLOCK(readvp, readflg,
1075 						    NULL);
1076 						releasef(sfv->sfv_fd);
1077 						return (error);
1078 					}
1079 				}
1080 			}
1081 			if (buf) {
1082 				kmem_free(buf, size);
1083 				buf = NULL;
1084 			}
1085 			VOP_RWUNLOCK(readvp, readflg, NULL);
1086 			releasef(sfv->sfv_fd);
1087 		}
1088 		sfv++;
1089 	}
1090 	return (0);
1091 }
1092 
1093 ssize_t
1094 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1095     size_t *xferred)
1096 {
1097 	int error = 0;
1098 	int first_vector_error = 0;
1099 	file_t *fp;
1100 	struct vnode *vp;
1101 	struct sonode *so;
1102 	u_offset_t fileoff;
1103 	int copy_cnt;
1104 	const struct sendfilevec *copy_vec;
1105 	struct sendfilevec sfv[SEND_MAX_CHUNK];
1106 	ssize_t count = 0;
1107 #ifdef _SYSCALL32_IMPL
1108 	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1109 #endif
1110 	ssize_t total_size;
1111 	int i;
1112 	boolean_t is_sock = B_FALSE;
1113 	int maxblk = 0;
1114 
1115 	if (sfvcnt <= 0)
1116 		return (set_errno(EINVAL));
1117 
1118 	if ((fp = getf(fildes)) == NULL)
1119 		return (set_errno(EBADF));
1120 
1121 	if (((fp->f_flag) & FWRITE) == 0) {
1122 		error = EBADF;
1123 		goto err;
1124 	}
1125 
1126 	fileoff = fp->f_offset;
1127 	vp = fp->f_vnode;
1128 
1129 	switch (vp->v_type) {
1130 	case VSOCK:
1131 		so = VTOSO(vp);
1132 		is_sock = B_TRUE;
1133 		if (SOCK_IS_NONSTR(so)) {
1134 			maxblk = so->so_proto_props.sopp_maxblk;
1135 		} else {
1136 			maxblk = (int)vp->v_stream->sd_maxblk;
1137 		}
1138 		break;
1139 	case VREG:
1140 		break;
1141 	default:
1142 		error = EINVAL;
1143 		goto err;
1144 	}
1145 
1146 	switch (opcode) {
1147 	case SENDFILEV :
1148 		break;
1149 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1150 	case SENDFILEV64 :
1151 		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1152 		    (size32_t *)xferred, fildes));
1153 #endif
1154 	default :
1155 		error = ENOSYS;
1156 		break;
1157 	}
1158 
1159 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1160 	copy_vec = vec;
1161 
1162 	do {
1163 		total_size = 0;
1164 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1165 #ifdef _SYSCALL32_IMPL
1166 		/* 32-bit callers need to have their iovec expanded. */
1167 		if (get_udatamodel() == DATAMODEL_ILP32) {
1168 			if (copyin(copy_vec, sfv32,
1169 			    copy_cnt * sizeof (ksendfilevec32_t))) {
1170 				error = EFAULT;
1171 				break;
1172 			}
1173 
1174 			for (i = 0; i < copy_cnt; i++) {
1175 				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1176 				sfv[i].sfv_off =
1177 				    (off_t)(uint32_t)sfv32[i].sfv_off;
1178 				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1179 				total_size += sfv[i].sfv_len;
1180 				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1181 				/*
1182 				 * Individual elements of the vector must not
1183 				 * wrap or overflow, as later math is signed.
1184 				 * Equally total_size needs to be checked after
1185 				 * each vector is added in, to be sure that
1186 				 * rogue values haven't overflowed the counter.
1187 				 */
1188 				if (((ssize32_t)sfv[i].sfv_len < 0) ||
1189 				    ((ssize32_t)total_size < 0)) {
1190 					/*
1191 					 * Truncate the vector to send data
1192 					 * described by elements before the
1193 					 * error.
1194 					 */
1195 					copy_cnt = i;
1196 					first_vector_error = EINVAL;
1197 					/* total_size can't be trusted */
1198 					if ((ssize32_t)total_size < 0)
1199 						error = EINVAL;
1200 					break;
1201 				}
1202 			}
1203 			/* Nothing to do, process errors */
1204 			if (copy_cnt == 0)
1205 				break;
1206 
1207 		} else {
1208 #endif
1209 			if (copyin(copy_vec, sfv,
1210 			    copy_cnt * sizeof (sendfilevec_t))) {
1211 				error = EFAULT;
1212 				break;
1213 			}
1214 
1215 			for (i = 0; i < copy_cnt; i++) {
1216 				total_size += sfv[i].sfv_len;
1217 				/*
1218 				 * Individual elements of the vector must not
1219 				 * wrap or overflow, as later math is signed.
1220 				 * Equally total_size needs to be checked after
1221 				 * each vector is added in, to be sure that
1222 				 * rogue values haven't overflowed the counter.
1223 				 */
1224 				if (((ssize_t)sfv[i].sfv_len < 0) ||
1225 				    (total_size < 0)) {
1226 					/*
1227 					 * Truncate the vector to send data
1228 					 * described by elements before the
1229 					 * error.
1230 					 */
1231 					copy_cnt = i;
1232 					first_vector_error = EINVAL;
1233 					/* total_size can't be trusted */
1234 					if (total_size < 0)
1235 						error = EINVAL;
1236 					break;
1237 				}
1238 			}
1239 			/* Nothing to do, process errors */
1240 			if (copy_cnt == 0)
1241 				break;
1242 #ifdef _SYSCALL32_IMPL
1243 		}
1244 #endif
1245 
1246 		/*
1247 		 * The task between deciding to use sendvec_small_chunk
1248 		 * and sendvec_chunk is dependant on multiple things:
1249 		 *
1250 		 * i) latency is important for smaller files. So if the
1251 		 * data is smaller than 'tcp_slow_start_initial' times
1252 		 * maxblk, then use sendvec_small_chunk which creates
1253 		 * maxblk size mblks and chains them together and sends
1254 		 * them to TCP in one shot. It also leaves 'wroff' size
1255 		 * space for the headers in each mblk.
1256 		 *
1257 		 * ii) for total size bigger than 'tcp_slow_start_initial'
1258 		 * time maxblk, its probably real file data which is
1259 		 * dominating. So its better to use sendvec_chunk because
1260 		 * performance goes to dog if we don't do pagesize reads.
1261 		 * sendvec_chunk will do pagesize reads and write them
1262 		 * in pagesize mblks to TCP.
1263 		 *
1264 		 * Side Notes: A write to file has not been optimized.
1265 		 * Future zero copy code will plugin into sendvec_chunk
1266 		 * only because doing zero copy for files smaller then
1267 		 * pagesize is useless.
1268 		 *
1269 		 * Note, if socket has NL7C enabled then call NL7C's
1270 		 * senfilev() function to consume the sfv[].
1271 		 */
1272 		if (is_sock) {
1273 			if (!SOCK_IS_NONSTR(so) &&
1274 			    _SOTOTPI(so)->sti_nl7c_flags != 0) {
1275 				error = nl7c_sendfilev(so, &fileoff,
1276 				    sfv, copy_cnt, &count);
1277 			} else if ((total_size <= (4 * maxblk)) &&
1278 			    error == 0) {
1279 				error = sendvec_small_chunk(fp,
1280 				    &fileoff, sfv, copy_cnt,
1281 				    total_size, maxblk, &count);
1282 			} else {
1283 				error = sendvec_chunk(fp, &fileoff,
1284 				    sfv, copy_cnt, &count);
1285 			}
1286 		} else {
1287 			ASSERT(vp->v_type == VREG);
1288 			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1289 			    &count);
1290 		}
1291 
1292 
1293 #ifdef _SYSCALL32_IMPL
1294 	if (get_udatamodel() == DATAMODEL_ILP32)
1295 		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1296 		    (copy_cnt * sizeof (ksendfilevec32_t)));
1297 	else
1298 #endif
1299 		copy_vec += copy_cnt;
1300 		sfvcnt -= copy_cnt;
1301 
1302 	/* Process all vector members up to first error */
1303 	} while ((sfvcnt > 0) && first_vector_error == 0 && error == 0);
1304 
1305 	if (vp->v_type == VREG)
1306 		fp->f_offset += count;
1307 
1308 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1309 
1310 #ifdef _SYSCALL32_IMPL
1311 	if (get_udatamodel() == DATAMODEL_ILP32) {
1312 		ssize32_t count32 = (ssize32_t)count;
1313 		if (copyout(&count32, xferred, sizeof (count32)))
1314 			error = EFAULT;
1315 		releasef(fildes);
1316 		if (error != 0)
1317 			return (set_errno(error));
1318 		if (first_vector_error != 0)
1319 			return (set_errno(first_vector_error));
1320 		return (count32);
1321 	}
1322 #endif
1323 	if (copyout(&count, xferred, sizeof (count)))
1324 		error = EFAULT;
1325 	releasef(fildes);
1326 	if (error != 0)
1327 		return (set_errno(error));
1328 	if (first_vector_error != 0)
1329 		return (set_errno(first_vector_error));
1330 	return (count);
1331 err:
1332 	ASSERT(error != 0);
1333 	releasef(fildes);
1334 	return (set_errno(error));
1335 }
1336