xref: /illumos-gate/usr/src/uts/common/syscall/sendfile.c (revision 68ac2337c38c8af06edcf32a72e42de36ec72a9d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vmsystm.h>
54 
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 /* swilly code in sys/socketvar.h turns off DEBUG */
58 #ifdef __lint
59 #define	DEBUG
60 #endif
61 
62 #include <netinet/in.h>
63 #include <sys/sendfile.h>
64 #include <sys/un.h>
65 #include <sys/tihdr.h>
66 #include <sys/atomic.h>
67 
68 #include <inet/common.h>
69 #include <inet/ip.h>
70 #include <inet/ip6.h>
71 #include <inet/tcp.h>
72 
73 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
74 		ssize32_t *);
75 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
76 		int, ssize_t *);
77 
78 /*
79  * kstrwritemp() has very similar semantics as that of strwrite().
80  * The main difference is it obtains mblks from the caller and also
81  * does not do any copy as done in strwrite() from user buffers to
82  * kernel buffers.
83  *
84  * Currently, this routine is used by sendfile to send data allocated
85  * within the kernel without any copying. This interface does not use the
86  * synchronous stream interface as synch. stream interface implies
87  * copying.
88  */
89 int
90 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
91 {
92 	struct stdata *stp;
93 	struct queue *wqp;
94 	mblk_t *newmp;
95 	char waitflag;
96 	int tempmode;
97 	int error = 0;
98 	int done = 0;
99 	struct sonode *so;
100 	boolean_t direct;
101 
102 	ASSERT(vp->v_stream);
103 	stp = vp->v_stream;
104 
105 	so = VTOSO(vp);
106 	direct = (so->so_state & SS_DIRECT);
107 
108 	/*
109 	 * This is the sockfs direct fast path. canputnext() need
110 	 * not be accurate so we don't grab the sd_lock here. If
111 	 * we get flow-controlled, we grab sd_lock just before the
112 	 * do..while loop below to emulate what strwrite() does.
113 	 */
114 	wqp = stp->sd_wrq;
115 	if (canputnext(wqp) && direct &&
116 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
117 		return (sostream_direct(so, NULL, mp, CRED()));
118 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
119 		/* Fast check of flags before acquiring the lock */
120 		mutex_enter(&stp->sd_lock);
121 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
122 		mutex_exit(&stp->sd_lock);
123 		if (error != 0) {
124 			if (!(stp->sd_flag & STPLEX) &&
125 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
126 				tsignal(curthread, SIGPIPE);
127 				error = EPIPE;
128 			}
129 			return (error);
130 		}
131 	}
132 
133 	waitflag = WRITEWAIT;
134 	if (stp->sd_flag & OLDNDELAY)
135 		tempmode = fmode & ~FNDELAY;
136 	else
137 		tempmode = fmode;
138 
139 	mutex_enter(&stp->sd_lock);
140 	do {
141 		if (canputnext(wqp)) {
142 			mutex_exit(&stp->sd_lock);
143 			if (stp->sd_wputdatafunc != NULL) {
144 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
145 				    NULL, NULL, NULL);
146 				if (newmp == NULL) {
147 					/* The caller will free mp */
148 					return (ECOMM);
149 				}
150 				mp = newmp;
151 			}
152 			putnext(wqp, mp);
153 			return (0);
154 		}
155 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
156 		    &done);
157 	} while (error == 0 && !done);
158 
159 	mutex_exit(&stp->sd_lock);
160 	/*
161 	 * EAGAIN tells the application to try again. ENOMEM
162 	 * is returned only if the memory allocation size
163 	 * exceeds the physical limits of the system. ENOMEM
164 	 * can't be true here.
165 	 */
166 	if (error == ENOMEM)
167 		error = EAGAIN;
168 	return (error);
169 }
170 
171 #define	SEND_MAX_CHUNK	16
172 
173 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
174 /*
175  * 64 bit offsets for 32 bit applications only running either on
176  * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
177  * more than 2GB of data.
178  */
179 int
180 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
181     int copy_cnt, ssize32_t *count)
182 {
183 	struct vnode *vp;
184 	ushort_t fflag;
185 	int ioflag;
186 	size32_t cnt;
187 	ssize32_t sfv_len;
188 	ssize32_t tmpcount;
189 	u_offset_t sfv_off;
190 	struct uio auio;
191 	struct iovec aiov;
192 	int i, error;
193 
194 	fflag = fp->f_flag;
195 	vp = fp->f_vnode;
196 	for (i = 0; i < copy_cnt; i++) {
197 
198 		if (ISSIG(curthread, JUSTLOOKING))
199 			return (EINTR);
200 
201 		/*
202 		 * Do similar checks as "write" as we are writing
203 		 * sfv_len bytes into "vp".
204 		 */
205 		sfv_len = (ssize32_t)sfv->sfv_len;
206 
207 		if (sfv_len == 0)
208 			continue;
209 
210 		if (sfv_len < 0)
211 			return (EINVAL);
212 
213 		if (vp->v_type == VREG) {
214 			if (*fileoff >= curproc->p_fsz_ctl) {
215 				mutex_enter(&curproc->p_lock);
216 				(void) rctl_action(
217 				    rctlproc_legacy[RLIMIT_FSIZE],
218 				    curproc->p_rctls, curproc, RCA_SAFE);
219 				mutex_exit(&curproc->p_lock);
220 				return (EFBIG);
221 			}
222 
223 			if (*fileoff >= OFFSET_MAX(fp))
224 				return (EFBIG);
225 
226 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
227 				return (EINVAL);
228 		}
229 
230 		tmpcount = *count + sfv_len;
231 		if (tmpcount < 0)
232 			return (EINVAL);
233 
234 		sfv_off = sfv->sfv_off;
235 
236 		auio.uio_extflg = UIO_COPY_DEFAULT;
237 		if (sfv->sfv_fd == SFV_FD_SELF) {
238 			aiov.iov_len = sfv_len;
239 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
240 			auio.uio_loffset = *fileoff;
241 			auio.uio_iovcnt = 1;
242 			auio.uio_resid = sfv_len;
243 			auio.uio_iov = &aiov;
244 			auio.uio_segflg = UIO_USERSPACE;
245 			auio.uio_llimit = curproc->p_fsz_ctl;
246 			auio.uio_fmode = fflag;
247 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
248 			while (sfv_len > 0) {
249 				error = VOP_WRITE(vp, &auio, ioflag,
250 				    fp->f_cred, NULL);
251 				cnt = sfv_len - auio.uio_resid;
252 				sfv_len -= cnt;
253 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
254 				if (vp->v_type == VREG)
255 					*fileoff += cnt;
256 				*count += cnt;
257 				if (error != 0)
258 					return (error);
259 			}
260 		} else {
261 			file_t	*ffp;
262 			vnode_t	*readvp;
263 			int	readflg = 0;
264 			size_t	size;
265 			caddr_t	ptr;
266 
267 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
268 				return (EBADF);
269 
270 			if ((ffp->f_flag & FREAD) == 0) {
271 				releasef(sfv->sfv_fd);
272 				return (EBADF);
273 			}
274 
275 			readvp = ffp->f_vnode;
276 			if (readvp->v_type != VREG) {
277 				releasef(sfv->sfv_fd);
278 				return (EINVAL);
279 			}
280 
281 			/*
282 			 * No point reading and writing to same vp,
283 			 * as long as both are regular files. readvp is not
284 			 * locked; but since we got it from an open file the
285 			 * contents will be valid during the time of access.
286 			 */
287 			if (VN_CMP(vp, readvp)) {
288 				releasef(sfv->sfv_fd);
289 				return (EINVAL);
290 			}
291 
292 			/*
293 			 * Note: we assume readvp != vp. "vp" is already
294 			 * locked, and "readvp" must not be.
295 			 */
296 			(void) VOP_RWLOCK(readvp, readflg, NULL);
297 
298 			/*
299 			 * Same checks as in pread64.
300 			 */
301 			if (sfv_off > MAXOFFSET_T) {
302 				VOP_RWUNLOCK(readvp, readflg, NULL);
303 				releasef(sfv->sfv_fd);
304 				return (EINVAL);
305 			}
306 
307 			if (sfv_off + sfv_len > MAXOFFSET_T)
308 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
309 
310 			/* Find the native blocksize to transfer data */
311 			size = MIN(vp->v_vfsp->vfs_bsize,
312 			    readvp->v_vfsp->vfs_bsize);
313 			size = sfv_len < size ? sfv_len : size;
314 			ptr = kmem_alloc(size, KM_SLEEP);
315 
316 			while (sfv_len > 0) {
317 				size_t	iov_len;
318 
319 				iov_len = MIN(size, sfv_len);
320 				aiov.iov_base = ptr;
321 				aiov.iov_len = iov_len;
322 				auio.uio_loffset = sfv_off;
323 				auio.uio_iov = &aiov;
324 				auio.uio_iovcnt = 1;
325 				auio.uio_resid = iov_len;
326 				auio.uio_segflg = UIO_SYSSPACE;
327 				auio.uio_llimit = MAXOFFSET_T;
328 				auio.uio_fmode = ffp->f_flag;
329 				ioflag = auio.uio_fmode &
330 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
331 
332 				/*
333 				 * If read sync is not asked for,
334 				 * filter sync flags
335 				 */
336 				if ((ioflag & FRSYNC) == 0)
337 					ioflag &= ~(FSYNC|FDSYNC);
338 				error = VOP_READ(readvp, &auio, ioflag,
339 				    fp->f_cred, NULL);
340 				if (error) {
341 					kmem_free(ptr, size);
342 					VOP_RWUNLOCK(readvp, readflg, NULL);
343 					releasef(sfv->sfv_fd);
344 					return (error);
345 				}
346 
347 				/*
348 				 * Check how must data was really read.
349 				 * Decrement the 'len' and increment the
350 				 * 'off' appropriately.
351 				 */
352 				cnt = iov_len - auio.uio_resid;
353 				if (cnt == 0) {
354 					/*
355 					 * If we were reading a pipe (currently
356 					 * not implemented), we may now lose
357 					 * data.
358 					 */
359 					kmem_free(ptr, size);
360 					VOP_RWUNLOCK(readvp, readflg, NULL);
361 					releasef(sfv->sfv_fd);
362 					return (EINVAL);
363 				}
364 				sfv_len -= cnt;
365 				sfv_off += cnt;
366 
367 				aiov.iov_base = ptr;
368 				aiov.iov_len = cnt;
369 				auio.uio_loffset = *fileoff;
370 				auio.uio_resid = cnt;
371 				auio.uio_segflg = UIO_SYSSPACE;
372 				auio.uio_llimit = curproc->p_fsz_ctl;
373 				auio.uio_fmode = fflag;
374 				ioflag = auio.uio_fmode &
375 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
376 				error = VOP_WRITE(vp, &auio, ioflag,
377 				    fp->f_cred, NULL);
378 
379 				/*
380 				 * Check how much data was written. Increment
381 				 * the 'len' and decrement the 'off' if all
382 				 * the data was not written.
383 				 */
384 				cnt -= auio.uio_resid;
385 				sfv_len += auio.uio_resid;
386 				sfv_off -= auio.uio_resid;
387 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
388 				if (vp->v_type == VREG)
389 					*fileoff += cnt;
390 				*count += cnt;
391 				if (error != 0) {
392 					kmem_free(ptr, size);
393 					VOP_RWUNLOCK(readvp, readflg, NULL);
394 					releasef(sfv->sfv_fd);
395 					return (error);
396 				}
397 			}
398 			VOP_RWUNLOCK(readvp, readflg, NULL);
399 			releasef(sfv->sfv_fd);
400 			kmem_free(ptr, size);
401 		}
402 		sfv++;
403 	}
404 	return (0);
405 }
406 
407 ssize32_t
408 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
409 	size32_t *xferred, int fildes)
410 {
411 	int			rwflag;
412 	u_offset_t		fileoff;
413 	int			copy_cnt;
414 	const struct ksendfilevec64 *copy_vec;
415 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
416 	struct vnode *vp;
417 	int error;
418 	ssize32_t count = 0;
419 	int osfvcnt;
420 
421 	rwflag = 1;
422 	vp = fp->f_vnode;
423 	(void) VOP_RWLOCK(vp, rwflag, NULL);
424 
425 	copy_vec = vec;
426 	fileoff = fp->f_offset;
427 	osfvcnt = sfvcnt;
428 
429 	do {
430 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
431 		if (copyin(copy_vec, sfv, copy_cnt *
432 		    sizeof (struct ksendfilevec64))) {
433 			error = EFAULT;
434 			break;
435 		}
436 
437 		/*
438 		 * Optimize the single regular file over
439 		 * the socket case.
440 		 */
441 		if (vp->v_type == VSOCK && osfvcnt == 1 &&
442 		    sfv->sfv_fd != SFV_FD_SELF) {
443 			file_t *rfp;
444 			vnode_t *rvp;
445 
446 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
447 				error = EBADF;
448 				break;
449 			}
450 			if ((rfp->f_flag & FREAD) == 0) {
451 				releasef(sfv->sfv_fd);
452 				error = EBADF;
453 				break;
454 			}
455 			rvp = rfp->f_vnode;
456 			if (rvp->v_type == VREG) {
457 				error = sosendfile64(fp, rfp, sfv, &count);
458 				break;
459 			}
460 			releasef(sfv->sfv_fd);
461 		}
462 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
463 		if (error != 0)
464 			break;
465 
466 		copy_vec += copy_cnt;
467 		sfvcnt -= copy_cnt;
468 	} while (sfvcnt > 0);
469 
470 	if (vp->v_type == VREG)
471 		fp->f_offset += count;
472 
473 	VOP_RWUNLOCK(vp, rwflag, NULL);
474 	if (copyout(&count, xferred, sizeof (count)))
475 		error = EFAULT;
476 	releasef(fildes);
477 	if (error != 0)
478 		return (set_errno(error));
479 	return (count);
480 }
481 #endif
482 
483 int
484 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
485     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
486 {
487 	struct vnode *vp;
488 	struct uio auio;
489 	struct iovec aiov;
490 	ushort_t fflag;
491 	int ioflag;
492 	int i, error;
493 	size_t cnt;
494 	ssize_t sfv_len;
495 	u_offset_t sfv_off;
496 #ifdef _SYSCALL32_IMPL
497 	model_t model = get_udatamodel();
498 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
499 		MAXOFF32_T : MAXOFFSET_T;
500 #else
501 	const u_offset_t maxoff = MAXOFF32_T;
502 #endif
503 	mblk_t *dmp = NULL;
504 	int wroff;
505 	int buf_left = 0;
506 	size_t	iov_len;
507 	mblk_t  *head, *tmp;
508 	size_t  size = total_size;
509 	size_t  extra;
510 	int tail_len;
511 
512 	fflag = fp->f_flag;
513 	vp = fp->f_vnode;
514 
515 	ASSERT(vp->v_type == VSOCK);
516 	ASSERT(maxblk > 0);
517 
518 	wroff = (int)vp->v_stream->sd_wroff;
519 	tail_len = (int)vp->v_stream->sd_tail;
520 	extra = wroff + tail_len;
521 
522 	buf_left = MIN(total_size, maxblk);
523 	head = dmp = allocb(buf_left + extra, BPRI_HI);
524 	if (head == NULL)
525 		return (ENOMEM);
526 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
527 
528 	auio.uio_extflg = UIO_COPY_DEFAULT;
529 	for (i = 0; i < copy_cnt; i++) {
530 		if (ISSIG(curthread, JUSTLOOKING))
531 			return (EINTR);
532 
533 		/*
534 		 * Do similar checks as "write" as we are writing
535 		 * sfv_len bytes into "vp".
536 		 */
537 		sfv_len = (ssize_t)sfv->sfv_len;
538 
539 		if (sfv_len == 0) {
540 			sfv++;
541 			continue;
542 		}
543 
544 		/* Make sure sfv_len is not negative */
545 #ifdef _SYSCALL32_IMPL
546 		if (model == DATAMODEL_ILP32) {
547 			if ((ssize32_t)sfv_len < 0)
548 				return (EINVAL);
549 		} else
550 #endif
551 		if (sfv_len < 0)
552 			return (EINVAL);
553 
554 		/* Check for overflow */
555 #ifdef _SYSCALL32_IMPL
556 		if (model == DATAMODEL_ILP32) {
557 			if (((ssize32_t)(*count + sfv_len)) < 0)
558 				return (EINVAL);
559 		} else
560 #endif
561 		if ((*count + sfv_len) < 0)
562 			return (EINVAL);
563 
564 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
565 
566 		if (sfv->sfv_fd == SFV_FD_SELF) {
567 			while (sfv_len > 0) {
568 				if (buf_left == 0) {
569 					tmp = dmp;
570 					buf_left = MIN(total_size, maxblk);
571 					iov_len = MIN(buf_left, sfv_len);
572 					dmp = allocb(buf_left + extra, BPRI_HI);
573 					if (dmp == NULL) {
574 						freemsg(head);
575 						return (ENOMEM);
576 					}
577 					dmp->b_wptr = dmp->b_rptr =
578 					    dmp->b_rptr + wroff;
579 					tmp->b_cont = dmp;
580 				} else {
581 					iov_len = MIN(buf_left, sfv_len);
582 				}
583 
584 				aiov.iov_len = iov_len;
585 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
586 				auio.uio_loffset = *fileoff;
587 				auio.uio_iovcnt = 1;
588 				auio.uio_resid = iov_len;
589 				auio.uio_iov = &aiov;
590 				auio.uio_segflg = UIO_USERSPACE;
591 				auio.uio_llimit = curproc->p_fsz_ctl;
592 				auio.uio_fmode = fflag;
593 
594 				buf_left -= iov_len;
595 				total_size -= iov_len;
596 				sfv_len -= iov_len;
597 				sfv_off += iov_len;
598 
599 				error = uiomove((caddr_t)dmp->b_wptr,
600 				    iov_len, UIO_WRITE, &auio);
601 				if (error != 0) {
602 					freemsg(head);
603 					return (error);
604 				}
605 				dmp->b_wptr += iov_len;
606 			}
607 		} else {
608 			file_t	*ffp;
609 			vnode_t	*readvp;
610 			int	readflg = 0;
611 
612 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
613 				freemsg(head);
614 				return (EBADF);
615 			}
616 
617 			if ((ffp->f_flag & FREAD) == 0) {
618 				releasef(sfv->sfv_fd);
619 				freemsg(head);
620 				return (EACCES);
621 			}
622 
623 			readvp = ffp->f_vnode;
624 			if (readvp->v_type != VREG) {
625 				releasef(sfv->sfv_fd);
626 				freemsg(head);
627 				return (EINVAL);
628 			}
629 
630 			/*
631 			 * No point reading and writing to same vp,
632 			 * as long as both are regular files. readvp is not
633 			 * locked; but since we got it from an open file the
634 			 * contents will be valid during the time of access.
635 			 */
636 
637 			if (VN_CMP(vp, readvp)) {
638 				releasef(sfv->sfv_fd);
639 				freemsg(head);
640 				return (EINVAL);
641 			}
642 
643 			/*
644 			 * Note: we assume readvp != vp. "vp" is already
645 			 * locked, and "readvp" must not be.
646 			 */
647 
648 			(void) VOP_RWLOCK(readvp, readflg, NULL);
649 
650 			/* Same checks as in pread */
651 			if (sfv_off > maxoff) {
652 				VOP_RWUNLOCK(readvp, readflg, NULL);
653 				releasef(sfv->sfv_fd);
654 				freemsg(head);
655 				return (EINVAL);
656 			}
657 			if (sfv_off + sfv_len > maxoff) {
658 				total_size -= (sfv_off + sfv_len - maxoff);
659 				sfv_len = (ssize_t)((offset_t)maxoff -
660 				    sfv_off);
661 			}
662 
663 			while (sfv_len > 0) {
664 				if (buf_left == 0) {
665 					tmp = dmp;
666 					buf_left = MIN(total_size, maxblk);
667 					iov_len = MIN(buf_left, sfv_len);
668 					dmp = allocb(buf_left + extra, BPRI_HI);
669 					if (dmp == NULL) {
670 						VOP_RWUNLOCK(readvp, readflg,
671 									NULL);
672 						releasef(sfv->sfv_fd);
673 						freemsg(head);
674 						return (ENOMEM);
675 					}
676 					dmp->b_wptr = dmp->b_rptr =
677 					    dmp->b_rptr + wroff;
678 					tmp->b_cont = dmp;
679 				} else {
680 					iov_len = MIN(buf_left, sfv_len);
681 				}
682 				aiov.iov_base = (caddr_t)dmp->b_wptr;
683 				aiov.iov_len = iov_len;
684 				auio.uio_loffset = sfv_off;
685 				auio.uio_iov = &aiov;
686 				auio.uio_iovcnt = 1;
687 				auio.uio_resid = iov_len;
688 				auio.uio_segflg = UIO_SYSSPACE;
689 				auio.uio_llimit = MAXOFFSET_T;
690 				auio.uio_fmode = ffp->f_flag;
691 				ioflag = auio.uio_fmode &
692 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
693 
694 				/*
695 				 * If read sync is not asked for,
696 				 * filter sync flags
697 				 */
698 				if ((ioflag & FRSYNC) == 0)
699 					ioflag &= ~(FSYNC|FDSYNC);
700 				error = VOP_READ(readvp, &auio, ioflag,
701 				    fp->f_cred, NULL);
702 				if (error != 0) {
703 					/*
704 					 * If we were reading a pipe (currently
705 					 * not implemented), we may now loose
706 					 * data.
707 					 */
708 					VOP_RWUNLOCK(readvp, readflg, NULL);
709 					releasef(sfv->sfv_fd);
710 					freemsg(head);
711 					return (error);
712 				}
713 
714 				/*
715 				 * Check how much data was really read.
716 				 * Decrement the 'len' and increment the
717 				 * 'off' appropriately.
718 				 */
719 				cnt = iov_len - auio.uio_resid;
720 				if (cnt == 0) {
721 					VOP_RWUNLOCK(readvp, readflg, NULL);
722 					releasef(sfv->sfv_fd);
723 					freemsg(head);
724 					return (EINVAL);
725 				}
726 				sfv_len -= cnt;
727 				sfv_off += cnt;
728 				total_size -= cnt;
729 				buf_left -= cnt;
730 
731 				dmp->b_wptr += cnt;
732 			}
733 			VOP_RWUNLOCK(readvp, readflg, NULL);
734 			releasef(sfv->sfv_fd);
735 		}
736 		sfv++;
737 	}
738 
739 	ASSERT(total_size == 0);
740 	error = kstrwritemp(vp, head, fflag);
741 	if (error != 0) {
742 		freemsg(head);
743 		return (error);
744 	}
745 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
746 	*count += size;
747 
748 	return (0);
749 }
750 
751 
752 int
753 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
754     int copy_cnt, ssize_t *count)
755 {
756 	struct vnode *vp;
757 	struct uio auio;
758 	struct iovec aiov;
759 	ushort_t fflag;
760 	int ioflag;
761 	int i, error;
762 	size_t cnt;
763 	ssize_t sfv_len;
764 	u_offset_t sfv_off;
765 #ifdef _SYSCALL32_IMPL
766 	model_t model = get_udatamodel();
767 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
768 		MAXOFF32_T : MAXOFFSET_T;
769 #else
770 	const u_offset_t maxoff = MAXOFF32_T;
771 #endif
772 	mblk_t	*dmp = NULL;
773 	char	*buf = NULL;
774 	size_t  extra;
775 	int maxblk, wroff, tail_len;
776 	struct sonode *so;
777 	stdata_t *stp;
778 
779 	fflag = fp->f_flag;
780 	vp = fp->f_vnode;
781 
782 	if (vp->v_type == VSOCK) {
783 		so = VTOSO(vp);
784 		stp = vp->v_stream;
785 		wroff = (int)stp->sd_wroff;
786 		tail_len = (int)stp->sd_tail;
787 		maxblk = (int)stp->sd_maxblk;
788 		extra = wroff + tail_len;
789 	}
790 
791 	auio.uio_extflg = UIO_COPY_DEFAULT;
792 	for (i = 0; i < copy_cnt; i++) {
793 		if (ISSIG(curthread, JUSTLOOKING))
794 			return (EINTR);
795 
796 		/*
797 		 * Do similar checks as "write" as we are writing
798 		 * sfv_len bytes into "vp".
799 		 */
800 		sfv_len = (ssize_t)sfv->sfv_len;
801 
802 		if (sfv_len == 0) {
803 			sfv++;
804 			continue;
805 		}
806 
807 		/* Make sure sfv_len is not negative */
808 #ifdef _SYSCALL32_IMPL
809 		if (model == DATAMODEL_ILP32) {
810 			if ((ssize32_t)sfv_len < 0)
811 				return (EINVAL);
812 		} else
813 #endif
814 		if (sfv_len < 0)
815 			return (EINVAL);
816 
817 		if (vp->v_type == VREG) {
818 			if (*fileoff >= curproc->p_fsz_ctl) {
819 				mutex_enter(&curproc->p_lock);
820 				(void) rctl_action(
821 				    rctlproc_legacy[RLIMIT_FSIZE],
822 				    curproc->p_rctls, curproc, RCA_SAFE);
823 				mutex_exit(&curproc->p_lock);
824 
825 				return (EFBIG);
826 			}
827 
828 			if (*fileoff >= maxoff)
829 				return (EFBIG);
830 
831 			if (*fileoff + sfv_len > maxoff)
832 				return (EINVAL);
833 		}
834 
835 		/* Check for overflow */
836 #ifdef _SYSCALL32_IMPL
837 		if (model == DATAMODEL_ILP32) {
838 			if (((ssize32_t)(*count + sfv_len)) < 0)
839 				return (EINVAL);
840 		} else
841 #endif
842 		if ((*count + sfv_len) < 0)
843 			return (EINVAL);
844 
845 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
846 
847 		if (sfv->sfv_fd == SFV_FD_SELF) {
848 			aiov.iov_len = sfv_len;
849 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
850 			auio.uio_loffset = *fileoff;
851 			auio.uio_iovcnt = 1;
852 			auio.uio_resid = sfv_len;
853 			auio.uio_iov = &aiov;
854 			auio.uio_segflg = UIO_USERSPACE;
855 			auio.uio_llimit = curproc->p_fsz_ctl;
856 			auio.uio_fmode = fflag;
857 
858 			if (vp->v_type == VSOCK) {
859 
860 				/*
861 				 * Optimize for the socket case
862 				 */
863 
864 				dmp = allocb(sfv_len + extra, BPRI_HI);
865 				if (dmp == NULL)
866 					return (ENOMEM);
867 				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
868 				error = uiomove((caddr_t)dmp->b_wptr,
869 				    sfv_len, UIO_WRITE, &auio);
870 				if (error != 0) {
871 					freeb(dmp);
872 					return (error);
873 				}
874 				dmp->b_wptr += sfv_len;
875 				error = kstrwritemp(vp, dmp, fflag);
876 				if (error != 0) {
877 					freeb(dmp);
878 					return (error);
879 				}
880 				ttolwp(curthread)->lwp_ru.ioch +=
881 				    (ulong_t)sfv_len;
882 				*count += sfv_len;
883 			} else {
884 				ioflag = auio.uio_fmode &
885 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
886 				while (sfv_len > 0) {
887 					error = VOP_WRITE(vp, &auio, ioflag,
888 					    fp->f_cred, NULL);
889 					cnt = sfv_len - auio.uio_resid;
890 					sfv_len -= cnt;
891 					ttolwp(curthread)->lwp_ru.ioch +=
892 					    (ulong_t)cnt;
893 					*fileoff += cnt;
894 					*count += cnt;
895 					if (error != 0)
896 						return (error);
897 				}
898 			}
899 		} else {
900 			file_t	*ffp;
901 			vnode_t	*readvp;
902 			int	readflg = 0;
903 			size_t	size;
904 			caddr_t	ptr;
905 
906 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
907 				return (EBADF);
908 
909 			if ((ffp->f_flag & FREAD) == 0) {
910 				releasef(sfv->sfv_fd);
911 				return (EBADF);
912 			}
913 
914 			readvp = ffp->f_vnode;
915 			if (readvp->v_type != VREG) {
916 				releasef(sfv->sfv_fd);
917 				return (EINVAL);
918 			}
919 
920 			/*
921 			 * No point reading and writing to same vp,
922 			 * as long as both are regular files. readvp is not
923 			 * locked; but since we got it from an open file the
924 			 * contents will be valid during the time of access.
925 			 */
926 			if (VN_CMP(vp, readvp)) {
927 				releasef(sfv->sfv_fd);
928 				return (EINVAL);
929 			}
930 
931 			/*
932 			 * Note: we assume readvp != vp. "vp" is already
933 			 * locked, and "readvp" must not be.
934 			 */
935 			(void) VOP_RWLOCK(readvp, readflg, NULL);
936 
937 			/* Same checks as in pread */
938 			if (sfv_off > maxoff) {
939 				VOP_RWUNLOCK(readvp, readflg, NULL);
940 				releasef(sfv->sfv_fd);
941 				return (EINVAL);
942 			}
943 			if (sfv_off + sfv_len > maxoff) {
944 				sfv_len = (ssize_t)((offset_t)maxoff -
945 				    sfv_off);
946 			}
947 			/* Find the native blocksize to transfer data */
948 			size = MIN(vp->v_vfsp->vfs_bsize,
949 			    readvp->v_vfsp->vfs_bsize);
950 			size = sfv_len < size ? sfv_len : size;
951 
952 			if (vp->v_type != VSOCK) {
953 				buf = kmem_alloc(size, KM_NOSLEEP);
954 				if (buf == NULL) {
955 					VOP_RWUNLOCK(readvp, readflg, NULL);
956 					releasef(sfv->sfv_fd);
957 					return (ENOMEM);
958 				}
959 			} else {
960 				/*
961 				 * For sockets acting as an SSL proxy, we
962 				 * need to adjust the size to the maximum
963 				 * SSL record size set in the stream head.
964 				 */
965 				if (so->so_kssl_ctx != NULL)
966 					size = MIN(size, maxblk);
967 			}
968 
969 			while (sfv_len > 0) {
970 				size_t	iov_len;
971 
972 				iov_len = MIN(size, sfv_len);
973 
974 				if (vp->v_type == VSOCK) {
975 					dmp = allocb(iov_len + extra, BPRI_HI);
976 					if (dmp == NULL) {
977 						VOP_RWUNLOCK(readvp, readflg,
978 						    NULL);
979 						releasef(sfv->sfv_fd);
980 						return (ENOMEM);
981 					}
982 					dmp->b_wptr = dmp->b_rptr =
983 					    dmp->b_rptr + wroff;
984 					ptr = (caddr_t)dmp->b_rptr;
985 				} else {
986 					ptr = buf;
987 				}
988 
989 				aiov.iov_base = ptr;
990 				aiov.iov_len = iov_len;
991 				auio.uio_loffset = sfv_off;
992 				auio.uio_iov = &aiov;
993 				auio.uio_iovcnt = 1;
994 				auio.uio_resid = iov_len;
995 				auio.uio_segflg = UIO_SYSSPACE;
996 				auio.uio_llimit = MAXOFFSET_T;
997 				auio.uio_fmode = ffp->f_flag;
998 				ioflag = auio.uio_fmode &
999 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1000 
1001 				/*
1002 				 * If read sync is not asked for,
1003 				 * filter sync flags
1004 				 */
1005 				if ((ioflag & FRSYNC) == 0)
1006 					ioflag &= ~(FSYNC|FDSYNC);
1007 				error = VOP_READ(readvp, &auio, ioflag,
1008 				    fp->f_cred, NULL);
1009 				if (error != 0) {
1010 					/*
1011 					 * If we were reading a pipe (currently
1012 					 * not implemented), we may now lose
1013 					 * data.
1014 					 */
1015 					if (vp->v_type == VSOCK)
1016 						freeb(dmp);
1017 					else
1018 						kmem_free(buf, size);
1019 					VOP_RWUNLOCK(readvp, readflg, NULL);
1020 					releasef(sfv->sfv_fd);
1021 					return (error);
1022 				}
1023 
1024 				/*
1025 				 * Check how much data was really read.
1026 				 * Decrement the 'len' and increment the
1027 				 * 'off' appropriately.
1028 				 */
1029 				cnt = iov_len - auio.uio_resid;
1030 				if (cnt == 0) {
1031 					if (vp->v_type == VSOCK)
1032 						freeb(dmp);
1033 					else
1034 						kmem_free(buf, size);
1035 					VOP_RWUNLOCK(readvp, readflg, NULL);
1036 					releasef(sfv->sfv_fd);
1037 					return (EINVAL);
1038 				}
1039 				sfv_len -= cnt;
1040 				sfv_off += cnt;
1041 
1042 				if (vp->v_type == VSOCK) {
1043 					dmp->b_wptr = dmp->b_rptr + cnt;
1044 
1045 					error = kstrwritemp(vp, dmp, fflag);
1046 					if (error != 0) {
1047 						freeb(dmp);
1048 						VOP_RWUNLOCK(readvp, readflg,
1049 									NULL);
1050 						releasef(sfv->sfv_fd);
1051 						return (error);
1052 					}
1053 
1054 					ttolwp(curthread)->lwp_ru.ioch +=
1055 					    (ulong_t)cnt;
1056 					*count += cnt;
1057 				} else {
1058 
1059 					aiov.iov_base = ptr;
1060 					aiov.iov_len = cnt;
1061 					auio.uio_loffset = *fileoff;
1062 					auio.uio_resid = cnt;
1063 					auio.uio_segflg = UIO_SYSSPACE;
1064 					auio.uio_llimit = curproc->p_fsz_ctl;
1065 					auio.uio_fmode = fflag;
1066 					ioflag = auio.uio_fmode &
1067 					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1068 					error = VOP_WRITE(vp, &auio, ioflag,
1069 					    fp->f_cred, NULL);
1070 
1071 					/*
1072 					 * Check how much data was written.
1073 					 * Increment the 'len' and decrement the
1074 					 * 'off' if all the data was not
1075 					 * written.
1076 					 */
1077 					cnt -= auio.uio_resid;
1078 					sfv_len += auio.uio_resid;
1079 					sfv_off -= auio.uio_resid;
1080 					ttolwp(curthread)->lwp_ru.ioch +=
1081 					    (ulong_t)cnt;
1082 					*fileoff += cnt;
1083 					*count += cnt;
1084 					if (error != 0) {
1085 						kmem_free(buf, size);
1086 						VOP_RWUNLOCK(readvp, readflg,
1087 									NULL);
1088 						releasef(sfv->sfv_fd);
1089 						return (error);
1090 					}
1091 				}
1092 			}
1093 			if (buf) {
1094 				kmem_free(buf, size);
1095 				buf = NULL;
1096 			}
1097 			VOP_RWUNLOCK(readvp, readflg, NULL);
1098 			releasef(sfv->sfv_fd);
1099 		}
1100 		sfv++;
1101 	}
1102 	return (0);
1103 }
1104 
1105 ssize_t
1106 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1107     size_t *xferred)
1108 {
1109 	int error;
1110 	file_t *fp;
1111 	struct vnode *vp;
1112 	struct sonode *so;
1113 	u_offset_t fileoff;
1114 	int copy_cnt;
1115 	const struct sendfilevec *copy_vec;
1116 	struct sendfilevec sfv[SEND_MAX_CHUNK];
1117 	ssize_t count = 0;
1118 #ifdef _SYSCALL32_IMPL
1119 	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1120 #endif
1121 	ssize_t total_size;
1122 	int i;
1123 	boolean_t is_sock = B_FALSE;
1124 	int maxblk = 0;
1125 
1126 	if (sfvcnt <= 0)
1127 		return (set_errno(EINVAL));
1128 
1129 	if ((fp = getf(fildes)) == NULL)
1130 		return (set_errno(EBADF));
1131 
1132 	if (((fp->f_flag) & FWRITE) == 0) {
1133 		error = EBADF;
1134 		goto err;
1135 	}
1136 
1137 	fileoff = fp->f_offset;
1138 	vp = fp->f_vnode;
1139 
1140 	switch (vp->v_type) {
1141 	case VSOCK:
1142 		so = VTOSO(vp);
1143 		/* sendfile not supported for SCTP */
1144 		if (so->so_protocol == IPPROTO_SCTP) {
1145 			error = EPROTONOSUPPORT;
1146 			goto err;
1147 		}
1148 		is_sock = B_TRUE;
1149 		switch (so->so_family) {
1150 		case AF_INET:
1151 		case AF_INET6:
1152 			/*
1153 			 * Make similar checks done in SOP_WRITE().
1154 			 */
1155 			if (so->so_state & SS_CANTSENDMORE) {
1156 				tsignal(curthread, SIGPIPE);
1157 				error = EPIPE;
1158 				goto err;
1159 			}
1160 			if (so->so_type != SOCK_STREAM) {
1161 				error = EOPNOTSUPP;
1162 				goto err;
1163 			}
1164 
1165 			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
1166 			    (SS_ISCONNECTED|SS_ISBOUND)) {
1167 				error = ENOTCONN;
1168 				goto err;
1169 			}
1170 
1171 			if ((so->so_state & SS_DIRECT) &&
1172 			    (so->so_priv != NULL) &&
1173 			    (so->so_kssl_ctx == NULL)) {
1174 				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
1175 			} else {
1176 				maxblk = (int)vp->v_stream->sd_maxblk;
1177 			}
1178 			break;
1179 		default:
1180 			error = EAFNOSUPPORT;
1181 			goto err;
1182 		}
1183 		break;
1184 	case VREG:
1185 		break;
1186 	default:
1187 		error = EINVAL;
1188 		goto err;
1189 	}
1190 
1191 	switch (opcode) {
1192 	case SENDFILEV :
1193 		break;
1194 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1195 	case SENDFILEV64 :
1196 		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1197 		    (size32_t *)xferred, fildes));
1198 #endif
1199 	default :
1200 		error = ENOSYS;
1201 		break;
1202 	}
1203 
1204 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1205 	copy_vec = vec;
1206 
1207 	do {
1208 		total_size = 0;
1209 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1210 #ifdef _SYSCALL32_IMPL
1211 		/* 32-bit callers need to have their iovec expanded. */
1212 		if (get_udatamodel() == DATAMODEL_ILP32) {
1213 			if (copyin(copy_vec, sfv32,
1214 			    copy_cnt * sizeof (ksendfilevec32_t))) {
1215 				error = EFAULT;
1216 				break;
1217 			}
1218 
1219 			for (i = 0; i < copy_cnt; i++) {
1220 				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1221 				sfv[i].sfv_off =
1222 					(off_t)(uint32_t)sfv32[i].sfv_off;
1223 				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1224 				total_size += sfv[i].sfv_len;
1225 				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1226 			}
1227 		} else {
1228 #endif
1229 			if (copyin(copy_vec, sfv,
1230 			    copy_cnt * sizeof (sendfilevec_t))) {
1231 				error = EFAULT;
1232 				break;
1233 			}
1234 
1235 			for (i = 0; i < copy_cnt; i++) {
1236 				total_size += sfv[i].sfv_len;
1237 			}
1238 #ifdef _SYSCALL32_IMPL
1239 		}
1240 #endif
1241 
1242 		/*
1243 		 * The task between deciding to use sendvec_small_chunk
1244 		 * and sendvec_chunk is dependant on multiple things:
1245 		 *
1246 		 * i) latency is important for smaller files. So if the
1247 		 * data is smaller than 'tcp_slow_start_initial' times
1248 		 * maxblk, then use sendvec_small_chunk which creates
1249 		 * maxblk size mblks and chains then together and sends
1250 		 * them to TCP in one shot. It also leaves 'wroff' size
1251 		 * space for the headers in each mblk.
1252 		 *
1253 		 * ii) for total size bigger than 'tcp_slow_start_initial'
1254 		 * time maxblk, its probably real file data which is
1255 		 * dominating. So its better to use sendvec_chunk because
1256 		 * performance goes to dog if we don't do pagesize reads.
1257 		 * sendvec_chunk will do pagesize reads and write them
1258 		 * in pagesize mblks to TCP.
1259 		 *
1260 		 * Side Notes: A write to file has not been optimized.
1261 		 * Future zero copy code will plugin into sendvec_chunk
1262 		 * only because doing zero copy for files smaller then
1263 		 * pagesize is useless.
1264 		 *
1265 		 * Note, if socket has NL7C enabled then call NL7C's
1266 		 * senfilev() function to consume the sfv[].
1267 		 */
1268 		if (is_sock) {
1269 			switch (so->so_family) {
1270 			case AF_INET:
1271 			case AF_INET6:
1272 				if (so->so_nl7c_flags != 0)
1273 					error = nl7c_sendfilev(so, &fileoff,
1274 					    sfv, copy_cnt, &count);
1275 				else if (total_size <= (4 * maxblk))
1276 					error = sendvec_small_chunk(fp,
1277 					    &fileoff, sfv, copy_cnt,
1278 					    total_size, maxblk, &count);
1279 				else
1280 					error = sendvec_chunk(fp, &fileoff,
1281 					    sfv, copy_cnt, &count);
1282 				break;
1283 			}
1284 		} else {
1285 			ASSERT(vp->v_type == VREG);
1286 			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1287 			    &count);
1288 		}
1289 
1290 
1291 #ifdef _SYSCALL32_IMPL
1292 	if (get_udatamodel() == DATAMODEL_ILP32)
1293 		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1294 		    (copy_cnt * sizeof (ksendfilevec32_t)));
1295 	else
1296 #endif
1297 		copy_vec += copy_cnt;
1298 		sfvcnt -= copy_cnt;
1299 	} while (sfvcnt > 0);
1300 
1301 	if (vp->v_type == VREG)
1302 		fp->f_offset += count;
1303 
1304 
1305 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1306 
1307 #ifdef _SYSCALL32_IMPL
1308 	if (get_udatamodel() == DATAMODEL_ILP32) {
1309 		ssize32_t count32 = (ssize32_t)count;
1310 		if (copyout(&count32, xferred, sizeof (count32)))
1311 			error = EFAULT;
1312 		releasef(fildes);
1313 		if (error != 0)
1314 			return (set_errno(error));
1315 		return (count32);
1316 	}
1317 #endif
1318 	if (copyout(&count, xferred, sizeof (count)))
1319 		error = EFAULT;
1320 	releasef(fildes);
1321 	if (error != 0)
1322 		return (set_errno(error));
1323 	return (count);
1324 err:
1325 	ASSERT(error != 0);
1326 	releasef(fildes);
1327 	return (set_errno(error));
1328 }
1329