xref: /titanic_52/usr/src/uts/common/syscall/sendfile.c (revision 5c88ba20fc79ecf19255b4a04f03d77630b6d0e7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vmsystm.h>
54 
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <netinet/in.h>
58 #include <sys/sendfile.h>
59 #include <sys/un.h>
60 #include <inet/nca/ncadoorhdr.h>
61 #include <inet/nca/ncaio.h>
62 #include <sys/tihdr.h>
63 #include <sys/atomic.h>
64 
65 #include <inet/common.h>
66 #include <inet/ip.h>
67 #include <inet/ip6.h>
68 #include <inet/tcp.h>
69 
70 extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *);
71 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
72 		ssize32_t *);
73 extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
74 		int);
75 
76 /*
77  * kstrwritemp() has very similar semantics as that of strwrite().
78  * The main difference is it obtains mblks from the caller and also
79  * does not do any copy as done in strwrite() from user buffers to
80  * kernel buffers.
81  *
82  * Currently, this routine is used by sendfile to send data allocated
83  * within the kernel without any copying. This interface does not use the
84  * synchronous stream interface as synch. stream interface implies
85  * copying.
86  */
87 int
88 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
89 {
90 	struct stdata *stp;
91 	struct queue *wqp;
92 	char waitflag;
93 	int tempmode;
94 	int error = 0;
95 	int done = 0;
96 	struct sonode *so;
97 	boolean_t direct;
98 
99 	ASSERT(vp->v_stream);
100 	stp = vp->v_stream;
101 
102 	so = VTOSO(vp);
103 	direct = (so->so_state & SS_DIRECT);
104 
105 	/*
106 	 * This is the sockfs direct fast path. canputnext() need
107 	 * not be accurate so we don't grab the sd_lock here. If
108 	 * we get flow-controlled, we grab sd_lock just before the
109 	 * do..while loop below to emulate what strwrite() does.
110 	 */
111 	wqp = stp->sd_wrq;
112 	if (canputnext(wqp) && direct &&
113 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
114 		return (sostream_direct(so, NULL, mp, CRED()));
115 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
116 		/* Fast check of flags before acquiring the lock */
117 		mutex_enter(&stp->sd_lock);
118 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
119 		mutex_exit(&stp->sd_lock);
120 		if (error != 0) {
121 			if (!(stp->sd_flag & STPLEX) &&
122 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
123 				tsignal(curthread, SIGPIPE);
124 				error = EPIPE;
125 			}
126 			return (error);
127 		}
128 	}
129 
130 	waitflag = WRITEWAIT;
131 	if (stp->sd_flag & OLDNDELAY)
132 		tempmode = fmode & ~FNDELAY;
133 	else
134 		tempmode = fmode;
135 
136 	mutex_enter(&stp->sd_lock);
137 	do {
138 		if (canputnext(wqp)) {
139 			mutex_exit(&stp->sd_lock);
140 			putnext(wqp, mp);
141 			return (0);
142 		}
143 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
144 		    &done);
145 	} while (error == 0 && !done);
146 
147 	mutex_exit(&stp->sd_lock);
148 	/*
149 	 * EAGAIN tells the application to try again. ENOMEM
150 	 * is returned only if the memory allocation size
151 	 * exceeds the physical limits of the system. ENOMEM
152 	 * can't be true here.
153 	 */
154 	if (error == ENOMEM)
155 		error = EAGAIN;
156 	return (error);
157 }
158 
159 #define	SEND_MAX_CHUNK	16
160 
161 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
162 /*
163  * 64 bit offsets for 32 bit applications only running either on
164  * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
165  * more than 2GB of data.
166  */
167 int
168 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
169     int copy_cnt, ssize32_t *count)
170 {
171 	struct vnode *vp;
172 	ushort_t fflag;
173 	int ioflag;
174 	size32_t cnt;
175 	ssize32_t sfv_len;
176 	ssize32_t tmpcount;
177 	u_offset_t sfv_off;
178 	struct uio auio;
179 	struct iovec aiov;
180 	int i, error;
181 
182 	fflag = fp->f_flag;
183 	vp = fp->f_vnode;
184 	for (i = 0; i < copy_cnt; i++) {
185 
186 		if (ISSIG(curthread, JUSTLOOKING))
187 			return (EINTR);
188 
189 		/*
190 		 * Do similar checks as "write" as we are writing
191 		 * sfv_len bytes into "vp".
192 		 */
193 		sfv_len = (ssize32_t)sfv->sfv_len;
194 
195 		if (sfv_len == 0)
196 			continue;
197 
198 		if (sfv_len < 0)
199 			return (EINVAL);
200 
201 		if (vp->v_type == VREG) {
202 			if (*fileoff >= curproc->p_fsz_ctl) {
203 				mutex_enter(&curproc->p_lock);
204 				(void) rctl_action(
205 				    rctlproc_legacy[RLIMIT_FSIZE],
206 				    curproc->p_rctls, curproc, RCA_SAFE);
207 				mutex_exit(&curproc->p_lock);
208 				return (EFBIG);
209 			}
210 
211 			if (*fileoff >= OFFSET_MAX(fp))
212 				return (EFBIG);
213 
214 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
215 				return (EINVAL);
216 		}
217 
218 		tmpcount = *count + sfv_len;
219 		if (tmpcount < 0)
220 			return (EINVAL);
221 
222 		sfv_off = sfv->sfv_off;
223 
224 		auio.uio_extflg = UIO_COPY_DEFAULT;
225 		if (sfv->sfv_fd == SFV_FD_SELF) {
226 			aiov.iov_len = sfv_len;
227 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
228 			auio.uio_loffset = *fileoff;
229 			auio.uio_iovcnt = 1;
230 			auio.uio_resid = sfv_len;
231 			auio.uio_iov = &aiov;
232 			auio.uio_segflg = UIO_USERSPACE;
233 			auio.uio_llimit = curproc->p_fsz_ctl;
234 			auio.uio_fmode = fflag;
235 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
236 			while (sfv_len > 0) {
237 				error = VOP_WRITE(vp, &auio, ioflag,
238 				    fp->f_cred, NULL);
239 				cnt = sfv_len - auio.uio_resid;
240 				sfv_len -= cnt;
241 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
242 				if (vp->v_type == VREG)
243 					*fileoff += cnt;
244 				*count += cnt;
245 				if (error != 0)
246 					return (error);
247 			}
248 		} else {
249 			file_t	*ffp;
250 			vnode_t	*readvp;
251 			int	readflg = 0;
252 			size_t	size;
253 			caddr_t	ptr;
254 
255 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
256 				return (EBADF);
257 
258 			if ((ffp->f_flag & FREAD) == 0) {
259 				releasef(sfv->sfv_fd);
260 				return (EBADF);
261 			}
262 
263 			readvp = ffp->f_vnode;
264 			if (readvp->v_type != VREG) {
265 				releasef(sfv->sfv_fd);
266 				return (EINVAL);
267 			}
268 
269 			/*
270 			 * No point reading and writing to same vp,
271 			 * as long as both are regular files. readvp is not
272 			 * locked; but since we got it from an open file the
273 			 * contents will be valid during the time of access.
274 			 */
275 			if (VN_CMP(vp, readvp)) {
276 				releasef(sfv->sfv_fd);
277 				return (EINVAL);
278 			}
279 
280 			/*
281 			 * Note: we assume readvp != vp. "vp" is already
282 			 * locked, and "readvp" must not be.
283 			 */
284 			(void) VOP_RWLOCK(readvp, readflg, NULL);
285 
286 			/*
287 			 * Same checks as in pread64.
288 			 */
289 			if (sfv_off > MAXOFFSET_T) {
290 				VOP_RWUNLOCK(readvp, readflg, NULL);
291 				releasef(sfv->sfv_fd);
292 				return (EINVAL);
293 			}
294 
295 			if (sfv_off + sfv_len > MAXOFFSET_T)
296 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
297 
298 			/* Find the native blocksize to transfer data */
299 			size = MIN(vp->v_vfsp->vfs_bsize,
300 			    readvp->v_vfsp->vfs_bsize);
301 			size = sfv_len < size ? sfv_len : size;
302 			ptr = kmem_alloc(size, KM_SLEEP);
303 
304 			while (sfv_len > 0) {
305 				size_t	iov_len;
306 
307 				iov_len = MIN(size, sfv_len);
308 				aiov.iov_base = ptr;
309 				aiov.iov_len = iov_len;
310 				auio.uio_loffset = sfv_off;
311 				auio.uio_iov = &aiov;
312 				auio.uio_iovcnt = 1;
313 				auio.uio_resid = iov_len;
314 				auio.uio_segflg = UIO_SYSSPACE;
315 				auio.uio_llimit = MAXOFFSET_T;
316 				auio.uio_fmode = ffp->f_flag;
317 				ioflag = auio.uio_fmode &
318 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
319 
320 				/*
321 				 * If read sync is not asked for,
322 				 * filter sync flags
323 				 */
324 				if ((ioflag & FRSYNC) == 0)
325 					ioflag &= ~(FSYNC|FDSYNC);
326 				error = VOP_READ(readvp, &auio, ioflag,
327 				    fp->f_cred, NULL);
328 				if (error) {
329 					kmem_free(ptr, size);
330 					VOP_RWUNLOCK(readvp, readflg, NULL);
331 					releasef(sfv->sfv_fd);
332 					return (error);
333 				}
334 
335 				/*
336 				 * Check how must data was really read.
337 				 * Decrement the 'len' and increment the
338 				 * 'off' appropriately.
339 				 */
340 				cnt = iov_len - auio.uio_resid;
341 				if (cnt == 0) {
342 					/*
343 					 * If we were reading a pipe (currently
344 					 * not implemented), we may now lose
345 					 * data.
346 					 */
347 					kmem_free(ptr, size);
348 					VOP_RWUNLOCK(readvp, readflg, NULL);
349 					releasef(sfv->sfv_fd);
350 					return (EINVAL);
351 				}
352 				sfv_len -= cnt;
353 				sfv_off += cnt;
354 
355 				aiov.iov_base = ptr;
356 				aiov.iov_len = cnt;
357 				auio.uio_loffset = *fileoff;
358 				auio.uio_resid = cnt;
359 				auio.uio_segflg = UIO_SYSSPACE;
360 				auio.uio_llimit = curproc->p_fsz_ctl;
361 				auio.uio_fmode = fflag;
362 				ioflag = auio.uio_fmode &
363 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
364 				error = VOP_WRITE(vp, &auio, ioflag,
365 				    fp->f_cred, NULL);
366 
367 				/*
368 				 * Check how much data was written. Increment
369 				 * the 'len' and decrement the 'off' if all
370 				 * the data was not written.
371 				 */
372 				cnt -= auio.uio_resid;
373 				sfv_len += auio.uio_resid;
374 				sfv_off -= auio.uio_resid;
375 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
376 				if (vp->v_type == VREG)
377 					*fileoff += cnt;
378 				*count += cnt;
379 				if (error != 0) {
380 					kmem_free(ptr, size);
381 					VOP_RWUNLOCK(readvp, readflg, NULL);
382 					releasef(sfv->sfv_fd);
383 					return (error);
384 				}
385 			}
386 			VOP_RWUNLOCK(readvp, readflg, NULL);
387 			releasef(sfv->sfv_fd);
388 			kmem_free(ptr, size);
389 		}
390 		sfv++;
391 	}
392 	return (0);
393 }
394 
395 ssize32_t
396 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
397 	size32_t *xferred, int fildes)
398 {
399 	int			rwflag;
400 	u_offset_t		fileoff;
401 	int			copy_cnt;
402 	const struct ksendfilevec64 *copy_vec;
403 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
404 	struct vnode *vp;
405 	int error;
406 	ssize32_t count = 0;
407 	int osfvcnt;
408 
409 	rwflag = 1;
410 	vp = fp->f_vnode;
411 	(void) VOP_RWLOCK(vp, rwflag, NULL);
412 
413 	copy_vec = vec;
414 	fileoff = fp->f_offset;
415 	osfvcnt = sfvcnt;
416 
417 	do {
418 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
419 		if (copyin(copy_vec, sfv, copy_cnt *
420 		    sizeof (struct ksendfilevec64))) {
421 			error = EFAULT;
422 			break;
423 		}
424 
425 		/*
426 		 * Optimize the single regular file over
427 		 * the socket case.
428 		 */
429 		if (vp->v_type == VSOCK && osfvcnt == 1 &&
430 		    sfv->sfv_fd != SFV_FD_SELF) {
431 			file_t *rfp;
432 			vnode_t *rvp;
433 
434 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
435 				error = EBADF;
436 				break;
437 			}
438 			if ((rfp->f_flag & FREAD) == 0) {
439 				releasef(sfv->sfv_fd);
440 				error = EBADF;
441 				break;
442 			}
443 			rvp = rfp->f_vnode;
444 			if (rvp->v_type == VREG) {
445 				error = sosendfile64(fp, rfp, sfv, &count);
446 				break;
447 			}
448 			releasef(sfv->sfv_fd);
449 		}
450 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
451 		if (error != 0)
452 			break;
453 
454 		copy_vec += copy_cnt;
455 		sfvcnt -= copy_cnt;
456 	} while (sfvcnt > 0);
457 
458 	if (vp->v_type == VREG)
459 		fp->f_offset += count;
460 
461 	VOP_RWUNLOCK(vp, rwflag, NULL);
462 	if (copyout(&count, xferred, sizeof (count)))
463 		error = EFAULT;
464 	releasef(fildes);
465 	if (error != 0)
466 		return (set_errno(error));
467 	return (count);
468 }
469 #endif
470 
471 int
472 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
473     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
474 {
475 	struct vnode *vp;
476 	struct uio auio;
477 	struct iovec aiov;
478 	ushort_t fflag;
479 	int ioflag;
480 	int i, error;
481 	size_t cnt;
482 	ssize_t sfv_len;
483 	u_offset_t sfv_off;
484 #ifdef _SYSCALL32_IMPL
485 	model_t model = get_udatamodel();
486 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
487 		MAXOFF32_T : MAXOFFSET_T;
488 #else
489 	const u_offset_t maxoff = MAXOFF32_T;
490 #endif
491 	mblk_t *dmp = NULL;
492 	int wroff;
493 	int buf_left = 0;
494 	size_t	iov_len;
495 	mblk_t  *head, *tmp;
496 	size_t  size = total_size;
497 
498 	fflag = fp->f_flag;
499 	vp = fp->f_vnode;
500 
501 	ASSERT(vp->v_type == VSOCK);
502 	ASSERT(maxblk > 0);
503 
504 	wroff = (int)vp->v_stream->sd_wroff;
505 	buf_left = MIN(total_size, maxblk);
506 	head = dmp = allocb(buf_left + wroff, BPRI_HI);
507 	if (head == NULL)
508 		return (ENOMEM);
509 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
510 
511 	auio.uio_extflg = UIO_COPY_DEFAULT;
512 	for (i = 0; i < copy_cnt; i++) {
513 		if (ISSIG(curthread, JUSTLOOKING))
514 			return (EINTR);
515 
516 		/*
517 		 * Do similar checks as "write" as we are writing
518 		 * sfv_len bytes into "vp".
519 		 */
520 		sfv_len = (ssize_t)sfv->sfv_len;
521 
522 		if (sfv_len == 0) {
523 			sfv++;
524 			continue;
525 		}
526 
527 		/* Make sure sfv_len is not negative */
528 #ifdef _SYSCALL32_IMPL
529 		if (model == DATAMODEL_ILP32) {
530 			if ((ssize32_t)sfv_len < 0)
531 				return (EINVAL);
532 		} else
533 #endif
534 		if (sfv_len < 0)
535 			return (EINVAL);
536 
537 		/* Check for overflow */
538 #ifdef _SYSCALL32_IMPL
539 		if (model == DATAMODEL_ILP32) {
540 			if (((ssize32_t)(*count + sfv_len)) < 0)
541 				return (EINVAL);
542 		} else
543 #endif
544 		if ((*count + sfv_len) < 0)
545 			return (EINVAL);
546 
547 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
548 
549 		if (sfv->sfv_fd == SFV_FD_SELF) {
550 			while (sfv_len > 0) {
551 				if (buf_left == 0) {
552 					tmp = dmp;
553 					buf_left = MIN(total_size, maxblk);
554 					iov_len = MIN(buf_left, sfv_len);
555 					dmp = allocb(buf_left + wroff, BPRI_HI);
556 					if (dmp == NULL) {
557 						freemsg(head);
558 						return (ENOMEM);
559 					}
560 					dmp->b_wptr = dmp->b_rptr =
561 					    dmp->b_rptr + wroff;
562 					tmp->b_cont = dmp;
563 				} else {
564 					iov_len = MIN(buf_left, sfv_len);
565 				}
566 
567 				aiov.iov_len = iov_len;
568 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
569 				auio.uio_loffset = *fileoff;
570 				auio.uio_iovcnt = 1;
571 				auio.uio_resid = iov_len;
572 				auio.uio_iov = &aiov;
573 				auio.uio_segflg = UIO_USERSPACE;
574 				auio.uio_llimit = curproc->p_fsz_ctl;
575 				auio.uio_fmode = fflag;
576 
577 				buf_left -= iov_len;
578 				total_size -= iov_len;
579 				sfv_len -= iov_len;
580 				sfv_off += iov_len;
581 
582 				error = uiomove((caddr_t)dmp->b_wptr,
583 				    iov_len, UIO_WRITE, &auio);
584 				if (error != 0) {
585 					freemsg(head);
586 					return (error);
587 				}
588 				dmp->b_wptr += iov_len;
589 			}
590 		} else {
591 			file_t	*ffp;
592 			vnode_t	*readvp;
593 			int	readflg = 0;
594 
595 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
596 				freemsg(head);
597 				return (EBADF);
598 			}
599 
600 			if ((ffp->f_flag & FREAD) == 0) {
601 				releasef(sfv->sfv_fd);
602 				freemsg(head);
603 				return (EACCES);
604 			}
605 
606 			readvp = ffp->f_vnode;
607 			if (readvp->v_type != VREG) {
608 				releasef(sfv->sfv_fd);
609 				freemsg(head);
610 				return (EINVAL);
611 			}
612 
613 			/*
614 			 * No point reading and writing to same vp,
615 			 * as long as both are regular files. readvp is not
616 			 * locked; but since we got it from an open file the
617 			 * contents will be valid during the time of access.
618 			 */
619 
620 			if (VN_CMP(vp, readvp)) {
621 				releasef(sfv->sfv_fd);
622 				freemsg(head);
623 				return (EINVAL);
624 			}
625 
626 			/*
627 			 * Note: we assume readvp != vp. "vp" is already
628 			 * locked, and "readvp" must not be.
629 			 */
630 
631 			(void) VOP_RWLOCK(readvp, readflg, NULL);
632 
633 			/* Same checks as in pread */
634 			if (sfv_off > maxoff) {
635 				VOP_RWUNLOCK(readvp, readflg, NULL);
636 				releasef(sfv->sfv_fd);
637 				freemsg(head);
638 				return (EINVAL);
639 			}
640 			if (sfv_off + sfv_len > maxoff) {
641 				sfv_len = (ssize_t)((offset_t)maxoff -
642 				    sfv_off);
643 			}
644 
645 			while (sfv_len > 0) {
646 				if (buf_left == 0) {
647 					tmp = dmp;
648 					buf_left = MIN(total_size, maxblk);
649 					iov_len = MIN(buf_left, sfv_len);
650 					dmp = allocb(buf_left + wroff, BPRI_HI);
651 					if (dmp == NULL) {
652 						VOP_RWUNLOCK(readvp, readflg,
653 									NULL);
654 						releasef(sfv->sfv_fd);
655 						freemsg(head);
656 						return (ENOMEM);
657 					}
658 					dmp->b_wptr = dmp->b_rptr =
659 					    dmp->b_rptr + wroff;
660 					tmp->b_cont = dmp;
661 				} else {
662 					iov_len = MIN(buf_left, sfv_len);
663 				}
664 				aiov.iov_base = (caddr_t)dmp->b_wptr;
665 				aiov.iov_len = iov_len;
666 				auio.uio_loffset = sfv_off;
667 				auio.uio_iov = &aiov;
668 				auio.uio_iovcnt = 1;
669 				auio.uio_resid = iov_len;
670 				auio.uio_segflg = UIO_SYSSPACE;
671 				auio.uio_llimit = MAXOFFSET_T;
672 				auio.uio_fmode = ffp->f_flag;
673 				ioflag = auio.uio_fmode &
674 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
675 
676 				/*
677 				 * If read sync is not asked for,
678 				 * filter sync flags
679 				 */
680 				if ((ioflag & FRSYNC) == 0)
681 					ioflag &= ~(FSYNC|FDSYNC);
682 				error = VOP_READ(readvp, &auio, ioflag,
683 				    fp->f_cred, NULL);
684 				if (error != 0) {
685 					/*
686 					 * If we were reading a pipe (currently
687 					 * not implemented), we may now loose
688 					 * data.
689 					 */
690 					VOP_RWUNLOCK(readvp, readflg, NULL);
691 					releasef(sfv->sfv_fd);
692 					freemsg(head);
693 					return (error);
694 				}
695 
696 				/*
697 				 * Check how much data was really read.
698 				 * Decrement the 'len' and increment the
699 				 * 'off' appropriately.
700 				 */
701 				cnt = iov_len - auio.uio_resid;
702 				if (cnt == 0) {
703 					VOP_RWUNLOCK(readvp, readflg, NULL);
704 					releasef(sfv->sfv_fd);
705 					freemsg(head);
706 					return (EINVAL);
707 				}
708 				sfv_len -= cnt;
709 				sfv_off += cnt;
710 				total_size -= cnt;
711 				buf_left -= cnt;
712 
713 				dmp->b_wptr += cnt;
714 			}
715 			VOP_RWUNLOCK(readvp, readflg, NULL);
716 			releasef(sfv->sfv_fd);
717 		}
718 		sfv++;
719 	}
720 
721 	ASSERT(total_size == 0);
722 	error = kstrwritemp(vp, head, fflag);
723 	if (error != 0) {
724 		freemsg(head);
725 		return (error);
726 	}
727 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
728 	*count += size;
729 
730 	return (0);
731 }
732 
733 
734 int
735 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
736     int copy_cnt, ssize_t *count)
737 {
738 	struct vnode *vp;
739 	struct uio auio;
740 	struct iovec aiov;
741 	ushort_t fflag;
742 	int ioflag;
743 	int i, error;
744 	size_t cnt;
745 	ssize_t sfv_len;
746 	u_offset_t sfv_off;
747 #ifdef _SYSCALL32_IMPL
748 	model_t model = get_udatamodel();
749 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
750 		MAXOFF32_T : MAXOFFSET_T;
751 #else
752 	const u_offset_t maxoff = MAXOFF32_T;
753 #endif
754 	mblk_t	*dmp = NULL;
755 	char	*buf = NULL;
756 
757 	fflag = fp->f_flag;
758 	vp = fp->f_vnode;
759 
760 	auio.uio_extflg = UIO_COPY_DEFAULT;
761 	for (i = 0; i < copy_cnt; i++) {
762 		if (ISSIG(curthread, JUSTLOOKING))
763 			return (EINTR);
764 
765 		/*
766 		 * Do similar checks as "write" as we are writing
767 		 * sfv_len bytes into "vp".
768 		 */
769 		sfv_len = (ssize_t)sfv->sfv_len;
770 
771 		if (sfv_len == 0) {
772 			sfv++;
773 			continue;
774 		}
775 
776 		/* Make sure sfv_len is not negative */
777 #ifdef _SYSCALL32_IMPL
778 		if (model == DATAMODEL_ILP32) {
779 			if ((ssize32_t)sfv_len < 0)
780 				return (EINVAL);
781 		} else
782 #endif
783 		if (sfv_len < 0)
784 			return (EINVAL);
785 
786 		if (vp->v_type == VREG) {
787 			if (*fileoff >= curproc->p_fsz_ctl) {
788 				mutex_enter(&curproc->p_lock);
789 				(void) rctl_action(
790 				    rctlproc_legacy[RLIMIT_FSIZE],
791 				    curproc->p_rctls, curproc, RCA_SAFE);
792 				mutex_exit(&curproc->p_lock);
793 
794 				return (EFBIG);
795 			}
796 
797 			if (*fileoff >= maxoff)
798 				return (EFBIG);
799 
800 			if (*fileoff + sfv_len > maxoff)
801 				return (EINVAL);
802 		}
803 
804 		/* Check for overflow */
805 #ifdef _SYSCALL32_IMPL
806 		if (model == DATAMODEL_ILP32) {
807 			if (((ssize32_t)(*count + sfv_len)) < 0)
808 				return (EINVAL);
809 		} else
810 #endif
811 		if ((*count + sfv_len) < 0)
812 			return (EINVAL);
813 
814 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
815 
816 		if (sfv->sfv_fd == SFV_FD_SELF) {
817 			aiov.iov_len = sfv_len;
818 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
819 			auio.uio_loffset = *fileoff;
820 			auio.uio_iovcnt = 1;
821 			auio.uio_resid = sfv_len;
822 			auio.uio_iov = &aiov;
823 			auio.uio_segflg = UIO_USERSPACE;
824 			auio.uio_llimit = curproc->p_fsz_ctl;
825 			auio.uio_fmode = fflag;
826 
827 			if (vp->v_type == VSOCK) {
828 
829 				/*
830 				 * Optimize for the socket case
831 				 */
832 				int wroff = (int)vp->v_stream->sd_wroff;
833 
834 				dmp = allocb(sfv_len + wroff, BPRI_HI);
835 				if (dmp == NULL)
836 					return (ENOMEM);
837 				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
838 				error = uiomove((caddr_t)dmp->b_wptr,
839 				    sfv_len, UIO_WRITE, &auio);
840 				if (error != 0) {
841 					freeb(dmp);
842 					return (error);
843 				}
844 				dmp->b_wptr += sfv_len;
845 				error = kstrwritemp(vp, dmp, fflag);
846 				if (error != 0) {
847 					freeb(dmp);
848 					return (error);
849 				}
850 				ttolwp(curthread)->lwp_ru.ioch +=
851 				    (ulong_t)sfv_len;
852 				*count += sfv_len;
853 			} else {
854 				ioflag = auio.uio_fmode &
855 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
856 				while (sfv_len > 0) {
857 					error = VOP_WRITE(vp, &auio, ioflag,
858 					    fp->f_cred, NULL);
859 					cnt = sfv_len - auio.uio_resid;
860 					sfv_len -= cnt;
861 					ttolwp(curthread)->lwp_ru.ioch +=
862 					    (ulong_t)cnt;
863 					*fileoff += cnt;
864 					*count += cnt;
865 					if (error != 0)
866 						return (error);
867 				}
868 			}
869 		} else {
870 			file_t	*ffp;
871 			vnode_t	*readvp;
872 			int	readflg = 0;
873 			size_t	size;
874 			caddr_t	ptr;
875 
876 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
877 				return (EBADF);
878 
879 			if ((ffp->f_flag & FREAD) == 0) {
880 				releasef(sfv->sfv_fd);
881 				return (EBADF);
882 			}
883 
884 			readvp = ffp->f_vnode;
885 			if (readvp->v_type != VREG) {
886 				releasef(sfv->sfv_fd);
887 				return (EINVAL);
888 			}
889 
890 			/*
891 			 * No point reading and writing to same vp,
892 			 * as long as both are regular files. readvp is not
893 			 * locked; but since we got it from an open file the
894 			 * contents will be valid during the time of access.
895 			 */
896 			if (VN_CMP(vp, readvp)) {
897 				releasef(sfv->sfv_fd);
898 				return (EINVAL);
899 			}
900 
901 			/*
902 			 * Note: we assume readvp != vp. "vp" is already
903 			 * locked, and "readvp" must not be.
904 			 */
905 			(void) VOP_RWLOCK(readvp, readflg, NULL);
906 
907 			/* Same checks as in pread */
908 			if (sfv_off > maxoff) {
909 				VOP_RWUNLOCK(readvp, readflg, NULL);
910 				releasef(sfv->sfv_fd);
911 				return (EINVAL);
912 			}
913 			if (sfv_off + sfv_len > maxoff) {
914 				sfv_len = (ssize_t)((offset_t)maxoff -
915 				    sfv_off);
916 			}
917 			/* Find the native blocksize to transfer data */
918 			size = MIN(vp->v_vfsp->vfs_bsize,
919 			    readvp->v_vfsp->vfs_bsize);
920 			size = sfv_len < size ? sfv_len : size;
921 
922 			if (vp->v_type != VSOCK) {
923 				buf = kmem_alloc(size, KM_NOSLEEP);
924 				if (buf == NULL) {
925 					VOP_RWUNLOCK(readvp, readflg, NULL);
926 					releasef(sfv->sfv_fd);
927 					return (ENOMEM);
928 				}
929 			}
930 
931 			while (sfv_len > 0) {
932 				size_t	iov_len;
933 
934 				iov_len = MIN(size, sfv_len);
935 
936 				if (vp->v_type == VSOCK) {
937 					dmp = allocb(iov_len, BPRI_HI);
938 					if (dmp == NULL) {
939 						VOP_RWUNLOCK(readvp, readflg,
940 						    NULL);
941 						releasef(sfv->sfv_fd);
942 						return (ENOMEM);
943 					}
944 					ptr = (caddr_t)dmp->b_rptr;
945 				} else {
946 					ptr = buf;
947 				}
948 
949 				aiov.iov_base = ptr;
950 				aiov.iov_len = iov_len;
951 				auio.uio_loffset = sfv_off;
952 				auio.uio_iov = &aiov;
953 				auio.uio_iovcnt = 1;
954 				auio.uio_resid = iov_len;
955 				auio.uio_segflg = UIO_SYSSPACE;
956 				auio.uio_llimit = MAXOFFSET_T;
957 				auio.uio_fmode = ffp->f_flag;
958 				ioflag = auio.uio_fmode &
959 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
960 
961 				/*
962 				 * If read sync is not asked for,
963 				 * filter sync flags
964 				 */
965 				if ((ioflag & FRSYNC) == 0)
966 					ioflag &= ~(FSYNC|FDSYNC);
967 				error = VOP_READ(readvp, &auio, ioflag,
968 				    fp->f_cred, NULL);
969 				if (error != 0) {
970 					/*
971 					 * If we were reading a pipe (currently
972 					 * not implemented), we may now lose
973 					 * data.
974 					 */
975 					if (vp->v_type == VSOCK)
976 						freeb(dmp);
977 					else
978 						kmem_free(buf, size);
979 					VOP_RWUNLOCK(readvp, readflg, NULL);
980 					releasef(sfv->sfv_fd);
981 					return (error);
982 				}
983 
984 				/*
985 				 * Check how much data was really read.
986 				 * Decrement the 'len' and increment the
987 				 * 'off' appropriately.
988 				 */
989 				cnt = iov_len - auio.uio_resid;
990 				if (cnt == 0) {
991 					if (vp->v_type == VSOCK)
992 						freeb(dmp);
993 					else
994 						kmem_free(buf, size);
995 					VOP_RWUNLOCK(readvp, readflg, NULL);
996 					releasef(sfv->sfv_fd);
997 					return (EINVAL);
998 				}
999 				sfv_len -= cnt;
1000 				sfv_off += cnt;
1001 
1002 				if (vp->v_type == VSOCK) {
1003 					dmp->b_wptr = dmp->b_rptr + cnt;
1004 
1005 					error = kstrwritemp(vp, dmp, fflag);
1006 					if (error != 0) {
1007 						freeb(dmp);
1008 						VOP_RWUNLOCK(readvp, readflg,
1009 									NULL);
1010 						releasef(sfv->sfv_fd);
1011 						return (error);
1012 					}
1013 
1014 					ttolwp(curthread)->lwp_ru.ioch +=
1015 					    (ulong_t)cnt;
1016 					*count += cnt;
1017 				} else {
1018 
1019 					aiov.iov_base = ptr;
1020 					aiov.iov_len = cnt;
1021 					auio.uio_loffset = *fileoff;
1022 					auio.uio_resid = cnt;
1023 					auio.uio_segflg = UIO_SYSSPACE;
1024 					auio.uio_llimit = curproc->p_fsz_ctl;
1025 					auio.uio_fmode = fflag;
1026 					ioflag = auio.uio_fmode &
1027 					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1028 					error = VOP_WRITE(vp, &auio, ioflag,
1029 					    fp->f_cred, NULL);
1030 
1031 					/*
1032 					 * Check how much data was written.
1033 					 * Increment the 'len' and decrement the
1034 					 * 'off' if all the data was not
1035 					 * written.
1036 					 */
1037 					cnt -= auio.uio_resid;
1038 					sfv_len += auio.uio_resid;
1039 					sfv_off -= auio.uio_resid;
1040 					ttolwp(curthread)->lwp_ru.ioch +=
1041 					    (ulong_t)cnt;
1042 					*fileoff += cnt;
1043 					*count += cnt;
1044 					if (error != 0) {
1045 						VOP_RWUNLOCK(readvp, readflg,
1046 									NULL);
1047 						releasef(sfv->sfv_fd);
1048 						return (error);
1049 					}
1050 				}
1051 			}
1052 			if (buf) {
1053 				kmem_free(buf, size);
1054 				buf = NULL;
1055 			}
1056 			VOP_RWUNLOCK(readvp, readflg, NULL);
1057 			releasef(sfv->sfv_fd);
1058 		}
1059 		sfv++;
1060 	}
1061 	return (0);
1062 }
1063 
1064 ssize_t
1065 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1066     size_t *xferred)
1067 {
1068 	int error;
1069 	file_t *fp;
1070 	struct vnode *vp;
1071 	struct sonode *so;
1072 	u_offset_t fileoff;
1073 	int copy_cnt;
1074 	const struct sendfilevec *copy_vec;
1075 	struct sendfilevec sfv[SEND_MAX_CHUNK];
1076 	ssize_t count = 0;
1077 #ifdef _SYSCALL32_IMPL
1078 	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1079 #endif
1080 	ssize_t total_size = 0;
1081 	int i;
1082 	boolean_t is_sock = B_FALSE;
1083 	int maxblk = 0;
1084 
1085 	if (sfvcnt <= 0)
1086 		return (set_errno(EINVAL));
1087 
1088 	if ((fp = getf(fildes)) == NULL)
1089 		return (set_errno(EBADF));
1090 
1091 	if (((fp->f_flag) & FWRITE) == 0) {
1092 		error = EBADF;
1093 		goto err;
1094 	}
1095 
1096 	fileoff = fp->f_offset;
1097 	vp = fp->f_vnode;
1098 
1099 	switch (vp->v_type) {
1100 	case VSOCK:
1101 		so = VTOSO(vp);
1102 		/* sendfile not supported for SCTP */
1103 		if (so->so_protocol == IPPROTO_SCTP) {
1104 			error = EPROTONOSUPPORT;
1105 			goto err;
1106 		}
1107 		is_sock = B_TRUE;
1108 		switch (so->so_family) {
1109 		case AF_NCA:
1110 		case AF_INET:
1111 		case AF_INET6:
1112 			/*
1113 			 * Make similar checks done in SOP_WRITE().
1114 			 */
1115 			if (so->so_state & SS_CANTSENDMORE) {
1116 				tsignal(curthread, SIGPIPE);
1117 				error = EPIPE;
1118 				goto err;
1119 			}
1120 			if (so->so_type != SOCK_STREAM) {
1121 				error = EOPNOTSUPP;
1122 				goto err;
1123 			}
1124 
1125 			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
1126 			    (SS_ISCONNECTED|SS_ISBOUND)) {
1127 				error = ENOTCONN;
1128 				goto err;
1129 			}
1130 
1131 			if ((so->so_state & SS_DIRECT) &&
1132 			    (so->so_priv != NULL)) {
1133 				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
1134 			} else {
1135 				maxblk = (int)vp->v_stream->sd_maxblk;
1136 			}
1137 			break;
1138 		default:
1139 			error = EAFNOSUPPORT;
1140 			goto err;
1141 		}
1142 		break;
1143 	case VREG:
1144 		break;
1145 	default:
1146 		error = EINVAL;
1147 		goto err;
1148 	}
1149 
1150 	switch (opcode) {
1151 	case SENDFILEV :
1152 		break;
1153 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1154 	case SENDFILEV64 :
1155 		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1156 		    (size32_t *)xferred, fildes));
1157 #endif
1158 	default :
1159 		error = ENOSYS;
1160 		break;
1161 	}
1162 
1163 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1164 	copy_vec = vec;
1165 
1166 	do {
1167 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1168 #ifdef _SYSCALL32_IMPL
1169 		/* 32-bit callers need to have their iovec expanded. */
1170 		if (get_udatamodel() == DATAMODEL_ILP32) {
1171 			if (copyin(copy_vec, sfv32,
1172 			    copy_cnt * sizeof (ksendfilevec32_t))) {
1173 				error = EFAULT;
1174 				break;
1175 			}
1176 
1177 			for (i = 0; i < copy_cnt; i++) {
1178 				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1179 				sfv[i].sfv_off =
1180 					(off_t)(uint32_t)sfv32[i].sfv_off;
1181 				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1182 				total_size += sfv[i].sfv_len;
1183 				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1184 			}
1185 		} else {
1186 #endif
1187 			if (copyin(copy_vec, sfv,
1188 			    copy_cnt * sizeof (sendfilevec_t))) {
1189 				error = EFAULT;
1190 				break;
1191 			}
1192 
1193 			for (i = 0; i < copy_cnt; i++) {
1194 				total_size += sfv[i].sfv_len;
1195 			}
1196 #ifdef _SYSCALL32_IMPL
1197 		}
1198 #endif
1199 
1200 		/*
1201 		 * The task between deciding to use sendvec_small_chunk
1202 		 * and sendvec_chunk is dependant on multiple things:
1203 		 *
1204 		 * i) latency is important for smaller files. So if the
1205 		 * data is smaller than 'tcp_slow_start_initial' times
1206 		 * maxblk, then use sendvec_small_chunk which creates
1207 		 * maxblk size mblks and chains then together and sends
1208 		 * them to TCP in one shot. It also leaves 'wroff' size
1209 		 * space for the headers in each mblk.
1210 		 *
1211 		 * ii) for total size bigger than 'tcp_slow_start_initial'
1212 		 * time maxblk, its probably real file data which is
1213 		 * dominating. So its better to use sendvec_chunk because
1214 		 * performance goes to dog if we don't do pagesize reads.
1215 		 * sendvec_chunk will do pagesize reads and write them
1216 		 * in pagesize mblks to TCP.
1217 		 *
1218 		 * Side Notes: A write to file has not been optimized.
1219 		 * Future zero copy code will plugin into sendvec_chunk
1220 		 * only because doing zero copy for files smaller then
1221 		 * pagesize is useless.
1222 		 *
1223 		 * Note, if socket has NL7C enabled then call NL7C's
1224 		 * senfilev() function to give NL7C a chance to copy
1225 		 * the vec for caching, then continue processing as
1226 		 * normal.
1227 		 */
1228 		if (is_sock) {
1229 			switch (so->so_family) {
1230 			case AF_INET:
1231 			case AF_INET6:
1232 				if (so->so_nl7c_flags != 0) {
1233 					nl7c_sendfilev(so, fileoff,
1234 					    sfv, copy_cnt);
1235 				}
1236 				if (total_size <= (4 * maxblk))
1237 					error = sendvec_small_chunk(fp,
1238 					    &fileoff, sfv, copy_cnt,
1239 					    total_size, maxblk, &count);
1240 				else
1241 					error = sendvec_chunk(fp, &fileoff,
1242 					    sfv, copy_cnt, &count);
1243 				break;
1244 			case AF_NCA:
1245 				error = nca_sendfilev(fp, sfv, copy_cnt,
1246 				    &count);
1247 				break;
1248 			}
1249 		} else {
1250 			ASSERT(vp->v_type == VREG);
1251 			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1252 			    &count);
1253 		}
1254 
1255 
1256 #ifdef _SYSCALL32_IMPL
1257 	if (get_udatamodel() == DATAMODEL_ILP32)
1258 		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1259 		    (copy_cnt * sizeof (ksendfilevec32_t)));
1260 	else
1261 #endif
1262 		copy_vec += copy_cnt;
1263 		sfvcnt -= copy_cnt;
1264 	} while (sfvcnt > 0);
1265 
1266 	if (vp->v_type == VREG)
1267 		fp->f_offset += count;
1268 
1269 
1270 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1271 
1272 #ifdef _SYSCALL32_IMPL
1273 	if (get_udatamodel() == DATAMODEL_ILP32) {
1274 		ssize32_t count32 = (ssize32_t)count;
1275 		if (copyout(&count32, xferred, sizeof (count32)))
1276 			error = EFAULT;
1277 		releasef(fildes);
1278 		if (error != 0)
1279 			return (set_errno(error));
1280 		return (count32);
1281 	}
1282 #endif
1283 	if (copyout(&count, xferred, sizeof (count)))
1284 		error = EFAULT;
1285 	releasef(fildes);
1286 	if (error != 0)
1287 		return (set_errno(error));
1288 	return (count);
1289 err:
1290 	ASSERT(error != 0);
1291 	releasef(fildes);
1292 	return (set_errno(error));
1293 }
1294