xref: /titanic_50/usr/src/uts/common/syscall/sendfile.c (revision 0917b783fd655a0c943e0b8fb848db2301774947)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/t_lock.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/buf.h>
34 #include <sys/conf.h>
35 #include <sys/cred.h>
36 #include <sys/kmem.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/cmn_err.h>
53 #include <sys/vmsystm.h>
54 
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <netinet/in.h>
58 #include <sys/sendfile.h>
59 #include <sys/un.h>
60 #include <sys/tihdr.h>
61 #include <sys/atomic.h>
62 
63 #include <inet/common.h>
64 #include <inet/ip.h>
65 #include <inet/ip6.h>
66 #include <inet/tcp.h>
67 
/*
 * Routines declared here for use by the sendfile code.  sosendfile64() is
 * called from sendvec64() for the single regular-file-to-socket case.
 */
extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
		ssize32_t *);
extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
		int);
72 
73 /*
74  * kstrwritemp() has very similar semantics as that of strwrite().
75  * The main difference is it obtains mblks from the caller and also
76  * does not do any copy as done in strwrite() from user buffers to
77  * kernel buffers.
78  *
79  * Currently, this routine is used by sendfile to send data allocated
80  * within the kernel without any copying. This interface does not use the
81  * synchronous stream interface as synch. stream interface implies
82  * copying.
83  */
84 int
85 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
86 {
87 	struct stdata *stp;
88 	struct queue *wqp;
89 	mblk_t *newmp;
90 	char waitflag;
91 	int tempmode;
92 	int error = 0;
93 	int done = 0;
94 	struct sonode *so;
95 	boolean_t direct;
96 
97 	ASSERT(vp->v_stream);
98 	stp = vp->v_stream;
99 
100 	so = VTOSO(vp);
101 	direct = (so->so_state & SS_DIRECT);
102 
103 	/*
104 	 * This is the sockfs direct fast path. canputnext() need
105 	 * not be accurate so we don't grab the sd_lock here. If
106 	 * we get flow-controlled, we grab sd_lock just before the
107 	 * do..while loop below to emulate what strwrite() does.
108 	 */
109 	wqp = stp->sd_wrq;
110 	if (canputnext(wqp) && direct &&
111 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
112 		return (sostream_direct(so, NULL, mp, CRED()));
113 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
114 		/* Fast check of flags before acquiring the lock */
115 		mutex_enter(&stp->sd_lock);
116 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
117 		mutex_exit(&stp->sd_lock);
118 		if (error != 0) {
119 			if (!(stp->sd_flag & STPLEX) &&
120 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
121 				tsignal(curthread, SIGPIPE);
122 				error = EPIPE;
123 			}
124 			return (error);
125 		}
126 	}
127 
128 	waitflag = WRITEWAIT;
129 	if (stp->sd_flag & OLDNDELAY)
130 		tempmode = fmode & ~FNDELAY;
131 	else
132 		tempmode = fmode;
133 
134 	mutex_enter(&stp->sd_lock);
135 	do {
136 		if (canputnext(wqp)) {
137 			mutex_exit(&stp->sd_lock);
138 			if (stp->sd_wputdatafunc != NULL) {
139 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
140 				    NULL, NULL, NULL);
141 				if (newmp == NULL) {
142 					/* The caller will free mp */
143 					return (ECOMM);
144 				}
145 				mp = newmp;
146 			}
147 			putnext(wqp, mp);
148 			return (0);
149 		}
150 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
151 		    &done);
152 	} while (error == 0 && !done);
153 
154 	mutex_exit(&stp->sd_lock);
155 	/*
156 	 * EAGAIN tells the application to try again. ENOMEM
157 	 * is returned only if the memory allocation size
158 	 * exceeds the physical limits of the system. ENOMEM
159 	 * can't be true here.
160 	 */
161 	if (error == ENOMEM)
162 		error = EAGAIN;
163 	return (error);
164 }
165 
/*
 * Number of sendfilevec entries handled per pass: the on-stack vector
 * arrays in sendvec64() and sendfilev() are this large, and each copyin()
 * of user vectors is capped at this many entries.
 */
#define	SEND_MAX_CHUNK	16
167 
168 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
169 /*
170  * 64 bit offsets for 32 bit applications only running either on
171  * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
172  * more than 2GB of data.
173  */
174 int
175 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
176     int copy_cnt, ssize32_t *count)
177 {
178 	struct vnode *vp;
179 	ushort_t fflag;
180 	int ioflag;
181 	size32_t cnt;
182 	ssize32_t sfv_len;
183 	ssize32_t tmpcount;
184 	u_offset_t sfv_off;
185 	struct uio auio;
186 	struct iovec aiov;
187 	int i, error;
188 
189 	fflag = fp->f_flag;
190 	vp = fp->f_vnode;
191 	for (i = 0; i < copy_cnt; i++) {
192 
193 		if (ISSIG(curthread, JUSTLOOKING))
194 			return (EINTR);
195 
196 		/*
197 		 * Do similar checks as "write" as we are writing
198 		 * sfv_len bytes into "vp".
199 		 */
200 		sfv_len = (ssize32_t)sfv->sfv_len;
201 
202 		if (sfv_len == 0)
203 			continue;
204 
205 		if (sfv_len < 0)
206 			return (EINVAL);
207 
208 		if (vp->v_type == VREG) {
209 			if (*fileoff >= curproc->p_fsz_ctl) {
210 				mutex_enter(&curproc->p_lock);
211 				(void) rctl_action(
212 				    rctlproc_legacy[RLIMIT_FSIZE],
213 				    curproc->p_rctls, curproc, RCA_SAFE);
214 				mutex_exit(&curproc->p_lock);
215 				return (EFBIG);
216 			}
217 
218 			if (*fileoff >= OFFSET_MAX(fp))
219 				return (EFBIG);
220 
221 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
222 				return (EINVAL);
223 		}
224 
225 		tmpcount = *count + sfv_len;
226 		if (tmpcount < 0)
227 			return (EINVAL);
228 
229 		sfv_off = sfv->sfv_off;
230 
231 		auio.uio_extflg = UIO_COPY_DEFAULT;
232 		if (sfv->sfv_fd == SFV_FD_SELF) {
233 			aiov.iov_len = sfv_len;
234 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
235 			auio.uio_loffset = *fileoff;
236 			auio.uio_iovcnt = 1;
237 			auio.uio_resid = sfv_len;
238 			auio.uio_iov = &aiov;
239 			auio.uio_segflg = UIO_USERSPACE;
240 			auio.uio_llimit = curproc->p_fsz_ctl;
241 			auio.uio_fmode = fflag;
242 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
243 			while (sfv_len > 0) {
244 				error = VOP_WRITE(vp, &auio, ioflag,
245 				    fp->f_cred, NULL);
246 				cnt = sfv_len - auio.uio_resid;
247 				sfv_len -= cnt;
248 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
249 				if (vp->v_type == VREG)
250 					*fileoff += cnt;
251 				*count += cnt;
252 				if (error != 0)
253 					return (error);
254 			}
255 		} else {
256 			file_t	*ffp;
257 			vnode_t	*readvp;
258 			int	readflg = 0;
259 			size_t	size;
260 			caddr_t	ptr;
261 
262 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
263 				return (EBADF);
264 
265 			if ((ffp->f_flag & FREAD) == 0) {
266 				releasef(sfv->sfv_fd);
267 				return (EBADF);
268 			}
269 
270 			readvp = ffp->f_vnode;
271 			if (readvp->v_type != VREG) {
272 				releasef(sfv->sfv_fd);
273 				return (EINVAL);
274 			}
275 
276 			/*
277 			 * No point reading and writing to same vp,
278 			 * as long as both are regular files. readvp is not
279 			 * locked; but since we got it from an open file the
280 			 * contents will be valid during the time of access.
281 			 */
282 			if (VN_CMP(vp, readvp)) {
283 				releasef(sfv->sfv_fd);
284 				return (EINVAL);
285 			}
286 
287 			/*
288 			 * Note: we assume readvp != vp. "vp" is already
289 			 * locked, and "readvp" must not be.
290 			 */
291 			(void) VOP_RWLOCK(readvp, readflg, NULL);
292 
293 			/*
294 			 * Same checks as in pread64.
295 			 */
296 			if (sfv_off > MAXOFFSET_T) {
297 				VOP_RWUNLOCK(readvp, readflg, NULL);
298 				releasef(sfv->sfv_fd);
299 				return (EINVAL);
300 			}
301 
302 			if (sfv_off + sfv_len > MAXOFFSET_T)
303 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
304 
305 			/* Find the native blocksize to transfer data */
306 			size = MIN(vp->v_vfsp->vfs_bsize,
307 			    readvp->v_vfsp->vfs_bsize);
308 			size = sfv_len < size ? sfv_len : size;
309 			ptr = kmem_alloc(size, KM_SLEEP);
310 
311 			while (sfv_len > 0) {
312 				size_t	iov_len;
313 
314 				iov_len = MIN(size, sfv_len);
315 				aiov.iov_base = ptr;
316 				aiov.iov_len = iov_len;
317 				auio.uio_loffset = sfv_off;
318 				auio.uio_iov = &aiov;
319 				auio.uio_iovcnt = 1;
320 				auio.uio_resid = iov_len;
321 				auio.uio_segflg = UIO_SYSSPACE;
322 				auio.uio_llimit = MAXOFFSET_T;
323 				auio.uio_fmode = ffp->f_flag;
324 				ioflag = auio.uio_fmode &
325 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
326 
327 				/*
328 				 * If read sync is not asked for,
329 				 * filter sync flags
330 				 */
331 				if ((ioflag & FRSYNC) == 0)
332 					ioflag &= ~(FSYNC|FDSYNC);
333 				error = VOP_READ(readvp, &auio, ioflag,
334 				    fp->f_cred, NULL);
335 				if (error) {
336 					kmem_free(ptr, size);
337 					VOP_RWUNLOCK(readvp, readflg, NULL);
338 					releasef(sfv->sfv_fd);
339 					return (error);
340 				}
341 
342 				/*
343 				 * Check how must data was really read.
344 				 * Decrement the 'len' and increment the
345 				 * 'off' appropriately.
346 				 */
347 				cnt = iov_len - auio.uio_resid;
348 				if (cnt == 0) {
349 					/*
350 					 * If we were reading a pipe (currently
351 					 * not implemented), we may now lose
352 					 * data.
353 					 */
354 					kmem_free(ptr, size);
355 					VOP_RWUNLOCK(readvp, readflg, NULL);
356 					releasef(sfv->sfv_fd);
357 					return (EINVAL);
358 				}
359 				sfv_len -= cnt;
360 				sfv_off += cnt;
361 
362 				aiov.iov_base = ptr;
363 				aiov.iov_len = cnt;
364 				auio.uio_loffset = *fileoff;
365 				auio.uio_resid = cnt;
366 				auio.uio_segflg = UIO_SYSSPACE;
367 				auio.uio_llimit = curproc->p_fsz_ctl;
368 				auio.uio_fmode = fflag;
369 				ioflag = auio.uio_fmode &
370 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
371 				error = VOP_WRITE(vp, &auio, ioflag,
372 				    fp->f_cred, NULL);
373 
374 				/*
375 				 * Check how much data was written. Increment
376 				 * the 'len' and decrement the 'off' if all
377 				 * the data was not written.
378 				 */
379 				cnt -= auio.uio_resid;
380 				sfv_len += auio.uio_resid;
381 				sfv_off -= auio.uio_resid;
382 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
383 				if (vp->v_type == VREG)
384 					*fileoff += cnt;
385 				*count += cnt;
386 				if (error != 0) {
387 					kmem_free(ptr, size);
388 					VOP_RWUNLOCK(readvp, readflg, NULL);
389 					releasef(sfv->sfv_fd);
390 					return (error);
391 				}
392 			}
393 			VOP_RWUNLOCK(readvp, readflg, NULL);
394 			releasef(sfv->sfv_fd);
395 			kmem_free(ptr, size);
396 		}
397 		sfv++;
398 	}
399 	return (0);
400 }
401 
402 ssize32_t
403 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
404 	size32_t *xferred, int fildes)
405 {
406 	int			rwflag;
407 	u_offset_t		fileoff;
408 	int			copy_cnt;
409 	const struct ksendfilevec64 *copy_vec;
410 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
411 	struct vnode *vp;
412 	int error;
413 	ssize32_t count = 0;
414 	int osfvcnt;
415 
416 	rwflag = 1;
417 	vp = fp->f_vnode;
418 	(void) VOP_RWLOCK(vp, rwflag, NULL);
419 
420 	copy_vec = vec;
421 	fileoff = fp->f_offset;
422 	osfvcnt = sfvcnt;
423 
424 	do {
425 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
426 		if (copyin(copy_vec, sfv, copy_cnt *
427 		    sizeof (struct ksendfilevec64))) {
428 			error = EFAULT;
429 			break;
430 		}
431 
432 		/*
433 		 * Optimize the single regular file over
434 		 * the socket case.
435 		 */
436 		if (vp->v_type == VSOCK && osfvcnt == 1 &&
437 		    sfv->sfv_fd != SFV_FD_SELF) {
438 			file_t *rfp;
439 			vnode_t *rvp;
440 
441 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
442 				error = EBADF;
443 				break;
444 			}
445 			if ((rfp->f_flag & FREAD) == 0) {
446 				releasef(sfv->sfv_fd);
447 				error = EBADF;
448 				break;
449 			}
450 			rvp = rfp->f_vnode;
451 			if (rvp->v_type == VREG) {
452 				error = sosendfile64(fp, rfp, sfv, &count);
453 				break;
454 			}
455 			releasef(sfv->sfv_fd);
456 		}
457 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
458 		if (error != 0)
459 			break;
460 
461 		copy_vec += copy_cnt;
462 		sfvcnt -= copy_cnt;
463 	} while (sfvcnt > 0);
464 
465 	if (vp->v_type == VREG)
466 		fp->f_offset += count;
467 
468 	VOP_RWUNLOCK(vp, rwflag, NULL);
469 	if (copyout(&count, xferred, sizeof (count)))
470 		error = EFAULT;
471 	releasef(fildes);
472 	if (error != 0)
473 		return (set_errno(error));
474 	return (count);
475 }
476 #endif
477 
478 int
479 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
480     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
481 {
482 	struct vnode *vp;
483 	struct uio auio;
484 	struct iovec aiov;
485 	ushort_t fflag;
486 	int ioflag;
487 	int i, error;
488 	size_t cnt;
489 	ssize_t sfv_len;
490 	u_offset_t sfv_off;
491 #ifdef _SYSCALL32_IMPL
492 	model_t model = get_udatamodel();
493 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
494 		MAXOFF32_T : MAXOFFSET_T;
495 #else
496 	const u_offset_t maxoff = MAXOFF32_T;
497 #endif
498 	mblk_t *dmp = NULL;
499 	int wroff;
500 	int buf_left = 0;
501 	size_t	iov_len;
502 	mblk_t  *head, *tmp;
503 	size_t  size = total_size;
504 	size_t  extra;
505 	int tail_len;
506 
507 	fflag = fp->f_flag;
508 	vp = fp->f_vnode;
509 
510 	ASSERT(vp->v_type == VSOCK);
511 	ASSERT(maxblk > 0);
512 
513 	wroff = (int)vp->v_stream->sd_wroff;
514 	tail_len = (int)vp->v_stream->sd_tail;
515 	extra = wroff + tail_len;
516 
517 	buf_left = MIN(total_size, maxblk);
518 	head = dmp = allocb(buf_left + extra, BPRI_HI);
519 	if (head == NULL)
520 		return (ENOMEM);
521 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
522 
523 	auio.uio_extflg = UIO_COPY_DEFAULT;
524 	for (i = 0; i < copy_cnt; i++) {
525 		if (ISSIG(curthread, JUSTLOOKING))
526 			return (EINTR);
527 
528 		/*
529 		 * Do similar checks as "write" as we are writing
530 		 * sfv_len bytes into "vp".
531 		 */
532 		sfv_len = (ssize_t)sfv->sfv_len;
533 
534 		if (sfv_len == 0) {
535 			sfv++;
536 			continue;
537 		}
538 
539 		/* Make sure sfv_len is not negative */
540 #ifdef _SYSCALL32_IMPL
541 		if (model == DATAMODEL_ILP32) {
542 			if ((ssize32_t)sfv_len < 0)
543 				return (EINVAL);
544 		} else
545 #endif
546 		if (sfv_len < 0)
547 			return (EINVAL);
548 
549 		/* Check for overflow */
550 #ifdef _SYSCALL32_IMPL
551 		if (model == DATAMODEL_ILP32) {
552 			if (((ssize32_t)(*count + sfv_len)) < 0)
553 				return (EINVAL);
554 		} else
555 #endif
556 		if ((*count + sfv_len) < 0)
557 			return (EINVAL);
558 
559 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
560 
561 		if (sfv->sfv_fd == SFV_FD_SELF) {
562 			while (sfv_len > 0) {
563 				if (buf_left == 0) {
564 					tmp = dmp;
565 					buf_left = MIN(total_size, maxblk);
566 					iov_len = MIN(buf_left, sfv_len);
567 					dmp = allocb(buf_left + extra, BPRI_HI);
568 					if (dmp == NULL) {
569 						freemsg(head);
570 						return (ENOMEM);
571 					}
572 					dmp->b_wptr = dmp->b_rptr =
573 					    dmp->b_rptr + wroff;
574 					tmp->b_cont = dmp;
575 				} else {
576 					iov_len = MIN(buf_left, sfv_len);
577 				}
578 
579 				aiov.iov_len = iov_len;
580 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
581 				auio.uio_loffset = *fileoff;
582 				auio.uio_iovcnt = 1;
583 				auio.uio_resid = iov_len;
584 				auio.uio_iov = &aiov;
585 				auio.uio_segflg = UIO_USERSPACE;
586 				auio.uio_llimit = curproc->p_fsz_ctl;
587 				auio.uio_fmode = fflag;
588 
589 				buf_left -= iov_len;
590 				total_size -= iov_len;
591 				sfv_len -= iov_len;
592 				sfv_off += iov_len;
593 
594 				error = uiomove((caddr_t)dmp->b_wptr,
595 				    iov_len, UIO_WRITE, &auio);
596 				if (error != 0) {
597 					freemsg(head);
598 					return (error);
599 				}
600 				dmp->b_wptr += iov_len;
601 			}
602 		} else {
603 			file_t	*ffp;
604 			vnode_t	*readvp;
605 			int	readflg = 0;
606 
607 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
608 				freemsg(head);
609 				return (EBADF);
610 			}
611 
612 			if ((ffp->f_flag & FREAD) == 0) {
613 				releasef(sfv->sfv_fd);
614 				freemsg(head);
615 				return (EACCES);
616 			}
617 
618 			readvp = ffp->f_vnode;
619 			if (readvp->v_type != VREG) {
620 				releasef(sfv->sfv_fd);
621 				freemsg(head);
622 				return (EINVAL);
623 			}
624 
625 			/*
626 			 * No point reading and writing to same vp,
627 			 * as long as both are regular files. readvp is not
628 			 * locked; but since we got it from an open file the
629 			 * contents will be valid during the time of access.
630 			 */
631 
632 			if (VN_CMP(vp, readvp)) {
633 				releasef(sfv->sfv_fd);
634 				freemsg(head);
635 				return (EINVAL);
636 			}
637 
638 			/*
639 			 * Note: we assume readvp != vp. "vp" is already
640 			 * locked, and "readvp" must not be.
641 			 */
642 
643 			(void) VOP_RWLOCK(readvp, readflg, NULL);
644 
645 			/* Same checks as in pread */
646 			if (sfv_off > maxoff) {
647 				VOP_RWUNLOCK(readvp, readflg, NULL);
648 				releasef(sfv->sfv_fd);
649 				freemsg(head);
650 				return (EINVAL);
651 			}
652 			if (sfv_off + sfv_len > maxoff) {
653 				sfv_len = (ssize_t)((offset_t)maxoff -
654 				    sfv_off);
655 			}
656 
657 			while (sfv_len > 0) {
658 				if (buf_left == 0) {
659 					tmp = dmp;
660 					buf_left = MIN(total_size, maxblk);
661 					iov_len = MIN(buf_left, sfv_len);
662 					dmp = allocb(buf_left + extra, BPRI_HI);
663 					if (dmp == NULL) {
664 						VOP_RWUNLOCK(readvp, readflg,
665 									NULL);
666 						releasef(sfv->sfv_fd);
667 						freemsg(head);
668 						return (ENOMEM);
669 					}
670 					dmp->b_wptr = dmp->b_rptr =
671 					    dmp->b_rptr + wroff;
672 					tmp->b_cont = dmp;
673 				} else {
674 					iov_len = MIN(buf_left, sfv_len);
675 				}
676 				aiov.iov_base = (caddr_t)dmp->b_wptr;
677 				aiov.iov_len = iov_len;
678 				auio.uio_loffset = sfv_off;
679 				auio.uio_iov = &aiov;
680 				auio.uio_iovcnt = 1;
681 				auio.uio_resid = iov_len;
682 				auio.uio_segflg = UIO_SYSSPACE;
683 				auio.uio_llimit = MAXOFFSET_T;
684 				auio.uio_fmode = ffp->f_flag;
685 				ioflag = auio.uio_fmode &
686 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
687 
688 				/*
689 				 * If read sync is not asked for,
690 				 * filter sync flags
691 				 */
692 				if ((ioflag & FRSYNC) == 0)
693 					ioflag &= ~(FSYNC|FDSYNC);
694 				error = VOP_READ(readvp, &auio, ioflag,
695 				    fp->f_cred, NULL);
696 				if (error != 0) {
697 					/*
698 					 * If we were reading a pipe (currently
699 					 * not implemented), we may now loose
700 					 * data.
701 					 */
702 					VOP_RWUNLOCK(readvp, readflg, NULL);
703 					releasef(sfv->sfv_fd);
704 					freemsg(head);
705 					return (error);
706 				}
707 
708 				/*
709 				 * Check how much data was really read.
710 				 * Decrement the 'len' and increment the
711 				 * 'off' appropriately.
712 				 */
713 				cnt = iov_len - auio.uio_resid;
714 				if (cnt == 0) {
715 					VOP_RWUNLOCK(readvp, readflg, NULL);
716 					releasef(sfv->sfv_fd);
717 					freemsg(head);
718 					return (EINVAL);
719 				}
720 				sfv_len -= cnt;
721 				sfv_off += cnt;
722 				total_size -= cnt;
723 				buf_left -= cnt;
724 
725 				dmp->b_wptr += cnt;
726 			}
727 			VOP_RWUNLOCK(readvp, readflg, NULL);
728 			releasef(sfv->sfv_fd);
729 		}
730 		sfv++;
731 	}
732 
733 	ASSERT(total_size == 0);
734 	error = kstrwritemp(vp, head, fflag);
735 	if (error != 0) {
736 		freemsg(head);
737 		return (error);
738 	}
739 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
740 	*count += size;
741 
742 	return (0);
743 }
744 
745 
746 int
747 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
748     int copy_cnt, ssize_t *count)
749 {
750 	struct vnode *vp;
751 	struct uio auio;
752 	struct iovec aiov;
753 	ushort_t fflag;
754 	int ioflag;
755 	int i, error;
756 	size_t cnt;
757 	ssize_t sfv_len;
758 	u_offset_t sfv_off;
759 #ifdef _SYSCALL32_IMPL
760 	model_t model = get_udatamodel();
761 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
762 		MAXOFF32_T : MAXOFFSET_T;
763 #else
764 	const u_offset_t maxoff = MAXOFF32_T;
765 #endif
766 	mblk_t	*dmp = NULL;
767 	char	*buf = NULL;
768 	size_t  extra;
769 	int maxblk, wroff, tail_len;
770 	struct sonode *so;
771 	stdata_t *stp;
772 
773 	fflag = fp->f_flag;
774 	vp = fp->f_vnode;
775 
776 	if (vp->v_type == VSOCK) {
777 		so = VTOSO(vp);
778 		stp = vp->v_stream;
779 		wroff = (int)stp->sd_wroff;
780 		tail_len = (int)stp->sd_tail;
781 		maxblk = (int)stp->sd_maxblk;
782 		extra = wroff + tail_len;
783 	}
784 
785 	auio.uio_extflg = UIO_COPY_DEFAULT;
786 	for (i = 0; i < copy_cnt; i++) {
787 		if (ISSIG(curthread, JUSTLOOKING))
788 			return (EINTR);
789 
790 		/*
791 		 * Do similar checks as "write" as we are writing
792 		 * sfv_len bytes into "vp".
793 		 */
794 		sfv_len = (ssize_t)sfv->sfv_len;
795 
796 		if (sfv_len == 0) {
797 			sfv++;
798 			continue;
799 		}
800 
801 		/* Make sure sfv_len is not negative */
802 #ifdef _SYSCALL32_IMPL
803 		if (model == DATAMODEL_ILP32) {
804 			if ((ssize32_t)sfv_len < 0)
805 				return (EINVAL);
806 		} else
807 #endif
808 		if (sfv_len < 0)
809 			return (EINVAL);
810 
811 		if (vp->v_type == VREG) {
812 			if (*fileoff >= curproc->p_fsz_ctl) {
813 				mutex_enter(&curproc->p_lock);
814 				(void) rctl_action(
815 				    rctlproc_legacy[RLIMIT_FSIZE],
816 				    curproc->p_rctls, curproc, RCA_SAFE);
817 				mutex_exit(&curproc->p_lock);
818 
819 				return (EFBIG);
820 			}
821 
822 			if (*fileoff >= maxoff)
823 				return (EFBIG);
824 
825 			if (*fileoff + sfv_len > maxoff)
826 				return (EINVAL);
827 		}
828 
829 		/* Check for overflow */
830 #ifdef _SYSCALL32_IMPL
831 		if (model == DATAMODEL_ILP32) {
832 			if (((ssize32_t)(*count + sfv_len)) < 0)
833 				return (EINVAL);
834 		} else
835 #endif
836 		if ((*count + sfv_len) < 0)
837 			return (EINVAL);
838 
839 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
840 
841 		if (sfv->sfv_fd == SFV_FD_SELF) {
842 			aiov.iov_len = sfv_len;
843 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
844 			auio.uio_loffset = *fileoff;
845 			auio.uio_iovcnt = 1;
846 			auio.uio_resid = sfv_len;
847 			auio.uio_iov = &aiov;
848 			auio.uio_segflg = UIO_USERSPACE;
849 			auio.uio_llimit = curproc->p_fsz_ctl;
850 			auio.uio_fmode = fflag;
851 
852 			if (vp->v_type == VSOCK) {
853 
854 				/*
855 				 * Optimize for the socket case
856 				 */
857 
858 				dmp = allocb(sfv_len + extra, BPRI_HI);
859 				if (dmp == NULL)
860 					return (ENOMEM);
861 				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
862 				error = uiomove((caddr_t)dmp->b_wptr,
863 				    sfv_len, UIO_WRITE, &auio);
864 				if (error != 0) {
865 					freeb(dmp);
866 					return (error);
867 				}
868 				dmp->b_wptr += sfv_len;
869 				error = kstrwritemp(vp, dmp, fflag);
870 				if (error != 0) {
871 					freeb(dmp);
872 					return (error);
873 				}
874 				ttolwp(curthread)->lwp_ru.ioch +=
875 				    (ulong_t)sfv_len;
876 				*count += sfv_len;
877 			} else {
878 				ioflag = auio.uio_fmode &
879 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
880 				while (sfv_len > 0) {
881 					error = VOP_WRITE(vp, &auio, ioflag,
882 					    fp->f_cred, NULL);
883 					cnt = sfv_len - auio.uio_resid;
884 					sfv_len -= cnt;
885 					ttolwp(curthread)->lwp_ru.ioch +=
886 					    (ulong_t)cnt;
887 					*fileoff += cnt;
888 					*count += cnt;
889 					if (error != 0)
890 						return (error);
891 				}
892 			}
893 		} else {
894 			file_t	*ffp;
895 			vnode_t	*readvp;
896 			int	readflg = 0;
897 			size_t	size;
898 			caddr_t	ptr;
899 
900 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
901 				return (EBADF);
902 
903 			if ((ffp->f_flag & FREAD) == 0) {
904 				releasef(sfv->sfv_fd);
905 				return (EBADF);
906 			}
907 
908 			readvp = ffp->f_vnode;
909 			if (readvp->v_type != VREG) {
910 				releasef(sfv->sfv_fd);
911 				return (EINVAL);
912 			}
913 
914 			/*
915 			 * No point reading and writing to same vp,
916 			 * as long as both are regular files. readvp is not
917 			 * locked; but since we got it from an open file the
918 			 * contents will be valid during the time of access.
919 			 */
920 			if (VN_CMP(vp, readvp)) {
921 				releasef(sfv->sfv_fd);
922 				return (EINVAL);
923 			}
924 
925 			/*
926 			 * Note: we assume readvp != vp. "vp" is already
927 			 * locked, and "readvp" must not be.
928 			 */
929 			(void) VOP_RWLOCK(readvp, readflg, NULL);
930 
931 			/* Same checks as in pread */
932 			if (sfv_off > maxoff) {
933 				VOP_RWUNLOCK(readvp, readflg, NULL);
934 				releasef(sfv->sfv_fd);
935 				return (EINVAL);
936 			}
937 			if (sfv_off + sfv_len > maxoff) {
938 				sfv_len = (ssize_t)((offset_t)maxoff -
939 				    sfv_off);
940 			}
941 			/* Find the native blocksize to transfer data */
942 			size = MIN(vp->v_vfsp->vfs_bsize,
943 			    readvp->v_vfsp->vfs_bsize);
944 			size = sfv_len < size ? sfv_len : size;
945 
946 			if (vp->v_type != VSOCK) {
947 				buf = kmem_alloc(size, KM_NOSLEEP);
948 				if (buf == NULL) {
949 					VOP_RWUNLOCK(readvp, readflg, NULL);
950 					releasef(sfv->sfv_fd);
951 					return (ENOMEM);
952 				}
953 			} else {
954 				/*
955 				 * For sockets acting as an SSL proxy, we
956 				 * need to adjust the size to the maximum
957 				 * SSL record size set in the stream head.
958 				 */
959 				if (so->so_kssl_ctx != NULL)
960 					size = MIN(size, maxblk);
961 			}
962 
963 			while (sfv_len > 0) {
964 				size_t	iov_len;
965 
966 				iov_len = MIN(size, sfv_len);
967 
968 				if (vp->v_type == VSOCK) {
969 					dmp = allocb(iov_len + extra, BPRI_HI);
970 					if (dmp == NULL) {
971 						VOP_RWUNLOCK(readvp, readflg,
972 						    NULL);
973 						releasef(sfv->sfv_fd);
974 						return (ENOMEM);
975 					}
976 					dmp->b_wptr = dmp->b_rptr =
977 					    dmp->b_rptr + wroff;
978 					ptr = (caddr_t)dmp->b_rptr;
979 				} else {
980 					ptr = buf;
981 				}
982 
983 				aiov.iov_base = ptr;
984 				aiov.iov_len = iov_len;
985 				auio.uio_loffset = sfv_off;
986 				auio.uio_iov = &aiov;
987 				auio.uio_iovcnt = 1;
988 				auio.uio_resid = iov_len;
989 				auio.uio_segflg = UIO_SYSSPACE;
990 				auio.uio_llimit = MAXOFFSET_T;
991 				auio.uio_fmode = ffp->f_flag;
992 				ioflag = auio.uio_fmode &
993 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
994 
995 				/*
996 				 * If read sync is not asked for,
997 				 * filter sync flags
998 				 */
999 				if ((ioflag & FRSYNC) == 0)
1000 					ioflag &= ~(FSYNC|FDSYNC);
1001 				error = VOP_READ(readvp, &auio, ioflag,
1002 				    fp->f_cred, NULL);
1003 				if (error != 0) {
1004 					/*
1005 					 * If we were reading a pipe (currently
1006 					 * not implemented), we may now lose
1007 					 * data.
1008 					 */
1009 					if (vp->v_type == VSOCK)
1010 						freeb(dmp);
1011 					else
1012 						kmem_free(buf, size);
1013 					VOP_RWUNLOCK(readvp, readflg, NULL);
1014 					releasef(sfv->sfv_fd);
1015 					return (error);
1016 				}
1017 
1018 				/*
1019 				 * Check how much data was really read.
1020 				 * Decrement the 'len' and increment the
1021 				 * 'off' appropriately.
1022 				 */
1023 				cnt = iov_len - auio.uio_resid;
1024 				if (cnt == 0) {
1025 					if (vp->v_type == VSOCK)
1026 						freeb(dmp);
1027 					else
1028 						kmem_free(buf, size);
1029 					VOP_RWUNLOCK(readvp, readflg, NULL);
1030 					releasef(sfv->sfv_fd);
1031 					return (EINVAL);
1032 				}
1033 				sfv_len -= cnt;
1034 				sfv_off += cnt;
1035 
1036 				if (vp->v_type == VSOCK) {
1037 					dmp->b_wptr = dmp->b_rptr + cnt;
1038 
1039 					error = kstrwritemp(vp, dmp, fflag);
1040 					if (error != 0) {
1041 						freeb(dmp);
1042 						VOP_RWUNLOCK(readvp, readflg,
1043 									NULL);
1044 						releasef(sfv->sfv_fd);
1045 						return (error);
1046 					}
1047 
1048 					ttolwp(curthread)->lwp_ru.ioch +=
1049 					    (ulong_t)cnt;
1050 					*count += cnt;
1051 				} else {
1052 
1053 					aiov.iov_base = ptr;
1054 					aiov.iov_len = cnt;
1055 					auio.uio_loffset = *fileoff;
1056 					auio.uio_resid = cnt;
1057 					auio.uio_segflg = UIO_SYSSPACE;
1058 					auio.uio_llimit = curproc->p_fsz_ctl;
1059 					auio.uio_fmode = fflag;
1060 					ioflag = auio.uio_fmode &
1061 					    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1062 					error = VOP_WRITE(vp, &auio, ioflag,
1063 					    fp->f_cred, NULL);
1064 
1065 					/*
1066 					 * Check how much data was written.
1067 					 * Increment the 'len' and decrement the
1068 					 * 'off' if all the data was not
1069 					 * written.
1070 					 */
1071 					cnt -= auio.uio_resid;
1072 					sfv_len += auio.uio_resid;
1073 					sfv_off -= auio.uio_resid;
1074 					ttolwp(curthread)->lwp_ru.ioch +=
1075 					    (ulong_t)cnt;
1076 					*fileoff += cnt;
1077 					*count += cnt;
1078 					if (error != 0) {
1079 						VOP_RWUNLOCK(readvp, readflg,
1080 									NULL);
1081 						releasef(sfv->sfv_fd);
1082 						return (error);
1083 					}
1084 				}
1085 			}
1086 			if (buf) {
1087 				kmem_free(buf, size);
1088 				buf = NULL;
1089 			}
1090 			VOP_RWUNLOCK(readvp, readflg, NULL);
1091 			releasef(sfv->sfv_fd);
1092 		}
1093 		sfv++;
1094 	}
1095 	return (0);
1096 }
1097 
1098 ssize_t
1099 sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
1100     size_t *xferred)
1101 {
1102 	int error;
1103 	file_t *fp;
1104 	struct vnode *vp;
1105 	struct sonode *so;
1106 	u_offset_t fileoff;
1107 	int copy_cnt;
1108 	const struct sendfilevec *copy_vec;
1109 	struct sendfilevec sfv[SEND_MAX_CHUNK];
1110 	ssize_t count = 0;
1111 #ifdef _SYSCALL32_IMPL
1112 	struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
1113 #endif
1114 	ssize_t total_size = 0;
1115 	int i;
1116 	boolean_t is_sock = B_FALSE;
1117 	int maxblk = 0;
1118 
1119 	if (sfvcnt <= 0)
1120 		return (set_errno(EINVAL));
1121 
1122 	if ((fp = getf(fildes)) == NULL)
1123 		return (set_errno(EBADF));
1124 
1125 	if (((fp->f_flag) & FWRITE) == 0) {
1126 		error = EBADF;
1127 		goto err;
1128 	}
1129 
1130 	fileoff = fp->f_offset;
1131 	vp = fp->f_vnode;
1132 
1133 	switch (vp->v_type) {
1134 	case VSOCK:
1135 		so = VTOSO(vp);
1136 		/* sendfile not supported for SCTP */
1137 		if (so->so_protocol == IPPROTO_SCTP) {
1138 			error = EPROTONOSUPPORT;
1139 			goto err;
1140 		}
1141 		is_sock = B_TRUE;
1142 		switch (so->so_family) {
1143 		case AF_INET:
1144 		case AF_INET6:
1145 			/*
1146 			 * Make similar checks done in SOP_WRITE().
1147 			 */
1148 			if (so->so_state & SS_CANTSENDMORE) {
1149 				tsignal(curthread, SIGPIPE);
1150 				error = EPIPE;
1151 				goto err;
1152 			}
1153 			if (so->so_type != SOCK_STREAM) {
1154 				error = EOPNOTSUPP;
1155 				goto err;
1156 			}
1157 
1158 			if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
1159 			    (SS_ISCONNECTED|SS_ISBOUND)) {
1160 				error = ENOTCONN;
1161 				goto err;
1162 			}
1163 
1164 			if ((so->so_state & SS_DIRECT) &&
1165 			    (so->so_priv != NULL) &&
1166 			    (so->so_kssl_ctx == NULL)) {
1167 				maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
1168 			} else {
1169 				maxblk = (int)vp->v_stream->sd_maxblk;
1170 			}
1171 			break;
1172 		default:
1173 			error = EAFNOSUPPORT;
1174 			goto err;
1175 		}
1176 		break;
1177 	case VREG:
1178 		break;
1179 	default:
1180 		error = EINVAL;
1181 		goto err;
1182 	}
1183 
1184 	switch (opcode) {
1185 	case SENDFILEV :
1186 		break;
1187 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1188 	case SENDFILEV64 :
1189 		return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
1190 		    (size32_t *)xferred, fildes));
1191 #endif
1192 	default :
1193 		error = ENOSYS;
1194 		break;
1195 	}
1196 
1197 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1198 	copy_vec = vec;
1199 
1200 	do {
1201 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
1202 #ifdef _SYSCALL32_IMPL
1203 		/* 32-bit callers need to have their iovec expanded. */
1204 		if (get_udatamodel() == DATAMODEL_ILP32) {
1205 			if (copyin(copy_vec, sfv32,
1206 			    copy_cnt * sizeof (ksendfilevec32_t))) {
1207 				error = EFAULT;
1208 				break;
1209 			}
1210 
1211 			for (i = 0; i < copy_cnt; i++) {
1212 				sfv[i].sfv_fd = sfv32[i].sfv_fd;
1213 				sfv[i].sfv_off =
1214 					(off_t)(uint32_t)sfv32[i].sfv_off;
1215 				sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
1216 				total_size += sfv[i].sfv_len;
1217 				sfv[i].sfv_flag = sfv32[i].sfv_flag;
1218 			}
1219 		} else {
1220 #endif
1221 			if (copyin(copy_vec, sfv,
1222 			    copy_cnt * sizeof (sendfilevec_t))) {
1223 				error = EFAULT;
1224 				break;
1225 			}
1226 
1227 			for (i = 0; i < copy_cnt; i++) {
1228 				total_size += sfv[i].sfv_len;
1229 			}
1230 #ifdef _SYSCALL32_IMPL
1231 		}
1232 #endif
1233 
1234 		/*
1235 		 * The task between deciding to use sendvec_small_chunk
1236 		 * and sendvec_chunk is dependant on multiple things:
1237 		 *
1238 		 * i) latency is important for smaller files. So if the
1239 		 * data is smaller than 'tcp_slow_start_initial' times
1240 		 * maxblk, then use sendvec_small_chunk which creates
1241 		 * maxblk size mblks and chains then together and sends
1242 		 * them to TCP in one shot. It also leaves 'wroff' size
1243 		 * space for the headers in each mblk.
1244 		 *
1245 		 * ii) for total size bigger than 'tcp_slow_start_initial'
1246 		 * time maxblk, its probably real file data which is
1247 		 * dominating. So its better to use sendvec_chunk because
1248 		 * performance goes to dog if we don't do pagesize reads.
1249 		 * sendvec_chunk will do pagesize reads and write them
1250 		 * in pagesize mblks to TCP.
1251 		 *
1252 		 * Side Notes: A write to file has not been optimized.
1253 		 * Future zero copy code will plugin into sendvec_chunk
1254 		 * only because doing zero copy for files smaller then
1255 		 * pagesize is useless.
1256 		 *
1257 		 * Note, if socket has NL7C enabled then call NL7C's
1258 		 * senfilev() function to give NL7C a chance to copy
1259 		 * the vec for caching, then continue processing as
1260 		 * normal.
1261 		 */
1262 		if (is_sock) {
1263 			switch (so->so_family) {
1264 			case AF_INET:
1265 			case AF_INET6:
1266 				if (so->so_nl7c_flags != 0) {
1267 					nl7c_sendfilev(so, fileoff,
1268 					    sfv, copy_cnt);
1269 				}
1270 				if (total_size <= (4 * maxblk))
1271 					error = sendvec_small_chunk(fp,
1272 					    &fileoff, sfv, copy_cnt,
1273 					    total_size, maxblk, &count);
1274 				else
1275 					error = sendvec_chunk(fp, &fileoff,
1276 					    sfv, copy_cnt, &count);
1277 				break;
1278 			}
1279 		} else {
1280 			ASSERT(vp->v_type == VREG);
1281 			error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
1282 			    &count);
1283 		}
1284 
1285 
1286 #ifdef _SYSCALL32_IMPL
1287 	if (get_udatamodel() == DATAMODEL_ILP32)
1288 		copy_vec = (const struct sendfilevec *)((char *)copy_vec +
1289 		    (copy_cnt * sizeof (ksendfilevec32_t)));
1290 	else
1291 #endif
1292 		copy_vec += copy_cnt;
1293 		sfvcnt -= copy_cnt;
1294 	} while (sfvcnt > 0);
1295 
1296 	if (vp->v_type == VREG)
1297 		fp->f_offset += count;
1298 
1299 
1300 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1301 
1302 #ifdef _SYSCALL32_IMPL
1303 	if (get_udatamodel() == DATAMODEL_ILP32) {
1304 		ssize32_t count32 = (ssize32_t)count;
1305 		if (copyout(&count32, xferred, sizeof (count32)))
1306 			error = EFAULT;
1307 		releasef(fildes);
1308 		if (error != 0)
1309 			return (set_errno(error));
1310 		return (count32);
1311 	}
1312 #endif
1313 	if (copyout(&count, xferred, sizeof (count)))
1314 		error = EFAULT;
1315 	releasef(fildes);
1316 	if (error != 0)
1317 		return (set_errno(error));
1318 	return (count);
1319 err:
1320 	ASSERT(error != 0);
1321 	releasef(fildes);
1322 	return (set_errno(error));
1323 }
1324