xref: /titanic_51/usr/src/uts/common/syscall/rw.c (revision ac19272f7eb4a433cfccf2fdccc769cca5528169)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/param.h>
37 #include <sys/isa_defs.h>
38 #include <sys/types.h>
39 #include <sys/inttypes.h>
40 #include <sys/sysmacros.h>
41 #include <sys/cred.h>
42 #include <sys/user.h>
43 #include <sys/systm.h>
44 #include <sys/errno.h>
45 #include <sys/vnode.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/cpuvar.h>
49 #include <sys/uio.h>
50 #include <sys/debug.h>
51 #include <sys/rctl.h>
52 #include <sys/nbmlock.h>
53 
/* Default ceiling (128K) for transfers that ask for the cached copy path. */
#define	COPYOUT_MAX_CACHE	(128 * 1024)

/*
 * Largest request, in bytes, for which read() and readv() select
 * UIO_COPY_CACHED; larger requests select UIO_COPY_DEFAULT.
 * Kept as a global variable so that it can be patched.
 */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;
57 
58 /*
59  * read, write, pread, pwrite, readv, and writev syscalls.
60  *
61  * 64-bit open:	all open's are large file opens.
62  * Large Files: the behaviour of read depends on whether the fd
63  *		corresponds to large open or not.
64  * 32-bit open:	FOFFMAX flag not set.
65  *		read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
66  *		EOVERFLOW if count is non-zero and if size of file
67  *		is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
68  *		at >= MAXOFF32_T returns EOF.
69  */
70 
71 /*
72  * Native system call
73  */
74 ssize_t
75 read(int fdes, void *cbuf, size_t count)
76 {
77 	struct uio auio;
78 	struct iovec aiov;
79 	file_t *fp;
80 	register vnode_t *vp;
81 	struct cpu *cp;
82 	int fflag, ioflag, rwflag;
83 	ssize_t cnt, bcount;
84 	int error = 0;
85 	u_offset_t fileoff;
86 	int in_crit = 0;
87 
88 	if ((cnt = (ssize_t)count) < 0)
89 		return (set_errno(EINVAL));
90 	if ((fp = getf(fdes)) == NULL)
91 		return (set_errno(EBADF));
92 	if (((fflag = fp->f_flag) & FREAD) == 0) {
93 		error = EBADF;
94 		goto out;
95 	}
96 	vp = fp->f_vnode;
97 
98 	if (vp->v_type == VREG && cnt == 0) {
99 		goto out;
100 	}
101 
102 	rwflag = 0;
103 	aiov.iov_base = cbuf;
104 	aiov.iov_len = cnt;
105 
106 	/*
107 	 * We have to enter the critical region before calling VOP_RWLOCK
108 	 * to avoid a deadlock with write() calls.
109 	 */
110 	if (nbl_need_check(vp)) {
111 		int svmand;
112 
113 		nbl_start_crit(vp, RW_READER);
114 		in_crit = 1;
115 		error = nbl_svmand(vp, fp->f_cred, &svmand);
116 		if (error != 0)
117 			goto out;
118 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
119 		    NULL)) {
120 			error = EACCES;
121 			goto out;
122 		}
123 	}
124 
125 	(void) VOP_RWLOCK(vp, rwflag, NULL);
126 
127 	/*
128 	 * We do the following checks inside VOP_RWLOCK so as to
129 	 * prevent file size from changing while these checks are
130 	 * being done. Also, we load fp's offset to the local
131 	 * variable fileoff because we can have a parallel lseek
132 	 * going on (f_offset is not protected by any lock) which
133 	 * could change f_offset. We need to see the value only
134 	 * once here and take a decision. Seeing it more than once
135 	 * can lead to incorrect functionality.
136 	 */
137 
138 	fileoff = (u_offset_t)fp->f_offset;
139 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
140 		struct vattr va;
141 		va.va_mask = AT_SIZE;
142 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
143 			VOP_RWUNLOCK(vp, rwflag, NULL);
144 			goto out;
145 		}
146 		if (fileoff >= va.va_size) {
147 			cnt = 0;
148 			VOP_RWUNLOCK(vp, rwflag, NULL);
149 			goto out;
150 		} else {
151 			error = EOVERFLOW;
152 			VOP_RWUNLOCK(vp, rwflag, NULL);
153 			goto out;
154 		}
155 	}
156 	if ((vp->v_type == VREG) &&
157 	    (fileoff + cnt > OFFSET_MAX(fp))) {
158 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
159 	}
160 	auio.uio_loffset = fileoff;
161 	auio.uio_iov = &aiov;
162 	auio.uio_iovcnt = 1;
163 	auio.uio_resid = bcount = cnt;
164 	auio.uio_segflg = UIO_USERSPACE;
165 	auio.uio_llimit = MAXOFFSET_T;
166 	auio.uio_fmode = fflag;
167 	/*
168 	 * Only use bypass caches when the count is large enough
169 	 */
170 	if (bcount <= copyout_max_cached)
171 		auio.uio_extflg = UIO_COPY_CACHED;
172 	else
173 		auio.uio_extflg = UIO_COPY_DEFAULT;
174 
175 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
176 
177 	/* If read sync is not asked for, filter sync flags */
178 	if ((ioflag & FRSYNC) == 0)
179 		ioflag &= ~(FSYNC|FDSYNC);
180 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
181 	cnt -= auio.uio_resid;
182 	CPU_STATS_ENTER_K();
183 	cp = CPU;
184 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
185 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
186 	CPU_STATS_EXIT_K();
187 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
188 
189 	if (vp->v_type == VFIFO)	/* Backward compatibility */
190 		fp->f_offset = cnt;
191 	else if (((fp->f_flag & FAPPEND) == 0) ||
192 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
193 		fp->f_offset = auio.uio_loffset;
194 	VOP_RWUNLOCK(vp, rwflag, NULL);
195 
196 	if (error == EINTR && cnt != 0)
197 		error = 0;
198 out:
199 	if (in_crit)
200 		nbl_end_crit(vp);
201 	releasef(fdes);
202 	if (error)
203 		return (set_errno(error));
204 	return (cnt);
205 }
206 
207 /*
208  * Native system call
209  */
210 ssize_t
211 write(int fdes, void *cbuf, size_t count)
212 {
213 	struct uio auio;
214 	struct iovec aiov;
215 	file_t *fp;
216 	register vnode_t *vp;
217 	struct cpu *cp;
218 	int fflag, ioflag, rwflag;
219 	ssize_t cnt, bcount;
220 	int error = 0;
221 	u_offset_t fileoff;
222 	int in_crit = 0;
223 
224 	if ((cnt = (ssize_t)count) < 0)
225 		return (set_errno(EINVAL));
226 	if ((fp = getf(fdes)) == NULL)
227 		return (set_errno(EBADF));
228 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
229 		error = EBADF;
230 		goto out;
231 	}
232 	vp = fp->f_vnode;
233 
234 	if (vp->v_type == VREG && cnt == 0) {
235 		goto out;
236 	}
237 
238 	rwflag = 1;
239 	aiov.iov_base = cbuf;
240 	aiov.iov_len = cnt;
241 
242 	/*
243 	 * We have to enter the critical region before calling VOP_RWLOCK
244 	 * to avoid a deadlock with ufs.
245 	 */
246 	if (nbl_need_check(vp)) {
247 		int svmand;
248 
249 		nbl_start_crit(vp, RW_READER);
250 		in_crit = 1;
251 		error = nbl_svmand(vp, fp->f_cred, &svmand);
252 		if (error != 0)
253 			goto out;
254 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
255 		    NULL)) {
256 			error = EACCES;
257 			goto out;
258 		}
259 	}
260 
261 	(void) VOP_RWLOCK(vp, rwflag, NULL);
262 
263 	fileoff = fp->f_offset;
264 	if (vp->v_type == VREG) {
265 
266 		/*
267 		 * We raise psignal if write for >0 bytes causes
268 		 * it to exceed the ulimit.
269 		 */
270 		if (fileoff >= curproc->p_fsz_ctl) {
271 			VOP_RWUNLOCK(vp, rwflag, NULL);
272 
273 			mutex_enter(&curproc->p_lock);
274 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
275 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
276 			mutex_exit(&curproc->p_lock);
277 
278 			error = EFBIG;
279 			goto out;
280 		}
281 		/*
282 		 * We return EFBIG if write is done at an offset
283 		 * greater than the offset maximum for this file structure.
284 		 */
285 
286 		if (fileoff >= OFFSET_MAX(fp)) {
287 			VOP_RWUNLOCK(vp, rwflag, NULL);
288 			error = EFBIG;
289 			goto out;
290 		}
291 		/*
292 		 * Limit the bytes to be written  upto offset maximum for
293 		 * this open file structure.
294 		 */
295 		if (fileoff + cnt > OFFSET_MAX(fp))
296 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
297 	}
298 	auio.uio_loffset = fileoff;
299 	auio.uio_iov = &aiov;
300 	auio.uio_iovcnt = 1;
301 	auio.uio_resid = bcount = cnt;
302 	auio.uio_segflg = UIO_USERSPACE;
303 	auio.uio_llimit = curproc->p_fsz_ctl;
304 	auio.uio_fmode = fflag;
305 	auio.uio_extflg = UIO_COPY_DEFAULT;
306 
307 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
308 
309 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
310 	cnt -= auio.uio_resid;
311 	CPU_STATS_ENTER_K();
312 	cp = CPU;
313 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
314 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
315 	CPU_STATS_EXIT_K();
316 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
317 
318 	if (vp->v_type == VFIFO)	/* Backward compatibility */
319 		fp->f_offset = cnt;
320 	else if (((fp->f_flag & FAPPEND) == 0) ||
321 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
322 		fp->f_offset = auio.uio_loffset;
323 	VOP_RWUNLOCK(vp, rwflag, NULL);
324 
325 	if (error == EINTR && cnt != 0)
326 		error = 0;
327 out:
328 	if (in_crit)
329 		nbl_end_crit(vp);
330 	releasef(fdes);
331 	if (error)
332 		return (set_errno(error));
333 	return (cnt);
334 }
335 
336 ssize_t
337 pread(int fdes, void *cbuf, size_t count, off_t offset)
338 {
339 	struct uio auio;
340 	struct iovec aiov;
341 	file_t *fp;
342 	register vnode_t *vp;
343 	struct cpu *cp;
344 	int fflag, ioflag, rwflag;
345 	ssize_t bcount;
346 	int error = 0;
347 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
348 #ifdef _SYSCALL32_IMPL
349 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
350 		MAXOFF32_T : MAXOFFSET_T;
351 #else
352 	const u_offset_t maxoff = MAXOFF32_T;
353 #endif
354 	int in_crit = 0;
355 
356 	if ((bcount = (ssize_t)count) < 0)
357 		return (set_errno(EINVAL));
358 
359 	if ((fp = getf(fdes)) == NULL)
360 		return (set_errno(EBADF));
361 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
362 		error = EBADF;
363 		goto out;
364 	}
365 
366 	rwflag = 0;
367 	vp = fp->f_vnode;
368 
369 	if (vp->v_type == VREG) {
370 
371 		if (bcount == 0)
372 			goto out;
373 
374 		/*
375 		 * Return EINVAL if an invalid offset comes to pread.
376 		 * Negative offset from user will cause this error.
377 		 */
378 
379 		if (fileoff > maxoff) {
380 			error = EINVAL;
381 			goto out;
382 		}
383 		/*
384 		 * Limit offset such that we don't read or write
385 		 * a file beyond the maximum offset representable in
386 		 * an off_t structure.
387 		 */
388 		if (fileoff + bcount > maxoff)
389 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
390 	} else if (vp->v_type == VFIFO) {
391 		error = ESPIPE;
392 		goto out;
393 	}
394 
395 	/*
396 	 * We have to enter the critical region before calling VOP_RWLOCK
397 	 * to avoid a deadlock with ufs.
398 	 */
399 	if (nbl_need_check(vp)) {
400 		int svmand;
401 
402 		nbl_start_crit(vp, RW_READER);
403 		in_crit = 1;
404 		error = nbl_svmand(vp, fp->f_cred, &svmand);
405 		if (error != 0)
406 			goto out;
407 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
408 		    NULL)) {
409 			error = EACCES;
410 			goto out;
411 		}
412 	}
413 
414 	aiov.iov_base = cbuf;
415 	aiov.iov_len = bcount;
416 	(void) VOP_RWLOCK(vp, rwflag, NULL);
417 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
418 		struct vattr va;
419 		va.va_mask = AT_SIZE;
420 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
421 			VOP_RWUNLOCK(vp, rwflag, NULL);
422 			goto out;
423 		}
424 		VOP_RWUNLOCK(vp, rwflag, NULL);
425 
426 		/*
427 		 * We have to return EOF if fileoff is >= file size.
428 		 */
429 		if (fileoff >= va.va_size) {
430 			bcount = 0;
431 			goto out;
432 		}
433 
434 		/*
435 		 * File is greater than or equal to maxoff and therefore
436 		 * we return EOVERFLOW.
437 		 */
438 		error = EOVERFLOW;
439 		goto out;
440 	}
441 	auio.uio_loffset = fileoff;
442 	auio.uio_iov = &aiov;
443 	auio.uio_iovcnt = 1;
444 	auio.uio_resid = bcount;
445 	auio.uio_segflg = UIO_USERSPACE;
446 	auio.uio_llimit = MAXOFFSET_T;
447 	auio.uio_fmode = fflag;
448 	auio.uio_extflg = UIO_COPY_CACHED;
449 
450 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
451 
452 	/* If read sync is not asked for, filter sync flags */
453 	if ((ioflag & FRSYNC) == 0)
454 		ioflag &= ~(FSYNC|FDSYNC);
455 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
456 	bcount -= auio.uio_resid;
457 	CPU_STATS_ENTER_K();
458 	cp = CPU;
459 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
460 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
461 	CPU_STATS_EXIT_K();
462 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
463 	VOP_RWUNLOCK(vp, rwflag, NULL);
464 
465 	if (error == EINTR && bcount != 0)
466 		error = 0;
467 out:
468 	if (in_crit)
469 		nbl_end_crit(vp);
470 	releasef(fdes);
471 	if (error)
472 		return (set_errno(error));
473 	return (bcount);
474 }
475 
476 ssize_t
477 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
478 {
479 	struct uio auio;
480 	struct iovec aiov;
481 	file_t *fp;
482 	register vnode_t *vp;
483 	struct cpu *cp;
484 	int fflag, ioflag, rwflag;
485 	ssize_t bcount;
486 	int error = 0;
487 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
488 #ifdef _SYSCALL32_IMPL
489 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
490 		MAXOFF32_T : MAXOFFSET_T;
491 #else
492 	const u_offset_t maxoff = MAXOFF32_T;
493 #endif
494 	int in_crit = 0;
495 
496 	if ((bcount = (ssize_t)count) < 0)
497 		return (set_errno(EINVAL));
498 	if ((fp = getf(fdes)) == NULL)
499 		return (set_errno(EBADF));
500 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
501 		error = EBADF;
502 		goto out;
503 	}
504 
505 	rwflag = 1;
506 	vp = fp->f_vnode;
507 
508 	if (vp->v_type == VREG) {
509 
510 		if (bcount == 0)
511 			goto out;
512 
513 		/*
514 		 * return EINVAL for offsets that cannot be
515 		 * represented in an off_t.
516 		 */
517 		if (fileoff > maxoff) {
518 			error = EINVAL;
519 			goto out;
520 		}
521 		/*
522 		 * Take appropriate action if we are trying to write above the
523 		 * resource limit.
524 		 */
525 		if (fileoff >= curproc->p_fsz_ctl) {
526 			mutex_enter(&curproc->p_lock);
527 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
528 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
529 			mutex_exit(&curproc->p_lock);
530 
531 			error = EFBIG;
532 			goto out;
533 		}
534 		/*
535 		 * Don't allow pwrite to cause file sizes to exceed
536 		 * maxoff.
537 		 */
538 		if (fileoff == maxoff) {
539 			error = EFBIG;
540 			goto out;
541 		}
542 		if (fileoff + count > maxoff)
543 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
544 	} else if (vp->v_type == VFIFO) {
545 		error = ESPIPE;
546 		goto out;
547 	}
548 
549 	/*
550 	 * We have to enter the critical region before calling VOP_RWLOCK
551 	 * to avoid a deadlock with ufs.
552 	 */
553 	if (nbl_need_check(vp)) {
554 		int svmand;
555 
556 		nbl_start_crit(vp, RW_READER);
557 		in_crit = 1;
558 		error = nbl_svmand(vp, fp->f_cred, &svmand);
559 		if (error != 0)
560 			goto out;
561 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
562 		    NULL)) {
563 			error = EACCES;
564 			goto out;
565 		}
566 	}
567 
568 	aiov.iov_base = cbuf;
569 	aiov.iov_len = bcount;
570 	(void) VOP_RWLOCK(vp, rwflag, NULL);
571 	auio.uio_loffset = fileoff;
572 	auio.uio_iov = &aiov;
573 	auio.uio_iovcnt = 1;
574 	auio.uio_resid = bcount;
575 	auio.uio_segflg = UIO_USERSPACE;
576 	auio.uio_llimit = curproc->p_fsz_ctl;
577 	auio.uio_fmode = fflag;
578 	auio.uio_extflg = UIO_COPY_CACHED;
579 
580 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
581 
582 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
583 	bcount -= auio.uio_resid;
584 	CPU_STATS_ENTER_K();
585 	cp = CPU;
586 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
587 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
588 	CPU_STATS_EXIT_K();
589 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
590 	VOP_RWUNLOCK(vp, rwflag, NULL);
591 
592 	if (error == EINTR && bcount != 0)
593 		error = 0;
594 out:
595 	if (in_crit)
596 		nbl_end_crit(vp);
597 	releasef(fdes);
598 	if (error)
599 		return (set_errno(error));
600 	return (bcount);
601 }
602 
603 /*
604  * XXX -- The SVID refers to IOV_MAX, but doesn't define it.  Grrrr....
605  * XXX -- However, SVVS expects readv() and writev() to fail if
606  * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
607  * XXX -- so I guess that's the "interface".
608  */
609 #define	DEF_IOV_MAX	16
610 
611 ssize_t
612 readv(int fdes, struct iovec *iovp, int iovcnt)
613 {
614 	struct uio auio;
615 	struct iovec aiov[DEF_IOV_MAX];
616 	file_t *fp;
617 	register vnode_t *vp;
618 	struct cpu *cp;
619 	int fflag, ioflag, rwflag;
620 	ssize_t count, bcount;
621 	int error = 0;
622 	int i;
623 	u_offset_t fileoff;
624 	int in_crit = 0;
625 
626 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
627 		return (set_errno(EINVAL));
628 
629 #ifdef _SYSCALL32_IMPL
630 	/*
631 	 * 32-bit callers need to have their iovec expanded,
632 	 * while ensuring that they can't move more than 2Gbytes
633 	 * of data in a single call.
634 	 */
635 	if (get_udatamodel() == DATAMODEL_ILP32) {
636 		struct iovec32 aiov32[DEF_IOV_MAX];
637 		ssize32_t count32;
638 
639 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
640 			return (set_errno(EFAULT));
641 
642 		count32 = 0;
643 		for (i = 0; i < iovcnt; i++) {
644 			ssize32_t iovlen32 = aiov32[i].iov_len;
645 			count32 += iovlen32;
646 			if (iovlen32 < 0 || count32 < 0)
647 				return (set_errno(EINVAL));
648 			aiov[i].iov_len = iovlen32;
649 			aiov[i].iov_base =
650 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
651 		}
652 	} else
653 #endif
654 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
655 		return (set_errno(EFAULT));
656 
657 	count = 0;
658 	for (i = 0; i < iovcnt; i++) {
659 		ssize_t iovlen = aiov[i].iov_len;
660 		count += iovlen;
661 		if (iovlen < 0 || count < 0)
662 			return (set_errno(EINVAL));
663 	}
664 	if ((fp = getf(fdes)) == NULL)
665 		return (set_errno(EBADF));
666 	if (((fflag = fp->f_flag) & FREAD) == 0) {
667 		error = EBADF;
668 		goto out;
669 	}
670 	vp = fp->f_vnode;
671 	if (vp->v_type == VREG && count == 0) {
672 		goto out;
673 	}
674 
675 	rwflag = 0;
676 
677 	/*
678 	 * We have to enter the critical region before calling VOP_RWLOCK
679 	 * to avoid a deadlock with ufs.
680 	 */
681 	if (nbl_need_check(vp)) {
682 		int svmand;
683 
684 		nbl_start_crit(vp, RW_READER);
685 		in_crit = 1;
686 		error = nbl_svmand(vp, fp->f_cred, &svmand);
687 		if (error != 0)
688 			goto out;
689 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
690 		    NULL)) {
691 			error = EACCES;
692 			goto out;
693 		}
694 	}
695 
696 	(void) VOP_RWLOCK(vp, rwflag, NULL);
697 	fileoff = fp->f_offset;
698 
699 	/*
700 	 * Behaviour is same as read. Please see comments in read.
701 	 */
702 
703 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
704 		struct vattr va;
705 		va.va_mask = AT_SIZE;
706 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
707 			VOP_RWUNLOCK(vp, rwflag, NULL);
708 			goto out;
709 		}
710 		if (fileoff >= va.va_size) {
711 			VOP_RWUNLOCK(vp, rwflag, NULL);
712 			count = 0;
713 			goto out;
714 		} else {
715 			VOP_RWUNLOCK(vp, rwflag, NULL);
716 			error = EOVERFLOW;
717 			goto out;
718 		}
719 	}
720 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
721 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
722 	}
723 	auio.uio_loffset = fileoff;
724 	auio.uio_iov = aiov;
725 	auio.uio_iovcnt = iovcnt;
726 	auio.uio_resid = bcount = count;
727 	auio.uio_segflg = UIO_USERSPACE;
728 	auio.uio_llimit = MAXOFFSET_T;
729 	auio.uio_fmode = fflag;
730 	if (bcount <= copyout_max_cached)
731 		auio.uio_extflg = UIO_COPY_CACHED;
732 	else
733 		auio.uio_extflg = UIO_COPY_DEFAULT;
734 
735 
736 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
737 
738 	/* If read sync is not asked for, filter sync flags */
739 	if ((ioflag & FRSYNC) == 0)
740 		ioflag &= ~(FSYNC|FDSYNC);
741 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
742 	count -= auio.uio_resid;
743 	CPU_STATS_ENTER_K();
744 	cp = CPU;
745 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
746 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
747 	CPU_STATS_EXIT_K();
748 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
749 
750 	if (vp->v_type == VFIFO)	/* Backward compatibility */
751 		fp->f_offset = count;
752 	else if (((fp->f_flag & FAPPEND) == 0) ||
753 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
754 		fp->f_offset = auio.uio_loffset;
755 
756 	VOP_RWUNLOCK(vp, rwflag, NULL);
757 
758 	if (error == EINTR && count != 0)
759 		error = 0;
760 out:
761 	if (in_crit)
762 		nbl_end_crit(vp);
763 	releasef(fdes);
764 	if (error)
765 		return (set_errno(error));
766 	return (count);
767 }
768 
769 ssize_t
770 writev(int fdes, struct iovec *iovp, int iovcnt)
771 {
772 	struct uio auio;
773 	struct iovec aiov[DEF_IOV_MAX];
774 	file_t *fp;
775 	register vnode_t *vp;
776 	struct cpu *cp;
777 	int fflag, ioflag, rwflag;
778 	ssize_t count, bcount;
779 	int error = 0;
780 	int i;
781 	u_offset_t fileoff;
782 	int in_crit = 0;
783 
784 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
785 		return (set_errno(EINVAL));
786 
787 #ifdef _SYSCALL32_IMPL
788 	/*
789 	 * 32-bit callers need to have their iovec expanded,
790 	 * while ensuring that they can't move more than 2Gbytes
791 	 * of data in a single call.
792 	 */
793 	if (get_udatamodel() == DATAMODEL_ILP32) {
794 		struct iovec32 aiov32[DEF_IOV_MAX];
795 		ssize32_t count32;
796 
797 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
798 			return (set_errno(EFAULT));
799 
800 		count32 = 0;
801 		for (i = 0; i < iovcnt; i++) {
802 			ssize32_t iovlen = aiov32[i].iov_len;
803 			count32 += iovlen;
804 			if (iovlen < 0 || count32 < 0)
805 				return (set_errno(EINVAL));
806 			aiov[i].iov_len = iovlen;
807 			aiov[i].iov_base =
808 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
809 		}
810 	} else
811 #endif
812 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
813 		return (set_errno(EFAULT));
814 
815 	count = 0;
816 	for (i = 0; i < iovcnt; i++) {
817 		ssize_t iovlen = aiov[i].iov_len;
818 		count += iovlen;
819 		if (iovlen < 0 || count < 0)
820 			return (set_errno(EINVAL));
821 	}
822 	if ((fp = getf(fdes)) == NULL)
823 		return (set_errno(EBADF));
824 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
825 		error = EBADF;
826 		goto out;
827 	}
828 	vp = fp->f_vnode;
829 	if (vp->v_type == VREG && count == 0) {
830 		goto out;
831 	}
832 
833 	rwflag = 1;
834 
835 	/*
836 	 * We have to enter the critical region before calling VOP_RWLOCK
837 	 * to avoid a deadlock with ufs.
838 	 */
839 	if (nbl_need_check(vp)) {
840 		int svmand;
841 
842 		nbl_start_crit(vp, RW_READER);
843 		in_crit = 1;
844 		error = nbl_svmand(vp, fp->f_cred, &svmand);
845 		if (error != 0)
846 			goto out;
847 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
848 		    NULL)) {
849 			error = EACCES;
850 			goto out;
851 		}
852 	}
853 
854 	(void) VOP_RWLOCK(vp, rwflag, NULL);
855 
856 	fileoff = fp->f_offset;
857 
858 	/*
859 	 * Behaviour is same as write. Please see comments for write.
860 	 */
861 
862 	if (vp->v_type == VREG) {
863 		if (fileoff >= curproc->p_fsz_ctl) {
864 			VOP_RWUNLOCK(vp, rwflag, NULL);
865 			mutex_enter(&curproc->p_lock);
866 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
867 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
868 			mutex_exit(&curproc->p_lock);
869 			error = EFBIG;
870 			goto out;
871 		}
872 		if (fileoff >= OFFSET_MAX(fp)) {
873 			VOP_RWUNLOCK(vp, rwflag, NULL);
874 			error = EFBIG;
875 			goto out;
876 		}
877 		if (fileoff + count > OFFSET_MAX(fp))
878 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
879 	}
880 	auio.uio_loffset = fileoff;
881 	auio.uio_iov = aiov;
882 	auio.uio_iovcnt = iovcnt;
883 	auio.uio_resid = bcount = count;
884 	auio.uio_segflg = UIO_USERSPACE;
885 	auio.uio_llimit = curproc->p_fsz_ctl;
886 	auio.uio_fmode = fflag;
887 	auio.uio_extflg = UIO_COPY_DEFAULT;
888 
889 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
890 
891 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
892 	count -= auio.uio_resid;
893 	CPU_STATS_ENTER_K();
894 	cp = CPU;
895 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
896 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
897 	CPU_STATS_EXIT_K();
898 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
899 
900 	if (vp->v_type == VFIFO)	/* Backward compatibility */
901 		fp->f_offset = count;
902 	else if (((fp->f_flag & FAPPEND) == 0) ||
903 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
904 		fp->f_offset = auio.uio_loffset;
905 	VOP_RWUNLOCK(vp, rwflag, NULL);
906 
907 	if (error == EINTR && count != 0)
908 		error = 0;
909 out:
910 	if (in_crit)
911 		nbl_end_crit(vp);
912 	releasef(fdes);
913 	if (error)
914 		return (set_errno(error));
915 	return (count);
916 }
917 
918 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
919 
920 /*
921  * This syscall supplies 64-bit file offsets to 32-bit applications only.
922  */
923 ssize32_t
924 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
925     uint32_t offset_2)
926 {
927 	struct uio auio;
928 	struct iovec aiov;
929 	file_t *fp;
930 	register vnode_t *vp;
931 	struct cpu *cp;
932 	int fflag, ioflag, rwflag;
933 	ssize_t bcount;
934 	int error = 0;
935 	u_offset_t fileoff;
936 	int in_crit = 0;
937 
938 #if defined(_LITTLE_ENDIAN)
939 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
940 #else
941 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
942 #endif
943 
944 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
945 		return (set_errno(EINVAL));
946 
947 	if ((fp = getf(fdes)) == NULL)
948 		return (set_errno(EBADF));
949 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
950 		error = EBADF;
951 		goto out;
952 	}
953 
954 	rwflag = 0;
955 	vp = fp->f_vnode;
956 
957 	if (vp->v_type == VREG) {
958 
959 		if (bcount == 0)
960 			goto out;
961 
962 		/*
963 		 * Same as pread. See comments in pread.
964 		 */
965 
966 		if (fileoff > MAXOFFSET_T) {
967 			error = EINVAL;
968 			goto out;
969 		}
970 		if (fileoff + bcount > MAXOFFSET_T)
971 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
972 	} else if (vp->v_type == VFIFO) {
973 		error = ESPIPE;
974 		goto out;
975 	}
976 
977 	/*
978 	 * We have to enter the critical region before calling VOP_RWLOCK
979 	 * to avoid a deadlock with ufs.
980 	 */
981 	if (nbl_need_check(vp)) {
982 		int svmand;
983 
984 		nbl_start_crit(vp, RW_READER);
985 		in_crit = 1;
986 		error = nbl_svmand(vp, fp->f_cred, &svmand);
987 		if (error != 0)
988 			goto out;
989 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
990 		    NULL)) {
991 			error = EACCES;
992 			goto out;
993 		}
994 	}
995 
996 	aiov.iov_base = cbuf;
997 	aiov.iov_len = bcount;
998 	(void) VOP_RWLOCK(vp, rwflag, NULL);
999 	auio.uio_loffset = fileoff;
1000 
1001 	/*
1002 	 * Note: File size can never be greater than MAXOFFSET_T.
1003 	 * If ever we start supporting 128 bit files the code
1004 	 * similar to the one in pread at this place should be here.
1005 	 * Here we avoid the unnecessary VOP_GETATTR() when we
1006 	 * know that fileoff == MAXOFFSET_T implies that it is always
1007 	 * greater than or equal to file size.
1008 	 */
1009 	auio.uio_iov = &aiov;
1010 	auio.uio_iovcnt = 1;
1011 	auio.uio_resid = bcount;
1012 	auio.uio_segflg = UIO_USERSPACE;
1013 	auio.uio_llimit = MAXOFFSET_T;
1014 	auio.uio_fmode = fflag;
1015 	auio.uio_extflg = UIO_COPY_CACHED;
1016 
1017 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1018 
1019 	/* If read sync is not asked for, filter sync flags */
1020 	if ((ioflag & FRSYNC) == 0)
1021 		ioflag &= ~(FSYNC|FDSYNC);
1022 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1023 	bcount -= auio.uio_resid;
1024 	CPU_STATS_ENTER_K();
1025 	cp = CPU;
1026 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1027 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1028 	CPU_STATS_EXIT_K();
1029 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1030 	VOP_RWUNLOCK(vp, rwflag, NULL);
1031 
1032 	if (error == EINTR && bcount != 0)
1033 		error = 0;
1034 out:
1035 	if (in_crit)
1036 		nbl_end_crit(vp);
1037 	releasef(fdes);
1038 	if (error)
1039 		return (set_errno(error));
1040 	return (bcount);
1041 }
1042 
1043 /*
1044  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1045  */
1046 ssize32_t
1047 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1048     uint32_t offset_2)
1049 {
1050 	struct uio auio;
1051 	struct iovec aiov;
1052 	file_t *fp;
1053 	register vnode_t *vp;
1054 	struct cpu *cp;
1055 	int fflag, ioflag, rwflag;
1056 	ssize_t bcount;
1057 	int error = 0;
1058 	u_offset_t fileoff;
1059 	int in_crit = 0;
1060 
1061 #if defined(_LITTLE_ENDIAN)
1062 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1063 #else
1064 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1065 #endif
1066 
1067 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1068 		return (set_errno(EINVAL));
1069 	if ((fp = getf(fdes)) == NULL)
1070 		return (set_errno(EBADF));
1071 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1072 		error = EBADF;
1073 		goto out;
1074 	}
1075 
1076 	rwflag = 1;
1077 	vp = fp->f_vnode;
1078 
1079 	if (vp->v_type == VREG) {
1080 
1081 		if (bcount == 0)
1082 			goto out;
1083 
1084 		/*
1085 		 * See comments in pwrite.
1086 		 */
1087 		if (fileoff > MAXOFFSET_T) {
1088 			error = EINVAL;
1089 			goto out;
1090 		}
1091 		if (fileoff >= curproc->p_fsz_ctl) {
1092 			mutex_enter(&curproc->p_lock);
1093 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1094 			    curproc->p_rctls, curproc, RCA_SAFE);
1095 			mutex_exit(&curproc->p_lock);
1096 			error = EFBIG;
1097 			goto out;
1098 		}
1099 		if (fileoff == MAXOFFSET_T) {
1100 			error = EFBIG;
1101 			goto out;
1102 		}
1103 		if (fileoff + bcount > MAXOFFSET_T)
1104 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1105 	} else if (vp->v_type == VFIFO) {
1106 		error = ESPIPE;
1107 		goto out;
1108 	}
1109 
1110 	/*
1111 	 * We have to enter the critical region before calling VOP_RWLOCK
1112 	 * to avoid a deadlock with ufs.
1113 	 */
1114 	if (nbl_need_check(vp)) {
1115 		int svmand;
1116 
1117 		nbl_start_crit(vp, RW_READER);
1118 		in_crit = 1;
1119 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1120 		if (error != 0)
1121 			goto out;
1122 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1123 		    NULL)) {
1124 			error = EACCES;
1125 			goto out;
1126 		}
1127 	}
1128 
1129 	aiov.iov_base = cbuf;
1130 	aiov.iov_len = bcount;
1131 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1132 	auio.uio_loffset = fileoff;
1133 	auio.uio_iov = &aiov;
1134 	auio.uio_iovcnt = 1;
1135 	auio.uio_resid = bcount;
1136 	auio.uio_segflg = UIO_USERSPACE;
1137 	auio.uio_llimit = curproc->p_fsz_ctl;
1138 	auio.uio_fmode = fflag;
1139 	auio.uio_extflg = UIO_COPY_CACHED;
1140 
1141 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1142 
1143 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1144 	bcount -= auio.uio_resid;
1145 	CPU_STATS_ENTER_K();
1146 	cp = CPU;
1147 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1148 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1149 	CPU_STATS_EXIT_K();
1150 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1151 	VOP_RWUNLOCK(vp, rwflag, NULL);
1152 
1153 	if (error == EINTR && bcount != 0)
1154 		error = 0;
1155 out:
1156 	if (in_crit)
1157 		nbl_end_crit(vp);
1158 	releasef(fdes);
1159 	if (error)
1160 		return (set_errno(error));
1161 	return (bcount);
1162 }
1163 
1164 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1165 
1166 #ifdef _SYSCALL32_IMPL
1167 /*
1168  * Tail-call elimination of xxx32() down to xxx()
1169  *
1170  * A number of xxx32 system calls take a len (or count) argument and
1171  * return a number in the range [0,len] or -1 on error.
1172  * Given an ssize32_t input len, the downcall xxx() will return
1173  * a 64-bit value that is -1 or in the range [0,len] which actually
1174  * is a proper return value for the xxx32 call. So even if the xxx32
1175  * calls can be considered as returning a ssize32_t, they are currently
1176  * declared as returning a ssize_t as this enables tail-call elimination.
1177  *
1178  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1179  * down negative input values as such and let the downcall handle error
1180  * reporting. Functions covered by this comments are:
1181  *
1182  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1183  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1184  * readlink.c:     readlink32.
1185  */
1186 
1187 ssize_t
1188 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1189 {
1190 	return (read(fdes,
1191 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1192 }
1193 
1194 ssize_t
1195 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1196 {
1197 	return (write(fdes,
1198 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1199 }
1200 
1201 ssize_t
1202 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1203 {
1204 	return (pread(fdes,
1205 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1206 	    (off_t)(uint32_t)offset));
1207 }
1208 
1209 ssize_t
1210 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1211 {
1212 	return (pwrite(fdes,
1213 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1214 	    (off_t)(uint32_t)offset));
1215 }
1216 
1217 ssize_t
1218 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1219 {
1220 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1221 }
1222 
1223 ssize_t
1224 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1225 {
1226 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1227 }
1228 
1229 #endif	/* _SYSCALL32_IMPL */
1230