xref: /illumos-gate/usr/src/uts/common/syscall/rw.c (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/param.h>
38 #include <sys/isa_defs.h>
39 #include <sys/types.h>
40 #include <sys/inttypes.h>
41 #include <sys/sysmacros.h>
42 #include <sys/cred.h>
43 #include <sys/user.h>
44 #include <sys/systm.h>
45 #include <sys/errno.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/cpuvar.h>
50 #include <sys/uio.h>
51 #include <sys/ioreq.h>
52 #include <sys/debug.h>
53 #include <sys/rctl.h>
54 #include <sys/nbmlock.h>
55 
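/*
 * Reads smaller than copyout_min_size force a cached copy
 * (UIO_COPY_CACHED) in read() and readv(); larger reads use
 * UIO_COPY_DEFAULT, which permits a cache-bypassing copy for
 * large transfers.
 */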
56 #define	COPYOUT_MIN_SIZE	(1<<17)		/* 128K */
57 
58 static size_t copyout_min_size = COPYOUT_MIN_SIZE;
59 
60 /*
61  * read, write, pread, pwrite, readv, and writev syscalls.
62  *
63  * 64-bit open:	all opens are large file opens.
64  * Large Files: the behaviour of read depends on whether the fd
65  *		corresponds to a large file open or not.
66  * 32-bit open:	FOFFMAX flag not set.
67  *		Reads succeed up to offset MAXOFF32_T - 1; a read at
68  *		MAXOFF32_T returns EOVERFLOW if count is non-zero and the
69  *		file size is > MAXOFF32_T.  If the file size is <= MAXOFF32_T,
70  *		a read at an offset >= MAXOFF32_T returns EOF.
71  */
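/*
 * Throughout this file, OFFSET_MAX(fp) (per sys/file.h) is the largest
 * offset this open file can address: MAXOFFSET_T for large file
 * (FOFFMAX) opens and MAXOFF32_T otherwise.
 */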
72 
73 /*
74  * Native system call
75  */
76 ssize_t
77 read(int fdes, void *cbuf, size_t count)
78 {
79 	struct uio auio;
80 	struct iovec aiov;
81 	file_t *fp;
82 	register vnode_t *vp;
83 	struct cpu *cp;
84 	int fflag, ioflag, rwflag;
85 	ssize_t cnt, bcount;
86 	int error = 0;
87 	u_offset_t fileoff;
88 	int in_crit = 0;
89 
90 	if ((cnt = (ssize_t)count) < 0)
91 		return (set_errno(EINVAL));
92 	if ((fp = getf(fdes)) == NULL)
93 		return (set_errno(EBADF));
94 	if (((fflag = fp->f_flag) & FREAD) == 0) {
95 		error = EBADF;
96 		goto out;
97 	}
98 	vp = fp->f_vnode;
99 
100 	if (vp->v_type == VREG && cnt == 0) {
101 		goto out;
102 	}
103 
104 	rwflag = 0;
105 	aiov.iov_base = cbuf;
106 	aiov.iov_len = cnt;
107 
108 	/*
109 	 * We have to enter the critical region before calling VOP_RWLOCK
110 	 * to avoid a deadlock with write() calls.
111 	 */
112 	if (nbl_need_check(vp)) {
113 		int svmand;
114 
115 		nbl_start_crit(vp, RW_READER);
116 		in_crit = 1;
117 		error = nbl_svmand(vp, fp->f_cred, &svmand);
118 		if (error != 0)
119 			goto out;
120 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand)) {
121 			error = EACCES;
122 			goto out;
123 		}
124 	}
125 
126 	(void) VOP_RWLOCK(vp, rwflag, NULL);
127 
128 	/*
129 	 * We do the following checks inside VOP_RWLOCK so as to
130 	 * prevent file size from changing while these checks are
131 	 * being done. Also, we load fp's offset to the local
132 	 * being done.  Also, we load fp's offset into the local
133 	 * variable fileoff because a parallel lseek could be
134 	 * going on (f_offset is not protected by any lock) and
135 	 * could change f_offset.  We need to read the value only
136 	 * once here and make a decision.  Reading it more than once
137 	 * can lead to incorrect behaviour.
138 
139 	fileoff = (u_offset_t)fp->f_offset;
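	/*
	 * If the cached offset is already at or beyond this open's offset
	 * limit, a read of a regular file either hits EOF (the offset is at
	 * or past the file size) or fails with EOVERFLOW (there is more data,
	 * but it cannot be reached through this small-offset open).
	 * readv() performs the same check.
	 */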
140 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
141 		struct vattr va;
142 		va.va_mask = AT_SIZE;
143 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred)))  {
144 			VOP_RWUNLOCK(vp, rwflag, NULL);
145 			goto out;
146 		}
147 		if (fileoff >= va.va_size) {
148 			cnt = 0;
149 			VOP_RWUNLOCK(vp, rwflag, NULL);
150 			goto out;
151 		} else {
152 			error = EOVERFLOW;
153 			VOP_RWUNLOCK(vp, rwflag, NULL);
154 			goto out;
155 		}
156 	}
157 	if ((vp->v_type == VREG) &&
158 	    (fileoff + cnt > OFFSET_MAX(fp))) {
159 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
160 	}
161 	auio.uio_loffset = fileoff;
162 	auio.uio_iov = &aiov;
163 	auio.uio_iovcnt = 1;
164 	auio.uio_resid = bcount = cnt;
165 	auio.uio_segflg = UIO_USERSPACE;
166 	auio.uio_llimit = MAXOFFSET_T;
167 	auio.uio_fmode = fflag;
168 	/*
169 	 * Only allow the caches to be bypassed when the count is large enough.
170 	 */
171 	if (bcount < copyout_min_size)
172 		auio.uio_extflg = UIO_COPY_CACHED;
173 	else
174 		auio.uio_extflg = UIO_COPY_DEFAULT;
175 
176 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
177 
178 	/* If read sync is not asked for, filter sync flags */
179 	if ((ioflag & FRSYNC) == 0)
180 		ioflag &= ~(FSYNC|FDSYNC);
181 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
182 	cnt -= auio.uio_resid;
183 	CPU_STATS_ENTER_K();
184 	cp = CPU;
185 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
186 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
187 	CPU_STATS_EXIT_K();
188 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
189 
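	/*
	 * Update the seek offset.  Pipes have no real offset, so f_offset is
	 * (historically) set to the byte count just transferred; otherwise
	 * the new offset comes from the uio, except for a zero-length
	 * transfer on a regular file opened with O_APPEND, which leaves
	 * f_offset alone.  write(), readv() and writev() do the same.
	 */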
190 	if (vp->v_type == VFIFO)	/* Backward compatibility */
191 		fp->f_offset = cnt;
192 	else if (((fp->f_flag & FAPPEND) == 0) ||
193 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
194 		fp->f_offset = auio.uio_loffset;
195 	VOP_RWUNLOCK(vp, rwflag, NULL);
196 
197 	if (error == EINTR && cnt != 0)
198 		error = 0;
199 out:
200 	if (in_crit)
201 		nbl_end_crit(vp);
202 	releasef(fdes);
203 	if (error)
204 		return (set_errno(error));
205 	return (cnt);
206 }
207 
208 /*
209  * Native system call
210  */
211 ssize_t
212 write(int fdes, void *cbuf, size_t count)
213 {
214 	struct uio auio;
215 	struct iovec aiov;
216 	file_t *fp;
217 	register vnode_t *vp;
218 	struct cpu *cp;
219 	int fflag, ioflag, rwflag;
220 	ssize_t cnt, bcount;
221 	int error = 0;
222 	u_offset_t fileoff;
223 	int in_crit = 0;
224 
225 	if ((cnt = (ssize_t)count) < 0)
226 		return (set_errno(EINVAL));
227 	if ((fp = getf(fdes)) == NULL)
228 		return (set_errno(EBADF));
229 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
230 		error = EBADF;
231 		goto out;
232 	}
233 	vp = fp->f_vnode;
234 
235 	if (vp->v_type == VREG && cnt == 0) {
236 		goto out;
237 	}
238 
239 	rwflag = 1;
240 	aiov.iov_base = cbuf;
241 	aiov.iov_len = cnt;
242 
243 	/*
244 	 * We have to enter the critical region before calling VOP_RWLOCK
245 	 * to avoid a deadlock with ufs.
246 	 */
247 	if (nbl_need_check(vp)) {
248 		int svmand;
249 
250 		nbl_start_crit(vp, RW_READER);
251 		in_crit = 1;
252 		error = nbl_svmand(vp, fp->f_cred, &svmand);
253 		if (error != 0)
254 			goto out;
255 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand)) {
256 			error = EACCES;
257 			goto out;
258 		}
259 	}
260 
261 	(void) VOP_RWLOCK(vp, rwflag, NULL);
262 
263 	fileoff = fp->f_offset;
264 	if (vp->v_type == VREG) {
265 
266 		/*
267 		 * Signal the process and return EFBIG if a non-zero-length
268 		 * write would start at or beyond the file size limit (ulimit).
269 		 */
270 		if (fileoff >= curproc->p_fsz_ctl) {
271 			VOP_RWUNLOCK(vp, rwflag, NULL);
272 
273 			mutex_enter(&curproc->p_lock);
274 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
275 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
276 			mutex_exit(&curproc->p_lock);
277 
278 			error = EFBIG;
279 			goto out;
280 		}
281 		/*
282 		 * We return EFBIG if the write is done at an offset at or
283 		 * beyond the offset maximum for this open file structure.
284 		 */
285 
286 		if (fileoff >= OFFSET_MAX(fp)) {
287 			VOP_RWUNLOCK(vp, rwflag, NULL);
288 			error = EFBIG;
289 			goto out;
290 		}
291 		/*
292 		 * Limit the number of bytes to be written to the offset
293 		 * maximum for this open file structure.
294 		 */
295 		if (fileoff + cnt > OFFSET_MAX(fp))
296 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
297 	}
298 	auio.uio_loffset = fileoff;
299 	auio.uio_iov = &aiov;
300 	auio.uio_iovcnt = 1;
301 	auio.uio_resid = bcount = cnt;
302 	auio.uio_segflg = UIO_USERSPACE;
303 	auio.uio_llimit = curproc->p_fsz_ctl;
304 	auio.uio_fmode = fflag;
305 	auio.uio_extflg = UIO_COPY_DEFAULT;
306 
307 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
308 
309 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
310 	cnt -= auio.uio_resid;
311 	CPU_STATS_ENTER_K();
312 	cp = CPU;
313 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
314 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
315 	CPU_STATS_EXIT_K();
316 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
317 
318 	if (vp->v_type == VFIFO)	/* Backward compatibility */
319 		fp->f_offset = cnt;
320 	else if (((fp->f_flag & FAPPEND) == 0) ||
321 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
322 		fp->f_offset = auio.uio_loffset;
323 	VOP_RWUNLOCK(vp, rwflag, NULL);
324 
325 	if (error == EINTR && cnt != 0)
326 		error = 0;
327 out:
328 	if (in_crit)
329 		nbl_end_crit(vp);
330 	releasef(fdes);
331 	if (error)
332 		return (set_errno(error));
333 	return (cnt);
334 }
335 
336 ssize_t
337 pread(int fdes, void *cbuf, size_t count, off_t offset)
338 {
339 	struct uio auio;
340 	struct iovec aiov;
341 	file_t *fp;
342 	register vnode_t *vp;
343 	struct cpu *cp;
344 	int fflag, ioflag, rwflag;
345 	ssize_t bcount;
346 	int error = 0;
347 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
348 #ifdef _SYSCALL32_IMPL
349 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
350 		MAXOFF32_T : MAXOFFSET_T;
351 #else
352 	const u_offset_t maxoff = MAXOFF32_T;
353 #endif
354 	int in_crit = 0;
355 
356 	if ((bcount = (ssize_t)count) < 0)
357 		return (set_errno(EINVAL));
358 
359 	if ((fp = getf(fdes)) == NULL)
360 		return (set_errno(EBADF));
361 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
362 		error = EBADF;
363 		goto out;
364 	}
365 
366 	rwflag = 0;
367 	vp = fp->f_vnode;
368 
369 	if (vp->v_type == VREG) {
370 
371 		if (bcount == 0)
372 			goto out;
373 
374 		/*
375 		 * Return EINVAL if an invalid offset is passed to pread.
376 		 * A negative offset from the user will cause this error.
377 		 */
378 
379 		if (fileoff > maxoff) {
380 			error = EINVAL;
381 			goto out;
382 		}
383 		/*
384 		 * Limit the transfer length so that we don't read or write
385 		 * the file beyond the maximum offset representable in
386 		 * an off_t.
387 		 */
388 		if (fileoff + bcount > maxoff)
389 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
390 	} else if (vp->v_type == VFIFO) {
391 		error = ESPIPE;
392 		goto out;
393 	}
394 
395 	/*
396 	 * We have to enter the critical region before calling VOP_RWLOCK
397 	 * to avoid a deadlock with ufs.
398 	 */
399 	if (nbl_need_check(vp)) {
400 		int svmand;
401 
402 		nbl_start_crit(vp, RW_READER);
403 		in_crit = 1;
404 		error = nbl_svmand(vp, fp->f_cred, &svmand);
405 		if (error != 0)
406 			goto out;
407 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
408 			error = EACCES;
409 			goto out;
410 		}
411 	}
412 
413 	aiov.iov_base = cbuf;
414 	aiov.iov_len = bcount;
415 	(void) VOP_RWLOCK(vp, rwflag, NULL);
416 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
417 		struct vattr va;
418 		va.va_mask = AT_SIZE;
419 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) {
420 			VOP_RWUNLOCK(vp, rwflag, NULL);
421 			goto out;
422 		}
423 		VOP_RWUNLOCK(vp, rwflag, NULL);
424 
425 		/*
426 		 * We have to return EOF if fileoff is >= file size.
427 		 */
428 		if (fileoff >= va.va_size) {
429 			bcount = 0;
430 			goto out;
431 		}
432 
433 		/*
434 		 * The file size is greater than or equal to maxoff, so we
435 		 * return EOVERFLOW.
436 		 */
437 		error = EOVERFLOW;
438 		goto out;
439 	}
440 	auio.uio_loffset = fileoff;
441 	auio.uio_iov = &aiov;
442 	auio.uio_iovcnt = 1;
443 	auio.uio_resid = bcount;
444 	auio.uio_segflg = UIO_USERSPACE;
445 	auio.uio_llimit = MAXOFFSET_T;
446 	auio.uio_fmode = fflag;
447 	auio.uio_extflg = UIO_COPY_CACHED;
448 
449 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
450 
451 	/* If read sync is not asked for, filter sync flags */
452 	if ((ioflag & FRSYNC) == 0)
453 		ioflag &= ~(FSYNC|FDSYNC);
454 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
455 	bcount -= auio.uio_resid;
456 	CPU_STATS_ENTER_K();
457 	cp = CPU;
458 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
459 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
460 	CPU_STATS_EXIT_K();
461 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
462 	VOP_RWUNLOCK(vp, rwflag, NULL);
463 
464 	if (error == EINTR && bcount != 0)
465 		error = 0;
466 out:
467 	if (in_crit)
468 		nbl_end_crit(vp);
469 	releasef(fdes);
470 	if (error)
471 		return (set_errno(error));
472 	return (bcount);
473 }
474 
475 ssize_t
476 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
477 {
478 	struct uio auio;
479 	struct iovec aiov;
480 	file_t *fp;
481 	register vnode_t *vp;
482 	struct cpu *cp;
483 	int fflag, ioflag, rwflag;
484 	ssize_t bcount;
485 	int error = 0;
486 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
487 #ifdef _SYSCALL32_IMPL
488 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
489 		MAXOFF32_T : MAXOFFSET_T;
490 #else
491 	const u_offset_t maxoff = MAXOFF32_T;
492 #endif
493 	int in_crit = 0;
494 
495 	if ((bcount = (ssize_t)count) < 0)
496 		return (set_errno(EINVAL));
497 	if ((fp = getf(fdes)) == NULL)
498 		return (set_errno(EBADF));
499 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
500 		error = EBADF;
501 		goto out;
502 	}
503 
504 	rwflag = 1;
505 	vp = fp->f_vnode;
506 
507 	if (vp->v_type == VREG) {
508 
509 		if (bcount == 0)
510 			goto out;
511 
512 		/*
513 		 * return EINVAL for offsets that cannot be
514 		 * represented in an off_t.
515 		 */
516 		if (fileoff > maxoff) {
517 			error = EINVAL;
518 			goto out;
519 		}
520 		/*
521 		 * Take appropriate action if we are trying to write above the
522 		 * resource limit.
523 		 */
524 		if (fileoff >= curproc->p_fsz_ctl) {
525 			mutex_enter(&curproc->p_lock);
526 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
527 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
528 			mutex_exit(&curproc->p_lock);
529 
530 			error = EFBIG;
531 			goto out;
532 		}
533 		/*
534 		 * Don't allow pwrite to cause file sizes to exceed
535 		 * maxoff.
536 		 */
537 		if (fileoff == maxoff) {
538 			error = EFBIG;
539 			goto out;
540 		}
541 		if (fileoff + count > maxoff)
542 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
543 	} else if (vp->v_type == VFIFO) {
544 		error = ESPIPE;
545 		goto out;
546 	}
547 
548 	/*
549 	 * We have to enter the critical region before calling VOP_RWLOCK
550 	 * to avoid a deadlock with ufs.
551 	 */
552 	if (nbl_need_check(vp)) {
553 		int svmand;
554 
555 		nbl_start_crit(vp, RW_READER);
556 		in_crit = 1;
557 		error = nbl_svmand(vp, fp->f_cred, &svmand);
558 		if (error != 0)
559 			goto out;
560 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
561 			error = EACCES;
562 			goto out;
563 		}
564 	}
565 
566 	aiov.iov_base = cbuf;
567 	aiov.iov_len = bcount;
568 	(void) VOP_RWLOCK(vp, rwflag, NULL);
569 	auio.uio_loffset = fileoff;
570 	auio.uio_iov = &aiov;
571 	auio.uio_iovcnt = 1;
572 	auio.uio_resid = bcount;
573 	auio.uio_segflg = UIO_USERSPACE;
574 	auio.uio_llimit = curproc->p_fsz_ctl;
575 	auio.uio_fmode = fflag;
576 	auio.uio_extflg = UIO_COPY_CACHED;
577 
578 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
579 
580 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
581 	bcount -= auio.uio_resid;
582 	CPU_STATS_ENTER_K();
583 	cp = CPU;
584 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
585 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
586 	CPU_STATS_EXIT_K();
587 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
588 	VOP_RWUNLOCK(vp, rwflag, NULL);
589 
590 	if (error == EINTR && bcount != 0)
591 		error = 0;
592 out:
593 	if (in_crit)
594 		nbl_end_crit(vp);
595 	releasef(fdes);
596 	if (error)
597 		return (set_errno(error));
598 	return (bcount);
599 }
600 
601 /*
602  * XXX -- The SVID refers to IOV_MAX, but doesn't define it.  Grrrr....
603  * XXX -- However, SVVS expects readv() and writev() to fail if
604  * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
605  * XXX -- so I guess that's the "interface".
606  */
607 #define	DEF_IOV_MAX	16
608 
609 ssize_t
610 readv(int fdes, struct iovec *iovp, int iovcnt)
611 {
612 	struct uio auio;
613 	struct iovec aiov[DEF_IOV_MAX];
614 	file_t *fp;
615 	register vnode_t *vp;
616 	struct cpu *cp;
617 	int fflag, ioflag, rwflag;
618 	ssize_t count, bcount;
619 	int error = 0;
620 	int i;
621 	u_offset_t fileoff;
622 	int in_crit = 0;
623 
624 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
625 		return (set_errno(EINVAL));
626 
627 #ifdef _SYSCALL32_IMPL
628 	/*
629 	 * 32-bit callers need to have their iovec expanded,
630 	 * while ensuring that they can't move more than 2Gbytes
631 	 * of data in a single call.
632 	 */
633 	if (get_udatamodel() == DATAMODEL_ILP32) {
634 		struct iovec32 aiov32[DEF_IOV_MAX];
635 		ssize32_t count32;
636 
637 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
638 			return (set_errno(EFAULT));
639 
640 		count32 = 0;
641 		for (i = 0; i < iovcnt; i++) {
642 			ssize32_t iovlen32 = aiov32[i].iov_len;
643 			count32 += iovlen32;
644 			if (iovlen32 < 0 || count32 < 0)
645 				return (set_errno(EINVAL));
646 			aiov[i].iov_len = iovlen32;
647 			aiov[i].iov_base =
648 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
649 		}
650 	} else
651 #endif
652 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
653 		return (set_errno(EFAULT));
654 
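	/*
	 * Total the iovec lengths.  A negative iov_len, or a running total
	 * that overflows to a negative ssize_t, makes the request invalid.
	 * writev() performs the same validation.
	 */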
655 	count = 0;
656 	for (i = 0; i < iovcnt; i++) {
657 		ssize_t iovlen = aiov[i].iov_len;
658 		count += iovlen;
659 		if (iovlen < 0 || count < 0)
660 			return (set_errno(EINVAL));
661 	}
662 	if ((fp = getf(fdes)) == NULL)
663 		return (set_errno(EBADF));
664 	if (((fflag = fp->f_flag) & FREAD) == 0) {
665 		error = EBADF;
666 		goto out;
667 	}
668 	vp = fp->f_vnode;
669 	if (vp->v_type == VREG && count == 0) {
670 		goto out;
671 	}
672 
673 	rwflag = 0;
674 
675 	/*
676 	 * We have to enter the critical region before calling VOP_RWLOCK
677 	 * to avoid a deadlock with ufs.
678 	 */
679 	if (nbl_need_check(vp)) {
680 		int svmand;
681 
682 		nbl_start_crit(vp, RW_READER);
683 		in_crit = 1;
684 		error = nbl_svmand(vp, fp->f_cred, &svmand);
685 		if (error != 0)
686 			goto out;
687 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand)) {
688 			error = EACCES;
689 			goto out;
690 		}
691 	}
692 
693 	(void) VOP_RWLOCK(vp, rwflag, NULL);
694 	fileoff = fp->f_offset;
695 
696 	/*
697 	 * Behaviour is the same as for read().  Please see the comments in read().
698 	 */
699 
700 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
701 		struct vattr va;
702 		va.va_mask = AT_SIZE;
703 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred)))  {
704 			VOP_RWUNLOCK(vp, rwflag, NULL);
705 			goto out;
706 		}
707 		if (fileoff >= va.va_size) {
708 			VOP_RWUNLOCK(vp, rwflag, NULL);
709 			count = 0;
710 			goto out;
711 		} else {
712 			VOP_RWUNLOCK(vp, rwflag, NULL);
713 			error = EOVERFLOW;
714 			goto out;
715 		}
716 	}
717 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
718 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
719 	}
720 	auio.uio_loffset = fileoff;
721 	auio.uio_iov = aiov;
722 	auio.uio_iovcnt = iovcnt;
723 	auio.uio_resid = bcount = count;
724 	auio.uio_segflg = UIO_USERSPACE;
725 	auio.uio_llimit = MAXOFFSET_T;
726 	auio.uio_fmode = fflag;
727 	if (bcount < copyout_min_size)
728 		auio.uio_extflg = UIO_COPY_CACHED;
729 	else
730 		auio.uio_extflg = UIO_COPY_DEFAULT;
731 
732 
733 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
734 
735 	/* If read sync is not asked for, filter sync flags */
736 	if ((ioflag & FRSYNC) == 0)
737 		ioflag &= ~(FSYNC|FDSYNC);
738 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
739 	count -= auio.uio_resid;
740 	CPU_STATS_ENTER_K();
741 	cp = CPU;
742 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
743 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
744 	CPU_STATS_EXIT_K();
745 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
746 
747 	if (vp->v_type == VFIFO)	/* Backward compatibility */
748 		fp->f_offset = count;
749 	else if (((fp->f_flag & FAPPEND) == 0) ||
750 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
751 		fp->f_offset = auio.uio_loffset;
752 
753 	VOP_RWUNLOCK(vp, rwflag, NULL);
754 
755 	if (error == EINTR && count != 0)
756 		error = 0;
757 out:
758 	if (in_crit)
759 		nbl_end_crit(vp);
760 	releasef(fdes);
761 	if (error)
762 		return (set_errno(error));
763 	return (count);
764 }
765 
766 ssize_t
767 writev(int fdes, struct iovec *iovp, int iovcnt)
768 {
769 	struct uio auio;
770 	struct iovec aiov[DEF_IOV_MAX];
771 	file_t *fp;
772 	register vnode_t *vp;
773 	struct cpu *cp;
774 	int fflag, ioflag, rwflag;
775 	ssize_t count, bcount;
776 	int error = 0;
777 	int i;
778 	u_offset_t fileoff;
779 	int in_crit = 0;
780 
781 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
782 		return (set_errno(EINVAL));
783 
784 #ifdef _SYSCALL32_IMPL
785 	/*
786 	 * 32-bit callers need to have their iovec expanded,
787 	 * while ensuring that they can't move more than 2Gbytes
788 	 * of data in a single call.
789 	 */
790 	if (get_udatamodel() == DATAMODEL_ILP32) {
791 		struct iovec32 aiov32[DEF_IOV_MAX];
792 		ssize32_t count32;
793 
794 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
795 			return (set_errno(EFAULT));
796 
797 		count32 = 0;
798 		for (i = 0; i < iovcnt; i++) {
799 			ssize32_t iovlen = aiov32[i].iov_len;
800 			count32 += iovlen;
801 			if (iovlen < 0 || count32 < 0)
802 				return (set_errno(EINVAL));
803 			aiov[i].iov_len = iovlen;
804 			aiov[i].iov_base =
805 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
806 		}
807 	} else
808 #endif
809 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
810 		return (set_errno(EFAULT));
811 
812 	count = 0;
813 	for (i = 0; i < iovcnt; i++) {
814 		ssize_t iovlen = aiov[i].iov_len;
815 		count += iovlen;
816 		if (iovlen < 0 || count < 0)
817 			return (set_errno(EINVAL));
818 	}
819 	if ((fp = getf(fdes)) == NULL)
820 		return (set_errno(EBADF));
821 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
822 		error = EBADF;
823 		goto out;
824 	}
825 	vp = fp->f_vnode;
826 	if (vp->v_type == VREG && count == 0) {
827 		goto out;
828 	}
829 
830 	rwflag = 1;
831 
832 	/*
833 	 * We have to enter the critical region before calling VOP_RWLOCK
834 	 * to avoid a deadlock with ufs.
835 	 */
836 	if (nbl_need_check(vp)) {
837 		int svmand;
838 
839 		nbl_start_crit(vp, RW_READER);
840 		in_crit = 1;
841 		error = nbl_svmand(vp, fp->f_cred, &svmand);
842 		if (error != 0)
843 			goto out;
844 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand)) {
845 			error = EACCES;
846 			goto out;
847 		}
848 	}
849 
850 	(void) VOP_RWLOCK(vp, rwflag, NULL);
851 
852 	fileoff = fp->f_offset;
853 
854 	/*
855 	 * Behaviour is the same as for write().  Please see the comments in write().
856 	 */
857 
858 	if (vp->v_type == VREG) {
859 		if (fileoff >= curproc->p_fsz_ctl) {
860 			VOP_RWUNLOCK(vp, rwflag, NULL);
861 			mutex_enter(&curproc->p_lock);
862 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
863 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
864 			mutex_exit(&curproc->p_lock);
865 			error = EFBIG;
866 			goto out;
867 		}
868 		if (fileoff >= OFFSET_MAX(fp)) {
869 			VOP_RWUNLOCK(vp, rwflag, NULL);
870 			error = EFBIG;
871 			goto out;
872 		}
873 		if (fileoff + count > OFFSET_MAX(fp))
874 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
875 	}
876 	auio.uio_loffset = fileoff;
877 	auio.uio_iov = aiov;
878 	auio.uio_iovcnt = iovcnt;
879 	auio.uio_resid = bcount = count;
880 	auio.uio_segflg = UIO_USERSPACE;
881 	auio.uio_llimit = curproc->p_fsz_ctl;
882 	auio.uio_fmode = fflag;
883 	auio.uio_extflg = UIO_COPY_DEFAULT;
884 
885 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
886 
887 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
888 	count -= auio.uio_resid;
889 	CPU_STATS_ENTER_K();
890 	cp = CPU;
891 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
892 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
893 	CPU_STATS_EXIT_K();
894 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
895 
896 	if (vp->v_type == VFIFO)	/* Backward compatibility */
897 		fp->f_offset = count;
898 	else if (((fp->f_flag & FAPPEND) == 0) ||
899 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
900 		fp->f_offset = auio.uio_loffset;
901 	VOP_RWUNLOCK(vp, rwflag, NULL);
902 
903 	if (error == EINTR && count != 0)
904 		error = 0;
905 out:
906 	if (in_crit)
907 		nbl_end_crit(vp);
908 	releasef(fdes);
909 	if (error)
910 		return (set_errno(error));
911 	return (count);
912 }
913 
914 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
915 
916 /*
917  * This syscall supplies 64-bit file offsets to 32-bit applications only.
918  */
919 ssize32_t
920 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
921     uint32_t offset_2)
922 {
923 	struct uio auio;
924 	struct iovec aiov;
925 	file_t *fp;
926 	register vnode_t *vp;
927 	struct cpu *cp;
928 	int fflag, ioflag, rwflag;
929 	ssize_t bcount;
930 	int error = 0;
931 	u_offset_t fileoff;
932 	int in_crit = 0;
933 
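	/*
	 * The 64-bit file offset arrives from the 32-bit caller as two
	 * 32-bit halves; which argument holds the low half depends on the
	 * endianness of the machine.  pwrite64() below does the same.
	 */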
934 #if defined(_LITTLE_ENDIAN)
935 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
936 #else
937 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
938 #endif
939 
940 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
941 		return (set_errno(EINVAL));
942 
943 	if ((fp = getf(fdes)) == NULL)
944 		return (set_errno(EBADF));
945 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
946 		error = EBADF;
947 		goto out;
948 	}
949 
950 	rwflag = 0;
951 	vp = fp->f_vnode;
952 
953 	if (vp->v_type == VREG) {
954 
955 		if (bcount == 0)
956 			goto out;
957 
958 		/*
959 		 * Same as pread. See comments in pread.
960 		 */
961 
962 		if (fileoff > MAXOFFSET_T) {
963 			error = EINVAL;
964 			goto out;
965 		}
966 		if (fileoff + bcount > MAXOFFSET_T)
967 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
968 	} else if (vp->v_type == VFIFO) {
969 		error = ESPIPE;
970 		goto out;
971 	}
972 
973 	/*
974 	 * We have to enter the critical region before calling VOP_RWLOCK
975 	 * to avoid a deadlock with ufs.
976 	 */
977 	if (nbl_need_check(vp)) {
978 		int svmand;
979 
980 		nbl_start_crit(vp, RW_READER);
981 		in_crit = 1;
982 		error = nbl_svmand(vp, fp->f_cred, &svmand);
983 		if (error != 0)
984 			goto out;
985 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
986 			error = EACCES;
987 			goto out;
988 		}
989 	}
990 
991 	aiov.iov_base = cbuf;
992 	aiov.iov_len = bcount;
993 	(void) VOP_RWLOCK(vp, rwflag, NULL);
994 	auio.uio_loffset = fileoff;
995 
996 	/*
997 	 * Note: File size can never be greater than MAXOFFSET_T.
998 	 * If we ever start supporting 128-bit files, code similar to
999 	 * that in pread() should go here.
1000 	 * We avoid the otherwise unnecessary VOP_GETATTR() because we
1001 	 * know that fileoff == MAXOFFSET_T implies that it is always
1002 	 * greater than or equal to the file size.
1003 	 */
1004 	auio.uio_iov = &aiov;
1005 	auio.uio_iovcnt = 1;
1006 	auio.uio_resid = bcount;
1007 	auio.uio_segflg = UIO_USERSPACE;
1008 	auio.uio_llimit = MAXOFFSET_T;
1009 	auio.uio_fmode = fflag;
1010 	auio.uio_extflg = UIO_COPY_CACHED;
1011 
1012 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1013 
1014 	/* If read sync is not asked for, filter sync flags */
1015 	if ((ioflag & FRSYNC) == 0)
1016 		ioflag &= ~(FSYNC|FDSYNC);
1017 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1018 	bcount -= auio.uio_resid;
1019 	CPU_STATS_ENTER_K();
1020 	cp = CPU;
1021 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1022 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1023 	CPU_STATS_EXIT_K();
1024 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1025 	VOP_RWUNLOCK(vp, rwflag, NULL);
1026 
1027 	if (error == EINTR && bcount != 0)
1028 		error = 0;
1029 out:
1030 	if (in_crit)
1031 		nbl_end_crit(vp);
1032 	releasef(fdes);
1033 	if (error)
1034 		return (set_errno(error));
1035 	return (bcount);
1036 }
1037 
1038 /*
1039  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1040  */
1041 ssize32_t
1042 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1043     uint32_t offset_2)
1044 {
1045 	struct uio auio;
1046 	struct iovec aiov;
1047 	file_t *fp;
1048 	register vnode_t *vp;
1049 	struct cpu *cp;
1050 	int fflag, ioflag, rwflag;
1051 	ssize_t bcount;
1052 	int error = 0;
1053 	u_offset_t fileoff;
1054 	int in_crit = 0;
1055 
1056 #if defined(_LITTLE_ENDIAN)
1057 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1058 #else
1059 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1060 #endif
1061 
1062 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1063 		return (set_errno(EINVAL));
1064 	if ((fp = getf(fdes)) == NULL)
1065 		return (set_errno(EBADF));
1066 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1067 		error = EBADF;
1068 		goto out;
1069 	}
1070 
1071 	rwflag = 1;
1072 	vp = fp->f_vnode;
1073 
1074 	if (vp->v_type == VREG) {
1075 
1076 		if (bcount == 0)
1077 			goto out;
1078 
1079 		/*
1080 		 * See comments in pwrite.
1081 		 */
1082 		if (fileoff > MAXOFFSET_T) {
1083 			error = EINVAL;
1084 			goto out;
1085 		}
1086 		if (fileoff >= curproc->p_fsz_ctl) {
1087 			mutex_enter(&curproc->p_lock);
1088 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1089 			    curproc->p_rctls, curproc, RCA_SAFE);
1090 			mutex_exit(&curproc->p_lock);
1091 			error = EFBIG;
1092 			goto out;
1093 		}
1094 		if (fileoff == MAXOFFSET_T) {
1095 			error = EFBIG;
1096 			goto out;
1097 		}
1098 		if (fileoff + bcount > MAXOFFSET_T)
1099 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1100 	} else if (vp->v_type == VFIFO) {
1101 		error = ESPIPE;
1102 		goto out;
1103 	}
1104 
1105 	/*
1106 	 * We have to enter the critical region before calling VOP_RWLOCK
1107 	 * to avoid a deadlock with ufs.
1108 	 */
1109 	if (nbl_need_check(vp)) {
1110 		int svmand;
1111 
1112 		nbl_start_crit(vp, RW_READER);
1113 		in_crit = 1;
1114 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1115 		if (error != 0)
1116 			goto out;
1117 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
1118 			error = EACCES;
1119 			goto out;
1120 		}
1121 	}
1122 
1123 	aiov.iov_base = cbuf;
1124 	aiov.iov_len = bcount;
1125 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1126 	auio.uio_loffset = fileoff;
1127 	auio.uio_iov = &aiov;
1128 	auio.uio_iovcnt = 1;
1129 	auio.uio_resid = bcount;
1130 	auio.uio_segflg = UIO_USERSPACE;
1131 	auio.uio_llimit = curproc->p_fsz_ctl;
1132 	auio.uio_fmode = fflag;
1133 	auio.uio_extflg = UIO_COPY_CACHED;
1134 
1135 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1136 
1137 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1138 	bcount -= auio.uio_resid;
1139 	CPU_STATS_ENTER_K();
1140 	cp = CPU;
1141 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1142 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1143 	CPU_STATS_EXIT_K();
1144 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1145 	VOP_RWUNLOCK(vp, rwflag, NULL);
1146 
1147 	if (error == EINTR && bcount != 0)
1148 		error = 0;
1149 out:
1150 	if (in_crit)
1151 		nbl_end_crit(vp);
1152 	releasef(fdes);
1153 	if (error)
1154 		return (set_errno(error));
1155 	return (bcount);
1156 }
1157 
1158 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1159 
1160 #ifdef _SYSCALL32_IMPL
1161 /*
1162  * Tail-call elimination of xxx32() down to xxx()
1163  *
1164  * A number of xxx32 system calls take a len (or count) argument and
1165  * return a number in the range [0,len] or -1 on error.
1166  * Given an ssize32_t input len, the downcall xxx() will return
1167  * a 64-bit value that is -1 or in the range [0,len] which actually
1168  * is a proper return value for the xxx32 call. So even if the xxx32
1169  * calls can be considered as returning a ssize32_t, they are currently
1170  * declared as returning a ssize_t as this enables tail-call elimination.
1171  *
1172  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1173  * down negative input values as such and let the downcall handle error
1174  * reporting. Functions covered by this comment are:
1175  *
1176  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1177  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1178  * readlink.c:     readlink32.
1179  */
1180 
1181 ssize_t
1182 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1183 {
1184 	return (read(fdes,
1185 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1186 }
1187 
1188 ssize_t
1189 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1190 {
1191 	return (write(fdes,
1192 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1193 }
1194 
1195 ssize_t
1196 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1197 {
1198 	return (pread(fdes,
1199 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1200 	    (off_t)(uint32_t)offset));
1201 }
1202 
1203 ssize_t
1204 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1205 {
1206 	return (pwrite(fdes,
1207 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1208 	    (off_t)(uint32_t)offset));
1209 }
1210 
1211 ssize_t
1212 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1213 {
1214 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1215 }
1216 
1217 ssize_t
1218 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1219 {
1220 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1221 }
1222 
1223 #endif	/* _SYSCALL32_IMPL */
1224