xref: /illumos-gate/usr/src/uts/common/syscall/rw.c (revision bcd524b5c10222cf2a1ef37ac7ea8bf1baa3a2ee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/param.h>
37 #include <sys/isa_defs.h>
38 #include <sys/types.h>
39 #include <sys/inttypes.h>
40 #include <sys/sysmacros.h>
41 #include <sys/cred.h>
42 #include <sys/user.h>
43 #include <sys/systm.h>
44 #include <sys/errno.h>
45 #include <sys/vnode.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/cpuvar.h>
49 #include <sys/uio.h>
50 #include <sys/debug.h>
51 #include <sys/rctl.h>
52 #include <sys/nbmlock.h>
53 
/*
 * Threshold, in bytes, used by read() and readv(): requests no larger
 * than this ask for UIO_COPY_CACHED, larger ones for UIO_COPY_DEFAULT.
 */
#define	COPYOUT_MAX_CACHE	(128 * 1024)	/* 128K */

/* Kept as a global variable (not a constant) so that it can be patched. */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;
57 
/*
 * read, write, pread, pwrite, readv, and writev syscalls.
 *
 * 64-bit open:	all opens are large-file opens.
 * Large files:	the behaviour of read depends on whether or not the fd
 *		corresponds to a large-file open.
 * 32-bit open:	FOFFMAX flag not set.
 *		Reads succeed up to offset MAXOFF32_T - 1.  A read at
 *		MAXOFF32_T returns EOVERFLOW if count is non-zero and the
 *		size of the file is > MAXOFF32_T.  If the size of the file
 *		is <= MAXOFF32_T, a read at an offset >= MAXOFF32_T
 *		returns EOF.
 */
70 
71 /*
72  * Native system call
73  */
74 ssize_t
75 read(int fdes, void *cbuf, size_t count)
76 {
77 	struct uio auio;
78 	struct iovec aiov;
79 	file_t *fp;
80 	register vnode_t *vp;
81 	struct cpu *cp;
82 	int fflag, ioflag, rwflag;
83 	ssize_t cnt, bcount;
84 	int error = 0;
85 	u_offset_t fileoff;
86 	int in_crit = 0;
87 
88 	if ((cnt = (ssize_t)count) < 0)
89 		return (set_errno(EINVAL));
90 	if ((fp = getf(fdes)) == NULL)
91 		return (set_errno(EBADF));
92 	if (((fflag = fp->f_flag) & FREAD) == 0) {
93 		error = EBADF;
94 		goto out;
95 	}
96 	vp = fp->f_vnode;
97 
98 	if (vp->v_type == VREG && cnt == 0) {
99 		goto out;
100 	}
101 
102 	rwflag = 0;
103 	aiov.iov_base = cbuf;
104 	aiov.iov_len = cnt;
105 
106 	/*
107 	 * We have to enter the critical region before calling VOP_RWLOCK
108 	 * to avoid a deadlock with write() calls.
109 	 */
110 	if (nbl_need_check(vp)) {
111 		int svmand;
112 
113 		nbl_start_crit(vp, RW_READER);
114 		in_crit = 1;
115 		error = nbl_svmand(vp, fp->f_cred, &svmand);
116 		if (error != 0)
117 			goto out;
118 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand)) {
119 			error = EACCES;
120 			goto out;
121 		}
122 	}
123 
124 	(void) VOP_RWLOCK(vp, rwflag, NULL);
125 
126 	/*
127 	 * We do the following checks inside VOP_RWLOCK so as to
128 	 * prevent file size from changing while these checks are
129 	 * being done. Also, we load fp's offset to the local
130 	 * variable fileoff because we can have a parallel lseek
131 	 * going on (f_offset is not protected by any lock) which
132 	 * could change f_offset. We need to see the value only
133 	 * once here and take a decision. Seeing it more than once
134 	 * can lead to incorrect functionality.
135 	 */
136 
137 	fileoff = (u_offset_t)fp->f_offset;
138 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
139 		struct vattr va;
140 		va.va_mask = AT_SIZE;
141 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred)))  {
142 			VOP_RWUNLOCK(vp, rwflag, NULL);
143 			goto out;
144 		}
145 		if (fileoff >= va.va_size) {
146 			cnt = 0;
147 			VOP_RWUNLOCK(vp, rwflag, NULL);
148 			goto out;
149 		} else {
150 			error = EOVERFLOW;
151 			VOP_RWUNLOCK(vp, rwflag, NULL);
152 			goto out;
153 		}
154 	}
155 	if ((vp->v_type == VREG) &&
156 	    (fileoff + cnt > OFFSET_MAX(fp))) {
157 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
158 	}
159 	auio.uio_loffset = fileoff;
160 	auio.uio_iov = &aiov;
161 	auio.uio_iovcnt = 1;
162 	auio.uio_resid = bcount = cnt;
163 	auio.uio_segflg = UIO_USERSPACE;
164 	auio.uio_llimit = MAXOFFSET_T;
165 	auio.uio_fmode = fflag;
166 	/*
167 	 * Only use bypass caches when the count is large enough
168 	 */
169 	if (bcount <= copyout_max_cached)
170 		auio.uio_extflg = UIO_COPY_CACHED;
171 	else
172 		auio.uio_extflg = UIO_COPY_DEFAULT;
173 
174 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
175 
176 	/* If read sync is not asked for, filter sync flags */
177 	if ((ioflag & FRSYNC) == 0)
178 		ioflag &= ~(FSYNC|FDSYNC);
179 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
180 	cnt -= auio.uio_resid;
181 	CPU_STATS_ENTER_K();
182 	cp = CPU;
183 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
184 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
185 	CPU_STATS_EXIT_K();
186 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
187 
188 	if (vp->v_type == VFIFO)	/* Backward compatibility */
189 		fp->f_offset = cnt;
190 	else if (((fp->f_flag & FAPPEND) == 0) ||
191 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
192 		fp->f_offset = auio.uio_loffset;
193 	VOP_RWUNLOCK(vp, rwflag, NULL);
194 
195 	if (error == EINTR && cnt != 0)
196 		error = 0;
197 out:
198 	if (in_crit)
199 		nbl_end_crit(vp);
200 	releasef(fdes);
201 	if (error)
202 		return (set_errno(error));
203 	return (cnt);
204 }
205 
206 /*
207  * Native system call
208  */
209 ssize_t
210 write(int fdes, void *cbuf, size_t count)
211 {
212 	struct uio auio;
213 	struct iovec aiov;
214 	file_t *fp;
215 	register vnode_t *vp;
216 	struct cpu *cp;
217 	int fflag, ioflag, rwflag;
218 	ssize_t cnt, bcount;
219 	int error = 0;
220 	u_offset_t fileoff;
221 	int in_crit = 0;
222 
223 	if ((cnt = (ssize_t)count) < 0)
224 		return (set_errno(EINVAL));
225 	if ((fp = getf(fdes)) == NULL)
226 		return (set_errno(EBADF));
227 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
228 		error = EBADF;
229 		goto out;
230 	}
231 	vp = fp->f_vnode;
232 
233 	if (vp->v_type == VREG && cnt == 0) {
234 		goto out;
235 	}
236 
237 	rwflag = 1;
238 	aiov.iov_base = cbuf;
239 	aiov.iov_len = cnt;
240 
241 	/*
242 	 * We have to enter the critical region before calling VOP_RWLOCK
243 	 * to avoid a deadlock with ufs.
244 	 */
245 	if (nbl_need_check(vp)) {
246 		int svmand;
247 
248 		nbl_start_crit(vp, RW_READER);
249 		in_crit = 1;
250 		error = nbl_svmand(vp, fp->f_cred, &svmand);
251 		if (error != 0)
252 			goto out;
253 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand)) {
254 			error = EACCES;
255 			goto out;
256 		}
257 	}
258 
259 	(void) VOP_RWLOCK(vp, rwflag, NULL);
260 
261 	fileoff = fp->f_offset;
262 	if (vp->v_type == VREG) {
263 
264 		/*
265 		 * We raise psignal if write for >0 bytes causes
266 		 * it to exceed the ulimit.
267 		 */
268 		if (fileoff >= curproc->p_fsz_ctl) {
269 			VOP_RWUNLOCK(vp, rwflag, NULL);
270 
271 			mutex_enter(&curproc->p_lock);
272 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
273 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
274 			mutex_exit(&curproc->p_lock);
275 
276 			error = EFBIG;
277 			goto out;
278 		}
279 		/*
280 		 * We return EFBIG if write is done at an offset
281 		 * greater than the offset maximum for this file structure.
282 		 */
283 
284 		if (fileoff >= OFFSET_MAX(fp)) {
285 			VOP_RWUNLOCK(vp, rwflag, NULL);
286 			error = EFBIG;
287 			goto out;
288 		}
289 		/*
290 		 * Limit the bytes to be written  upto offset maximum for
291 		 * this open file structure.
292 		 */
293 		if (fileoff + cnt > OFFSET_MAX(fp))
294 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
295 	}
296 	auio.uio_loffset = fileoff;
297 	auio.uio_iov = &aiov;
298 	auio.uio_iovcnt = 1;
299 	auio.uio_resid = bcount = cnt;
300 	auio.uio_segflg = UIO_USERSPACE;
301 	auio.uio_llimit = curproc->p_fsz_ctl;
302 	auio.uio_fmode = fflag;
303 	auio.uio_extflg = UIO_COPY_DEFAULT;
304 
305 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
306 
307 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
308 	cnt -= auio.uio_resid;
309 	CPU_STATS_ENTER_K();
310 	cp = CPU;
311 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
312 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
313 	CPU_STATS_EXIT_K();
314 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
315 
316 	if (vp->v_type == VFIFO)	/* Backward compatibility */
317 		fp->f_offset = cnt;
318 	else if (((fp->f_flag & FAPPEND) == 0) ||
319 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
320 		fp->f_offset = auio.uio_loffset;
321 	VOP_RWUNLOCK(vp, rwflag, NULL);
322 
323 	if (error == EINTR && cnt != 0)
324 		error = 0;
325 out:
326 	if (in_crit)
327 		nbl_end_crit(vp);
328 	releasef(fdes);
329 	if (error)
330 		return (set_errno(error));
331 	return (cnt);
332 }
333 
334 ssize_t
335 pread(int fdes, void *cbuf, size_t count, off_t offset)
336 {
337 	struct uio auio;
338 	struct iovec aiov;
339 	file_t *fp;
340 	register vnode_t *vp;
341 	struct cpu *cp;
342 	int fflag, ioflag, rwflag;
343 	ssize_t bcount;
344 	int error = 0;
345 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
346 #ifdef _SYSCALL32_IMPL
347 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
348 		MAXOFF32_T : MAXOFFSET_T;
349 #else
350 	const u_offset_t maxoff = MAXOFF32_T;
351 #endif
352 	int in_crit = 0;
353 
354 	if ((bcount = (ssize_t)count) < 0)
355 		return (set_errno(EINVAL));
356 
357 	if ((fp = getf(fdes)) == NULL)
358 		return (set_errno(EBADF));
359 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
360 		error = EBADF;
361 		goto out;
362 	}
363 
364 	rwflag = 0;
365 	vp = fp->f_vnode;
366 
367 	if (vp->v_type == VREG) {
368 
369 		if (bcount == 0)
370 			goto out;
371 
372 		/*
373 		 * Return EINVAL if an invalid offset comes to pread.
374 		 * Negative offset from user will cause this error.
375 		 */
376 
377 		if (fileoff > maxoff) {
378 			error = EINVAL;
379 			goto out;
380 		}
381 		/*
382 		 * Limit offset such that we don't read or write
383 		 * a file beyond the maximum offset representable in
384 		 * an off_t structure.
385 		 */
386 		if (fileoff + bcount > maxoff)
387 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
388 	} else if (vp->v_type == VFIFO) {
389 		error = ESPIPE;
390 		goto out;
391 	}
392 
393 	/*
394 	 * We have to enter the critical region before calling VOP_RWLOCK
395 	 * to avoid a deadlock with ufs.
396 	 */
397 	if (nbl_need_check(vp)) {
398 		int svmand;
399 
400 		nbl_start_crit(vp, RW_READER);
401 		in_crit = 1;
402 		error = nbl_svmand(vp, fp->f_cred, &svmand);
403 		if (error != 0)
404 			goto out;
405 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
406 			error = EACCES;
407 			goto out;
408 		}
409 	}
410 
411 	aiov.iov_base = cbuf;
412 	aiov.iov_len = bcount;
413 	(void) VOP_RWLOCK(vp, rwflag, NULL);
414 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
415 		struct vattr va;
416 		va.va_mask = AT_SIZE;
417 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) {
418 			VOP_RWUNLOCK(vp, rwflag, NULL);
419 			goto out;
420 		}
421 		VOP_RWUNLOCK(vp, rwflag, NULL);
422 
423 		/*
424 		 * We have to return EOF if fileoff is >= file size.
425 		 */
426 		if (fileoff >= va.va_size) {
427 			bcount = 0;
428 			goto out;
429 		}
430 
431 		/*
432 		 * File is greater than or equal to maxoff and therefore
433 		 * we return EOVERFLOW.
434 		 */
435 		error = EOVERFLOW;
436 		goto out;
437 	}
438 	auio.uio_loffset = fileoff;
439 	auio.uio_iov = &aiov;
440 	auio.uio_iovcnt = 1;
441 	auio.uio_resid = bcount;
442 	auio.uio_segflg = UIO_USERSPACE;
443 	auio.uio_llimit = MAXOFFSET_T;
444 	auio.uio_fmode = fflag;
445 	auio.uio_extflg = UIO_COPY_CACHED;
446 
447 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
448 
449 	/* If read sync is not asked for, filter sync flags */
450 	if ((ioflag & FRSYNC) == 0)
451 		ioflag &= ~(FSYNC|FDSYNC);
452 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
453 	bcount -= auio.uio_resid;
454 	CPU_STATS_ENTER_K();
455 	cp = CPU;
456 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
457 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
458 	CPU_STATS_EXIT_K();
459 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
460 	VOP_RWUNLOCK(vp, rwflag, NULL);
461 
462 	if (error == EINTR && bcount != 0)
463 		error = 0;
464 out:
465 	if (in_crit)
466 		nbl_end_crit(vp);
467 	releasef(fdes);
468 	if (error)
469 		return (set_errno(error));
470 	return (bcount);
471 }
472 
473 ssize_t
474 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
475 {
476 	struct uio auio;
477 	struct iovec aiov;
478 	file_t *fp;
479 	register vnode_t *vp;
480 	struct cpu *cp;
481 	int fflag, ioflag, rwflag;
482 	ssize_t bcount;
483 	int error = 0;
484 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
485 #ifdef _SYSCALL32_IMPL
486 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
487 		MAXOFF32_T : MAXOFFSET_T;
488 #else
489 	const u_offset_t maxoff = MAXOFF32_T;
490 #endif
491 	int in_crit = 0;
492 
493 	if ((bcount = (ssize_t)count) < 0)
494 		return (set_errno(EINVAL));
495 	if ((fp = getf(fdes)) == NULL)
496 		return (set_errno(EBADF));
497 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
498 		error = EBADF;
499 		goto out;
500 	}
501 
502 	rwflag = 1;
503 	vp = fp->f_vnode;
504 
505 	if (vp->v_type == VREG) {
506 
507 		if (bcount == 0)
508 			goto out;
509 
510 		/*
511 		 * return EINVAL for offsets that cannot be
512 		 * represented in an off_t.
513 		 */
514 		if (fileoff > maxoff) {
515 			error = EINVAL;
516 			goto out;
517 		}
518 		/*
519 		 * Take appropriate action if we are trying to write above the
520 		 * resource limit.
521 		 */
522 		if (fileoff >= curproc->p_fsz_ctl) {
523 			mutex_enter(&curproc->p_lock);
524 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
525 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
526 			mutex_exit(&curproc->p_lock);
527 
528 			error = EFBIG;
529 			goto out;
530 		}
531 		/*
532 		 * Don't allow pwrite to cause file sizes to exceed
533 		 * maxoff.
534 		 */
535 		if (fileoff == maxoff) {
536 			error = EFBIG;
537 			goto out;
538 		}
539 		if (fileoff + count > maxoff)
540 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
541 	} else if (vp->v_type == VFIFO) {
542 		error = ESPIPE;
543 		goto out;
544 	}
545 
546 	/*
547 	 * We have to enter the critical region before calling VOP_RWLOCK
548 	 * to avoid a deadlock with ufs.
549 	 */
550 	if (nbl_need_check(vp)) {
551 		int svmand;
552 
553 		nbl_start_crit(vp, RW_READER);
554 		in_crit = 1;
555 		error = nbl_svmand(vp, fp->f_cred, &svmand);
556 		if (error != 0)
557 			goto out;
558 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
559 			error = EACCES;
560 			goto out;
561 		}
562 	}
563 
564 	aiov.iov_base = cbuf;
565 	aiov.iov_len = bcount;
566 	(void) VOP_RWLOCK(vp, rwflag, NULL);
567 	auio.uio_loffset = fileoff;
568 	auio.uio_iov = &aiov;
569 	auio.uio_iovcnt = 1;
570 	auio.uio_resid = bcount;
571 	auio.uio_segflg = UIO_USERSPACE;
572 	auio.uio_llimit = curproc->p_fsz_ctl;
573 	auio.uio_fmode = fflag;
574 	auio.uio_extflg = UIO_COPY_CACHED;
575 
576 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
577 
578 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
579 	bcount -= auio.uio_resid;
580 	CPU_STATS_ENTER_K();
581 	cp = CPU;
582 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
583 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
584 	CPU_STATS_EXIT_K();
585 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
586 	VOP_RWUNLOCK(vp, rwflag, NULL);
587 
588 	if (error == EINTR && bcount != 0)
589 		error = 0;
590 out:
591 	if (in_crit)
592 		nbl_end_crit(vp);
593 	releasef(fdes);
594 	if (error)
595 		return (set_errno(error));
596 	return (bcount);
597 }
598 
/*
 * XXX -- The SVID refers to IOV_MAX but does not define it.  However,
 * XXX -- SVVS expects readv() and writev() to fail if iovcnt > 16 (the
 * XXX -- value is hard-coded in the SVVS source), so that is taken to be
 * XXX -- the "interface".
 */
#define	DEF_IOV_MAX	(16)
606 
607 ssize_t
608 readv(int fdes, struct iovec *iovp, int iovcnt)
609 {
610 	struct uio auio;
611 	struct iovec aiov[DEF_IOV_MAX];
612 	file_t *fp;
613 	register vnode_t *vp;
614 	struct cpu *cp;
615 	int fflag, ioflag, rwflag;
616 	ssize_t count, bcount;
617 	int error = 0;
618 	int i;
619 	u_offset_t fileoff;
620 	int in_crit = 0;
621 
622 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
623 		return (set_errno(EINVAL));
624 
625 #ifdef _SYSCALL32_IMPL
626 	/*
627 	 * 32-bit callers need to have their iovec expanded,
628 	 * while ensuring that they can't move more than 2Gbytes
629 	 * of data in a single call.
630 	 */
631 	if (get_udatamodel() == DATAMODEL_ILP32) {
632 		struct iovec32 aiov32[DEF_IOV_MAX];
633 		ssize32_t count32;
634 
635 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
636 			return (set_errno(EFAULT));
637 
638 		count32 = 0;
639 		for (i = 0; i < iovcnt; i++) {
640 			ssize32_t iovlen32 = aiov32[i].iov_len;
641 			count32 += iovlen32;
642 			if (iovlen32 < 0 || count32 < 0)
643 				return (set_errno(EINVAL));
644 			aiov[i].iov_len = iovlen32;
645 			aiov[i].iov_base =
646 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
647 		}
648 	} else
649 #endif
650 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
651 		return (set_errno(EFAULT));
652 
653 	count = 0;
654 	for (i = 0; i < iovcnt; i++) {
655 		ssize_t iovlen = aiov[i].iov_len;
656 		count += iovlen;
657 		if (iovlen < 0 || count < 0)
658 			return (set_errno(EINVAL));
659 	}
660 	if ((fp = getf(fdes)) == NULL)
661 		return (set_errno(EBADF));
662 	if (((fflag = fp->f_flag) & FREAD) == 0) {
663 		error = EBADF;
664 		goto out;
665 	}
666 	vp = fp->f_vnode;
667 	if (vp->v_type == VREG && count == 0) {
668 		goto out;
669 	}
670 
671 	rwflag = 0;
672 
673 	/*
674 	 * We have to enter the critical region before calling VOP_RWLOCK
675 	 * to avoid a deadlock with ufs.
676 	 */
677 	if (nbl_need_check(vp)) {
678 		int svmand;
679 
680 		nbl_start_crit(vp, RW_READER);
681 		in_crit = 1;
682 		error = nbl_svmand(vp, fp->f_cred, &svmand);
683 		if (error != 0)
684 			goto out;
685 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand)) {
686 			error = EACCES;
687 			goto out;
688 		}
689 	}
690 
691 	(void) VOP_RWLOCK(vp, rwflag, NULL);
692 	fileoff = fp->f_offset;
693 
694 	/*
695 	 * Behaviour is same as read. Please see comments in read.
696 	 */
697 
698 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
699 		struct vattr va;
700 		va.va_mask = AT_SIZE;
701 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred)))  {
702 			VOP_RWUNLOCK(vp, rwflag, NULL);
703 			goto out;
704 		}
705 		if (fileoff >= va.va_size) {
706 			VOP_RWUNLOCK(vp, rwflag, NULL);
707 			count = 0;
708 			goto out;
709 		} else {
710 			VOP_RWUNLOCK(vp, rwflag, NULL);
711 			error = EOVERFLOW;
712 			goto out;
713 		}
714 	}
715 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
716 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
717 	}
718 	auio.uio_loffset = fileoff;
719 	auio.uio_iov = aiov;
720 	auio.uio_iovcnt = iovcnt;
721 	auio.uio_resid = bcount = count;
722 	auio.uio_segflg = UIO_USERSPACE;
723 	auio.uio_llimit = MAXOFFSET_T;
724 	auio.uio_fmode = fflag;
725 	if (bcount <= copyout_max_cached)
726 		auio.uio_extflg = UIO_COPY_CACHED;
727 	else
728 		auio.uio_extflg = UIO_COPY_DEFAULT;
729 
730 
731 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
732 
733 	/* If read sync is not asked for, filter sync flags */
734 	if ((ioflag & FRSYNC) == 0)
735 		ioflag &= ~(FSYNC|FDSYNC);
736 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
737 	count -= auio.uio_resid;
738 	CPU_STATS_ENTER_K();
739 	cp = CPU;
740 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
741 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
742 	CPU_STATS_EXIT_K();
743 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
744 
745 	if (vp->v_type == VFIFO)	/* Backward compatibility */
746 		fp->f_offset = count;
747 	else if (((fp->f_flag & FAPPEND) == 0) ||
748 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
749 		fp->f_offset = auio.uio_loffset;
750 
751 	VOP_RWUNLOCK(vp, rwflag, NULL);
752 
753 	if (error == EINTR && count != 0)
754 		error = 0;
755 out:
756 	if (in_crit)
757 		nbl_end_crit(vp);
758 	releasef(fdes);
759 	if (error)
760 		return (set_errno(error));
761 	return (count);
762 }
763 
764 ssize_t
765 writev(int fdes, struct iovec *iovp, int iovcnt)
766 {
767 	struct uio auio;
768 	struct iovec aiov[DEF_IOV_MAX];
769 	file_t *fp;
770 	register vnode_t *vp;
771 	struct cpu *cp;
772 	int fflag, ioflag, rwflag;
773 	ssize_t count, bcount;
774 	int error = 0;
775 	int i;
776 	u_offset_t fileoff;
777 	int in_crit = 0;
778 
779 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
780 		return (set_errno(EINVAL));
781 
782 #ifdef _SYSCALL32_IMPL
783 	/*
784 	 * 32-bit callers need to have their iovec expanded,
785 	 * while ensuring that they can't move more than 2Gbytes
786 	 * of data in a single call.
787 	 */
788 	if (get_udatamodel() == DATAMODEL_ILP32) {
789 		struct iovec32 aiov32[DEF_IOV_MAX];
790 		ssize32_t count32;
791 
792 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
793 			return (set_errno(EFAULT));
794 
795 		count32 = 0;
796 		for (i = 0; i < iovcnt; i++) {
797 			ssize32_t iovlen = aiov32[i].iov_len;
798 			count32 += iovlen;
799 			if (iovlen < 0 || count32 < 0)
800 				return (set_errno(EINVAL));
801 			aiov[i].iov_len = iovlen;
802 			aiov[i].iov_base =
803 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
804 		}
805 	} else
806 #endif
807 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
808 		return (set_errno(EFAULT));
809 
810 	count = 0;
811 	for (i = 0; i < iovcnt; i++) {
812 		ssize_t iovlen = aiov[i].iov_len;
813 		count += iovlen;
814 		if (iovlen < 0 || count < 0)
815 			return (set_errno(EINVAL));
816 	}
817 	if ((fp = getf(fdes)) == NULL)
818 		return (set_errno(EBADF));
819 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
820 		error = EBADF;
821 		goto out;
822 	}
823 	vp = fp->f_vnode;
824 	if (vp->v_type == VREG && count == 0) {
825 		goto out;
826 	}
827 
828 	rwflag = 1;
829 
830 	/*
831 	 * We have to enter the critical region before calling VOP_RWLOCK
832 	 * to avoid a deadlock with ufs.
833 	 */
834 	if (nbl_need_check(vp)) {
835 		int svmand;
836 
837 		nbl_start_crit(vp, RW_READER);
838 		in_crit = 1;
839 		error = nbl_svmand(vp, fp->f_cred, &svmand);
840 		if (error != 0)
841 			goto out;
842 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand)) {
843 			error = EACCES;
844 			goto out;
845 		}
846 	}
847 
848 	(void) VOP_RWLOCK(vp, rwflag, NULL);
849 
850 	fileoff = fp->f_offset;
851 
852 	/*
853 	 * Behaviour is same as write. Please see comments for write.
854 	 */
855 
856 	if (vp->v_type == VREG) {
857 		if (fileoff >= curproc->p_fsz_ctl) {
858 			VOP_RWUNLOCK(vp, rwflag, NULL);
859 			mutex_enter(&curproc->p_lock);
860 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
861 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
862 			mutex_exit(&curproc->p_lock);
863 			error = EFBIG;
864 			goto out;
865 		}
866 		if (fileoff >= OFFSET_MAX(fp)) {
867 			VOP_RWUNLOCK(vp, rwflag, NULL);
868 			error = EFBIG;
869 			goto out;
870 		}
871 		if (fileoff + count > OFFSET_MAX(fp))
872 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
873 	}
874 	auio.uio_loffset = fileoff;
875 	auio.uio_iov = aiov;
876 	auio.uio_iovcnt = iovcnt;
877 	auio.uio_resid = bcount = count;
878 	auio.uio_segflg = UIO_USERSPACE;
879 	auio.uio_llimit = curproc->p_fsz_ctl;
880 	auio.uio_fmode = fflag;
881 	auio.uio_extflg = UIO_COPY_DEFAULT;
882 
883 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
884 
885 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
886 	count -= auio.uio_resid;
887 	CPU_STATS_ENTER_K();
888 	cp = CPU;
889 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
890 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
891 	CPU_STATS_EXIT_K();
892 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
893 
894 	if (vp->v_type == VFIFO)	/* Backward compatibility */
895 		fp->f_offset = count;
896 	else if (((fp->f_flag & FAPPEND) == 0) ||
897 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
898 		fp->f_offset = auio.uio_loffset;
899 	VOP_RWUNLOCK(vp, rwflag, NULL);
900 
901 	if (error == EINTR && count != 0)
902 		error = 0;
903 out:
904 	if (in_crit)
905 		nbl_end_crit(vp);
906 	releasef(fdes);
907 	if (error)
908 		return (set_errno(error));
909 	return (count);
910 }
911 
912 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
913 
914 /*
915  * This syscall supplies 64-bit file offsets to 32-bit applications only.
916  */
917 ssize32_t
918 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
919     uint32_t offset_2)
920 {
921 	struct uio auio;
922 	struct iovec aiov;
923 	file_t *fp;
924 	register vnode_t *vp;
925 	struct cpu *cp;
926 	int fflag, ioflag, rwflag;
927 	ssize_t bcount;
928 	int error = 0;
929 	u_offset_t fileoff;
930 	int in_crit = 0;
931 
932 #if defined(_LITTLE_ENDIAN)
933 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
934 #else
935 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
936 #endif
937 
938 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
939 		return (set_errno(EINVAL));
940 
941 	if ((fp = getf(fdes)) == NULL)
942 		return (set_errno(EBADF));
943 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
944 		error = EBADF;
945 		goto out;
946 	}
947 
948 	rwflag = 0;
949 	vp = fp->f_vnode;
950 
951 	if (vp->v_type == VREG) {
952 
953 		if (bcount == 0)
954 			goto out;
955 
956 		/*
957 		 * Same as pread. See comments in pread.
958 		 */
959 
960 		if (fileoff > MAXOFFSET_T) {
961 			error = EINVAL;
962 			goto out;
963 		}
964 		if (fileoff + bcount > MAXOFFSET_T)
965 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
966 	} else if (vp->v_type == VFIFO) {
967 		error = ESPIPE;
968 		goto out;
969 	}
970 
971 	/*
972 	 * We have to enter the critical region before calling VOP_RWLOCK
973 	 * to avoid a deadlock with ufs.
974 	 */
975 	if (nbl_need_check(vp)) {
976 		int svmand;
977 
978 		nbl_start_crit(vp, RW_READER);
979 		in_crit = 1;
980 		error = nbl_svmand(vp, fp->f_cred, &svmand);
981 		if (error != 0)
982 			goto out;
983 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
984 			error = EACCES;
985 			goto out;
986 		}
987 	}
988 
989 	aiov.iov_base = cbuf;
990 	aiov.iov_len = bcount;
991 	(void) VOP_RWLOCK(vp, rwflag, NULL);
992 	auio.uio_loffset = fileoff;
993 
994 	/*
995 	 * Note: File size can never be greater than MAXOFFSET_T.
996 	 * If ever we start supporting 128 bit files the code
997 	 * similar to the one in pread at this place should be here.
998 	 * Here we avoid the unnecessary VOP_GETATTR() when we
999 	 * know that fileoff == MAXOFFSET_T implies that it is always
1000 	 * greater than or equal to file size.
1001 	 */
1002 	auio.uio_iov = &aiov;
1003 	auio.uio_iovcnt = 1;
1004 	auio.uio_resid = bcount;
1005 	auio.uio_segflg = UIO_USERSPACE;
1006 	auio.uio_llimit = MAXOFFSET_T;
1007 	auio.uio_fmode = fflag;
1008 	auio.uio_extflg = UIO_COPY_CACHED;
1009 
1010 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1011 
1012 	/* If read sync is not asked for, filter sync flags */
1013 	if ((ioflag & FRSYNC) == 0)
1014 		ioflag &= ~(FSYNC|FDSYNC);
1015 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1016 	bcount -= auio.uio_resid;
1017 	CPU_STATS_ENTER_K();
1018 	cp = CPU;
1019 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1020 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1021 	CPU_STATS_EXIT_K();
1022 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1023 	VOP_RWUNLOCK(vp, rwflag, NULL);
1024 
1025 	if (error == EINTR && bcount != 0)
1026 		error = 0;
1027 out:
1028 	if (in_crit)
1029 		nbl_end_crit(vp);
1030 	releasef(fdes);
1031 	if (error)
1032 		return (set_errno(error));
1033 	return (bcount);
1034 }
1035 
1036 /*
1037  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1038  */
1039 ssize32_t
1040 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1041     uint32_t offset_2)
1042 {
1043 	struct uio auio;
1044 	struct iovec aiov;
1045 	file_t *fp;
1046 	register vnode_t *vp;
1047 	struct cpu *cp;
1048 	int fflag, ioflag, rwflag;
1049 	ssize_t bcount;
1050 	int error = 0;
1051 	u_offset_t fileoff;
1052 	int in_crit = 0;
1053 
1054 #if defined(_LITTLE_ENDIAN)
1055 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1056 #else
1057 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1058 #endif
1059 
1060 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1061 		return (set_errno(EINVAL));
1062 	if ((fp = getf(fdes)) == NULL)
1063 		return (set_errno(EBADF));
1064 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1065 		error = EBADF;
1066 		goto out;
1067 	}
1068 
1069 	rwflag = 1;
1070 	vp = fp->f_vnode;
1071 
1072 	if (vp->v_type == VREG) {
1073 
1074 		if (bcount == 0)
1075 			goto out;
1076 
1077 		/*
1078 		 * See comments in pwrite.
1079 		 */
1080 		if (fileoff > MAXOFFSET_T) {
1081 			error = EINVAL;
1082 			goto out;
1083 		}
1084 		if (fileoff >= curproc->p_fsz_ctl) {
1085 			mutex_enter(&curproc->p_lock);
1086 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1087 			    curproc->p_rctls, curproc, RCA_SAFE);
1088 			mutex_exit(&curproc->p_lock);
1089 			error = EFBIG;
1090 			goto out;
1091 		}
1092 		if (fileoff == MAXOFFSET_T) {
1093 			error = EFBIG;
1094 			goto out;
1095 		}
1096 		if (fileoff + bcount > MAXOFFSET_T)
1097 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1098 	} else if (vp->v_type == VFIFO) {
1099 		error = ESPIPE;
1100 		goto out;
1101 	}
1102 
1103 	/*
1104 	 * We have to enter the critical region before calling VOP_RWLOCK
1105 	 * to avoid a deadlock with ufs.
1106 	 */
1107 	if (nbl_need_check(vp)) {
1108 		int svmand;
1109 
1110 		nbl_start_crit(vp, RW_READER);
1111 		in_crit = 1;
1112 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1113 		if (error != 0)
1114 			goto out;
1115 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
1116 			error = EACCES;
1117 			goto out;
1118 		}
1119 	}
1120 
1121 	aiov.iov_base = cbuf;
1122 	aiov.iov_len = bcount;
1123 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1124 	auio.uio_loffset = fileoff;
1125 	auio.uio_iov = &aiov;
1126 	auio.uio_iovcnt = 1;
1127 	auio.uio_resid = bcount;
1128 	auio.uio_segflg = UIO_USERSPACE;
1129 	auio.uio_llimit = curproc->p_fsz_ctl;
1130 	auio.uio_fmode = fflag;
1131 	auio.uio_extflg = UIO_COPY_CACHED;
1132 
1133 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1134 
1135 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1136 	bcount -= auio.uio_resid;
1137 	CPU_STATS_ENTER_K();
1138 	cp = CPU;
1139 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1140 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1141 	CPU_STATS_EXIT_K();
1142 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1143 	VOP_RWUNLOCK(vp, rwflag, NULL);
1144 
1145 	if (error == EINTR && bcount != 0)
1146 		error = 0;
1147 out:
1148 	if (in_crit)
1149 		nbl_end_crit(vp);
1150 	releasef(fdes);
1151 	if (error)
1152 		return (set_errno(error));
1153 	return (bcount);
1154 }
1155 
1156 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1157 
1158 #ifdef _SYSCALL32_IMPL
/*
 * Tail-call elimination of xxx32() down to xxx()
 *
 * A number of xxx32 system calls take a len (or count) argument and
 * return a number in the range [0,len] or -1 on error.
 * Given an ssize32_t input len, the downcall xxx() will return
 * a 64-bit value that is -1 or in the range [0,len], which is in fact
 * a proper return value for the xxx32 call. So even though the xxx32
 * calls can be considered as returning an ssize32_t, they are currently
 * declared as returning an ssize_t, as this enables tail-call elimination.
 *
 * The cast of len (or count) to ssize32_t is needed to ensure that we pass
 * down negative input values as such and let the downcall handle error
 * reporting. The functions covered by this comment are:
 *
 * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
 * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
 * readlink.c:     readlink32.
 */
1178 
1179 ssize_t
1180 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1181 {
1182 	return (read(fdes,
1183 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1184 }
1185 
1186 ssize_t
1187 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1188 {
1189 	return (write(fdes,
1190 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1191 }
1192 
1193 ssize_t
1194 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1195 {
1196 	return (pread(fdes,
1197 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1198 	    (off_t)(uint32_t)offset));
1199 }
1200 
1201 ssize_t
1202 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1203 {
1204 	return (pwrite(fdes,
1205 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1206 	    (off_t)(uint32_t)offset));
1207 }
1208 
1209 ssize_t
1210 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1211 {
1212 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1213 }
1214 
1215 ssize_t
1216 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1217 {
1218 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1219 }
1220 
1221 #endif	/* _SYSCALL32_IMPL */
1222