xref: /titanic_51/usr/src/uts/common/syscall/rw.c (revision 24b3ac2ee83543e2e7856433213b6f026a225ac1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/param.h>
38 #include <sys/isa_defs.h>
39 #include <sys/types.h>
40 #include <sys/inttypes.h>
41 #include <sys/sysmacros.h>
42 #include <sys/cred.h>
43 #include <sys/user.h>
44 #include <sys/systm.h>
45 #include <sys/errno.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/cpuvar.h>
50 #include <sys/uio.h>
51 #include <sys/debug.h>
52 #include <sys/rctl.h>
53 #include <sys/nbmlock.h>
54 
/*
 * Default upper bound, in bytes, on the size of a read()/readv() request
 * that is issued with UIO_COPY_CACHED.
 */
#define	COPYOUT_MAX_CACHE	(128 * 1024)	/* 128K */

/*
 * Run-time copy of the bound above.  It is a global rather than a constant
 * so that it can be patched.
 */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;
58 
59 /*
60  * read, write, pread, pwrite, readv, and writev syscalls.
61  *
 * 64-bit open:	all opens are large file opens.
63  * Large Files: the behaviour of read depends on whether the fd
64  *		corresponds to large open or not.
65  * 32-bit open:	FOFFMAX flag not set.
66  *		read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
67  *		EOVERFLOW if count is non-zero and if size of file
68  *		is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
69  *		at >= MAXOFF32_T returns EOF.
70  */
71 
72 /*
73  * Native system call
74  */
75 ssize_t
76 read(int fdes, void *cbuf, size_t count)
77 {
78 	struct uio auio;
79 	struct iovec aiov;
80 	file_t *fp;
81 	register vnode_t *vp;
82 	struct cpu *cp;
83 	int fflag, ioflag, rwflag;
84 	ssize_t cnt, bcount;
85 	int error = 0;
86 	u_offset_t fileoff;
87 	int in_crit = 0;
88 
89 	if ((cnt = (ssize_t)count) < 0)
90 		return (set_errno(EINVAL));
91 	if ((fp = getf(fdes)) == NULL)
92 		return (set_errno(EBADF));
93 	if (((fflag = fp->f_flag) & FREAD) == 0) {
94 		error = EBADF;
95 		goto out;
96 	}
97 	vp = fp->f_vnode;
98 
99 	if (vp->v_type == VREG && cnt == 0) {
100 		goto out;
101 	}
102 
103 	rwflag = 0;
104 	aiov.iov_base = cbuf;
105 	aiov.iov_len = cnt;
106 
107 	/*
108 	 * We have to enter the critical region before calling VOP_RWLOCK
109 	 * to avoid a deadlock with write() calls.
110 	 */
111 	if (nbl_need_check(vp)) {
112 		int svmand;
113 
114 		nbl_start_crit(vp, RW_READER);
115 		in_crit = 1;
116 		error = nbl_svmand(vp, fp->f_cred, &svmand);
117 		if (error != 0)
118 			goto out;
119 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
120 		    NULL)) {
121 			error = EACCES;
122 			goto out;
123 		}
124 	}
125 
126 	(void) VOP_RWLOCK(vp, rwflag, NULL);
127 
128 	/*
129 	 * We do the following checks inside VOP_RWLOCK so as to
130 	 * prevent file size from changing while these checks are
131 	 * being done. Also, we load fp's offset to the local
132 	 * variable fileoff because we can have a parallel lseek
133 	 * going on (f_offset is not protected by any lock) which
134 	 * could change f_offset. We need to see the value only
135 	 * once here and take a decision. Seeing it more than once
136 	 * can lead to incorrect functionality.
137 	 */
138 
139 	fileoff = (u_offset_t)fp->f_offset;
140 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
141 		struct vattr va;
142 		va.va_mask = AT_SIZE;
143 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
144 			VOP_RWUNLOCK(vp, rwflag, NULL);
145 			goto out;
146 		}
147 		if (fileoff >= va.va_size) {
148 			cnt = 0;
149 			VOP_RWUNLOCK(vp, rwflag, NULL);
150 			goto out;
151 		} else {
152 			error = EOVERFLOW;
153 			VOP_RWUNLOCK(vp, rwflag, NULL);
154 			goto out;
155 		}
156 	}
157 	if ((vp->v_type == VREG) &&
158 	    (fileoff + cnt > OFFSET_MAX(fp))) {
159 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
160 	}
161 	auio.uio_loffset = fileoff;
162 	auio.uio_iov = &aiov;
163 	auio.uio_iovcnt = 1;
164 	auio.uio_resid = bcount = cnt;
165 	auio.uio_segflg = UIO_USERSPACE;
166 	auio.uio_llimit = MAXOFFSET_T;
167 	auio.uio_fmode = fflag;
168 	/*
169 	 * Only use bypass caches when the count is large enough
170 	 */
171 	if (bcount <= copyout_max_cached)
172 		auio.uio_extflg = UIO_COPY_CACHED;
173 	else
174 		auio.uio_extflg = UIO_COPY_DEFAULT;
175 
176 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
177 
178 	/* If read sync is not asked for, filter sync flags */
179 	if ((ioflag & FRSYNC) == 0)
180 		ioflag &= ~(FSYNC|FDSYNC);
181 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
182 	cnt -= auio.uio_resid;
183 	CPU_STATS_ENTER_K();
184 	cp = CPU;
185 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
186 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
187 	CPU_STATS_EXIT_K();
188 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
189 
190 	if (vp->v_type == VFIFO)	/* Backward compatibility */
191 		fp->f_offset = cnt;
192 	else if (((fp->f_flag & FAPPEND) == 0) ||
193 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
194 		fp->f_offset = auio.uio_loffset;
195 	VOP_RWUNLOCK(vp, rwflag, NULL);
196 
197 	if (error == EINTR && cnt != 0)
198 		error = 0;
199 out:
200 	if (in_crit)
201 		nbl_end_crit(vp);
202 	releasef(fdes);
203 	if (error)
204 		return (set_errno(error));
205 	return (cnt);
206 }
207 
208 /*
209  * Native system call
210  */
211 ssize_t
212 write(int fdes, void *cbuf, size_t count)
213 {
214 	struct uio auio;
215 	struct iovec aiov;
216 	file_t *fp;
217 	register vnode_t *vp;
218 	struct cpu *cp;
219 	int fflag, ioflag, rwflag;
220 	ssize_t cnt, bcount;
221 	int error = 0;
222 	u_offset_t fileoff;
223 	int in_crit = 0;
224 
225 	if ((cnt = (ssize_t)count) < 0)
226 		return (set_errno(EINVAL));
227 	if ((fp = getf(fdes)) == NULL)
228 		return (set_errno(EBADF));
229 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
230 		error = EBADF;
231 		goto out;
232 	}
233 	vp = fp->f_vnode;
234 
235 	if (vp->v_type == VREG && cnt == 0) {
236 		goto out;
237 	}
238 
239 	rwflag = 1;
240 	aiov.iov_base = cbuf;
241 	aiov.iov_len = cnt;
242 
243 	/*
244 	 * We have to enter the critical region before calling VOP_RWLOCK
245 	 * to avoid a deadlock with ufs.
246 	 */
247 	if (nbl_need_check(vp)) {
248 		int svmand;
249 
250 		nbl_start_crit(vp, RW_READER);
251 		in_crit = 1;
252 		error = nbl_svmand(vp, fp->f_cred, &svmand);
253 		if (error != 0)
254 			goto out;
255 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
256 		    NULL)) {
257 			error = EACCES;
258 			goto out;
259 		}
260 	}
261 
262 	(void) VOP_RWLOCK(vp, rwflag, NULL);
263 
264 	fileoff = fp->f_offset;
265 	if (vp->v_type == VREG) {
266 
267 		/*
268 		 * We raise psignal if write for >0 bytes causes
269 		 * it to exceed the ulimit.
270 		 */
271 		if (fileoff >= curproc->p_fsz_ctl) {
272 			VOP_RWUNLOCK(vp, rwflag, NULL);
273 
274 			mutex_enter(&curproc->p_lock);
275 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
276 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
277 			mutex_exit(&curproc->p_lock);
278 
279 			error = EFBIG;
280 			goto out;
281 		}
282 		/*
283 		 * We return EFBIG if write is done at an offset
284 		 * greater than the offset maximum for this file structure.
285 		 */
286 
287 		if (fileoff >= OFFSET_MAX(fp)) {
288 			VOP_RWUNLOCK(vp, rwflag, NULL);
289 			error = EFBIG;
290 			goto out;
291 		}
292 		/*
293 		 * Limit the bytes to be written  upto offset maximum for
294 		 * this open file structure.
295 		 */
296 		if (fileoff + cnt > OFFSET_MAX(fp))
297 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
298 	}
299 	auio.uio_loffset = fileoff;
300 	auio.uio_iov = &aiov;
301 	auio.uio_iovcnt = 1;
302 	auio.uio_resid = bcount = cnt;
303 	auio.uio_segflg = UIO_USERSPACE;
304 	auio.uio_llimit = curproc->p_fsz_ctl;
305 	auio.uio_fmode = fflag;
306 	auio.uio_extflg = UIO_COPY_DEFAULT;
307 
308 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
309 
310 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
311 	cnt -= auio.uio_resid;
312 	CPU_STATS_ENTER_K();
313 	cp = CPU;
314 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
315 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
316 	CPU_STATS_EXIT_K();
317 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
318 
319 	if (vp->v_type == VFIFO)	/* Backward compatibility */
320 		fp->f_offset = cnt;
321 	else if (((fp->f_flag & FAPPEND) == 0) ||
322 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
323 		fp->f_offset = auio.uio_loffset;
324 	VOP_RWUNLOCK(vp, rwflag, NULL);
325 
326 	if (error == EINTR && cnt != 0)
327 		error = 0;
328 out:
329 	if (in_crit)
330 		nbl_end_crit(vp);
331 	releasef(fdes);
332 	if (error)
333 		return (set_errno(error));
334 	return (cnt);
335 }
336 
337 ssize_t
338 pread(int fdes, void *cbuf, size_t count, off_t offset)
339 {
340 	struct uio auio;
341 	struct iovec aiov;
342 	file_t *fp;
343 	register vnode_t *vp;
344 	struct cpu *cp;
345 	int fflag, ioflag, rwflag;
346 	ssize_t bcount;
347 	int error = 0;
348 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
349 #ifdef _SYSCALL32_IMPL
350 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
351 	    MAXOFF32_T : MAXOFFSET_T;
352 #else
353 	const u_offset_t maxoff = MAXOFF32_T;
354 #endif
355 	int in_crit = 0;
356 
357 	if ((bcount = (ssize_t)count) < 0)
358 		return (set_errno(EINVAL));
359 
360 	if ((fp = getf(fdes)) == NULL)
361 		return (set_errno(EBADF));
362 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
363 		error = EBADF;
364 		goto out;
365 	}
366 
367 	rwflag = 0;
368 	vp = fp->f_vnode;
369 
370 	if (vp->v_type == VREG) {
371 
372 		if (bcount == 0)
373 			goto out;
374 
375 		/*
376 		 * Return EINVAL if an invalid offset comes to pread.
377 		 * Negative offset from user will cause this error.
378 		 */
379 
380 		if (fileoff > maxoff) {
381 			error = EINVAL;
382 			goto out;
383 		}
384 		/*
385 		 * Limit offset such that we don't read or write
386 		 * a file beyond the maximum offset representable in
387 		 * an off_t structure.
388 		 */
389 		if (fileoff + bcount > maxoff)
390 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
391 	} else if (vp->v_type == VFIFO) {
392 		error = ESPIPE;
393 		goto out;
394 	}
395 
396 	/*
397 	 * We have to enter the critical region before calling VOP_RWLOCK
398 	 * to avoid a deadlock with ufs.
399 	 */
400 	if (nbl_need_check(vp)) {
401 		int svmand;
402 
403 		nbl_start_crit(vp, RW_READER);
404 		in_crit = 1;
405 		error = nbl_svmand(vp, fp->f_cred, &svmand);
406 		if (error != 0)
407 			goto out;
408 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
409 		    NULL)) {
410 			error = EACCES;
411 			goto out;
412 		}
413 	}
414 
415 	aiov.iov_base = cbuf;
416 	aiov.iov_len = bcount;
417 	(void) VOP_RWLOCK(vp, rwflag, NULL);
418 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
419 		struct vattr va;
420 		va.va_mask = AT_SIZE;
421 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
422 			VOP_RWUNLOCK(vp, rwflag, NULL);
423 			goto out;
424 		}
425 		VOP_RWUNLOCK(vp, rwflag, NULL);
426 
427 		/*
428 		 * We have to return EOF if fileoff is >= file size.
429 		 */
430 		if (fileoff >= va.va_size) {
431 			bcount = 0;
432 			goto out;
433 		}
434 
435 		/*
436 		 * File is greater than or equal to maxoff and therefore
437 		 * we return EOVERFLOW.
438 		 */
439 		error = EOVERFLOW;
440 		goto out;
441 	}
442 	auio.uio_loffset = fileoff;
443 	auio.uio_iov = &aiov;
444 	auio.uio_iovcnt = 1;
445 	auio.uio_resid = bcount;
446 	auio.uio_segflg = UIO_USERSPACE;
447 	auio.uio_llimit = MAXOFFSET_T;
448 	auio.uio_fmode = fflag;
449 	auio.uio_extflg = UIO_COPY_CACHED;
450 
451 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
452 
453 	/* If read sync is not asked for, filter sync flags */
454 	if ((ioflag & FRSYNC) == 0)
455 		ioflag &= ~(FSYNC|FDSYNC);
456 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
457 	bcount -= auio.uio_resid;
458 	CPU_STATS_ENTER_K();
459 	cp = CPU;
460 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
461 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
462 	CPU_STATS_EXIT_K();
463 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
464 	VOP_RWUNLOCK(vp, rwflag, NULL);
465 
466 	if (error == EINTR && bcount != 0)
467 		error = 0;
468 out:
469 	if (in_crit)
470 		nbl_end_crit(vp);
471 	releasef(fdes);
472 	if (error)
473 		return (set_errno(error));
474 	return (bcount);
475 }
476 
477 ssize_t
478 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
479 {
480 	struct uio auio;
481 	struct iovec aiov;
482 	file_t *fp;
483 	register vnode_t *vp;
484 	struct cpu *cp;
485 	int fflag, ioflag, rwflag;
486 	ssize_t bcount;
487 	int error = 0;
488 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
489 #ifdef _SYSCALL32_IMPL
490 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
491 	    MAXOFF32_T : MAXOFFSET_T;
492 #else
493 	const u_offset_t maxoff = MAXOFF32_T;
494 #endif
495 	int in_crit = 0;
496 
497 	if ((bcount = (ssize_t)count) < 0)
498 		return (set_errno(EINVAL));
499 	if ((fp = getf(fdes)) == NULL)
500 		return (set_errno(EBADF));
501 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
502 		error = EBADF;
503 		goto out;
504 	}
505 
506 	rwflag = 1;
507 	vp = fp->f_vnode;
508 
509 	if (vp->v_type == VREG) {
510 
511 		if (bcount == 0)
512 			goto out;
513 
514 		/*
515 		 * return EINVAL for offsets that cannot be
516 		 * represented in an off_t.
517 		 */
518 		if (fileoff > maxoff) {
519 			error = EINVAL;
520 			goto out;
521 		}
522 		/*
523 		 * Take appropriate action if we are trying to write above the
524 		 * resource limit.
525 		 */
526 		if (fileoff >= curproc->p_fsz_ctl) {
527 			mutex_enter(&curproc->p_lock);
528 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
529 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
530 			mutex_exit(&curproc->p_lock);
531 
532 			error = EFBIG;
533 			goto out;
534 		}
535 		/*
536 		 * Don't allow pwrite to cause file sizes to exceed
537 		 * maxoff.
538 		 */
539 		if (fileoff == maxoff) {
540 			error = EFBIG;
541 			goto out;
542 		}
543 		if (fileoff + count > maxoff)
544 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
545 	} else if (vp->v_type == VFIFO) {
546 		error = ESPIPE;
547 		goto out;
548 	}
549 
550 	/*
551 	 * We have to enter the critical region before calling VOP_RWLOCK
552 	 * to avoid a deadlock with ufs.
553 	 */
554 	if (nbl_need_check(vp)) {
555 		int svmand;
556 
557 		nbl_start_crit(vp, RW_READER);
558 		in_crit = 1;
559 		error = nbl_svmand(vp, fp->f_cred, &svmand);
560 		if (error != 0)
561 			goto out;
562 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
563 		    NULL)) {
564 			error = EACCES;
565 			goto out;
566 		}
567 	}
568 
569 	aiov.iov_base = cbuf;
570 	aiov.iov_len = bcount;
571 	(void) VOP_RWLOCK(vp, rwflag, NULL);
572 	auio.uio_loffset = fileoff;
573 	auio.uio_iov = &aiov;
574 	auio.uio_iovcnt = 1;
575 	auio.uio_resid = bcount;
576 	auio.uio_segflg = UIO_USERSPACE;
577 	auio.uio_llimit = curproc->p_fsz_ctl;
578 	auio.uio_fmode = fflag;
579 	auio.uio_extflg = UIO_COPY_CACHED;
580 
581 	/*
582 	 * The SUSv4 POSIX specification states:
583 	 *	The pwrite() function shall be equivalent to write(), except
584 	 *	that it writes into a given position and does not change
585 	 *	the file offset (regardless of whether O_APPEND is set).
586 	 * To make this be true, we omit the FAPPEND flag from ioflag.
587 	 */
588 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
589 
590 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
591 	bcount -= auio.uio_resid;
592 	CPU_STATS_ENTER_K();
593 	cp = CPU;
594 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
595 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
596 	CPU_STATS_EXIT_K();
597 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
598 	VOP_RWUNLOCK(vp, rwflag, NULL);
599 
600 	if (error == EINTR && bcount != 0)
601 		error = 0;
602 out:
603 	if (in_crit)
604 		nbl_end_crit(vp);
605 	releasef(fdes);
606 	if (error)
607 		return (set_errno(error));
608 	return (bcount);
609 }
610 
/*
 * Maximum number of iovec entries accepted by readv() and writev().
 *
 * XXX -- The SVID mentions IOV_MAX without ever defining it.  SVVS,
 * XXX -- however, expects readv() and writev() to fail when iovcnt > 16
 * XXX -- (the number is hard-coded in the SVVS source), so that value
 * XXX -- is treated as the "interface".
 */
#define	DEF_IOV_MAX	16
618 
619 ssize_t
620 readv(int fdes, struct iovec *iovp, int iovcnt)
621 {
622 	struct uio auio;
623 	struct iovec aiov[DEF_IOV_MAX];
624 	file_t *fp;
625 	register vnode_t *vp;
626 	struct cpu *cp;
627 	int fflag, ioflag, rwflag;
628 	ssize_t count, bcount;
629 	int error = 0;
630 	int i;
631 	u_offset_t fileoff;
632 	int in_crit = 0;
633 
634 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
635 		return (set_errno(EINVAL));
636 
637 #ifdef _SYSCALL32_IMPL
638 	/*
639 	 * 32-bit callers need to have their iovec expanded,
640 	 * while ensuring that they can't move more than 2Gbytes
641 	 * of data in a single call.
642 	 */
643 	if (get_udatamodel() == DATAMODEL_ILP32) {
644 		struct iovec32 aiov32[DEF_IOV_MAX];
645 		ssize32_t count32;
646 
647 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
648 			return (set_errno(EFAULT));
649 
650 		count32 = 0;
651 		for (i = 0; i < iovcnt; i++) {
652 			ssize32_t iovlen32 = aiov32[i].iov_len;
653 			count32 += iovlen32;
654 			if (iovlen32 < 0 || count32 < 0)
655 				return (set_errno(EINVAL));
656 			aiov[i].iov_len = iovlen32;
657 			aiov[i].iov_base =
658 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
659 		}
660 	} else
661 #endif
662 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
663 		return (set_errno(EFAULT));
664 
665 	count = 0;
666 	for (i = 0; i < iovcnt; i++) {
667 		ssize_t iovlen = aiov[i].iov_len;
668 		count += iovlen;
669 		if (iovlen < 0 || count < 0)
670 			return (set_errno(EINVAL));
671 	}
672 	if ((fp = getf(fdes)) == NULL)
673 		return (set_errno(EBADF));
674 	if (((fflag = fp->f_flag) & FREAD) == 0) {
675 		error = EBADF;
676 		goto out;
677 	}
678 	vp = fp->f_vnode;
679 	if (vp->v_type == VREG && count == 0) {
680 		goto out;
681 	}
682 
683 	rwflag = 0;
684 
685 	/*
686 	 * We have to enter the critical region before calling VOP_RWLOCK
687 	 * to avoid a deadlock with ufs.
688 	 */
689 	if (nbl_need_check(vp)) {
690 		int svmand;
691 
692 		nbl_start_crit(vp, RW_READER);
693 		in_crit = 1;
694 		error = nbl_svmand(vp, fp->f_cred, &svmand);
695 		if (error != 0)
696 			goto out;
697 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
698 		    NULL)) {
699 			error = EACCES;
700 			goto out;
701 		}
702 	}
703 
704 	(void) VOP_RWLOCK(vp, rwflag, NULL);
705 	fileoff = fp->f_offset;
706 
707 	/*
708 	 * Behaviour is same as read. Please see comments in read.
709 	 */
710 
711 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
712 		struct vattr va;
713 		va.va_mask = AT_SIZE;
714 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
715 			VOP_RWUNLOCK(vp, rwflag, NULL);
716 			goto out;
717 		}
718 		if (fileoff >= va.va_size) {
719 			VOP_RWUNLOCK(vp, rwflag, NULL);
720 			count = 0;
721 			goto out;
722 		} else {
723 			VOP_RWUNLOCK(vp, rwflag, NULL);
724 			error = EOVERFLOW;
725 			goto out;
726 		}
727 	}
728 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
729 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
730 	}
731 	auio.uio_loffset = fileoff;
732 	auio.uio_iov = aiov;
733 	auio.uio_iovcnt = iovcnt;
734 	auio.uio_resid = bcount = count;
735 	auio.uio_segflg = UIO_USERSPACE;
736 	auio.uio_llimit = MAXOFFSET_T;
737 	auio.uio_fmode = fflag;
738 	if (bcount <= copyout_max_cached)
739 		auio.uio_extflg = UIO_COPY_CACHED;
740 	else
741 		auio.uio_extflg = UIO_COPY_DEFAULT;
742 
743 
744 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
745 
746 	/* If read sync is not asked for, filter sync flags */
747 	if ((ioflag & FRSYNC) == 0)
748 		ioflag &= ~(FSYNC|FDSYNC);
749 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
750 	count -= auio.uio_resid;
751 	CPU_STATS_ENTER_K();
752 	cp = CPU;
753 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
754 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
755 	CPU_STATS_EXIT_K();
756 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
757 
758 	if (vp->v_type == VFIFO)	/* Backward compatibility */
759 		fp->f_offset = count;
760 	else if (((fp->f_flag & FAPPEND) == 0) ||
761 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
762 		fp->f_offset = auio.uio_loffset;
763 
764 	VOP_RWUNLOCK(vp, rwflag, NULL);
765 
766 	if (error == EINTR && count != 0)
767 		error = 0;
768 out:
769 	if (in_crit)
770 		nbl_end_crit(vp);
771 	releasef(fdes);
772 	if (error)
773 		return (set_errno(error));
774 	return (count);
775 }
776 
777 ssize_t
778 writev(int fdes, struct iovec *iovp, int iovcnt)
779 {
780 	struct uio auio;
781 	struct iovec aiov[DEF_IOV_MAX];
782 	file_t *fp;
783 	register vnode_t *vp;
784 	struct cpu *cp;
785 	int fflag, ioflag, rwflag;
786 	ssize_t count, bcount;
787 	int error = 0;
788 	int i;
789 	u_offset_t fileoff;
790 	int in_crit = 0;
791 
792 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
793 		return (set_errno(EINVAL));
794 
795 #ifdef _SYSCALL32_IMPL
796 	/*
797 	 * 32-bit callers need to have their iovec expanded,
798 	 * while ensuring that they can't move more than 2Gbytes
799 	 * of data in a single call.
800 	 */
801 	if (get_udatamodel() == DATAMODEL_ILP32) {
802 		struct iovec32 aiov32[DEF_IOV_MAX];
803 		ssize32_t count32;
804 
805 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
806 			return (set_errno(EFAULT));
807 
808 		count32 = 0;
809 		for (i = 0; i < iovcnt; i++) {
810 			ssize32_t iovlen = aiov32[i].iov_len;
811 			count32 += iovlen;
812 			if (iovlen < 0 || count32 < 0)
813 				return (set_errno(EINVAL));
814 			aiov[i].iov_len = iovlen;
815 			aiov[i].iov_base =
816 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
817 		}
818 	} else
819 #endif
820 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
821 		return (set_errno(EFAULT));
822 
823 	count = 0;
824 	for (i = 0; i < iovcnt; i++) {
825 		ssize_t iovlen = aiov[i].iov_len;
826 		count += iovlen;
827 		if (iovlen < 0 || count < 0)
828 			return (set_errno(EINVAL));
829 	}
830 	if ((fp = getf(fdes)) == NULL)
831 		return (set_errno(EBADF));
832 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
833 		error = EBADF;
834 		goto out;
835 	}
836 	vp = fp->f_vnode;
837 	if (vp->v_type == VREG && count == 0) {
838 		goto out;
839 	}
840 
841 	rwflag = 1;
842 
843 	/*
844 	 * We have to enter the critical region before calling VOP_RWLOCK
845 	 * to avoid a deadlock with ufs.
846 	 */
847 	if (nbl_need_check(vp)) {
848 		int svmand;
849 
850 		nbl_start_crit(vp, RW_READER);
851 		in_crit = 1;
852 		error = nbl_svmand(vp, fp->f_cred, &svmand);
853 		if (error != 0)
854 			goto out;
855 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
856 		    NULL)) {
857 			error = EACCES;
858 			goto out;
859 		}
860 	}
861 
862 	(void) VOP_RWLOCK(vp, rwflag, NULL);
863 
864 	fileoff = fp->f_offset;
865 
866 	/*
867 	 * Behaviour is same as write. Please see comments for write.
868 	 */
869 
870 	if (vp->v_type == VREG) {
871 		if (fileoff >= curproc->p_fsz_ctl) {
872 			VOP_RWUNLOCK(vp, rwflag, NULL);
873 			mutex_enter(&curproc->p_lock);
874 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
875 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
876 			mutex_exit(&curproc->p_lock);
877 			error = EFBIG;
878 			goto out;
879 		}
880 		if (fileoff >= OFFSET_MAX(fp)) {
881 			VOP_RWUNLOCK(vp, rwflag, NULL);
882 			error = EFBIG;
883 			goto out;
884 		}
885 		if (fileoff + count > OFFSET_MAX(fp))
886 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
887 	}
888 	auio.uio_loffset = fileoff;
889 	auio.uio_iov = aiov;
890 	auio.uio_iovcnt = iovcnt;
891 	auio.uio_resid = bcount = count;
892 	auio.uio_segflg = UIO_USERSPACE;
893 	auio.uio_llimit = curproc->p_fsz_ctl;
894 	auio.uio_fmode = fflag;
895 	auio.uio_extflg = UIO_COPY_DEFAULT;
896 
897 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
898 
899 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
900 	count -= auio.uio_resid;
901 	CPU_STATS_ENTER_K();
902 	cp = CPU;
903 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
904 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
905 	CPU_STATS_EXIT_K();
906 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
907 
908 	if (vp->v_type == VFIFO)	/* Backward compatibility */
909 		fp->f_offset = count;
910 	else if (((fp->f_flag & FAPPEND) == 0) ||
911 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
912 		fp->f_offset = auio.uio_loffset;
913 	VOP_RWUNLOCK(vp, rwflag, NULL);
914 
915 	if (error == EINTR && count != 0)
916 		error = 0;
917 out:
918 	if (in_crit)
919 		nbl_end_crit(vp);
920 	releasef(fdes);
921 	if (error)
922 		return (set_errno(error));
923 	return (count);
924 }
925 
926 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
927 
928 /*
929  * This syscall supplies 64-bit file offsets to 32-bit applications only.
930  */
931 ssize32_t
932 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
933     uint32_t offset_2)
934 {
935 	struct uio auio;
936 	struct iovec aiov;
937 	file_t *fp;
938 	register vnode_t *vp;
939 	struct cpu *cp;
940 	int fflag, ioflag, rwflag;
941 	ssize_t bcount;
942 	int error = 0;
943 	u_offset_t fileoff;
944 	int in_crit = 0;
945 
946 #if defined(_LITTLE_ENDIAN)
947 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
948 #else
949 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
950 #endif
951 
952 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
953 		return (set_errno(EINVAL));
954 
955 	if ((fp = getf(fdes)) == NULL)
956 		return (set_errno(EBADF));
957 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
958 		error = EBADF;
959 		goto out;
960 	}
961 
962 	rwflag = 0;
963 	vp = fp->f_vnode;
964 
965 	if (vp->v_type == VREG) {
966 
967 		if (bcount == 0)
968 			goto out;
969 
970 		/*
971 		 * Same as pread. See comments in pread.
972 		 */
973 
974 		if (fileoff > MAXOFFSET_T) {
975 			error = EINVAL;
976 			goto out;
977 		}
978 		if (fileoff + bcount > MAXOFFSET_T)
979 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
980 	} else if (vp->v_type == VFIFO) {
981 		error = ESPIPE;
982 		goto out;
983 	}
984 
985 	/*
986 	 * We have to enter the critical region before calling VOP_RWLOCK
987 	 * to avoid a deadlock with ufs.
988 	 */
989 	if (nbl_need_check(vp)) {
990 		int svmand;
991 
992 		nbl_start_crit(vp, RW_READER);
993 		in_crit = 1;
994 		error = nbl_svmand(vp, fp->f_cred, &svmand);
995 		if (error != 0)
996 			goto out;
997 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
998 		    NULL)) {
999 			error = EACCES;
1000 			goto out;
1001 		}
1002 	}
1003 
1004 	aiov.iov_base = cbuf;
1005 	aiov.iov_len = bcount;
1006 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1007 	auio.uio_loffset = fileoff;
1008 
1009 	/*
1010 	 * Note: File size can never be greater than MAXOFFSET_T.
1011 	 * If ever we start supporting 128 bit files the code
1012 	 * similar to the one in pread at this place should be here.
1013 	 * Here we avoid the unnecessary VOP_GETATTR() when we
1014 	 * know that fileoff == MAXOFFSET_T implies that it is always
1015 	 * greater than or equal to file size.
1016 	 */
1017 	auio.uio_iov = &aiov;
1018 	auio.uio_iovcnt = 1;
1019 	auio.uio_resid = bcount;
1020 	auio.uio_segflg = UIO_USERSPACE;
1021 	auio.uio_llimit = MAXOFFSET_T;
1022 	auio.uio_fmode = fflag;
1023 	auio.uio_extflg = UIO_COPY_CACHED;
1024 
1025 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1026 
1027 	/* If read sync is not asked for, filter sync flags */
1028 	if ((ioflag & FRSYNC) == 0)
1029 		ioflag &= ~(FSYNC|FDSYNC);
1030 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1031 	bcount -= auio.uio_resid;
1032 	CPU_STATS_ENTER_K();
1033 	cp = CPU;
1034 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1035 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1036 	CPU_STATS_EXIT_K();
1037 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1038 	VOP_RWUNLOCK(vp, rwflag, NULL);
1039 
1040 	if (error == EINTR && bcount != 0)
1041 		error = 0;
1042 out:
1043 	if (in_crit)
1044 		nbl_end_crit(vp);
1045 	releasef(fdes);
1046 	if (error)
1047 		return (set_errno(error));
1048 	return (bcount);
1049 }
1050 
1051 /*
1052  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1053  */
1054 ssize32_t
1055 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1056     uint32_t offset_2)
1057 {
1058 	struct uio auio;
1059 	struct iovec aiov;
1060 	file_t *fp;
1061 	register vnode_t *vp;
1062 	struct cpu *cp;
1063 	int fflag, ioflag, rwflag;
1064 	ssize_t bcount;
1065 	int error = 0;
1066 	u_offset_t fileoff;
1067 	int in_crit = 0;
1068 
1069 #if defined(_LITTLE_ENDIAN)
1070 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1071 #else
1072 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1073 #endif
1074 
1075 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1076 		return (set_errno(EINVAL));
1077 	if ((fp = getf(fdes)) == NULL)
1078 		return (set_errno(EBADF));
1079 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1080 		error = EBADF;
1081 		goto out;
1082 	}
1083 
1084 	rwflag = 1;
1085 	vp = fp->f_vnode;
1086 
1087 	if (vp->v_type == VREG) {
1088 
1089 		if (bcount == 0)
1090 			goto out;
1091 
1092 		/*
1093 		 * See comments in pwrite.
1094 		 */
1095 		if (fileoff > MAXOFFSET_T) {
1096 			error = EINVAL;
1097 			goto out;
1098 		}
1099 		if (fileoff >= curproc->p_fsz_ctl) {
1100 			mutex_enter(&curproc->p_lock);
1101 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1102 			    curproc->p_rctls, curproc, RCA_SAFE);
1103 			mutex_exit(&curproc->p_lock);
1104 			error = EFBIG;
1105 			goto out;
1106 		}
1107 		if (fileoff == MAXOFFSET_T) {
1108 			error = EFBIG;
1109 			goto out;
1110 		}
1111 		if (fileoff + bcount > MAXOFFSET_T)
1112 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1113 	} else if (vp->v_type == VFIFO) {
1114 		error = ESPIPE;
1115 		goto out;
1116 	}
1117 
1118 	/*
1119 	 * We have to enter the critical region before calling VOP_RWLOCK
1120 	 * to avoid a deadlock with ufs.
1121 	 */
1122 	if (nbl_need_check(vp)) {
1123 		int svmand;
1124 
1125 		nbl_start_crit(vp, RW_READER);
1126 		in_crit = 1;
1127 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1128 		if (error != 0)
1129 			goto out;
1130 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1131 		    NULL)) {
1132 			error = EACCES;
1133 			goto out;
1134 		}
1135 	}
1136 
1137 	aiov.iov_base = cbuf;
1138 	aiov.iov_len = bcount;
1139 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1140 	auio.uio_loffset = fileoff;
1141 	auio.uio_iov = &aiov;
1142 	auio.uio_iovcnt = 1;
1143 	auio.uio_resid = bcount;
1144 	auio.uio_segflg = UIO_USERSPACE;
1145 	auio.uio_llimit = curproc->p_fsz_ctl;
1146 	auio.uio_fmode = fflag;
1147 	auio.uio_extflg = UIO_COPY_CACHED;
1148 
1149 	/*
1150 	 * The SUSv4 POSIX specification states:
1151 	 *	The pwrite() function shall be equivalent to write(), except
1152 	 *	that it writes into a given position and does not change
1153 	 *	the file offset (regardless of whether O_APPEND is set).
1154 	 * To make this be true, we omit the FAPPEND flag from ioflag.
1155 	 */
1156 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1157 
1158 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1159 	bcount -= auio.uio_resid;
1160 	CPU_STATS_ENTER_K();
1161 	cp = CPU;
1162 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1163 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1164 	CPU_STATS_EXIT_K();
1165 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1166 	VOP_RWUNLOCK(vp, rwflag, NULL);
1167 
1168 	if (error == EINTR && bcount != 0)
1169 		error = 0;
1170 out:
1171 	if (in_crit)
1172 		nbl_end_crit(vp);
1173 	releasef(fdes);
1174 	if (error)
1175 		return (set_errno(error));
1176 	return (bcount);
1177 }
1178 
1179 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1180 
1181 #ifdef _SYSCALL32_IMPL
1182 /*
1183  * Tail-call elimination of xxx32() down to xxx()
1184  *
1185  * A number of xxx32 system calls take a len (or count) argument and
1186  * return a number in the range [0,len] or -1 on error.
1187  * Given an ssize32_t input len, the downcall xxx() will return
1188  * a 64-bit value that is -1 or in the range [0,len] which actually
1189  * is a proper return value for the xxx32 call. So even if the xxx32
1190  * calls can be considered as returning a ssize32_t, they are currently
1191  * declared as returning a ssize_t as this enables tail-call elimination.
1192  *
1193  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1194  * down negative input values as such and let the downcall handle error
 * reporting. Functions covered by this comment are:
1196  *
1197  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1198  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1199  * readlink.c:     readlink32.
1200  */
1201 
1202 ssize_t
1203 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1204 {
1205 	return (read(fdes,
1206 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1207 }
1208 
1209 ssize_t
1210 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1211 {
1212 	return (write(fdes,
1213 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1214 }
1215 
1216 ssize_t
1217 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1218 {
1219 	return (pread(fdes,
1220 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1221 	    (off_t)(uint32_t)offset));
1222 }
1223 
1224 ssize_t
1225 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1226 {
1227 	return (pwrite(fdes,
1228 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1229 	    (off_t)(uint32_t)offset));
1230 }
1231 
1232 ssize_t
1233 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1234 {
1235 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1236 }
1237 
1238 ssize_t
1239 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1240 {
1241 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1242 }
1243 
1244 #endif	/* _SYSCALL32_IMPL */
1245