xref: /titanic_50/usr/src/uts/common/syscall/rw.c (revision 5f9e250aa611c12bbaccc0be612e5b97ccca2762)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * Portions of this source code were derived from Berkeley 4.3 BSD
32  * under license from the Regents of the University of California.
33  */
34 
35 #pragma ident	"%Z%%M%	%I%	%E% SMI"
36 
37 #include <sys/param.h>
38 #include <sys/isa_defs.h>
39 #include <sys/types.h>
40 #include <sys/inttypes.h>
41 #include <sys/sysmacros.h>
42 #include <sys/cred.h>
43 #include <sys/user.h>
44 #include <sys/systm.h>
45 #include <sys/errno.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/cpuvar.h>
50 #include <sys/uio.h>
51 #include <sys/debug.h>
52 #include <sys/rctl.h>
53 #include <sys/nbmlock.h>
54 
/*
 * Size threshold used when filling in uio_extflg: read() and readv() set
 * UIO_COPY_CACHED for transfers smaller than copyout_min_size bytes and
 * UIO_COPY_DEFAULT for anything at or above it.
 */
#define	COPYOUT_MIN_SIZE	(1<<17)		/* 128K */

/* Run-time copy of the threshold, initialised from COPYOUT_MIN_SIZE. */
static size_t copyout_min_size = COPYOUT_MIN_SIZE;
58 
59 /*
60  * read, write, pread, pwrite, readv, and writev syscalls.
61  *
62  * 64-bit open:	all open's are large file opens.
63  * Large Files: the behaviour of read depends on whether the fd
64  *		corresponds to large open or not.
65  * 32-bit open:	FOFFMAX flag not set.
66  *		read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
67  *		EOVERFLOW if count is non-zero and if size of file
68  *		is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
69  *		at >= MAXOFF32_T returns EOF.
70  */
71 
72 /*
73  * Native system call
74  */
75 ssize_t
76 read(int fdes, void *cbuf, size_t count)
77 {
78 	struct uio auio;
79 	struct iovec aiov;
80 	file_t *fp;
81 	register vnode_t *vp;
82 	struct cpu *cp;
83 	int fflag, ioflag, rwflag;
84 	ssize_t cnt, bcount;
85 	int error = 0;
86 	u_offset_t fileoff;
87 	int in_crit = 0;
88 
89 	if ((cnt = (ssize_t)count) < 0)
90 		return (set_errno(EINVAL));
91 	if ((fp = getf(fdes)) == NULL)
92 		return (set_errno(EBADF));
93 	if (((fflag = fp->f_flag) & FREAD) == 0) {
94 		error = EBADF;
95 		goto out;
96 	}
97 	vp = fp->f_vnode;
98 
99 	if (vp->v_type == VREG && cnt == 0) {
100 		goto out;
101 	}
102 
103 	rwflag = 0;
104 	aiov.iov_base = cbuf;
105 	aiov.iov_len = cnt;
106 
107 	/*
108 	 * We have to enter the critical region before calling VOP_RWLOCK
109 	 * to avoid a deadlock with write() calls.
110 	 */
111 	if (nbl_need_check(vp)) {
112 		int svmand;
113 
114 		nbl_start_crit(vp, RW_READER);
115 		in_crit = 1;
116 		error = nbl_svmand(vp, fp->f_cred, &svmand);
117 		if (error != 0)
118 			goto out;
119 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand)) {
120 			error = EACCES;
121 			goto out;
122 		}
123 	}
124 
125 	(void) VOP_RWLOCK(vp, rwflag, NULL);
126 
127 	/*
128 	 * We do the following checks inside VOP_RWLOCK so as to
129 	 * prevent file size from changing while these checks are
130 	 * being done. Also, we load fp's offset to the local
131 	 * variable fileoff because we can have a parallel lseek
132 	 * going on (f_offset is not protected by any lock) which
133 	 * could change f_offset. We need to see the value only
134 	 * once here and take a decision. Seeing it more than once
135 	 * can lead to incorrect functionality.
136 	 */
137 
138 	fileoff = (u_offset_t)fp->f_offset;
139 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
140 		struct vattr va;
141 		va.va_mask = AT_SIZE;
142 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred)))  {
143 			VOP_RWUNLOCK(vp, rwflag, NULL);
144 			goto out;
145 		}
146 		if (fileoff >= va.va_size) {
147 			cnt = 0;
148 			VOP_RWUNLOCK(vp, rwflag, NULL);
149 			goto out;
150 		} else {
151 			error = EOVERFLOW;
152 			VOP_RWUNLOCK(vp, rwflag, NULL);
153 			goto out;
154 		}
155 	}
156 	if ((vp->v_type == VREG) &&
157 	    (fileoff + cnt > OFFSET_MAX(fp))) {
158 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
159 	}
160 	auio.uio_loffset = fileoff;
161 	auio.uio_iov = &aiov;
162 	auio.uio_iovcnt = 1;
163 	auio.uio_resid = bcount = cnt;
164 	auio.uio_segflg = UIO_USERSPACE;
165 	auio.uio_llimit = MAXOFFSET_T;
166 	auio.uio_fmode = fflag;
167 	/*
168 	 * Only use bypass caches when the count is large enough
169 	 */
170 	if (bcount < copyout_min_size)
171 		auio.uio_extflg = UIO_COPY_CACHED;
172 	else
173 		auio.uio_extflg = UIO_COPY_DEFAULT;
174 
175 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
176 
177 	/* If read sync is not asked for, filter sync flags */
178 	if ((ioflag & FRSYNC) == 0)
179 		ioflag &= ~(FSYNC|FDSYNC);
180 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
181 	cnt -= auio.uio_resid;
182 	CPU_STATS_ENTER_K();
183 	cp = CPU;
184 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
185 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
186 	CPU_STATS_EXIT_K();
187 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
188 
189 	if (vp->v_type == VFIFO)	/* Backward compatibility */
190 		fp->f_offset = cnt;
191 	else if (((fp->f_flag & FAPPEND) == 0) ||
192 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
193 		fp->f_offset = auio.uio_loffset;
194 	VOP_RWUNLOCK(vp, rwflag, NULL);
195 
196 	if (error == EINTR && cnt != 0)
197 		error = 0;
198 out:
199 	if (in_crit)
200 		nbl_end_crit(vp);
201 	releasef(fdes);
202 	if (error)
203 		return (set_errno(error));
204 	return (cnt);
205 }
206 
207 /*
208  * Native system call
209  */
210 ssize_t
211 write(int fdes, void *cbuf, size_t count)
212 {
213 	struct uio auio;
214 	struct iovec aiov;
215 	file_t *fp;
216 	register vnode_t *vp;
217 	struct cpu *cp;
218 	int fflag, ioflag, rwflag;
219 	ssize_t cnt, bcount;
220 	int error = 0;
221 	u_offset_t fileoff;
222 	int in_crit = 0;
223 
224 	if ((cnt = (ssize_t)count) < 0)
225 		return (set_errno(EINVAL));
226 	if ((fp = getf(fdes)) == NULL)
227 		return (set_errno(EBADF));
228 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
229 		error = EBADF;
230 		goto out;
231 	}
232 	vp = fp->f_vnode;
233 
234 	if (vp->v_type == VREG && cnt == 0) {
235 		goto out;
236 	}
237 
238 	rwflag = 1;
239 	aiov.iov_base = cbuf;
240 	aiov.iov_len = cnt;
241 
242 	/*
243 	 * We have to enter the critical region before calling VOP_RWLOCK
244 	 * to avoid a deadlock with ufs.
245 	 */
246 	if (nbl_need_check(vp)) {
247 		int svmand;
248 
249 		nbl_start_crit(vp, RW_READER);
250 		in_crit = 1;
251 		error = nbl_svmand(vp, fp->f_cred, &svmand);
252 		if (error != 0)
253 			goto out;
254 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand)) {
255 			error = EACCES;
256 			goto out;
257 		}
258 	}
259 
260 	(void) VOP_RWLOCK(vp, rwflag, NULL);
261 
262 	fileoff = fp->f_offset;
263 	if (vp->v_type == VREG) {
264 
265 		/*
266 		 * We raise psignal if write for >0 bytes causes
267 		 * it to exceed the ulimit.
268 		 */
269 		if (fileoff >= curproc->p_fsz_ctl) {
270 			VOP_RWUNLOCK(vp, rwflag, NULL);
271 
272 			mutex_enter(&curproc->p_lock);
273 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
274 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
275 			mutex_exit(&curproc->p_lock);
276 
277 			error = EFBIG;
278 			goto out;
279 		}
280 		/*
281 		 * We return EFBIG if write is done at an offset
282 		 * greater than the offset maximum for this file structure.
283 		 */
284 
285 		if (fileoff >= OFFSET_MAX(fp)) {
286 			VOP_RWUNLOCK(vp, rwflag, NULL);
287 			error = EFBIG;
288 			goto out;
289 		}
290 		/*
291 		 * Limit the bytes to be written  upto offset maximum for
292 		 * this open file structure.
293 		 */
294 		if (fileoff + cnt > OFFSET_MAX(fp))
295 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
296 	}
297 	auio.uio_loffset = fileoff;
298 	auio.uio_iov = &aiov;
299 	auio.uio_iovcnt = 1;
300 	auio.uio_resid = bcount = cnt;
301 	auio.uio_segflg = UIO_USERSPACE;
302 	auio.uio_llimit = curproc->p_fsz_ctl;
303 	auio.uio_fmode = fflag;
304 	auio.uio_extflg = UIO_COPY_DEFAULT;
305 
306 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
307 
308 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
309 	cnt -= auio.uio_resid;
310 	CPU_STATS_ENTER_K();
311 	cp = CPU;
312 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
313 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
314 	CPU_STATS_EXIT_K();
315 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
316 
317 	if (vp->v_type == VFIFO)	/* Backward compatibility */
318 		fp->f_offset = cnt;
319 	else if (((fp->f_flag & FAPPEND) == 0) ||
320 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
321 		fp->f_offset = auio.uio_loffset;
322 	VOP_RWUNLOCK(vp, rwflag, NULL);
323 
324 	if (error == EINTR && cnt != 0)
325 		error = 0;
326 out:
327 	if (in_crit)
328 		nbl_end_crit(vp);
329 	releasef(fdes);
330 	if (error)
331 		return (set_errno(error));
332 	return (cnt);
333 }
334 
335 ssize_t
336 pread(int fdes, void *cbuf, size_t count, off_t offset)
337 {
338 	struct uio auio;
339 	struct iovec aiov;
340 	file_t *fp;
341 	register vnode_t *vp;
342 	struct cpu *cp;
343 	int fflag, ioflag, rwflag;
344 	ssize_t bcount;
345 	int error = 0;
346 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
347 #ifdef _SYSCALL32_IMPL
348 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
349 		MAXOFF32_T : MAXOFFSET_T;
350 #else
351 	const u_offset_t maxoff = MAXOFF32_T;
352 #endif
353 	int in_crit = 0;
354 
355 	if ((bcount = (ssize_t)count) < 0)
356 		return (set_errno(EINVAL));
357 
358 	if ((fp = getf(fdes)) == NULL)
359 		return (set_errno(EBADF));
360 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
361 		error = EBADF;
362 		goto out;
363 	}
364 
365 	rwflag = 0;
366 	vp = fp->f_vnode;
367 
368 	if (vp->v_type == VREG) {
369 
370 		if (bcount == 0)
371 			goto out;
372 
373 		/*
374 		 * Return EINVAL if an invalid offset comes to pread.
375 		 * Negative offset from user will cause this error.
376 		 */
377 
378 		if (fileoff > maxoff) {
379 			error = EINVAL;
380 			goto out;
381 		}
382 		/*
383 		 * Limit offset such that we don't read or write
384 		 * a file beyond the maximum offset representable in
385 		 * an off_t structure.
386 		 */
387 		if (fileoff + bcount > maxoff)
388 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
389 	} else if (vp->v_type == VFIFO) {
390 		error = ESPIPE;
391 		goto out;
392 	}
393 
394 	/*
395 	 * We have to enter the critical region before calling VOP_RWLOCK
396 	 * to avoid a deadlock with ufs.
397 	 */
398 	if (nbl_need_check(vp)) {
399 		int svmand;
400 
401 		nbl_start_crit(vp, RW_READER);
402 		in_crit = 1;
403 		error = nbl_svmand(vp, fp->f_cred, &svmand);
404 		if (error != 0)
405 			goto out;
406 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
407 			error = EACCES;
408 			goto out;
409 		}
410 	}
411 
412 	aiov.iov_base = cbuf;
413 	aiov.iov_len = bcount;
414 	(void) VOP_RWLOCK(vp, rwflag, NULL);
415 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
416 		struct vattr va;
417 		va.va_mask = AT_SIZE;
418 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) {
419 			VOP_RWUNLOCK(vp, rwflag, NULL);
420 			goto out;
421 		}
422 		VOP_RWUNLOCK(vp, rwflag, NULL);
423 
424 		/*
425 		 * We have to return EOF if fileoff is >= file size.
426 		 */
427 		if (fileoff >= va.va_size) {
428 			bcount = 0;
429 			goto out;
430 		}
431 
432 		/*
433 		 * File is greater than or equal to maxoff and therefore
434 		 * we return EOVERFLOW.
435 		 */
436 		error = EOVERFLOW;
437 		goto out;
438 	}
439 	auio.uio_loffset = fileoff;
440 	auio.uio_iov = &aiov;
441 	auio.uio_iovcnt = 1;
442 	auio.uio_resid = bcount;
443 	auio.uio_segflg = UIO_USERSPACE;
444 	auio.uio_llimit = MAXOFFSET_T;
445 	auio.uio_fmode = fflag;
446 	auio.uio_extflg = UIO_COPY_CACHED;
447 
448 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
449 
450 	/* If read sync is not asked for, filter sync flags */
451 	if ((ioflag & FRSYNC) == 0)
452 		ioflag &= ~(FSYNC|FDSYNC);
453 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
454 	bcount -= auio.uio_resid;
455 	CPU_STATS_ENTER_K();
456 	cp = CPU;
457 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
458 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
459 	CPU_STATS_EXIT_K();
460 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
461 	VOP_RWUNLOCK(vp, rwflag, NULL);
462 
463 	if (error == EINTR && bcount != 0)
464 		error = 0;
465 out:
466 	if (in_crit)
467 		nbl_end_crit(vp);
468 	releasef(fdes);
469 	if (error)
470 		return (set_errno(error));
471 	return (bcount);
472 }
473 
474 ssize_t
475 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
476 {
477 	struct uio auio;
478 	struct iovec aiov;
479 	file_t *fp;
480 	register vnode_t *vp;
481 	struct cpu *cp;
482 	int fflag, ioflag, rwflag;
483 	ssize_t bcount;
484 	int error = 0;
485 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
486 #ifdef _SYSCALL32_IMPL
487 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
488 		MAXOFF32_T : MAXOFFSET_T;
489 #else
490 	const u_offset_t maxoff = MAXOFF32_T;
491 #endif
492 	int in_crit = 0;
493 
494 	if ((bcount = (ssize_t)count) < 0)
495 		return (set_errno(EINVAL));
496 	if ((fp = getf(fdes)) == NULL)
497 		return (set_errno(EBADF));
498 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
499 		error = EBADF;
500 		goto out;
501 	}
502 
503 	rwflag = 1;
504 	vp = fp->f_vnode;
505 
506 	if (vp->v_type == VREG) {
507 
508 		if (bcount == 0)
509 			goto out;
510 
511 		/*
512 		 * return EINVAL for offsets that cannot be
513 		 * represented in an off_t.
514 		 */
515 		if (fileoff > maxoff) {
516 			error = EINVAL;
517 			goto out;
518 		}
519 		/*
520 		 * Take appropriate action if we are trying to write above the
521 		 * resource limit.
522 		 */
523 		if (fileoff >= curproc->p_fsz_ctl) {
524 			mutex_enter(&curproc->p_lock);
525 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
526 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
527 			mutex_exit(&curproc->p_lock);
528 
529 			error = EFBIG;
530 			goto out;
531 		}
532 		/*
533 		 * Don't allow pwrite to cause file sizes to exceed
534 		 * maxoff.
535 		 */
536 		if (fileoff == maxoff) {
537 			error = EFBIG;
538 			goto out;
539 		}
540 		if (fileoff + count > maxoff)
541 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
542 	} else if (vp->v_type == VFIFO) {
543 		error = ESPIPE;
544 		goto out;
545 	}
546 
547 	/*
548 	 * We have to enter the critical region before calling VOP_RWLOCK
549 	 * to avoid a deadlock with ufs.
550 	 */
551 	if (nbl_need_check(vp)) {
552 		int svmand;
553 
554 		nbl_start_crit(vp, RW_READER);
555 		in_crit = 1;
556 		error = nbl_svmand(vp, fp->f_cred, &svmand);
557 		if (error != 0)
558 			goto out;
559 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
560 			error = EACCES;
561 			goto out;
562 		}
563 	}
564 
565 	aiov.iov_base = cbuf;
566 	aiov.iov_len = bcount;
567 	(void) VOP_RWLOCK(vp, rwflag, NULL);
568 	auio.uio_loffset = fileoff;
569 	auio.uio_iov = &aiov;
570 	auio.uio_iovcnt = 1;
571 	auio.uio_resid = bcount;
572 	auio.uio_segflg = UIO_USERSPACE;
573 	auio.uio_llimit = curproc->p_fsz_ctl;
574 	auio.uio_fmode = fflag;
575 	auio.uio_extflg = UIO_COPY_CACHED;
576 
577 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
578 
579 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
580 	bcount -= auio.uio_resid;
581 	CPU_STATS_ENTER_K();
582 	cp = CPU;
583 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
584 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
585 	CPU_STATS_EXIT_K();
586 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
587 	VOP_RWUNLOCK(vp, rwflag, NULL);
588 
589 	if (error == EINTR && bcount != 0)
590 		error = 0;
591 out:
592 	if (in_crit)
593 		nbl_end_crit(vp);
594 	releasef(fdes);
595 	if (error)
596 		return (set_errno(error));
597 	return (bcount);
598 }
599 
/*
 * Largest iovcnt accepted by readv() and writev(): both reject a count
 * that is <= 0 or greater than this with EINVAL, and both size their
 * on-stack iovec arrays with it.
 *
 * XXX -- The SVID refers to IOV_MAX, but doesn't define it.
 * XXX -- However, SVVS expects readv() and writev() to fail if
 * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
 * XXX -- so that value is treated as the "interface".
 */
#define	DEF_IOV_MAX	16
607 
608 ssize_t
609 readv(int fdes, struct iovec *iovp, int iovcnt)
610 {
611 	struct uio auio;
612 	struct iovec aiov[DEF_IOV_MAX];
613 	file_t *fp;
614 	register vnode_t *vp;
615 	struct cpu *cp;
616 	int fflag, ioflag, rwflag;
617 	ssize_t count, bcount;
618 	int error = 0;
619 	int i;
620 	u_offset_t fileoff;
621 	int in_crit = 0;
622 
623 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
624 		return (set_errno(EINVAL));
625 
626 #ifdef _SYSCALL32_IMPL
627 	/*
628 	 * 32-bit callers need to have their iovec expanded,
629 	 * while ensuring that they can't move more than 2Gbytes
630 	 * of data in a single call.
631 	 */
632 	if (get_udatamodel() == DATAMODEL_ILP32) {
633 		struct iovec32 aiov32[DEF_IOV_MAX];
634 		ssize32_t count32;
635 
636 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
637 			return (set_errno(EFAULT));
638 
639 		count32 = 0;
640 		for (i = 0; i < iovcnt; i++) {
641 			ssize32_t iovlen32 = aiov32[i].iov_len;
642 			count32 += iovlen32;
643 			if (iovlen32 < 0 || count32 < 0)
644 				return (set_errno(EINVAL));
645 			aiov[i].iov_len = iovlen32;
646 			aiov[i].iov_base =
647 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
648 		}
649 	} else
650 #endif
651 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
652 		return (set_errno(EFAULT));
653 
654 	count = 0;
655 	for (i = 0; i < iovcnt; i++) {
656 		ssize_t iovlen = aiov[i].iov_len;
657 		count += iovlen;
658 		if (iovlen < 0 || count < 0)
659 			return (set_errno(EINVAL));
660 	}
661 	if ((fp = getf(fdes)) == NULL)
662 		return (set_errno(EBADF));
663 	if (((fflag = fp->f_flag) & FREAD) == 0) {
664 		error = EBADF;
665 		goto out;
666 	}
667 	vp = fp->f_vnode;
668 	if (vp->v_type == VREG && count == 0) {
669 		goto out;
670 	}
671 
672 	rwflag = 0;
673 
674 	/*
675 	 * We have to enter the critical region before calling VOP_RWLOCK
676 	 * to avoid a deadlock with ufs.
677 	 */
678 	if (nbl_need_check(vp)) {
679 		int svmand;
680 
681 		nbl_start_crit(vp, RW_READER);
682 		in_crit = 1;
683 		error = nbl_svmand(vp, fp->f_cred, &svmand);
684 		if (error != 0)
685 			goto out;
686 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand)) {
687 			error = EACCES;
688 			goto out;
689 		}
690 	}
691 
692 	(void) VOP_RWLOCK(vp, rwflag, NULL);
693 	fileoff = fp->f_offset;
694 
695 	/*
696 	 * Behaviour is same as read. Please see comments in read.
697 	 */
698 
699 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
700 		struct vattr va;
701 		va.va_mask = AT_SIZE;
702 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred)))  {
703 			VOP_RWUNLOCK(vp, rwflag, NULL);
704 			goto out;
705 		}
706 		if (fileoff >= va.va_size) {
707 			VOP_RWUNLOCK(vp, rwflag, NULL);
708 			count = 0;
709 			goto out;
710 		} else {
711 			VOP_RWUNLOCK(vp, rwflag, NULL);
712 			error = EOVERFLOW;
713 			goto out;
714 		}
715 	}
716 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
717 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
718 	}
719 	auio.uio_loffset = fileoff;
720 	auio.uio_iov = aiov;
721 	auio.uio_iovcnt = iovcnt;
722 	auio.uio_resid = bcount = count;
723 	auio.uio_segflg = UIO_USERSPACE;
724 	auio.uio_llimit = MAXOFFSET_T;
725 	auio.uio_fmode = fflag;
726 	if (bcount < copyout_min_size)
727 		auio.uio_extflg = UIO_COPY_CACHED;
728 	else
729 		auio.uio_extflg = UIO_COPY_DEFAULT;
730 
731 
732 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
733 
734 	/* If read sync is not asked for, filter sync flags */
735 	if ((ioflag & FRSYNC) == 0)
736 		ioflag &= ~(FSYNC|FDSYNC);
737 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
738 	count -= auio.uio_resid;
739 	CPU_STATS_ENTER_K();
740 	cp = CPU;
741 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
742 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
743 	CPU_STATS_EXIT_K();
744 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
745 
746 	if (vp->v_type == VFIFO)	/* Backward compatibility */
747 		fp->f_offset = count;
748 	else if (((fp->f_flag & FAPPEND) == 0) ||
749 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
750 		fp->f_offset = auio.uio_loffset;
751 
752 	VOP_RWUNLOCK(vp, rwflag, NULL);
753 
754 	if (error == EINTR && count != 0)
755 		error = 0;
756 out:
757 	if (in_crit)
758 		nbl_end_crit(vp);
759 	releasef(fdes);
760 	if (error)
761 		return (set_errno(error));
762 	return (count);
763 }
764 
765 ssize_t
766 writev(int fdes, struct iovec *iovp, int iovcnt)
767 {
768 	struct uio auio;
769 	struct iovec aiov[DEF_IOV_MAX];
770 	file_t *fp;
771 	register vnode_t *vp;
772 	struct cpu *cp;
773 	int fflag, ioflag, rwflag;
774 	ssize_t count, bcount;
775 	int error = 0;
776 	int i;
777 	u_offset_t fileoff;
778 	int in_crit = 0;
779 
780 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
781 		return (set_errno(EINVAL));
782 
783 #ifdef _SYSCALL32_IMPL
784 	/*
785 	 * 32-bit callers need to have their iovec expanded,
786 	 * while ensuring that they can't move more than 2Gbytes
787 	 * of data in a single call.
788 	 */
789 	if (get_udatamodel() == DATAMODEL_ILP32) {
790 		struct iovec32 aiov32[DEF_IOV_MAX];
791 		ssize32_t count32;
792 
793 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
794 			return (set_errno(EFAULT));
795 
796 		count32 = 0;
797 		for (i = 0; i < iovcnt; i++) {
798 			ssize32_t iovlen = aiov32[i].iov_len;
799 			count32 += iovlen;
800 			if (iovlen < 0 || count32 < 0)
801 				return (set_errno(EINVAL));
802 			aiov[i].iov_len = iovlen;
803 			aiov[i].iov_base =
804 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
805 		}
806 	} else
807 #endif
808 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
809 		return (set_errno(EFAULT));
810 
811 	count = 0;
812 	for (i = 0; i < iovcnt; i++) {
813 		ssize_t iovlen = aiov[i].iov_len;
814 		count += iovlen;
815 		if (iovlen < 0 || count < 0)
816 			return (set_errno(EINVAL));
817 	}
818 	if ((fp = getf(fdes)) == NULL)
819 		return (set_errno(EBADF));
820 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
821 		error = EBADF;
822 		goto out;
823 	}
824 	vp = fp->f_vnode;
825 	if (vp->v_type == VREG && count == 0) {
826 		goto out;
827 	}
828 
829 	rwflag = 1;
830 
831 	/*
832 	 * We have to enter the critical region before calling VOP_RWLOCK
833 	 * to avoid a deadlock with ufs.
834 	 */
835 	if (nbl_need_check(vp)) {
836 		int svmand;
837 
838 		nbl_start_crit(vp, RW_READER);
839 		in_crit = 1;
840 		error = nbl_svmand(vp, fp->f_cred, &svmand);
841 		if (error != 0)
842 			goto out;
843 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand)) {
844 			error = EACCES;
845 			goto out;
846 		}
847 	}
848 
849 	(void) VOP_RWLOCK(vp, rwflag, NULL);
850 
851 	fileoff = fp->f_offset;
852 
853 	/*
854 	 * Behaviour is same as write. Please see comments for write.
855 	 */
856 
857 	if (vp->v_type == VREG) {
858 		if (fileoff >= curproc->p_fsz_ctl) {
859 			VOP_RWUNLOCK(vp, rwflag, NULL);
860 			mutex_enter(&curproc->p_lock);
861 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
862 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
863 			mutex_exit(&curproc->p_lock);
864 			error = EFBIG;
865 			goto out;
866 		}
867 		if (fileoff >= OFFSET_MAX(fp)) {
868 			VOP_RWUNLOCK(vp, rwflag, NULL);
869 			error = EFBIG;
870 			goto out;
871 		}
872 		if (fileoff + count > OFFSET_MAX(fp))
873 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
874 	}
875 	auio.uio_loffset = fileoff;
876 	auio.uio_iov = aiov;
877 	auio.uio_iovcnt = iovcnt;
878 	auio.uio_resid = bcount = count;
879 	auio.uio_segflg = UIO_USERSPACE;
880 	auio.uio_llimit = curproc->p_fsz_ctl;
881 	auio.uio_fmode = fflag;
882 	auio.uio_extflg = UIO_COPY_DEFAULT;
883 
884 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
885 
886 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
887 	count -= auio.uio_resid;
888 	CPU_STATS_ENTER_K();
889 	cp = CPU;
890 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
891 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
892 	CPU_STATS_EXIT_K();
893 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
894 
895 	if (vp->v_type == VFIFO)	/* Backward compatibility */
896 		fp->f_offset = count;
897 	else if (((fp->f_flag & FAPPEND) == 0) ||
898 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
899 		fp->f_offset = auio.uio_loffset;
900 	VOP_RWUNLOCK(vp, rwflag, NULL);
901 
902 	if (error == EINTR && count != 0)
903 		error = 0;
904 out:
905 	if (in_crit)
906 		nbl_end_crit(vp);
907 	releasef(fdes);
908 	if (error)
909 		return (set_errno(error));
910 	return (count);
911 }
912 
913 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
914 
915 /*
916  * This syscall supplies 64-bit file offsets to 32-bit applications only.
917  */
918 ssize32_t
919 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
920     uint32_t offset_2)
921 {
922 	struct uio auio;
923 	struct iovec aiov;
924 	file_t *fp;
925 	register vnode_t *vp;
926 	struct cpu *cp;
927 	int fflag, ioflag, rwflag;
928 	ssize_t bcount;
929 	int error = 0;
930 	u_offset_t fileoff;
931 	int in_crit = 0;
932 
933 #if defined(_LITTLE_ENDIAN)
934 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
935 #else
936 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
937 #endif
938 
939 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
940 		return (set_errno(EINVAL));
941 
942 	if ((fp = getf(fdes)) == NULL)
943 		return (set_errno(EBADF));
944 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
945 		error = EBADF;
946 		goto out;
947 	}
948 
949 	rwflag = 0;
950 	vp = fp->f_vnode;
951 
952 	if (vp->v_type == VREG) {
953 
954 		if (bcount == 0)
955 			goto out;
956 
957 		/*
958 		 * Same as pread. See comments in pread.
959 		 */
960 
961 		if (fileoff > MAXOFFSET_T) {
962 			error = EINVAL;
963 			goto out;
964 		}
965 		if (fileoff + bcount > MAXOFFSET_T)
966 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
967 	} else if (vp->v_type == VFIFO) {
968 		error = ESPIPE;
969 		goto out;
970 	}
971 
972 	/*
973 	 * We have to enter the critical region before calling VOP_RWLOCK
974 	 * to avoid a deadlock with ufs.
975 	 */
976 	if (nbl_need_check(vp)) {
977 		int svmand;
978 
979 		nbl_start_crit(vp, RW_READER);
980 		in_crit = 1;
981 		error = nbl_svmand(vp, fp->f_cred, &svmand);
982 		if (error != 0)
983 			goto out;
984 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
985 			error = EACCES;
986 			goto out;
987 		}
988 	}
989 
990 	aiov.iov_base = cbuf;
991 	aiov.iov_len = bcount;
992 	(void) VOP_RWLOCK(vp, rwflag, NULL);
993 	auio.uio_loffset = fileoff;
994 
995 	/*
996 	 * Note: File size can never be greater than MAXOFFSET_T.
997 	 * If ever we start supporting 128 bit files the code
998 	 * similar to the one in pread at this place should be here.
999 	 * Here we avoid the unnecessary VOP_GETATTR() when we
1000 	 * know that fileoff == MAXOFFSET_T implies that it is always
1001 	 * greater than or equal to file size.
1002 	 */
1003 	auio.uio_iov = &aiov;
1004 	auio.uio_iovcnt = 1;
1005 	auio.uio_resid = bcount;
1006 	auio.uio_segflg = UIO_USERSPACE;
1007 	auio.uio_llimit = MAXOFFSET_T;
1008 	auio.uio_fmode = fflag;
1009 	auio.uio_extflg = UIO_COPY_CACHED;
1010 
1011 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1012 
1013 	/* If read sync is not asked for, filter sync flags */
1014 	if ((ioflag & FRSYNC) == 0)
1015 		ioflag &= ~(FSYNC|FDSYNC);
1016 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1017 	bcount -= auio.uio_resid;
1018 	CPU_STATS_ENTER_K();
1019 	cp = CPU;
1020 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1021 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1022 	CPU_STATS_EXIT_K();
1023 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1024 	VOP_RWUNLOCK(vp, rwflag, NULL);
1025 
1026 	if (error == EINTR && bcount != 0)
1027 		error = 0;
1028 out:
1029 	if (in_crit)
1030 		nbl_end_crit(vp);
1031 	releasef(fdes);
1032 	if (error)
1033 		return (set_errno(error));
1034 	return (bcount);
1035 }
1036 
1037 /*
1038  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1039  */
1040 ssize32_t
1041 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1042     uint32_t offset_2)
1043 {
1044 	struct uio auio;
1045 	struct iovec aiov;
1046 	file_t *fp;
1047 	register vnode_t *vp;
1048 	struct cpu *cp;
1049 	int fflag, ioflag, rwflag;
1050 	ssize_t bcount;
1051 	int error = 0;
1052 	u_offset_t fileoff;
1053 	int in_crit = 0;
1054 
1055 #if defined(_LITTLE_ENDIAN)
1056 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1057 #else
1058 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1059 #endif
1060 
1061 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1062 		return (set_errno(EINVAL));
1063 	if ((fp = getf(fdes)) == NULL)
1064 		return (set_errno(EBADF));
1065 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1066 		error = EBADF;
1067 		goto out;
1068 	}
1069 
1070 	rwflag = 1;
1071 	vp = fp->f_vnode;
1072 
1073 	if (vp->v_type == VREG) {
1074 
1075 		if (bcount == 0)
1076 			goto out;
1077 
1078 		/*
1079 		 * See comments in pwrite.
1080 		 */
1081 		if (fileoff > MAXOFFSET_T) {
1082 			error = EINVAL;
1083 			goto out;
1084 		}
1085 		if (fileoff >= curproc->p_fsz_ctl) {
1086 			mutex_enter(&curproc->p_lock);
1087 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1088 			    curproc->p_rctls, curproc, RCA_SAFE);
1089 			mutex_exit(&curproc->p_lock);
1090 			error = EFBIG;
1091 			goto out;
1092 		}
1093 		if (fileoff == MAXOFFSET_T) {
1094 			error = EFBIG;
1095 			goto out;
1096 		}
1097 		if (fileoff + bcount > MAXOFFSET_T)
1098 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1099 	} else if (vp->v_type == VFIFO) {
1100 		error = ESPIPE;
1101 		goto out;
1102 	}
1103 
1104 	/*
1105 	 * We have to enter the critical region before calling VOP_RWLOCK
1106 	 * to avoid a deadlock with ufs.
1107 	 */
1108 	if (nbl_need_check(vp)) {
1109 		int svmand;
1110 
1111 		nbl_start_crit(vp, RW_READER);
1112 		in_crit = 1;
1113 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1114 		if (error != 0)
1115 			goto out;
1116 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
1117 			error = EACCES;
1118 			goto out;
1119 		}
1120 	}
1121 
1122 	aiov.iov_base = cbuf;
1123 	aiov.iov_len = bcount;
1124 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1125 	auio.uio_loffset = fileoff;
1126 	auio.uio_iov = &aiov;
1127 	auio.uio_iovcnt = 1;
1128 	auio.uio_resid = bcount;
1129 	auio.uio_segflg = UIO_USERSPACE;
1130 	auio.uio_llimit = curproc->p_fsz_ctl;
1131 	auio.uio_fmode = fflag;
1132 	auio.uio_extflg = UIO_COPY_CACHED;
1133 
1134 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1135 
1136 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1137 	bcount -= auio.uio_resid;
1138 	CPU_STATS_ENTER_K();
1139 	cp = CPU;
1140 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1141 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1142 	CPU_STATS_EXIT_K();
1143 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1144 	VOP_RWUNLOCK(vp, rwflag, NULL);
1145 
1146 	if (error == EINTR && bcount != 0)
1147 		error = 0;
1148 out:
1149 	if (in_crit)
1150 		nbl_end_crit(vp);
1151 	releasef(fdes);
1152 	if (error)
1153 		return (set_errno(error));
1154 	return (bcount);
1155 }
1156 
1157 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1158 
1159 #ifdef _SYSCALL32_IMPL
1160 /*
1161  * Tail-call elimination of xxx32() down to xxx()
1162  *
1163  * A number of xxx32 system calls take a len (or count) argument and
1164  * return a number in the range [0,len] or -1 on error.
1165  * Given an ssize32_t input len, the downcall xxx() will return
1166  * a 64-bit value that is -1 or in the range [0,len] which actually
1167  * is a proper return value for the xxx32 call. So even if the xxx32
1168  * calls can be considered as returning a ssize32_t, they are currently
1169  * declared as returning a ssize_t as this enables tail-call elimination.
1170  *
1171  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1172  * down negative input values as such and let the downcall handle error
1173  * reporting. Functions covered by this comments are:
1174  *
1175  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1176  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1177  * readlink.c:     readlink32.
1178  */
1179 
1180 ssize_t
1181 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1182 {
1183 	return (read(fdes,
1184 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1185 }
1186 
1187 ssize_t
1188 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1189 {
1190 	return (write(fdes,
1191 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1192 }
1193 
1194 ssize_t
1195 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1196 {
1197 	return (pread(fdes,
1198 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1199 	    (off_t)(uint32_t)offset));
1200 }
1201 
1202 ssize_t
1203 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1204 {
1205 	return (pwrite(fdes,
1206 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1207 	    (off_t)(uint32_t)offset));
1208 }
1209 
1210 ssize_t
1211 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1212 {
1213 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1214 }
1215 
1216 ssize_t
1217 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1218 {
1219 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1220 }
1221 
1222 #endif	/* _SYSCALL32_IMPL */
1223