xref: /illumos-gate/usr/src/uts/common/syscall/rw.c (revision d48be21240dfd051b689384ce2b23479d757f2d8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2020, Joyent, Inc.
26  */
27 
28 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved	*/
30 
31 /*
32  * Portions of this source code were derived from Berkeley 4.3 BSD
33  * under license from the Regents of the University of California.
34  */
35 
36 #include <sys/param.h>
37 #include <sys/isa_defs.h>
38 #include <sys/types.h>
39 #include <sys/inttypes.h>
40 #include <sys/sysmacros.h>
41 #include <sys/cred.h>
42 #include <sys/user.h>
43 #include <sys/systm.h>
44 #include <sys/errno.h>
45 #include <sys/vnode.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/cpuvar.h>
49 #include <sys/uio.h>
50 #include <sys/debug.h>
51 #include <sys/rctl.h>
52 #include <sys/nbmlock.h>
53 #include <sys/limits.h>
54 
/*
 * Default upper bound, in bytes, on the size of a read for which the
 * read paths in this file ask for UIO_COPY_CACHED instead of
 * UIO_COPY_DEFAULT.
 */
#define	COPYOUT_MAX_CACHE	(128 * 1024)	/* 128K */

/* Run-time copy of the bound above; a global so that it is patchable. */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;
58 
59 /*
60  * read, write, pread, pwrite, readv, and writev syscalls.
61  *
62  * 64-bit open:	all open's are large file opens.
63  * Large Files: the behaviour of read depends on whether the fd
64  *		corresponds to large open or not.
65  * 32-bit open:	FOFFMAX flag not set.
66  *		read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
67  *		EOVERFLOW if count is non-zero and if size of file
68  *		is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
69  *		at >= MAXOFF32_T returns EOF.
70  */
71 
72 /*
73  * Native system call
74  */
75 ssize_t
76 read(int fdes, void *cbuf, size_t count)
77 {
78 	struct uio auio;
79 	struct iovec aiov;
80 	file_t *fp;
81 	register vnode_t *vp;
82 	struct cpu *cp;
83 	int fflag, ioflag, rwflag;
84 	ssize_t cnt, bcount;
85 	int error = 0;
86 	u_offset_t fileoff;
87 	int in_crit = 0;
88 
89 	if ((cnt = (ssize_t)count) < 0)
90 		return (set_errno(EINVAL));
91 	if ((fp = getf(fdes)) == NULL)
92 		return (set_errno(EBADF));
93 	if (((fflag = fp->f_flag) & FREAD) == 0) {
94 		error = EBADF;
95 		goto out;
96 	}
97 	vp = fp->f_vnode;
98 
99 	if (vp->v_type == VREG && cnt == 0) {
100 		goto out;
101 	}
102 
103 	rwflag = 0;
104 	aiov.iov_base = cbuf;
105 	aiov.iov_len = cnt;
106 
107 	/*
108 	 * We have to enter the critical region before calling VOP_RWLOCK
109 	 * to avoid a deadlock with write() calls.
110 	 */
111 	if (nbl_need_check(vp)) {
112 		int svmand;
113 
114 		nbl_start_crit(vp, RW_READER);
115 		in_crit = 1;
116 		error = nbl_svmand(vp, fp->f_cred, &svmand);
117 		if (error != 0)
118 			goto out;
119 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
120 		    NULL)) {
121 			error = EACCES;
122 			goto out;
123 		}
124 	}
125 
126 	(void) VOP_RWLOCK(vp, rwflag, NULL);
127 
128 	/*
129 	 * We do the following checks inside VOP_RWLOCK so as to
130 	 * prevent file size from changing while these checks are
131 	 * being done. Also, we load fp's offset to the local
132 	 * variable fileoff because we can have a parallel lseek
133 	 * going on (f_offset is not protected by any lock) which
134 	 * could change f_offset. We need to see the value only
135 	 * once here and take a decision. Seeing it more than once
136 	 * can lead to incorrect functionality.
137 	 */
138 
139 	fileoff = (u_offset_t)fp->f_offset;
140 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
141 		struct vattr va;
142 		va.va_mask = AT_SIZE;
143 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
144 			VOP_RWUNLOCK(vp, rwflag, NULL);
145 			goto out;
146 		}
147 		if (fileoff >= va.va_size) {
148 			cnt = 0;
149 			VOP_RWUNLOCK(vp, rwflag, NULL);
150 			goto out;
151 		} else {
152 			error = EOVERFLOW;
153 			VOP_RWUNLOCK(vp, rwflag, NULL);
154 			goto out;
155 		}
156 	}
157 	if ((vp->v_type == VREG) &&
158 	    (fileoff + cnt > OFFSET_MAX(fp))) {
159 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
160 	}
161 	auio.uio_loffset = fileoff;
162 	auio.uio_iov = &aiov;
163 	auio.uio_iovcnt = 1;
164 	auio.uio_resid = bcount = cnt;
165 	auio.uio_segflg = UIO_USERSPACE;
166 	auio.uio_llimit = MAXOFFSET_T;
167 	auio.uio_fmode = fflag;
168 	/*
169 	 * Only use bypass caches when the count is large enough
170 	 */
171 	if (bcount <= copyout_max_cached)
172 		auio.uio_extflg = UIO_COPY_CACHED;
173 	else
174 		auio.uio_extflg = UIO_COPY_DEFAULT;
175 
176 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
177 
178 	/* If read sync is not asked for, filter sync flags */
179 	if ((ioflag & FRSYNC) == 0)
180 		ioflag &= ~(FSYNC|FDSYNC);
181 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
182 	cnt -= auio.uio_resid;
183 	CPU_STATS_ENTER_K();
184 	cp = CPU;
185 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
186 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
187 	CPU_STATS_EXIT_K();
188 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
189 
190 	if (vp->v_type == VFIFO)	/* Backward compatibility */
191 		fp->f_offset = cnt;
192 	else if (((fp->f_flag & FAPPEND) == 0) ||
193 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
194 		fp->f_offset = auio.uio_loffset;
195 	VOP_RWUNLOCK(vp, rwflag, NULL);
196 
197 	if (error == EINTR && cnt != 0)
198 		error = 0;
199 out:
200 	if (in_crit)
201 		nbl_end_crit(vp);
202 	releasef(fdes);
203 	if (error)
204 		return (set_errno(error));
205 	return (cnt);
206 }
207 
208 /*
209  * Native system call
210  */
211 ssize_t
212 write(int fdes, void *cbuf, size_t count)
213 {
214 	struct uio auio;
215 	struct iovec aiov;
216 	file_t *fp;
217 	register vnode_t *vp;
218 	struct cpu *cp;
219 	int fflag, ioflag, rwflag;
220 	ssize_t cnt, bcount;
221 	int error = 0;
222 	u_offset_t fileoff;
223 	int in_crit = 0;
224 
225 	if ((cnt = (ssize_t)count) < 0)
226 		return (set_errno(EINVAL));
227 	if ((fp = getf(fdes)) == NULL)
228 		return (set_errno(EBADF));
229 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
230 		error = EBADF;
231 		goto out;
232 	}
233 	vp = fp->f_vnode;
234 
235 	if (vp->v_type == VREG && cnt == 0) {
236 		goto out;
237 	}
238 
239 	rwflag = 1;
240 	aiov.iov_base = cbuf;
241 	aiov.iov_len = cnt;
242 
243 	/*
244 	 * We have to enter the critical region before calling VOP_RWLOCK
245 	 * to avoid a deadlock with ufs.
246 	 */
247 	if (nbl_need_check(vp)) {
248 		int svmand;
249 
250 		nbl_start_crit(vp, RW_READER);
251 		in_crit = 1;
252 		error = nbl_svmand(vp, fp->f_cred, &svmand);
253 		if (error != 0)
254 			goto out;
255 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
256 		    NULL)) {
257 			error = EACCES;
258 			goto out;
259 		}
260 	}
261 
262 	(void) VOP_RWLOCK(vp, rwflag, NULL);
263 
264 	fileoff = fp->f_offset;
265 	if (vp->v_type == VREG) {
266 
267 		/*
268 		 * We raise psignal if write for >0 bytes causes
269 		 * it to exceed the ulimit.
270 		 */
271 		if (fileoff >= curproc->p_fsz_ctl) {
272 			VOP_RWUNLOCK(vp, rwflag, NULL);
273 
274 			mutex_enter(&curproc->p_lock);
275 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
276 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
277 			mutex_exit(&curproc->p_lock);
278 
279 			error = EFBIG;
280 			goto out;
281 		}
282 		/*
283 		 * We return EFBIG if write is done at an offset
284 		 * greater than the offset maximum for this file structure.
285 		 */
286 
287 		if (fileoff >= OFFSET_MAX(fp)) {
288 			VOP_RWUNLOCK(vp, rwflag, NULL);
289 			error = EFBIG;
290 			goto out;
291 		}
292 		/*
293 		 * Limit the bytes to be written  upto offset maximum for
294 		 * this open file structure.
295 		 */
296 		if (fileoff + cnt > OFFSET_MAX(fp))
297 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
298 	}
299 	auio.uio_loffset = fileoff;
300 	auio.uio_iov = &aiov;
301 	auio.uio_iovcnt = 1;
302 	auio.uio_resid = bcount = cnt;
303 	auio.uio_segflg = UIO_USERSPACE;
304 	auio.uio_llimit = curproc->p_fsz_ctl;
305 	auio.uio_fmode = fflag;
306 	auio.uio_extflg = UIO_COPY_DEFAULT;
307 
308 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
309 
310 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
311 	cnt -= auio.uio_resid;
312 	CPU_STATS_ENTER_K();
313 	cp = CPU;
314 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
315 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
316 	CPU_STATS_EXIT_K();
317 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
318 
319 	if (vp->v_type == VFIFO)	/* Backward compatibility */
320 		fp->f_offset = cnt;
321 	else if (((fp->f_flag & FAPPEND) == 0) ||
322 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
323 		fp->f_offset = auio.uio_loffset;
324 	VOP_RWUNLOCK(vp, rwflag, NULL);
325 
326 	if (error == EINTR && cnt != 0)
327 		error = 0;
328 out:
329 	if (in_crit)
330 		nbl_end_crit(vp);
331 	releasef(fdes);
332 	if (error)
333 		return (set_errno(error));
334 	return (cnt);
335 }
336 
337 ssize_t
338 pread(int fdes, void *cbuf, size_t count, off_t offset)
339 {
340 	struct uio auio;
341 	struct iovec aiov;
342 	file_t *fp;
343 	register vnode_t *vp;
344 	struct cpu *cp;
345 	int fflag, ioflag, rwflag;
346 	ssize_t bcount;
347 	int error = 0;
348 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
349 #ifdef _SYSCALL32_IMPL
350 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
351 	    MAXOFF32_T : MAXOFFSET_T;
352 #else
353 	const u_offset_t maxoff = MAXOFF32_T;
354 #endif
355 	int in_crit = 0;
356 
357 	if ((bcount = (ssize_t)count) < 0)
358 		return (set_errno(EINVAL));
359 
360 	if ((fp = getf(fdes)) == NULL)
361 		return (set_errno(EBADF));
362 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
363 		error = EBADF;
364 		goto out;
365 	}
366 
367 	rwflag = 0;
368 	vp = fp->f_vnode;
369 
370 	if (vp->v_type == VREG) {
371 
372 		if (bcount == 0)
373 			goto out;
374 
375 		/*
376 		 * Return EINVAL if an invalid offset comes to pread.
377 		 * Negative offset from user will cause this error.
378 		 */
379 
380 		if (fileoff > maxoff) {
381 			error = EINVAL;
382 			goto out;
383 		}
384 		/*
385 		 * Limit offset such that we don't read or write
386 		 * a file beyond the maximum offset representable in
387 		 * an off_t structure.
388 		 */
389 		if (fileoff + bcount > maxoff)
390 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
391 	} else if (vp->v_type == VFIFO) {
392 		error = ESPIPE;
393 		goto out;
394 	}
395 
396 	/*
397 	 * We have to enter the critical region before calling VOP_RWLOCK
398 	 * to avoid a deadlock with ufs.
399 	 */
400 	if (nbl_need_check(vp)) {
401 		int svmand;
402 
403 		nbl_start_crit(vp, RW_READER);
404 		in_crit = 1;
405 		error = nbl_svmand(vp, fp->f_cred, &svmand);
406 		if (error != 0)
407 			goto out;
408 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
409 		    NULL)) {
410 			error = EACCES;
411 			goto out;
412 		}
413 	}
414 
415 	aiov.iov_base = cbuf;
416 	aiov.iov_len = bcount;
417 	(void) VOP_RWLOCK(vp, rwflag, NULL);
418 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
419 		struct vattr va;
420 		va.va_mask = AT_SIZE;
421 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
422 			VOP_RWUNLOCK(vp, rwflag, NULL);
423 			goto out;
424 		}
425 		VOP_RWUNLOCK(vp, rwflag, NULL);
426 
427 		/*
428 		 * We have to return EOF if fileoff is >= file size.
429 		 */
430 		if (fileoff >= va.va_size) {
431 			bcount = 0;
432 			goto out;
433 		}
434 
435 		/*
436 		 * File is greater than or equal to maxoff and therefore
437 		 * we return EOVERFLOW.
438 		 */
439 		error = EOVERFLOW;
440 		goto out;
441 	}
442 	auio.uio_loffset = fileoff;
443 	auio.uio_iov = &aiov;
444 	auio.uio_iovcnt = 1;
445 	auio.uio_resid = bcount;
446 	auio.uio_segflg = UIO_USERSPACE;
447 	auio.uio_llimit = MAXOFFSET_T;
448 	auio.uio_fmode = fflag;
449 	auio.uio_extflg = UIO_COPY_CACHED;
450 
451 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
452 
453 	/* If read sync is not asked for, filter sync flags */
454 	if ((ioflag & FRSYNC) == 0)
455 		ioflag &= ~(FSYNC|FDSYNC);
456 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
457 	bcount -= auio.uio_resid;
458 	CPU_STATS_ENTER_K();
459 	cp = CPU;
460 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
461 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
462 	CPU_STATS_EXIT_K();
463 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
464 	VOP_RWUNLOCK(vp, rwflag, NULL);
465 
466 	if (error == EINTR && bcount != 0)
467 		error = 0;
468 out:
469 	if (in_crit)
470 		nbl_end_crit(vp);
471 	releasef(fdes);
472 	if (error)
473 		return (set_errno(error));
474 	return (bcount);
475 }
476 
477 ssize_t
478 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
479 {
480 	struct uio auio;
481 	struct iovec aiov;
482 	file_t *fp;
483 	register vnode_t *vp;
484 	struct cpu *cp;
485 	int fflag, ioflag, rwflag;
486 	ssize_t bcount;
487 	int error = 0;
488 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
489 #ifdef _SYSCALL32_IMPL
490 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
491 	    MAXOFF32_T : MAXOFFSET_T;
492 #else
493 	const u_offset_t maxoff = MAXOFF32_T;
494 #endif
495 	int in_crit = 0;
496 
497 	if ((bcount = (ssize_t)count) < 0)
498 		return (set_errno(EINVAL));
499 	if ((fp = getf(fdes)) == NULL)
500 		return (set_errno(EBADF));
501 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
502 		error = EBADF;
503 		goto out;
504 	}
505 
506 	rwflag = 1;
507 	vp = fp->f_vnode;
508 
509 	if (vp->v_type == VREG) {
510 
511 		if (bcount == 0)
512 			goto out;
513 
514 		/*
515 		 * return EINVAL for offsets that cannot be
516 		 * represented in an off_t.
517 		 */
518 		if (fileoff > maxoff) {
519 			error = EINVAL;
520 			goto out;
521 		}
522 		/*
523 		 * Take appropriate action if we are trying to write above the
524 		 * resource limit.
525 		 */
526 		if (fileoff >= curproc->p_fsz_ctl) {
527 			mutex_enter(&curproc->p_lock);
528 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
529 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
530 			mutex_exit(&curproc->p_lock);
531 
532 			error = EFBIG;
533 			goto out;
534 		}
535 		/*
536 		 * Don't allow pwrite to cause file sizes to exceed
537 		 * maxoff.
538 		 */
539 		if (fileoff == maxoff) {
540 			error = EFBIG;
541 			goto out;
542 		}
543 		if (fileoff + count > maxoff)
544 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
545 	} else if (vp->v_type == VFIFO) {
546 		error = ESPIPE;
547 		goto out;
548 	}
549 
550 	/*
551 	 * We have to enter the critical region before calling VOP_RWLOCK
552 	 * to avoid a deadlock with ufs.
553 	 */
554 	if (nbl_need_check(vp)) {
555 		int svmand;
556 
557 		nbl_start_crit(vp, RW_READER);
558 		in_crit = 1;
559 		error = nbl_svmand(vp, fp->f_cred, &svmand);
560 		if (error != 0)
561 			goto out;
562 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
563 		    NULL)) {
564 			error = EACCES;
565 			goto out;
566 		}
567 	}
568 
569 	aiov.iov_base = cbuf;
570 	aiov.iov_len = bcount;
571 	(void) VOP_RWLOCK(vp, rwflag, NULL);
572 	auio.uio_loffset = fileoff;
573 	auio.uio_iov = &aiov;
574 	auio.uio_iovcnt = 1;
575 	auio.uio_resid = bcount;
576 	auio.uio_segflg = UIO_USERSPACE;
577 	auio.uio_llimit = curproc->p_fsz_ctl;
578 	auio.uio_fmode = fflag;
579 	auio.uio_extflg = UIO_COPY_CACHED;
580 
581 	/*
582 	 * The SUSv4 POSIX specification states:
583 	 *	The pwrite() function shall be equivalent to write(), except
584 	 *	that it writes into a given position and does not change
585 	 *	the file offset (regardless of whether O_APPEND is set).
586 	 * To make this be true, we omit the FAPPEND flag from ioflag.
587 	 */
588 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
589 
590 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
591 	bcount -= auio.uio_resid;
592 	CPU_STATS_ENTER_K();
593 	cp = CPU;
594 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
595 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
596 	CPU_STATS_EXIT_K();
597 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
598 	VOP_RWUNLOCK(vp, rwflag, NULL);
599 
600 	if (error == EINTR && bcount != 0)
601 		error = 0;
602 out:
603 	if (in_crit)
604 		nbl_end_crit(vp);
605 	releasef(fdes);
606 	if (error)
607 		return (set_errno(error));
608 	return (bcount);
609 }
610 
611 ssize_t
612 readv(int fdes, struct iovec *iovp, int iovcnt)
613 {
614 	struct uio auio;
615 	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
616 	int aiovlen = 0;
617 	file_t *fp;
618 	register vnode_t *vp;
619 	struct cpu *cp;
620 	int fflag, ioflag, rwflag;
621 	ssize_t count, bcount;
622 	int error = 0;
623 	int i;
624 	u_offset_t fileoff;
625 	int in_crit = 0;
626 
627 	if (iovcnt <= 0 || iovcnt > IOV_MAX)
628 		return (set_errno(EINVAL));
629 
630 	if (iovcnt > IOV_MAX_STACK) {
631 		aiovlen = iovcnt * sizeof (iovec_t);
632 		aiov = kmem_alloc(aiovlen, KM_SLEEP);
633 	}
634 
635 #ifdef _SYSCALL32_IMPL
636 	/*
637 	 * 32-bit callers need to have their iovec expanded,
638 	 * while ensuring that they can't move more than 2Gbytes
639 	 * of data in a single call.
640 	 */
641 	if (get_udatamodel() == DATAMODEL_ILP32) {
642 		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
643 		int aiov32len;
644 		ssize32_t count32;
645 
646 		aiov32len = iovcnt * sizeof (iovec32_t);
647 		if (aiovlen != 0)
648 			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
649 
650 		if (copyin(iovp, aiov32, aiov32len)) {
651 			if (aiovlen != 0) {
652 				kmem_free(aiov32, aiov32len);
653 				kmem_free(aiov, aiovlen);
654 			}
655 			return (set_errno(EFAULT));
656 		}
657 
658 		count32 = 0;
659 		for (i = 0; i < iovcnt; i++) {
660 			ssize32_t iovlen32 = aiov32[i].iov_len;
661 			count32 += iovlen32;
662 			if (iovlen32 < 0 || count32 < 0) {
663 				if (aiovlen != 0) {
664 					kmem_free(aiov32, aiov32len);
665 					kmem_free(aiov, aiovlen);
666 				}
667 				return (set_errno(EINVAL));
668 			}
669 			aiov[i].iov_len = iovlen32;
670 			aiov[i].iov_base =
671 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
672 		}
673 
674 		if (aiovlen != 0)
675 			kmem_free(aiov32, aiov32len);
676 	} else
677 #endif
678 	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
679 		if (aiovlen != 0)
680 			kmem_free(aiov, aiovlen);
681 		return (set_errno(EFAULT));
682 	}
683 
684 	count = 0;
685 	for (i = 0; i < iovcnt; i++) {
686 		ssize_t iovlen = aiov[i].iov_len;
687 		count += iovlen;
688 		if (iovlen < 0 || count < 0) {
689 			if (aiovlen != 0)
690 				kmem_free(aiov, aiovlen);
691 			return (set_errno(EINVAL));
692 		}
693 	}
694 	if ((fp = getf(fdes)) == NULL) {
695 		if (aiovlen != 0)
696 			kmem_free(aiov, aiovlen);
697 		return (set_errno(EBADF));
698 	}
699 	if (((fflag = fp->f_flag) & FREAD) == 0) {
700 		error = EBADF;
701 		goto out;
702 	}
703 	vp = fp->f_vnode;
704 	if (vp->v_type == VREG && count == 0) {
705 		goto out;
706 	}
707 
708 	rwflag = 0;
709 
710 	/*
711 	 * We have to enter the critical region before calling VOP_RWLOCK
712 	 * to avoid a deadlock with ufs.
713 	 */
714 	if (nbl_need_check(vp)) {
715 		int svmand;
716 
717 		nbl_start_crit(vp, RW_READER);
718 		in_crit = 1;
719 		error = nbl_svmand(vp, fp->f_cred, &svmand);
720 		if (error != 0)
721 			goto out;
722 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
723 		    NULL)) {
724 			error = EACCES;
725 			goto out;
726 		}
727 	}
728 
729 	(void) VOP_RWLOCK(vp, rwflag, NULL);
730 	fileoff = fp->f_offset;
731 
732 	/*
733 	 * Behaviour is same as read. Please see comments in read.
734 	 */
735 
736 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
737 		struct vattr va;
738 		va.va_mask = AT_SIZE;
739 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
740 			VOP_RWUNLOCK(vp, rwflag, NULL);
741 			goto out;
742 		}
743 		if (fileoff >= va.va_size) {
744 			VOP_RWUNLOCK(vp, rwflag, NULL);
745 			count = 0;
746 			goto out;
747 		} else {
748 			VOP_RWUNLOCK(vp, rwflag, NULL);
749 			error = EOVERFLOW;
750 			goto out;
751 		}
752 	}
753 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
754 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
755 	}
756 	auio.uio_loffset = fileoff;
757 	auio.uio_iov = aiov;
758 	auio.uio_iovcnt = iovcnt;
759 	auio.uio_resid = bcount = count;
760 	auio.uio_segflg = UIO_USERSPACE;
761 	auio.uio_llimit = MAXOFFSET_T;
762 	auio.uio_fmode = fflag;
763 	if (bcount <= copyout_max_cached)
764 		auio.uio_extflg = UIO_COPY_CACHED;
765 	else
766 		auio.uio_extflg = UIO_COPY_DEFAULT;
767 
768 
769 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
770 
771 	/* If read sync is not asked for, filter sync flags */
772 	if ((ioflag & FRSYNC) == 0)
773 		ioflag &= ~(FSYNC|FDSYNC);
774 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
775 	count -= auio.uio_resid;
776 	CPU_STATS_ENTER_K();
777 	cp = CPU;
778 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
779 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
780 	CPU_STATS_EXIT_K();
781 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
782 
783 	if (vp->v_type == VFIFO)	/* Backward compatibility */
784 		fp->f_offset = count;
785 	else if (((fp->f_flag & FAPPEND) == 0) ||
786 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
787 		fp->f_offset = auio.uio_loffset;
788 
789 	VOP_RWUNLOCK(vp, rwflag, NULL);
790 
791 	if (error == EINTR && count != 0)
792 		error = 0;
793 out:
794 	if (in_crit)
795 		nbl_end_crit(vp);
796 	releasef(fdes);
797 	if (aiovlen != 0)
798 		kmem_free(aiov, aiovlen);
799 	if (error)
800 		return (set_errno(error));
801 	return (count);
802 }
803 
804 ssize_t
805 writev(int fdes, struct iovec *iovp, int iovcnt)
806 {
807 	struct uio auio;
808 	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
809 	int aiovlen = 0;
810 	file_t *fp;
811 	register vnode_t *vp;
812 	struct cpu *cp;
813 	int fflag, ioflag, rwflag;
814 	ssize_t count, bcount;
815 	int error = 0;
816 	int i;
817 	u_offset_t fileoff;
818 	int in_crit = 0;
819 
820 	if (iovcnt <= 0 || iovcnt > IOV_MAX)
821 		return (set_errno(EINVAL));
822 
823 	if (iovcnt > IOV_MAX_STACK) {
824 		aiovlen = iovcnt * sizeof (iovec_t);
825 		aiov = kmem_alloc(aiovlen, KM_SLEEP);
826 	}
827 
828 #ifdef _SYSCALL32_IMPL
829 	/*
830 	 * 32-bit callers need to have their iovec expanded,
831 	 * while ensuring that they can't move more than 2Gbytes
832 	 * of data in a single call.
833 	 */
834 	if (get_udatamodel() == DATAMODEL_ILP32) {
835 		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
836 		int aiov32len;
837 		ssize32_t count32;
838 
839 		aiov32len = iovcnt * sizeof (iovec32_t);
840 		if (aiovlen != 0)
841 			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
842 
843 		if (copyin(iovp, aiov32, aiov32len)) {
844 			if (aiovlen != 0) {
845 				kmem_free(aiov32, aiov32len);
846 				kmem_free(aiov, aiovlen);
847 			}
848 			return (set_errno(EFAULT));
849 		}
850 
851 		count32 = 0;
852 		for (i = 0; i < iovcnt; i++) {
853 			ssize32_t iovlen = aiov32[i].iov_len;
854 			count32 += iovlen;
855 			if (iovlen < 0 || count32 < 0) {
856 				if (aiovlen != 0) {
857 					kmem_free(aiov32, aiov32len);
858 					kmem_free(aiov, aiovlen);
859 				}
860 				return (set_errno(EINVAL));
861 			}
862 			aiov[i].iov_len = iovlen;
863 			aiov[i].iov_base =
864 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
865 		}
866 		if (aiovlen != 0)
867 			kmem_free(aiov32, aiov32len);
868 	} else
869 #endif
870 	if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
871 		if (aiovlen != 0)
872 			kmem_free(aiov, aiovlen);
873 		return (set_errno(EFAULT));
874 	}
875 
876 	count = 0;
877 	for (i = 0; i < iovcnt; i++) {
878 		ssize_t iovlen = aiov[i].iov_len;
879 		count += iovlen;
880 		if (iovlen < 0 || count < 0) {
881 			if (aiovlen != 0)
882 				kmem_free(aiov, aiovlen);
883 			return (set_errno(EINVAL));
884 		}
885 	}
886 	if ((fp = getf(fdes)) == NULL) {
887 		if (aiovlen != 0)
888 			kmem_free(aiov, aiovlen);
889 		return (set_errno(EBADF));
890 	}
891 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
892 		error = EBADF;
893 		goto out;
894 	}
895 	vp = fp->f_vnode;
896 	if (vp->v_type == VREG && count == 0) {
897 		goto out;
898 	}
899 
900 	rwflag = 1;
901 
902 	/*
903 	 * We have to enter the critical region before calling VOP_RWLOCK
904 	 * to avoid a deadlock with ufs.
905 	 */
906 	if (nbl_need_check(vp)) {
907 		int svmand;
908 
909 		nbl_start_crit(vp, RW_READER);
910 		in_crit = 1;
911 		error = nbl_svmand(vp, fp->f_cred, &svmand);
912 		if (error != 0)
913 			goto out;
914 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
915 		    NULL)) {
916 			error = EACCES;
917 			goto out;
918 		}
919 	}
920 
921 	(void) VOP_RWLOCK(vp, rwflag, NULL);
922 
923 	fileoff = fp->f_offset;
924 
925 	/*
926 	 * Behaviour is same as write. Please see comments for write.
927 	 */
928 
929 	if (vp->v_type == VREG) {
930 		if (fileoff >= curproc->p_fsz_ctl) {
931 			VOP_RWUNLOCK(vp, rwflag, NULL);
932 			mutex_enter(&curproc->p_lock);
933 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
934 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
935 			mutex_exit(&curproc->p_lock);
936 			error = EFBIG;
937 			goto out;
938 		}
939 		if (fileoff >= OFFSET_MAX(fp)) {
940 			VOP_RWUNLOCK(vp, rwflag, NULL);
941 			error = EFBIG;
942 			goto out;
943 		}
944 		if (fileoff + count > OFFSET_MAX(fp))
945 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
946 	}
947 	auio.uio_loffset = fileoff;
948 	auio.uio_iov = aiov;
949 	auio.uio_iovcnt = iovcnt;
950 	auio.uio_resid = bcount = count;
951 	auio.uio_segflg = UIO_USERSPACE;
952 	auio.uio_llimit = curproc->p_fsz_ctl;
953 	auio.uio_fmode = fflag;
954 	auio.uio_extflg = UIO_COPY_DEFAULT;
955 
956 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
957 
958 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
959 	count -= auio.uio_resid;
960 	CPU_STATS_ENTER_K();
961 	cp = CPU;
962 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
963 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
964 	CPU_STATS_EXIT_K();
965 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
966 
967 	if (vp->v_type == VFIFO)	/* Backward compatibility */
968 		fp->f_offset = count;
969 	else if (((fp->f_flag & FAPPEND) == 0) ||
970 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
971 		fp->f_offset = auio.uio_loffset;
972 	VOP_RWUNLOCK(vp, rwflag, NULL);
973 
974 	if (error == EINTR && count != 0)
975 		error = 0;
976 out:
977 	if (in_crit)
978 		nbl_end_crit(vp);
979 	releasef(fdes);
980 	if (aiovlen != 0)
981 		kmem_free(aiov, aiovlen);
982 	if (error)
983 		return (set_errno(error));
984 	return (count);
985 }
986 
987 ssize_t
988 preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
989     off_t extended_offset)
990 {
991 	struct uio auio;
992 	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
993 	int aiovlen = 0;
994 	file_t *fp;
995 	register vnode_t *vp;
996 	struct cpu *cp;
997 	int fflag, ioflag, rwflag;
998 	ssize_t count, bcount;
999 	int error = 0;
1000 	int i;
1001 
1002 	/*
1003 	 * In a 64-bit kernel, this interface supports native 64-bit
1004 	 * applications as well as 32-bit applications using both standard and
1005 	 * large-file access. For 32-bit large-file aware applications, the
1006 	 * offset is passed as two parameters which are joined into the actual
1007 	 * offset used. The 64-bit libc always passes 0 for the extended_offset.
1008 	 * Note that off_t is a signed value, but the preadv/pwritev API treats
1009 	 * the offset as a position in the file for the operation, so passing
1010 	 * a negative value will likely fail the maximum offset checks below
1011 	 * because we convert it to an unsigned value which will be larger than
1012 	 * the maximum valid offset.
1013 	 */
1014 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1015 	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1016 	    (u_offset_t)offset;
1017 #else /* _SYSCALL32_IMPL || _ILP32 */
1018 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1019 #endif /* _SYSCALL32_IMPR || _ILP32 */
1020 
1021 	int in_crit = 0;
1022 
1023 	if (iovcnt <= 0 || iovcnt > IOV_MAX)
1024 		return (set_errno(EINVAL));
1025 
1026 	if (iovcnt > IOV_MAX_STACK) {
1027 		aiovlen = iovcnt * sizeof (iovec_t);
1028 		aiov = kmem_alloc(aiovlen, KM_SLEEP);
1029 	}
1030 
1031 #ifdef _SYSCALL32_IMPL
1032 	/*
1033 	 * 32-bit callers need to have their iovec expanded,
1034 	 * while ensuring that they can't move more than 2Gbytes
1035 	 * of data in a single call.
1036 	 */
1037 	if (get_udatamodel() == DATAMODEL_ILP32) {
1038 		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1039 		int aiov32len;
1040 		ssize32_t count32;
1041 
1042 		aiov32len = iovcnt * sizeof (iovec32_t);
1043 		if (aiovlen != 0)
1044 			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1045 
1046 		if (copyin(iovp, aiov32, aiov32len)) {
1047 			if (aiovlen != 0) {
1048 				kmem_free(aiov32, aiov32len);
1049 				kmem_free(aiov, aiovlen);
1050 			}
1051 			return (set_errno(EFAULT));
1052 		}
1053 
1054 		count32 = 0;
1055 		for (i = 0; i < iovcnt; i++) {
1056 			ssize32_t iovlen32 = aiov32[i].iov_len;
1057 			count32 += iovlen32;
1058 			if (iovlen32 < 0 || count32 < 0) {
1059 				if (aiovlen != 0) {
1060 					kmem_free(aiov32, aiov32len);
1061 					kmem_free(aiov, aiovlen);
1062 				}
1063 				return (set_errno(EINVAL));
1064 			}
1065 			aiov[i].iov_len = iovlen32;
1066 			aiov[i].iov_base =
1067 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1068 		}
1069 		if (aiovlen != 0)
1070 			kmem_free(aiov32, aiov32len);
1071 	} else
1072 #endif /* _SYSCALL32_IMPL */
1073 		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1074 			if (aiovlen != 0)
1075 				kmem_free(aiov, aiovlen);
1076 			return (set_errno(EFAULT));
1077 		}
1078 
1079 	count = 0;
1080 	for (i = 0; i < iovcnt; i++) {
1081 		ssize_t iovlen = aiov[i].iov_len;
1082 		count += iovlen;
1083 		if (iovlen < 0 || count < 0) {
1084 			if (aiovlen != 0)
1085 				kmem_free(aiov, aiovlen);
1086 			return (set_errno(EINVAL));
1087 		}
1088 	}
1089 
1090 	if ((bcount = count) < 0) {
1091 		if (aiovlen != 0)
1092 			kmem_free(aiov, aiovlen);
1093 		return (set_errno(EINVAL));
1094 	}
1095 	if ((fp = getf(fdes)) == NULL) {
1096 		if (aiovlen != 0)
1097 			kmem_free(aiov, aiovlen);
1098 		return (set_errno(EBADF));
1099 	}
1100 	if (((fflag = fp->f_flag) & FREAD) == 0) {
1101 		error = EBADF;
1102 		goto out;
1103 	}
1104 	vp = fp->f_vnode;
1105 	rwflag = 0;
1106 
1107 	/*
1108 	 * Behaviour is same as read(2). Please see comments in read above.
1109 	 */
1110 	if (vp->v_type == VREG) {
1111 		if (bcount == 0)
1112 			goto out;
1113 
1114 		/* Handle offset past maximum offset allowed for file. */
1115 		if (fileoff >= OFFSET_MAX(fp)) {
1116 			struct vattr va;
1117 			va.va_mask = AT_SIZE;
1118 
1119 			error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
1120 			if (error == 0)  {
1121 				if (fileoff >= va.va_size) {
1122 					count = 0;
1123 				} else {
1124 					error = EOVERFLOW;
1125 				}
1126 			}
1127 			goto out;
1128 		}
1129 
1130 		ASSERT(bcount == count);
1131 
1132 		/* Note: modified count used in nbl_conflict() call below. */
1133 		if ((fileoff + count) > OFFSET_MAX(fp))
1134 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1135 
1136 	} else if (vp->v_type == VFIFO) {
1137 		error = ESPIPE;
1138 		goto out;
1139 	}
1140 	/*
1141 	 * We have to enter the critical region before calling VOP_RWLOCK
1142 	 * to avoid a deadlock with ufs.
1143 	 */
1144 	if (nbl_need_check(vp)) {
1145 		int svmand;
1146 
1147 		nbl_start_crit(vp, RW_READER);
1148 		in_crit = 1;
1149 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1150 		if (error != 0)
1151 			goto out;
1152 		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
1153 			error = EACCES;
1154 			goto out;
1155 		}
1156 	}
1157 
1158 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1159 
1160 	auio.uio_loffset = fileoff;
1161 	auio.uio_iov = aiov;
1162 	auio.uio_iovcnt = iovcnt;
1163 	auio.uio_resid = bcount = count;
1164 	auio.uio_segflg = UIO_USERSPACE;
1165 	auio.uio_llimit = MAXOFFSET_T;
1166 	auio.uio_fmode = fflag;
1167 	if (bcount <= copyout_max_cached)
1168 		auio.uio_extflg = UIO_COPY_CACHED;
1169 	else
1170 		auio.uio_extflg = UIO_COPY_DEFAULT;
1171 
1172 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1173 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1174 	count -= auio.uio_resid;
1175 	CPU_STATS_ENTER_K();
1176 	cp = CPU;
1177 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1178 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1179 	CPU_STATS_EXIT_K();
1180 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1181 
1182 	VOP_RWUNLOCK(vp, rwflag, NULL);
1183 
1184 	if (error == EINTR && count != 0)
1185 		error = 0;
1186 out:
1187 	if (in_crit)
1188 		nbl_end_crit(vp);
1189 	releasef(fdes);
1190 	if (aiovlen != 0)
1191 		kmem_free(aiov, aiovlen);
1192 	if (error)
1193 		return (set_errno(error));
1194 	return (count);
1195 }
1196 
1197 ssize_t
1198 pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
1199     off_t extended_offset)
1200 {
1201 	struct uio auio;
1202 	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1203 	int aiovlen = 0;
1204 	file_t *fp;
1205 	register vnode_t *vp;
1206 	struct cpu *cp;
1207 	int fflag, ioflag, rwflag;
1208 	ssize_t count, bcount;
1209 	int error = 0;
1210 	int i;
1211 
1212 	/*
1213 	 * See the comment in preadv for how the offset is handled.
1214 	 */
1215 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1216 	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1217 	    (u_offset_t)offset;
1218 #else /* _SYSCALL32_IMPL || _ILP32 */
1219 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1220 #endif /* _SYSCALL32_IMPR || _ILP32 */
1221 
1222 	int in_crit = 0;
1223 
1224 	if (iovcnt <= 0 || iovcnt > IOV_MAX)
1225 		return (set_errno(EINVAL));
1226 
1227 	if (iovcnt > IOV_MAX_STACK) {
1228 		aiovlen = iovcnt * sizeof (iovec_t);
1229 		aiov = kmem_alloc(aiovlen, KM_SLEEP);
1230 	}
1231 
1232 #ifdef _SYSCALL32_IMPL
1233 	/*
1234 	 * 32-bit callers need to have their iovec expanded,
1235 	 * while ensuring that they can't move more than 2Gbytes
1236 	 * of data in a single call.
1237 	 */
1238 	if (get_udatamodel() == DATAMODEL_ILP32) {
1239 		struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1240 		int aiov32len;
1241 		ssize32_t count32;
1242 
1243 		aiov32len = iovcnt * sizeof (iovec32_t);
1244 		if (aiovlen != 0)
1245 			aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1246 
1247 		if (copyin(iovp, aiov32, aiov32len)) {
1248 			if (aiovlen != 0) {
1249 				kmem_free(aiov32, aiov32len);
1250 				kmem_free(aiov, aiovlen);
1251 			}
1252 			return (set_errno(EFAULT));
1253 		}
1254 
1255 		count32 = 0;
1256 		for (i = 0; i < iovcnt; i++) {
1257 			ssize32_t iovlen32 = aiov32[i].iov_len;
1258 			count32 += iovlen32;
1259 			if (iovlen32 < 0 || count32 < 0) {
1260 				if (aiovlen != 0) {
1261 					kmem_free(aiov32, aiov32len);
1262 					kmem_free(aiov, aiovlen);
1263 				}
1264 				return (set_errno(EINVAL));
1265 			}
1266 			aiov[i].iov_len = iovlen32;
1267 			aiov[i].iov_base =
1268 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1269 		}
1270 		if (aiovlen != 0)
1271 			kmem_free(aiov32, aiov32len);
1272 	} else
1273 #endif /* _SYSCALL32_IMPL */
1274 		if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1275 			if (aiovlen != 0)
1276 				kmem_free(aiov, aiovlen);
1277 			return (set_errno(EFAULT));
1278 		}
1279 
1280 	count = 0;
1281 	for (i = 0; i < iovcnt; i++) {
1282 		ssize_t iovlen = aiov[i].iov_len;
1283 		count += iovlen;
1284 		if (iovlen < 0 || count < 0) {
1285 			if (aiovlen != 0)
1286 				kmem_free(aiov, aiovlen);
1287 			return (set_errno(EINVAL));
1288 		}
1289 	}
1290 
1291 	if ((bcount = count) < 0) {
1292 		if (aiovlen != 0)
1293 			kmem_free(aiov, aiovlen);
1294 		return (set_errno(EINVAL));
1295 	}
1296 	if ((fp = getf(fdes)) == NULL) {
1297 		if (aiovlen != 0)
1298 			kmem_free(aiov, aiovlen);
1299 		return (set_errno(EBADF));
1300 	}
1301 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
1302 		error = EBADF;
1303 		goto out;
1304 	}
1305 	vp = fp->f_vnode;
1306 	rwflag = 1;
1307 
1308 	/*
1309 	 * The kernel's write(2) code checks OFFSET_MAX and the rctl, and
1310 	 * returns EFBIG when fileoff exceeds either limit. We do the same.
1311 	 */
1312 	if (vp->v_type == VREG) {
1313 		if (bcount == 0)
1314 			goto out;
1315 
1316 		/*
1317 		 * Don't allow pwritev to cause file size to exceed the proper
1318 		 * offset limit.
1319 		 */
1320 		if (fileoff >= OFFSET_MAX(fp)) {
1321 			error = EFBIG;
1322 			goto out;
1323 		}
1324 
1325 		/*
1326 		 * Take appropriate action if we are trying
1327 		 * to write above the resource limit.
1328 		 */
1329 		if (fileoff >= curproc->p_fsz_ctl) {
1330 			mutex_enter(&curproc->p_lock);
1331 			/*
1332 			 * Return value ignored because it lists
1333 			 * actions taken, but we are in an error case.
1334 			 * We don't have any actions that depend on
1335 			 * what could happen in this call, so we ignore
1336 			 * the return value.
1337 			 */
1338 			(void) rctl_action(
1339 			    rctlproc_legacy[RLIMIT_FSIZE],
1340 			    curproc->p_rctls, curproc,
1341 			    RCA_UNSAFE_SIGINFO);
1342 			mutex_exit(&curproc->p_lock);
1343 
1344 			error = EFBIG;
1345 			goto out;
1346 		}
1347 
1348 		ASSERT(bcount == count);
1349 
1350 		/* Note: modified count used in nbl_conflict() call below. */
1351 		if ((fileoff + count) > OFFSET_MAX(fp))
1352 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1353 
1354 	} else if (vp->v_type == VFIFO) {
1355 		error = ESPIPE;
1356 		goto out;
1357 	}
1358 	/*
1359 	 * We have to enter the critical region before calling VOP_RWLOCK
1360 	 * to avoid a deadlock with ufs.
1361 	 */
1362 	if (nbl_need_check(vp)) {
1363 		int svmand;
1364 
1365 		nbl_start_crit(vp, RW_READER);
1366 		in_crit = 1;
1367 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1368 		if (error != 0)
1369 			goto out;
1370 		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
1371 			error = EACCES;
1372 			goto out;
1373 		}
1374 	}
1375 
1376 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1377 
1378 	auio.uio_loffset = fileoff;
1379 	auio.uio_iov = aiov;
1380 	auio.uio_iovcnt = iovcnt;
1381 	auio.uio_resid = bcount = count;
1382 	auio.uio_segflg = UIO_USERSPACE;
1383 	auio.uio_llimit = curproc->p_fsz_ctl;
1384 	auio.uio_fmode = fflag;
1385 	auio.uio_extflg = UIO_COPY_CACHED;
1386 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1387 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1388 	count -= auio.uio_resid;
1389 	CPU_STATS_ENTER_K();
1390 	cp = CPU;
1391 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1392 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
1393 	CPU_STATS_EXIT_K();
1394 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1395 
1396 	VOP_RWUNLOCK(vp, rwflag, NULL);
1397 
1398 	if (error == EINTR && count != 0)
1399 		error = 0;
1400 out:
1401 	if (in_crit)
1402 		nbl_end_crit(vp);
1403 	releasef(fdes);
1404 	if (aiovlen != 0)
1405 		kmem_free(aiov, aiovlen);
1406 	if (error)
1407 		return (set_errno(error));
1408 	return (count);
1409 }
1410 
1411 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1412 
1413 /*
1414  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1415  */
1416 ssize32_t
1417 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1418     uint32_t offset_2)
1419 {
1420 	struct uio auio;
1421 	struct iovec aiov;
1422 	file_t *fp;
1423 	register vnode_t *vp;
1424 	struct cpu *cp;
1425 	int fflag, ioflag, rwflag;
1426 	ssize_t bcount;
1427 	int error = 0;
1428 	u_offset_t fileoff;
1429 	int in_crit = 0;
1430 
1431 #if defined(_LITTLE_ENDIAN)
1432 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1433 #else
1434 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1435 #endif
1436 
1437 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1438 		return (set_errno(EINVAL));
1439 
1440 	if ((fp = getf(fdes)) == NULL)
1441 		return (set_errno(EBADF));
1442 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
1443 		error = EBADF;
1444 		goto out;
1445 	}
1446 
1447 	rwflag = 0;
1448 	vp = fp->f_vnode;
1449 
1450 	if (vp->v_type == VREG) {
1451 
1452 		if (bcount == 0)
1453 			goto out;
1454 
1455 		/*
1456 		 * Same as pread. See comments in pread.
1457 		 */
1458 
1459 		if (fileoff > MAXOFFSET_T) {
1460 			error = EINVAL;
1461 			goto out;
1462 		}
1463 		if (fileoff + bcount > MAXOFFSET_T)
1464 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
1465 	} else if (vp->v_type == VFIFO) {
1466 		error = ESPIPE;
1467 		goto out;
1468 	}
1469 
1470 	/*
1471 	 * We have to enter the critical region before calling VOP_RWLOCK
1472 	 * to avoid a deadlock with ufs.
1473 	 */
1474 	if (nbl_need_check(vp)) {
1475 		int svmand;
1476 
1477 		nbl_start_crit(vp, RW_READER);
1478 		in_crit = 1;
1479 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1480 		if (error != 0)
1481 			goto out;
1482 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
1483 		    NULL)) {
1484 			error = EACCES;
1485 			goto out;
1486 		}
1487 	}
1488 
1489 	aiov.iov_base = cbuf;
1490 	aiov.iov_len = bcount;
1491 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1492 	auio.uio_loffset = fileoff;
1493 
1494 	/*
1495 	 * Note: File size can never be greater than MAXOFFSET_T.
1496 	 * If ever we start supporting 128 bit files the code
1497 	 * similar to the one in pread at this place should be here.
1498 	 * Here we avoid the unnecessary VOP_GETATTR() when we
1499 	 * know that fileoff == MAXOFFSET_T implies that it is always
1500 	 * greater than or equal to file size.
1501 	 */
1502 	auio.uio_iov = &aiov;
1503 	auio.uio_iovcnt = 1;
1504 	auio.uio_resid = bcount;
1505 	auio.uio_segflg = UIO_USERSPACE;
1506 	auio.uio_llimit = MAXOFFSET_T;
1507 	auio.uio_fmode = fflag;
1508 	auio.uio_extflg = UIO_COPY_CACHED;
1509 
1510 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1511 
1512 	/* If read sync is not asked for, filter sync flags */
1513 	if ((ioflag & FRSYNC) == 0)
1514 		ioflag &= ~(FSYNC|FDSYNC);
1515 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1516 	bcount -= auio.uio_resid;
1517 	CPU_STATS_ENTER_K();
1518 	cp = CPU;
1519 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1520 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1521 	CPU_STATS_EXIT_K();
1522 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1523 	VOP_RWUNLOCK(vp, rwflag, NULL);
1524 
1525 	if (error == EINTR && bcount != 0)
1526 		error = 0;
1527 out:
1528 	if (in_crit)
1529 		nbl_end_crit(vp);
1530 	releasef(fdes);
1531 	if (error)
1532 		return (set_errno(error));
1533 	return (bcount);
1534 }
1535 
1536 /*
1537  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1538  */
1539 ssize32_t
1540 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1541     uint32_t offset_2)
1542 {
1543 	struct uio auio;
1544 	struct iovec aiov;
1545 	file_t *fp;
1546 	register vnode_t *vp;
1547 	struct cpu *cp;
1548 	int fflag, ioflag, rwflag;
1549 	ssize_t bcount;
1550 	int error = 0;
1551 	u_offset_t fileoff;
1552 	int in_crit = 0;
1553 
1554 #if defined(_LITTLE_ENDIAN)
1555 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1556 #else
1557 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1558 #endif
1559 
1560 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1561 		return (set_errno(EINVAL));
1562 	if ((fp = getf(fdes)) == NULL)
1563 		return (set_errno(EBADF));
1564 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1565 		error = EBADF;
1566 		goto out;
1567 	}
1568 
1569 	rwflag = 1;
1570 	vp = fp->f_vnode;
1571 
1572 	if (vp->v_type == VREG) {
1573 
1574 		if (bcount == 0)
1575 			goto out;
1576 
1577 		/*
1578 		 * See comments in pwrite.
1579 		 */
1580 		if (fileoff > MAXOFFSET_T) {
1581 			error = EINVAL;
1582 			goto out;
1583 		}
1584 		if (fileoff >= curproc->p_fsz_ctl) {
1585 			mutex_enter(&curproc->p_lock);
1586 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1587 			    curproc->p_rctls, curproc, RCA_SAFE);
1588 			mutex_exit(&curproc->p_lock);
1589 			error = EFBIG;
1590 			goto out;
1591 		}
1592 		if (fileoff == MAXOFFSET_T) {
1593 			error = EFBIG;
1594 			goto out;
1595 		}
1596 		if (fileoff + bcount > MAXOFFSET_T)
1597 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1598 	} else if (vp->v_type == VFIFO) {
1599 		error = ESPIPE;
1600 		goto out;
1601 	}
1602 
1603 	/*
1604 	 * We have to enter the critical region before calling VOP_RWLOCK
1605 	 * to avoid a deadlock with ufs.
1606 	 */
1607 	if (nbl_need_check(vp)) {
1608 		int svmand;
1609 
1610 		nbl_start_crit(vp, RW_READER);
1611 		in_crit = 1;
1612 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1613 		if (error != 0)
1614 			goto out;
1615 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1616 		    NULL)) {
1617 			error = EACCES;
1618 			goto out;
1619 		}
1620 	}
1621 
1622 	aiov.iov_base = cbuf;
1623 	aiov.iov_len = bcount;
1624 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1625 	auio.uio_loffset = fileoff;
1626 	auio.uio_iov = &aiov;
1627 	auio.uio_iovcnt = 1;
1628 	auio.uio_resid = bcount;
1629 	auio.uio_segflg = UIO_USERSPACE;
1630 	auio.uio_llimit = curproc->p_fsz_ctl;
1631 	auio.uio_fmode = fflag;
1632 	auio.uio_extflg = UIO_COPY_CACHED;
1633 
1634 	/*
1635 	 * The SUSv4 POSIX specification states:
1636 	 *	The pwrite() function shall be equivalent to write(), except
1637 	 *	that it writes into a given position and does not change
1638 	 *	the file offset (regardless of whether O_APPEND is set).
1639 	 * To make this be true, we omit the FAPPEND flag from ioflag.
1640 	 */
1641 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1642 
1643 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1644 	bcount -= auio.uio_resid;
1645 	CPU_STATS_ENTER_K();
1646 	cp = CPU;
1647 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1648 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1649 	CPU_STATS_EXIT_K();
1650 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1651 	VOP_RWUNLOCK(vp, rwflag, NULL);
1652 
1653 	if (error == EINTR && bcount != 0)
1654 		error = 0;
1655 out:
1656 	if (in_crit)
1657 		nbl_end_crit(vp);
1658 	releasef(fdes);
1659 	if (error)
1660 		return (set_errno(error));
1661 	return (bcount);
1662 }
1663 
1664 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1665 
1666 #ifdef _SYSCALL32_IMPL
1667 /*
1668  * Tail-call elimination of xxx32() down to xxx()
1669  *
1670  * A number of xxx32 system calls take a len (or count) argument and
1671  * return a number in the range [0,len] or -1 on error.
1672  * Given an ssize32_t input len, the downcall xxx() will return
1673  * a 64-bit value that is -1 or in the range [0,len] which actually
1674  * is a proper return value for the xxx32 call. So even if the xxx32
1675  * calls can be considered as returning a ssize32_t, they are currently
1676  * declared as returning a ssize_t as this enables tail-call elimination.
1677  *
1678  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1679  * down negative input values as such and let the downcall handle error
1680  * reporting. Functions covered by this comment are:
1681  *
1682  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1683  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1684  * readlink.c:     readlink32.
1685  */
1686 
1687 ssize_t
1688 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1689 {
1690 	return (read(fdes,
1691 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1692 }
1693 
1694 ssize_t
1695 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1696 {
1697 	return (write(fdes,
1698 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1699 }
1700 
1701 ssize_t
1702 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1703 {
1704 	return (pread(fdes,
1705 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1706 	    (off_t)(uint32_t)offset));
1707 }
1708 
1709 ssize_t
1710 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1711 {
1712 	return (pwrite(fdes,
1713 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1714 	    (off_t)(uint32_t)offset));
1715 }
1716 
1717 ssize_t
1718 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1719 {
1720 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1721 }
1722 
1723 ssize_t
1724 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1725 {
1726 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1727 }
1728 #endif	/* _SYSCALL32_IMPL */
1729