xref: /titanic_52/usr/src/uts/common/syscall/rw.c (revision 799823bbed51a695d01e13511bbb1369980bb714)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2015, Joyent, Inc.  All rights reserved.
26  */
27 
28 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 /*
32  * Portions of this source code were derived from Berkeley 4.3 BSD
33  * under license from the Regents of the University of California.
34  */
35 
36 #include <sys/param.h>
37 #include <sys/isa_defs.h>
38 #include <sys/types.h>
39 #include <sys/inttypes.h>
40 #include <sys/sysmacros.h>
41 #include <sys/cred.h>
42 #include <sys/user.h>
43 #include <sys/systm.h>
44 #include <sys/errno.h>
45 #include <sys/vnode.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/cpuvar.h>
49 #include <sys/uio.h>
50 #include <sys/debug.h>
51 #include <sys/rctl.h>
52 #include <sys/nbmlock.h>
53 
/*
 * Default transfer-size threshold (128K).  The read paths in this file
 * compare the byte count of a request against copyout_max_cached to pick
 * between UIO_COPY_CACHED and UIO_COPY_DEFAULT.
 */
#define	COPYOUT_MAX_CACHE	(1 << 17)

/* Kept as a global variable (rather than a constant) so it can be patched. */
size_t copyout_max_cached = COPYOUT_MAX_CACHE;
57 
58 /*
59  * read, write, pread, pwrite, readv, and writev syscalls.
60  *
61  * 64-bit open:	all open's are large file opens.
62  * Large Files: the behaviour of read depends on whether the fd
63  *		corresponds to large open or not.
64  * 32-bit open:	FOFFMAX flag not set.
65  *		read until MAXOFF32_T - 1 and read at MAXOFF32_T returns
66  *		EOVERFLOW if count is non-zero and if size of file
67  *		is > MAXOFF32_T. If size of file is <= MAXOFF32_T read
68  *		at >= MAXOFF32_T returns EOF.
69  */
70 
71 /*
72  * Native system call
73  */
74 ssize_t
75 read(int fdes, void *cbuf, size_t count)
76 {
77 	struct uio auio;
78 	struct iovec aiov;
79 	file_t *fp;
80 	register vnode_t *vp;
81 	struct cpu *cp;
82 	int fflag, ioflag, rwflag;
83 	ssize_t cnt, bcount;
84 	int error = 0;
85 	u_offset_t fileoff;
86 	int in_crit = 0;
87 
88 	if ((cnt = (ssize_t)count) < 0)
89 		return (set_errno(EINVAL));
90 	if ((fp = getf(fdes)) == NULL)
91 		return (set_errno(EBADF));
92 	if (((fflag = fp->f_flag) & FREAD) == 0) {
93 		error = EBADF;
94 		goto out;
95 	}
96 	vp = fp->f_vnode;
97 
98 	if (vp->v_type == VREG && cnt == 0) {
99 		goto out;
100 	}
101 
102 	rwflag = 0;
103 	aiov.iov_base = cbuf;
104 	aiov.iov_len = cnt;
105 
106 	/*
107 	 * We have to enter the critical region before calling VOP_RWLOCK
108 	 * to avoid a deadlock with write() calls.
109 	 */
110 	if (nbl_need_check(vp)) {
111 		int svmand;
112 
113 		nbl_start_crit(vp, RW_READER);
114 		in_crit = 1;
115 		error = nbl_svmand(vp, fp->f_cred, &svmand);
116 		if (error != 0)
117 			goto out;
118 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
119 		    NULL)) {
120 			error = EACCES;
121 			goto out;
122 		}
123 	}
124 
125 	(void) VOP_RWLOCK(vp, rwflag, NULL);
126 
127 	/*
128 	 * We do the following checks inside VOP_RWLOCK so as to
129 	 * prevent file size from changing while these checks are
130 	 * being done. Also, we load fp's offset to the local
131 	 * variable fileoff because we can have a parallel lseek
132 	 * going on (f_offset is not protected by any lock) which
133 	 * could change f_offset. We need to see the value only
134 	 * once here and take a decision. Seeing it more than once
135 	 * can lead to incorrect functionality.
136 	 */
137 
138 	fileoff = (u_offset_t)fp->f_offset;
139 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
140 		struct vattr va;
141 		va.va_mask = AT_SIZE;
142 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
143 			VOP_RWUNLOCK(vp, rwflag, NULL);
144 			goto out;
145 		}
146 		if (fileoff >= va.va_size) {
147 			cnt = 0;
148 			VOP_RWUNLOCK(vp, rwflag, NULL);
149 			goto out;
150 		} else {
151 			error = EOVERFLOW;
152 			VOP_RWUNLOCK(vp, rwflag, NULL);
153 			goto out;
154 		}
155 	}
156 	if ((vp->v_type == VREG) &&
157 	    (fileoff + cnt > OFFSET_MAX(fp))) {
158 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
159 	}
160 	auio.uio_loffset = fileoff;
161 	auio.uio_iov = &aiov;
162 	auio.uio_iovcnt = 1;
163 	auio.uio_resid = bcount = cnt;
164 	auio.uio_segflg = UIO_USERSPACE;
165 	auio.uio_llimit = MAXOFFSET_T;
166 	auio.uio_fmode = fflag;
167 	/*
168 	 * Only use bypass caches when the count is large enough
169 	 */
170 	if (bcount <= copyout_max_cached)
171 		auio.uio_extflg = UIO_COPY_CACHED;
172 	else
173 		auio.uio_extflg = UIO_COPY_DEFAULT;
174 
175 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
176 
177 	/* If read sync is not asked for, filter sync flags */
178 	if ((ioflag & FRSYNC) == 0)
179 		ioflag &= ~(FSYNC|FDSYNC);
180 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
181 	cnt -= auio.uio_resid;
182 	CPU_STATS_ENTER_K();
183 	cp = CPU;
184 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
185 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
186 	CPU_STATS_EXIT_K();
187 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
188 
189 	if (vp->v_type == VFIFO)	/* Backward compatibility */
190 		fp->f_offset = cnt;
191 	else if (((fp->f_flag & FAPPEND) == 0) ||
192 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
193 		fp->f_offset = auio.uio_loffset;
194 	VOP_RWUNLOCK(vp, rwflag, NULL);
195 
196 	if (error == EINTR && cnt != 0)
197 		error = 0;
198 out:
199 	if (in_crit)
200 		nbl_end_crit(vp);
201 	releasef(fdes);
202 	if (error)
203 		return (set_errno(error));
204 	return (cnt);
205 }
206 
207 /*
208  * Native system call
209  */
210 ssize_t
211 write(int fdes, void *cbuf, size_t count)
212 {
213 	struct uio auio;
214 	struct iovec aiov;
215 	file_t *fp;
216 	register vnode_t *vp;
217 	struct cpu *cp;
218 	int fflag, ioflag, rwflag;
219 	ssize_t cnt, bcount;
220 	int error = 0;
221 	u_offset_t fileoff;
222 	int in_crit = 0;
223 
224 	if ((cnt = (ssize_t)count) < 0)
225 		return (set_errno(EINVAL));
226 	if ((fp = getf(fdes)) == NULL)
227 		return (set_errno(EBADF));
228 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
229 		error = EBADF;
230 		goto out;
231 	}
232 	vp = fp->f_vnode;
233 
234 	if (vp->v_type == VREG && cnt == 0) {
235 		goto out;
236 	}
237 
238 	rwflag = 1;
239 	aiov.iov_base = cbuf;
240 	aiov.iov_len = cnt;
241 
242 	/*
243 	 * We have to enter the critical region before calling VOP_RWLOCK
244 	 * to avoid a deadlock with ufs.
245 	 */
246 	if (nbl_need_check(vp)) {
247 		int svmand;
248 
249 		nbl_start_crit(vp, RW_READER);
250 		in_crit = 1;
251 		error = nbl_svmand(vp, fp->f_cred, &svmand);
252 		if (error != 0)
253 			goto out;
254 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
255 		    NULL)) {
256 			error = EACCES;
257 			goto out;
258 		}
259 	}
260 
261 	(void) VOP_RWLOCK(vp, rwflag, NULL);
262 
263 	fileoff = fp->f_offset;
264 	if (vp->v_type == VREG) {
265 
266 		/*
267 		 * We raise psignal if write for >0 bytes causes
268 		 * it to exceed the ulimit.
269 		 */
270 		if (fileoff >= curproc->p_fsz_ctl) {
271 			VOP_RWUNLOCK(vp, rwflag, NULL);
272 
273 			mutex_enter(&curproc->p_lock);
274 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
275 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
276 			mutex_exit(&curproc->p_lock);
277 
278 			error = EFBIG;
279 			goto out;
280 		}
281 		/*
282 		 * We return EFBIG if write is done at an offset
283 		 * greater than the offset maximum for this file structure.
284 		 */
285 
286 		if (fileoff >= OFFSET_MAX(fp)) {
287 			VOP_RWUNLOCK(vp, rwflag, NULL);
288 			error = EFBIG;
289 			goto out;
290 		}
291 		/*
292 		 * Limit the bytes to be written  upto offset maximum for
293 		 * this open file structure.
294 		 */
295 		if (fileoff + cnt > OFFSET_MAX(fp))
296 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
297 	}
298 	auio.uio_loffset = fileoff;
299 	auio.uio_iov = &aiov;
300 	auio.uio_iovcnt = 1;
301 	auio.uio_resid = bcount = cnt;
302 	auio.uio_segflg = UIO_USERSPACE;
303 	auio.uio_llimit = curproc->p_fsz_ctl;
304 	auio.uio_fmode = fflag;
305 	auio.uio_extflg = UIO_COPY_DEFAULT;
306 
307 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
308 
309 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
310 	cnt -= auio.uio_resid;
311 	CPU_STATS_ENTER_K();
312 	cp = CPU;
313 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
314 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
315 	CPU_STATS_EXIT_K();
316 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
317 
318 	if (vp->v_type == VFIFO)	/* Backward compatibility */
319 		fp->f_offset = cnt;
320 	else if (((fp->f_flag & FAPPEND) == 0) ||
321 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
322 		fp->f_offset = auio.uio_loffset;
323 	VOP_RWUNLOCK(vp, rwflag, NULL);
324 
325 	if (error == EINTR && cnt != 0)
326 		error = 0;
327 out:
328 	if (in_crit)
329 		nbl_end_crit(vp);
330 	releasef(fdes);
331 	if (error)
332 		return (set_errno(error));
333 	return (cnt);
334 }
335 
336 ssize_t
337 pread(int fdes, void *cbuf, size_t count, off_t offset)
338 {
339 	struct uio auio;
340 	struct iovec aiov;
341 	file_t *fp;
342 	register vnode_t *vp;
343 	struct cpu *cp;
344 	int fflag, ioflag, rwflag;
345 	ssize_t bcount;
346 	int error = 0;
347 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
348 #ifdef _SYSCALL32_IMPL
349 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
350 	    MAXOFF32_T : MAXOFFSET_T;
351 #else
352 	const u_offset_t maxoff = MAXOFF32_T;
353 #endif
354 	int in_crit = 0;
355 
356 	if ((bcount = (ssize_t)count) < 0)
357 		return (set_errno(EINVAL));
358 
359 	if ((fp = getf(fdes)) == NULL)
360 		return (set_errno(EBADF));
361 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
362 		error = EBADF;
363 		goto out;
364 	}
365 
366 	rwflag = 0;
367 	vp = fp->f_vnode;
368 
369 	if (vp->v_type == VREG) {
370 
371 		if (bcount == 0)
372 			goto out;
373 
374 		/*
375 		 * Return EINVAL if an invalid offset comes to pread.
376 		 * Negative offset from user will cause this error.
377 		 */
378 
379 		if (fileoff > maxoff) {
380 			error = EINVAL;
381 			goto out;
382 		}
383 		/*
384 		 * Limit offset such that we don't read or write
385 		 * a file beyond the maximum offset representable in
386 		 * an off_t structure.
387 		 */
388 		if (fileoff + bcount > maxoff)
389 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
390 	} else if (vp->v_type == VFIFO) {
391 		error = ESPIPE;
392 		goto out;
393 	}
394 
395 	/*
396 	 * We have to enter the critical region before calling VOP_RWLOCK
397 	 * to avoid a deadlock with ufs.
398 	 */
399 	if (nbl_need_check(vp)) {
400 		int svmand;
401 
402 		nbl_start_crit(vp, RW_READER);
403 		in_crit = 1;
404 		error = nbl_svmand(vp, fp->f_cred, &svmand);
405 		if (error != 0)
406 			goto out;
407 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
408 		    NULL)) {
409 			error = EACCES;
410 			goto out;
411 		}
412 	}
413 
414 	aiov.iov_base = cbuf;
415 	aiov.iov_len = bcount;
416 	(void) VOP_RWLOCK(vp, rwflag, NULL);
417 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
418 		struct vattr va;
419 		va.va_mask = AT_SIZE;
420 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
421 			VOP_RWUNLOCK(vp, rwflag, NULL);
422 			goto out;
423 		}
424 		VOP_RWUNLOCK(vp, rwflag, NULL);
425 
426 		/*
427 		 * We have to return EOF if fileoff is >= file size.
428 		 */
429 		if (fileoff >= va.va_size) {
430 			bcount = 0;
431 			goto out;
432 		}
433 
434 		/*
435 		 * File is greater than or equal to maxoff and therefore
436 		 * we return EOVERFLOW.
437 		 */
438 		error = EOVERFLOW;
439 		goto out;
440 	}
441 	auio.uio_loffset = fileoff;
442 	auio.uio_iov = &aiov;
443 	auio.uio_iovcnt = 1;
444 	auio.uio_resid = bcount;
445 	auio.uio_segflg = UIO_USERSPACE;
446 	auio.uio_llimit = MAXOFFSET_T;
447 	auio.uio_fmode = fflag;
448 	auio.uio_extflg = UIO_COPY_CACHED;
449 
450 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
451 
452 	/* If read sync is not asked for, filter sync flags */
453 	if ((ioflag & FRSYNC) == 0)
454 		ioflag &= ~(FSYNC|FDSYNC);
455 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
456 	bcount -= auio.uio_resid;
457 	CPU_STATS_ENTER_K();
458 	cp = CPU;
459 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
460 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
461 	CPU_STATS_EXIT_K();
462 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
463 	VOP_RWUNLOCK(vp, rwflag, NULL);
464 
465 	if (error == EINTR && bcount != 0)
466 		error = 0;
467 out:
468 	if (in_crit)
469 		nbl_end_crit(vp);
470 	releasef(fdes);
471 	if (error)
472 		return (set_errno(error));
473 	return (bcount);
474 }
475 
476 ssize_t
477 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
478 {
479 	struct uio auio;
480 	struct iovec aiov;
481 	file_t *fp;
482 	register vnode_t *vp;
483 	struct cpu *cp;
484 	int fflag, ioflag, rwflag;
485 	ssize_t bcount;
486 	int error = 0;
487 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
488 #ifdef _SYSCALL32_IMPL
489 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
490 	    MAXOFF32_T : MAXOFFSET_T;
491 #else
492 	const u_offset_t maxoff = MAXOFF32_T;
493 #endif
494 	int in_crit = 0;
495 
496 	if ((bcount = (ssize_t)count) < 0)
497 		return (set_errno(EINVAL));
498 	if ((fp = getf(fdes)) == NULL)
499 		return (set_errno(EBADF));
500 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
501 		error = EBADF;
502 		goto out;
503 	}
504 
505 	rwflag = 1;
506 	vp = fp->f_vnode;
507 
508 	if (vp->v_type == VREG) {
509 
510 		if (bcount == 0)
511 			goto out;
512 
513 		/*
514 		 * return EINVAL for offsets that cannot be
515 		 * represented in an off_t.
516 		 */
517 		if (fileoff > maxoff) {
518 			error = EINVAL;
519 			goto out;
520 		}
521 		/*
522 		 * Take appropriate action if we are trying to write above the
523 		 * resource limit.
524 		 */
525 		if (fileoff >= curproc->p_fsz_ctl) {
526 			mutex_enter(&curproc->p_lock);
527 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
528 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
529 			mutex_exit(&curproc->p_lock);
530 
531 			error = EFBIG;
532 			goto out;
533 		}
534 		/*
535 		 * Don't allow pwrite to cause file sizes to exceed
536 		 * maxoff.
537 		 */
538 		if (fileoff == maxoff) {
539 			error = EFBIG;
540 			goto out;
541 		}
542 		if (fileoff + count > maxoff)
543 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
544 	} else if (vp->v_type == VFIFO) {
545 		error = ESPIPE;
546 		goto out;
547 	}
548 
549 	/*
550 	 * We have to enter the critical region before calling VOP_RWLOCK
551 	 * to avoid a deadlock with ufs.
552 	 */
553 	if (nbl_need_check(vp)) {
554 		int svmand;
555 
556 		nbl_start_crit(vp, RW_READER);
557 		in_crit = 1;
558 		error = nbl_svmand(vp, fp->f_cred, &svmand);
559 		if (error != 0)
560 			goto out;
561 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
562 		    NULL)) {
563 			error = EACCES;
564 			goto out;
565 		}
566 	}
567 
568 	aiov.iov_base = cbuf;
569 	aiov.iov_len = bcount;
570 	(void) VOP_RWLOCK(vp, rwflag, NULL);
571 	auio.uio_loffset = fileoff;
572 	auio.uio_iov = &aiov;
573 	auio.uio_iovcnt = 1;
574 	auio.uio_resid = bcount;
575 	auio.uio_segflg = UIO_USERSPACE;
576 	auio.uio_llimit = curproc->p_fsz_ctl;
577 	auio.uio_fmode = fflag;
578 	auio.uio_extflg = UIO_COPY_CACHED;
579 
580 	/*
581 	 * The SUSv4 POSIX specification states:
582 	 *	The pwrite() function shall be equivalent to write(), except
583 	 *	that it writes into a given position and does not change
584 	 *	the file offset (regardless of whether O_APPEND is set).
585 	 * To make this be true, we omit the FAPPEND flag from ioflag.
586 	 */
587 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
588 
589 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
590 	bcount -= auio.uio_resid;
591 	CPU_STATS_ENTER_K();
592 	cp = CPU;
593 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
594 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
595 	CPU_STATS_EXIT_K();
596 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
597 	VOP_RWUNLOCK(vp, rwflag, NULL);
598 
599 	if (error == EINTR && bcount != 0)
600 		error = 0;
601 out:
602 	if (in_crit)
603 		nbl_end_crit(vp);
604 	releasef(fdes);
605 	if (error)
606 		return (set_errno(error));
607 	return (bcount);
608 }
609 
/*
 * Upper bound on the number of iovec entries accepted by the vectored
 * I/O system calls in this file; larger (or non-positive) counts are
 * rejected with EINVAL.
 *
 * XXX -- The SVID mentions IOV_MAX without defining it.  SVVS, however,
 * XXX -- expects readv() and writev() to fail when iovcnt > 16 (the
 * XXX -- value is hard-coded in the SVVS source), so that is taken to
 * XXX -- be the "interface".
 */
#define	DEF_IOV_MAX	16
617 
618 ssize_t
619 readv(int fdes, struct iovec *iovp, int iovcnt)
620 {
621 	struct uio auio;
622 	struct iovec aiov[DEF_IOV_MAX];
623 	file_t *fp;
624 	register vnode_t *vp;
625 	struct cpu *cp;
626 	int fflag, ioflag, rwflag;
627 	ssize_t count, bcount;
628 	int error = 0;
629 	int i;
630 	u_offset_t fileoff;
631 	int in_crit = 0;
632 
633 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
634 		return (set_errno(EINVAL));
635 
636 #ifdef _SYSCALL32_IMPL
637 	/*
638 	 * 32-bit callers need to have their iovec expanded,
639 	 * while ensuring that they can't move more than 2Gbytes
640 	 * of data in a single call.
641 	 */
642 	if (get_udatamodel() == DATAMODEL_ILP32) {
643 		struct iovec32 aiov32[DEF_IOV_MAX];
644 		ssize32_t count32;
645 
646 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
647 			return (set_errno(EFAULT));
648 
649 		count32 = 0;
650 		for (i = 0; i < iovcnt; i++) {
651 			ssize32_t iovlen32 = aiov32[i].iov_len;
652 			count32 += iovlen32;
653 			if (iovlen32 < 0 || count32 < 0)
654 				return (set_errno(EINVAL));
655 			aiov[i].iov_len = iovlen32;
656 			aiov[i].iov_base =
657 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
658 		}
659 	} else
660 #endif
661 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
662 		return (set_errno(EFAULT));
663 
664 	count = 0;
665 	for (i = 0; i < iovcnt; i++) {
666 		ssize_t iovlen = aiov[i].iov_len;
667 		count += iovlen;
668 		if (iovlen < 0 || count < 0)
669 			return (set_errno(EINVAL));
670 	}
671 	if ((fp = getf(fdes)) == NULL)
672 		return (set_errno(EBADF));
673 	if (((fflag = fp->f_flag) & FREAD) == 0) {
674 		error = EBADF;
675 		goto out;
676 	}
677 	vp = fp->f_vnode;
678 	if (vp->v_type == VREG && count == 0) {
679 		goto out;
680 	}
681 
682 	rwflag = 0;
683 
684 	/*
685 	 * We have to enter the critical region before calling VOP_RWLOCK
686 	 * to avoid a deadlock with ufs.
687 	 */
688 	if (nbl_need_check(vp)) {
689 		int svmand;
690 
691 		nbl_start_crit(vp, RW_READER);
692 		in_crit = 1;
693 		error = nbl_svmand(vp, fp->f_cred, &svmand);
694 		if (error != 0)
695 			goto out;
696 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
697 		    NULL)) {
698 			error = EACCES;
699 			goto out;
700 		}
701 	}
702 
703 	(void) VOP_RWLOCK(vp, rwflag, NULL);
704 	fileoff = fp->f_offset;
705 
706 	/*
707 	 * Behaviour is same as read. Please see comments in read.
708 	 */
709 
710 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
711 		struct vattr va;
712 		va.va_mask = AT_SIZE;
713 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
714 			VOP_RWUNLOCK(vp, rwflag, NULL);
715 			goto out;
716 		}
717 		if (fileoff >= va.va_size) {
718 			VOP_RWUNLOCK(vp, rwflag, NULL);
719 			count = 0;
720 			goto out;
721 		} else {
722 			VOP_RWUNLOCK(vp, rwflag, NULL);
723 			error = EOVERFLOW;
724 			goto out;
725 		}
726 	}
727 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
728 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
729 	}
730 	auio.uio_loffset = fileoff;
731 	auio.uio_iov = aiov;
732 	auio.uio_iovcnt = iovcnt;
733 	auio.uio_resid = bcount = count;
734 	auio.uio_segflg = UIO_USERSPACE;
735 	auio.uio_llimit = MAXOFFSET_T;
736 	auio.uio_fmode = fflag;
737 	if (bcount <= copyout_max_cached)
738 		auio.uio_extflg = UIO_COPY_CACHED;
739 	else
740 		auio.uio_extflg = UIO_COPY_DEFAULT;
741 
742 
743 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
744 
745 	/* If read sync is not asked for, filter sync flags */
746 	if ((ioflag & FRSYNC) == 0)
747 		ioflag &= ~(FSYNC|FDSYNC);
748 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
749 	count -= auio.uio_resid;
750 	CPU_STATS_ENTER_K();
751 	cp = CPU;
752 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
753 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
754 	CPU_STATS_EXIT_K();
755 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
756 
757 	if (vp->v_type == VFIFO)	/* Backward compatibility */
758 		fp->f_offset = count;
759 	else if (((fp->f_flag & FAPPEND) == 0) ||
760 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
761 		fp->f_offset = auio.uio_loffset;
762 
763 	VOP_RWUNLOCK(vp, rwflag, NULL);
764 
765 	if (error == EINTR && count != 0)
766 		error = 0;
767 out:
768 	if (in_crit)
769 		nbl_end_crit(vp);
770 	releasef(fdes);
771 	if (error)
772 		return (set_errno(error));
773 	return (count);
774 }
775 
776 ssize_t
777 writev(int fdes, struct iovec *iovp, int iovcnt)
778 {
779 	struct uio auio;
780 	struct iovec aiov[DEF_IOV_MAX];
781 	file_t *fp;
782 	register vnode_t *vp;
783 	struct cpu *cp;
784 	int fflag, ioflag, rwflag;
785 	ssize_t count, bcount;
786 	int error = 0;
787 	int i;
788 	u_offset_t fileoff;
789 	int in_crit = 0;
790 
791 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
792 		return (set_errno(EINVAL));
793 
794 #ifdef _SYSCALL32_IMPL
795 	/*
796 	 * 32-bit callers need to have their iovec expanded,
797 	 * while ensuring that they can't move more than 2Gbytes
798 	 * of data in a single call.
799 	 */
800 	if (get_udatamodel() == DATAMODEL_ILP32) {
801 		struct iovec32 aiov32[DEF_IOV_MAX];
802 		ssize32_t count32;
803 
804 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
805 			return (set_errno(EFAULT));
806 
807 		count32 = 0;
808 		for (i = 0; i < iovcnt; i++) {
809 			ssize32_t iovlen = aiov32[i].iov_len;
810 			count32 += iovlen;
811 			if (iovlen < 0 || count32 < 0)
812 				return (set_errno(EINVAL));
813 			aiov[i].iov_len = iovlen;
814 			aiov[i].iov_base =
815 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
816 		}
817 	} else
818 #endif
819 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
820 		return (set_errno(EFAULT));
821 
822 	count = 0;
823 	for (i = 0; i < iovcnt; i++) {
824 		ssize_t iovlen = aiov[i].iov_len;
825 		count += iovlen;
826 		if (iovlen < 0 || count < 0)
827 			return (set_errno(EINVAL));
828 	}
829 	if ((fp = getf(fdes)) == NULL)
830 		return (set_errno(EBADF));
831 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
832 		error = EBADF;
833 		goto out;
834 	}
835 	vp = fp->f_vnode;
836 	if (vp->v_type == VREG && count == 0) {
837 		goto out;
838 	}
839 
840 	rwflag = 1;
841 
842 	/*
843 	 * We have to enter the critical region before calling VOP_RWLOCK
844 	 * to avoid a deadlock with ufs.
845 	 */
846 	if (nbl_need_check(vp)) {
847 		int svmand;
848 
849 		nbl_start_crit(vp, RW_READER);
850 		in_crit = 1;
851 		error = nbl_svmand(vp, fp->f_cred, &svmand);
852 		if (error != 0)
853 			goto out;
854 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
855 		    NULL)) {
856 			error = EACCES;
857 			goto out;
858 		}
859 	}
860 
861 	(void) VOP_RWLOCK(vp, rwflag, NULL);
862 
863 	fileoff = fp->f_offset;
864 
865 	/*
866 	 * Behaviour is same as write. Please see comments for write.
867 	 */
868 
869 	if (vp->v_type == VREG) {
870 		if (fileoff >= curproc->p_fsz_ctl) {
871 			VOP_RWUNLOCK(vp, rwflag, NULL);
872 			mutex_enter(&curproc->p_lock);
873 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
874 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
875 			mutex_exit(&curproc->p_lock);
876 			error = EFBIG;
877 			goto out;
878 		}
879 		if (fileoff >= OFFSET_MAX(fp)) {
880 			VOP_RWUNLOCK(vp, rwflag, NULL);
881 			error = EFBIG;
882 			goto out;
883 		}
884 		if (fileoff + count > OFFSET_MAX(fp))
885 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
886 	}
887 	auio.uio_loffset = fileoff;
888 	auio.uio_iov = aiov;
889 	auio.uio_iovcnt = iovcnt;
890 	auio.uio_resid = bcount = count;
891 	auio.uio_segflg = UIO_USERSPACE;
892 	auio.uio_llimit = curproc->p_fsz_ctl;
893 	auio.uio_fmode = fflag;
894 	auio.uio_extflg = UIO_COPY_DEFAULT;
895 
896 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
897 
898 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
899 	count -= auio.uio_resid;
900 	CPU_STATS_ENTER_K();
901 	cp = CPU;
902 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
903 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
904 	CPU_STATS_EXIT_K();
905 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
906 
907 	if (vp->v_type == VFIFO)	/* Backward compatibility */
908 		fp->f_offset = count;
909 	else if (((fp->f_flag & FAPPEND) == 0) ||
910 	    (vp->v_type != VREG) || (bcount != 0))	/* POSIX */
911 		fp->f_offset = auio.uio_loffset;
912 	VOP_RWUNLOCK(vp, rwflag, NULL);
913 
914 	if (error == EINTR && count != 0)
915 		error = 0;
916 out:
917 	if (in_crit)
918 		nbl_end_crit(vp);
919 	releasef(fdes);
920 	if (error)
921 		return (set_errno(error));
922 	return (count);
923 }
924 
925 ssize_t
926 preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
927     off_t extended_offset)
928 {
929 	struct uio auio;
930 	struct iovec aiov[DEF_IOV_MAX];
931 	file_t *fp;
932 	register vnode_t *vp;
933 	struct cpu *cp;
934 	int fflag, ioflag, rwflag;
935 	ssize_t count, bcount;
936 	int error = 0;
937 	int i;
938 
939 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
940 	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
941 	    (u_offset_t)offset;
942 #else /* _SYSCALL32_IMPL || _ILP32 */
943 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
944 #endif /* _SYSCALL32_IMPR || _ILP32 */
945 #ifdef _SYSCALL32_IMPL
946 	const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
947 	    extended_offset == 0?
948 	    MAXOFF32_T : MAXOFFSET_T;
949 #else /* _SYSCALL32_IMPL */
950 	const u_offset_t maxoff = MAXOFF32_T;
951 #endif /* _SYSCALL32_IMPL */
952 
953 	int in_crit = 0;
954 
955 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
956 		return (set_errno(EINVAL));
957 
958 #ifdef _SYSCALL32_IMPL
959 	/*
960 	 * 32-bit callers need to have their iovec expanded,
961 	 * while ensuring that they can't move more than 2Gbytes
962 	 * of data in a single call.
963 	 */
964 	if (get_udatamodel() == DATAMODEL_ILP32) {
965 		struct iovec32 aiov32[DEF_IOV_MAX];
966 		ssize32_t count32;
967 
968 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
969 			return (set_errno(EFAULT));
970 
971 		count32 = 0;
972 		for (i = 0; i < iovcnt; i++) {
973 			ssize32_t iovlen32 = aiov32[i].iov_len;
974 			count32 += iovlen32;
975 			if (iovlen32 < 0 || count32 < 0)
976 				return (set_errno(EINVAL));
977 			aiov[i].iov_len = iovlen32;
978 			aiov[i].iov_base =
979 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
980 		}
981 	} else
982 #endif /* _SYSCALL32_IMPL */
983 		if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
984 			return (set_errno(EFAULT));
985 
986 	count = 0;
987 	for (i = 0; i < iovcnt; i++) {
988 		ssize_t iovlen = aiov[i].iov_len;
989 		count += iovlen;
990 		if (iovlen < 0 || count < 0)
991 			return (set_errno(EINVAL));
992 	}
993 
994 	if ((bcount = (ssize_t)count) < 0)
995 		return (set_errno(EINVAL));
996 	if ((fp = getf(fdes)) == NULL)
997 		return (set_errno(EBADF));
998 	if (((fflag = fp->f_flag) & FREAD) == 0) {
999 		error = EBADF;
1000 		goto out;
1001 	}
1002 	vp = fp->f_vnode;
1003 	rwflag = 0;
1004 	if (vp->v_type == VREG) {
1005 
1006 		if (bcount == 0)
1007 			goto out;
1008 
1009 		/*
1010 		 * return EINVAL for offsets that cannot be
1011 		 * represented in an off_t.
1012 		 */
1013 		if (fileoff > maxoff) {
1014 			error = EINVAL;
1015 			goto out;
1016 		}
1017 
1018 		if (fileoff + bcount > maxoff)
1019 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1020 	} else if (vp->v_type == VFIFO) {
1021 		error = ESPIPE;
1022 		goto out;
1023 	}
1024 	/*
1025 	 * We have to enter the critical region before calling VOP_RWLOCK
1026 	 * to avoid a deadlock with ufs.
1027 	 */
1028 	if (nbl_need_check(vp)) {
1029 		int svmand;
1030 
1031 		nbl_start_crit(vp, RW_READER);
1032 		in_crit = 1;
1033 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1034 		if (error != 0)
1035 			goto out;
1036 		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1037 		    NULL)) {
1038 			error = EACCES;
1039 			goto out;
1040 		}
1041 	}
1042 
1043 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1044 
1045 	/*
1046 	 * Behaviour is same as read(2). Please see comments in
1047 	 * read(2).
1048 	 */
1049 
1050 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
1051 		struct vattr va;
1052 		va.va_mask = AT_SIZE;
1053 		if ((error =
1054 		    VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
1055 			VOP_RWUNLOCK(vp, rwflag, NULL);
1056 			goto out;
1057 		}
1058 		if (fileoff >= va.va_size) {
1059 			VOP_RWUNLOCK(vp, rwflag, NULL);
1060 			count = 0;
1061 			goto out;
1062 		} else {
1063 			VOP_RWUNLOCK(vp, rwflag, NULL);
1064 			error = EOVERFLOW;
1065 			goto out;
1066 		}
1067 	}
1068 	if ((vp->v_type == VREG) &&
1069 	    (fileoff + count > OFFSET_MAX(fp))) {
1070 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1071 	}
1072 	auio.uio_loffset = fileoff;
1073 	auio.uio_iov = aiov;
1074 	auio.uio_iovcnt = iovcnt;
1075 	auio.uio_resid = bcount = count;
1076 	auio.uio_segflg = UIO_USERSPACE;
1077 	auio.uio_llimit = MAXOFFSET_T;
1078 	auio.uio_fmode = fflag;
1079 	if (bcount <= copyout_max_cached)
1080 		auio.uio_extflg = UIO_COPY_CACHED;
1081 	else
1082 		auio.uio_extflg = UIO_COPY_DEFAULT;
1083 
1084 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1085 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1086 	count -= auio.uio_resid;
1087 	CPU_STATS_ENTER_K();
1088 	cp = CPU;
1089 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1090 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1091 	CPU_STATS_EXIT_K();
1092 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1093 
1094 	VOP_RWUNLOCK(vp, rwflag, NULL);
1095 
1096 	if (error == EINTR && count != 0)
1097 		error = 0;
1098 out:
1099 	if (in_crit)
1100 		nbl_end_crit(vp);
1101 	releasef(fdes);
1102 	if (error)
1103 		return (set_errno(error));
1104 	return (count);
1105 }
1106 
1107 ssize_t
1108 pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
1109     off_t extended_offset)
1110 {
1111 	struct uio auio;
1112 	struct iovec aiov[DEF_IOV_MAX];
1113 	file_t *fp;
1114 	register vnode_t *vp;
1115 	struct cpu *cp;
1116 	int fflag, ioflag, rwflag;
1117 	ssize_t count, bcount;
1118 	int error = 0;
1119 	int i;
1120 
1121 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1122 	u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1123 	    (u_offset_t)offset;
1124 #else /* _SYSCALL32_IMPL || _ILP32 */
1125 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1126 #endif /* _SYSCALL32_IMPR || _ILP32 */
1127 #ifdef _SYSCALL32_IMPL
1128 	const u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 &&
1129 	    extended_offset == 0?
1130 	    MAXOFF32_T : MAXOFFSET_T;
1131 #else /* _SYSCALL32_IMPL */
1132 	const u_offset_t maxoff = MAXOFF32_T;
1133 #endif /* _SYSCALL32_IMPL */
1134 
1135 	int in_crit = 0;
1136 
1137 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
1138 		return (set_errno(EINVAL));
1139 
1140 #ifdef _SYSCALL32_IMPL
1141 	/*
1142 	 * 32-bit callers need to have their iovec expanded,
1143 	 * while ensuring that they can't move more than 2Gbytes
1144 	 * of data in a single call.
1145 	 */
1146 	if (get_udatamodel() == DATAMODEL_ILP32) {
1147 		struct iovec32 aiov32[DEF_IOV_MAX];
1148 		ssize32_t count32;
1149 
1150 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
1151 			return (set_errno(EFAULT));
1152 
1153 		count32 = 0;
1154 		for (i = 0; i < iovcnt; i++) {
1155 			ssize32_t iovlen32 = aiov32[i].iov_len;
1156 			count32 += iovlen32;
1157 			if (iovlen32 < 0 || count32 < 0)
1158 				return (set_errno(EINVAL));
1159 			aiov[i].iov_len = iovlen32;
1160 			aiov[i].iov_base =
1161 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
1162 		}
1163 	} else
1164 #endif /* _SYSCALL32_IMPL */
1165 		if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
1166 			return (set_errno(EFAULT));
1167 
1168 	count = 0;
1169 	for (i = 0; i < iovcnt; i++) {
1170 		ssize_t iovlen = aiov[i].iov_len;
1171 		count += iovlen;
1172 		if (iovlen < 0 || count < 0)
1173 			return (set_errno(EINVAL));
1174 	}
1175 
1176 	if ((bcount = (ssize_t)count) < 0)
1177 		return (set_errno(EINVAL));
1178 	if ((fp = getf(fdes)) == NULL)
1179 		return (set_errno(EBADF));
1180 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
1181 		error = EBADF;
1182 		goto out;
1183 	}
1184 	vp = fp->f_vnode;
1185 	rwflag = 1;
1186 	if (vp->v_type == VREG) {
1187 
1188 		if (bcount == 0)
1189 			goto out;
1190 
1191 		/*
1192 		 * return EINVAL for offsets that cannot be
1193 		 * represented in an off_t.
1194 		 */
1195 		if (fileoff > maxoff) {
1196 			error = EINVAL;
1197 			goto out;
1198 		}
1199 		/*
1200 		 * Take appropriate action if we are trying
1201 		 * to write above the resource limit.
1202 		 */
1203 		if (fileoff >= curproc->p_fsz_ctl) {
1204 			mutex_enter(&curproc->p_lock);
1205 			/*
1206 			 * Return value ignored because it lists
1207 			 * actions taken, but we are in an error case.
1208 			 * We don't have any actions that depend on
1209 			 * what could happen in this call, so we ignore
1210 			 * the return value.
1211 			 */
1212 			(void) rctl_action(
1213 			    rctlproc_legacy[RLIMIT_FSIZE],
1214 			    curproc->p_rctls, curproc,
1215 			    RCA_UNSAFE_SIGINFO);
1216 			mutex_exit(&curproc->p_lock);
1217 
1218 			error = EFBIG;
1219 			goto out;
1220 		}
1221 		/*
1222 		 * Don't allow pwritev to cause file sizes to exceed
1223 		 * maxoff.
1224 		 */
1225 		if (fileoff == maxoff) {
1226 			error = EFBIG;
1227 			goto out;
1228 		}
1229 
1230 		if (fileoff + bcount > maxoff)
1231 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
1232 	} else if (vp->v_type == VFIFO) {
1233 		error = ESPIPE;
1234 		goto out;
1235 	}
1236 	/*
1237 	 * We have to enter the critical region before calling VOP_RWLOCK
1238 	 * to avoid a deadlock with ufs.
1239 	 */
1240 	if (nbl_need_check(vp)) {
1241 		int svmand;
1242 
1243 		nbl_start_crit(vp, RW_READER);
1244 		in_crit = 1;
1245 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1246 		if (error != 0)
1247 			goto out;
1248 		if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand,
1249 		    NULL)) {
1250 			error = EACCES;
1251 			goto out;
1252 		}
1253 	}
1254 
1255 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1256 
1257 
1258 	/*
1259 	 * Behaviour is same as write(2). Please see comments for
1260 	 * write(2).
1261 	 */
1262 
1263 	if (vp->v_type == VREG) {
1264 		if (fileoff >= curproc->p_fsz_ctl) {
1265 			VOP_RWUNLOCK(vp, rwflag, NULL);
1266 			mutex_enter(&curproc->p_lock);
1267 			/* see above rctl_action comment */
1268 			(void) rctl_action(
1269 			    rctlproc_legacy[RLIMIT_FSIZE],
1270 			    curproc->p_rctls,
1271 			    curproc, RCA_UNSAFE_SIGINFO);
1272 			mutex_exit(&curproc->p_lock);
1273 			error = EFBIG;
1274 			goto out;
1275 		}
1276 		if (fileoff >= OFFSET_MAX(fp)) {
1277 			VOP_RWUNLOCK(vp, rwflag, NULL);
1278 			error = EFBIG;
1279 			goto out;
1280 		}
1281 		if (fileoff + count > OFFSET_MAX(fp))
1282 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1283 	}
1284 
1285 	auio.uio_loffset = fileoff;
1286 	auio.uio_iov = aiov;
1287 	auio.uio_iovcnt = iovcnt;
1288 	auio.uio_resid = bcount = count;
1289 	auio.uio_segflg = UIO_USERSPACE;
1290 	auio.uio_llimit = curproc->p_fsz_ctl;
1291 	auio.uio_fmode = fflag;
1292 	auio.uio_extflg = UIO_COPY_CACHED;
1293 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1294 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1295 	count -= auio.uio_resid;
1296 	CPU_STATS_ENTER_K();
1297 	cp = CPU;
1298 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1299 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
1300 	CPU_STATS_EXIT_K();
1301 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1302 
1303 	VOP_RWUNLOCK(vp, rwflag, NULL);
1304 
1305 	if (error == EINTR && count != 0)
1306 		error = 0;
1307 out:
1308 	if (in_crit)
1309 		nbl_end_crit(vp);
1310 	releasef(fdes);
1311 	if (error)
1312 		return (set_errno(error));
1313 	return (count);
1314 }
1315 
1316 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1317 
1318 /*
1319  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1320  */
1321 ssize32_t
1322 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1323     uint32_t offset_2)
1324 {
1325 	struct uio auio;
1326 	struct iovec aiov;
1327 	file_t *fp;
1328 	register vnode_t *vp;
1329 	struct cpu *cp;
1330 	int fflag, ioflag, rwflag;
1331 	ssize_t bcount;
1332 	int error = 0;
1333 	u_offset_t fileoff;
1334 	int in_crit = 0;
1335 
1336 #if defined(_LITTLE_ENDIAN)
1337 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1338 #else
1339 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1340 #endif
1341 
1342 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1343 		return (set_errno(EINVAL));
1344 
1345 	if ((fp = getf(fdes)) == NULL)
1346 		return (set_errno(EBADF));
1347 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
1348 		error = EBADF;
1349 		goto out;
1350 	}
1351 
1352 	rwflag = 0;
1353 	vp = fp->f_vnode;
1354 
1355 	if (vp->v_type == VREG) {
1356 
1357 		if (bcount == 0)
1358 			goto out;
1359 
1360 		/*
1361 		 * Same as pread. See comments in pread.
1362 		 */
1363 
1364 		if (fileoff > MAXOFFSET_T) {
1365 			error = EINVAL;
1366 			goto out;
1367 		}
1368 		if (fileoff + bcount > MAXOFFSET_T)
1369 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
1370 	} else if (vp->v_type == VFIFO) {
1371 		error = ESPIPE;
1372 		goto out;
1373 	}
1374 
1375 	/*
1376 	 * We have to enter the critical region before calling VOP_RWLOCK
1377 	 * to avoid a deadlock with ufs.
1378 	 */
1379 	if (nbl_need_check(vp)) {
1380 		int svmand;
1381 
1382 		nbl_start_crit(vp, RW_READER);
1383 		in_crit = 1;
1384 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1385 		if (error != 0)
1386 			goto out;
1387 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
1388 		    NULL)) {
1389 			error = EACCES;
1390 			goto out;
1391 		}
1392 	}
1393 
1394 	aiov.iov_base = cbuf;
1395 	aiov.iov_len = bcount;
1396 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1397 	auio.uio_loffset = fileoff;
1398 
1399 	/*
1400 	 * Note: File size can never be greater than MAXOFFSET_T.
1401 	 * If ever we start supporting 128 bit files the code
1402 	 * similar to the one in pread at this place should be here.
1403 	 * Here we avoid the unnecessary VOP_GETATTR() when we
1404 	 * know that fileoff == MAXOFFSET_T implies that it is always
1405 	 * greater than or equal to file size.
1406 	 */
1407 	auio.uio_iov = &aiov;
1408 	auio.uio_iovcnt = 1;
1409 	auio.uio_resid = bcount;
1410 	auio.uio_segflg = UIO_USERSPACE;
1411 	auio.uio_llimit = MAXOFFSET_T;
1412 	auio.uio_fmode = fflag;
1413 	auio.uio_extflg = UIO_COPY_CACHED;
1414 
1415 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1416 
1417 	/* If read sync is not asked for, filter sync flags */
1418 	if ((ioflag & FRSYNC) == 0)
1419 		ioflag &= ~(FSYNC|FDSYNC);
1420 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1421 	bcount -= auio.uio_resid;
1422 	CPU_STATS_ENTER_K();
1423 	cp = CPU;
1424 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
1425 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1426 	CPU_STATS_EXIT_K();
1427 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1428 	VOP_RWUNLOCK(vp, rwflag, NULL);
1429 
1430 	if (error == EINTR && bcount != 0)
1431 		error = 0;
1432 out:
1433 	if (in_crit)
1434 		nbl_end_crit(vp);
1435 	releasef(fdes);
1436 	if (error)
1437 		return (set_errno(error));
1438 	return (bcount);
1439 }
1440 
1441 /*
1442  * This syscall supplies 64-bit file offsets to 32-bit applications only.
1443  */
1444 ssize32_t
1445 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1446     uint32_t offset_2)
1447 {
1448 	struct uio auio;
1449 	struct iovec aiov;
1450 	file_t *fp;
1451 	register vnode_t *vp;
1452 	struct cpu *cp;
1453 	int fflag, ioflag, rwflag;
1454 	ssize_t bcount;
1455 	int error = 0;
1456 	u_offset_t fileoff;
1457 	int in_crit = 0;
1458 
1459 #if defined(_LITTLE_ENDIAN)
1460 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1461 #else
1462 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1463 #endif
1464 
1465 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1466 		return (set_errno(EINVAL));
1467 	if ((fp = getf(fdes)) == NULL)
1468 		return (set_errno(EBADF));
1469 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1470 		error = EBADF;
1471 		goto out;
1472 	}
1473 
1474 	rwflag = 1;
1475 	vp = fp->f_vnode;
1476 
1477 	if (vp->v_type == VREG) {
1478 
1479 		if (bcount == 0)
1480 			goto out;
1481 
1482 		/*
1483 		 * See comments in pwrite.
1484 		 */
1485 		if (fileoff > MAXOFFSET_T) {
1486 			error = EINVAL;
1487 			goto out;
1488 		}
1489 		if (fileoff >= curproc->p_fsz_ctl) {
1490 			mutex_enter(&curproc->p_lock);
1491 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1492 			    curproc->p_rctls, curproc, RCA_SAFE);
1493 			mutex_exit(&curproc->p_lock);
1494 			error = EFBIG;
1495 			goto out;
1496 		}
1497 		if (fileoff == MAXOFFSET_T) {
1498 			error = EFBIG;
1499 			goto out;
1500 		}
1501 		if (fileoff + bcount > MAXOFFSET_T)
1502 			bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1503 	} else if (vp->v_type == VFIFO) {
1504 		error = ESPIPE;
1505 		goto out;
1506 	}
1507 
1508 	/*
1509 	 * We have to enter the critical region before calling VOP_RWLOCK
1510 	 * to avoid a deadlock with ufs.
1511 	 */
1512 	if (nbl_need_check(vp)) {
1513 		int svmand;
1514 
1515 		nbl_start_crit(vp, RW_READER);
1516 		in_crit = 1;
1517 		error = nbl_svmand(vp, fp->f_cred, &svmand);
1518 		if (error != 0)
1519 			goto out;
1520 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1521 		    NULL)) {
1522 			error = EACCES;
1523 			goto out;
1524 		}
1525 	}
1526 
1527 	aiov.iov_base = cbuf;
1528 	aiov.iov_len = bcount;
1529 	(void) VOP_RWLOCK(vp, rwflag, NULL);
1530 	auio.uio_loffset = fileoff;
1531 	auio.uio_iov = &aiov;
1532 	auio.uio_iovcnt = 1;
1533 	auio.uio_resid = bcount;
1534 	auio.uio_segflg = UIO_USERSPACE;
1535 	auio.uio_llimit = curproc->p_fsz_ctl;
1536 	auio.uio_fmode = fflag;
1537 	auio.uio_extflg = UIO_COPY_CACHED;
1538 
1539 	/*
1540 	 * The SUSv4 POSIX specification states:
1541 	 *	The pwrite() function shall be equivalent to write(), except
1542 	 *	that it writes into a given position and does not change
1543 	 *	the file offset (regardless of whether O_APPEND is set).
1544 	 * To make this be true, we omit the FAPPEND flag from ioflag.
1545 	 */
1546 	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1547 
1548 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1549 	bcount -= auio.uio_resid;
1550 	CPU_STATS_ENTER_K();
1551 	cp = CPU;
1552 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1553 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1554 	CPU_STATS_EXIT_K();
1555 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1556 	VOP_RWUNLOCK(vp, rwflag, NULL);
1557 
1558 	if (error == EINTR && bcount != 0)
1559 		error = 0;
1560 out:
1561 	if (in_crit)
1562 		nbl_end_crit(vp);
1563 	releasef(fdes);
1564 	if (error)
1565 		return (set_errno(error));
1566 	return (bcount);
1567 }
1568 
1569 #endif	/* _SYSCALL32_IMPL || _ILP32 */
1570 
1571 #ifdef _SYSCALL32_IMPL
1572 /*
1573  * Tail-call elimination of xxx32() down to xxx()
1574  *
1575  * A number of xxx32 system calls take a len (or count) argument and
1576  * return a number in the range [0,len] or -1 on error.
1577  * Given an ssize32_t input len, the downcall xxx() will return
1578  * a 64-bit value that is -1 or in the range [0,len] which actually
1579  * is a proper return value for the xxx32 call. So even if the xxx32
1580  * calls can be considered as returning a ssize32_t, they are currently
1581  * declared as returning a ssize_t as this enables tail-call elimination.
1582  *
1583  * The cast of len (or count) to ssize32_t is needed to ensure we pass
1584  * down negative input values as such and let the downcall handle error
 * reporting. Functions covered by this comment are:
1586  *
1587  * rw.c:           read32, write32, pread32, pwrite32, readv32, writev32.
1588  * socksyscall.c:  recv32, recvfrom32, send32, sendto32.
1589  * readlink.c:     readlink32.
1590  */
1591 
1592 ssize_t
1593 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1594 {
1595 	return (read(fdes,
1596 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1597 }
1598 
1599 ssize_t
1600 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1601 {
1602 	return (write(fdes,
1603 	    (void *)(uintptr_t)cbuf, (ssize32_t)count));
1604 }
1605 
1606 ssize_t
1607 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1608 {
1609 	return (pread(fdes,
1610 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1611 	    (off_t)(uint32_t)offset));
1612 }
1613 
1614 ssize_t
1615 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1616 {
1617 	return (pwrite(fdes,
1618 	    (void *)(uintptr_t)cbuf, (ssize32_t)count,
1619 	    (off_t)(uint32_t)offset));
1620 }
1621 
1622 ssize_t
1623 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1624 {
1625 	return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1626 }
1627 
1628 ssize_t
1629 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1630 {
1631 	return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1632 }
1633 #endif	/* _SYSCALL32_IMPL */
1634