1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 * Copyright 2020, Joyent, Inc.
26 */
27
28 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
30
31 /*
32 * Portions of this source code were derived from Berkeley 4.3 BSD
33 * under license from the Regents of the University of California.
34 */
35
36 #include <sys/param.h>
37 #include <sys/isa_defs.h>
38 #include <sys/types.h>
39 #include <sys/inttypes.h>
40 #include <sys/sysmacros.h>
41 #include <sys/cred.h>
42 #include <sys/user.h>
43 #include <sys/systm.h>
44 #include <sys/errno.h>
45 #include <sys/vnode.h>
46 #include <sys/file.h>
47 #include <sys/proc.h>
48 #include <sys/cpuvar.h>
49 #include <sys/uio.h>
50 #include <sys/debug.h>
51 #include <sys/rctl.h>
52 #include <sys/nbmlock.h>
53 #include <sys/limits.h>
54
/*
 * Threshold, in bytes, consulted by read(), readv() and preadv(): a request
 * of at most copyout_max_cached bytes sets UIO_COPY_CACHED in uio_extflg,
 * anything larger sets UIO_COPY_DEFAULT.
 */
#define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */

size_t copyout_max_cached = COPYOUT_MAX_CACHE;	/* global so it's patchable */
58
/*
 * read, write, pread, pwrite, readv, writev, preadv, and pwritev syscalls.
 *
 * 64-bit open:	all opens are large file opens.
 * Large Files:	the behaviour of read depends on whether the fd
 *		corresponds to a large open or not.
 * 32-bit open:	FOFFMAX flag not set.
 *		Reads succeed up to MAXOFF32_T - 1.  A read at MAXOFF32_T
 *		returns EOVERFLOW if count is non-zero and the size of the
 *		file is > MAXOFF32_T.  If the size of the file is
 *		<= MAXOFF32_T, a read at >= MAXOFF32_T returns EOF.
 */
71
72 /*
73 * Native system call
74 */
75 ssize_t
read(int fdes,void * cbuf,size_t count)76 read(int fdes, void *cbuf, size_t count)
77 {
78 struct uio auio;
79 struct iovec aiov;
80 file_t *fp;
81 register vnode_t *vp;
82 struct cpu *cp;
83 int fflag, ioflag, rwflag;
84 ssize_t cnt, bcount;
85 int error = 0;
86 u_offset_t fileoff;
87 int in_crit = 0;
88
89 if ((cnt = (ssize_t)count) < 0)
90 return (set_errno(EINVAL));
91 if ((fp = getf(fdes)) == NULL)
92 return (set_errno(EBADF));
93 if (((fflag = fp->f_flag) & FREAD) == 0) {
94 error = EBADF;
95 goto out;
96 }
97 vp = fp->f_vnode;
98
99 if (vp->v_type == VREG && cnt == 0) {
100 goto out;
101 }
102
103 rwflag = 0;
104 aiov.iov_base = cbuf;
105 aiov.iov_len = cnt;
106
107 /*
108 * We have to enter the critical region before calling VOP_RWLOCK
109 * to avoid a deadlock with write() calls.
110 */
111 if (nbl_need_check(vp)) {
112 int svmand;
113
114 nbl_start_crit(vp, RW_READER);
115 in_crit = 1;
116 error = nbl_svmand(vp, fp->f_cred, &svmand);
117 if (error != 0)
118 goto out;
119 if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
120 NULL)) {
121 error = EACCES;
122 goto out;
123 }
124 }
125
126 (void) VOP_RWLOCK(vp, rwflag, NULL);
127
128 /*
129 * We do the following checks inside VOP_RWLOCK so as to
130 * prevent file size from changing while these checks are
131 * being done. Also, we load fp's offset to the local
132 * variable fileoff because we can have a parallel lseek
133 * going on (f_offset is not protected by any lock) which
134 * could change f_offset. We need to see the value only
135 * once here and take a decision. Seeing it more than once
136 * can lead to incorrect functionality.
137 */
138
139 fileoff = (u_offset_t)fp->f_offset;
140 if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
141 struct vattr va;
142 va.va_mask = AT_SIZE;
143 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
144 VOP_RWUNLOCK(vp, rwflag, NULL);
145 goto out;
146 }
147 if (fileoff >= va.va_size) {
148 cnt = 0;
149 VOP_RWUNLOCK(vp, rwflag, NULL);
150 goto out;
151 } else {
152 error = EOVERFLOW;
153 VOP_RWUNLOCK(vp, rwflag, NULL);
154 goto out;
155 }
156 }
157 if ((vp->v_type == VREG) &&
158 (fileoff + cnt > OFFSET_MAX(fp))) {
159 cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
160 }
161 auio.uio_loffset = fileoff;
162 auio.uio_iov = &aiov;
163 auio.uio_iovcnt = 1;
164 auio.uio_resid = bcount = cnt;
165 auio.uio_segflg = UIO_USERSPACE;
166 auio.uio_llimit = MAXOFFSET_T;
167 auio.uio_fmode = fflag;
168 /*
169 * Only use bypass caches when the count is large enough
170 */
171 if (bcount <= copyout_max_cached)
172 auio.uio_extflg = UIO_COPY_CACHED;
173 else
174 auio.uio_extflg = UIO_COPY_DEFAULT;
175
176 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
177
178 /* If read sync is not asked for, filter sync flags */
179 if ((ioflag & FRSYNC) == 0)
180 ioflag &= ~(FSYNC|FDSYNC);
181 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
182 cnt -= auio.uio_resid;
183 CPU_STATS_ENTER_K();
184 cp = CPU;
185 CPU_STATS_ADDQ(cp, sys, sysread, 1);
186 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
187 CPU_STATS_EXIT_K();
188 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
189
190 if (vp->v_type == VFIFO) /* Backward compatibility */
191 fp->f_offset = cnt;
192 else if (((fp->f_flag & FAPPEND) == 0) ||
193 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
194 fp->f_offset = auio.uio_loffset;
195 VOP_RWUNLOCK(vp, rwflag, NULL);
196
197 if (error == EINTR && cnt != 0)
198 error = 0;
199 out:
200 if (in_crit)
201 nbl_end_crit(vp);
202 releasef(fdes);
203 if (error)
204 return (set_errno(error));
205 return (cnt);
206 }
207
208 /*
209 * Native system call
210 */
211 ssize_t
write(int fdes,void * cbuf,size_t count)212 write(int fdes, void *cbuf, size_t count)
213 {
214 struct uio auio;
215 struct iovec aiov;
216 file_t *fp;
217 register vnode_t *vp;
218 struct cpu *cp;
219 int fflag, ioflag, rwflag;
220 ssize_t cnt, bcount;
221 int error = 0;
222 u_offset_t fileoff;
223 int in_crit = 0;
224
225 if ((cnt = (ssize_t)count) < 0)
226 return (set_errno(EINVAL));
227 if ((fp = getf(fdes)) == NULL)
228 return (set_errno(EBADF));
229 if (((fflag = fp->f_flag) & FWRITE) == 0) {
230 error = EBADF;
231 goto out;
232 }
233 vp = fp->f_vnode;
234
235 if (vp->v_type == VREG && cnt == 0) {
236 goto out;
237 }
238
239 rwflag = 1;
240 aiov.iov_base = cbuf;
241 aiov.iov_len = cnt;
242
243 /*
244 * We have to enter the critical region before calling VOP_RWLOCK
245 * to avoid a deadlock with ufs.
246 */
247 if (nbl_need_check(vp)) {
248 int svmand;
249
250 nbl_start_crit(vp, RW_READER);
251 in_crit = 1;
252 error = nbl_svmand(vp, fp->f_cred, &svmand);
253 if (error != 0)
254 goto out;
255 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
256 NULL)) {
257 error = EACCES;
258 goto out;
259 }
260 }
261
262 (void) VOP_RWLOCK(vp, rwflag, NULL);
263
264 fileoff = fp->f_offset;
265 if (vp->v_type == VREG) {
266
267 /*
268 * We raise psignal if write for >0 bytes causes
269 * it to exceed the ulimit.
270 */
271 if (fileoff >= curproc->p_fsz_ctl) {
272 VOP_RWUNLOCK(vp, rwflag, NULL);
273
274 mutex_enter(&curproc->p_lock);
275 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
276 curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
277 mutex_exit(&curproc->p_lock);
278
279 error = EFBIG;
280 goto out;
281 }
282 /*
283 * We return EFBIG if write is done at an offset
284 * greater than the offset maximum for this file structure.
285 */
286
287 if (fileoff >= OFFSET_MAX(fp)) {
288 VOP_RWUNLOCK(vp, rwflag, NULL);
289 error = EFBIG;
290 goto out;
291 }
292 /*
293 * Limit the bytes to be written upto offset maximum for
294 * this open file structure.
295 */
296 if (fileoff + cnt > OFFSET_MAX(fp))
297 cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
298 }
299 auio.uio_loffset = fileoff;
300 auio.uio_iov = &aiov;
301 auio.uio_iovcnt = 1;
302 auio.uio_resid = bcount = cnt;
303 auio.uio_segflg = UIO_USERSPACE;
304 auio.uio_llimit = curproc->p_fsz_ctl;
305 auio.uio_fmode = fflag;
306 auio.uio_extflg = UIO_COPY_DEFAULT;
307
308 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
309
310 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
311 cnt -= auio.uio_resid;
312 CPU_STATS_ENTER_K();
313 cp = CPU;
314 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
315 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
316 CPU_STATS_EXIT_K();
317 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
318
319 if (vp->v_type == VFIFO) /* Backward compatibility */
320 fp->f_offset = cnt;
321 else if (((fp->f_flag & FAPPEND) == 0) ||
322 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
323 fp->f_offset = auio.uio_loffset;
324 VOP_RWUNLOCK(vp, rwflag, NULL);
325
326 if (error == EINTR && cnt != 0)
327 error = 0;
328 out:
329 if (in_crit)
330 nbl_end_crit(vp);
331 releasef(fdes);
332 if (error)
333 return (set_errno(error));
334 return (cnt);
335 }
336
337 ssize_t
pread(int fdes,void * cbuf,size_t count,off_t offset)338 pread(int fdes, void *cbuf, size_t count, off_t offset)
339 {
340 struct uio auio;
341 struct iovec aiov;
342 file_t *fp;
343 register vnode_t *vp;
344 struct cpu *cp;
345 int fflag, ioflag, rwflag;
346 ssize_t bcount;
347 int error = 0;
348 u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
349 #ifdef _SYSCALL32_IMPL
350 u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
351 MAXOFF32_T : MAXOFFSET_T;
352 #else
353 const u_offset_t maxoff = MAXOFF32_T;
354 #endif
355 int in_crit = 0;
356
357 if ((bcount = (ssize_t)count) < 0)
358 return (set_errno(EINVAL));
359
360 if ((fp = getf(fdes)) == NULL)
361 return (set_errno(EBADF));
362 if (((fflag = fp->f_flag) & (FREAD)) == 0) {
363 error = EBADF;
364 goto out;
365 }
366
367 rwflag = 0;
368 vp = fp->f_vnode;
369
370 if (vp->v_type == VREG) {
371
372 if (bcount == 0)
373 goto out;
374
375 /*
376 * Return EINVAL if an invalid offset comes to pread.
377 * Negative offset from user will cause this error.
378 */
379
380 if (fileoff > maxoff) {
381 error = EINVAL;
382 goto out;
383 }
384 /*
385 * Limit offset such that we don't read or write
386 * a file beyond the maximum offset representable in
387 * an off_t structure.
388 */
389 if (fileoff + bcount > maxoff)
390 bcount = (ssize_t)((offset_t)maxoff - fileoff);
391 } else if (vp->v_type == VFIFO) {
392 error = ESPIPE;
393 goto out;
394 }
395
396 /*
397 * We have to enter the critical region before calling VOP_RWLOCK
398 * to avoid a deadlock with ufs.
399 */
400 if (nbl_need_check(vp)) {
401 int svmand;
402
403 nbl_start_crit(vp, RW_READER);
404 in_crit = 1;
405 error = nbl_svmand(vp, fp->f_cred, &svmand);
406 if (error != 0)
407 goto out;
408 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
409 NULL)) {
410 error = EACCES;
411 goto out;
412 }
413 }
414
415 aiov.iov_base = cbuf;
416 aiov.iov_len = bcount;
417 (void) VOP_RWLOCK(vp, rwflag, NULL);
418 if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
419 struct vattr va;
420 va.va_mask = AT_SIZE;
421 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
422 VOP_RWUNLOCK(vp, rwflag, NULL);
423 goto out;
424 }
425 VOP_RWUNLOCK(vp, rwflag, NULL);
426
427 /*
428 * We have to return EOF if fileoff is >= file size.
429 */
430 if (fileoff >= va.va_size) {
431 bcount = 0;
432 goto out;
433 }
434
435 /*
436 * File is greater than or equal to maxoff and therefore
437 * we return EOVERFLOW.
438 */
439 error = EOVERFLOW;
440 goto out;
441 }
442 auio.uio_loffset = fileoff;
443 auio.uio_iov = &aiov;
444 auio.uio_iovcnt = 1;
445 auio.uio_resid = bcount;
446 auio.uio_segflg = UIO_USERSPACE;
447 auio.uio_llimit = MAXOFFSET_T;
448 auio.uio_fmode = fflag;
449 auio.uio_extflg = UIO_COPY_CACHED;
450
451 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
452
453 /* If read sync is not asked for, filter sync flags */
454 if ((ioflag & FRSYNC) == 0)
455 ioflag &= ~(FSYNC|FDSYNC);
456 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
457 bcount -= auio.uio_resid;
458 CPU_STATS_ENTER_K();
459 cp = CPU;
460 CPU_STATS_ADDQ(cp, sys, sysread, 1);
461 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
462 CPU_STATS_EXIT_K();
463 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
464 VOP_RWUNLOCK(vp, rwflag, NULL);
465
466 if (error == EINTR && bcount != 0)
467 error = 0;
468 out:
469 if (in_crit)
470 nbl_end_crit(vp);
471 releasef(fdes);
472 if (error)
473 return (set_errno(error));
474 return (bcount);
475 }
476
477 ssize_t
pwrite(int fdes,void * cbuf,size_t count,off_t offset)478 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
479 {
480 struct uio auio;
481 struct iovec aiov;
482 file_t *fp;
483 register vnode_t *vp;
484 struct cpu *cp;
485 int fflag, ioflag, rwflag;
486 ssize_t bcount;
487 int error = 0;
488 u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
489 #ifdef _SYSCALL32_IMPL
490 u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
491 MAXOFF32_T : MAXOFFSET_T;
492 #else
493 const u_offset_t maxoff = MAXOFF32_T;
494 #endif
495 int in_crit = 0;
496
497 if ((bcount = (ssize_t)count) < 0)
498 return (set_errno(EINVAL));
499 if ((fp = getf(fdes)) == NULL)
500 return (set_errno(EBADF));
501 if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
502 error = EBADF;
503 goto out;
504 }
505
506 rwflag = 1;
507 vp = fp->f_vnode;
508
509 if (vp->v_type == VREG) {
510
511 if (bcount == 0)
512 goto out;
513
514 /*
515 * return EINVAL for offsets that cannot be
516 * represented in an off_t.
517 */
518 if (fileoff > maxoff) {
519 error = EINVAL;
520 goto out;
521 }
522 /*
523 * Take appropriate action if we are trying to write above the
524 * resource limit.
525 */
526 if (fileoff >= curproc->p_fsz_ctl) {
527 mutex_enter(&curproc->p_lock);
528 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
529 curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
530 mutex_exit(&curproc->p_lock);
531
532 error = EFBIG;
533 goto out;
534 }
535 /*
536 * Don't allow pwrite to cause file sizes to exceed
537 * maxoff.
538 */
539 if (fileoff == maxoff) {
540 error = EFBIG;
541 goto out;
542 }
543 if (fileoff + count > maxoff)
544 bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
545 } else if (vp->v_type == VFIFO) {
546 error = ESPIPE;
547 goto out;
548 }
549
550 /*
551 * We have to enter the critical region before calling VOP_RWLOCK
552 * to avoid a deadlock with ufs.
553 */
554 if (nbl_need_check(vp)) {
555 int svmand;
556
557 nbl_start_crit(vp, RW_READER);
558 in_crit = 1;
559 error = nbl_svmand(vp, fp->f_cred, &svmand);
560 if (error != 0)
561 goto out;
562 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
563 NULL)) {
564 error = EACCES;
565 goto out;
566 }
567 }
568
569 aiov.iov_base = cbuf;
570 aiov.iov_len = bcount;
571 (void) VOP_RWLOCK(vp, rwflag, NULL);
572 auio.uio_loffset = fileoff;
573 auio.uio_iov = &aiov;
574 auio.uio_iovcnt = 1;
575 auio.uio_resid = bcount;
576 auio.uio_segflg = UIO_USERSPACE;
577 auio.uio_llimit = curproc->p_fsz_ctl;
578 auio.uio_fmode = fflag;
579 auio.uio_extflg = UIO_COPY_CACHED;
580
581 /*
582 * The SUSv4 POSIX specification states:
583 * The pwrite() function shall be equivalent to write(), except
584 * that it writes into a given position and does not change
585 * the file offset (regardless of whether O_APPEND is set).
586 * To make this be true, we omit the FAPPEND flag from ioflag.
587 */
588 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
589
590 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
591 bcount -= auio.uio_resid;
592 CPU_STATS_ENTER_K();
593 cp = CPU;
594 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
595 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
596 CPU_STATS_EXIT_K();
597 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
598 VOP_RWUNLOCK(vp, rwflag, NULL);
599
600 if (error == EINTR && bcount != 0)
601 error = 0;
602 out:
603 if (in_crit)
604 nbl_end_crit(vp);
605 releasef(fdes);
606 if (error)
607 return (set_errno(error));
608 return (bcount);
609 }
610
611 ssize_t
readv(int fdes,struct iovec * iovp,int iovcnt)612 readv(int fdes, struct iovec *iovp, int iovcnt)
613 {
614 struct uio auio;
615 struct iovec buf[IOV_MAX_STACK], *aiov = buf;
616 int aiovlen = 0;
617 file_t *fp;
618 register vnode_t *vp;
619 struct cpu *cp;
620 int fflag, ioflag, rwflag;
621 ssize_t count, bcount;
622 int error = 0;
623 int i;
624 u_offset_t fileoff;
625 int in_crit = 0;
626
627 if (iovcnt <= 0 || iovcnt > IOV_MAX)
628 return (set_errno(EINVAL));
629
630 if (iovcnt > IOV_MAX_STACK) {
631 aiovlen = iovcnt * sizeof (iovec_t);
632 aiov = kmem_alloc(aiovlen, KM_SLEEP);
633 }
634
635 #ifdef _SYSCALL32_IMPL
636 /*
637 * 32-bit callers need to have their iovec expanded,
638 * while ensuring that they can't move more than 2Gbytes
639 * of data in a single call.
640 */
641 if (get_udatamodel() == DATAMODEL_ILP32) {
642 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
643 int aiov32len;
644 ssize32_t count32;
645
646 aiov32len = iovcnt * sizeof (iovec32_t);
647 if (aiovlen != 0)
648 aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
649
650 if (copyin(iovp, aiov32, aiov32len)) {
651 if (aiovlen != 0) {
652 kmem_free(aiov32, aiov32len);
653 kmem_free(aiov, aiovlen);
654 }
655 return (set_errno(EFAULT));
656 }
657
658 count32 = 0;
659 for (i = 0; i < iovcnt; i++) {
660 ssize32_t iovlen32 = aiov32[i].iov_len;
661 count32 += iovlen32;
662 if (iovlen32 < 0 || count32 < 0) {
663 if (aiovlen != 0) {
664 kmem_free(aiov32, aiov32len);
665 kmem_free(aiov, aiovlen);
666 }
667 return (set_errno(EINVAL));
668 }
669 aiov[i].iov_len = iovlen32;
670 aiov[i].iov_base =
671 (caddr_t)(uintptr_t)aiov32[i].iov_base;
672 }
673
674 if (aiovlen != 0)
675 kmem_free(aiov32, aiov32len);
676 } else
677 #endif
678 if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
679 if (aiovlen != 0)
680 kmem_free(aiov, aiovlen);
681 return (set_errno(EFAULT));
682 }
683
684 count = 0;
685 for (i = 0; i < iovcnt; i++) {
686 ssize_t iovlen = aiov[i].iov_len;
687 count += iovlen;
688 if (iovlen < 0 || count < 0) {
689 if (aiovlen != 0)
690 kmem_free(aiov, aiovlen);
691 return (set_errno(EINVAL));
692 }
693 }
694 if ((fp = getf(fdes)) == NULL) {
695 if (aiovlen != 0)
696 kmem_free(aiov, aiovlen);
697 return (set_errno(EBADF));
698 }
699 if (((fflag = fp->f_flag) & FREAD) == 0) {
700 error = EBADF;
701 goto out;
702 }
703 vp = fp->f_vnode;
704 if (vp->v_type == VREG && count == 0) {
705 goto out;
706 }
707
708 rwflag = 0;
709
710 /*
711 * We have to enter the critical region before calling VOP_RWLOCK
712 * to avoid a deadlock with ufs.
713 */
714 if (nbl_need_check(vp)) {
715 int svmand;
716
717 nbl_start_crit(vp, RW_READER);
718 in_crit = 1;
719 error = nbl_svmand(vp, fp->f_cred, &svmand);
720 if (error != 0)
721 goto out;
722 if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
723 NULL)) {
724 error = EACCES;
725 goto out;
726 }
727 }
728
729 (void) VOP_RWLOCK(vp, rwflag, NULL);
730 fileoff = fp->f_offset;
731
732 /*
733 * Behaviour is same as read. Please see comments in read.
734 */
735
736 if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
737 struct vattr va;
738 va.va_mask = AT_SIZE;
739 if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
740 VOP_RWUNLOCK(vp, rwflag, NULL);
741 goto out;
742 }
743 if (fileoff >= va.va_size) {
744 VOP_RWUNLOCK(vp, rwflag, NULL);
745 count = 0;
746 goto out;
747 } else {
748 VOP_RWUNLOCK(vp, rwflag, NULL);
749 error = EOVERFLOW;
750 goto out;
751 }
752 }
753 if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
754 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
755 }
756 auio.uio_loffset = fileoff;
757 auio.uio_iov = aiov;
758 auio.uio_iovcnt = iovcnt;
759 auio.uio_resid = bcount = count;
760 auio.uio_segflg = UIO_USERSPACE;
761 auio.uio_llimit = MAXOFFSET_T;
762 auio.uio_fmode = fflag;
763 if (bcount <= copyout_max_cached)
764 auio.uio_extflg = UIO_COPY_CACHED;
765 else
766 auio.uio_extflg = UIO_COPY_DEFAULT;
767
768
769 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
770
771 /* If read sync is not asked for, filter sync flags */
772 if ((ioflag & FRSYNC) == 0)
773 ioflag &= ~(FSYNC|FDSYNC);
774 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
775 count -= auio.uio_resid;
776 CPU_STATS_ENTER_K();
777 cp = CPU;
778 CPU_STATS_ADDQ(cp, sys, sysread, 1);
779 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
780 CPU_STATS_EXIT_K();
781 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
782
783 if (vp->v_type == VFIFO) /* Backward compatibility */
784 fp->f_offset = count;
785 else if (((fp->f_flag & FAPPEND) == 0) ||
786 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
787 fp->f_offset = auio.uio_loffset;
788
789 VOP_RWUNLOCK(vp, rwflag, NULL);
790
791 if (error == EINTR && count != 0)
792 error = 0;
793 out:
794 if (in_crit)
795 nbl_end_crit(vp);
796 releasef(fdes);
797 if (aiovlen != 0)
798 kmem_free(aiov, aiovlen);
799 if (error)
800 return (set_errno(error));
801 return (count);
802 }
803
804 ssize_t
writev(int fdes,struct iovec * iovp,int iovcnt)805 writev(int fdes, struct iovec *iovp, int iovcnt)
806 {
807 struct uio auio;
808 struct iovec buf[IOV_MAX_STACK], *aiov = buf;
809 int aiovlen = 0;
810 file_t *fp;
811 register vnode_t *vp;
812 struct cpu *cp;
813 int fflag, ioflag, rwflag;
814 ssize_t count, bcount;
815 int error = 0;
816 int i;
817 u_offset_t fileoff;
818 int in_crit = 0;
819
820 if (iovcnt <= 0 || iovcnt > IOV_MAX)
821 return (set_errno(EINVAL));
822
823 if (iovcnt > IOV_MAX_STACK) {
824 aiovlen = iovcnt * sizeof (iovec_t);
825 aiov = kmem_alloc(aiovlen, KM_SLEEP);
826 }
827
828 #ifdef _SYSCALL32_IMPL
829 /*
830 * 32-bit callers need to have their iovec expanded,
831 * while ensuring that they can't move more than 2Gbytes
832 * of data in a single call.
833 */
834 if (get_udatamodel() == DATAMODEL_ILP32) {
835 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
836 int aiov32len;
837 ssize32_t count32;
838
839 aiov32len = iovcnt * sizeof (iovec32_t);
840 if (aiovlen != 0)
841 aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
842
843 if (copyin(iovp, aiov32, aiov32len)) {
844 if (aiovlen != 0) {
845 kmem_free(aiov32, aiov32len);
846 kmem_free(aiov, aiovlen);
847 }
848 return (set_errno(EFAULT));
849 }
850
851 count32 = 0;
852 for (i = 0; i < iovcnt; i++) {
853 ssize32_t iovlen = aiov32[i].iov_len;
854 count32 += iovlen;
855 if (iovlen < 0 || count32 < 0) {
856 if (aiovlen != 0) {
857 kmem_free(aiov32, aiov32len);
858 kmem_free(aiov, aiovlen);
859 }
860 return (set_errno(EINVAL));
861 }
862 aiov[i].iov_len = iovlen;
863 aiov[i].iov_base =
864 (caddr_t)(uintptr_t)aiov32[i].iov_base;
865 }
866 if (aiovlen != 0)
867 kmem_free(aiov32, aiov32len);
868 } else
869 #endif
870 if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
871 if (aiovlen != 0)
872 kmem_free(aiov, aiovlen);
873 return (set_errno(EFAULT));
874 }
875
876 count = 0;
877 for (i = 0; i < iovcnt; i++) {
878 ssize_t iovlen = aiov[i].iov_len;
879 count += iovlen;
880 if (iovlen < 0 || count < 0) {
881 if (aiovlen != 0)
882 kmem_free(aiov, aiovlen);
883 return (set_errno(EINVAL));
884 }
885 }
886 if ((fp = getf(fdes)) == NULL) {
887 if (aiovlen != 0)
888 kmem_free(aiov, aiovlen);
889 return (set_errno(EBADF));
890 }
891 if (((fflag = fp->f_flag) & FWRITE) == 0) {
892 error = EBADF;
893 goto out;
894 }
895 vp = fp->f_vnode;
896 if (vp->v_type == VREG && count == 0) {
897 goto out;
898 }
899
900 rwflag = 1;
901
902 /*
903 * We have to enter the critical region before calling VOP_RWLOCK
904 * to avoid a deadlock with ufs.
905 */
906 if (nbl_need_check(vp)) {
907 int svmand;
908
909 nbl_start_crit(vp, RW_READER);
910 in_crit = 1;
911 error = nbl_svmand(vp, fp->f_cred, &svmand);
912 if (error != 0)
913 goto out;
914 if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
915 NULL)) {
916 error = EACCES;
917 goto out;
918 }
919 }
920
921 (void) VOP_RWLOCK(vp, rwflag, NULL);
922
923 fileoff = fp->f_offset;
924
925 /*
926 * Behaviour is same as write. Please see comments for write.
927 */
928
929 if (vp->v_type == VREG) {
930 if (fileoff >= curproc->p_fsz_ctl) {
931 VOP_RWUNLOCK(vp, rwflag, NULL);
932 mutex_enter(&curproc->p_lock);
933 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
934 curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
935 mutex_exit(&curproc->p_lock);
936 error = EFBIG;
937 goto out;
938 }
939 if (fileoff >= OFFSET_MAX(fp)) {
940 VOP_RWUNLOCK(vp, rwflag, NULL);
941 error = EFBIG;
942 goto out;
943 }
944 if (fileoff + count > OFFSET_MAX(fp))
945 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
946 }
947 auio.uio_loffset = fileoff;
948 auio.uio_iov = aiov;
949 auio.uio_iovcnt = iovcnt;
950 auio.uio_resid = bcount = count;
951 auio.uio_segflg = UIO_USERSPACE;
952 auio.uio_llimit = curproc->p_fsz_ctl;
953 auio.uio_fmode = fflag;
954 auio.uio_extflg = UIO_COPY_DEFAULT;
955
956 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
957
958 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
959 count -= auio.uio_resid;
960 CPU_STATS_ENTER_K();
961 cp = CPU;
962 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
963 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
964 CPU_STATS_EXIT_K();
965 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
966
967 if (vp->v_type == VFIFO) /* Backward compatibility */
968 fp->f_offset = count;
969 else if (((fp->f_flag & FAPPEND) == 0) ||
970 (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
971 fp->f_offset = auio.uio_loffset;
972 VOP_RWUNLOCK(vp, rwflag, NULL);
973
974 if (error == EINTR && count != 0)
975 error = 0;
976 out:
977 if (in_crit)
978 nbl_end_crit(vp);
979 releasef(fdes);
980 if (aiovlen != 0)
981 kmem_free(aiov, aiovlen);
982 if (error)
983 return (set_errno(error));
984 return (count);
985 }
986
987 ssize_t
preadv(int fdes,struct iovec * iovp,int iovcnt,off_t offset,off_t extended_offset)988 preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
989 off_t extended_offset)
990 {
991 struct uio auio;
992 struct iovec buf[IOV_MAX_STACK], *aiov = buf;
993 int aiovlen = 0;
994 file_t *fp;
995 register vnode_t *vp;
996 struct cpu *cp;
997 int fflag, ioflag, rwflag;
998 ssize_t count, bcount;
999 int error = 0;
1000 int i;
1001
1002 /*
1003 * In a 64-bit kernel, this interface supports native 64-bit
1004 * applications as well as 32-bit applications using both standard and
1005 * large-file access. For 32-bit large-file aware applications, the
1006 * offset is passed as two parameters which are joined into the actual
1007 * offset used. The 64-bit libc always passes 0 for the extended_offset.
1008 * Note that off_t is a signed value, but the preadv/pwritev API treats
1009 * the offset as a position in the file for the operation, so passing
1010 * a negative value will likely fail the maximum offset checks below
1011 * because we convert it to an unsigned value which will be larger than
1012 * the maximum valid offset.
1013 */
1014 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1015 u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1016 (u_offset_t)offset;
1017 #else /* _SYSCALL32_IMPL || _ILP32 */
1018 u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1019 #endif /* _SYSCALL32_IMPR || _ILP32 */
1020
1021 int in_crit = 0;
1022
1023 if (iovcnt <= 0 || iovcnt > IOV_MAX)
1024 return (set_errno(EINVAL));
1025
1026 if (iovcnt > IOV_MAX_STACK) {
1027 aiovlen = iovcnt * sizeof (iovec_t);
1028 aiov = kmem_alloc(aiovlen, KM_SLEEP);
1029 }
1030
1031 #ifdef _SYSCALL32_IMPL
1032 /*
1033 * 32-bit callers need to have their iovec expanded,
1034 * while ensuring that they can't move more than 2Gbytes
1035 * of data in a single call.
1036 */
1037 if (get_udatamodel() == DATAMODEL_ILP32) {
1038 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1039 int aiov32len;
1040 ssize32_t count32;
1041
1042 aiov32len = iovcnt * sizeof (iovec32_t);
1043 if (aiovlen != 0)
1044 aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1045
1046 if (copyin(iovp, aiov32, aiov32len)) {
1047 if (aiovlen != 0) {
1048 kmem_free(aiov32, aiov32len);
1049 kmem_free(aiov, aiovlen);
1050 }
1051 return (set_errno(EFAULT));
1052 }
1053
1054 count32 = 0;
1055 for (i = 0; i < iovcnt; i++) {
1056 ssize32_t iovlen32 = aiov32[i].iov_len;
1057 count32 += iovlen32;
1058 if (iovlen32 < 0 || count32 < 0) {
1059 if (aiovlen != 0) {
1060 kmem_free(aiov32, aiov32len);
1061 kmem_free(aiov, aiovlen);
1062 }
1063 return (set_errno(EINVAL));
1064 }
1065 aiov[i].iov_len = iovlen32;
1066 aiov[i].iov_base =
1067 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1068 }
1069 if (aiovlen != 0)
1070 kmem_free(aiov32, aiov32len);
1071 } else
1072 #endif /* _SYSCALL32_IMPL */
1073 if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1074 if (aiovlen != 0)
1075 kmem_free(aiov, aiovlen);
1076 return (set_errno(EFAULT));
1077 }
1078
1079 count = 0;
1080 for (i = 0; i < iovcnt; i++) {
1081 ssize_t iovlen = aiov[i].iov_len;
1082 count += iovlen;
1083 if (iovlen < 0 || count < 0) {
1084 if (aiovlen != 0)
1085 kmem_free(aiov, aiovlen);
1086 return (set_errno(EINVAL));
1087 }
1088 }
1089
1090 if ((bcount = count) < 0) {
1091 if (aiovlen != 0)
1092 kmem_free(aiov, aiovlen);
1093 return (set_errno(EINVAL));
1094 }
1095 if ((fp = getf(fdes)) == NULL) {
1096 if (aiovlen != 0)
1097 kmem_free(aiov, aiovlen);
1098 return (set_errno(EBADF));
1099 }
1100 if (((fflag = fp->f_flag) & FREAD) == 0) {
1101 error = EBADF;
1102 goto out;
1103 }
1104 vp = fp->f_vnode;
1105 rwflag = 0;
1106
1107 /*
1108 * Behaviour is same as read(2). Please see comments in read above.
1109 */
1110 if (vp->v_type == VREG) {
1111 if (bcount == 0)
1112 goto out;
1113
1114 /* Handle offset past maximum offset allowed for file. */
1115 if (fileoff >= OFFSET_MAX(fp)) {
1116 struct vattr va;
1117 va.va_mask = AT_SIZE;
1118
1119 error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL);
1120 if (error == 0) {
1121 if (fileoff >= va.va_size) {
1122 count = 0;
1123 } else {
1124 error = EOVERFLOW;
1125 }
1126 }
1127 goto out;
1128 }
1129
1130 ASSERT(bcount == count);
1131
1132 /* Note: modified count used in nbl_conflict() call below. */
1133 if ((fileoff + count) > OFFSET_MAX(fp))
1134 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1135
1136 } else if (vp->v_type == VFIFO) {
1137 error = ESPIPE;
1138 goto out;
1139 }
1140 /*
1141 * We have to enter the critical region before calling VOP_RWLOCK
1142 * to avoid a deadlock with ufs.
1143 */
1144 if (nbl_need_check(vp)) {
1145 int svmand;
1146
1147 nbl_start_crit(vp, RW_READER);
1148 in_crit = 1;
1149 error = nbl_svmand(vp, fp->f_cred, &svmand);
1150 if (error != 0)
1151 goto out;
1152 if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
1153 error = EACCES;
1154 goto out;
1155 }
1156 }
1157
1158 (void) VOP_RWLOCK(vp, rwflag, NULL);
1159
1160 auio.uio_loffset = fileoff;
1161 auio.uio_iov = aiov;
1162 auio.uio_iovcnt = iovcnt;
1163 auio.uio_resid = bcount = count;
1164 auio.uio_segflg = UIO_USERSPACE;
1165 auio.uio_llimit = MAXOFFSET_T;
1166 auio.uio_fmode = fflag;
1167 if (bcount <= copyout_max_cached)
1168 auio.uio_extflg = UIO_COPY_CACHED;
1169 else
1170 auio.uio_extflg = UIO_COPY_DEFAULT;
1171
1172 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1173 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1174 count -= auio.uio_resid;
1175 CPU_STATS_ENTER_K();
1176 cp = CPU;
1177 CPU_STATS_ADDQ(cp, sys, sysread, 1);
1178 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
1179 CPU_STATS_EXIT_K();
1180 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1181
1182 VOP_RWUNLOCK(vp, rwflag, NULL);
1183
1184 if (error == EINTR && count != 0)
1185 error = 0;
1186 out:
1187 if (in_crit)
1188 nbl_end_crit(vp);
1189 releasef(fdes);
1190 if (aiovlen != 0)
1191 kmem_free(aiov, aiovlen);
1192 if (error)
1193 return (set_errno(error));
1194 return (count);
1195 }
1196
1197 ssize_t
pwritev(int fdes,struct iovec * iovp,int iovcnt,off_t offset,off_t extended_offset)1198 pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset,
1199 off_t extended_offset)
1200 {
1201 struct uio auio;
1202 struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1203 int aiovlen = 0;
1204 file_t *fp;
1205 register vnode_t *vp;
1206 struct cpu *cp;
1207 int fflag, ioflag, rwflag;
1208 ssize_t count, bcount;
1209 int error = 0;
1210 int i;
1211
1212 /*
1213 * See the comment in preadv for how the offset is handled.
1214 */
1215 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1216 u_offset_t fileoff = ((u_offset_t)extended_offset << 32) |
1217 (u_offset_t)offset;
1218 #else /* _SYSCALL32_IMPL || _ILP32 */
1219 u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
1220 #endif /* _SYSCALL32_IMPR || _ILP32 */
1221
1222 int in_crit = 0;
1223
1224 if (iovcnt <= 0 || iovcnt > IOV_MAX)
1225 return (set_errno(EINVAL));
1226
1227 if (iovcnt > IOV_MAX_STACK) {
1228 aiovlen = iovcnt * sizeof (iovec_t);
1229 aiov = kmem_alloc(aiovlen, KM_SLEEP);
1230 }
1231
1232 #ifdef _SYSCALL32_IMPL
1233 /*
1234 * 32-bit callers need to have their iovec expanded,
1235 * while ensuring that they can't move more than 2Gbytes
1236 * of data in a single call.
1237 */
1238 if (get_udatamodel() == DATAMODEL_ILP32) {
1239 struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1240 int aiov32len;
1241 ssize32_t count32;
1242
1243 aiov32len = iovcnt * sizeof (iovec32_t);
1244 if (aiovlen != 0)
1245 aiov32 = kmem_alloc(aiov32len, KM_SLEEP);
1246
1247 if (copyin(iovp, aiov32, aiov32len)) {
1248 if (aiovlen != 0) {
1249 kmem_free(aiov32, aiov32len);
1250 kmem_free(aiov, aiovlen);
1251 }
1252 return (set_errno(EFAULT));
1253 }
1254
1255 count32 = 0;
1256 for (i = 0; i < iovcnt; i++) {
1257 ssize32_t iovlen32 = aiov32[i].iov_len;
1258 count32 += iovlen32;
1259 if (iovlen32 < 0 || count32 < 0) {
1260 if (aiovlen != 0) {
1261 kmem_free(aiov32, aiov32len);
1262 kmem_free(aiov, aiovlen);
1263 }
1264 return (set_errno(EINVAL));
1265 }
1266 aiov[i].iov_len = iovlen32;
1267 aiov[i].iov_base =
1268 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1269 }
1270 if (aiovlen != 0)
1271 kmem_free(aiov32, aiov32len);
1272 } else
1273 #endif /* _SYSCALL32_IMPL */
1274 if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) {
1275 if (aiovlen != 0)
1276 kmem_free(aiov, aiovlen);
1277 return (set_errno(EFAULT));
1278 }
1279
1280 count = 0;
1281 for (i = 0; i < iovcnt; i++) {
1282 ssize_t iovlen = aiov[i].iov_len;
1283 count += iovlen;
1284 if (iovlen < 0 || count < 0) {
1285 if (aiovlen != 0)
1286 kmem_free(aiov, aiovlen);
1287 return (set_errno(EINVAL));
1288 }
1289 }
1290
1291 if ((bcount = count) < 0) {
1292 if (aiovlen != 0)
1293 kmem_free(aiov, aiovlen);
1294 return (set_errno(EINVAL));
1295 }
1296 if ((fp = getf(fdes)) == NULL) {
1297 if (aiovlen != 0)
1298 kmem_free(aiov, aiovlen);
1299 return (set_errno(EBADF));
1300 }
1301 if (((fflag = fp->f_flag) & FWRITE) == 0) {
1302 error = EBADF;
1303 goto out;
1304 }
1305 vp = fp->f_vnode;
1306 rwflag = 1;
1307
1308 /*
1309 * The kernel's write(2) code checks OFFSET_MAX and the rctl, and
1310 * returns EFBIG when fileoff exceeds either limit. We do the same.
1311 */
1312 if (vp->v_type == VREG) {
1313 if (bcount == 0)
1314 goto out;
1315
1316 /*
1317 * Don't allow pwritev to cause file size to exceed the proper
1318 * offset limit.
1319 */
1320 if (fileoff >= OFFSET_MAX(fp)) {
1321 error = EFBIG;
1322 goto out;
1323 }
1324
1325 /*
1326 * Take appropriate action if we are trying
1327 * to write above the resource limit.
1328 */
1329 if (fileoff >= curproc->p_fsz_ctl) {
1330 mutex_enter(&curproc->p_lock);
1331 /*
1332 * Return value ignored because it lists
1333 * actions taken, but we are in an error case.
1334 * We don't have any actions that depend on
1335 * what could happen in this call, so we ignore
1336 * the return value.
1337 */
1338 (void) rctl_action(
1339 rctlproc_legacy[RLIMIT_FSIZE],
1340 curproc->p_rctls, curproc,
1341 RCA_UNSAFE_SIGINFO);
1342 mutex_exit(&curproc->p_lock);
1343
1344 error = EFBIG;
1345 goto out;
1346 }
1347
1348 ASSERT(bcount == count);
1349
1350 /* Note: modified count used in nbl_conflict() call below. */
1351 if ((fileoff + count) > OFFSET_MAX(fp))
1352 count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
1353
1354 } else if (vp->v_type == VFIFO) {
1355 error = ESPIPE;
1356 goto out;
1357 }
1358 /*
1359 * We have to enter the critical region before calling VOP_RWLOCK
1360 * to avoid a deadlock with ufs.
1361 */
1362 if (nbl_need_check(vp)) {
1363 int svmand;
1364
1365 nbl_start_crit(vp, RW_READER);
1366 in_crit = 1;
1367 error = nbl_svmand(vp, fp->f_cred, &svmand);
1368 if (error != 0)
1369 goto out;
1370 if (nbl_conflict(vp, NBL_WRITE, fileoff, count, svmand, NULL)) {
1371 error = EACCES;
1372 goto out;
1373 }
1374 }
1375
1376 (void) VOP_RWLOCK(vp, rwflag, NULL);
1377
1378 auio.uio_loffset = fileoff;
1379 auio.uio_iov = aiov;
1380 auio.uio_iovcnt = iovcnt;
1381 auio.uio_resid = bcount = count;
1382 auio.uio_segflg = UIO_USERSPACE;
1383 auio.uio_llimit = curproc->p_fsz_ctl;
1384 auio.uio_fmode = fflag;
1385 auio.uio_extflg = UIO_COPY_CACHED;
1386 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1387 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1388 count -= auio.uio_resid;
1389 CPU_STATS_ENTER_K();
1390 cp = CPU;
1391 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1392 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
1393 CPU_STATS_EXIT_K();
1394 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
1395
1396 VOP_RWUNLOCK(vp, rwflag, NULL);
1397
1398 if (error == EINTR && count != 0)
1399 error = 0;
1400 out:
1401 if (in_crit)
1402 nbl_end_crit(vp);
1403 releasef(fdes);
1404 if (aiovlen != 0)
1405 kmem_free(aiov, aiovlen);
1406 if (error)
1407 return (set_errno(error));
1408 return (count);
1409 }
1410
1411 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
1412
1413 /*
1414 * This syscall supplies 64-bit file offsets to 32-bit applications only.
1415 */
1416 ssize32_t
pread64(int fdes,void * cbuf,size32_t count,uint32_t offset_1,uint32_t offset_2)1417 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1418 uint32_t offset_2)
1419 {
1420 struct uio auio;
1421 struct iovec aiov;
1422 file_t *fp;
1423 register vnode_t *vp;
1424 struct cpu *cp;
1425 int fflag, ioflag, rwflag;
1426 ssize_t bcount;
1427 int error = 0;
1428 u_offset_t fileoff;
1429 int in_crit = 0;
1430
1431 #if defined(_LITTLE_ENDIAN)
1432 fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1433 #else
1434 fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1435 #endif
1436
1437 if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1438 return (set_errno(EINVAL));
1439
1440 if ((fp = getf(fdes)) == NULL)
1441 return (set_errno(EBADF));
1442 if (((fflag = fp->f_flag) & (FREAD)) == 0) {
1443 error = EBADF;
1444 goto out;
1445 }
1446
1447 rwflag = 0;
1448 vp = fp->f_vnode;
1449
1450 if (vp->v_type == VREG) {
1451
1452 if (bcount == 0)
1453 goto out;
1454
1455 /*
1456 * Same as pread. See comments in pread.
1457 */
1458
1459 if (fileoff > MAXOFFSET_T) {
1460 error = EINVAL;
1461 goto out;
1462 }
1463 if (fileoff + bcount > MAXOFFSET_T)
1464 bcount = (ssize_t)(MAXOFFSET_T - fileoff);
1465 } else if (vp->v_type == VFIFO) {
1466 error = ESPIPE;
1467 goto out;
1468 }
1469
1470 /*
1471 * We have to enter the critical region before calling VOP_RWLOCK
1472 * to avoid a deadlock with ufs.
1473 */
1474 if (nbl_need_check(vp)) {
1475 int svmand;
1476
1477 nbl_start_crit(vp, RW_READER);
1478 in_crit = 1;
1479 error = nbl_svmand(vp, fp->f_cred, &svmand);
1480 if (error != 0)
1481 goto out;
1482 if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
1483 NULL)) {
1484 error = EACCES;
1485 goto out;
1486 }
1487 }
1488
1489 aiov.iov_base = cbuf;
1490 aiov.iov_len = bcount;
1491 (void) VOP_RWLOCK(vp, rwflag, NULL);
1492 auio.uio_loffset = fileoff;
1493
1494 /*
1495 * Note: File size can never be greater than MAXOFFSET_T.
1496 * If ever we start supporting 128 bit files the code
1497 * similar to the one in pread at this place should be here.
1498 * Here we avoid the unnecessary VOP_GETATTR() when we
1499 * know that fileoff == MAXOFFSET_T implies that it is always
1500 * greater than or equal to file size.
1501 */
1502 auio.uio_iov = &aiov;
1503 auio.uio_iovcnt = 1;
1504 auio.uio_resid = bcount;
1505 auio.uio_segflg = UIO_USERSPACE;
1506 auio.uio_llimit = MAXOFFSET_T;
1507 auio.uio_fmode = fflag;
1508 auio.uio_extflg = UIO_COPY_CACHED;
1509
1510 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1511
1512 /* If read sync is not asked for, filter sync flags */
1513 if ((ioflag & FRSYNC) == 0)
1514 ioflag &= ~(FSYNC|FDSYNC);
1515 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1516 bcount -= auio.uio_resid;
1517 CPU_STATS_ENTER_K();
1518 cp = CPU;
1519 CPU_STATS_ADDQ(cp, sys, sysread, 1);
1520 CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
1521 CPU_STATS_EXIT_K();
1522 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1523 VOP_RWUNLOCK(vp, rwflag, NULL);
1524
1525 if (error == EINTR && bcount != 0)
1526 error = 0;
1527 out:
1528 if (in_crit)
1529 nbl_end_crit(vp);
1530 releasef(fdes);
1531 if (error)
1532 return (set_errno(error));
1533 return (bcount);
1534 }
1535
1536 /*
1537 * This syscall supplies 64-bit file offsets to 32-bit applications only.
1538 */
1539 ssize32_t
pwrite64(int fdes,void * cbuf,size32_t count,uint32_t offset_1,uint32_t offset_2)1540 pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
1541 uint32_t offset_2)
1542 {
1543 struct uio auio;
1544 struct iovec aiov;
1545 file_t *fp;
1546 register vnode_t *vp;
1547 struct cpu *cp;
1548 int fflag, ioflag, rwflag;
1549 ssize_t bcount;
1550 int error = 0;
1551 u_offset_t fileoff;
1552 int in_crit = 0;
1553
1554 #if defined(_LITTLE_ENDIAN)
1555 fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
1556 #else
1557 fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
1558 #endif
1559
1560 if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
1561 return (set_errno(EINVAL));
1562 if ((fp = getf(fdes)) == NULL)
1563 return (set_errno(EBADF));
1564 if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
1565 error = EBADF;
1566 goto out;
1567 }
1568
1569 rwflag = 1;
1570 vp = fp->f_vnode;
1571
1572 if (vp->v_type == VREG) {
1573
1574 if (bcount == 0)
1575 goto out;
1576
1577 /*
1578 * See comments in pwrite.
1579 */
1580 if (fileoff > MAXOFFSET_T) {
1581 error = EINVAL;
1582 goto out;
1583 }
1584 if (fileoff >= curproc->p_fsz_ctl) {
1585 mutex_enter(&curproc->p_lock);
1586 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
1587 curproc->p_rctls, curproc, RCA_SAFE);
1588 mutex_exit(&curproc->p_lock);
1589 error = EFBIG;
1590 goto out;
1591 }
1592 if (fileoff == MAXOFFSET_T) {
1593 error = EFBIG;
1594 goto out;
1595 }
1596 if (fileoff + bcount > MAXOFFSET_T)
1597 bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
1598 } else if (vp->v_type == VFIFO) {
1599 error = ESPIPE;
1600 goto out;
1601 }
1602
1603 /*
1604 * We have to enter the critical region before calling VOP_RWLOCK
1605 * to avoid a deadlock with ufs.
1606 */
1607 if (nbl_need_check(vp)) {
1608 int svmand;
1609
1610 nbl_start_crit(vp, RW_READER);
1611 in_crit = 1;
1612 error = nbl_svmand(vp, fp->f_cred, &svmand);
1613 if (error != 0)
1614 goto out;
1615 if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
1616 NULL)) {
1617 error = EACCES;
1618 goto out;
1619 }
1620 }
1621
1622 aiov.iov_base = cbuf;
1623 aiov.iov_len = bcount;
1624 (void) VOP_RWLOCK(vp, rwflag, NULL);
1625 auio.uio_loffset = fileoff;
1626 auio.uio_iov = &aiov;
1627 auio.uio_iovcnt = 1;
1628 auio.uio_resid = bcount;
1629 auio.uio_segflg = UIO_USERSPACE;
1630 auio.uio_llimit = curproc->p_fsz_ctl;
1631 auio.uio_fmode = fflag;
1632 auio.uio_extflg = UIO_COPY_CACHED;
1633
1634 /*
1635 * The SUSv4 POSIX specification states:
1636 * The pwrite() function shall be equivalent to write(), except
1637 * that it writes into a given position and does not change
1638 * the file offset (regardless of whether O_APPEND is set).
1639 * To make this be true, we omit the FAPPEND flag from ioflag.
1640 */
1641 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
1642
1643 error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
1644 bcount -= auio.uio_resid;
1645 CPU_STATS_ENTER_K();
1646 cp = CPU;
1647 CPU_STATS_ADDQ(cp, sys, syswrite, 1);
1648 CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
1649 CPU_STATS_EXIT_K();
1650 ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
1651 VOP_RWUNLOCK(vp, rwflag, NULL);
1652
1653 if (error == EINTR && bcount != 0)
1654 error = 0;
1655 out:
1656 if (in_crit)
1657 nbl_end_crit(vp);
1658 releasef(fdes);
1659 if (error)
1660 return (set_errno(error));
1661 return (bcount);
1662 }
1663
1664 #endif /* _SYSCALL32_IMPL || _ILP32 */
1665
1666 #ifdef _SYSCALL32_IMPL
1667 /*
1668 * Tail-call elimination of xxx32() down to xxx()
1669 *
1670 * A number of xxx32 system calls take a len (or count) argument and
1671 * return a number in the range [0,len] or -1 on error.
1672 * Given an ssize32_t input len, the downcall xxx() will return
1673 * a 64-bit value that is -1 or in the range [0,len] which actually
1674 * is a proper return value for the xxx32 call. So even if the xxx32
1675 * calls can be considered as returning a ssize32_t, they are currently
1676 * declared as returning a ssize_t as this enables tail-call elimination.
1677 *
1678 * The cast of len (or count) to ssize32_t is needed to ensure we pass
1679 * down negative input values as such and let the downcall handle error
1680 * reporting. Functions covered by this comment are:
1681 *
1682 * rw.c: read32, write32, pread32, pwrite32, readv32, writev32.
1683 * socksyscall.c: recv32, recvfrom32, send32, sendto32.
1684 * readlink.c: readlink32.
1685 */
1686
1687 ssize_t
read32(int32_t fdes,caddr32_t cbuf,size32_t count)1688 read32(int32_t fdes, caddr32_t cbuf, size32_t count)
1689 {
1690 return (read(fdes,
1691 (void *)(uintptr_t)cbuf, (ssize32_t)count));
1692 }
1693
1694 ssize_t
write32(int32_t fdes,caddr32_t cbuf,size32_t count)1695 write32(int32_t fdes, caddr32_t cbuf, size32_t count)
1696 {
1697 return (write(fdes,
1698 (void *)(uintptr_t)cbuf, (ssize32_t)count));
1699 }
1700
1701 ssize_t
pread32(int32_t fdes,caddr32_t cbuf,size32_t count,off32_t offset)1702 pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1703 {
1704 return (pread(fdes,
1705 (void *)(uintptr_t)cbuf, (ssize32_t)count,
1706 (off_t)(uint32_t)offset));
1707 }
1708
1709 ssize_t
pwrite32(int32_t fdes,caddr32_t cbuf,size32_t count,off32_t offset)1710 pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
1711 {
1712 return (pwrite(fdes,
1713 (void *)(uintptr_t)cbuf, (ssize32_t)count,
1714 (off_t)(uint32_t)offset));
1715 }
1716
1717 ssize_t
readv32(int32_t fdes,caddr32_t iovp,int32_t iovcnt)1718 readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1719 {
1720 return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
1721 }
1722
1723 ssize_t
writev32(int32_t fdes,caddr32_t iovp,int32_t iovcnt)1724 writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
1725 {
1726 return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
1727 }
1728 #endif /* _SYSCALL32_IMPL */
1729