xref: /linux/fs/read_write.c (revision cc04a46f11ea046ed53e2c832ae29e4790f7e35f)
1 /*
2  *  linux/fs/read_write.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/fsnotify.h>
13 #include <linux/security.h>
14 #include <linux/export.h>
15 #include <linux/syscalls.h>
16 #include <linux/pagemap.h>
17 #include <linux/splice.h>
18 #include <linux/compat.h>
19 #include "internal.h"
20 
21 #include <asm/uaccess.h>
22 #include <asm/unistd.h>
23 
24 typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
25 typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
26 
27 const struct file_operations generic_ro_fops = {
28 	.llseek		= generic_file_llseek,
29 	.read_iter	= generic_file_read_iter,
30 	.mmap		= generic_file_readonly_mmap,
31 	.splice_read	= generic_file_splice_read,
32 };
33 
34 EXPORT_SYMBOL(generic_ro_fops);
35 
36 static inline int unsigned_offsets(struct file *file)
37 {
38 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
39 }
40 
41 /**
42  * vfs_setpos - update the file offset for lseek
43  * @file:	file structure in question
44  * @offset:	file offset to seek to
45  * @maxsize:	maximum file size
46  *
47  * This is a low-level filesystem helper for updating the file offset to
48  * the value specified by @offset if the given offset is valid and it is
49  * not equal to the current file offset.
50  *
51  * Return the specified offset on success and -EINVAL on invalid offset.
52  */
53 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
54 {
55 	if (offset < 0 && !unsigned_offsets(file))
56 		return -EINVAL;
57 	if (offset > maxsize)
58 		return -EINVAL;
59 
60 	if (offset != file->f_pos) {
61 		file->f_pos = offset;
62 		file->f_version = 0;
63 	}
64 	return offset;
65 }
66 EXPORT_SYMBOL(vfs_setpos);
67 
68 /**
69  * generic_file_llseek_size - generic llseek implementation for regular files
70  * @file:	file structure to seek on
71  * @offset:	file offset to seek to
72  * @whence:	type of seek
73  * @size:	max size of this file in file system
74  * @eof:	offset used for SEEK_END position
75  *
76  * This is a variant of generic_file_llseek that allows passing in a custom
77  * maximum file size and a custom EOF position, for e.g. hashed directories
78  *
79  * Synchronization:
80  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
81  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
82  * read/writes behave like SEEK_SET against seeks.
83  */
84 loff_t
85 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
86 		loff_t maxsize, loff_t eof)
87 {
88 	switch (whence) {
89 	case SEEK_END:
90 		offset += eof;
91 		break;
92 	case SEEK_CUR:
93 		/*
94 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
95 		 * position-querying operation.  Avoid rewriting the "same"
96 		 * f_pos value back to the file because a concurrent read(),
97 		 * write() or lseek() might have altered it
98 		 */
99 		if (offset == 0)
100 			return file->f_pos;
101 		/*
102 		 * f_lock protects against read/modify/write race with other
103 		 * SEEK_CURs. Note that parallel writes and reads behave
104 		 * like SEEK_SET.
105 		 */
106 		spin_lock(&file->f_lock);
107 		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
108 		spin_unlock(&file->f_lock);
109 		return offset;
110 	case SEEK_DATA:
111 		/*
112 		 * In the generic case the entire file is data, so as long as
113 		 * offset isn't at the end of the file then the offset is data.
114 		 */
115 		if (offset >= eof)
116 			return -ENXIO;
117 		break;
118 	case SEEK_HOLE:
119 		/*
120 		 * There is a virtual hole at the end of the file, so as long as
121 		 * offset isn't i_size or larger, return i_size.
122 		 */
123 		if (offset >= eof)
124 			return -ENXIO;
125 		offset = eof;
126 		break;
127 	}
128 
129 	return vfs_setpos(file, offset, maxsize);
130 }
131 EXPORT_SYMBOL(generic_file_llseek_size);
132 
133 /**
134  * generic_file_llseek - generic llseek implementation for regular files
135  * @file:	file structure to seek on
136  * @offset:	file offset to seek to
137  * @whence:	type of seek
138  *
139  * This is a generic implemenation of ->llseek useable for all normal local
140  * filesystems.  It just updates the file offset to the value specified by
141  * @offset and @whence.
142  */
143 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
144 {
145 	struct inode *inode = file->f_mapping->host;
146 
147 	return generic_file_llseek_size(file, offset, whence,
148 					inode->i_sb->s_maxbytes,
149 					i_size_read(inode));
150 }
151 EXPORT_SYMBOL(generic_file_llseek);
152 
153 /**
154  * fixed_size_llseek - llseek implementation for fixed-sized devices
155  * @file:	file structure to seek on
156  * @offset:	file offset to seek to
157  * @whence:	type of seek
158  * @size:	size of the file
159  *
160  */
161 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
162 {
163 	switch (whence) {
164 	case SEEK_SET: case SEEK_CUR: case SEEK_END:
165 		return generic_file_llseek_size(file, offset, whence,
166 						size, size);
167 	default:
168 		return -EINVAL;
169 	}
170 }
171 EXPORT_SYMBOL(fixed_size_llseek);
172 
173 /**
174  * noop_llseek - No Operation Performed llseek implementation
175  * @file:	file structure to seek on
176  * @offset:	file offset to seek to
177  * @whence:	type of seek
178  *
179  * This is an implementation of ->llseek useable for the rare special case when
180  * userspace expects the seek to succeed but the (device) file is actually not
181  * able to perform the seek. In this case you use noop_llseek() instead of
182  * falling back to the default implementation of ->llseek.
183  */
184 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
185 {
186 	return file->f_pos;
187 }
188 EXPORT_SYMBOL(noop_llseek);
189 
190 loff_t no_llseek(struct file *file, loff_t offset, int whence)
191 {
192 	return -ESPIPE;
193 }
194 EXPORT_SYMBOL(no_llseek);
195 
196 loff_t default_llseek(struct file *file, loff_t offset, int whence)
197 {
198 	struct inode *inode = file_inode(file);
199 	loff_t retval;
200 
201 	mutex_lock(&inode->i_mutex);
202 	switch (whence) {
203 		case SEEK_END:
204 			offset += i_size_read(inode);
205 			break;
206 		case SEEK_CUR:
207 			if (offset == 0) {
208 				retval = file->f_pos;
209 				goto out;
210 			}
211 			offset += file->f_pos;
212 			break;
213 		case SEEK_DATA:
214 			/*
215 			 * In the generic case the entire file is data, so as
216 			 * long as offset isn't at the end of the file then the
217 			 * offset is data.
218 			 */
219 			if (offset >= inode->i_size) {
220 				retval = -ENXIO;
221 				goto out;
222 			}
223 			break;
224 		case SEEK_HOLE:
225 			/*
226 			 * There is a virtual hole at the end of the file, so
227 			 * as long as offset isn't i_size or larger, return
228 			 * i_size.
229 			 */
230 			if (offset >= inode->i_size) {
231 				retval = -ENXIO;
232 				goto out;
233 			}
234 			offset = inode->i_size;
235 			break;
236 	}
237 	retval = -EINVAL;
238 	if (offset >= 0 || unsigned_offsets(file)) {
239 		if (offset != file->f_pos) {
240 			file->f_pos = offset;
241 			file->f_version = 0;
242 		}
243 		retval = offset;
244 	}
245 out:
246 	mutex_unlock(&inode->i_mutex);
247 	return retval;
248 }
249 EXPORT_SYMBOL(default_llseek);
250 
251 loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
252 {
253 	loff_t (*fn)(struct file *, loff_t, int);
254 
255 	fn = no_llseek;
256 	if (file->f_mode & FMODE_LSEEK) {
257 		if (file->f_op->llseek)
258 			fn = file->f_op->llseek;
259 	}
260 	return fn(file, offset, whence);
261 }
262 EXPORT_SYMBOL(vfs_llseek);
263 
264 static inline struct fd fdget_pos(int fd)
265 {
266 	return __to_fd(__fdget_pos(fd));
267 }
268 
269 static inline void fdput_pos(struct fd f)
270 {
271 	if (f.flags & FDPUT_POS_UNLOCK)
272 		mutex_unlock(&f.file->f_pos_lock);
273 	fdput(f);
274 }
275 
276 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
277 {
278 	off_t retval;
279 	struct fd f = fdget_pos(fd);
280 	if (!f.file)
281 		return -EBADF;
282 
283 	retval = -EINVAL;
284 	if (whence <= SEEK_MAX) {
285 		loff_t res = vfs_llseek(f.file, offset, whence);
286 		retval = res;
287 		if (res != (loff_t)retval)
288 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
289 	}
290 	fdput_pos(f);
291 	return retval;
292 }
293 
#ifdef CONFIG_COMPAT
/* Compat lseek entry point: forwards its arguments to sys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset,
		       unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
#endif
300 
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek: seek using an offset passed as two halves and hand the resulting
 * position back through @result.  Returns 0 on success, -EBADF for an
 * invalid descriptor, -EINVAL for a @whence above SEEK_MAX, the error from
 * vfs_llseek(), or -EFAULT when @result cannot be written.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	struct fd f = fdget_pos(fd);
	loff_t requested;
	loff_t landed;
	int err;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX) {
		err = -EINVAL;
	} else {
		requested = ((loff_t) offset_high << 32) | offset_low;
		landed = vfs_llseek(f.file, requested, whence);

		if (landed < 0)
			err = (int)landed;
		else if (copy_to_user(result, &landed, sizeof(landed)))
			err = -EFAULT;
		else
			err = 0;
	}

	fdput_pos(f);
	return err;
}
#endif
331 
332 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
333 {
334 	struct kiocb kiocb;
335 	ssize_t ret;
336 
337 	if (!file->f_op->read_iter)
338 		return -EINVAL;
339 
340 	init_sync_kiocb(&kiocb, file);
341 	kiocb.ki_pos = *ppos;
342 
343 	iter->type |= READ;
344 	ret = file->f_op->read_iter(&kiocb, iter);
345 	BUG_ON(ret == -EIOCBQUEUED);
346 	if (ret > 0)
347 		*ppos = kiocb.ki_pos;
348 	return ret;
349 }
350 EXPORT_SYMBOL(vfs_iter_read);
351 
352 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
353 {
354 	struct kiocb kiocb;
355 	ssize_t ret;
356 
357 	if (!file->f_op->write_iter)
358 		return -EINVAL;
359 
360 	init_sync_kiocb(&kiocb, file);
361 	kiocb.ki_pos = *ppos;
362 
363 	iter->type |= WRITE;
364 	ret = file->f_op->write_iter(&kiocb, iter);
365 	BUG_ON(ret == -EIOCBQUEUED);
366 	if (ret > 0)
367 		*ppos = kiocb.ki_pos;
368 	return ret;
369 }
370 EXPORT_SYMBOL(vfs_iter_write);
371 
372 /*
373  * rw_verify_area doesn't like huge counts. We limit
374  * them to something that fits in "int" so that others
375  * won't have to do range checks all the time.
376  */
377 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
378 {
379 	struct inode *inode;
380 	loff_t pos;
381 	int retval = -EINVAL;
382 
383 	inode = file_inode(file);
384 	if (unlikely((ssize_t) count < 0))
385 		return retval;
386 	pos = *ppos;
387 	if (unlikely(pos < 0)) {
388 		if (!unsigned_offsets(file))
389 			return retval;
390 		if (count >= -pos) /* both values are in 0..LLONG_MAX */
391 			return -EOVERFLOW;
392 	} else if (unlikely((loff_t) (pos + count) < 0)) {
393 		if (!unsigned_offsets(file))
394 			return retval;
395 	}
396 
397 	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
398 		retval = locks_mandatory_area(
399 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
400 			inode, file, pos, count);
401 		if (retval < 0)
402 			return retval;
403 	}
404 	retval = security_file_permission(file,
405 				read_write == READ ? MAY_READ : MAY_WRITE);
406 	if (retval)
407 		return retval;
408 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
409 }
410 
411 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
412 {
413 	struct iovec iov = { .iov_base = buf, .iov_len = len };
414 	struct kiocb kiocb;
415 	struct iov_iter iter;
416 	ssize_t ret;
417 
418 	init_sync_kiocb(&kiocb, filp);
419 	kiocb.ki_pos = *ppos;
420 	iov_iter_init(&iter, READ, &iov, 1, len);
421 
422 	ret = filp->f_op->read_iter(&kiocb, &iter);
423 	BUG_ON(ret == -EIOCBQUEUED);
424 	*ppos = kiocb.ki_pos;
425 	return ret;
426 }
427 
428 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
429 		   loff_t *pos)
430 {
431 	if (file->f_op->read)
432 		return file->f_op->read(file, buf, count, pos);
433 	else if (file->f_op->read_iter)
434 		return new_sync_read(file, buf, count, pos);
435 	else
436 		return -EINVAL;
437 }
438 EXPORT_SYMBOL(__vfs_read);
439 
440 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
441 {
442 	ssize_t ret;
443 
444 	if (!(file->f_mode & FMODE_READ))
445 		return -EBADF;
446 	if (!(file->f_mode & FMODE_CAN_READ))
447 		return -EINVAL;
448 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
449 		return -EFAULT;
450 
451 	ret = rw_verify_area(READ, file, pos, count);
452 	if (ret >= 0) {
453 		count = ret;
454 		ret = __vfs_read(file, buf, count, pos);
455 		if (ret > 0) {
456 			fsnotify_access(file);
457 			add_rchar(current, ret);
458 		}
459 		inc_syscr(current);
460 	}
461 
462 	return ret;
463 }
464 
465 EXPORT_SYMBOL(vfs_read);
466 
467 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
468 {
469 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
470 	struct kiocb kiocb;
471 	struct iov_iter iter;
472 	ssize_t ret;
473 
474 	init_sync_kiocb(&kiocb, filp);
475 	kiocb.ki_pos = *ppos;
476 	iov_iter_init(&iter, WRITE, &iov, 1, len);
477 
478 	ret = filp->f_op->write_iter(&kiocb, &iter);
479 	BUG_ON(ret == -EIOCBQUEUED);
480 	if (ret > 0)
481 		*ppos = kiocb.ki_pos;
482 	return ret;
483 }
484 
485 ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
486 		    loff_t *pos)
487 {
488 	if (file->f_op->write)
489 		return file->f_op->write(file, p, count, pos);
490 	else if (file->f_op->write_iter)
491 		return new_sync_write(file, p, count, pos);
492 	else
493 		return -EINVAL;
494 }
495 EXPORT_SYMBOL(__vfs_write);
496 
497 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
498 {
499 	mm_segment_t old_fs;
500 	const char __user *p;
501 	ssize_t ret;
502 
503 	if (!(file->f_mode & FMODE_CAN_WRITE))
504 		return -EINVAL;
505 
506 	old_fs = get_fs();
507 	set_fs(get_ds());
508 	p = (__force const char __user *)buf;
509 	if (count > MAX_RW_COUNT)
510 		count =  MAX_RW_COUNT;
511 	ret = __vfs_write(file, p, count, pos);
512 	set_fs(old_fs);
513 	if (ret > 0) {
514 		fsnotify_modify(file);
515 		add_wchar(current, ret);
516 	}
517 	inc_syscw(current);
518 	return ret;
519 }
520 
521 EXPORT_SYMBOL(__kernel_write);
522 
523 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
524 {
525 	ssize_t ret;
526 
527 	if (!(file->f_mode & FMODE_WRITE))
528 		return -EBADF;
529 	if (!(file->f_mode & FMODE_CAN_WRITE))
530 		return -EINVAL;
531 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
532 		return -EFAULT;
533 
534 	ret = rw_verify_area(WRITE, file, pos, count);
535 	if (ret >= 0) {
536 		count = ret;
537 		file_start_write(file);
538 		ret = __vfs_write(file, buf, count, pos);
539 		if (ret > 0) {
540 			fsnotify_modify(file);
541 			add_wchar(current, ret);
542 		}
543 		inc_syscw(current);
544 		file_end_write(file);
545 	}
546 
547 	return ret;
548 }
549 
550 EXPORT_SYMBOL(vfs_write);
551 
552 static inline loff_t file_pos_read(struct file *file)
553 {
554 	return file->f_pos;
555 }
556 
557 static inline void file_pos_write(struct file *file, loff_t pos)
558 {
559 	file->f_pos = pos;
560 }
561 
562 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
563 {
564 	struct fd f = fdget_pos(fd);
565 	ssize_t ret = -EBADF;
566 
567 	if (f.file) {
568 		loff_t pos = file_pos_read(f.file);
569 		ret = vfs_read(f.file, buf, count, &pos);
570 		if (ret >= 0)
571 			file_pos_write(f.file, pos);
572 		fdput_pos(f);
573 	}
574 	return ret;
575 }
576 
577 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
578 		size_t, count)
579 {
580 	struct fd f = fdget_pos(fd);
581 	ssize_t ret = -EBADF;
582 
583 	if (f.file) {
584 		loff_t pos = file_pos_read(f.file);
585 		ret = vfs_write(f.file, buf, count, &pos);
586 		if (ret >= 0)
587 			file_pos_write(f.file, pos);
588 		fdput_pos(f);
589 	}
590 
591 	return ret;
592 }
593 
594 SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
595 			size_t, count, loff_t, pos)
596 {
597 	struct fd f;
598 	ssize_t ret = -EBADF;
599 
600 	if (pos < 0)
601 		return -EINVAL;
602 
603 	f = fdget(fd);
604 	if (f.file) {
605 		ret = -ESPIPE;
606 		if (f.file->f_mode & FMODE_PREAD)
607 			ret = vfs_read(f.file, buf, count, &pos);
608 		fdput(f);
609 	}
610 
611 	return ret;
612 }
613 
614 SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
615 			 size_t, count, loff_t, pos)
616 {
617 	struct fd f;
618 	ssize_t ret = -EBADF;
619 
620 	if (pos < 0)
621 		return -EINVAL;
622 
623 	f = fdget(fd);
624 	if (f.file) {
625 		ret = -ESPIPE;
626 		if (f.file->f_mode & FMODE_PWRITE)
627 			ret = vfs_write(f.file, buf, count, &pos);
628 		fdput(f);
629 	}
630 
631 	return ret;
632 }
633 
/*
 * Reduce an iovec's length in-place so that it describes at most @to bytes.
 * The segment in which the running total reaches @to is trimmed to the
 * remaining byte count; later segments are left untouched.
 *
 * Return the resulting number of segments.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	size_t consumed = 0;
	unsigned long idx;

	for (idx = 0; idx < nr_segs; idx++) {
		struct iovec *cur = &iov[idx];
		size_t total = consumed + cur->iov_len;

		if (total >= to) {
			/* This segment crosses the limit: trim it and stop. */
			cur->iov_len = to - consumed;
			return idx + 1;
		}
		consumed = total;
	}
	return nr_segs;
}
EXPORT_SYMBOL(iov_shorten);
654 
655 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
656 		loff_t *ppos, iter_fn_t fn)
657 {
658 	struct kiocb kiocb;
659 	ssize_t ret;
660 
661 	init_sync_kiocb(&kiocb, filp);
662 	kiocb.ki_pos = *ppos;
663 
664 	ret = fn(&kiocb, iter);
665 	BUG_ON(ret == -EIOCBQUEUED);
666 	*ppos = kiocb.ki_pos;
667 	return ret;
668 }
669 
670 /* Do it by hand, with file-ops */
671 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
672 		loff_t *ppos, io_fn_t fn)
673 {
674 	ssize_t ret = 0;
675 
676 	while (iov_iter_count(iter)) {
677 		struct iovec iovec = iov_iter_iovec(iter);
678 		ssize_t nr;
679 
680 		nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
681 
682 		if (nr < 0) {
683 			if (!ret)
684 				ret = nr;
685 			break;
686 		}
687 		ret += nr;
688 		if (nr != iovec.iov_len)
689 			break;
690 		iov_iter_advance(iter, nr);
691 	}
692 
693 	return ret;
694 }
695 
/*
 * A read operation stores into user space and therefore needs
 * VERIFY_WRITE; every other type only reads it and needs VERIFY_READ.
 */
#define vrfy_dir(type) ((type) != READ ? VERIFY_READ : VERIFY_WRITE)
698 
699 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
700 			      unsigned long nr_segs, unsigned long fast_segs,
701 			      struct iovec *fast_pointer,
702 			      struct iovec **ret_pointer)
703 {
704 	unsigned long seg;
705 	ssize_t ret;
706 	struct iovec *iov = fast_pointer;
707 
708 	/*
709 	 * SuS says "The readv() function *may* fail if the iovcnt argument
710 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
711 	 * traditionally returned zero for zero segments, so...
712 	 */
713 	if (nr_segs == 0) {
714 		ret = 0;
715 		goto out;
716 	}
717 
718 	/*
719 	 * First get the "struct iovec" from user memory and
720 	 * verify all the pointers
721 	 */
722 	if (nr_segs > UIO_MAXIOV) {
723 		ret = -EINVAL;
724 		goto out;
725 	}
726 	if (nr_segs > fast_segs) {
727 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
728 		if (iov == NULL) {
729 			ret = -ENOMEM;
730 			goto out;
731 		}
732 	}
733 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
734 		ret = -EFAULT;
735 		goto out;
736 	}
737 
738 	/*
739 	 * According to the Single Unix Specification we should return EINVAL
740 	 * if an element length is < 0 when cast to ssize_t or if the
741 	 * total length would overflow the ssize_t return value of the
742 	 * system call.
743 	 *
744 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
745 	 * overflow case.
746 	 */
747 	ret = 0;
748 	for (seg = 0; seg < nr_segs; seg++) {
749 		void __user *buf = iov[seg].iov_base;
750 		ssize_t len = (ssize_t)iov[seg].iov_len;
751 
752 		/* see if we we're about to use an invalid len or if
753 		 * it's about to overflow ssize_t */
754 		if (len < 0) {
755 			ret = -EINVAL;
756 			goto out;
757 		}
758 		if (type >= 0
759 		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
760 			ret = -EFAULT;
761 			goto out;
762 		}
763 		if (len > MAX_RW_COUNT - ret) {
764 			len = MAX_RW_COUNT - ret;
765 			iov[seg].iov_len = len;
766 		}
767 		ret += len;
768 	}
769 out:
770 	*ret_pointer = iov;
771 	return ret;
772 }
773 
774 static ssize_t do_readv_writev(int type, struct file *file,
775 			       const struct iovec __user * uvector,
776 			       unsigned long nr_segs, loff_t *pos)
777 {
778 	size_t tot_len;
779 	struct iovec iovstack[UIO_FASTIOV];
780 	struct iovec *iov = iovstack;
781 	struct iov_iter iter;
782 	ssize_t ret;
783 	io_fn_t fn;
784 	iter_fn_t iter_fn;
785 
786 	ret = import_iovec(type, uvector, nr_segs,
787 			   ARRAY_SIZE(iovstack), &iov, &iter);
788 	if (ret < 0)
789 		return ret;
790 
791 	tot_len = iov_iter_count(&iter);
792 	if (!tot_len)
793 		goto out;
794 	ret = rw_verify_area(type, file, pos, tot_len);
795 	if (ret < 0)
796 		goto out;
797 
798 	if (type == READ) {
799 		fn = file->f_op->read;
800 		iter_fn = file->f_op->read_iter;
801 	} else {
802 		fn = (io_fn_t)file->f_op->write;
803 		iter_fn = file->f_op->write_iter;
804 		file_start_write(file);
805 	}
806 
807 	if (iter_fn)
808 		ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
809 	else
810 		ret = do_loop_readv_writev(file, &iter, pos, fn);
811 
812 	if (type != READ)
813 		file_end_write(file);
814 
815 out:
816 	kfree(iov);
817 	if ((ret + (type == READ)) > 0) {
818 		if (type == READ)
819 			fsnotify_access(file);
820 		else
821 			fsnotify_modify(file);
822 	}
823 	return ret;
824 }
825 
826 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
827 		  unsigned long vlen, loff_t *pos)
828 {
829 	if (!(file->f_mode & FMODE_READ))
830 		return -EBADF;
831 	if (!(file->f_mode & FMODE_CAN_READ))
832 		return -EINVAL;
833 
834 	return do_readv_writev(READ, file, vec, vlen, pos);
835 }
836 
837 EXPORT_SYMBOL(vfs_readv);
838 
839 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
840 		   unsigned long vlen, loff_t *pos)
841 {
842 	if (!(file->f_mode & FMODE_WRITE))
843 		return -EBADF;
844 	if (!(file->f_mode & FMODE_CAN_WRITE))
845 		return -EINVAL;
846 
847 	return do_readv_writev(WRITE, file, vec, vlen, pos);
848 }
849 
850 EXPORT_SYMBOL(vfs_writev);
851 
852 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
853 		unsigned long, vlen)
854 {
855 	struct fd f = fdget_pos(fd);
856 	ssize_t ret = -EBADF;
857 
858 	if (f.file) {
859 		loff_t pos = file_pos_read(f.file);
860 		ret = vfs_readv(f.file, vec, vlen, &pos);
861 		if (ret >= 0)
862 			file_pos_write(f.file, pos);
863 		fdput_pos(f);
864 	}
865 
866 	if (ret > 0)
867 		add_rchar(current, ret);
868 	inc_syscr(current);
869 	return ret;
870 }
871 
872 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
873 		unsigned long, vlen)
874 {
875 	struct fd f = fdget_pos(fd);
876 	ssize_t ret = -EBADF;
877 
878 	if (f.file) {
879 		loff_t pos = file_pos_read(f.file);
880 		ret = vfs_writev(f.file, vec, vlen, &pos);
881 		if (ret >= 0)
882 			file_pos_write(f.file, pos);
883 		fdput_pos(f);
884 	}
885 
886 	if (ret > 0)
887 		add_wchar(current, ret);
888 	inc_syscw(current);
889 	return ret;
890 }
891 
892 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
893 {
894 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
895 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
896 }
897 
898 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
899 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
900 {
901 	loff_t pos = pos_from_hilo(pos_h, pos_l);
902 	struct fd f;
903 	ssize_t ret = -EBADF;
904 
905 	if (pos < 0)
906 		return -EINVAL;
907 
908 	f = fdget(fd);
909 	if (f.file) {
910 		ret = -ESPIPE;
911 		if (f.file->f_mode & FMODE_PREAD)
912 			ret = vfs_readv(f.file, vec, vlen, &pos);
913 		fdput(f);
914 	}
915 
916 	if (ret > 0)
917 		add_rchar(current, ret);
918 	inc_syscr(current);
919 	return ret;
920 }
921 
922 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
923 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
924 {
925 	loff_t pos = pos_from_hilo(pos_h, pos_l);
926 	struct fd f;
927 	ssize_t ret = -EBADF;
928 
929 	if (pos < 0)
930 		return -EINVAL;
931 
932 	f = fdget(fd);
933 	if (f.file) {
934 		ret = -ESPIPE;
935 		if (f.file->f_mode & FMODE_PWRITE)
936 			ret = vfs_writev(f.file, vec, vlen, &pos);
937 		fdput(f);
938 	}
939 
940 	if (ret > 0)
941 		add_wchar(current, ret);
942 	inc_syscw(current);
943 	return ret;
944 }
945 
946 #ifdef CONFIG_COMPAT
947 
948 static ssize_t compat_do_readv_writev(int type, struct file *file,
949 			       const struct compat_iovec __user *uvector,
950 			       unsigned long nr_segs, loff_t *pos)
951 {
952 	compat_ssize_t tot_len;
953 	struct iovec iovstack[UIO_FASTIOV];
954 	struct iovec *iov = iovstack;
955 	struct iov_iter iter;
956 	ssize_t ret;
957 	io_fn_t fn;
958 	iter_fn_t iter_fn;
959 
960 	ret = compat_import_iovec(type, uvector, nr_segs,
961 				  UIO_FASTIOV, &iov, &iter);
962 	if (ret < 0)
963 		return ret;
964 
965 	tot_len = iov_iter_count(&iter);
966 	if (!tot_len)
967 		goto out;
968 	ret = rw_verify_area(type, file, pos, tot_len);
969 	if (ret < 0)
970 		goto out;
971 
972 	if (type == READ) {
973 		fn = file->f_op->read;
974 		iter_fn = file->f_op->read_iter;
975 	} else {
976 		fn = (io_fn_t)file->f_op->write;
977 		iter_fn = file->f_op->write_iter;
978 		file_start_write(file);
979 	}
980 
981 	if (iter_fn)
982 		ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
983 	else
984 		ret = do_loop_readv_writev(file, &iter, pos, fn);
985 
986 	if (type != READ)
987 		file_end_write(file);
988 
989 out:
990 	kfree(iov);
991 	if ((ret + (type == READ)) > 0) {
992 		if (type == READ)
993 			fsnotify_access(file);
994 		else
995 			fsnotify_modify(file);
996 	}
997 	return ret;
998 }
999 
1000 static size_t compat_readv(struct file *file,
1001 			   const struct compat_iovec __user *vec,
1002 			   unsigned long vlen, loff_t *pos)
1003 {
1004 	ssize_t ret = -EBADF;
1005 
1006 	if (!(file->f_mode & FMODE_READ))
1007 		goto out;
1008 
1009 	ret = -EINVAL;
1010 	if (!(file->f_mode & FMODE_CAN_READ))
1011 		goto out;
1012 
1013 	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
1014 
1015 out:
1016 	if (ret > 0)
1017 		add_rchar(current, ret);
1018 	inc_syscr(current);
1019 	return ret;
1020 }
1021 
1022 COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1023 		const struct compat_iovec __user *,vec,
1024 		compat_ulong_t, vlen)
1025 {
1026 	struct fd f = fdget_pos(fd);
1027 	ssize_t ret;
1028 	loff_t pos;
1029 
1030 	if (!f.file)
1031 		return -EBADF;
1032 	pos = f.file->f_pos;
1033 	ret = compat_readv(f.file, vec, vlen, &pos);
1034 	if (ret >= 0)
1035 		f.file->f_pos = pos;
1036 	fdput_pos(f);
1037 	return ret;
1038 }
1039 
1040 static long __compat_sys_preadv64(unsigned long fd,
1041 				  const struct compat_iovec __user *vec,
1042 				  unsigned long vlen, loff_t pos)
1043 {
1044 	struct fd f;
1045 	ssize_t ret;
1046 
1047 	if (pos < 0)
1048 		return -EINVAL;
1049 	f = fdget(fd);
1050 	if (!f.file)
1051 		return -EBADF;
1052 	ret = -ESPIPE;
1053 	if (f.file->f_mode & FMODE_PREAD)
1054 		ret = compat_readv(f.file, vec, vlen, &pos);
1055 	fdput(f);
1056 	return ret;
1057 }
1058 
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Compat preadv64: the offset arrives as one loff_t; forward it as is. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	long ret = __compat_sys_preadv64(fd, vec, vlen, pos);

	return ret;
}
#endif
1067 
1068 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1069 		const struct compat_iovec __user *,vec,
1070 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1071 {
1072 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1073 
1074 	return __compat_sys_preadv64(fd, vec, vlen, pos);
1075 }
1076 
1077 static size_t compat_writev(struct file *file,
1078 			    const struct compat_iovec __user *vec,
1079 			    unsigned long vlen, loff_t *pos)
1080 {
1081 	ssize_t ret = -EBADF;
1082 
1083 	if (!(file->f_mode & FMODE_WRITE))
1084 		goto out;
1085 
1086 	ret = -EINVAL;
1087 	if (!(file->f_mode & FMODE_CAN_WRITE))
1088 		goto out;
1089 
1090 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1091 
1092 out:
1093 	if (ret > 0)
1094 		add_wchar(current, ret);
1095 	inc_syscw(current);
1096 	return ret;
1097 }
1098 
1099 COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1100 		const struct compat_iovec __user *, vec,
1101 		compat_ulong_t, vlen)
1102 {
1103 	struct fd f = fdget_pos(fd);
1104 	ssize_t ret;
1105 	loff_t pos;
1106 
1107 	if (!f.file)
1108 		return -EBADF;
1109 	pos = f.file->f_pos;
1110 	ret = compat_writev(f.file, vec, vlen, &pos);
1111 	if (ret >= 0)
1112 		f.file->f_pos = pos;
1113 	fdput_pos(f);
1114 	return ret;
1115 }
1116 
1117 static long __compat_sys_pwritev64(unsigned long fd,
1118 				   const struct compat_iovec __user *vec,
1119 				   unsigned long vlen, loff_t pos)
1120 {
1121 	struct fd f;
1122 	ssize_t ret;
1123 
1124 	if (pos < 0)
1125 		return -EINVAL;
1126 	f = fdget(fd);
1127 	if (!f.file)
1128 		return -EBADF;
1129 	ret = -ESPIPE;
1130 	if (f.file->f_mode & FMODE_PWRITE)
1131 		ret = compat_writev(f.file, vec, vlen, &pos);
1132 	fdput(f);
1133 	return ret;
1134 }
1135 
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Compat pwritev64: the offset arrives as one loff_t; forward it as is. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	long ret = __compat_sys_pwritev64(fd, vec, vlen, pos);

	return ret;
}
#endif
1144 
1145 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1146 		const struct compat_iovec __user *,vec,
1147 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1148 {
1149 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1150 
1151 	return __compat_sys_pwritev64(fd, vec, vlen, pos);
1152 }
1153 #endif
1154 
1155 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1156 		  	   size_t count, loff_t max)
1157 {
1158 	struct fd in, out;
1159 	struct inode *in_inode, *out_inode;
1160 	loff_t pos;
1161 	loff_t out_pos;
1162 	ssize_t retval;
1163 	int fl;
1164 
1165 	/*
1166 	 * Get input file, and verify that it is ok..
1167 	 */
1168 	retval = -EBADF;
1169 	in = fdget(in_fd);
1170 	if (!in.file)
1171 		goto out;
1172 	if (!(in.file->f_mode & FMODE_READ))
1173 		goto fput_in;
1174 	retval = -ESPIPE;
1175 	if (!ppos) {
1176 		pos = in.file->f_pos;
1177 	} else {
1178 		pos = *ppos;
1179 		if (!(in.file->f_mode & FMODE_PREAD))
1180 			goto fput_in;
1181 	}
1182 	retval = rw_verify_area(READ, in.file, &pos, count);
1183 	if (retval < 0)
1184 		goto fput_in;
1185 	count = retval;
1186 
1187 	/*
1188 	 * Get output file, and verify that it is ok..
1189 	 */
1190 	retval = -EBADF;
1191 	out = fdget(out_fd);
1192 	if (!out.file)
1193 		goto fput_in;
1194 	if (!(out.file->f_mode & FMODE_WRITE))
1195 		goto fput_out;
1196 	retval = -EINVAL;
1197 	in_inode = file_inode(in.file);
1198 	out_inode = file_inode(out.file);
1199 	out_pos = out.file->f_pos;
1200 	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1201 	if (retval < 0)
1202 		goto fput_out;
1203 	count = retval;
1204 
1205 	if (!max)
1206 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1207 
1208 	if (unlikely(pos + count > max)) {
1209 		retval = -EOVERFLOW;
1210 		if (pos >= max)
1211 			goto fput_out;
1212 		count = max - pos;
1213 	}
1214 
1215 	fl = 0;
1216 #if 0
1217 	/*
1218 	 * We need to debate whether we can enable this or not. The
1219 	 * man page documents EAGAIN return for the output at least,
1220 	 * and the application is arguably buggy if it doesn't expect
1221 	 * EAGAIN on a non-blocking file descriptor.
1222 	 */
1223 	if (in.file->f_flags & O_NONBLOCK)
1224 		fl = SPLICE_F_NONBLOCK;
1225 #endif
1226 	file_start_write(out.file);
1227 	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1228 	file_end_write(out.file);
1229 
1230 	if (retval > 0) {
1231 		add_rchar(current, retval);
1232 		add_wchar(current, retval);
1233 		fsnotify_access(in.file);
1234 		fsnotify_modify(out.file);
1235 		out.file->f_pos = out_pos;
1236 		if (ppos)
1237 			*ppos = pos;
1238 		else
1239 			in.file->f_pos = pos;
1240 	}
1241 
1242 	inc_syscr(current);
1243 	inc_syscw(current);
1244 	if (pos > max)
1245 		retval = -EOVERFLOW;
1246 
1247 fput_out:
1248 	fdput(out);
1249 fput_in:
1250 	fdput(in);
1251 out:
1252 	return retval;
1253 }
1254 
1255 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1256 {
1257 	loff_t pos;
1258 	off_t off;
1259 	ssize_t ret;
1260 
1261 	if (offset) {
1262 		if (unlikely(get_user(off, offset)))
1263 			return -EFAULT;
1264 		pos = off;
1265 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1266 		if (unlikely(put_user(pos, offset)))
1267 			return -EFAULT;
1268 		return ret;
1269 	}
1270 
1271 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1272 }
1273 
1274 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1275 {
1276 	loff_t pos;
1277 	ssize_t ret;
1278 
1279 	if (offset) {
1280 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1281 			return -EFAULT;
1282 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1283 		if (unlikely(put_user(pos, offset)))
1284 			return -EFAULT;
1285 		return ret;
1286 	}
1287 
1288 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1289 }
1290 
1291 #ifdef CONFIG_COMPAT
1292 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1293 		compat_off_t __user *, offset, compat_size_t, count)
1294 {
1295 	loff_t pos;
1296 	off_t off;
1297 	ssize_t ret;
1298 
1299 	if (offset) {
1300 		if (unlikely(get_user(off, offset)))
1301 			return -EFAULT;
1302 		pos = off;
1303 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1304 		if (unlikely(put_user(pos, offset)))
1305 			return -EFAULT;
1306 		return ret;
1307 	}
1308 
1309 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1310 }
1311 
1312 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1313 		compat_loff_t __user *, offset, compat_size_t, count)
1314 {
1315 	loff_t pos;
1316 	ssize_t ret;
1317 
1318 	if (offset) {
1319 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1320 			return -EFAULT;
1321 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1322 		if (unlikely(put_user(pos, offset)))
1323 			return -EFAULT;
1324 		return ret;
1325 	}
1326 
1327 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1328 }
1329 #endif
1330