xref: /linux/fs/read_write.c (revision 704bf317fd21683e5c71a542f5fb5f65271a1582)
1 /*
2  *  linux/fs/read_write.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/fsnotify.h>
13 #include <linux/security.h>
14 #include <linux/module.h>
15 #include <linux/syscalls.h>
16 #include <linux/pagemap.h>
17 #include <linux/splice.h>
18 #include "read_write.h"
19 
20 #include <asm/uaccess.h>
21 #include <asm/unistd.h>
22 
/*
 * Read-only file operations table.  It fills in only the seek, synchronous
 * read, aio read, read-only mmap and splice-read methods; no write-side
 * method is set here.
 */
const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_readonly_mmap,
	.splice_read	= generic_file_splice_read,
};
30 
31 EXPORT_SYMBOL(generic_ro_fops);
32 
33 static int
34 __negative_fpos_check(struct file *file, loff_t pos, size_t count)
35 {
36 	/*
37 	 * pos or pos+count is negative here, check overflow.
38 	 * too big "count" will be caught in rw_verify_area().
39 	 */
40 	if ((pos < 0) && (pos + count < pos))
41 		return -EOVERFLOW;
42 	if (file->f_mode & FMODE_UNSIGNED_OFFSET)
43 		return 0;
44 	return -EINVAL;
45 }
46 
47 /**
48  * generic_file_llseek_unlocked - lockless generic llseek implementation
49  * @file:	file structure to seek on
50  * @offset:	file offset to seek to
51  * @origin:	type of seek
52  *
53  * Updates the file offset to the value specified by @offset and @origin.
54  * Locking must be provided by the caller.
55  */
56 loff_t
57 generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
58 {
59 	struct inode *inode = file->f_mapping->host;
60 
61 	switch (origin) {
62 	case SEEK_END:
63 		offset += inode->i_size;
64 		break;
65 	case SEEK_CUR:
66 		/*
67 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
68 		 * position-querying operation.  Avoid rewriting the "same"
69 		 * f_pos value back to the file because a concurrent read(),
70 		 * write() or lseek() might have altered it
71 		 */
72 		if (offset == 0)
73 			return file->f_pos;
74 		offset += file->f_pos;
75 		break;
76 	}
77 
78 	if (offset < 0 && __negative_fpos_check(file, offset, 0))
79 		return -EINVAL;
80 	if (offset > inode->i_sb->s_maxbytes)
81 		return -EINVAL;
82 
83 	/* Special lock needed here? */
84 	if (offset != file->f_pos) {
85 		file->f_pos = offset;
86 		file->f_version = 0;
87 	}
88 
89 	return offset;
90 }
91 EXPORT_SYMBOL(generic_file_llseek_unlocked);
92 
93 /**
94  * generic_file_llseek - generic llseek implementation for regular files
95  * @file:	file structure to seek on
96  * @offset:	file offset to seek to
97  * @origin:	type of seek
98  *
99  * This is a generic implemenation of ->llseek useable for all normal local
100  * filesystems.  It just updates the file offset to the value specified by
101  * @offset and @origin under i_mutex.
102  */
103 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
104 {
105 	loff_t rval;
106 
107 	mutex_lock(&file->f_dentry->d_inode->i_mutex);
108 	rval = generic_file_llseek_unlocked(file, offset, origin);
109 	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
110 
111 	return rval;
112 }
113 EXPORT_SYMBOL(generic_file_llseek);
114 
/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to (ignored)
 * @origin:	type of seek (ignored)
 *
 * This is an implementation of ->llseek usable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 *
 * Returns the file's current f_pos without modifying it.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int origin)
{
	return file->f_pos;
}
130 EXPORT_SYMBOL(noop_llseek);
131 
/**
 * no_llseek - llseek implementation that always refuses to seek
 * @file:	file structure to seek on (unused)
 * @offset:	file offset to seek to (unused)
 * @origin:	type of seek (unused)
 *
 * Always returns -ESPIPE.  vfs_llseek() uses it as the fallback when the
 * file lacks FMODE_LSEEK or has no ->llseek method.
 */
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
	return -ESPIPE;
}
136 EXPORT_SYMBOL(no_llseek);
137 
138 loff_t default_llseek(struct file *file, loff_t offset, int origin)
139 {
140 	loff_t retval;
141 
142 	mutex_lock(&file->f_dentry->d_inode->i_mutex);
143 	switch (origin) {
144 		case SEEK_END:
145 			offset += i_size_read(file->f_path.dentry->d_inode);
146 			break;
147 		case SEEK_CUR:
148 			if (offset == 0) {
149 				retval = file->f_pos;
150 				goto out;
151 			}
152 			offset += file->f_pos;
153 	}
154 	retval = -EINVAL;
155 	if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
156 		if (offset != file->f_pos) {
157 			file->f_pos = offset;
158 			file->f_version = 0;
159 		}
160 		retval = offset;
161 	}
162 out:
163 	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
164 	return retval;
165 }
166 EXPORT_SYMBOL(default_llseek);
167 
168 loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
169 {
170 	loff_t (*fn)(struct file *, loff_t, int);
171 
172 	fn = no_llseek;
173 	if (file->f_mode & FMODE_LSEEK) {
174 		if (file->f_op && file->f_op->llseek)
175 			fn = file->f_op->llseek;
176 	}
177 	return fn(file, offset, origin);
178 }
179 EXPORT_SYMBOL(vfs_llseek);
180 
181 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
182 {
183 	off_t retval;
184 	struct file * file;
185 	int fput_needed;
186 
187 	retval = -EBADF;
188 	file = fget_light(fd, &fput_needed);
189 	if (!file)
190 		goto bad;
191 
192 	retval = -EINVAL;
193 	if (origin <= SEEK_MAX) {
194 		loff_t res = vfs_llseek(file, offset, origin);
195 		retval = res;
196 		if (res != (loff_t)retval)
197 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
198 	}
199 	fput_light(file, fput_needed);
200 bad:
201 	return retval;
202 }
203 
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek(2): seek using a 64-bit offset passed as two unsigned longs and
 * store the resulting position through @result.
 *
 * Returns 0 on success, -EBADF for a bad @fd, -EINVAL if @origin is above
 * SEEK_MAX, -EFAULT if the result cannot be copied to @result, or the
 * negative value returned by vfs_llseek() truncated to int.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, origin)
{
	struct file *file;
	loff_t newpos;
	int fput_needed;
	int err;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (origin > SEEK_MAX) {
		err = -EINVAL;
		goto out_putf;
	}

	newpos = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
			origin);

	if (newpos < 0)
		err = (int)newpos;
	else if (copy_to_user(result, &newpos, sizeof(newpos)))
		err = -EFAULT;
	else
		err = 0;

out_putf:
	fput_light(file, fput_needed);
	return err;
}
#endif
238 
239 
240 /*
241  * rw_verify_area doesn't like huge counts. We limit
242  * them to something that fits in "int" so that others
243  * won't have to do range checks all the time.
244  */
245 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
246 {
247 	struct inode *inode;
248 	loff_t pos;
249 	int retval = -EINVAL;
250 
251 	inode = file->f_path.dentry->d_inode;
252 	if (unlikely((ssize_t) count < 0))
253 		return retval;
254 	pos = *ppos;
255 	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
256 		retval = __negative_fpos_check(file, pos, count);
257 		if (retval)
258 			return retval;
259 	}
260 
261 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
262 		retval = locks_mandatory_area(
263 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
264 			inode, file, pos, count);
265 		if (retval < 0)
266 			return retval;
267 	}
268 	retval = security_file_permission(file,
269 				read_write == READ ? MAY_READ : MAY_WRITE);
270 	if (retval)
271 		return retval;
272 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
273 }
274 
275 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
276 {
277 	set_current_state(TASK_UNINTERRUPTIBLE);
278 	if (!kiocbIsKicked(iocb))
279 		schedule();
280 	else
281 		kiocbClearKicked(iocb);
282 	__set_current_state(TASK_RUNNING);
283 }
284 
285 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
286 {
287 	struct iovec iov = { .iov_base = buf, .iov_len = len };
288 	struct kiocb kiocb;
289 	ssize_t ret;
290 
291 	init_sync_kiocb(&kiocb, filp);
292 	kiocb.ki_pos = *ppos;
293 	kiocb.ki_left = len;
294 	kiocb.ki_nbytes = len;
295 
296 	for (;;) {
297 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
298 		if (ret != -EIOCBRETRY)
299 			break;
300 		wait_on_retry_sync_kiocb(&kiocb);
301 	}
302 
303 	if (-EIOCBQUEUED == ret)
304 		ret = wait_on_sync_kiocb(&kiocb);
305 	*ppos = kiocb.ki_pos;
306 	return ret;
307 }
308 
309 EXPORT_SYMBOL(do_sync_read);
310 
311 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
312 {
313 	ssize_t ret;
314 
315 	if (!(file->f_mode & FMODE_READ))
316 		return -EBADF;
317 	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
318 		return -EINVAL;
319 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
320 		return -EFAULT;
321 
322 	ret = rw_verify_area(READ, file, pos, count);
323 	if (ret >= 0) {
324 		count = ret;
325 		if (file->f_op->read)
326 			ret = file->f_op->read(file, buf, count, pos);
327 		else
328 			ret = do_sync_read(file, buf, count, pos);
329 		if (ret > 0) {
330 			fsnotify_access(file);
331 			add_rchar(current, ret);
332 		}
333 		inc_syscr(current);
334 	}
335 
336 	return ret;
337 }
338 
339 EXPORT_SYMBOL(vfs_read);
340 
341 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
342 {
343 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
344 	struct kiocb kiocb;
345 	ssize_t ret;
346 
347 	init_sync_kiocb(&kiocb, filp);
348 	kiocb.ki_pos = *ppos;
349 	kiocb.ki_left = len;
350 	kiocb.ki_nbytes = len;
351 
352 	for (;;) {
353 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
354 		if (ret != -EIOCBRETRY)
355 			break;
356 		wait_on_retry_sync_kiocb(&kiocb);
357 	}
358 
359 	if (-EIOCBQUEUED == ret)
360 		ret = wait_on_sync_kiocb(&kiocb);
361 	*ppos = kiocb.ki_pos;
362 	return ret;
363 }
364 
365 EXPORT_SYMBOL(do_sync_write);
366 
367 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
368 {
369 	ssize_t ret;
370 
371 	if (!(file->f_mode & FMODE_WRITE))
372 		return -EBADF;
373 	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
374 		return -EINVAL;
375 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
376 		return -EFAULT;
377 
378 	ret = rw_verify_area(WRITE, file, pos, count);
379 	if (ret >= 0) {
380 		count = ret;
381 		if (file->f_op->write)
382 			ret = file->f_op->write(file, buf, count, pos);
383 		else
384 			ret = do_sync_write(file, buf, count, pos);
385 		if (ret > 0) {
386 			fsnotify_modify(file);
387 			add_wchar(current, ret);
388 		}
389 		inc_syscw(current);
390 	}
391 
392 	return ret;
393 }
394 
395 EXPORT_SYMBOL(vfs_write);
396 
/* Return the current position (f_pos) stored in @file. */
static inline loff_t file_pos_read(struct file *file)
{
	return file->f_pos;
}
401 
/* Store @pos as the current position (f_pos) of @file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
}
406 
407 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
408 {
409 	struct file *file;
410 	ssize_t ret = -EBADF;
411 	int fput_needed;
412 
413 	file = fget_light(fd, &fput_needed);
414 	if (file) {
415 		loff_t pos = file_pos_read(file);
416 		ret = vfs_read(file, buf, count, &pos);
417 		file_pos_write(file, pos);
418 		fput_light(file, fput_needed);
419 	}
420 
421 	return ret;
422 }
423 
424 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
425 		size_t, count)
426 {
427 	struct file *file;
428 	ssize_t ret = -EBADF;
429 	int fput_needed;
430 
431 	file = fget_light(fd, &fput_needed);
432 	if (file) {
433 		loff_t pos = file_pos_read(file);
434 		ret = vfs_write(file, buf, count, &pos);
435 		file_pos_write(file, pos);
436 		fput_light(file, fput_needed);
437 	}
438 
439 	return ret;
440 }
441 
442 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
443 			size_t count, loff_t pos)
444 {
445 	struct file *file;
446 	ssize_t ret = -EBADF;
447 	int fput_needed;
448 
449 	if (pos < 0)
450 		return -EINVAL;
451 
452 	file = fget_light(fd, &fput_needed);
453 	if (file) {
454 		ret = -ESPIPE;
455 		if (file->f_mode & FMODE_PREAD)
456 			ret = vfs_read(file, buf, count, &pos);
457 		fput_light(file, fput_needed);
458 	}
459 
460 	return ret;
461 }
462 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
463 asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
464 {
465 	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
466 			    (size_t) count, pos);
467 }
468 SYSCALL_ALIAS(sys_pread64, SyS_pread64);
469 #endif
470 
471 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
472 			 size_t count, loff_t pos)
473 {
474 	struct file *file;
475 	ssize_t ret = -EBADF;
476 	int fput_needed;
477 
478 	if (pos < 0)
479 		return -EINVAL;
480 
481 	file = fget_light(fd, &fput_needed);
482 	if (file) {
483 		ret = -ESPIPE;
484 		if (file->f_mode & FMODE_PWRITE)
485 			ret = vfs_write(file, buf, count, &pos);
486 		fput_light(file, fput_needed);
487 	}
488 
489 	return ret;
490 }
491 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
492 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
493 {
494 	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
495 			     (size_t) count, pos);
496 }
497 SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
498 #endif
499 
/*
 * Reduce an iovec's length in-place.  Return the resulting number of segments
 *
 * Walks @iov until the accumulated length reaches @to bytes.  The segment in
 * which that happens is truncated so the running total is exactly @to, and
 * the number of segments consumed so far (including the truncated one) is
 * returned.  If all @nr_segs segments together are shorter than @to, nothing
 * is modified and @nr_segs is returned.  With @nr_segs == 0 the result is 0.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long seg = 0;
	size_t len = 0;

	while (seg < nr_segs) {
		seg++;
		/*
		 * len never exceeds "to" here, so "to - len" cannot
		 * underflow.  Comparing against the remaining room instead of
		 * forming "len + iov_len" keeps the test correct even when
		 * that sum would wrap around size_t.
		 */
		if (iov->iov_len >= to - len) {
			iov->iov_len = to - len;
			break;
		}
		len += iov->iov_len;
		iov++;
	}
	return seg;
}
519 EXPORT_SYMBOL(iov_shorten);
520 
521 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
522 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
523 {
524 	struct kiocb kiocb;
525 	ssize_t ret;
526 
527 	init_sync_kiocb(&kiocb, filp);
528 	kiocb.ki_pos = *ppos;
529 	kiocb.ki_left = len;
530 	kiocb.ki_nbytes = len;
531 
532 	for (;;) {
533 		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
534 		if (ret != -EIOCBRETRY)
535 			break;
536 		wait_on_retry_sync_kiocb(&kiocb);
537 	}
538 
539 	if (ret == -EIOCBQUEUED)
540 		ret = wait_on_sync_kiocb(&kiocb);
541 	*ppos = kiocb.ki_pos;
542 	return ret;
543 }
544 
545 /* Do it by hand, with file-ops */
546 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
547 		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
548 {
549 	struct iovec *vector = iov;
550 	ssize_t ret = 0;
551 
552 	while (nr_segs > 0) {
553 		void __user *base;
554 		size_t len;
555 		ssize_t nr;
556 
557 		base = vector->iov_base;
558 		len = vector->iov_len;
559 		vector++;
560 		nr_segs--;
561 
562 		nr = fn(filp, base, len, ppos);
563 
564 		if (nr < 0) {
565 			if (!ret)
566 				ret = nr;
567 			break;
568 		}
569 		ret += nr;
570 		if (nr != len)
571 			break;
572 	}
573 
574 	return ret;
575 }
576 
577 /* A write operation does a read from user space and vice versa */
578 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
579 
580 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
581 			      unsigned long nr_segs, unsigned long fast_segs,
582 			      struct iovec *fast_pointer,
583 			      struct iovec **ret_pointer)
584 {
585 	unsigned long seg;
586 	ssize_t ret;
587 	struct iovec *iov = fast_pointer;
588 
589 	/*
590 	 * SuS says "The readv() function *may* fail if the iovcnt argument
591 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
592 	 * traditionally returned zero for zero segments, so...
593 	 */
594 	if (nr_segs == 0) {
595 		ret = 0;
596 		goto out;
597 	}
598 
599 	/*
600 	 * First get the "struct iovec" from user memory and
601 	 * verify all the pointers
602 	 */
603 	if (nr_segs > UIO_MAXIOV) {
604 		ret = -EINVAL;
605 		goto out;
606 	}
607 	if (nr_segs > fast_segs) {
608 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
609 		if (iov == NULL) {
610 			ret = -ENOMEM;
611 			goto out;
612 		}
613 	}
614 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
615 		ret = -EFAULT;
616 		goto out;
617 	}
618 
619 	/*
620 	 * According to the Single Unix Specification we should return EINVAL
621 	 * if an element length is < 0 when cast to ssize_t or if the
622 	 * total length would overflow the ssize_t return value of the
623 	 * system call.
624 	 *
625 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
626 	 * overflow case.
627 	 */
628 	ret = 0;
629 	for (seg = 0; seg < nr_segs; seg++) {
630 		void __user *buf = iov[seg].iov_base;
631 		ssize_t len = (ssize_t)iov[seg].iov_len;
632 
633 		/* see if we we're about to use an invalid len or if
634 		 * it's about to overflow ssize_t */
635 		if (len < 0) {
636 			ret = -EINVAL;
637 			goto out;
638 		}
639 		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
640 			ret = -EFAULT;
641 			goto out;
642 		}
643 		if (len > MAX_RW_COUNT - ret) {
644 			len = MAX_RW_COUNT - ret;
645 			iov[seg].iov_len = len;
646 		}
647 		ret += len;
648 	}
649 out:
650 	*ret_pointer = iov;
651 	return ret;
652 }
653 
654 static ssize_t do_readv_writev(int type, struct file *file,
655 			       const struct iovec __user * uvector,
656 			       unsigned long nr_segs, loff_t *pos)
657 {
658 	size_t tot_len;
659 	struct iovec iovstack[UIO_FASTIOV];
660 	struct iovec *iov = iovstack;
661 	ssize_t ret;
662 	io_fn_t fn;
663 	iov_fn_t fnv;
664 
665 	if (!file->f_op) {
666 		ret = -EINVAL;
667 		goto out;
668 	}
669 
670 	ret = rw_copy_check_uvector(type, uvector, nr_segs,
671 			ARRAY_SIZE(iovstack), iovstack, &iov);
672 	if (ret <= 0)
673 		goto out;
674 
675 	tot_len = ret;
676 	ret = rw_verify_area(type, file, pos, tot_len);
677 	if (ret < 0)
678 		goto out;
679 
680 	fnv = NULL;
681 	if (type == READ) {
682 		fn = file->f_op->read;
683 		fnv = file->f_op->aio_read;
684 	} else {
685 		fn = (io_fn_t)file->f_op->write;
686 		fnv = file->f_op->aio_write;
687 	}
688 
689 	if (fnv)
690 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
691 						pos, fnv);
692 	else
693 		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
694 
695 out:
696 	if (iov != iovstack)
697 		kfree(iov);
698 	if ((ret + (type == READ)) > 0) {
699 		if (type == READ)
700 			fsnotify_access(file);
701 		else
702 			fsnotify_modify(file);
703 	}
704 	return ret;
705 }
706 
707 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
708 		  unsigned long vlen, loff_t *pos)
709 {
710 	if (!(file->f_mode & FMODE_READ))
711 		return -EBADF;
712 	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
713 		return -EINVAL;
714 
715 	return do_readv_writev(READ, file, vec, vlen, pos);
716 }
717 
718 EXPORT_SYMBOL(vfs_readv);
719 
720 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
721 		   unsigned long vlen, loff_t *pos)
722 {
723 	if (!(file->f_mode & FMODE_WRITE))
724 		return -EBADF;
725 	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
726 		return -EINVAL;
727 
728 	return do_readv_writev(WRITE, file, vec, vlen, pos);
729 }
730 
731 EXPORT_SYMBOL(vfs_writev);
732 
733 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
734 		unsigned long, vlen)
735 {
736 	struct file *file;
737 	ssize_t ret = -EBADF;
738 	int fput_needed;
739 
740 	file = fget_light(fd, &fput_needed);
741 	if (file) {
742 		loff_t pos = file_pos_read(file);
743 		ret = vfs_readv(file, vec, vlen, &pos);
744 		file_pos_write(file, pos);
745 		fput_light(file, fput_needed);
746 	}
747 
748 	if (ret > 0)
749 		add_rchar(current, ret);
750 	inc_syscr(current);
751 	return ret;
752 }
753 
754 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
755 		unsigned long, vlen)
756 {
757 	struct file *file;
758 	ssize_t ret = -EBADF;
759 	int fput_needed;
760 
761 	file = fget_light(fd, &fput_needed);
762 	if (file) {
763 		loff_t pos = file_pos_read(file);
764 		ret = vfs_writev(file, vec, vlen, &pos);
765 		file_pos_write(file, pos);
766 		fput_light(file, fput_needed);
767 	}
768 
769 	if (ret > 0)
770 		add_wchar(current, ret);
771 	inc_syscw(current);
772 	return ret;
773 }
774 
/*
 * Combine the two unsigned long halves passed to preadv()/pwritev() into one
 * loff_t: @high is shifted up by BITS_PER_LONG bits and OR-ed with @low.
 *
 * The shift is done as two half-width shifts rather than one full-width
 * shift.  NOTE(review): presumably this avoids a shift count equal to the
 * width of loff_t on builds where long is as wide as loff_t - confirm.
 */
static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}
780 
781 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
782 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
783 {
784 	loff_t pos = pos_from_hilo(pos_h, pos_l);
785 	struct file *file;
786 	ssize_t ret = -EBADF;
787 	int fput_needed;
788 
789 	if (pos < 0)
790 		return -EINVAL;
791 
792 	file = fget_light(fd, &fput_needed);
793 	if (file) {
794 		ret = -ESPIPE;
795 		if (file->f_mode & FMODE_PREAD)
796 			ret = vfs_readv(file, vec, vlen, &pos);
797 		fput_light(file, fput_needed);
798 	}
799 
800 	if (ret > 0)
801 		add_rchar(current, ret);
802 	inc_syscr(current);
803 	return ret;
804 }
805 
806 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
807 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
808 {
809 	loff_t pos = pos_from_hilo(pos_h, pos_l);
810 	struct file *file;
811 	ssize_t ret = -EBADF;
812 	int fput_needed;
813 
814 	if (pos < 0)
815 		return -EINVAL;
816 
817 	file = fget_light(fd, &fput_needed);
818 	if (file) {
819 		ret = -ESPIPE;
820 		if (file->f_mode & FMODE_PWRITE)
821 			ret = vfs_writev(file, vec, vlen, &pos);
822 		fput_light(file, fput_needed);
823 	}
824 
825 	if (ret > 0)
826 		add_wchar(current, ret);
827 	inc_syscw(current);
828 	return ret;
829 }
830 
/*
 * do_sendfile - common implementation of sendfile() and sendfile64().
 * @out_fd:	descriptor to write to; must be open for writing
 * @in_fd:	descriptor to read from; must be open for reading
 * @ppos:	input offset to use, or NULL to use the input file's own f_pos
 * @count:	number of bytes requested
 * @max:	upper limit for the input offset; 0 selects the smaller of the
 *		two superblocks' s_maxbytes
 *
 * Both files are checked with rw_verify_area() (the output at its current
 * f_pos), the count is clamped so that the input offset does not pass @max,
 * and the transfer itself is handed to do_splice_direct().
 *
 * Returns the result of do_splice_direct(), or a negative errno: -EBADF for a
 * bad descriptor or wrong open mode, -ESPIPE if @ppos is given but the input
 * file lacks FMODE_PREAD, an error from rw_verify_area(), or -EOVERFLOW if
 * the input offset is already at or beyond @max or ends up above it.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;
	loff_t pos;
	ssize_t retval;
	int fput_needed_in, fput_needed_out, fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget_light(in_fd, &fput_needed_in);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	/* no explicit offset: operate on the input file's own position */
	if (!ppos)
		ppos = &in_file->f_pos;
	else
		if (!(in_file->f_mode & FMODE_PREAD))
			goto fput_in;
	retval = rw_verify_area(READ, in_file, ppos, count);
	if (retval < 0)
		goto fput_in;
	/* rw_verify_area() may have reduced the count */
	count = retval;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget_light(out_fd, &fput_needed_out);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	/*
	 * NOTE(review): this -EINVAL is never returned; retval is overwritten
	 * by the rw_verify_area() call just below.
	 */
	retval = -EINVAL;
	in_inode = in_file->f_path.dentry->d_inode;
	out_inode = out_file->f_path.dentry->d_inode;
	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
	if (retval < 0)
		goto fput_out;
	count = retval;

	/* max == 0 means "use the filesystems' own limit" */
	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	pos = *ppos;
	if (unlikely(pos + count > max)) {
		/* already at/past the limit: fail; otherwise shorten the request */
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in_file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	retval = do_splice_direct(in_file, ppos, out_file, count, fl);

	/* a successful transfer counts as both bytes read and bytes written */
	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
	}

	inc_syscr(current);
	inc_syscw(current);
	/* the byte count is replaced by an error if the offset ended above max */
	if (*ppos > max)
		retval = -EOVERFLOW;

fput_out:
	fput_light(out_file, fput_needed_out);
fput_in:
	fput_light(in_file, fput_needed_in);
out:
	return retval;
}
918 
919 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
920 {
921 	loff_t pos;
922 	off_t off;
923 	ssize_t ret;
924 
925 	if (offset) {
926 		if (unlikely(get_user(off, offset)))
927 			return -EFAULT;
928 		pos = off;
929 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
930 		if (unlikely(put_user(pos, offset)))
931 			return -EFAULT;
932 		return ret;
933 	}
934 
935 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
936 }
937 
938 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
939 {
940 	loff_t pos;
941 	ssize_t ret;
942 
943 	if (offset) {
944 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
945 			return -EFAULT;
946 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
947 		if (unlikely(put_user(pos, offset)))
948 			return -EFAULT;
949 		return ret;
950 	}
951 
952 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
953 }
954