xref: /linux/fs/read_write.c (revision 55f1b540d893da740a81200450014c45a8103f54)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/fs/read_write.c
4  *
5  *  Copyright (C) 1991, 1992  Linus Torvalds
6  */
7 
8 #include <linux/slab.h>
9 #include <linux/stat.h>
10 #include <linux/sched/xacct.h>
11 #include <linux/fcntl.h>
12 #include <linux/file.h>
13 #include <linux/uio.h>
14 #include <linux/fsnotify.h>
15 #include <linux/security.h>
16 #include <linux/export.h>
17 #include <linux/syscalls.h>
18 #include <linux/pagemap.h>
19 #include <linux/splice.h>
20 #include <linux/compat.h>
21 #include <linux/mount.h>
22 #include <linux/fs.h>
23 #include "internal.h"
24 
25 #include <linux/uaccess.h>
26 #include <asm/unistd.h>
27 
28 const struct file_operations generic_ro_fops = {
29 	.llseek		= generic_file_llseek,
30 	.read_iter	= generic_file_read_iter,
31 	.mmap		= generic_file_readonly_mmap,
32 	.splice_read	= filemap_splice_read,
33 };
34 
35 EXPORT_SYMBOL(generic_ro_fops);
36 
37 static inline bool unsigned_offsets(struct file *file)
38 {
39 	return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
40 }
41 
42 /**
43  * vfs_setpos_cookie - update the file offset for lseek and reset cookie
44  * @file:	file structure in question
45  * @offset:	file offset to seek to
46  * @maxsize:	maximum file size
47  * @cookie:	cookie to reset
48  *
49  * Update the file offset to the value specified by @offset if the given
50  * offset is valid and it is not equal to the current file offset and
51  * reset the specified cookie to indicate that a seek happened.
52  *
53  * Return the specified offset on success and -EINVAL on invalid offset.
54  */
55 static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
56 				loff_t maxsize, u64 *cookie)
57 {
58 	if (offset < 0 && !unsigned_offsets(file))
59 		return -EINVAL;
60 	if (offset > maxsize)
61 		return -EINVAL;
62 
63 	if (offset != file->f_pos) {
64 		file->f_pos = offset;
65 		if (cookie)
66 			*cookie = 0;
67 	}
68 	return offset;
69 }
70 
71 /**
72  * vfs_setpos - update the file offset for lseek
73  * @file:	file structure in question
74  * @offset:	file offset to seek to
75  * @maxsize:	maximum file size
76  *
77  * This is a low-level filesystem helper for updating the file offset to
78  * the value specified by @offset if the given offset is valid and it is
79  * not equal to the current file offset.
80  *
81  * Return the specified offset on success and -EINVAL on invalid offset.
82  */
83 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
84 {
85 	return vfs_setpos_cookie(file, offset, maxsize, NULL);
86 }
87 EXPORT_SYMBOL(vfs_setpos);
88 
89 /**
90  * must_set_pos - check whether f_pos has to be updated
91  * @file: file to seek on
92  * @offset: offset to use
93  * @whence: type of seek operation
94  * @eof: end of file
95  *
96  * Check whether f_pos needs to be updated and update @offset according
97  * to @whence.
98  *
99  * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
100  * updated, and negative error code on failure.
101  */
102 static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
103 {
104 	switch (whence) {
105 	case SEEK_END:
106 		*offset += eof;
107 		break;
108 	case SEEK_CUR:
109 		/*
110 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
111 		 * position-querying operation.  Avoid rewriting the "same"
112 		 * f_pos value back to the file because a concurrent read(),
113 		 * write() or lseek() might have altered it
114 		 */
115 		if (*offset == 0) {
116 			*offset = file->f_pos;
117 			return 0;
118 		}
119 		break;
120 	case SEEK_DATA:
121 		/*
122 		 * In the generic case the entire file is data, so as long as
123 		 * offset isn't at the end of the file then the offset is data.
124 		 */
125 		if ((unsigned long long)*offset >= eof)
126 			return -ENXIO;
127 		break;
128 	case SEEK_HOLE:
129 		/*
130 		 * There is a virtual hole at the end of the file, so as long as
131 		 * offset isn't i_size or larger, return i_size.
132 		 */
133 		if ((unsigned long long)*offset >= eof)
134 			return -ENXIO;
135 		*offset = eof;
136 		break;
137 	}
138 
139 	return 1;
140 }
141 
142 /**
143  * generic_file_llseek_size - generic llseek implementation for regular files
144  * @file:	file structure to seek on
145  * @offset:	file offset to seek to
146  * @whence:	type of seek
147  * @maxsize:	max size of this file in file system
148  * @eof:	offset used for SEEK_END position
149  *
150  * This is a variant of generic_file_llseek that allows passing in a custom
151  * maximum file size and a custom EOF position, for e.g. hashed directories
152  *
153  * Synchronization:
154  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
155  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
156  * read/writes behave like SEEK_SET against seeks.
157  */
158 loff_t
159 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
160 		loff_t maxsize, loff_t eof)
161 {
162 	int ret;
163 
164 	ret = must_set_pos(file, &offset, whence, eof);
165 	if (ret < 0)
166 		return ret;
167 	if (ret == 0)
168 		return offset;
169 
170 	if (whence == SEEK_CUR) {
171 		/*
172 		 * f_lock protects against read/modify/write race with
173 		 * other SEEK_CURs. Note that parallel writes and reads
174 		 * behave like SEEK_SET.
175 		 */
176 		guard(spinlock)(&file->f_lock);
177 		return vfs_setpos(file, file->f_pos + offset, maxsize);
178 	}
179 
180 	return vfs_setpos(file, offset, maxsize);
181 }
182 EXPORT_SYMBOL(generic_file_llseek_size);
183 
184 /**
185  * generic_llseek_cookie - versioned llseek implementation
186  * @file:	file structure to seek on
187  * @offset:	file offset to seek to
188  * @whence:	type of seek
189  * @cookie:	cookie to update
190  *
191  * See generic_file_llseek for a general description and locking assumptions.
192  *
193  * In contrast to generic_file_llseek, this function also resets a
194  * specified cookie to indicate a seek took place.
195  */
196 loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
197 			     u64 *cookie)
198 {
199 	struct inode *inode = file->f_mapping->host;
200 	loff_t maxsize = inode->i_sb->s_maxbytes;
201 	loff_t eof = i_size_read(inode);
202 	int ret;
203 
204 	if (WARN_ON_ONCE(!cookie))
205 		return -EINVAL;
206 
207 	/*
208 	 * Require that this is only used for directories that guarantee
209 	 * synchronization between readdir and seek so that an update to
210 	 * @cookie is correctly synchronized with concurrent readdir.
211 	 */
212 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
213 		return -EINVAL;
214 
215 	ret = must_set_pos(file, &offset, whence, eof);
216 	if (ret < 0)
217 		return ret;
218 	if (ret == 0)
219 		return offset;
220 
221 	/* No need to hold f_lock because we know that f_pos_lock is held. */
222 	if (whence == SEEK_CUR)
223 		return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);
224 
225 	return vfs_setpos_cookie(file, offset, maxsize, cookie);
226 }
227 EXPORT_SYMBOL(generic_llseek_cookie);
228 
229 /**
230  * generic_file_llseek - generic llseek implementation for regular files
231  * @file:	file structure to seek on
232  * @offset:	file offset to seek to
233  * @whence:	type of seek
234  *
235  * This is a generic implemenation of ->llseek useable for all normal local
236  * filesystems.  It just updates the file offset to the value specified by
237  * @offset and @whence.
238  */
239 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
240 {
241 	struct inode *inode = file->f_mapping->host;
242 
243 	return generic_file_llseek_size(file, offset, whence,
244 					inode->i_sb->s_maxbytes,
245 					i_size_read(inode));
246 }
247 EXPORT_SYMBOL(generic_file_llseek);
248 
249 /**
250  * fixed_size_llseek - llseek implementation for fixed-sized devices
251  * @file:	file structure to seek on
252  * @offset:	file offset to seek to
253  * @whence:	type of seek
254  * @size:	size of the file
255  *
256  */
257 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
258 {
259 	switch (whence) {
260 	case SEEK_SET: case SEEK_CUR: case SEEK_END:
261 		return generic_file_llseek_size(file, offset, whence,
262 						size, size);
263 	default:
264 		return -EINVAL;
265 	}
266 }
267 EXPORT_SYMBOL(fixed_size_llseek);
268 
269 /**
270  * no_seek_end_llseek - llseek implementation for fixed-sized devices
271  * @file:	file structure to seek on
272  * @offset:	file offset to seek to
273  * @whence:	type of seek
274  *
275  */
276 loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
277 {
278 	switch (whence) {
279 	case SEEK_SET: case SEEK_CUR:
280 		return generic_file_llseek_size(file, offset, whence,
281 						OFFSET_MAX, 0);
282 	default:
283 		return -EINVAL;
284 	}
285 }
286 EXPORT_SYMBOL(no_seek_end_llseek);
287 
288 /**
289  * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
290  * @file:	file structure to seek on
291  * @offset:	file offset to seek to
292  * @whence:	type of seek
293  * @size:	maximal offset allowed
294  *
295  */
296 loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
297 {
298 	switch (whence) {
299 	case SEEK_SET: case SEEK_CUR:
300 		return generic_file_llseek_size(file, offset, whence,
301 						size, 0);
302 	default:
303 		return -EINVAL;
304 	}
305 }
306 EXPORT_SYMBOL(no_seek_end_llseek_size);
307 
308 /**
309  * noop_llseek - No Operation Performed llseek implementation
310  * @file:	file structure to seek on
311  * @offset:	file offset to seek to
312  * @whence:	type of seek
313  *
314  * This is an implementation of ->llseek useable for the rare special case when
315  * userspace expects the seek to succeed but the (device) file is actually not
316  * able to perform the seek. In this case you use noop_llseek() instead of
317  * falling back to the default implementation of ->llseek.
318  */
319 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
320 {
321 	return file->f_pos;
322 }
323 EXPORT_SYMBOL(noop_llseek);
324 
325 loff_t default_llseek(struct file *file, loff_t offset, int whence)
326 {
327 	struct inode *inode = file_inode(file);
328 	loff_t retval;
329 
330 	inode_lock(inode);
331 	switch (whence) {
332 		case SEEK_END:
333 			offset += i_size_read(inode);
334 			break;
335 		case SEEK_CUR:
336 			if (offset == 0) {
337 				retval = file->f_pos;
338 				goto out;
339 			}
340 			offset += file->f_pos;
341 			break;
342 		case SEEK_DATA:
343 			/*
344 			 * In the generic case the entire file is data, so as
345 			 * long as offset isn't at the end of the file then the
346 			 * offset is data.
347 			 */
348 			if (offset >= inode->i_size) {
349 				retval = -ENXIO;
350 				goto out;
351 			}
352 			break;
353 		case SEEK_HOLE:
354 			/*
355 			 * There is a virtual hole at the end of the file, so
356 			 * as long as offset isn't i_size or larger, return
357 			 * i_size.
358 			 */
359 			if (offset >= inode->i_size) {
360 				retval = -ENXIO;
361 				goto out;
362 			}
363 			offset = inode->i_size;
364 			break;
365 	}
366 	retval = -EINVAL;
367 	if (offset >= 0 || unsigned_offsets(file)) {
368 		if (offset != file->f_pos)
369 			file->f_pos = offset;
370 		retval = offset;
371 	}
372 out:
373 	inode_unlock(inode);
374 	return retval;
375 }
376 EXPORT_SYMBOL(default_llseek);
377 
378 loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
379 {
380 	if (!(file->f_mode & FMODE_LSEEK))
381 		return -ESPIPE;
382 	return file->f_op->llseek(file, offset, whence);
383 }
384 EXPORT_SYMBOL(vfs_llseek);
385 
386 static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
387 {
388 	off_t retval;
389 	struct fd f = fdget_pos(fd);
390 	if (!fd_file(f))
391 		return -EBADF;
392 
393 	retval = -EINVAL;
394 	if (whence <= SEEK_MAX) {
395 		loff_t res = vfs_llseek(fd_file(f), offset, whence);
396 		retval = res;
397 		if (res != (loff_t)retval)
398 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
399 	}
400 	fdput_pos(f);
401 	return retval;
402 }
403 
404 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
405 {
406 	return ksys_lseek(fd, offset, whence);
407 }
408 
#ifdef CONFIG_COMPAT
/* Compat lseek: takes a compat_off_t offset and forwards to ksys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	off_t res = ksys_lseek(fd, offset, whence);

	return res;
}
#endif
415 
416 #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
417 	defined(__ARCH_WANT_SYS_LLSEEK)
418 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
419 		unsigned long, offset_low, loff_t __user *, result,
420 		unsigned int, whence)
421 {
422 	int retval;
423 	struct fd f = fdget_pos(fd);
424 	loff_t offset;
425 
426 	if (!fd_file(f))
427 		return -EBADF;
428 
429 	retval = -EINVAL;
430 	if (whence > SEEK_MAX)
431 		goto out_putf;
432 
433 	offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
434 			whence);
435 
436 	retval = (int)offset;
437 	if (offset >= 0) {
438 		retval = -EFAULT;
439 		if (!copy_to_user(result, &offset, sizeof(offset)))
440 			retval = 0;
441 	}
442 out_putf:
443 	fdput_pos(f);
444 	return retval;
445 }
446 #endif
447 
448 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
449 {
450 	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
451 	int ret;
452 
453 	if (unlikely((ssize_t) count < 0))
454 		return -EINVAL;
455 
456 	if (ppos) {
457 		loff_t pos = *ppos;
458 
459 		if (unlikely(pos < 0)) {
460 			if (!unsigned_offsets(file))
461 				return -EINVAL;
462 			if (count >= -pos) /* both values are in 0..LLONG_MAX */
463 				return -EOVERFLOW;
464 		} else if (unlikely((loff_t) (pos + count) < 0)) {
465 			if (!unsigned_offsets(file))
466 				return -EINVAL;
467 		}
468 	}
469 
470 	ret = security_file_permission(file, mask);
471 	if (ret)
472 		return ret;
473 
474 	return fsnotify_file_area_perm(file, mask, ppos, count);
475 }
476 EXPORT_SYMBOL(rw_verify_area);
477 
478 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
479 {
480 	struct kiocb kiocb;
481 	struct iov_iter iter;
482 	ssize_t ret;
483 
484 	init_sync_kiocb(&kiocb, filp);
485 	kiocb.ki_pos = (ppos ? *ppos : 0);
486 	iov_iter_ubuf(&iter, ITER_DEST, buf, len);
487 
488 	ret = filp->f_op->read_iter(&kiocb, &iter);
489 	BUG_ON(ret == -EIOCBQUEUED);
490 	if (ppos)
491 		*ppos = kiocb.ki_pos;
492 	return ret;
493 }
494 
495 static int warn_unsupported(struct file *file, const char *op)
496 {
497 	pr_warn_ratelimited(
498 		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
499 		op, file, current->pid, current->comm);
500 	return -EINVAL;
501 }
502 
503 ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
504 {
505 	struct kvec iov = {
506 		.iov_base	= buf,
507 		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
508 	};
509 	struct kiocb kiocb;
510 	struct iov_iter iter;
511 	ssize_t ret;
512 
513 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
514 		return -EINVAL;
515 	if (!(file->f_mode & FMODE_CAN_READ))
516 		return -EINVAL;
517 	/*
518 	 * Also fail if ->read_iter and ->read are both wired up as that
519 	 * implies very convoluted semantics.
520 	 */
521 	if (unlikely(!file->f_op->read_iter || file->f_op->read))
522 		return warn_unsupported(file, "read");
523 
524 	init_sync_kiocb(&kiocb, file);
525 	kiocb.ki_pos = pos ? *pos : 0;
526 	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
527 	ret = file->f_op->read_iter(&kiocb, &iter);
528 	if (ret > 0) {
529 		if (pos)
530 			*pos = kiocb.ki_pos;
531 		fsnotify_access(file);
532 		add_rchar(current, ret);
533 	}
534 	inc_syscr(current);
535 	return ret;
536 }
537 
538 ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
539 {
540 	ssize_t ret;
541 
542 	ret = rw_verify_area(READ, file, pos, count);
543 	if (ret)
544 		return ret;
545 	return __kernel_read(file, buf, count, pos);
546 }
547 EXPORT_SYMBOL(kernel_read);
548 
549 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
550 {
551 	ssize_t ret;
552 
553 	if (!(file->f_mode & FMODE_READ))
554 		return -EBADF;
555 	if (!(file->f_mode & FMODE_CAN_READ))
556 		return -EINVAL;
557 	if (unlikely(!access_ok(buf, count)))
558 		return -EFAULT;
559 
560 	ret = rw_verify_area(READ, file, pos, count);
561 	if (ret)
562 		return ret;
563 	if (count > MAX_RW_COUNT)
564 		count =  MAX_RW_COUNT;
565 
566 	if (file->f_op->read)
567 		ret = file->f_op->read(file, buf, count, pos);
568 	else if (file->f_op->read_iter)
569 		ret = new_sync_read(file, buf, count, pos);
570 	else
571 		ret = -EINVAL;
572 	if (ret > 0) {
573 		fsnotify_access(file);
574 		add_rchar(current, ret);
575 	}
576 	inc_syscr(current);
577 	return ret;
578 }
579 
580 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
581 {
582 	struct kiocb kiocb;
583 	struct iov_iter iter;
584 	ssize_t ret;
585 
586 	init_sync_kiocb(&kiocb, filp);
587 	kiocb.ki_pos = (ppos ? *ppos : 0);
588 	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
589 
590 	ret = filp->f_op->write_iter(&kiocb, &iter);
591 	BUG_ON(ret == -EIOCBQUEUED);
592 	if (ret > 0 && ppos)
593 		*ppos = kiocb.ki_pos;
594 	return ret;
595 }
596 
597 /* caller is responsible for file_start_write/file_end_write */
598 ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
599 {
600 	struct kiocb kiocb;
601 	ssize_t ret;
602 
603 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
604 		return -EBADF;
605 	if (!(file->f_mode & FMODE_CAN_WRITE))
606 		return -EINVAL;
607 	/*
608 	 * Also fail if ->write_iter and ->write are both wired up as that
609 	 * implies very convoluted semantics.
610 	 */
611 	if (unlikely(!file->f_op->write_iter || file->f_op->write))
612 		return warn_unsupported(file, "write");
613 
614 	init_sync_kiocb(&kiocb, file);
615 	kiocb.ki_pos = pos ? *pos : 0;
616 	ret = file->f_op->write_iter(&kiocb, from);
617 	if (ret > 0) {
618 		if (pos)
619 			*pos = kiocb.ki_pos;
620 		fsnotify_modify(file);
621 		add_wchar(current, ret);
622 	}
623 	inc_syscw(current);
624 	return ret;
625 }
626 
627 /* caller is responsible for file_start_write/file_end_write */
628 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
629 {
630 	struct kvec iov = {
631 		.iov_base	= (void *)buf,
632 		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
633 	};
634 	struct iov_iter iter;
635 	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
636 	return __kernel_write_iter(file, &iter, pos);
637 }
638 /*
639  * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
640  * but autofs is one of the few internal kernel users that actually
641  * wants this _and_ can be built as a module. So we need to export
642  * this symbol for autofs, even though it really isn't appropriate
643  * for any other kernel modules.
644  */
645 EXPORT_SYMBOL_GPL(__kernel_write);
646 
647 ssize_t kernel_write(struct file *file, const void *buf, size_t count,
648 			    loff_t *pos)
649 {
650 	ssize_t ret;
651 
652 	ret = rw_verify_area(WRITE, file, pos, count);
653 	if (ret)
654 		return ret;
655 
656 	file_start_write(file);
657 	ret =  __kernel_write(file, buf, count, pos);
658 	file_end_write(file);
659 	return ret;
660 }
661 EXPORT_SYMBOL(kernel_write);
662 
663 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
664 {
665 	ssize_t ret;
666 
667 	if (!(file->f_mode & FMODE_WRITE))
668 		return -EBADF;
669 	if (!(file->f_mode & FMODE_CAN_WRITE))
670 		return -EINVAL;
671 	if (unlikely(!access_ok(buf, count)))
672 		return -EFAULT;
673 
674 	ret = rw_verify_area(WRITE, file, pos, count);
675 	if (ret)
676 		return ret;
677 	if (count > MAX_RW_COUNT)
678 		count =  MAX_RW_COUNT;
679 	file_start_write(file);
680 	if (file->f_op->write)
681 		ret = file->f_op->write(file, buf, count, pos);
682 	else if (file->f_op->write_iter)
683 		ret = new_sync_write(file, buf, count, pos);
684 	else
685 		ret = -EINVAL;
686 	if (ret > 0) {
687 		fsnotify_modify(file);
688 		add_wchar(current, ret);
689 	}
690 	inc_syscw(current);
691 	file_end_write(file);
692 	return ret;
693 }
694 
695 /* file_ppos returns &file->f_pos or NULL if file is stream */
696 static inline loff_t *file_ppos(struct file *file)
697 {
698 	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
699 }
700 
701 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
702 {
703 	struct fd f = fdget_pos(fd);
704 	ssize_t ret = -EBADF;
705 
706 	if (fd_file(f)) {
707 		loff_t pos, *ppos = file_ppos(fd_file(f));
708 		if (ppos) {
709 			pos = *ppos;
710 			ppos = &pos;
711 		}
712 		ret = vfs_read(fd_file(f), buf, count, ppos);
713 		if (ret >= 0 && ppos)
714 			fd_file(f)->f_pos = pos;
715 		fdput_pos(f);
716 	}
717 	return ret;
718 }
719 
720 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
721 {
722 	return ksys_read(fd, buf, count);
723 }
724 
725 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
726 {
727 	struct fd f = fdget_pos(fd);
728 	ssize_t ret = -EBADF;
729 
730 	if (fd_file(f)) {
731 		loff_t pos, *ppos = file_ppos(fd_file(f));
732 		if (ppos) {
733 			pos = *ppos;
734 			ppos = &pos;
735 		}
736 		ret = vfs_write(fd_file(f), buf, count, ppos);
737 		if (ret >= 0 && ppos)
738 			fd_file(f)->f_pos = pos;
739 		fdput_pos(f);
740 	}
741 
742 	return ret;
743 }
744 
745 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
746 		size_t, count)
747 {
748 	return ksys_write(fd, buf, count);
749 }
750 
751 ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
752 		     loff_t pos)
753 {
754 	struct fd f;
755 	ssize_t ret = -EBADF;
756 
757 	if (pos < 0)
758 		return -EINVAL;
759 
760 	f = fdget(fd);
761 	if (fd_file(f)) {
762 		ret = -ESPIPE;
763 		if (fd_file(f)->f_mode & FMODE_PREAD)
764 			ret = vfs_read(fd_file(f), buf, count, &pos);
765 		fdput(f);
766 	}
767 
768 	return ret;
769 }
770 
771 SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
772 			size_t, count, loff_t, pos)
773 {
774 	return ksys_pread64(fd, buf, count, pos);
775 }
776 
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
/*
 * Compat pread64: the position arrives as a dual argument that is glued
 * into a single value before calling ksys_pread64().
 */
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	ssize_t nread = ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));

	return nread;
}
#endif
784 
785 ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
786 		      size_t count, loff_t pos)
787 {
788 	struct fd f;
789 	ssize_t ret = -EBADF;
790 
791 	if (pos < 0)
792 		return -EINVAL;
793 
794 	f = fdget(fd);
795 	if (fd_file(f)) {
796 		ret = -ESPIPE;
797 		if (fd_file(f)->f_mode & FMODE_PWRITE)
798 			ret = vfs_write(fd_file(f), buf, count, &pos);
799 		fdput(f);
800 	}
801 
802 	return ret;
803 }
804 
805 SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
806 			 size_t, count, loff_t, pos)
807 {
808 	return ksys_pwrite64(fd, buf, count, pos);
809 }
810 
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
/*
 * Compat pwrite64: the position arrives as a dual argument that is glued
 * into a single value before calling ksys_pwrite64().
 */
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	ssize_t nwritten = ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));

	return nwritten;
}
#endif
818 
819 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
820 		loff_t *ppos, int type, rwf_t flags)
821 {
822 	struct kiocb kiocb;
823 	ssize_t ret;
824 
825 	init_sync_kiocb(&kiocb, filp);
826 	ret = kiocb_set_rw_flags(&kiocb, flags, type);
827 	if (ret)
828 		return ret;
829 	kiocb.ki_pos = (ppos ? *ppos : 0);
830 
831 	if (type == READ)
832 		ret = filp->f_op->read_iter(&kiocb, iter);
833 	else
834 		ret = filp->f_op->write_iter(&kiocb, iter);
835 	BUG_ON(ret == -EIOCBQUEUED);
836 	if (ppos)
837 		*ppos = kiocb.ki_pos;
838 	return ret;
839 }
840 
841 /* Do it by hand, with file-ops */
842 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
843 		loff_t *ppos, int type, rwf_t flags)
844 {
845 	ssize_t ret = 0;
846 
847 	if (flags & ~RWF_HIPRI)
848 		return -EOPNOTSUPP;
849 
850 	while (iov_iter_count(iter)) {
851 		ssize_t nr;
852 
853 		if (type == READ) {
854 			nr = filp->f_op->read(filp, iter_iov_addr(iter),
855 						iter_iov_len(iter), ppos);
856 		} else {
857 			nr = filp->f_op->write(filp, iter_iov_addr(iter),
858 						iter_iov_len(iter), ppos);
859 		}
860 
861 		if (nr < 0) {
862 			if (!ret)
863 				ret = nr;
864 			break;
865 		}
866 		ret += nr;
867 		if (nr != iter_iov_len(iter))
868 			break;
869 		iov_iter_advance(iter, nr);
870 	}
871 
872 	return ret;
873 }
874 
875 ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
876 			   struct iov_iter *iter)
877 {
878 	size_t tot_len;
879 	ssize_t ret = 0;
880 
881 	if (!file->f_op->read_iter)
882 		return -EINVAL;
883 	if (!(file->f_mode & FMODE_READ))
884 		return -EBADF;
885 	if (!(file->f_mode & FMODE_CAN_READ))
886 		return -EINVAL;
887 
888 	tot_len = iov_iter_count(iter);
889 	if (!tot_len)
890 		goto out;
891 	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
892 	if (ret < 0)
893 		return ret;
894 
895 	ret = file->f_op->read_iter(iocb, iter);
896 out:
897 	if (ret >= 0)
898 		fsnotify_access(file);
899 	return ret;
900 }
901 EXPORT_SYMBOL(vfs_iocb_iter_read);
902 
903 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
904 		      rwf_t flags)
905 {
906 	size_t tot_len;
907 	ssize_t ret = 0;
908 
909 	if (!file->f_op->read_iter)
910 		return -EINVAL;
911 	if (!(file->f_mode & FMODE_READ))
912 		return -EBADF;
913 	if (!(file->f_mode & FMODE_CAN_READ))
914 		return -EINVAL;
915 
916 	tot_len = iov_iter_count(iter);
917 	if (!tot_len)
918 		goto out;
919 	ret = rw_verify_area(READ, file, ppos, tot_len);
920 	if (ret < 0)
921 		return ret;
922 
923 	ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
924 out:
925 	if (ret >= 0)
926 		fsnotify_access(file);
927 	return ret;
928 }
929 EXPORT_SYMBOL(vfs_iter_read);
930 
931 /*
932  * Caller is responsible for calling kiocb_end_write() on completion
933  * if async iocb was queued.
934  */
935 ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
936 			    struct iov_iter *iter)
937 {
938 	size_t tot_len;
939 	ssize_t ret = 0;
940 
941 	if (!file->f_op->write_iter)
942 		return -EINVAL;
943 	if (!(file->f_mode & FMODE_WRITE))
944 		return -EBADF;
945 	if (!(file->f_mode & FMODE_CAN_WRITE))
946 		return -EINVAL;
947 
948 	tot_len = iov_iter_count(iter);
949 	if (!tot_len)
950 		return 0;
951 	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
952 	if (ret < 0)
953 		return ret;
954 
955 	kiocb_start_write(iocb);
956 	ret = file->f_op->write_iter(iocb, iter);
957 	if (ret != -EIOCBQUEUED)
958 		kiocb_end_write(iocb);
959 	if (ret > 0)
960 		fsnotify_modify(file);
961 
962 	return ret;
963 }
964 EXPORT_SYMBOL(vfs_iocb_iter_write);
965 
966 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
967 		       rwf_t flags)
968 {
969 	size_t tot_len;
970 	ssize_t ret;
971 
972 	if (!(file->f_mode & FMODE_WRITE))
973 		return -EBADF;
974 	if (!(file->f_mode & FMODE_CAN_WRITE))
975 		return -EINVAL;
976 	if (!file->f_op->write_iter)
977 		return -EINVAL;
978 
979 	tot_len = iov_iter_count(iter);
980 	if (!tot_len)
981 		return 0;
982 
983 	ret = rw_verify_area(WRITE, file, ppos, tot_len);
984 	if (ret < 0)
985 		return ret;
986 
987 	file_start_write(file);
988 	ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
989 	if (ret > 0)
990 		fsnotify_modify(file);
991 	file_end_write(file);
992 
993 	return ret;
994 }
995 EXPORT_SYMBOL(vfs_iter_write);
996 
997 static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
998 			 unsigned long vlen, loff_t *pos, rwf_t flags)
999 {
1000 	struct iovec iovstack[UIO_FASTIOV];
1001 	struct iovec *iov = iovstack;
1002 	struct iov_iter iter;
1003 	size_t tot_len;
1004 	ssize_t ret = 0;
1005 
1006 	if (!(file->f_mode & FMODE_READ))
1007 		return -EBADF;
1008 	if (!(file->f_mode & FMODE_CAN_READ))
1009 		return -EINVAL;
1010 
1011 	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
1012 			   &iter);
1013 	if (ret < 0)
1014 		return ret;
1015 
1016 	tot_len = iov_iter_count(&iter);
1017 	if (!tot_len)
1018 		goto out;
1019 
1020 	ret = rw_verify_area(READ, file, pos, tot_len);
1021 	if (ret < 0)
1022 		goto out;
1023 
1024 	if (file->f_op->read_iter)
1025 		ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
1026 	else
1027 		ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
1028 out:
1029 	if (ret >= 0)
1030 		fsnotify_access(file);
1031 	kfree(iov);
1032 	return ret;
1033 }
1034 
1035 static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
1036 			  unsigned long vlen, loff_t *pos, rwf_t flags)
1037 {
1038 	struct iovec iovstack[UIO_FASTIOV];
1039 	struct iovec *iov = iovstack;
1040 	struct iov_iter iter;
1041 	size_t tot_len;
1042 	ssize_t ret = 0;
1043 
1044 	if (!(file->f_mode & FMODE_WRITE))
1045 		return -EBADF;
1046 	if (!(file->f_mode & FMODE_CAN_WRITE))
1047 		return -EINVAL;
1048 
1049 	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
1050 			   &iter);
1051 	if (ret < 0)
1052 		return ret;
1053 
1054 	tot_len = iov_iter_count(&iter);
1055 	if (!tot_len)
1056 		goto out;
1057 
1058 	ret = rw_verify_area(WRITE, file, pos, tot_len);
1059 	if (ret < 0)
1060 		goto out;
1061 
1062 	file_start_write(file);
1063 	if (file->f_op->write_iter)
1064 		ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
1065 	else
1066 		ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
1067 	if (ret > 0)
1068 		fsnotify_modify(file);
1069 	file_end_write(file);
1070 out:
1071 	kfree(iov);
1072 	return ret;
1073 }
1074 
1075 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
1076 			unsigned long vlen, rwf_t flags)
1077 {
1078 	struct fd f = fdget_pos(fd);
1079 	ssize_t ret = -EBADF;
1080 
1081 	if (fd_file(f)) {
1082 		loff_t pos, *ppos = file_ppos(fd_file(f));
1083 		if (ppos) {
1084 			pos = *ppos;
1085 			ppos = &pos;
1086 		}
1087 		ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
1088 		if (ret >= 0 && ppos)
1089 			fd_file(f)->f_pos = pos;
1090 		fdput_pos(f);
1091 	}
1092 
1093 	if (ret > 0)
1094 		add_rchar(current, ret);
1095 	inc_syscr(current);
1096 	return ret;
1097 }
1098 
1099 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
1100 			 unsigned long vlen, rwf_t flags)
1101 {
1102 	struct fd f = fdget_pos(fd);
1103 	ssize_t ret = -EBADF;
1104 
1105 	if (fd_file(f)) {
1106 		loff_t pos, *ppos = file_ppos(fd_file(f));
1107 		if (ppos) {
1108 			pos = *ppos;
1109 			ppos = &pos;
1110 		}
1111 		ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
1112 		if (ret >= 0 && ppos)
1113 			fd_file(f)->f_pos = pos;
1114 		fdput_pos(f);
1115 	}
1116 
1117 	if (ret > 0)
1118 		add_wchar(current, ret);
1119 	inc_syscw(current);
1120 	return ret;
1121 }
1122 
1123 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1124 {
1125 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
1126 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1127 }
1128 
1129 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
1130 			 unsigned long vlen, loff_t pos, rwf_t flags)
1131 {
1132 	struct fd f;
1133 	ssize_t ret = -EBADF;
1134 
1135 	if (pos < 0)
1136 		return -EINVAL;
1137 
1138 	f = fdget(fd);
1139 	if (fd_file(f)) {
1140 		ret = -ESPIPE;
1141 		if (fd_file(f)->f_mode & FMODE_PREAD)
1142 			ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
1143 		fdput(f);
1144 	}
1145 
1146 	if (ret > 0)
1147 		add_rchar(current, ret);
1148 	inc_syscr(current);
1149 	return ret;
1150 }
1151 
1152 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1153 			  unsigned long vlen, loff_t pos, rwf_t flags)
1154 {
1155 	struct fd f;
1156 	ssize_t ret = -EBADF;
1157 
1158 	if (pos < 0)
1159 		return -EINVAL;
1160 
1161 	f = fdget(fd);
1162 	if (fd_file(f)) {
1163 		ret = -ESPIPE;
1164 		if (fd_file(f)->f_mode & FMODE_PWRITE)
1165 			ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
1166 		fdput(f);
1167 	}
1168 
1169 	if (ret > 0)
1170 		add_wchar(current, ret);
1171 	inc_syscw(current);
1172 	return ret;
1173 }
1174 
1175 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1176 		unsigned long, vlen)
1177 {
1178 	return do_readv(fd, vec, vlen, 0);
1179 }
1180 
1181 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1182 		unsigned long, vlen)
1183 {
1184 	return do_writev(fd, vec, vlen, 0);
1185 }
1186 
1187 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1188 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1189 {
1190 	loff_t pos = pos_from_hilo(pos_h, pos_l);
1191 
1192 	return do_preadv(fd, vec, vlen, pos, 0);
1193 }
1194 
1195 SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1196 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1197 		rwf_t, flags)
1198 {
1199 	loff_t pos = pos_from_hilo(pos_h, pos_l);
1200 
1201 	if (pos == -1)
1202 		return do_readv(fd, vec, vlen, flags);
1203 
1204 	return do_preadv(fd, vec, vlen, pos, flags);
1205 }
1206 
1207 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1208 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1209 {
1210 	loff_t pos = pos_from_hilo(pos_h, pos_l);
1211 
1212 	return do_pwritev(fd, vec, vlen, pos, 0);
1213 }
1214 
1215 SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1216 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
1217 		rwf_t, flags)
1218 {
1219 	loff_t pos = pos_from_hilo(pos_h, pos_l);
1220 
1221 	if (pos == -1)
1222 		return do_writev(fd, vec, vlen, flags);
1223 
1224 	return do_pwritev(fd, vec, vlen, pos, flags);
1225 }
1226 
1227 /*
1228  * Various compat syscalls.  Note that they all pretend to take a native
1229  * iovec - import_iovec will properly treat those as compat_iovecs based on
1230  * in_compat_syscall().
1231  */
1232 #ifdef CONFIG_COMPAT
1233 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1234 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1235 		const struct iovec __user *, vec,
1236 		unsigned long, vlen, loff_t, pos)
1237 {
1238 	return do_preadv(fd, vec, vlen, pos, 0);
1239 }
1240 #endif
1241 
1242 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1243 		const struct iovec __user *, vec,
1244 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1245 {
1246 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1247 
1248 	return do_preadv(fd, vec, vlen, pos, 0);
1249 }
1250 
1251 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1252 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1253 		const struct iovec __user *, vec,
1254 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1255 {
1256 	if (pos == -1)
1257 		return do_readv(fd, vec, vlen, flags);
1258 	return do_preadv(fd, vec, vlen, pos, flags);
1259 }
1260 #endif
1261 
1262 COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1263 		const struct iovec __user *, vec,
1264 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
1265 		rwf_t, flags)
1266 {
1267 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1268 
1269 	if (pos == -1)
1270 		return do_readv(fd, vec, vlen, flags);
1271 	return do_preadv(fd, vec, vlen, pos, flags);
1272 }
1273 
1274 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1275 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1276 		const struct iovec __user *, vec,
1277 		unsigned long, vlen, loff_t, pos)
1278 {
1279 	return do_pwritev(fd, vec, vlen, pos, 0);
1280 }
1281 #endif
1282 
1283 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1284 		const struct iovec __user *,vec,
1285 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1286 {
1287 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1288 
1289 	return do_pwritev(fd, vec, vlen, pos, 0);
1290 }
1291 
1292 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1293 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1294 		const struct iovec __user *, vec,
1295 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
1296 {
1297 	if (pos == -1)
1298 		return do_writev(fd, vec, vlen, flags);
1299 	return do_pwritev(fd, vec, vlen, pos, flags);
1300 }
1301 #endif
1302 
1303 COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1304 		const struct iovec __user *,vec,
1305 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
1306 {
1307 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1308 
1309 	if (pos == -1)
1310 		return do_writev(fd, vec, vlen, flags);
1311 	return do_pwritev(fd, vec, vlen, pos, flags);
1312 }
1313 #endif /* CONFIG_COMPAT */
1314 
1315 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1316 			   size_t count, loff_t max)
1317 {
1318 	struct fd in, out;
1319 	struct inode *in_inode, *out_inode;
1320 	struct pipe_inode_info *opipe;
1321 	loff_t pos;
1322 	loff_t out_pos;
1323 	ssize_t retval;
1324 	int fl;
1325 
1326 	/*
1327 	 * Get input file, and verify that it is ok..
1328 	 */
1329 	retval = -EBADF;
1330 	in = fdget(in_fd);
1331 	if (!fd_file(in))
1332 		goto out;
1333 	if (!(fd_file(in)->f_mode & FMODE_READ))
1334 		goto fput_in;
1335 	retval = -ESPIPE;
1336 	if (!ppos) {
1337 		pos = fd_file(in)->f_pos;
1338 	} else {
1339 		pos = *ppos;
1340 		if (!(fd_file(in)->f_mode & FMODE_PREAD))
1341 			goto fput_in;
1342 	}
1343 	retval = rw_verify_area(READ, fd_file(in), &pos, count);
1344 	if (retval < 0)
1345 		goto fput_in;
1346 	if (count > MAX_RW_COUNT)
1347 		count =  MAX_RW_COUNT;
1348 
1349 	/*
1350 	 * Get output file, and verify that it is ok..
1351 	 */
1352 	retval = -EBADF;
1353 	out = fdget(out_fd);
1354 	if (!fd_file(out))
1355 		goto fput_in;
1356 	if (!(fd_file(out)->f_mode & FMODE_WRITE))
1357 		goto fput_out;
1358 	in_inode = file_inode(fd_file(in));
1359 	out_inode = file_inode(fd_file(out));
1360 	out_pos = fd_file(out)->f_pos;
1361 
1362 	if (!max)
1363 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1364 
1365 	if (unlikely(pos + count > max)) {
1366 		retval = -EOVERFLOW;
1367 		if (pos >= max)
1368 			goto fput_out;
1369 		count = max - pos;
1370 	}
1371 
1372 	fl = 0;
1373 #if 0
1374 	/*
1375 	 * We need to debate whether we can enable this or not. The
1376 	 * man page documents EAGAIN return for the output at least,
1377 	 * and the application is arguably buggy if it doesn't expect
1378 	 * EAGAIN on a non-blocking file descriptor.
1379 	 */
1380 	if (fd_file(in)->f_flags & O_NONBLOCK)
1381 		fl = SPLICE_F_NONBLOCK;
1382 #endif
1383 	opipe = get_pipe_info(fd_file(out), true);
1384 	if (!opipe) {
1385 		retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
1386 		if (retval < 0)
1387 			goto fput_out;
1388 		retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
1389 					  count, fl);
1390 	} else {
1391 		if (fd_file(out)->f_flags & O_NONBLOCK)
1392 			fl |= SPLICE_F_NONBLOCK;
1393 
1394 		retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
1395 	}
1396 
1397 	if (retval > 0) {
1398 		add_rchar(current, retval);
1399 		add_wchar(current, retval);
1400 		fsnotify_access(fd_file(in));
1401 		fsnotify_modify(fd_file(out));
1402 		fd_file(out)->f_pos = out_pos;
1403 		if (ppos)
1404 			*ppos = pos;
1405 		else
1406 			fd_file(in)->f_pos = pos;
1407 	}
1408 
1409 	inc_syscr(current);
1410 	inc_syscw(current);
1411 	if (pos > max)
1412 		retval = -EOVERFLOW;
1413 
1414 fput_out:
1415 	fdput(out);
1416 fput_in:
1417 	fdput(in);
1418 out:
1419 	return retval;
1420 }
1421 
1422 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1423 {
1424 	loff_t pos;
1425 	off_t off;
1426 	ssize_t ret;
1427 
1428 	if (offset) {
1429 		if (unlikely(get_user(off, offset)))
1430 			return -EFAULT;
1431 		pos = off;
1432 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1433 		if (unlikely(put_user(pos, offset)))
1434 			return -EFAULT;
1435 		return ret;
1436 	}
1437 
1438 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1439 }
1440 
1441 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1442 {
1443 	loff_t pos;
1444 	ssize_t ret;
1445 
1446 	if (offset) {
1447 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1448 			return -EFAULT;
1449 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1450 		if (unlikely(put_user(pos, offset)))
1451 			return -EFAULT;
1452 		return ret;
1453 	}
1454 
1455 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1456 }
1457 
1458 #ifdef CONFIG_COMPAT
1459 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1460 		compat_off_t __user *, offset, compat_size_t, count)
1461 {
1462 	loff_t pos;
1463 	off_t off;
1464 	ssize_t ret;
1465 
1466 	if (offset) {
1467 		if (unlikely(get_user(off, offset)))
1468 			return -EFAULT;
1469 		pos = off;
1470 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1471 		if (unlikely(put_user(pos, offset)))
1472 			return -EFAULT;
1473 		return ret;
1474 	}
1475 
1476 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1477 }
1478 
1479 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1480 		compat_loff_t __user *, offset, compat_size_t, count)
1481 {
1482 	loff_t pos;
1483 	ssize_t ret;
1484 
1485 	if (offset) {
1486 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1487 			return -EFAULT;
1488 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1489 		if (unlikely(put_user(pos, offset)))
1490 			return -EFAULT;
1491 		return ret;
1492 	}
1493 
1494 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1495 }
1496 #endif
1497 
1498 /*
1499  * Performs necessary checks before doing a file copy
1500  *
1501  * Can adjust amount of bytes to copy via @req_count argument.
1502  * Returns appropriate error code that caller should return or
1503  * zero in case the copy should be allowed.
1504  */
1505 static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
1506 				    struct file *file_out, loff_t pos_out,
1507 				    size_t *req_count, unsigned int flags)
1508 {
1509 	struct inode *inode_in = file_inode(file_in);
1510 	struct inode *inode_out = file_inode(file_out);
1511 	uint64_t count = *req_count;
1512 	loff_t size_in;
1513 	int ret;
1514 
1515 	ret = generic_file_rw_checks(file_in, file_out);
1516 	if (ret)
1517 		return ret;
1518 
1519 	/*
1520 	 * We allow some filesystems to handle cross sb copy, but passing
1521 	 * a file of the wrong filesystem type to filesystem driver can result
1522 	 * in an attempt to dereference the wrong type of ->private_data, so
1523 	 * avoid doing that until we really have a good reason.
1524 	 *
1525 	 * nfs and cifs define several different file_system_type structures
1526 	 * and several different sets of file_operations, but they all end up
1527 	 * using the same ->copy_file_range() function pointer.
1528 	 */
1529 	if (flags & COPY_FILE_SPLICE) {
1530 		/* cross sb splice is allowed */
1531 	} else if (file_out->f_op->copy_file_range) {
1532 		if (file_in->f_op->copy_file_range !=
1533 		    file_out->f_op->copy_file_range)
1534 			return -EXDEV;
1535 	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
1536 		return -EXDEV;
1537 	}
1538 
1539 	/* Don't touch certain kinds of inodes */
1540 	if (IS_IMMUTABLE(inode_out))
1541 		return -EPERM;
1542 
1543 	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1544 		return -ETXTBSY;
1545 
1546 	/* Ensure offsets don't wrap. */
1547 	if (pos_in + count < pos_in || pos_out + count < pos_out)
1548 		return -EOVERFLOW;
1549 
1550 	/* Shorten the copy to EOF */
1551 	size_in = i_size_read(inode_in);
1552 	if (pos_in >= size_in)
1553 		count = 0;
1554 	else
1555 		count = min(count, size_in - (uint64_t)pos_in);
1556 
1557 	ret = generic_write_check_limits(file_out, pos_out, &count);
1558 	if (ret)
1559 		return ret;
1560 
1561 	/* Don't allow overlapped copying within the same file. */
1562 	if (inode_in == inode_out &&
1563 	    pos_out + count > pos_in &&
1564 	    pos_out < pos_in + count)
1565 		return -EINVAL;
1566 
1567 	*req_count = count;
1568 	return 0;
1569 }
1570 
1571 /*
1572  * copy_file_range() differs from regular file read and write in that it
1573  * specifically allows return partial success.  When it does so is up to
1574  * the copy_file_range method.
1575  */
1576 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1577 			    struct file *file_out, loff_t pos_out,
1578 			    size_t len, unsigned int flags)
1579 {
1580 	ssize_t ret;
1581 	bool splice = flags & COPY_FILE_SPLICE;
1582 	bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;
1583 
1584 	if (flags & ~COPY_FILE_SPLICE)
1585 		return -EINVAL;
1586 
1587 	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1588 				       flags);
1589 	if (unlikely(ret))
1590 		return ret;
1591 
1592 	ret = rw_verify_area(READ, file_in, &pos_in, len);
1593 	if (unlikely(ret))
1594 		return ret;
1595 
1596 	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1597 	if (unlikely(ret))
1598 		return ret;
1599 
1600 	if (len == 0)
1601 		return 0;
1602 
1603 	file_start_write(file_out);
1604 
1605 	/*
1606 	 * Cloning is supported by more file systems, so we implement copy on
1607 	 * same sb using clone, but for filesystems where both clone and copy
1608 	 * are supported (e.g. nfs,cifs), we only call the copy method.
1609 	 */
1610 	if (!splice && file_out->f_op->copy_file_range) {
1611 		ret = file_out->f_op->copy_file_range(file_in, pos_in,
1612 						      file_out, pos_out,
1613 						      len, flags);
1614 	} else if (!splice && file_in->f_op->remap_file_range && samesb) {
1615 		ret = file_in->f_op->remap_file_range(file_in, pos_in,
1616 				file_out, pos_out,
1617 				min_t(loff_t, MAX_RW_COUNT, len),
1618 				REMAP_FILE_CAN_SHORTEN);
1619 		/* fallback to splice */
1620 		if (ret <= 0)
1621 			splice = true;
1622 	} else if (samesb) {
1623 		/* Fallback to splice for same sb copy for backward compat */
1624 		splice = true;
1625 	}
1626 
1627 	file_end_write(file_out);
1628 
1629 	if (!splice)
1630 		goto done;
1631 
1632 	/*
1633 	 * We can get here for same sb copy of filesystems that do not implement
1634 	 * ->copy_file_range() in case filesystem does not support clone or in
1635 	 * case filesystem supports clone but rejected the clone request (e.g.
1636 	 * because it was not block aligned).
1637 	 *
1638 	 * In both cases, fall back to kernel copy so we are able to maintain a
1639 	 * consistent story about which filesystems support copy_file_range()
1640 	 * and which filesystems do not, that will allow userspace tools to
1641 	 * make consistent desicions w.r.t using copy_file_range().
1642 	 *
1643 	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
1644 	 * for server-side-copy between any two sb.
1645 	 *
1646 	 * In any case, we call do_splice_direct() and not splice_file_range(),
1647 	 * without file_start_write() held, to avoid possible deadlocks related
1648 	 * to splicing from input file, while file_start_write() is held on
1649 	 * the output file on a different sb.
1650 	 */
1651 	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1652 			       min_t(size_t, len, MAX_RW_COUNT), 0);
1653 done:
1654 	if (ret > 0) {
1655 		fsnotify_access(file_in);
1656 		add_rchar(current, ret);
1657 		fsnotify_modify(file_out);
1658 		add_wchar(current, ret);
1659 	}
1660 
1661 	inc_syscr(current);
1662 	inc_syscw(current);
1663 
1664 	return ret;
1665 }
1666 EXPORT_SYMBOL(vfs_copy_file_range);
1667 
1668 SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1669 		int, fd_out, loff_t __user *, off_out,
1670 		size_t, len, unsigned int, flags)
1671 {
1672 	loff_t pos_in;
1673 	loff_t pos_out;
1674 	struct fd f_in;
1675 	struct fd f_out;
1676 	ssize_t ret = -EBADF;
1677 
1678 	f_in = fdget(fd_in);
1679 	if (!fd_file(f_in))
1680 		goto out2;
1681 
1682 	f_out = fdget(fd_out);
1683 	if (!fd_file(f_out))
1684 		goto out1;
1685 
1686 	ret = -EFAULT;
1687 	if (off_in) {
1688 		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1689 			goto out;
1690 	} else {
1691 		pos_in = fd_file(f_in)->f_pos;
1692 	}
1693 
1694 	if (off_out) {
1695 		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1696 			goto out;
1697 	} else {
1698 		pos_out = fd_file(f_out)->f_pos;
1699 	}
1700 
1701 	ret = -EINVAL;
1702 	if (flags != 0)
1703 		goto out;
1704 
1705 	ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
1706 				  flags);
1707 	if (ret > 0) {
1708 		pos_in += ret;
1709 		pos_out += ret;
1710 
1711 		if (off_in) {
1712 			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1713 				ret = -EFAULT;
1714 		} else {
1715 			fd_file(f_in)->f_pos = pos_in;
1716 		}
1717 
1718 		if (off_out) {
1719 			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1720 				ret = -EFAULT;
1721 		} else {
1722 			fd_file(f_out)->f_pos = pos_out;
1723 		}
1724 	}
1725 
1726 out:
1727 	fdput(f_out);
1728 out1:
1729 	fdput(f_in);
1730 out2:
1731 	return ret;
1732 }
1733 
1734 /*
1735  * Don't operate on ranges the page cache doesn't support, and don't exceed the
1736  * LFS limits.  If pos is under the limit it becomes a short access.  If it
1737  * exceeds the limit we return -EFBIG.
1738  */
1739 int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
1740 {
1741 	struct inode *inode = file->f_mapping->host;
1742 	loff_t max_size = inode->i_sb->s_maxbytes;
1743 	loff_t limit = rlimit(RLIMIT_FSIZE);
1744 
1745 	if (limit != RLIM_INFINITY) {
1746 		if (pos >= limit) {
1747 			send_sig(SIGXFSZ, current, 0);
1748 			return -EFBIG;
1749 		}
1750 		*count = min(*count, limit - pos);
1751 	}
1752 
1753 	if (!(file->f_flags & O_LARGEFILE))
1754 		max_size = MAX_NON_LFS;
1755 
1756 	if (unlikely(pos >= max_size))
1757 		return -EFBIG;
1758 
1759 	*count = min(*count, max_size - pos);
1760 
1761 	return 0;
1762 }
1763 EXPORT_SYMBOL_GPL(generic_write_check_limits);
1764 
1765 /* Like generic_write_checks(), but takes size of write instead of iter. */
1766 int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
1767 {
1768 	struct file *file = iocb->ki_filp;
1769 	struct inode *inode = file->f_mapping->host;
1770 
1771 	if (IS_SWAPFILE(inode))
1772 		return -ETXTBSY;
1773 
1774 	if (!*count)
1775 		return 0;
1776 
1777 	if (iocb->ki_flags & IOCB_APPEND)
1778 		iocb->ki_pos = i_size_read(inode);
1779 
1780 	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1781 	    !((iocb->ki_flags & IOCB_DIRECT) ||
1782 	      (file->f_op->fop_flags & FOP_BUFFER_WASYNC)))
1783 		return -EINVAL;
1784 
1785 	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
1786 }
1787 EXPORT_SYMBOL(generic_write_checks_count);
1788 
1789 /*
1790  * Performs necessary checks before doing a write
1791  *
1792  * Can adjust writing position or amount of bytes to write.
1793  * Returns appropriate error code that caller should return or
1794  * zero in case that write should be allowed.
1795  */
1796 ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
1797 {
1798 	loff_t count = iov_iter_count(from);
1799 	int ret;
1800 
1801 	ret = generic_write_checks_count(iocb, &count);
1802 	if (ret)
1803 		return ret;
1804 
1805 	iov_iter_truncate(from, count);
1806 	return iov_iter_count(from);
1807 }
1808 EXPORT_SYMBOL(generic_write_checks);
1809 
1810 /*
1811  * Performs common checks before doing a file copy/clone
1812  * from @file_in to @file_out.
1813  */
1814 int generic_file_rw_checks(struct file *file_in, struct file *file_out)
1815 {
1816 	struct inode *inode_in = file_inode(file_in);
1817 	struct inode *inode_out = file_inode(file_out);
1818 
1819 	/* Don't copy dirs, pipes, sockets... */
1820 	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1821 		return -EISDIR;
1822 	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1823 		return -EINVAL;
1824 
1825 	if (!(file_in->f_mode & FMODE_READ) ||
1826 	    !(file_out->f_mode & FMODE_WRITE) ||
1827 	    (file_out->f_flags & O_APPEND))
1828 		return -EBADF;
1829 
1830 	return 0;
1831 }
1832 
1833 bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
1834 {
1835 	size_t len = iov_iter_count(iter);
1836 
1837 	if (!iter_is_ubuf(iter))
1838 		return false;
1839 
1840 	if (!is_power_of_2(len))
1841 		return false;
1842 
1843 	if (!IS_ALIGNED(pos, len))
1844 		return false;
1845 
1846 	return true;
1847 }
1848