xref: /linux/lib/iov_iter.c (revision c70a4be130de333ea079c59da41cc959712bb01c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
6 #include <linux/uio.h>
7 #include <linux/pagemap.h>
8 #include <linux/slab.h>
9 #include <linux/vmalloc.h>
10 #include <linux/splice.h>
11 #include <linux/compat.h>
12 #include <net/checksum.h>
13 #include <linux/scatterlist.h>
14 #include <linux/instrumented.h>
15 
16 #define PIPE_PARANOIA /* for now */
17 
/*
 * iterate_iovec - walk up to @n bytes of the user iovec array at i->iov.
 *
 * The walk starts @skip bytes into the first segment.  For every non-empty
 * chunk, @__v is filled in as a struct iovec (base + length) and STEP is
 * evaluated.  STEP must yield the number of bytes it could NOT process; a
 * non-zero result shortens the chunk accordingly and terminates the walk.
 *
 * On exit:
 *   @n    holds the number of bytes actually processed,
 *   @__p  points at the last segment that was looked at,
 *   @skip is the offset reached inside that segment.
 *
 * The expansion declares locals called "left" and "wanted", so a STEP
 * expression cannot refer to outer variables with those names.
 */
18 #define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
19 	size_t left;					\
20 	size_t wanted = n;				\
21 	__p = i->iov;					\
22 	__v.iov_len = min(n, __p->iov_len - skip);	\
23 	if (likely(__v.iov_len)) {			\
24 		__v.iov_base = __p->iov_base + skip;	\
25 		left = (STEP);				\
26 		__v.iov_len -= left;			\
27 		skip += __v.iov_len;			\
28 		n -= __v.iov_len;			\
29 	} else {					\
30 		left = 0;				\
31 	}						\
32 	while (unlikely(!left && n)) {			\
33 		__p++;					\
34 		__v.iov_len = min(n, __p->iov_len);	\
35 		if (unlikely(!__v.iov_len))		\
36 			continue;			\
37 		__v.iov_base = __p->iov_base;		\
38 		left = (STEP);				\
39 		__v.iov_len -= left;			\
40 		skip = __v.iov_len;			\
41 		n -= __v.iov_len;			\
42 	}						\
43 	n = wanted - n;					\
44 }
45 
/*
 * iterate_kvec - walk @n bytes of the kernel kvec array at i->kvec.
 *
 * Same traversal as iterate_iovec, except that the value of STEP is
 * discarded: every chunk is accounted as fully processed and the walk only
 * stops once @n bytes have been covered.  Because nothing can fall short,
 * @n is simply restored to the requested amount ("wanted") at the end.
 *
 * On exit @__p points at the last segment visited and @skip is the offset
 * reached inside it.  The expansion declares a local called "wanted".
 */
46 #define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
47 	size_t wanted = n;				\
48 	__p = i->kvec;					\
49 	__v.iov_len = min(n, __p->iov_len - skip);	\
50 	if (likely(__v.iov_len)) {			\
51 		__v.iov_base = __p->iov_base + skip;	\
52 		(void)(STEP);				\
53 		skip += __v.iov_len;			\
54 		n -= __v.iov_len;			\
55 	}						\
56 	while (unlikely(n)) {				\
57 		__p++;					\
58 		__v.iov_len = min(n, __p->iov_len);	\
59 		if (unlikely(!__v.iov_len))		\
60 			continue;			\
61 		__v.iov_base = __p->iov_base;		\
62 		(void)(STEP);				\
63 		skip = __v.iov_len;			\
64 		n -= __v.iov_len;			\
65 	}						\
66 	n = wanted;					\
67 }
68 
/*
 * iterate_bvec - run STEP over @n bytes of the bio_vec array at i->bvec.
 *
 * A temporary bvec_iter is seeded with @n bytes to cover, @skip bytes already
 * consumed in the first element and index 0; for_each_bvec() then hands each
 * piece to STEP through @__v.  The value of STEP is ignored and neither @n
 * nor @skip is modified here; the final position is left in @__bi for the
 * caller to read back.
 */
69 #define iterate_bvec(i, n, __v, __bi, skip, STEP) {	\
70 	struct bvec_iter __start;			\
71 	__start.bi_size = n;				\
72 	__start.bi_bvec_done = skip;			\
73 	__start.bi_idx = 0;				\
74 	for_each_bvec(__v, i->bvec, __bi, __start) {	\
75 		(void)(STEP);				\
76 	}						\
77 }
78 
/*
 * iterate_xarray - run STEP over @n bytes of pages stored in i->xarray.
 *
 * The walk starts at byte position i->xarray_start + @skip and looks entries
 * up under rcu_read_lock().  Retry entries are skipped; a value entry or a
 * hugetlb page triggers a WARN and ends the walk.  For each entry the
 * sub-pages from the starting index onwards are presented to STEP one page
 * at a time through @__v (page, offset within page, length), and @skip and
 * @n are updated after every step.  The value of STEP is ignored.
 *
 * On exit @n holds the number of bytes covered.  STEP runs with the RCU read
 * lock held, so a STEP that leaves the enclosing function early has to drop
 * the lock itself.
 *
 * The expansion declares locals "head", "wanted", "seg", "offset", "start",
 * "index", "j" and "xas".
 */
79 #define iterate_xarray(i, n, __v, skip, STEP) {		\
80 	struct page *head = NULL;				\
81 	size_t wanted = n, seg, offset;				\
82 	loff_t start = i->xarray_start + skip;			\
83 	pgoff_t index = start >> PAGE_SHIFT;			\
84 	int j;							\
85 								\
86 	XA_STATE(xas, i->xarray, index);			\
87 								\
88 	rcu_read_lock();						\
89 	xas_for_each(&xas, head, ULONG_MAX) {				\
90 		if (xas_retry(&xas, head))				\
91 			continue;					\
92 		if (WARN_ON(xa_is_value(head)))				\
93 			break;						\
94 		if (WARN_ON(PageHuge(head)))				\
95 			break;						\
96 		for (j = (head->index < index) ? index - head->index : 0; \
97 		     j < thp_nr_pages(head); j++) {			\
98 			__v.bv_page = head + j;				\
99 			offset = (i->xarray_start + skip) & ~PAGE_MASK;	\
100 			seg = PAGE_SIZE - offset;			\
101 			__v.bv_offset = offset;				\
102 			__v.bv_len = min(n, seg);			\
103 			(void)(STEP);					\
104 			n -= __v.bv_len;				\
105 			skip += __v.bv_len;				\
106 			if (n == 0)					\
107 				break;					\
108 		}							\
109 		if (n == 0)						\
110 			break;						\
111 	}							\
112 	rcu_read_unlock();					\
113 	n = wanted - n;						\
114 }
115 
/*
 * iterate_all_kinds - run a per-flavour step over @n bytes of @i without
 * advancing the iterator.
 *
 * The flavour is taken from i->type: B is used for ITER_BVEC, K for
 * ITER_KVEC, X for ITER_XARRAY and I for everything that is none of those
 * (the user iovec case).  ITER_DISCARD iterators run no step at all.  In each
 * step expression @v names the current chunk (struct bio_vec for B and X,
 * struct kvec for K, struct iovec for I).
 *
 * No field of @i is written: the walk works on a local copy of
 * i->iov_offset.  @n is NOT clamped to i->count here, and it is rewritten by
 * the iovec, kvec and xarray walkers as described for those macros.
 */
116 #define iterate_all_kinds(i, n, v, I, B, K, X) {		\
117 	if (likely(n)) {					\
118 		size_t skip = i->iov_offset;			\
119 		if (unlikely(i->type & ITER_BVEC)) {		\
120 			struct bio_vec v;			\
121 			struct bvec_iter __bi;			\
122 			iterate_bvec(i, n, v, __bi, skip, (B))	\
123 		} else if (unlikely(i->type & ITER_KVEC)) {	\
124 			const struct kvec *kvec;		\
125 			struct kvec v;				\
126 			iterate_kvec(i, n, v, kvec, skip, (K))	\
127 		} else if (unlikely(i->type & ITER_DISCARD)) {	\
128 		} else if (unlikely(i->type & ITER_XARRAY)) {	\
129 			struct bio_vec v;			\
130 			iterate_xarray(i, n, v, skip, (X));	\
131 		} else {					\
132 			const struct iovec *iov;		\
133 			struct iovec v;				\
134 			iterate_iovec(i, n, v, iov, skip, (I))	\
135 		}						\
136 	}							\
137 }
138 
/*
 * iterate_and_advance - run a per-flavour step over @n bytes of @i and move
 * the iterator past whatever was processed.
 *
 * @n is first clamped to i->count; nothing else happens when i->count is
 * zero.  The step expressions are selected exactly as in iterate_all_kinds.
 * After the walk the iterator is updated:
 *   - bvec:    i->bvec / i->nr_segs follow the final bvec_iter position and
 *              the in-segment offset comes from bi_bvec_done;
 *   - kvec and iovec: a segment that was consumed completely is stepped
 *              over, then the segment pointer and i->nr_segs are stored back;
 *   - discard: no step runs, the offset just grows by @n;
 *   - xarray:  only the offset (tracked by the walker) changes.
 * Finally i->count drops by @n and i->iov_offset is stored.
 *
 * A step expression that returns from the enclosing function skips this
 * write-back entirely.
 */
139 #define iterate_and_advance(i, n, v, I, B, K, X) {		\
140 	if (unlikely(i->count < n))				\
141 		n = i->count;					\
142 	if (i->count) {						\
143 		size_t skip = i->iov_offset;			\
144 		if (unlikely(i->type & ITER_BVEC)) {		\
145 			const struct bio_vec *bvec = i->bvec;	\
146 			struct bio_vec v;			\
147 			struct bvec_iter __bi;			\
148 			iterate_bvec(i, n, v, __bi, skip, (B))	\
149 			i->bvec = __bvec_iter_bvec(i->bvec, __bi);	\
150 			i->nr_segs -= i->bvec - bvec;		\
151 			skip = __bi.bi_bvec_done;		\
152 		} else if (unlikely(i->type & ITER_KVEC)) {	\
153 			const struct kvec *kvec;		\
154 			struct kvec v;				\
155 			iterate_kvec(i, n, v, kvec, skip, (K))	\
156 			if (skip == kvec->iov_len) {		\
157 				kvec++;				\
158 				skip = 0;			\
159 			}					\
160 			i->nr_segs -= kvec - i->kvec;		\
161 			i->kvec = kvec;				\
162 		} else if (unlikely(i->type & ITER_DISCARD)) {	\
163 			skip += n;				\
164 		} else if (unlikely(i->type & ITER_XARRAY)) {	\
165 			struct bio_vec v;			\
166 			iterate_xarray(i, n, v, skip, (X))	\
167 		} else {					\
168 			const struct iovec *iov;		\
169 			struct iovec v;				\
170 			iterate_iovec(i, n, v, iov, skip, (I))	\
171 			if (skip == iov->iov_len) {		\
172 				iov++;				\
173 				skip = 0;			\
174 			}					\
175 			i->nr_segs -= iov - i->iov;		\
176 			i->iov = iov;				\
177 		}						\
178 		i->count -= n;					\
179 		i->iov_offset = skip;				\
180 	}							\
181 }
182 
183 static int copyout(void __user *to, const void *from, size_t n)
184 {
185 	if (should_fail_usercopy())
186 		return n;
187 	if (access_ok(to, n)) {
188 		instrument_copy_to_user(to, from, n);
189 		n = raw_copy_to_user(to, from, n);
190 	}
191 	return n;
192 }
193 
194 static int copyin(void *to, const void __user *from, size_t n)
195 {
196 	if (should_fail_usercopy())
197 		return n;
198 	if (access_ok(from, n)) {
199 		instrument_copy_from_user(to, from, n);
200 		n = raw_copy_from_user(to, from, n);
201 	}
202 	return n;
203 }
204 
/*
 * Copy up to @bytes from @page, starting @offset bytes into it, to the user
 * iovecs described by @i, and advance @i past what was copied.
 *
 * @bytes is clamped to i->count.  On CONFIG_HIGHMEM builds the copy is first
 * attempted under kmap_atomic() after pre-faulting the first destination
 * chunk; if that attempt stops short (or the pre-fault fails) the remainder
 * is copied under a regular kmap().  The walk over the iovec array stops at
 * the first chunk that copyout() cannot complete.
 *
 * Returns the number of bytes copied, which may be less than requested.
 */
205 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
206 			 struct iov_iter *i)
207 {
208 	size_t skip, copy, left, wanted;
209 	const struct iovec *iov;
210 	char __user *buf;
211 	void *kaddr, *from;
212 
213 	if (unlikely(bytes > i->count))
214 		bytes = i->count;
215 
216 	if (unlikely(!bytes))
217 		return 0;
218 
219 	might_fault();
220 	wanted = bytes;
221 	iov = i->iov;
222 	skip = i->iov_offset;
223 	buf = iov->iov_base + skip;
224 	copy = min(bytes, iov->iov_len - skip);
225 
	/* Fast path: only taken when the first chunk could be pre-faulted. */
226 	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
227 		kaddr = kmap_atomic(page);
228 		from = kaddr + offset;
229 
230 		/* first chunk, usually the only one */
231 		left = copyout(buf, from, copy);
232 		copy -= left;
233 		skip += copy;
234 		from += copy;
235 		bytes -= copy;
236 
		/* keep going through later segments until one falls short */
237 		while (unlikely(!left && bytes)) {
238 			iov++;
239 			buf = iov->iov_base;
240 			copy = min(bytes, iov->iov_len);
241 			left = copyout(buf, from, copy);
242 			copy -= left;
243 			skip = copy;
244 			from += copy;
245 			bytes -= copy;
246 		}
247 		if (likely(!bytes)) {
248 			kunmap_atomic(kaddr);
249 			goto done;
250 		}
		/*
		 * Short copy: remember how far we got in both the page and
		 * the current user segment, then retry the rest below.
		 */
251 		offset = from - kaddr;
252 		buf += copy;
253 		kunmap_atomic(kaddr);
254 		copy = min(bytes, iov->iov_len - skip);
255 	}
256 	/* Too bad - revert to non-atomic kmap */
257 
258 	kaddr = kmap(page);
259 	from = kaddr + offset;
260 	left = copyout(buf, from, copy);
261 	copy -= left;
262 	skip += copy;
263 	from += copy;
264 	bytes -= copy;
265 	while (unlikely(!left && bytes)) {
266 		iov++;
267 		buf = iov->iov_base;
268 		copy = min(bytes, iov->iov_len);
269 		left = copyout(buf, from, copy);
270 		copy -= left;
271 		skip = copy;
272 		from += copy;
273 		bytes -= copy;
274 	}
275 	kunmap(page);
276 
277 done:
	/* a fully consumed segment is stepped over before storing the position */
278 	if (skip == iov->iov_len) {
279 		iov++;
280 		skip = 0;
281 	}
	/* "bytes" now holds what is still outstanding */
282 	i->count -= wanted - bytes;
283 	i->nr_segs -= iov - i->iov;
284 	i->iov = iov;
285 	i->iov_offset = skip;
286 	return wanted - bytes;
287 }
288 
/*
 * Copy up to @bytes from the user iovecs described by @i into @page, starting
 * @offset bytes into it, and advance @i past what was copied.
 *
 * @bytes is clamped to i->count.  On CONFIG_HIGHMEM builds the copy is first
 * attempted under kmap_atomic() after pre-faulting the first source chunk;
 * if that attempt stops short (or the pre-fault fails) the remainder is
 * copied under a regular kmap().  The walk over the iovec array stops at the
 * first chunk that copyin() cannot complete.
 *
 * Returns the number of bytes copied, which may be less than requested.
 */
289 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
290 			 struct iov_iter *i)
291 {
292 	size_t skip, copy, left, wanted;
293 	const struct iovec *iov;
294 	char __user *buf;
295 	void *kaddr, *to;
296 
297 	if (unlikely(bytes > i->count))
298 		bytes = i->count;
299 
300 	if (unlikely(!bytes))
301 		return 0;
302 
303 	might_fault();
304 	wanted = bytes;
305 	iov = i->iov;
306 	skip = i->iov_offset;
307 	buf = iov->iov_base + skip;
308 	copy = min(bytes, iov->iov_len - skip);
309 
	/* Fast path: only taken when the first chunk could be pre-faulted. */
310 	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
311 		kaddr = kmap_atomic(page);
312 		to = kaddr + offset;
313 
314 		/* first chunk, usually the only one */
315 		left = copyin(to, buf, copy);
316 		copy -= left;
317 		skip += copy;
318 		to += copy;
319 		bytes -= copy;
320 
		/* keep going through later segments until one falls short */
321 		while (unlikely(!left && bytes)) {
322 			iov++;
323 			buf = iov->iov_base;
324 			copy = min(bytes, iov->iov_len);
325 			left = copyin(to, buf, copy);
326 			copy -= left;
327 			skip = copy;
328 			to += copy;
329 			bytes -= copy;
330 		}
331 		if (likely(!bytes)) {
332 			kunmap_atomic(kaddr);
333 			goto done;
334 		}
		/*
		 * Short copy: remember how far we got in both the page and
		 * the current user segment, then retry the rest below.
		 */
335 		offset = to - kaddr;
336 		buf += copy;
337 		kunmap_atomic(kaddr);
338 		copy = min(bytes, iov->iov_len - skip);
339 	}
340 	/* Too bad - revert to non-atomic kmap */
341 
342 	kaddr = kmap(page);
343 	to = kaddr + offset;
344 	left = copyin(to, buf, copy);
345 	copy -= left;
346 	skip += copy;
347 	to += copy;
348 	bytes -= copy;
349 	while (unlikely(!left && bytes)) {
350 		iov++;
351 		buf = iov->iov_base;
352 		copy = min(bytes, iov->iov_len);
353 		left = copyin(to, buf, copy);
354 		copy -= left;
355 		skip = copy;
356 		to += copy;
357 		bytes -= copy;
358 	}
359 	kunmap(page);
360 
361 done:
	/* a fully consumed segment is stepped over before storing the position */
362 	if (skip == iov->iov_len) {
363 		iov++;
364 		skip = 0;
365 	}
	/* "bytes" now holds what is still outstanding */
366 	i->count -= wanted - bytes;
367 	i->nr_segs -= iov - i->iov;
368 	i->iov = iov;
369 	i->iov_offset = skip;
370 	return wanted - bytes;
371 }
372 
#ifdef PIPE_PARANOIA
/*
 * sanity - check that a pipe-backed iterator still agrees with its pipe.
 *
 * With a non-zero i->iov_offset the iterator must sit on the last occupied
 * buffer of a non-empty pipe, exactly at the end of that buffer's data.
 * With a zero offset it must sit right after the last buffer (at pipe->head).
 *
 * Returns true when the invariant holds.  Otherwise the iterator position
 * and the whole ring are dumped, a WARN is raised and false is returned.
 *
 * The head/tail/index values are free-running unsigned counters and the
 * offset is a size_t, so they are printed with %u / %zu; printing them as
 * signed values would show bogus negative numbers once they grow large.
 */
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->iov_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = &pipe->bufs[i_head & p_mask];
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %u, offset = %zu\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %u, tail = %u, buffers = %u\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %u %u]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
/* Paranoia disabled: every pipe iterator is taken to be consistent. */
#define sanity(i) true
#endif
415 
416 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
417 			 struct iov_iter *i)
418 {
419 	struct pipe_inode_info *pipe = i->pipe;
420 	struct pipe_buffer *buf;
421 	unsigned int p_tail = pipe->tail;
422 	unsigned int p_mask = pipe->ring_size - 1;
423 	unsigned int i_head = i->head;
424 	size_t off;
425 
426 	if (unlikely(bytes > i->count))
427 		bytes = i->count;
428 
429 	if (unlikely(!bytes))
430 		return 0;
431 
432 	if (!sanity(i))
433 		return 0;
434 
435 	off = i->iov_offset;
436 	buf = &pipe->bufs[i_head & p_mask];
437 	if (off) {
438 		if (offset == off && buf->page == page) {
439 			/* merge with the last one */
440 			buf->len += bytes;
441 			i->iov_offset += bytes;
442 			goto out;
443 		}
444 		i_head++;
445 		buf = &pipe->bufs[i_head & p_mask];
446 	}
447 	if (pipe_full(i_head, p_tail, pipe->max_usage))
448 		return 0;
449 
450 	buf->ops = &page_cache_pipe_buf_ops;
451 	get_page(page);
452 	buf->page = page;
453 	buf->offset = offset;
454 	buf->len = bytes;
455 
456 	pipe->head = i_head + 1;
457 	i->iov_offset = offset + bytes;
458 	i->head = i_head;
459 out:
460 	i->count -= bytes;
461 	return bytes;
462 }
463 
464 /*
465  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
466  * bytes.  For each iovec, fault in each page that constitutes the iovec.
467  *
468  * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
469  * because it is an invalid address).
470  */
471 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
472 {
473 	size_t skip = i->iov_offset;
474 	const struct iovec *iov;
475 	int err;
476 	struct iovec v;
477 
478 	if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
479 		iterate_iovec(i, bytes, v, iov, skip, ({
480 			err = fault_in_pages_readable(v.iov_base, v.iov_len);
481 			if (unlikely(err))
482 			return err;
483 		0;}))
484 	}
485 	return 0;
486 }
487 EXPORT_SYMBOL(iov_iter_fault_in_readable);
488 
489 void iov_iter_init(struct iov_iter *i, unsigned int direction,
490 			const struct iovec *iov, unsigned long nr_segs,
491 			size_t count)
492 {
493 	WARN_ON(direction & ~(READ | WRITE));
494 	direction &= READ | WRITE;
495 
496 	/* It will get better.  Eventually... */
497 	if (uaccess_kernel()) {
498 		i->type = ITER_KVEC | direction;
499 		i->kvec = (struct kvec *)iov;
500 	} else {
501 		i->type = ITER_IOVEC | direction;
502 		i->iov = iov;
503 	}
504 	i->nr_segs = nr_segs;
505 	i->iov_offset = 0;
506 	i->count = count;
507 }
508 EXPORT_SYMBOL(iov_iter_init);
509 
/* Zero @len bytes of @page starting at @offset, via a temporary atomic map. */
static void memzero_page(struct page *page, size_t offset, size_t len)
{
	char *kaddr = kmap_atomic(page);

	memset(&kaddr[offset], 0, len);
	kunmap_atomic(kaddr);
}
516 
517 static inline bool allocated(struct pipe_buffer *buf)
518 {
519 	return buf->ops == &default_pipe_buf_ops;
520 }
521 
522 static inline void data_start(const struct iov_iter *i,
523 			      unsigned int *iter_headp, size_t *offp)
524 {
525 	unsigned int p_mask = i->pipe->ring_size - 1;
526 	unsigned int iter_head = i->head;
527 	size_t off = i->iov_offset;
528 
529 	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
530 		    off == PAGE_SIZE)) {
531 		iter_head++;
532 		off = 0;
533 	}
534 	*iter_headp = iter_head;
535 	*offp = off;
536 }
537 
538 static size_t push_pipe(struct iov_iter *i, size_t size,
539 			int *iter_headp, size_t *offp)
540 {
541 	struct pipe_inode_info *pipe = i->pipe;
542 	unsigned int p_tail = pipe->tail;
543 	unsigned int p_mask = pipe->ring_size - 1;
544 	unsigned int iter_head;
545 	size_t off;
546 	ssize_t left;
547 
548 	if (unlikely(size > i->count))
549 		size = i->count;
550 	if (unlikely(!size))
551 		return 0;
552 
553 	left = size;
554 	data_start(i, &iter_head, &off);
555 	*iter_headp = iter_head;
556 	*offp = off;
557 	if (off) {
558 		left -= PAGE_SIZE - off;
559 		if (left <= 0) {
560 			pipe->bufs[iter_head & p_mask].len += size;
561 			return size;
562 		}
563 		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
564 		iter_head++;
565 	}
566 	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
567 		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
568 		struct page *page = alloc_page(GFP_USER);
569 		if (!page)
570 			break;
571 
572 		buf->ops = &default_pipe_buf_ops;
573 		buf->page = page;
574 		buf->offset = 0;
575 		buf->len = min_t(ssize_t, left, PAGE_SIZE);
576 		left -= buf->len;
577 		iter_head++;
578 		pipe->head = iter_head;
579 
580 		if (left == 0)
581 			return size;
582 	}
583 	return size - left;
584 }
585 
586 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
587 				struct iov_iter *i)
588 {
589 	struct pipe_inode_info *pipe = i->pipe;
590 	unsigned int p_mask = pipe->ring_size - 1;
591 	unsigned int i_head;
592 	size_t n, off;
593 
594 	if (!sanity(i))
595 		return 0;
596 
597 	bytes = n = push_pipe(i, bytes, &i_head, &off);
598 	if (unlikely(!n))
599 		return 0;
600 	do {
601 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
602 		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
603 		i->head = i_head;
604 		i->iov_offset = off + chunk;
605 		n -= chunk;
606 		addr += chunk;
607 		off = 0;
608 		i_head++;
609 	} while (n);
610 	i->count -= bytes;
611 	return bytes;
612 }
613 
614 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
615 			      __wsum sum, size_t off)
616 {
617 	__wsum next = csum_partial_copy_nocheck(from, to, len);
618 	return csum_block_add(sum, next, off);
619 }
620 
621 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
622 					 struct csum_state *csstate,
623 					 struct iov_iter *i)
624 {
625 	struct pipe_inode_info *pipe = i->pipe;
626 	unsigned int p_mask = pipe->ring_size - 1;
627 	__wsum sum = csstate->csum;
628 	size_t off = csstate->off;
629 	unsigned int i_head;
630 	size_t n, r;
631 
632 	if (!sanity(i))
633 		return 0;
634 
635 	bytes = n = push_pipe(i, bytes, &i_head, &r);
636 	if (unlikely(!n))
637 		return 0;
638 	do {
639 		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
640 		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
641 		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
642 		kunmap_atomic(p);
643 		i->head = i_head;
644 		i->iov_offset = r + chunk;
645 		n -= chunk;
646 		off += chunk;
647 		addr += chunk;
648 		r = 0;
649 		i_head++;
650 	} while (n);
651 	i->count -= bytes;
652 	csstate->csum = sum;
653 	csstate->off = off;
654 	return bytes;
655 }
656 
/*
 * _copy_to_iter - copy @bytes from kernel buffer @addr into iterator @i.
 *
 * Pipe iterators are handed to copy_pipe_to_iter().  Every other flavour is
 * walked with iterate_and_advance(), which clamps @bytes to i->count and
 * advances @i.  Each step copies one chunk and moves the source cursor
 * "from" forward by the chunk length (the "(from += len) - len" idiom passes
 * the pre-increment address).  Only the user-iovec step (copyout) reports a
 * shortfall back to the walker.
 *
 * Returns the value left in @bytes by the walk, i.e. the amount accounted as
 * copied.
 */
657 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
658 {
659 	const char *from = addr;
660 	if (unlikely(iov_iter_is_pipe(i)))
661 		return copy_pipe_to_iter(addr, bytes, i);
	/* only user-space destinations can fault */
662 	if (iter_is_iovec(i))
663 		might_fault();
664 	iterate_and_advance(i, bytes, v,
665 		copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
666 		memcpy_to_page(v.bv_page, v.bv_offset,
667 			       (from += v.bv_len) - v.bv_len, v.bv_len),
668 		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
669 		memcpy_to_page(v.bv_page, v.bv_offset,
670 			       (from += v.bv_len) - v.bv_len, v.bv_len)
671 	)
672 
673 	return bytes;
674 }
675 EXPORT_SYMBOL(_copy_to_iter);
676 
677 #ifdef CONFIG_ARCH_HAS_COPY_MC
678 static int copyout_mc(void __user *to, const void *from, size_t n)
679 {
680 	if (access_ok(to, n)) {
681 		instrument_copy_to_user(to, from, n);
682 		n = copy_mc_to_user((__force void *) to, from, n);
683 	}
684 	return n;
685 }
686 
/*
 * Copy @len bytes from @from into @page at @offset through a temporary
 * atomic mapping, using copy_mc_to_kernel().  Returns that helper's result
 * (callers in this file treat it as the number of bytes left uncopied).
 */
static unsigned long copy_mc_to_page(struct page *page, size_t offset,
		const char *from, size_t len)
{
	char *kaddr = kmap_atomic(page);
	unsigned long uncopied = copy_mc_to_kernel(kaddr + offset, from, len);

	kunmap_atomic(kaddr);
	return uncopied;
}
699 
700 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
701 				struct iov_iter *i)
702 {
703 	struct pipe_inode_info *pipe = i->pipe;
704 	unsigned int p_mask = pipe->ring_size - 1;
705 	unsigned int i_head;
706 	size_t n, off, xfer = 0;
707 
708 	if (!sanity(i))
709 		return 0;
710 
711 	bytes = n = push_pipe(i, bytes, &i_head, &off);
712 	if (unlikely(!n))
713 		return 0;
714 	do {
715 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
716 		unsigned long rem;
717 
718 		rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
719 					    off, addr, chunk);
720 		i->head = i_head;
721 		i->iov_offset = off + chunk - rem;
722 		xfer += chunk - rem;
723 		if (rem)
724 			break;
725 		n -= chunk;
726 		addr += chunk;
727 		off = 0;
728 		i_head++;
729 	} while (n);
730 	i->count -= xfer;
731 	return xfer;
732 }
733 
734 /**
735  * _copy_mc_to_iter - copy to iter with source memory error exception handling
736  * @addr: source kernel address
737  * @bytes: total transfer length
738  * @i: destination iterator
739  *
740  * The pmem driver deploys this for the dax operation
741  * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
742  * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
743  * successfully copied.
744  *
745  * The main differences between this and typical _copy_to_iter().
746  *
747  * * Typical tail/residue handling after a fault retries the copy
748  *   byte-by-byte until the fault happens again. Re-triggering machine
749  *   checks is potentially fatal so the implementation uses source
750  *   alignment and poison alignment assumptions to avoid re-triggering
751  *   hardware exceptions.
752  *
753  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
754  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
755  *   a short copy.
 *
 * Return: number of bytes copied.  When a bvec, kvec or xarray step reports
 * a shortfall the function returns straight out of the walk with the number
 * of bytes copied so far (source cursor minus start minus the remainder).
 *
 * NOTE(review): on that early return the bvec and kvec steps leave @i
 * untouched, because the write-back at the end of iterate_and_advance() is
 * skipped, whereas the xarray step adjusts i->iov_offset and i->count by
 * hand - confirm callers cope with the difference.
756  */
757 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
758 {
759 	const char *from = addr;
760 	unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
761 
762 	if (unlikely(iov_iter_is_pipe(i)))
763 		return copy_mc_pipe_to_iter(addr, bytes, i);
	/* only user-space destinations can fault */
764 	if (iter_is_iovec(i))
765 		might_fault();
	/*
	 * Each step advances "from" by the chunk length and passes the
	 * pre-increment address as the source.  The xarray step runs with the
	 * RCU read lock taken by iterate_xarray(), hence the explicit unlock
	 * before its early return.
	 */
766 	iterate_and_advance(i, bytes, v,
767 		copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
768 			   v.iov_len),
769 		({
770 		rem = copy_mc_to_page(v.bv_page, v.bv_offset,
771 				      (from += v.bv_len) - v.bv_len, v.bv_len);
772 		if (rem) {
773 			curr_addr = (unsigned long) from;
774 			bytes = curr_addr - s_addr - rem;
775 			return bytes;
776 		}
777 		}),
778 		({
779 		rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
780 					- v.iov_len, v.iov_len);
781 		if (rem) {
782 			curr_addr = (unsigned long) from;
783 			bytes = curr_addr - s_addr - rem;
784 			return bytes;
785 		}
786 		}),
787 		({
788 		rem = copy_mc_to_page(v.bv_page, v.bv_offset,
789 				      (from += v.bv_len) - v.bv_len, v.bv_len);
790 		if (rem) {
791 			curr_addr = (unsigned long) from;
792 			bytes = curr_addr - s_addr - rem;
793 			rcu_read_unlock();
794 			i->iov_offset += bytes;
795 			i->count -= bytes;
796 			return bytes;
797 		}
798 		})
799 	)
800 
801 	return bytes;
802 }
803 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
804 #endif /* CONFIG_ARCH_HAS_COPY_MC */
805 
/*
 * _copy_from_iter - copy @bytes from iterator @i into kernel buffer @addr.
 *
 * Pipe iterators cannot be used as a data source: they trigger a WARN and
 * return 0.  Every other flavour is walked with iterate_and_advance(), which
 * clamps @bytes to i->count and advances @i.  Each step copies one chunk and
 * moves the destination cursor "to" forward by the chunk length (the
 * "(to += len) - len" idiom passes the pre-increment address).  Only the
 * user-iovec step (copyin) reports a shortfall back to the walker.
 *
 * Returns the value left in @bytes by the walk, i.e. the amount accounted as
 * copied.
 */
806 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
807 {
808 	char *to = addr;
809 	if (unlikely(iov_iter_is_pipe(i))) {
810 		WARN_ON(1);
811 		return 0;
812 	}
	/* only user-space sources can fault */
813 	if (iter_is_iovec(i))
814 		might_fault();
815 	iterate_and_advance(i, bytes, v,
816 		copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
817 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
818 				 v.bv_offset, v.bv_len),
819 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
820 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
821 				 v.bv_offset, v.bv_len)
822 	)
823 
824 	return bytes;
825 }
826 EXPORT_SYMBOL(_copy_from_iter);
827 
/*
 * _copy_from_iter_full - all-or-nothing copy of @bytes from @i into @addr.
 *
 * Returns false without advancing @i when @i is a pipe iterator (after a
 * WARN), when fewer than @bytes are available, or when copyin() from a user
 * segment falls short.  Data already written to @addr before such a failure
 * stays there.  On success @i is advanced by @bytes and true is returned.
 *
 * The walk uses iterate_all_kinds(), which does not touch @i; the explicit
 * iov_iter_advance() at the end commits the position.
 */
828 bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
829 {
830 	char *to = addr;
831 	if (unlikely(iov_iter_is_pipe(i))) {
832 		WARN_ON(1);
833 		return false;
834 	}
835 	if (unlikely(i->count < bytes))
836 		return false;
837 
838 	if (iter_is_iovec(i))
839 		might_fault();
	/* the iovec step bails out of the whole function on any shortfall */
840 	iterate_all_kinds(i, bytes, v, ({
841 		if (copyin((to += v.iov_len) - v.iov_len,
842 				      v.iov_base, v.iov_len))
843 			return false;
844 		0;}),
845 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
846 				 v.bv_offset, v.bv_len),
847 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
848 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
849 				 v.bv_offset, v.bv_len)
850 	)
851 
852 	iov_iter_advance(i, bytes);
853 	return true;
854 }
855 EXPORT_SYMBOL(_copy_from_iter_full);
856 
/*
 * _copy_from_iter_nocache - copy @bytes from iterator @i into @addr, using
 * __copy_from_user_inatomic_nocache() for user-space segments.
 *
 * Pipe iterators trigger a WARN and return 0.  For bvec, kvec and xarray
 * iterators the steps are the same plain copies that _copy_from_iter() uses.
 * The walk clamps @bytes to i->count and advances @i.
 *
 * Returns the value left in @bytes by the walk, i.e. the amount accounted as
 * copied.
 */
857 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
858 {
859 	char *to = addr;
860 	if (unlikely(iov_iter_is_pipe(i))) {
861 		WARN_ON(1);
862 		return 0;
863 	}
864 	iterate_and_advance(i, bytes, v,
865 		__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
866 					 v.iov_base, v.iov_len),
867 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
868 				 v.bv_offset, v.bv_len),
869 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
870 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
871 				 v.bv_offset, v.bv_len)
872 	)
873 
874 	return bytes;
875 }
876 EXPORT_SYMBOL(_copy_from_iter_nocache);
877 
878 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
879 /**
880  * _copy_from_iter_flushcache - write destination through cpu cache
881  * @addr: destination kernel address
882  * @bytes: total transfer length
883  * @i: source iterator
884  *
885  * The pmem driver arranges for filesystem-dax to use this facility via
886  * dax_copy_from_iter() for ensuring that writes to persistent memory
887  * are flushed through the CPU cache. It is differentiated from
888  * _copy_from_iter_nocache() in that guarantees all data is flushed for
889  * all iterator types. The _copy_from_iter_nocache() only attempts to
890  * bypass the cache for the ITER_IOVEC case, and on some archs may use
891  * instructions that strand dirty-data in the cache.
 *
 * Pipe iterators are not a valid source: they trigger a WARN and 0 is
 * returned.
 *
 * Return: the value left in @bytes by iterate_and_advance(), i.e. the amount
 * accounted as copied; @i is advanced by the same amount.
892  */
893 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
894 {
895 	char *to = addr;
896 	if (unlikely(iov_iter_is_pipe(i))) {
897 		WARN_ON(1);
898 		return 0;
899 	}
	/* every flavour uses a flushcache variant of its copy primitive */
900 	iterate_and_advance(i, bytes, v,
901 		__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
902 					 v.iov_base, v.iov_len),
903 		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
904 				 v.bv_offset, v.bv_len),
905 		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
906 			v.iov_len),
907 		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
908 				 v.bv_offset, v.bv_len)
909 	)
910 
911 	return bytes;
912 }
913 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
914 #endif
915 
/*
 * _copy_from_iter_full_nocache - all-or-nothing copy of @bytes from @i into
 * @addr, using __copy_from_user_inatomic_nocache() for user-space segments.
 *
 * Returns false without advancing @i when @i is a pipe iterator (after a
 * WARN), when fewer than @bytes are available, or when the user copy falls
 * short.  Data already written to @addr before such a failure stays there.
 * On success @i is advanced by @bytes and true is returned.
 */
916 bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
917 {
918 	char *to = addr;
919 	if (unlikely(iov_iter_is_pipe(i))) {
920 		WARN_ON(1);
921 		return false;
922 	}
923 	if (unlikely(i->count < bytes))
924 		return false;
	/* the iovec step bails out of the whole function on any shortfall */
925 	iterate_all_kinds(i, bytes, v, ({
926 		if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
927 					     v.iov_base, v.iov_len))
928 			return false;
929 		0;}),
930 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
931 				 v.bv_offset, v.bv_len),
932 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
933 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
934 				 v.bv_offset, v.bv_len)
935 	)
936 
937 	iov_iter_advance(i, bytes);
938 	return true;
939 }
940 EXPORT_SYMBOL(_copy_from_iter_full_nocache);
941 
942 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
943 {
944 	struct page *head;
945 	size_t v = n + offset;
946 
947 	/*
948 	 * The general case needs to access the page order in order
949 	 * to compute the page size.
950 	 * However, we mostly deal with order-0 pages and thus can
951 	 * avoid a possible cache line miss for requests that fit all
952 	 * page orders.
953 	 */
954 	if (n <= v && v <= PAGE_SIZE)
955 		return true;
956 
957 	head = compound_head(page);
958 	v += (page - head) << PAGE_SHIFT;
959 
960 	if (likely(n <= v && v <= (page_size(head))))
961 		return true;
962 	WARN_ON(1);
963 	return false;
964 }
965 
966 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
967 			 struct iov_iter *i)
968 {
969 	if (unlikely(!page_copy_sane(page, offset, bytes)))
970 		return 0;
971 	if (i->type & (ITER_BVEC | ITER_KVEC | ITER_XARRAY)) {
972 		void *kaddr = kmap_atomic(page);
973 		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
974 		kunmap_atomic(kaddr);
975 		return wanted;
976 	} else if (unlikely(iov_iter_is_discard(i)))
977 		return bytes;
978 	else if (likely(!iov_iter_is_pipe(i)))
979 		return copy_page_to_iter_iovec(page, offset, bytes, i);
980 	else
981 		return copy_page_to_iter_pipe(page, offset, bytes, i);
982 }
983 EXPORT_SYMBOL(copy_page_to_iter);
984 
985 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
986 			 struct iov_iter *i)
987 {
988 	if (unlikely(!page_copy_sane(page, offset, bytes)))
989 		return 0;
990 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
991 		WARN_ON(1);
992 		return 0;
993 	}
994 	if (i->type & (ITER_BVEC | ITER_KVEC | ITER_XARRAY)) {
995 		void *kaddr = kmap_atomic(page);
996 		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
997 		kunmap_atomic(kaddr);
998 		return wanted;
999 	} else
1000 		return copy_page_from_iter_iovec(page, offset, bytes, i);
1001 }
1002 EXPORT_SYMBOL(copy_page_from_iter);
1003 
1004 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
1005 {
1006 	struct pipe_inode_info *pipe = i->pipe;
1007 	unsigned int p_mask = pipe->ring_size - 1;
1008 	unsigned int i_head;
1009 	size_t n, off;
1010 
1011 	if (!sanity(i))
1012 		return 0;
1013 
1014 	bytes = n = push_pipe(i, bytes, &i_head, &off);
1015 	if (unlikely(!n))
1016 		return 0;
1017 
1018 	do {
1019 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
1020 		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
1021 		i->head = i_head;
1022 		i->iov_offset = off + chunk;
1023 		n -= chunk;
1024 		off = 0;
1025 		i_head++;
1026 	} while (n);
1027 	i->count -= bytes;
1028 	return bytes;
1029 }
1030 
/*
 * iov_iter_zero - write @bytes of zeroes to the memory described by @i.
 *
 * Pipe iterators are handled by pipe_zero().  Every other flavour is walked
 * with iterate_and_advance(), which clamps @bytes to i->count and advances
 * @i: user segments are cleared with clear_user() (whose result tells the
 * walker how much could not be cleared), pages with memzero_page() and
 * kernel segments with memset().
 *
 * Returns the value left in @bytes by the walk, i.e. the amount accounted as
 * zeroed.
 */
1031 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
1032 {
1033 	if (unlikely(iov_iter_is_pipe(i)))
1034 		return pipe_zero(bytes, i);
1035 	iterate_and_advance(i, bytes, v,
1036 		clear_user(v.iov_base, v.iov_len),
1037 		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
1038 		memset(v.iov_base, 0, v.iov_len),
1039 		memzero_page(v.bv_page, v.bv_offset, v.bv_len)
1040 	)
1041 
1042 	return bytes;
1043 }
1044 EXPORT_SYMBOL(iov_iter_zero);
1045 
/*
 * iov_iter_copy_from_user_atomic - copy @bytes from @i into @page at @offset
 * while the page is mapped with kmap_atomic().
 *
 * The iterator is NOT advanced: the walk uses iterate_all_kinds(), which
 * leaves @i untouched, so the caller has to advance it by the returned
 * amount.  Returns 0 if the range does not fit the page, or (after a WARN)
 * if @i is a pipe or discard iterator.  Otherwise returns the value left in
 * @bytes by the walk; for user segments that is what copyin() managed to
 * copy.
 *
 * The page is mapped before the sanity checks, which is why every early
 * return has to unmap it first.
 */
1046 size_t iov_iter_copy_from_user_atomic(struct page *page,
1047 		struct iov_iter *i, unsigned long offset, size_t bytes)
1048 {
1049 	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
1050 	if (unlikely(!page_copy_sane(page, offset, bytes))) {
1051 		kunmap_atomic(kaddr);
1052 		return 0;
1053 	}
1054 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1055 		kunmap_atomic(kaddr);
1056 		WARN_ON(1);
1057 		return 0;
1058 	}
	/* each step advances the destination cursor "p" by the chunk length */
1059 	iterate_all_kinds(i, bytes, v,
1060 		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
1061 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
1062 				 v.bv_offset, v.bv_len),
1063 		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
1064 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
1065 				 v.bv_offset, v.bv_len)
1066 	)
1067 	kunmap_atomic(kaddr);
1068 	return bytes;
1069 }
1070 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1071 
1072 static inline void pipe_truncate(struct iov_iter *i)
1073 {
1074 	struct pipe_inode_info *pipe = i->pipe;
1075 	unsigned int p_tail = pipe->tail;
1076 	unsigned int p_head = pipe->head;
1077 	unsigned int p_mask = pipe->ring_size - 1;
1078 
1079 	if (!pipe_empty(p_head, p_tail)) {
1080 		struct pipe_buffer *buf;
1081 		unsigned int i_head = i->head;
1082 		size_t off = i->iov_offset;
1083 
1084 		if (off) {
1085 			buf = &pipe->bufs[i_head & p_mask];
1086 			buf->len = off - buf->offset;
1087 			i_head++;
1088 		}
1089 		while (p_head != i_head) {
1090 			p_head--;
1091 			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
1092 		}
1093 
1094 		pipe->head = p_head;
1095 	}
1096 }
1097 
1098 static void pipe_advance(struct iov_iter *i, size_t size)
1099 {
1100 	struct pipe_inode_info *pipe = i->pipe;
1101 	if (unlikely(i->count < size))
1102 		size = i->count;
1103 	if (size) {
1104 		struct pipe_buffer *buf;
1105 		unsigned int p_mask = pipe->ring_size - 1;
1106 		unsigned int i_head = i->head;
1107 		size_t off = i->iov_offset, left = size;
1108 
1109 		if (off) /* make it relative to the beginning of buffer */
1110 			left += off - pipe->bufs[i_head & p_mask].offset;
1111 		while (1) {
1112 			buf = &pipe->bufs[i_head & p_mask];
1113 			if (left <= buf->len)
1114 				break;
1115 			left -= buf->len;
1116 			i_head++;
1117 		}
1118 		i->head = i_head;
1119 		i->iov_offset = buf->offset + left;
1120 	}
1121 	i->count -= size;
1122 	/* ... and discard everything past that point */
1123 	pipe_truncate(i);
1124 }
1125 
1126 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
1127 {
1128 	struct bvec_iter bi;
1129 
1130 	bi.bi_size = i->count;
1131 	bi.bi_bvec_done = i->iov_offset;
1132 	bi.bi_idx = 0;
1133 	bvec_iter_advance(i->bvec, &bi, size);
1134 
1135 	i->bvec += bi.bi_idx;
1136 	i->nr_segs -= bi.bi_idx;
1137 	i->count = bi.bi_size;
1138 	i->iov_offset = bi.bi_bvec_done;
1139 }
1140 
/*
 * Advance iterator @i by @size bytes, dispatching on the iterator type.
 *
 * Pipe iterators are handled by pipe_advance(), which clamps @size to
 * i->count and truncates the pipe past the new position.  Xarray iterators
 * clamp @size to i->count here.  Discard iterators subtract @size from
 * i->count with no clamping.  Bvec iterators go through
 * iov_iter_bvec_advance().  Everything else is stepped with
 * iterate_and_advance(), with 0 passed for every step expression.
 */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		pipe_advance(i, size);
		return;
	}
	if (unlikely(iov_iter_is_discard(i))) {
		/* no segments to walk: only the remaining count changes */
		i->count -= size;
		return;
	}
	if (unlikely(iov_iter_is_xarray(i))) {
		size = min(size, i->count);
		i->iov_offset += size;
		i->count -= size;
		return;
	}
	if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
		return;
	}
	iterate_and_advance(i, size, v, 0, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
1164 
/*
 * Move iterator @i backwards by @unroll bytes, growing i->count accordingly.
 *
 * Does nothing for @unroll == 0.  Warns and returns without changes if
 * @unroll exceeds MAX_RW_COUNT.  For pipe iterators the head/offset are
 * walked back (no further than i->start_head with offset 0) and the pipe is
 * truncated to the new position.  Discard iterators only get their count
 * adjusted.  Xarray iterators may only be reverted within i->iov_offset;
 * going further hits BUG().  For bvec/iovec/kvec iterators the segment
 * pointer is walked backwards; nothing here bounds that walk, so the caller
 * must not revert past the first segment.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;
		while (1) {
			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
			/* bytes of the current buffer that precede the position */
			size_t n = off - b->offset;
			if (unroll < n) {
				off -= unroll;
				break;
			}
			unroll -= n;
			/* landed exactly on the slot the iterator started from */
			if (!unroll && i_head == i->start_head) {
				off = 0;
				break;
			}
			/* step to the previous buffer and resume from its end */
			i_head--;
			b = &pipe->bufs[i_head & p_mask];
			off = b->offset + b->len;
		}
		i->iov_offset = off;
		i->head = i_head;
		pipe_truncate(i);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	/* fast path: the revert stays inside the current segment */
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			/* step back one segment and reclaim it in nr_segs */
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logics for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			/* step back one segment and reclaim it in nr_segs */
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);
1237 
1238 /*
1239  * Return the count of just the current iov_iter segment.
1240  */
1241 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1242 {
1243 	if (unlikely(iov_iter_is_pipe(i)))
1244 		return i->count;	// it is a silly place, anyway
1245 	if (i->nr_segs == 1)
1246 		return i->count;
1247 	if (unlikely(iov_iter_is_discard(i) || iov_iter_is_xarray(i)))
1248 		return i->count;
1249 	if (iov_iter_is_bvec(i))
1250 		return min(i->count, i->bvec->bv_len - i->iov_offset);
1251 	else
1252 		return min(i->count, i->iov->iov_len - i->iov_offset);
1253 }
1254 EXPORT_SYMBOL(iov_iter_single_seg_count);
1255 
1256 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1257 			const struct kvec *kvec, unsigned long nr_segs,
1258 			size_t count)
1259 {
1260 	WARN_ON(direction & ~(READ | WRITE));
1261 	i->type = ITER_KVEC | (direction & (READ | WRITE));
1262 	i->kvec = kvec;
1263 	i->nr_segs = nr_segs;
1264 	i->iov_offset = 0;
1265 	i->count = count;
1266 }
1267 EXPORT_SYMBOL(iov_iter_kvec);
1268 
1269 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1270 			const struct bio_vec *bvec, unsigned long nr_segs,
1271 			size_t count)
1272 {
1273 	WARN_ON(direction & ~(READ | WRITE));
1274 	i->type = ITER_BVEC | (direction & (READ | WRITE));
1275 	i->bvec = bvec;
1276 	i->nr_segs = nr_segs;
1277 	i->iov_offset = 0;
1278 	i->count = count;
1279 }
1280 EXPORT_SYMBOL(iov_iter_bvec);
1281 
1282 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1283 			struct pipe_inode_info *pipe,
1284 			size_t count)
1285 {
1286 	BUG_ON(direction != READ);
1287 	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1288 	i->type = ITER_PIPE | READ;
1289 	i->pipe = pipe;
1290 	i->head = pipe->head;
1291 	i->iov_offset = 0;
1292 	i->count = count;
1293 	i->start_head = i->head;
1294 }
1295 EXPORT_SYMBOL(iov_iter_pipe);
1296 
1297 /**
1298  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1299  * @i: The iterator to initialise.
1300  * @direction: The direction of the transfer.
1301  * @xarray: The xarray to access.
1302  * @start: The start file position.
1303  * @count: The size of the I/O buffer in bytes.
1304  *
1305  * Set up an I/O iterator to either draw data out of the pages attached to an
1306  * inode or to inject data into those pages.  The pages *must* be prevented
1307  * from evaporation, either by taking a ref on them or locking them by the
1308  * caller.
1309  */
1310 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1311 		     struct xarray *xarray, loff_t start, size_t count)
1312 {
1313 	BUG_ON(direction & ~1);
1314 	i->type = ITER_XARRAY | (direction & (READ | WRITE));
1315 	i->xarray = xarray;
1316 	i->xarray_start = start;
1317 	i->count = count;
1318 	i->iov_offset = 0;
1319 }
1320 EXPORT_SYMBOL(iov_iter_xarray);
1321 
1322 /**
1323  * iov_iter_discard - Initialise an I/O iterator that discards data
1324  * @i: The iterator to initialise.
1325  * @direction: The direction of the transfer.
1326  * @count: The size of the I/O buffer in bytes.
1327  *
1328  * Set up an I/O iterator that just discards everything that's written to it.
1329  * It's only available as a READ iterator.
1330  */
1331 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1332 {
1333 	BUG_ON(direction != READ);
1334 	i->type = ITER_DISCARD | READ;
1335 	i->count = count;
1336 	i->iov_offset = 0;
1337 }
1338 EXPORT_SYMBOL(iov_iter_discard);
1339 
/*
 * Return the bitwise OR of the addresses (or page offsets) and lengths of
 * the segments that make up the i->count bytes described by @i.
 *
 * Pipe iterators report i->count, OR-ed with i->iov_offset when the count
 * and offset are non-zero and the current buffer passes allocated().
 * Xarray iterators report the starting position OR-ed with i->count.
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;

	if (unlikely(iov_iter_is_pipe(i))) {
		unsigned int p_mask = i->pipe->ring_size - 1;

		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
			return size | i->iov_offset;
		return size;
	}
	if (unlikely(iov_iter_is_xarray(i)))
		return (i->xarray_start + i->iov_offset) | i->count;
	/*
	 * The user-space step ends in ", 0": iterate_iovec() treats the step's
	 * value as the number of bytes left unprocessed, so 0 keeps the walk
	 * going over every segment.
	 */
	iterate_all_kinds(i, size, v,
		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
		res |= v.bv_offset | v.bv_len,
		res |= (unsigned long)v.iov_base | v.iov_len,
		res |= v.bv_offset | v.bv_len
	)
	return res;
}
EXPORT_SYMBOL(iov_iter_alignment);
1363 
/*
 * Accumulate, by bitwise OR, alignment information about the boundaries
 * between the segments of @i.
 *
 * For each segment visited: if the accumulator is already non-zero the
 * segment's base address (or page offset) is OR-ed in, and if the segment
 * length differs from the running @size the running @size is OR-ed in too.
 * NOTE(review): @size is updated by iterate_all_kinds() as it walks; it is
 * presumed to be the bytes still outstanding, including the current segment -
 * confirm against the macro definition.
 *
 * Pipe and discard iterators are not supported: they trigger WARN_ON and
 * return ~0U (an unsigned int constant widened to unsigned long).
 */
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;

	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return ~0U;
	}

	/* the user-space step ends in ", 0" so the walk never stops early */
	iterate_all_kinds(i, size, v,
		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
			(size != v.iov_len ? size : 0), 0),
		(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
			(size != v.bv_len ? size : 0)),
		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
			(size != v.iov_len ? size : 0)),
		(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
			(size != v.bv_len ? size : 0))
		);
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);
1387 
1388 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1389 				size_t maxsize,
1390 				struct page **pages,
1391 				int iter_head,
1392 				size_t *start)
1393 {
1394 	struct pipe_inode_info *pipe = i->pipe;
1395 	unsigned int p_mask = pipe->ring_size - 1;
1396 	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1397 	if (!n)
1398 		return -EFAULT;
1399 
1400 	maxsize = n;
1401 	n += *start;
1402 	while (n > 0) {
1403 		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1404 		iter_head++;
1405 		n -= PAGE_SIZE;
1406 	}
1407 
1408 	return maxsize;
1409 }
1410 
1411 static ssize_t pipe_get_pages(struct iov_iter *i,
1412 		   struct page **pages, size_t maxsize, unsigned maxpages,
1413 		   size_t *start)
1414 {
1415 	unsigned int iter_head, npages;
1416 	size_t capacity;
1417 
1418 	if (!maxsize)
1419 		return 0;
1420 
1421 	if (!sanity(i))
1422 		return -EFAULT;
1423 
1424 	data_start(i, &iter_head, start);
1425 	/* Amount of free space: some of this one + all after this one */
1426 	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1427 	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1428 
1429 	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1430 }
1431 
/*
 * Fill @pages with up to @nr_pages pages found in @xa, starting at @index,
 * taking a reference on each stored page with get_page().
 *
 * The walk runs under rcu_read_lock() and ends at the first NULL entry or
 * once @nr_pages pages have been stored.  Returns the number of pages stored.
 */
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		/* entry asks for a retry: move on without storing anything */
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		/* store the subpage that corresponds to the current index */
		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}
1458 
1459 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1460 				     struct page **pages, size_t maxsize,
1461 				     unsigned maxpages, size_t *_start_offset)
1462 {
1463 	unsigned nr, offset;
1464 	pgoff_t index, count;
1465 	size_t size = maxsize, actual;
1466 	loff_t pos;
1467 
1468 	if (!size || !maxpages)
1469 		return 0;
1470 
1471 	pos = i->xarray_start + i->iov_offset;
1472 	index = pos >> PAGE_SHIFT;
1473 	offset = pos & ~PAGE_MASK;
1474 	*_start_offset = offset;
1475 
1476 	count = 1;
1477 	if (size > PAGE_SIZE - offset) {
1478 		size -= PAGE_SIZE - offset;
1479 		count += size >> PAGE_SHIFT;
1480 		size &= ~PAGE_MASK;
1481 		if (size)
1482 			count++;
1483 	}
1484 
1485 	if (count > maxpages)
1486 		count = maxpages;
1487 
1488 	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1489 	if (nr == 0)
1490 		return 0;
1491 
1492 	actual = PAGE_SIZE * nr;
1493 	actual -= offset;
1494 	if (nr == count && size > 0) {
1495 		unsigned last_offset = (nr > 1) ? 0 : offset;
1496 		actual -= PAGE_SIZE - (last_offset + size);
1497 	}
1498 	return actual;
1499 }
1500 
/*
 * Obtain references to the pages backing the start of iterator @i.
 *
 * @maxsize is clamped to i->count.  Pipe and xarray iterators are delegated
 * to pipe_get_pages() / iter_xarray_get_pages(); discard iterators yield
 * -EFAULT.  For the remaining types every step expression returns from this
 * function, so only the first non-empty segment is ever examined:
 *
 *  - user iovec: up to @maxpages pages are pinned with get_user_pages_fast()
 *    (FOLL_WRITE when iov_iter_rw(i) != WRITE); *@start receives the offset
 *    into the first page and the return value is the number of bytes covered
 *    by the pinned pages, or the negative error from get_user_pages_fast().
 *  - bvec: a reference is taken on the segment's page, *@start receives
 *    bv_offset and bv_len is returned.
 *  - kvec: -EFAULT.
 *
 * Returns 0 if the walk visits no segment at all.
 */
ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	if (maxsize > i->count)
		maxsize = i->count;

	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (unlikely(iov_iter_is_xarray(i)))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	if (unlikely(iov_iter_is_discard(i)))
		return -EFAULT;

	iterate_all_kinds(i, maxsize, v, ({
		unsigned long addr = (unsigned long)v.iov_base;
		/* len is measured from the start of the first page */
		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
		int n;
		int res;

		if (len > maxpages * PAGE_SIZE)
			len = maxpages * PAGE_SIZE;
		addr &= ~(PAGE_SIZE - 1);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
				pages);
		if (unlikely(res < 0))
			return res;
		/* fewer pages than asked for: count whole pages only */
		return (res == n ? len : res * PAGE_SIZE) - *start;
	0;}),({
		/* can't be more than PAGE_SIZE */
		*start = v.bv_offset;
		get_page(*pages = v.bv_page);
		return v.bv_len;
	}),({
		return -EFAULT;
	}),
	0
	)
	return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages);
1544 
1545 static struct page **get_pages_array(size_t n)
1546 {
1547 	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1548 }
1549 
1550 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1551 		   struct page ***pages, size_t maxsize,
1552 		   size_t *start)
1553 {
1554 	struct page **p;
1555 	unsigned int iter_head, npages;
1556 	ssize_t n;
1557 
1558 	if (!maxsize)
1559 		return 0;
1560 
1561 	if (!sanity(i))
1562 		return -EFAULT;
1563 
1564 	data_start(i, &iter_head, start);
1565 	/* Amount of free space: some of this one + all after this one */
1566 	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1567 	n = npages * PAGE_SIZE - *start;
1568 	if (maxsize > n)
1569 		maxsize = n;
1570 	else
1571 		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1572 	p = get_pages_array(npages);
1573 	if (!p)
1574 		return -ENOMEM;
1575 	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1576 	if (n > 0)
1577 		*pages = p;
1578 	else
1579 		kvfree(p);
1580 	return n;
1581 }
1582 
1583 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1584 					   struct page ***pages, size_t maxsize,
1585 					   size_t *_start_offset)
1586 {
1587 	struct page **p;
1588 	unsigned nr, offset;
1589 	pgoff_t index, count;
1590 	size_t size = maxsize, actual;
1591 	loff_t pos;
1592 
1593 	if (!size)
1594 		return 0;
1595 
1596 	pos = i->xarray_start + i->iov_offset;
1597 	index = pos >> PAGE_SHIFT;
1598 	offset = pos & ~PAGE_MASK;
1599 	*_start_offset = offset;
1600 
1601 	count = 1;
1602 	if (size > PAGE_SIZE - offset) {
1603 		size -= PAGE_SIZE - offset;
1604 		count += size >> PAGE_SHIFT;
1605 		size &= ~PAGE_MASK;
1606 		if (size)
1607 			count++;
1608 	}
1609 
1610 	p = get_pages_array(count);
1611 	if (!p)
1612 		return -ENOMEM;
1613 	*pages = p;
1614 
1615 	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1616 	if (nr == 0)
1617 		return 0;
1618 
1619 	actual = PAGE_SIZE * nr;
1620 	actual -= offset;
1621 	if (nr == count && size > 0) {
1622 		unsigned last_offset = (nr > 1) ? 0 : offset;
1623 		actual -= PAGE_SIZE - (last_offset + size);
1624 	}
1625 	return actual;
1626 }
1627 
/*
 * Like iov_iter_get_pages(), but the page-pointer array is allocated here
 * with get_pages_array() and returned through *@pages.
 *
 * @maxsize is clamped to i->count.  Pipe and xarray iterators are delegated
 * to their *_get_pages_alloc() helpers; discard iterators yield -EFAULT.
 * For the remaining types every step expression returns from this function,
 * so only the first non-empty segment is ever examined:
 *
 *  - user iovec: an array for all pages of the segment is allocated and the
 *    pages are pinned with get_user_pages_fast() (FOLL_WRITE when
 *    iov_iter_rw(i) != WRITE).  On a negative result the array is freed and
 *    the error returned; otherwise *@pages is set and the number of bytes
 *    covered by the pinned pages is returned.
 *  - bvec: a one-entry array is allocated, a reference is taken on the
 *    segment's page and bv_len is returned.
 *  - kvec: -EFAULT.
 *
 * -ENOMEM is returned when an array allocation fails.  Returns 0 if the walk
 * visits no segment at all.
 */
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;

	if (maxsize > i->count)
		maxsize = i->count;

	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (unlikely(iov_iter_is_xarray(i)))
		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
	if (unlikely(iov_iter_is_discard(i)))
		return -EFAULT;

	iterate_all_kinds(i, maxsize, v, ({
		unsigned long addr = (unsigned long)v.iov_base;
		/* len is measured from the start of the first page */
		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
		int n;
		int res;

		addr &= ~(PAGE_SIZE - 1);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
		if (unlikely(res < 0)) {
			kvfree(p);
			return res;
		}
		*pages = p;
		/* fewer pages than asked for: count whole pages only */
		return (res == n ? len : res * PAGE_SIZE) - *start;
	0;}),({
		/* can't be more than PAGE_SIZE */
		*start = v.bv_offset;
		*pages = p = get_pages_array(1);
		if (!p)
			return -ENOMEM;
		get_page(*p = v.bv_page);
		return v.bv_len;
	}),({
		return -EFAULT;
	}), 0
	)
	return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1678 
/*
 * Copy up to @bytes bytes from iterator @i to @addr while folding a checksum
 * of the copied data into *@csum, advancing @i via iterate_and_advance().
 *
 * Pipe and discard iterators are rejected with WARN_ON and a return of 0.
 * For user-space segments a zero result from csum_and_copy_from_user() is
 * treated as a failed copy: the step then reports the whole segment as
 * unprocessed and neither the checksum nor the running offset is updated.
 *
 * Returns @bytes as left behind by iterate_and_advance().
 */
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *to = addr;
	__wsum sum, next;
	/* byte offset of the next chunk within the data checksummed so far */
	size_t off = 0;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	/* "(to += len) - len" advances the cursor and yields its old value */
	iterate_and_advance(i, bytes, v, ({
		next = csum_and_copy_from_user(v.iov_base,
					       (to += v.iov_len) - v.iov_len,
					       v.iov_len);
		if (next) {
			sum = csum_block_add(sum, next, off);
			off += v.iov_len;
		}
		next ? 0 : v.iov_len;
	}), ({
		char *p = kmap_atomic(v.bv_page);
		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
				      p + v.bv_offset, v.bv_len,
				      sum, off);
		kunmap_atomic(p);
		off += v.bv_len;
	}),({
		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
				      v.iov_base, v.iov_len,
				      sum, off);
		off += v.iov_len;
	}), ({
		char *p = kmap_atomic(v.bv_page);
		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
				      p + v.bv_offset, v.bv_len,
				      sum, off);
		kunmap_atomic(p);
		off += v.bv_len;
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);
1724 
/*
 * All-or-nothing variant of csum_and_copy_from_iter().
 *
 * Returns false, leaving *@csum unwritten and @i unadvanced, if @i is a pipe
 * or discard iterator (with WARN_ON), if @i holds fewer than @bytes bytes,
 * or if csum_and_copy_from_user() returns zero for a user-space segment.
 * Data may already have been written to @addr when that last case occurs.
 * On success *@csum is updated, @i is advanced by @bytes and true is
 * returned.
 */
bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *to = addr;
	__wsum sum, next;
	/* byte offset of the next chunk within the data checksummed so far */
	size_t off = 0;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return false;
	}
	if (unlikely(i->count < bytes))
		return false;
	/* "(to += len) - len" advances the cursor and yields its old value */
	iterate_all_kinds(i, bytes, v, ({
		next = csum_and_copy_from_user(v.iov_base,
					       (to += v.iov_len) - v.iov_len,
					       v.iov_len);
		/* bail out of the whole function on a failed user copy */
		if (!next)
			return false;
		sum = csum_block_add(sum, next, off);
		off += v.iov_len;
		0;
	}), ({
		char *p = kmap_atomic(v.bv_page);
		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
				      p + v.bv_offset, v.bv_len,
				      sum, off);
		kunmap_atomic(p);
		off += v.bv_len;
	}),({
		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
				      v.iov_base, v.iov_len,
				      sum, off);
		off += v.iov_len;
	}), ({
		char *p = kmap_atomic(v.bv_page);
		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
				      p + v.bv_offset, v.bv_len,
				      sum, off);
		kunmap_atomic(p);
		off += v.bv_len;
	})
	)
	*csum = sum;
	iov_iter_advance(i, bytes);
	return true;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
1773 
/*
 * Copy up to @bytes bytes from @addr into iterator @i while folding a
 * checksum of the copied data into the struct csum_state that @_csstate
 * points to (its csum and off members), advancing @i via
 * iterate_and_advance().
 *
 * Pipe iterators are delegated to csum_and_copy_to_pipe_iter().  Discard
 * iterators trigger WARN_ON and return 0.  For user-space segments a zero
 * result from csum_and_copy_to_user() is treated as a failed copy: the step
 * then reports the whole segment as unprocessed and neither the checksum nor
 * the offset is updated.
 *
 * Returns @bytes as left behind by iterate_and_advance().
 */
size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	const char *from = addr;
	__wsum sum, next;
	size_t off;

	if (unlikely(iov_iter_is_pipe(i)))
		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);

	/* continue from the checksum and offset saved by earlier calls */
	sum = csstate->csum;
	off = csstate->off;
	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}
	/* "(from += len) - len" advances the cursor and yields its old value */
	iterate_and_advance(i, bytes, v, ({
		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
					     v.iov_base,
					     v.iov_len);
		if (next) {
			sum = csum_block_add(sum, next, off);
			off += v.iov_len;
		}
		next ? 0 : v.iov_len;
	}), ({
		char *p = kmap_atomic(v.bv_page);
		sum = csum_and_memcpy(p + v.bv_offset,
				      (from += v.bv_len) - v.bv_len,
				      v.bv_len, sum, off);
		kunmap_atomic(p);
		off += v.bv_len;
	}),({
		sum = csum_and_memcpy(v.iov_base,
				     (from += v.iov_len) - v.iov_len,
				     v.iov_len, sum, off);
		off += v.iov_len;
	}), ({
		char *p = kmap_atomic(v.bv_page);
		sum = csum_and_memcpy(p + v.bv_offset,
				      (from += v.bv_len) - v.bv_len,
				      v.bv_len, sum, off);
		kunmap_atomic(p);
		off += v.bv_len;
	})
	)
	csstate->csum = sum;
	csstate->off = off;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);
1826 
1827 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1828 		struct iov_iter *i)
1829 {
1830 #ifdef CONFIG_CRYPTO_HASH
1831 	struct ahash_request *hash = hashp;
1832 	struct scatterlist sg;
1833 	size_t copied;
1834 
1835 	copied = copy_to_iter(addr, bytes, i);
1836 	sg_init_one(&sg, addr, copied);
1837 	ahash_request_set_crypt(hash, &sg, NULL, copied);
1838 	crypto_ahash_update(hash);
1839 	return copied;
1840 #else
1841 	return 0;
1842 #endif
1843 }
1844 EXPORT_SYMBOL(hash_and_copy_to_iter);
1845 
/*
 * Count the pages spanned by the i->count bytes that @i describes, capped
 * at @maxpages.
 *
 * Returns 0 for an empty or discard iterator, and for a pipe iterator that
 * fails sanity().  Pipe iterators report the free pipe slots from the
 * current data position onwards; xarray iterators compute the page span
 * from the starting offset and size; all other types add up the per-segment
 * page spans, returning @maxpages as soon as the running total reaches it.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	size_t size = i->count;
	int npages = 0;

	if (!size)
		return 0;
	if (unlikely(iov_iter_is_discard(i)))
		return 0;

	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int iter_head;
		size_t off;

		if (!sanity(i))
			return 0;

		data_start(i, &iter_head, &off);
		/* some of this one + all after this one */
		npages = pipe_space_for_user(iter_head, pipe->tail, pipe);
		if (npages >= maxpages)
			return maxpages;
	} else if (unlikely(iov_iter_is_xarray(i))) {
		unsigned offset;

		offset = (i->xarray_start + i->iov_offset) & ~PAGE_MASK;

		/* first (possibly partial) page, full pages, partial tail */
		npages = 1;
		if (size > PAGE_SIZE - offset) {
			size -= PAGE_SIZE - offset;
			npages += size >> PAGE_SHIFT;
			size &= ~PAGE_MASK;
			if (size)
				npages++;
		}
		if (npages >= maxpages)
			return maxpages;
	} else iterate_all_kinds(i, size, v, ({
		unsigned long p = (unsigned long)v.iov_base;
		/* index past the segment's last page minus its first page index */
		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
			- p / PAGE_SIZE;
		if (npages >= maxpages)
			return maxpages;
	0;}),({
		npages++;
		if (npages >= maxpages)
			return maxpages;
	}),({
		unsigned long p = (unsigned long)v.iov_base;
		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
			- p / PAGE_SIZE;
		if (npages >= maxpages)
			return maxpages;
	}),
	0
	)
	return npages;
}
EXPORT_SYMBOL(iov_iter_npages);
1906 
1907 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1908 {
1909 	*new = *old;
1910 	if (unlikely(iov_iter_is_pipe(new))) {
1911 		WARN_ON(1);
1912 		return NULL;
1913 	}
1914 	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1915 		return NULL;
1916 	if (iov_iter_is_bvec(new))
1917 		return new->bvec = kmemdup(new->bvec,
1918 				    new->nr_segs * sizeof(struct bio_vec),
1919 				    flags);
1920 	else
1921 		/* iovec and kvec have identical layout */
1922 		return new->iov = kmemdup(new->iov,
1923 				   new->nr_segs * sizeof(struct iovec),
1924 				   flags);
1925 }
1926 EXPORT_SYMBOL(dup_iter);
1927 
/*
 * Read @nr_segs compat-layout iovec entries from user space at @uvec and
 * store them, widened, into @iov.
 *
 * Returns 0 on success, -EFAULT if user_access_begin() refuses the range or
 * an unsafe_get_user() takes its error label, and -EINVAL if an entry's
 * length is negative when read as compat_ssize_t.
 */
static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	/* ret stays -EFAULT for the unsafe_get_user() error path */
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	/* every path that passed user_access_begin() ends the access here */
	user_access_end();
	return ret;
}
1959 
1960 static int copy_iovec_from_user(struct iovec *iov,
1961 		const struct iovec __user *uvec, unsigned long nr_segs)
1962 {
1963 	unsigned long seg;
1964 
1965 	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1966 		return -EFAULT;
1967 	for (seg = 0; seg < nr_segs; seg++) {
1968 		if ((ssize_t)iov[seg].iov_len < 0)
1969 			return -EINVAL;
1970 	}
1971 
1972 	return 0;
1973 }
1974 
1975 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1976 		unsigned long nr_segs, unsigned long fast_segs,
1977 		struct iovec *fast_iov, bool compat)
1978 {
1979 	struct iovec *iov = fast_iov;
1980 	int ret;
1981 
1982 	/*
1983 	 * SuS says "The readv() function *may* fail if the iovcnt argument was
1984 	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
1985 	 * traditionally returned zero for zero segments, so...
1986 	 */
1987 	if (nr_segs == 0)
1988 		return iov;
1989 	if (nr_segs > UIO_MAXIOV)
1990 		return ERR_PTR(-EINVAL);
1991 	if (nr_segs > fast_segs) {
1992 		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1993 		if (!iov)
1994 			return ERR_PTR(-ENOMEM);
1995 	}
1996 
1997 	if (compat)
1998 		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1999 	else
2000 		ret = copy_iovec_from_user(iov, uvec, nr_segs);
2001 	if (ret) {
2002 		if (iov != fast_iov)
2003 			kfree(iov);
2004 		return ERR_PTR(ret);
2005 	}
2006 
2007 	return iov;
2008 }
2009 
2010 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
2011 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
2012 		 struct iov_iter *i, bool compat)
2013 {
2014 	ssize_t total_len = 0;
2015 	unsigned long seg;
2016 	struct iovec *iov;
2017 
2018 	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
2019 	if (IS_ERR(iov)) {
2020 		*iovp = NULL;
2021 		return PTR_ERR(iov);
2022 	}
2023 
2024 	/*
2025 	 * According to the Single Unix Specification we should return EINVAL if
2026 	 * an element length is < 0 when cast to ssize_t or if the total length
2027 	 * would overflow the ssize_t return value of the system call.
2028 	 *
2029 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
2030 	 * overflow case.
2031 	 */
2032 	for (seg = 0; seg < nr_segs; seg++) {
2033 		ssize_t len = (ssize_t)iov[seg].iov_len;
2034 
2035 		if (!access_ok(iov[seg].iov_base, len)) {
2036 			if (iov != *iovp)
2037 				kfree(iov);
2038 			*iovp = NULL;
2039 			return -EFAULT;
2040 		}
2041 
2042 		if (len > MAX_RW_COUNT - total_len) {
2043 			len = MAX_RW_COUNT - total_len;
2044 			iov[seg].iov_len = len;
2045 		}
2046 		total_len += len;
2047 	}
2048 
2049 	iov_iter_init(i, type, iov, nr_segs, total_len);
2050 	if (iov == *iovp)
2051 		*iovp = NULL;
2052 	else
2053 		*iovp = iov;
2054 	return total_len;
2055 }
2056 
/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Whether @uvec is read in the compat layout is decided by
 * in_compat_syscall().
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);
2087 
2088 int import_single_range(int rw, void __user *buf, size_t len,
2089 		 struct iovec *iov, struct iov_iter *i)
2090 {
2091 	if (len > MAX_RW_COUNT)
2092 		len = MAX_RW_COUNT;
2093 	if (unlikely(!access_ok(buf, len)))
2094 		return -EFAULT;
2095 
2096 	iov->iov_base = buf;
2097 	iov->iov_len = len;
2098 	iov_iter_init(i, rw, iov, 1, len);
2099 	return 0;
2100 }
2101 EXPORT_SYMBOL(import_single_range);
2102 
/*
 * Call @f(&vec, @context) for successive kernel-addressable ranges covering
 * up to @bytes bytes of iterator @i.
 *
 * Page-based segments are mapped with kmap() for the duration of the call
 * and unmapped afterwards; kvec segments are passed as they are.  The step
 * for user-space segments is the constant -EINVAL, i.e. @f is never called
 * for them.
 *
 * Returns 0 if @bytes is 0; otherwise the value of the last @f call, or
 * -EINVAL if @f was never called.
 * NOTE(review): iterate_kvec() discards the step value, so a non-zero
 * return from @f presumably does not stop the walk over kvec segments -
 * confirm for the other iterator types.
 */
int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
			    int (*f)(struct kvec *vec, void *context),
			    void *context)
{
	/* w is the range handed to @f; filled in afresh for every segment */
	struct kvec w;
	int err = -EINVAL;
	if (!bytes)
		return 0;

	iterate_all_kinds(i, bytes, v, -EINVAL, ({
		w.iov_base = kmap(v.bv_page) + v.bv_offset;
		w.iov_len = v.bv_len;
		err = f(&w, context);
		kunmap(v.bv_page);
		err;}), ({
		w = v;
		err = f(&w, context);}), ({
		w.iov_base = kmap(v.bv_page) + v.bv_offset;
		w.iov_len = v.bv_len;
		err = f(&w, context);
		kunmap(v.bv_page);
		err;})
	)
	return err;
}
EXPORT_SYMBOL(iov_iter_for_each_range);
2129