xref: /linux/lib/iov_iter.c (revision 7fc2cd2e4b398c57c9cf961cfea05eadbf34c05c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/export.h>
3 #include <linux/bvec.h>
4 #include <linux/fault-inject-usercopy.h>
5 #include <linux/uio.h>
6 #include <linux/pagemap.h>
7 #include <linux/highmem.h>
8 #include <linux/slab.h>
9 #include <linux/vmalloc.h>
10 #include <linux/splice.h>
11 #include <linux/compat.h>
12 #include <linux/scatterlist.h>
13 #include <linux/instrumented.h>
14 #include <linux/iov_iter.h>
15 
16 static __always_inline
17 size_t copy_to_user_iter(void __user *iter_to, size_t progress,
18 			 size_t len, void *from, void *priv2)
19 {
20 	if (should_fail_usercopy())
21 		return len;
22 	if (access_ok(iter_to, len)) {
23 		from += progress;
24 		instrument_copy_to_user(iter_to, from, len);
25 		len = raw_copy_to_user(iter_to, from, len);
26 	}
27 	return len;
28 }
29 
30 static __always_inline
31 size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
32 				 size_t len, void *from, void *priv2)
33 {
34 	ssize_t res;
35 
36 	if (should_fail_usercopy())
37 		return len;
38 
39 	from += progress;
40 	res = copy_to_user_nofault(iter_to, from, len);
41 	return res < 0 ? len : res;
42 }
43 
44 static __always_inline
45 size_t copy_from_user_iter(void __user *iter_from, size_t progress,
46 			   size_t len, void *to, void *priv2)
47 {
48 	size_t res = len;
49 
50 	if (should_fail_usercopy())
51 		return len;
52 	if (can_do_masked_user_access()) {
53 		iter_from = mask_user_address(iter_from);
54 	} else {
55 		if (!access_ok(iter_from, len))
56 			return res;
57 
58 		/*
59 		 * Ensure that bad access_ok() speculation will not
60 		 * lead to nasty side effects *after* the copy is
61 		 * finished:
62 		 */
63 		barrier_nospec();
64 	}
65 	to += progress;
66 	instrument_copy_from_user_before(to, iter_from, len);
67 	res = raw_copy_from_user(to, iter_from, len);
68 	instrument_copy_from_user_after(to, iter_from, len, res);
69 
70 	return res;
71 }
72 
73 static __always_inline
74 size_t memcpy_to_iter(void *iter_to, size_t progress,
75 		      size_t len, void *from, void *priv2)
76 {
77 	memcpy(iter_to, from + progress, len);
78 	return 0;
79 }
80 
81 static __always_inline
82 size_t memcpy_from_iter(void *iter_from, size_t progress,
83 			size_t len, void *to, void *priv2)
84 {
85 	memcpy(to + progress, iter_from, len);
86 	return 0;
87 }
88 
89 /*
90  * fault_in_iov_iter_readable - fault in iov iterator for reading
91  * @i: iterator
92  * @size: maximum length
93  *
94  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
95  * @size.  For each iovec, fault in each page that constitutes the iovec.
96  *
97  * Returns the number of bytes not faulted in (like copy_to_user() and
98  * copy_from_user()).
99  *
100  * Always returns 0 for non-userspace iterators.
101  */
102 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
103 {
104 	if (iter_is_ubuf(i)) {
105 		size_t n = min(size, iov_iter_count(i));
106 		n -= fault_in_readable(i->ubuf + i->iov_offset, n);
107 		return size - n;
108 	} else if (iter_is_iovec(i)) {
109 		size_t count = min(size, iov_iter_count(i));
110 		const struct iovec *p;
111 		size_t skip;
112 
113 		size -= count;
114 		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
115 			size_t len = min(count, p->iov_len - skip);
116 			size_t ret;
117 
118 			if (unlikely(!len))
119 				continue;
120 			ret = fault_in_readable(p->iov_base + skip, len);
121 			count -= len - ret;
122 			if (ret)
123 				break;
124 		}
125 		return count + size;
126 	}
127 	return 0;
128 }
129 EXPORT_SYMBOL(fault_in_iov_iter_readable);
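
/*
 * Editorial usage sketch (not part of the upstream file): the typical
 * write-path pattern is to prefault the user pages with
 * fault_in_iov_iter_readable() and then copy with page faults disabled.
 * The helper name example_prefault_and_copy() and its buffer handling
 * are assumptions for illustration only.
 */
static __maybe_unused ssize_t example_prefault_and_copy(void *kbuf, size_t len,
							struct iov_iter *from)
{
	size_t copied;

	/* Try to make the user pages resident before taking any locks. */
	if (fault_in_iov_iter_readable(from, len) == len)
		return -EFAULT;		/* nothing could be faulted in */

	pagefault_disable();
	copied = copy_from_iter(kbuf, len, from);
	pagefault_enable();

	return copied;			/* may be short; callers retry */
}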
130 
131 /*
132  * fault_in_iov_iter_writeable - fault in iov iterator for writing
133  * @i: iterator
134  * @size: maximum length
135  *
136  * Faults in the iterator using get_user_pages(), i.e., without triggering
137  * hardware page faults.  This is primarily useful when we already know that
138  * some or all of the pages in @i aren't in memory.
139  *
140  * Returns the number of bytes not faulted in, like copy_to_user() and
141  * copy_from_user().
142  *
143  * Always returns 0 for non-userspace iterators.
144  */
145 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
146 {
147 	if (iter_is_ubuf(i)) {
148 		size_t n = min(size, iov_iter_count(i));
149 		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
150 		return size - n;
151 	} else if (iter_is_iovec(i)) {
152 		size_t count = min(size, iov_iter_count(i));
153 		const struct iovec *p;
154 		size_t skip;
155 
156 		size -= count;
157 		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
158 			size_t len = min(count, p->iov_len - skip);
159 			size_t ret;
160 
161 			if (unlikely(!len))
162 				continue;
163 			ret = fault_in_safe_writeable(p->iov_base + skip, len);
164 			count -= len - ret;
165 			if (ret)
166 				break;
167 		}
168 		return count + size;
169 	}
170 	return 0;
171 }
172 EXPORT_SYMBOL(fault_in_iov_iter_writeable);
173 
174 void iov_iter_init(struct iov_iter *i, unsigned int direction,
175 			const struct iovec *iov, unsigned long nr_segs,
176 			size_t count)
177 {
178 	WARN_ON(direction & ~(READ | WRITE));
179 	*i = (struct iov_iter) {
180 		.iter_type = ITER_IOVEC,
181 		.nofault = false,
182 		.data_source = direction,
183 		.__iov = iov,
184 		.nr_segs = nr_segs,
185 		.iov_offset = 0,
186 		.count = count
187 	};
188 }
189 EXPORT_SYMBOL(iov_iter_init);
190 
191 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
192 {
193 	if (WARN_ON_ONCE(i->data_source))
194 		return 0;
195 	if (user_backed_iter(i))
196 		might_fault();
197 	return iterate_and_advance(i, bytes, (void *)addr,
198 				   copy_to_user_iter, memcpy_to_iter);
199 }
200 EXPORT_SYMBOL(_copy_to_iter);
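
/*
 * Editorial usage sketch (not part of the upstream file): a read-style path
 * that hands a kernel buffer to whatever the iterator describes (user memory,
 * kvec, bvec, ...).  copy_to_iter() is the checked wrapper around
 * _copy_to_iter(); the helper name example_emit_message() is an assumption.
 */
static __maybe_unused size_t example_emit_message(struct iov_iter *to)
{
	static const char msg[] = "hello from iov_iter\n";
	size_t n = min(iov_iter_count(to), sizeof(msg));

	/* Returns the number of bytes actually copied (may be short). */
	return copy_to_iter(msg, n, to);
}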
201 
202 #ifdef CONFIG_ARCH_HAS_COPY_MC
203 static __always_inline
204 size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
205 			    size_t len, void *from, void *priv2)
206 {
207 	if (access_ok(iter_to, len)) {
208 		from += progress;
209 		instrument_copy_to_user(iter_to, from, len);
210 		len = copy_mc_to_user(iter_to, from, len);
211 	}
212 	return len;
213 }
214 
215 static __always_inline
216 size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
217 			 size_t len, void *from, void *priv2)
218 {
219 	return copy_mc_to_kernel(iter_to, from + progress, len);
220 }
221 
222 /**
223  * _copy_mc_to_iter - copy to iter with source memory error exception handling
224  * @addr: source kernel address
225  * @bytes: total transfer length
226  * @i: destination iterator
227  *
228  * The pmem driver deploys this for the dax operation
229  * (dax_copy_to_iter()) for dax reads (bypassing the page cache and the
230  * block layer). Upon #MC, read(2) aborts and returns EIO or the number
231  * of bytes successfully copied.
232  *
233  * The main differences between this and typical _copy_to_iter() are:
234  *
235  * * Typical tail/residue handling after a fault retries the copy
236  *   byte-by-byte until the fault happens again. Re-triggering machine
237  *   checks is potentially fatal so the implementation uses source
238  *   alignment and poison alignment assumptions to avoid re-triggering
239  *   hardware exceptions.
240  *
241  * * ITER_KVEC and ITER_BVEC can return short copies.  Compare to
242  *   copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
243  *
244  * Return: number of bytes copied (may be %0)
245  */
246 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
247 {
248 	if (WARN_ON_ONCE(i->data_source))
249 		return 0;
250 	if (user_backed_iter(i))
251 		might_fault();
252 	return iterate_and_advance(i, bytes, (void *)addr,
253 				   copy_to_user_iter_mc, memcpy_to_iter_mc);
254 }
255 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
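
/*
 * Editorial usage sketch (not part of the upstream file): a dax-style read
 * that surfaces -EIO when a machine check prevents any data from being
 * copied.  The helper name example_mc_read() is an assumption.
 */
static __maybe_unused ssize_t example_mc_read(const void *kaddr, size_t len,
					      struct iov_iter *to)
{
	size_t copied = _copy_mc_to_iter(kaddr, len, to);

	/* A short copy means poison (or a fault) was hit part-way through. */
	return copied ? copied : -EIO;
}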
256 #endif /* CONFIG_ARCH_HAS_COPY_MC */
257 
258 static __always_inline
259 size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
260 {
261 	return iterate_and_advance(i, bytes, addr,
262 				   copy_from_user_iter, memcpy_from_iter);
263 }
264 
265 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
266 {
267 	if (WARN_ON_ONCE(!i->data_source))
268 		return 0;
269 
270 	if (user_backed_iter(i))
271 		might_fault();
272 	return __copy_from_iter(addr, bytes, i);
273 }
274 EXPORT_SYMBOL(_copy_from_iter);
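
/*
 * Editorial usage sketch (not part of the upstream file): drain @len bytes of
 * an iterator into a freshly allocated kernel buffer.  The helper name
 * example_dup_iter_data() is an assumption.
 */
static __maybe_unused void *example_dup_iter_data(struct iov_iter *from, size_t len)
{
	void *buf = kmalloc(len, GFP_KERNEL);

	if (!buf)
		return NULL;
	if (copy_from_iter(buf, len, from) != len) {
		kfree(buf);	/* short copy: fault or exhausted iterator */
		return NULL;
	}
	return buf;
}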
275 
276 static __always_inline
277 size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
278 				   size_t len, void *to, void *priv2)
279 {
280 	return __copy_from_user_inatomic_nocache(to + progress, iter_from, len);
281 }
282 
283 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
284 {
285 	if (WARN_ON_ONCE(!i->data_source))
286 		return 0;
287 
288 	return iterate_and_advance(i, bytes, addr,
289 				   copy_from_user_iter_nocache,
290 				   memcpy_from_iter);
291 }
292 EXPORT_SYMBOL(_copy_from_iter_nocache);
293 
294 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
295 static __always_inline
296 size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
297 				      size_t len, void *to, void *priv2)
298 {
299 	return __copy_from_user_flushcache(to + progress, iter_from, len);
300 }
301 
302 static __always_inline
303 size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
304 				   size_t len, void *to, void *priv2)
305 {
306 	memcpy_flushcache(to + progress, iter_from, len);
307 	return 0;
308 }
309 
310 /**
311  * _copy_from_iter_flushcache - write destination through cpu cache
312  * @addr: destination kernel address
313  * @bytes: total transfer length
314  * @i: source iterator
315  *
316  * The pmem driver arranges for filesystem-dax to use this facility via
317  * dax_copy_from_iter() for ensuring that writes to persistent memory
318  * are flushed through the CPU cache. It differs from
319  * _copy_from_iter_nocache() in that it guarantees all data is flushed for
320  * all iterator types. _copy_from_iter_nocache() only attempts to
321  * bypass the cache for the ITER_IOVEC case, and on some archs may use
322  * instructions that strand dirty data in the cache.
323  *
324  * Return: number of bytes copied (may be %0)
325  */
326 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
327 {
328 	if (WARN_ON_ONCE(!i->data_source))
329 		return 0;
330 
331 	return iterate_and_advance(i, bytes, addr,
332 				   copy_from_user_iter_flushcache,
333 				   memcpy_from_iter_flushcache);
334 }
335 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
336 #endif
337 
338 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
339 {
340 	struct page *head;
341 	size_t v = n + offset;
342 
343 	/*
344 	 * The general case needs to access the page order in order
345 	 * to compute the page size.
346 	 * However, we mostly deal with order-0 pages and thus can
347 	 * avoid a possible cache line miss for requests that fit all
348 	 * page orders.
349 	 */
350 	if (n <= v && v <= PAGE_SIZE)
351 		return true;
352 
353 	head = compound_head(page);
354 	v += (page - head) << PAGE_SHIFT;
355 
356 	if (WARN_ON(n > v || v > page_size(head)))
357 		return false;
358 	return true;
359 }
360 
361 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
362 			 struct iov_iter *i)
363 {
364 	size_t res = 0;
365 	if (!page_copy_sane(page, offset, bytes))
366 		return 0;
367 	if (WARN_ON_ONCE(i->data_source))
368 		return 0;
369 	page += offset / PAGE_SIZE; // first subpage
370 	offset %= PAGE_SIZE;
371 	while (1) {
372 		void *kaddr = kmap_local_page(page);
373 		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
374 		n = _copy_to_iter(kaddr + offset, n, i);
375 		kunmap_local(kaddr);
376 		res += n;
377 		bytes -= n;
378 		if (!bytes || !n)
379 			break;
380 		offset += n;
381 		if (offset == PAGE_SIZE) {
382 			page++;
383 			offset = 0;
384 		}
385 	}
386 	return res;
387 }
388 EXPORT_SYMBOL(copy_page_to_iter);
389 
390 size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
391 				 struct iov_iter *i)
392 {
393 	size_t res = 0;
394 
395 	if (!page_copy_sane(page, offset, bytes))
396 		return 0;
397 	if (WARN_ON_ONCE(i->data_source))
398 		return 0;
399 	page += offset / PAGE_SIZE; // first subpage
400 	offset %= PAGE_SIZE;
401 	while (1) {
402 		void *kaddr = kmap_local_page(page);
403 		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
404 
405 		n = iterate_and_advance(i, n, kaddr + offset,
406 					copy_to_user_iter_nofault,
407 					memcpy_to_iter);
408 		kunmap_local(kaddr);
409 		res += n;
410 		bytes -= n;
411 		if (!bytes || !n)
412 			break;
413 		offset += n;
414 		if (offset == PAGE_SIZE) {
415 			page++;
416 			offset = 0;
417 		}
418 	}
419 	return res;
420 }
421 EXPORT_SYMBOL(copy_page_to_iter_nofault);
422 
423 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
424 			 struct iov_iter *i)
425 {
426 	size_t res = 0;
427 	if (!page_copy_sane(page, offset, bytes))
428 		return 0;
429 	page += offset / PAGE_SIZE; // first subpage
430 	offset %= PAGE_SIZE;
431 	while (1) {
432 		void *kaddr = kmap_local_page(page);
433 		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
434 		n = _copy_from_iter(kaddr + offset, n, i);
435 		kunmap_local(kaddr);
436 		res += n;
437 		bytes -= n;
438 		if (!bytes || !n)
439 			break;
440 		offset += n;
441 		if (offset == PAGE_SIZE) {
442 			page++;
443 			offset = 0;
444 		}
445 	}
446 	return res;
447 }
448 EXPORT_SYMBOL(copy_page_from_iter);
449 
450 static __always_inline
451 size_t zero_to_user_iter(void __user *iter_to, size_t progress,
452 			 size_t len, void *priv, void *priv2)
453 {
454 	return clear_user(iter_to, len);
455 }
456 
457 static __always_inline
458 size_t zero_to_iter(void *iter_to, size_t progress,
459 		    size_t len, void *priv, void *priv2)
460 {
461 	memset(iter_to, 0, len);
462 	return 0;
463 }
464 
465 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
466 {
467 	return iterate_and_advance(i, bytes, NULL,
468 				   zero_to_user_iter, zero_to_iter);
469 }
470 EXPORT_SYMBOL(iov_iter_zero);
471 
472 size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset,
473 		size_t bytes, struct iov_iter *i)
474 {
475 	size_t n, copied = 0;
476 
477 	if (!page_copy_sane(&folio->page, offset, bytes))
478 		return 0;
479 	if (WARN_ON_ONCE(!i->data_source))
480 		return 0;
481 
482 	do {
483 		char *to = kmap_local_folio(folio, offset);
484 
485 		n = bytes - copied;
486 		if (folio_test_partial_kmap(folio) &&
487 		    n > PAGE_SIZE - offset_in_page(offset))
488 			n = PAGE_SIZE - offset_in_page(offset);
489 
490 		pagefault_disable();
491 		n = __copy_from_iter(to, n, i);
492 		pagefault_enable();
493 		kunmap_local(to);
494 		copied += n;
495 		offset += n;
496 	} while (copied != bytes && n > 0);
497 
498 	return copied;
499 }
500 EXPORT_SYMBOL(copy_folio_from_iter_atomic);
501 
502 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
503 {
504 	const struct bio_vec *bvec, *end;
505 
506 	if (!i->count)
507 		return;
508 	i->count -= size;
509 
510 	size += i->iov_offset;
511 
512 	for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
513 		if (likely(size < bvec->bv_len))
514 			break;
515 		size -= bvec->bv_len;
516 	}
517 	i->iov_offset = size;
518 	i->nr_segs -= bvec - i->bvec;
519 	i->bvec = bvec;
520 }
521 
522 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
523 {
524 	const struct iovec *iov, *end;
525 
526 	if (!i->count)
527 		return;
528 	i->count -= size;
529 
530 	size += i->iov_offset; // from beginning of current segment
531 	for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
532 		if (likely(size < iov->iov_len))
533 			break;
534 		size -= iov->iov_len;
535 	}
536 	i->iov_offset = size;
537 	i->nr_segs -= iov - iter_iov(i);
538 	i->__iov = iov;
539 }
540 
541 static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
542 {
543 	const struct folio_queue *folioq = i->folioq;
544 	unsigned int slot = i->folioq_slot;
545 
546 	if (!i->count)
547 		return;
548 	i->count -= size;
549 
550 	if (slot >= folioq_nr_slots(folioq)) {
551 		folioq = folioq->next;
552 		slot = 0;
553 	}
554 
555 	size += i->iov_offset; /* From beginning of current segment. */
556 	do {
557 		size_t fsize = folioq_folio_size(folioq, slot);
558 
559 		if (likely(size < fsize))
560 			break;
561 		size -= fsize;
562 		slot++;
563 		if (slot >= folioq_nr_slots(folioq) && folioq->next) {
564 			folioq = folioq->next;
565 			slot = 0;
566 		}
567 	} while (size);
568 
569 	i->iov_offset = size;
570 	i->folioq_slot = slot;
571 	i->folioq = folioq;
572 }
573 
574 void iov_iter_advance(struct iov_iter *i, size_t size)
575 {
576 	if (unlikely(i->count < size))
577 		size = i->count;
578 	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
579 		i->iov_offset += size;
580 		i->count -= size;
581 	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
582 		/* iovec and kvec have identical layouts */
583 		iov_iter_iovec_advance(i, size);
584 	} else if (iov_iter_is_bvec(i)) {
585 		iov_iter_bvec_advance(i, size);
586 	} else if (iov_iter_is_folioq(i)) {
587 		iov_iter_folioq_advance(i, size);
588 	} else if (iov_iter_is_discard(i)) {
589 		i->count -= size;
590 	}
591 }
592 EXPORT_SYMBOL(iov_iter_advance);
593 
594 static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
595 {
596 	const struct folio_queue *folioq = i->folioq;
597 	unsigned int slot = i->folioq_slot;
598 
599 	for (;;) {
600 		size_t fsize;
601 
602 		if (slot == 0) {
603 			folioq = folioq->prev;
604 			slot = folioq_nr_slots(folioq);
605 		}
606 		slot--;
607 
608 		fsize = folioq_folio_size(folioq, slot);
609 		if (unroll <= fsize) {
610 			i->iov_offset = fsize - unroll;
611 			break;
612 		}
613 		unroll -= fsize;
614 	}
615 
616 	i->folioq_slot = slot;
617 	i->folioq = folioq;
618 }
619 
620 void iov_iter_revert(struct iov_iter *i, size_t unroll)
621 {
622 	if (!unroll)
623 		return;
624 	if (WARN_ON(unroll > MAX_RW_COUNT))
625 		return;
626 	i->count += unroll;
627 	if (unlikely(iov_iter_is_discard(i)))
628 		return;
629 	if (unroll <= i->iov_offset) {
630 		i->iov_offset -= unroll;
631 		return;
632 	}
633 	unroll -= i->iov_offset;
634 	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
635 		BUG(); /* We should never go beyond the start of the specified
636 			* range since we might then be straying into pages that
637 			* aren't pinned.
638 			*/
639 	} else if (iov_iter_is_bvec(i)) {
640 		const struct bio_vec *bvec = i->bvec;
641 		while (1) {
642 			size_t n = (--bvec)->bv_len;
643 			i->nr_segs++;
644 			if (unroll <= n) {
645 				i->bvec = bvec;
646 				i->iov_offset = n - unroll;
647 				return;
648 			}
649 			unroll -= n;
650 		}
651 	} else if (iov_iter_is_folioq(i)) {
652 		i->iov_offset = 0;
653 		iov_iter_folioq_revert(i, unroll);
654 	} else { /* same logic for iovec and kvec */
655 		const struct iovec *iov = iter_iov(i);
656 		while (1) {
657 			size_t n = (--iov)->iov_len;
658 			i->nr_segs++;
659 			if (unroll <= n) {
660 				i->__iov = iov;
661 				i->iov_offset = n - unroll;
662 				return;
663 			}
664 			unroll -= n;
665 		}
666 	}
667 }
668 EXPORT_SYMBOL(iov_iter_revert);
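
/*
 * Editorial usage sketch (not part of the upstream file): an all-or-nothing
 * copy that rolls the iterator back with iov_iter_revert() when only part of
 * the data could be transferred.  The helper name example_copy_all_or_revert()
 * is an assumption.
 */
static __maybe_unused bool example_copy_all_or_revert(void *dst, size_t len,
						      struct iov_iter *from)
{
	size_t copied = copy_from_iter(dst, len, from);

	if (copied == len)
		return true;
	/* Undo the partial advance so the caller sees an untouched iterator. */
	iov_iter_revert(from, copied);
	return false;
}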
669 
670 /*
671  * Return the count of just the current iov_iter segment.
672  */
673 size_t iov_iter_single_seg_count(const struct iov_iter *i)
674 {
675 	if (i->nr_segs > 1) {
676 		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
677 			return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
678 		if (iov_iter_is_bvec(i))
679 			return min(i->count, i->bvec->bv_len - i->iov_offset);
680 	}
681 	if (unlikely(iov_iter_is_folioq(i)))
682 		return !i->count ? 0 :
683 			umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
684 	return i->count;
685 }
686 EXPORT_SYMBOL(iov_iter_single_seg_count);
687 
688 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
689 			const struct kvec *kvec, unsigned long nr_segs,
690 			size_t count)
691 {
692 	WARN_ON(direction & ~(READ | WRITE));
693 	*i = (struct iov_iter){
694 		.iter_type = ITER_KVEC,
695 		.data_source = direction,
696 		.kvec = kvec,
697 		.nr_segs = nr_segs,
698 		.iov_offset = 0,
699 		.count = count
700 	};
701 }
702 EXPORT_SYMBOL(iov_iter_kvec);
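
/*
 * Editorial usage sketch (not part of the upstream file): wrap a kernel
 * buffer in a single-segment ITER_KVEC so it can feed any iov_iter-based
 * consumer.  The helper name example_kvec_source() is an assumption.
 */
static __maybe_unused size_t example_kvec_source(void *dst, void *src, size_t len)
{
	struct kvec vec = { .iov_base = src, .iov_len = len };
	struct iov_iter iter;

	/* ITER_SOURCE: data will be read *from* this iterator. */
	iov_iter_kvec(&iter, ITER_SOURCE, &vec, 1, len);
	return copy_from_iter(dst, len, &iter);
}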
703 
704 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
705 			const struct bio_vec *bvec, unsigned long nr_segs,
706 			size_t count)
707 {
708 	WARN_ON(direction & ~(READ | WRITE));
709 	*i = (struct iov_iter){
710 		.iter_type = ITER_BVEC,
711 		.data_source = direction,
712 		.bvec = bvec,
713 		.nr_segs = nr_segs,
714 		.iov_offset = 0,
715 		.count = count
716 	};
717 }
718 EXPORT_SYMBOL(iov_iter_bvec);
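
/*
 * Editorial usage sketch (not part of the upstream file): describe part of a
 * page with a single bio_vec and copy it out through an ITER_BVEC.  The
 * bio_vec must stay alive for as long as the iterator is in use.  The helper
 * name example_copy_from_page() is an assumption.
 */
static __maybe_unused size_t example_copy_from_page(void *dst, struct page *page,
						    unsigned int offset,
						    unsigned int len)
{
	struct bio_vec bv = {
		.bv_page	= page,
		.bv_offset	= offset,
		.bv_len		= len,
	};
	struct iov_iter iter;

	iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, len);
	return copy_from_iter(dst, len, &iter);
}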
719 
720 /**
721  * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
722  * @i: The iterator to initialise.
723  * @direction: The direction of the transfer.
724  * @folioq: The starting point in the folio queue.
725  * @first_slot: The first slot in the folio queue to use
726  * @offset: The offset into the folio in the first slot to start at
727  * @count: The size of the I/O buffer in bytes.
728  *
729  * Set up an I/O iterator to either draw data out of the pages attached to an
730  * inode or to inject data into those pages.  The pages *must* be prevented
731  * from evaporation, either by the caller taking a ref on them or by
732  * locking them.
733  */
734 void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
735 			  const struct folio_queue *folioq, unsigned int first_slot,
736 			  unsigned int offset, size_t count)
737 {
738 	BUG_ON(direction & ~1);
739 	*i = (struct iov_iter) {
740 		.iter_type = ITER_FOLIOQ,
741 		.data_source = direction,
742 		.folioq = folioq,
743 		.folioq_slot = first_slot,
744 		.count = count,
745 		.iov_offset = offset,
746 	};
747 }
748 EXPORT_SYMBOL(iov_iter_folio_queue);
749 
750 /**
751  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
752  * @i: The iterator to initialise.
753  * @direction: The direction of the transfer.
754  * @xarray: The xarray to access.
755  * @start: The start file position.
756  * @count: The size of the I/O buffer in bytes.
757  *
758  * Set up an I/O iterator to either draw data out of the pages attached to an
759  * inode or to inject data into those pages.  The pages *must* be prevented
760  * from evaporation, either by the caller taking a ref on them or by
761  * locking them.
762  */
763 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
764 		     struct xarray *xarray, loff_t start, size_t count)
765 {
766 	BUG_ON(direction & ~1);
767 	*i = (struct iov_iter) {
768 		.iter_type = ITER_XARRAY,
769 		.data_source = direction,
770 		.xarray = xarray,
771 		.xarray_start = start,
772 		.count = count,
773 		.iov_offset = 0
774 	};
775 }
776 EXPORT_SYMBOL(iov_iter_xarray);
777 
778 /**
779  * iov_iter_discard - Initialise an I/O iterator that discards data
780  * @i: The iterator to initialise.
781  * @direction: The direction of the transfer.
782  * @count: The size of the I/O buffer in bytes.
783  *
784  * Set up an I/O iterator that just discards everything that's written to it.
785  * It's only available as a READ iterator.
786  */
787 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
788 {
789 	BUG_ON(direction != READ);
790 	*i = (struct iov_iter){
791 		.iter_type = ITER_DISCARD,
792 		.data_source = false,
793 		.count = count,
794 		.iov_offset = 0
795 	};
796 }
797 EXPORT_SYMBOL(iov_iter_discard);
798 
799 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
800 {
801 	const struct iovec *iov = iter_iov(i);
802 	unsigned long res = 0;
803 	size_t size = i->count;
804 	size_t skip = i->iov_offset;
805 
806 	do {
807 		size_t len = iov->iov_len - skip;
808 		if (len) {
809 			res |= (unsigned long)iov->iov_base + skip;
810 			if (len > size)
811 				len = size;
812 			res |= len;
813 			size -= len;
814 		}
815 		iov++;
816 		skip = 0;
817 	} while (size);
818 	return res;
819 }
820 
821 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
822 {
823 	const struct bio_vec *bvec = i->bvec;
824 	unsigned res = 0;
825 	size_t size = i->count;
826 	unsigned skip = i->iov_offset;
827 
828 	do {
829 		size_t len = bvec->bv_len - skip;
830 		res |= (unsigned long)bvec->bv_offset + skip;
831 		if (len > size)
832 			len = size;
833 		res |= len;
834 		bvec++;
835 		size -= len;
836 		skip = 0;
837 	} while (size);
838 
839 	return res;
840 }
841 
842 unsigned long iov_iter_alignment(const struct iov_iter *i)
843 {
844 	if (likely(iter_is_ubuf(i))) {
845 		size_t size = i->count;
846 		if (size)
847 			return ((unsigned long)i->ubuf + i->iov_offset) | size;
848 		return 0;
849 	}
850 
851 	/* iovec and kvec have identical layouts */
852 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
853 		return iov_iter_alignment_iovec(i);
854 
855 	if (iov_iter_is_bvec(i))
856 		return iov_iter_alignment_bvec(i);
857 
858 	/* With both xarray and folioq types, we're dealing with whole folios. */
859 	if (iov_iter_is_folioq(i))
860 		return i->iov_offset | i->count;
861 	if (iov_iter_is_xarray(i))
862 		return (i->xarray_start + i->iov_offset) | i->count;
863 
864 	return 0;
865 }
866 EXPORT_SYMBOL(iov_iter_alignment);
867 
868 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
869 {
870 	unsigned long res = 0;
871 	unsigned long v = 0;
872 	size_t size = i->count;
873 	unsigned k;
874 
875 	if (iter_is_ubuf(i))
876 		return 0;
877 
878 	if (WARN_ON(!iter_is_iovec(i)))
879 		return ~0U;
880 
881 	for (k = 0; k < i->nr_segs; k++) {
882 		const struct iovec *iov = iter_iov(i) + k;
883 		if (iov->iov_len) {
884 			unsigned long base = (unsigned long)iov->iov_base;
885 			if (v) // if not the first one
886 				res |= base | v; // this start | previous end
887 			v = base + iov->iov_len;
888 			if (size <= iov->iov_len)
889 				break;
890 			size -= iov->iov_len;
891 		}
892 	}
893 	return res;
894 }
895 EXPORT_SYMBOL(iov_iter_gap_alignment);
896 
897 static int want_pages_array(struct page ***res, size_t size,
898 			    size_t start, unsigned int maxpages)
899 {
900 	unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE);
901 
902 	if (count > maxpages)
903 		count = maxpages;
904 	WARN_ON(!count);	// caller should've prevented that
905 	if (!*res) {
906 		*res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
907 		if (!*res)
908 			return 0;
909 	}
910 	return count;
911 }
912 
913 static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
914 				     struct page ***ppages, size_t maxsize,
915 				     unsigned maxpages, size_t *_start_offset)
916 {
917 	const struct folio_queue *folioq = iter->folioq;
918 	struct page **pages;
919 	unsigned int slot = iter->folioq_slot;
920 	size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset;
921 
922 	if (slot >= folioq_nr_slots(folioq)) {
923 		folioq = folioq->next;
924 		slot = 0;
925 		if (WARN_ON(iov_offset != 0))
926 			return -EIO;
927 	}
928 
929 	maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages);
930 	if (!maxpages)
931 		return -ENOMEM;
932 	*_start_offset = iov_offset & ~PAGE_MASK;
933 	pages = *ppages;
934 
935 	for (;;) {
936 		struct folio *folio = folioq_folio(folioq, slot);
937 		size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot);
938 		size_t part = PAGE_SIZE - offset % PAGE_SIZE;
939 
940 		if (offset < fsize) {
941 			part = umin(part, umin(maxsize - extracted, fsize - offset));
942 			count -= part;
943 			iov_offset += part;
944 			extracted += part;
945 
946 			*pages = folio_page(folio, offset / PAGE_SIZE);
947 			get_page(*pages);
948 			pages++;
949 			maxpages--;
950 		}
951 
952 		if (maxpages == 0 || extracted >= maxsize)
953 			break;
954 
955 		if (iov_offset >= fsize) {
956 			iov_offset = 0;
957 			slot++;
958 			if (slot == folioq_nr_slots(folioq) && folioq->next) {
959 				folioq = folioq->next;
960 				slot = 0;
961 			}
962 		}
963 	}
964 
965 	iter->count = count;
966 	iter->iov_offset = iov_offset;
967 	iter->folioq = folioq;
968 	iter->folioq_slot = slot;
969 	return extracted;
970 }
971 
972 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
973 					  pgoff_t index, unsigned int nr_pages)
974 {
975 	XA_STATE(xas, xa, index);
976 	struct folio *folio;
977 	unsigned int ret = 0;
978 
979 	rcu_read_lock();
980 	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
981 		if (xas_retry(&xas, folio))
982 			continue;
983 
984 		/* Has the folio moved or been split? */
985 		if (unlikely(folio != xas_reload(&xas))) {
986 			xas_reset(&xas);
987 			continue;
988 		}
989 
990 		pages[ret] = folio_file_page(folio, xas.xa_index);
991 		folio_get(folio);
992 		if (++ret == nr_pages)
993 			break;
994 	}
995 	rcu_read_unlock();
996 	return ret;
997 }
998 
999 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1000 				     struct page ***pages, size_t maxsize,
1001 				     unsigned maxpages, size_t *_start_offset)
1002 {
1003 	unsigned nr, offset, count;
1004 	pgoff_t index;
1005 	loff_t pos;
1006 
1007 	pos = i->xarray_start + i->iov_offset;
1008 	index = pos >> PAGE_SHIFT;
1009 	offset = pos & ~PAGE_MASK;
1010 	*_start_offset = offset;
1011 
1012 	count = want_pages_array(pages, maxsize, offset, maxpages);
1013 	if (!count)
1014 		return -ENOMEM;
1015 	nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
1016 	if (nr == 0)
1017 		return 0;
1018 
1019 	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1020 	i->iov_offset += maxsize;
1021 	i->count -= maxsize;
1022 	return maxsize;
1023 }
1024 
1025 /* must be called on a non-empty ITER_UBUF or ITER_IOVEC iterator */
1026 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
1027 {
1028 	size_t skip;
1029 	long k;
1030 
1031 	if (iter_is_ubuf(i))
1032 		return (unsigned long)i->ubuf + i->iov_offset;
1033 
1034 	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1035 		const struct iovec *iov = iter_iov(i) + k;
1036 		size_t len = iov->iov_len - skip;
1037 
1038 		if (unlikely(!len))
1039 			continue;
1040 		if (*size > len)
1041 			*size = len;
1042 		return (unsigned long)iov->iov_base + skip;
1043 	}
1044 	BUG(); // if it had been empty, we wouldn't get called
1045 }
1046 
1047 /* must be called on a non-empty ITER_BVEC iterator */
1048 static struct page *first_bvec_segment(const struct iov_iter *i,
1049 				       size_t *size, size_t *start)
1050 {
1051 	struct page *page;
1052 	size_t skip = i->iov_offset, len;
1053 
1054 	len = i->bvec->bv_len - skip;
1055 	if (*size > len)
1056 		*size = len;
1057 	skip += i->bvec->bv_offset;
1058 	page = i->bvec->bv_page + skip / PAGE_SIZE;
1059 	*start = skip % PAGE_SIZE;
1060 	return page;
1061 }
1062 
1063 static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
1064 		   struct page ***pages, size_t maxsize,
1065 		   unsigned int maxpages, size_t *start)
1066 {
1067 	unsigned int n, gup_flags = 0;
1068 
1069 	if (maxsize > i->count)
1070 		maxsize = i->count;
1071 	if (!maxsize)
1072 		return 0;
1073 	if (maxsize > MAX_RW_COUNT)
1074 		maxsize = MAX_RW_COUNT;
1075 
1076 	if (likely(user_backed_iter(i))) {
1077 		unsigned long addr;
1078 		int res;
1079 
1080 		if (iov_iter_rw(i) != WRITE)
1081 			gup_flags |= FOLL_WRITE;
1082 		if (i->nofault)
1083 			gup_flags |= FOLL_NOFAULT;
1084 
1085 		addr = first_iovec_segment(i, &maxsize);
1086 		*start = addr % PAGE_SIZE;
1087 		addr &= PAGE_MASK;
1088 		n = want_pages_array(pages, maxsize, *start, maxpages);
1089 		if (!n)
1090 			return -ENOMEM;
1091 		res = get_user_pages_fast(addr, n, gup_flags, *pages);
1092 		if (unlikely(res <= 0))
1093 			return res;
1094 		maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
1095 		iov_iter_advance(i, maxsize);
1096 		return maxsize;
1097 	}
1098 	if (iov_iter_is_bvec(i)) {
1099 		struct page **p;
1100 		struct page *page;
1101 
1102 		page = first_bvec_segment(i, &maxsize, start);
1103 		n = want_pages_array(pages, maxsize, *start, maxpages);
1104 		if (!n)
1105 			return -ENOMEM;
1106 		p = *pages;
1107 		for (int k = 0; k < n; k++) {
1108 			struct folio *folio = page_folio(page + k);
1109 			p[k] = page + k;
1110 			if (!folio_test_slab(folio))
1111 				folio_get(folio);
1112 		}
1113 		maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
1114 		i->count -= maxsize;
1115 		i->iov_offset += maxsize;
1116 		if (i->iov_offset == i->bvec->bv_len) {
1117 			i->iov_offset = 0;
1118 			i->bvec++;
1119 			i->nr_segs--;
1120 		}
1121 		return maxsize;
1122 	}
1123 	if (iov_iter_is_folioq(i))
1124 		return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
1125 	if (iov_iter_is_xarray(i))
1126 		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1127 	return -EFAULT;
1128 }
1129 
1130 ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
1131 		size_t maxsize, unsigned maxpages, size_t *start)
1132 {
1133 	if (!maxpages)
1134 		return 0;
1135 	BUG_ON(!pages);
1136 
1137 	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
1138 }
1139 EXPORT_SYMBOL(iov_iter_get_pages2);
1140 
1141 ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
1142 		struct page ***pages, size_t maxsize, size_t *start)
1143 {
1144 	ssize_t len;
1145 
1146 	*pages = NULL;
1147 
1148 	len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
1149 	if (len <= 0) {
1150 		kvfree(*pages);
1151 		*pages = NULL;
1152 	}
1153 	return len;
1154 }
1155 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
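
/*
 * Editorial usage sketch (not part of the upstream file): grab referenced
 * pages covering the next chunk of an iterator, use them, then drop the page
 * references and the kvmalloc'ed page array.  The helper name
 * example_with_pages() is an assumption.
 */
static __maybe_unused ssize_t example_with_pages(struct iov_iter *iter, size_t maxsize)
{
	struct page **pages = NULL;
	size_t offset;
	ssize_t bytes;
	long k, npages;

	bytes = iov_iter_get_pages_alloc2(iter, &pages, maxsize, &offset);
	if (bytes <= 0)
		return bytes;

	/* ... do I/O against (pages, offset, bytes) here ... */

	npages = DIV_ROUND_UP(offset + bytes, PAGE_SIZE);
	for (k = 0; k < npages; k++)
		put_page(pages[k]);
	kvfree(pages);
	return bytes;
}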
1156 
1157 static int iov_npages(const struct iov_iter *i, int maxpages)
1158 {
1159 	size_t skip = i->iov_offset, size = i->count;
1160 	const struct iovec *p;
1161 	int npages = 0;
1162 
1163 	for (p = iter_iov(i); size; skip = 0, p++) {
1164 		unsigned offs = offset_in_page(p->iov_base + skip);
1165 		size_t len = min(p->iov_len - skip, size);
1166 
1167 		if (len) {
1168 			size -= len;
1169 			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1170 			if (unlikely(npages > maxpages))
1171 				return maxpages;
1172 		}
1173 	}
1174 	return npages;
1175 }
1176 
1177 static int bvec_npages(const struct iov_iter *i, int maxpages)
1178 {
1179 	size_t skip = i->iov_offset, size = i->count;
1180 	const struct bio_vec *p;
1181 	int npages = 0;
1182 
1183 	for (p = i->bvec; size; skip = 0, p++) {
1184 		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1185 		size_t len = min(p->bv_len - skip, size);
1186 
1187 		size -= len;
1188 		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1189 		if (unlikely(npages > maxpages))
1190 			return maxpages;
1191 	}
1192 	return npages;
1193 }
1194 
1195 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1196 {
1197 	if (unlikely(!i->count))
1198 		return 0;
1199 	if (likely(iter_is_ubuf(i))) {
1200 		unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
1201 		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
1202 		return min(npages, maxpages);
1203 	}
1204 	/* iovec and kvec have identical layouts */
1205 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1206 		return iov_npages(i, maxpages);
1207 	if (iov_iter_is_bvec(i))
1208 		return bvec_npages(i, maxpages);
1209 	if (iov_iter_is_folioq(i)) {
1210 		unsigned offset = i->iov_offset % PAGE_SIZE;
1211 		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1212 		return min(npages, maxpages);
1213 	}
1214 	if (iov_iter_is_xarray(i)) {
1215 		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1216 		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1217 		return min(npages, maxpages);
1218 	}
1219 	return 0;
1220 }
1221 EXPORT_SYMBOL(iov_iter_npages);
1222 
1223 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1224 {
1225 	*new = *old;
1226 	if (iov_iter_is_bvec(new))
1227 		return new->bvec = kmemdup(new->bvec,
1228 				    new->nr_segs * sizeof(struct bio_vec),
1229 				    flags);
1230 	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
1231 		/* iovec and kvec have identical layout */
1232 		return new->__iov = kmemdup(new->__iov,
1233 				   new->nr_segs * sizeof(struct iovec),
1234 				   flags);
1235 	return NULL;
1236 }
1237 EXPORT_SYMBOL(dup_iter);
1238 
1239 static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
1240 		const struct iovec __user *uvec, u32 nr_segs)
1241 {
1242 	const struct compat_iovec __user *uiov =
1243 		(const struct compat_iovec __user *)uvec;
1244 	int ret = -EFAULT;
1245 	u32 i;
1246 
1247 	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1248 		return -EFAULT;
1249 
1250 	for (i = 0; i < nr_segs; i++) {
1251 		compat_uptr_t buf;
1252 		compat_ssize_t len;
1253 
1254 		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1255 		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1256 
1257 		/* check for compat_size_t not fitting in compat_ssize_t .. */
1258 		if (len < 0) {
1259 			ret = -EINVAL;
1260 			goto uaccess_end;
1261 		}
1262 		iov[i].iov_base = compat_ptr(buf);
1263 		iov[i].iov_len = len;
1264 	}
1265 
1266 	ret = 0;
1267 uaccess_end:
1268 	user_access_end();
1269 	return ret;
1270 }
1271 
1272 static __noclone int copy_iovec_from_user(struct iovec *iov,
1273 		const struct iovec __user *uiov, unsigned long nr_segs)
1274 {
1275 	int ret = -EFAULT;
1276 
1277 	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1278 		return -EFAULT;
1279 
1280 	do {
1281 		void __user *buf;
1282 		ssize_t len;
1283 
1284 		unsafe_get_user(len, &uiov->iov_len, uaccess_end);
1285 		unsafe_get_user(buf, &uiov->iov_base, uaccess_end);
1286 
1287 		/* check for size_t not fitting in ssize_t .. */
1288 		if (unlikely(len < 0)) {
1289 			ret = -EINVAL;
1290 			goto uaccess_end;
1291 		}
1292 		iov->iov_base = buf;
1293 		iov->iov_len = len;
1294 
1295 		uiov++; iov++;
1296 	} while (--nr_segs);
1297 
1298 	ret = 0;
1299 uaccess_end:
1300 	user_access_end();
1301 	return ret;
1302 }
1303 
1304 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1305 		unsigned long nr_segs, unsigned long fast_segs,
1306 		struct iovec *fast_iov, bool compat)
1307 {
1308 	struct iovec *iov = fast_iov;
1309 	int ret;
1310 
1311 	/*
1312 	 * SuS says "The readv() function *may* fail if the iovcnt argument was
1313 	 * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1314 	 * traditionally returned zero for zero segments, so...
1315 	 */
1316 	if (nr_segs == 0)
1317 		return iov;
1318 	if (nr_segs > UIO_MAXIOV)
1319 		return ERR_PTR(-EINVAL);
1320 	if (nr_segs > fast_segs) {
1321 		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1322 		if (!iov)
1323 			return ERR_PTR(-ENOMEM);
1324 	}
1325 
1326 	if (unlikely(compat))
1327 		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1328 	else
1329 		ret = copy_iovec_from_user(iov, uvec, nr_segs);
1330 	if (ret) {
1331 		if (iov != fast_iov)
1332 			kfree(iov);
1333 		return ERR_PTR(ret);
1334 	}
1335 
1336 	return iov;
1337 }
1338 
1339 /*
1340  * Single segment iovec supplied by the user, import it as ITER_UBUF.
1341  */
1342 static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
1343 				   struct iovec **iovp, struct iov_iter *i,
1344 				   bool compat)
1345 {
1346 	struct iovec *iov = *iovp;
1347 	ssize_t ret;
1348 
1349 	*iovp = NULL;
1350 
1351 	if (compat)
1352 		ret = copy_compat_iovec_from_user(iov, uvec, 1);
1353 	else
1354 		ret = copy_iovec_from_user(iov, uvec, 1);
1355 	if (unlikely(ret))
1356 		return ret;
1357 
1358 	ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
1359 	if (unlikely(ret))
1360 		return ret;
1361 	return i->count;
1362 }
1363 
1364 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1365 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1366 		 struct iov_iter *i, bool compat)
1367 {
1368 	ssize_t total_len = 0;
1369 	unsigned long seg;
1370 	struct iovec *iov;
1371 
1372 	if (nr_segs == 1)
1373 		return __import_iovec_ubuf(type, uvec, iovp, i, compat);
1374 
1375 	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1376 	if (IS_ERR(iov)) {
1377 		*iovp = NULL;
1378 		return PTR_ERR(iov);
1379 	}
1380 
1381 	/*
1382 	 * According to the Single Unix Specification we should return EINVAL if
1383 	 * an element length is < 0 when cast to ssize_t or if the total length
1384 	 * would overflow the ssize_t return value of the system call.
1385 	 *
1386 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1387 	 * overflow case.
1388 	 */
1389 	for (seg = 0; seg < nr_segs; seg++) {
1390 		ssize_t len = (ssize_t)iov[seg].iov_len;
1391 
1392 		if (!access_ok(iov[seg].iov_base, len)) {
1393 			if (iov != *iovp)
1394 				kfree(iov);
1395 			*iovp = NULL;
1396 			return -EFAULT;
1397 		}
1398 
1399 		if (len > MAX_RW_COUNT - total_len) {
1400 			len = MAX_RW_COUNT - total_len;
1401 			iov[seg].iov_len = len;
1402 		}
1403 		total_len += len;
1404 	}
1405 
1406 	iov_iter_init(i, type, iov, nr_segs, total_len);
1407 	if (iov == *iovp)
1408 		*iovp = NULL;
1409 	else
1410 		*iovp = iov;
1411 	return total_len;
1412 }
1413 
1414 /**
1415  * import_iovec() - Copy an array of &struct iovec from userspace
1416  *     into the kernel, check that it is valid, and initialize a new
1417  *     &struct iov_iter iterator to access it.
1418  *
1419  * @type: One of %READ or %WRITE.
1420  * @uvec: Pointer to the userspace array.
1421  * @nr_segs: Number of elements in userspace array.
1422  * @fast_segs: Number of elements in @iovp.
1423  * @iovp: (input and output parameter) Pointer to pointer to (usually small
1424  *     on-stack) kernel array.
1425  * @i: Pointer to iterator that will be initialized on success.
1426  *
1427  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1428  * then this function places %NULL in *@iovp on return. Otherwise, a new
1429  * array will be allocated and the result placed in *@iovp. This means that
1430  * the caller may call kfree() on *@iovp regardless of whether the small
1431  * on-stack array was used or not (and regardless of whether this function
1432  * returns an error or not).
1433  *
1434  * Return: Negative error code on error, bytes imported on success
1435  */
1436 ssize_t import_iovec(int type, const struct iovec __user *uvec,
1437 		 unsigned nr_segs, unsigned fast_segs,
1438 		 struct iovec **iovp, struct iov_iter *i)
1439 {
1440 	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1441 			      in_compat_syscall());
1442 }
1443 EXPORT_SYMBOL(import_iovec);
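
/*
 * Editorial usage sketch (not part of the upstream file): the usual
 * readv(2)-style pattern.  *iovp is set to NULL when the on-stack array was
 * used, so kfree() at the end is always safe.  The helper name
 * example_readv_setup() is an assumption.
 */
static __maybe_unused ssize_t example_readv_setup(const struct iovec __user *uvec,
						  unsigned int nr_segs)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(ITER_DEST, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		return ret;

	/* ... hand &iter to ->read_iter() or copy_to_iter() ... */

	kfree(iov);
	return ret;
}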
1444 
1445 int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
1446 {
1447 	if (len > MAX_RW_COUNT)
1448 		len = MAX_RW_COUNT;
1449 	if (unlikely(!access_ok(buf, len)))
1450 		return -EFAULT;
1451 
1452 	iov_iter_ubuf(i, rw, buf, len);
1453 	return 0;
1454 }
1455 EXPORT_SYMBOL_GPL(import_ubuf);
1456 
1457 /**
1458  * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1459  *     iov_iter_save_state() was called.
1460  *
1461  * @i: &struct iov_iter to restore
1462  * @state: state to restore from
1463  *
1464  * Used after iov_iter_save_state() to restore @i to the saved state, if
1465  * operations may have advanced it.
1466  *
1467  * Note: only works on ITER_IOVEC, ITER_UBUF, ITER_BVEC, and ITER_KVEC
1468  */
1469 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
1470 {
1471 	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
1472 			 !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
1473 		return;
1474 	i->iov_offset = state->iov_offset;
1475 	i->count = state->count;
1476 	if (iter_is_ubuf(i))
1477 		return;
1478 	/*
1479 	 * For the *vec iters, nr_segs + iov is constant - if we increment
1480 	 * the vec, then we also decrement the nr_segs count. Hence we don't
1481 	 * need to track both of these, just one is enough and we can deduce
1482 	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
1483 	 * size, so we can just increment the iov pointer as they are unionized.
1484 	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
1485 	 * not. Be safe and handle it separately.
1486 	 */
1487 	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
1488 	if (iov_iter_is_bvec(i))
1489 		i->bvec -= state->nr_segs - i->nr_segs;
1490 	else
1491 		i->__iov -= state->nr_segs - i->nr_segs;
1492 	i->nr_segs = state->nr_segs;
1493 }
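
/*
 * Editorial usage sketch (not part of the upstream file): snapshot the
 * iterator with iov_iter_save_state() and roll it back on failure.  The
 * helper name example_copy_with_rollback() is an assumption.
 */
static __maybe_unused size_t example_copy_with_rollback(void *dst, size_t len,
							struct iov_iter *from)
{
	struct iov_iter_state state;
	size_t copied;

	iov_iter_save_state(from, &state);
	copied = copy_from_iter(dst, len, from);
	if (copied != len) {
		iov_iter_restore(from, &state);
		return 0;
	}
	return copied;
}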
1494 
1495 /*
1496  * Extract a list of contiguous pages from an ITER_FOLIOQ iterator.  This does
1497  * not get references on the pages, nor does it get a pin on them.
1498  */
1499 static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
1500 					     struct page ***pages, size_t maxsize,
1501 					     unsigned int maxpages,
1502 					     iov_iter_extraction_t extraction_flags,
1503 					     size_t *offset0)
1504 {
1505 	const struct folio_queue *folioq = i->folioq;
1506 	struct page **p;
1507 	unsigned int nr = 0;
1508 	size_t extracted = 0, offset, slot = i->folioq_slot;
1509 
1510 	if (slot >= folioq_nr_slots(folioq)) {
1511 		folioq = folioq->next;
1512 		slot = 0;
1513 		if (WARN_ON(i->iov_offset != 0))
1514 			return -EIO;
1515 	}
1516 
1517 	offset = i->iov_offset & ~PAGE_MASK;
1518 	*offset0 = offset;
1519 
1520 	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1521 	if (!maxpages)
1522 		return -ENOMEM;
1523 	p = *pages;
1524 
1525 	for (;;) {
1526 		struct folio *folio = folioq_folio(folioq, slot);
1527 		size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
1528 		size_t part = PAGE_SIZE - offset % PAGE_SIZE;
1529 
1530 		if (offset < fsize) {
1531 			part = umin(part, umin(maxsize - extracted, fsize - offset));
1532 			i->count -= part;
1533 			i->iov_offset += part;
1534 			extracted += part;
1535 
1536 			p[nr++] = folio_page(folio, offset / PAGE_SIZE);
1537 		}
1538 
1539 		if (nr >= maxpages || extracted >= maxsize)
1540 			break;
1541 
1542 		if (i->iov_offset >= fsize) {
1543 			i->iov_offset = 0;
1544 			slot++;
1545 			if (slot == folioq_nr_slots(folioq) && folioq->next) {
1546 				folioq = folioq->next;
1547 				slot = 0;
1548 			}
1549 		}
1550 	}
1551 
1552 	i->folioq = folioq;
1553 	i->folioq_slot = slot;
1554 	return extracted;
1555 }
1556 
1557 /*
1558  * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
1559  * get references on the pages, nor does it get a pin on them.
1560  */
1561 static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
1562 					     struct page ***pages, size_t maxsize,
1563 					     unsigned int maxpages,
1564 					     iov_iter_extraction_t extraction_flags,
1565 					     size_t *offset0)
1566 {
1567 	struct page **p;
1568 	struct folio *folio;
1569 	unsigned int nr = 0, offset;
1570 	loff_t pos = i->xarray_start + i->iov_offset;
1571 	XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT);
1572 
1573 	offset = pos & ~PAGE_MASK;
1574 	*offset0 = offset;
1575 
1576 	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1577 	if (!maxpages)
1578 		return -ENOMEM;
1579 	p = *pages;
1580 
1581 	rcu_read_lock();
1582 	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
1583 		if (xas_retry(&xas, folio))
1584 			continue;
1585 
1586 		/* Has the folio moved or been split? */
1587 		if (unlikely(folio != xas_reload(&xas))) {
1588 			xas_reset(&xas);
1589 			continue;
1590 		}
1591 
1592 		p[nr++] = folio_file_page(folio, xas.xa_index);
1593 		if (nr == maxpages)
1594 			break;
1595 	}
1596 	rcu_read_unlock();
1597 
1598 	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1599 	iov_iter_advance(i, maxsize);
1600 	return maxsize;
1601 }
1602 
1603 /*
1604  * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
1605  * This does not get references on the pages, nor does it get a pin on them.
1606  */
1607 static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
1608 					   struct page ***pages, size_t maxsize,
1609 					   unsigned int maxpages,
1610 					   iov_iter_extraction_t extraction_flags,
1611 					   size_t *offset0)
1612 {
1613 	size_t skip = i->iov_offset, size = 0;
1614 	struct bvec_iter bi;
1615 	int k = 0;
1616 
1617 	if (i->nr_segs == 0)
1618 		return 0;
1619 
1620 	if (i->iov_offset == i->bvec->bv_len) {
1621 		i->iov_offset = 0;
1622 		i->nr_segs--;
1623 		i->bvec++;
1624 		skip = 0;
1625 	}
1626 	bi.bi_idx = 0;
1627 	bi.bi_size = maxsize;
1628 	bi.bi_bvec_done = skip;
1629 
1630 	maxpages = want_pages_array(pages, maxsize, skip, maxpages);
1631 
1632 	while (bi.bi_size && bi.bi_idx < i->nr_segs) {
1633 		struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);
1634 
1635 		/*
1636 		 * The iov_iter_extract_pages interface only allows an offset
1637 		 * into the first page.  Break out of the loop if we see an
1638 		 * offset into subsequent pages, the caller will have to call
1639 		 * iov_iter_extract_pages again for the remainder.
1640 		 */
1641 		if (k) {
1642 			if (bv.bv_offset)
1643 				break;
1644 		} else {
1645 			*offset0 = bv.bv_offset;
1646 		}
1647 
1648 		(*pages)[k++] = bv.bv_page;
1649 		size += bv.bv_len;
1650 
1651 		if (k >= maxpages)
1652 			break;
1653 
1654 		/*
1655 		 * We are done when the end of the bvec doesn't align to a page
1656 		 * boundary as that would create a hole in the returned space.
1657 		 * The caller will handle this with another call to
1658 		 * iov_iter_extract_pages.
1659 		 */
1660 		if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
1661 			break;
1662 
1663 		bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
1664 	}
1665 
1666 	iov_iter_advance(i, size);
1667 	return size;
1668 }
1669 
1670 /*
1671  * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
1672  * This does not get references on the pages, nor does it get a pin on them.
1673  */
1674 static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
1675 					   struct page ***pages, size_t maxsize,
1676 					   unsigned int maxpages,
1677 					   iov_iter_extraction_t extraction_flags,
1678 					   size_t *offset0)
1679 {
1680 	struct page **p, *page;
1681 	const void *kaddr;
1682 	size_t skip = i->iov_offset, offset, len, size;
1683 	int k;
1684 
1685 	for (;;) {
1686 		if (i->nr_segs == 0)
1687 			return 0;
1688 		size = min(maxsize, i->kvec->iov_len - skip);
1689 		if (size)
1690 			break;
1691 		i->iov_offset = 0;
1692 		i->nr_segs--;
1693 		i->kvec++;
1694 		skip = 0;
1695 	}
1696 
1697 	kaddr = i->kvec->iov_base + skip;
1698 	offset = (unsigned long)kaddr & ~PAGE_MASK;
1699 	*offset0 = offset;
1700 
1701 	maxpages = want_pages_array(pages, size, offset, maxpages);
1702 	if (!maxpages)
1703 		return -ENOMEM;
1704 	p = *pages;
1705 
1706 	kaddr -= offset;
1707 	len = offset + size;
1708 	for (k = 0; k < maxpages; k++) {
1709 		size_t seg = min_t(size_t, len, PAGE_SIZE);
1710 
1711 		if (is_vmalloc_or_module_addr(kaddr))
1712 			page = vmalloc_to_page(kaddr);
1713 		else
1714 			page = virt_to_page(kaddr);
1715 
1716 		p[k] = page;
1717 		len -= seg;
1718 		kaddr += PAGE_SIZE;
1719 	}
1720 
1721 	size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
1722 	iov_iter_advance(i, size);
1723 	return size;
1724 }
1725 
1726 /*
1727  * Extract a list of contiguous pages from a user iterator and get a pin on
1728  * each of them.  This should only be used if the iterator is user-backed
1729  * (IOBUF/UBUF).
1730  *
1731  * It does not get refs on the pages, but the pages must be unpinned by the
1732  * caller once the transfer is complete.
1733  *
1734  * This is safe to use where background IO/DMA *is* going to be modifying
1735  * the buffer; using a pin rather than a ref forces fork() to give the
1736  * child a copy of the page.
1737  */
1738 static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
1739 					   struct page ***pages,
1740 					   size_t maxsize,
1741 					   unsigned int maxpages,
1742 					   iov_iter_extraction_t extraction_flags,
1743 					   size_t *offset0)
1744 {
1745 	unsigned long addr;
1746 	unsigned int gup_flags = 0;
1747 	size_t offset;
1748 	int res;
1749 
1750 	if (i->data_source == ITER_DEST)
1751 		gup_flags |= FOLL_WRITE;
1752 	if (extraction_flags & ITER_ALLOW_P2PDMA)
1753 		gup_flags |= FOLL_PCI_P2PDMA;
1754 	if (i->nofault)
1755 		gup_flags |= FOLL_NOFAULT;
1756 
1757 	addr = first_iovec_segment(i, &maxsize);
1758 	*offset0 = offset = addr % PAGE_SIZE;
1759 	addr &= PAGE_MASK;
1760 	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1761 	if (!maxpages)
1762 		return -ENOMEM;
1763 	res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
1764 	if (unlikely(res <= 0))
1765 		return res;
1766 	maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
1767 	iov_iter_advance(i, maxsize);
1768 	return maxsize;
1769 }
1770 
1771 /**
1772  * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
1773  * @i: The iterator to extract from
1774  * @pages: Where to return the list of pages
1775  * @maxsize: The maximum amount of iterator to extract
1776  * @maxpages: The maximum size of the list of pages
1777  * @extraction_flags: Flags to qualify request
1778  * @offset0: Where to return the starting offset into (*@pages)[0]
1779  *
1780  * Extract a list of contiguous pages from the current point of the iterator,
1781  * advancing the iterator.  The maximum number of pages and the maximum amount
1782  * of page contents can be set.
1783  *
1784  * If *@pages is NULL, a page list will be allocated to the required size and
1785  * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
1786  * that the caller allocated a page list at least @maxpages in size and this
1787  * will be filled in.
1788  *
1789  * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
1790  * be allowed on the pages extracted.
1791  *
1792  * The iov_iter_extract_will_pin() function can be used to query how cleanup
1793  * should be performed.
1794  *
1795  * Extra refs or pins on the pages may be obtained as follows:
1796  *
1797  *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
1798  *      added to the pages, but refs will not be taken.
1799  *      iov_iter_extract_will_pin() will return true.
1800  *
1801  *  (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
1802  *      pages are merely listed; no extra refs or pins are obtained.
1803  *      iov_iter_extract_will_pin() will return false.
1804  *
1805  * Note also:
1806  *
1807  *  (*) Use with ITER_DISCARD is not supported as that has no content.
1808  *
1809  * On success, the function sets *@pages to the new pagelist, if allocated, and
1810  * sets *offset0 to the offset into the first page.
1811  *
1812  * It may also return -ENOMEM or -EFAULT.
1813  */
1814 ssize_t iov_iter_extract_pages(struct iov_iter *i,
1815 			       struct page ***pages,
1816 			       size_t maxsize,
1817 			       unsigned int maxpages,
1818 			       iov_iter_extraction_t extraction_flags,
1819 			       size_t *offset0)
1820 {
1821 	maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
1822 	if (!maxsize)
1823 		return 0;
1824 
1825 	if (likely(user_backed_iter(i)))
1826 		return iov_iter_extract_user_pages(i, pages, maxsize,
1827 						   maxpages, extraction_flags,
1828 						   offset0);
1829 	if (iov_iter_is_kvec(i))
1830 		return iov_iter_extract_kvec_pages(i, pages, maxsize,
1831 						   maxpages, extraction_flags,
1832 						   offset0);
1833 	if (iov_iter_is_bvec(i))
1834 		return iov_iter_extract_bvec_pages(i, pages, maxsize,
1835 						   maxpages, extraction_flags,
1836 						   offset0);
1837 	if (iov_iter_is_folioq(i))
1838 		return iov_iter_extract_folioq_pages(i, pages, maxsize,
1839 						     maxpages, extraction_flags,
1840 						     offset0);
1841 	if (iov_iter_is_xarray(i))
1842 		return iov_iter_extract_xarray_pages(i, pages, maxsize,
1843 						     maxpages, extraction_flags,
1844 						     offset0);
1845 	return -EFAULT;
1846 }
1847 EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
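
/*
 * Editorial usage sketch (not part of the upstream file): extract pages for a
 * DMA transfer and clean up according to iov_iter_extract_will_pin().  The
 * helper name example_extract_for_dma() and the fixed maxpages value are
 * assumptions.
 */
static __maybe_unused ssize_t example_extract_for_dma(struct iov_iter *iter,
						      size_t maxsize)
{
	struct page **pages = NULL;
	size_t offset;
	ssize_t bytes;

	bytes = iov_iter_extract_pages(iter, &pages, maxsize, 256, 0, &offset);
	if (bytes <= 0)
		return bytes;

	/* ... map (pages, offset, bytes) for DMA and run the transfer ... */

	if (iov_iter_extract_will_pin(iter))
		unpin_user_pages(pages, DIV_ROUND_UP(offset + bytes, PAGE_SIZE));
	kvfree(pages);
	return bytes;
}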
1848