xref: /linux/lib/iov_iter.c (revision 5c71729ab92c7e710d48ed93043a2d1e35cc8d3c)
1  // SPDX-License-Identifier: GPL-2.0-only
2  #include <linux/export.h>
3  #include <linux/bvec.h>
4  #include <linux/fault-inject-usercopy.h>
5  #include <linux/uio.h>
6  #include <linux/pagemap.h>
7  #include <linux/highmem.h>
8  #include <linux/slab.h>
9  #include <linux/vmalloc.h>
10  #include <linux/splice.h>
11  #include <linux/compat.h>
12  #include <linux/scatterlist.h>
13  #include <linux/instrumented.h>
14  #include <linux/iov_iter.h>
15  
16  static __always_inline
17  size_t copy_to_user_iter(void __user *iter_to, size_t progress,
18  			 size_t len, void *from, void *priv2)
19  {
20  	if (should_fail_usercopy())
21  		return len;
22  	if (access_ok(iter_to, len)) {
23  		from += progress;
24  		instrument_copy_to_user(iter_to, from, len);
25  		len = raw_copy_to_user(iter_to, from, len);
26  	}
27  	return len;
28  }
29  
30  static __always_inline
31  size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
32  				 size_t len, void *from, void *priv2)
33  {
34  	ssize_t res;
35  
36  	if (should_fail_usercopy())
37  		return len;
38  
39  	from += progress;
40  	res = copy_to_user_nofault(iter_to, from, len);
41  	return res < 0 ? len : res;
42  }
43  
44  static __always_inline
45  size_t copy_from_user_iter(void __user *iter_from, size_t progress,
46  			   size_t len, void *to, void *priv2)
47  {
48  	size_t res = len;
49  
50  	if (should_fail_usercopy())
51  		return len;
52  	if (access_ok(iter_from, len)) {
53  		to += progress;
54  		instrument_copy_from_user_before(to, iter_from, len);
55  		res = raw_copy_from_user(to, iter_from, len);
56  		instrument_copy_from_user_after(to, iter_from, len, res);
57  	}
58  	return res;
59  }
60  
61  static __always_inline
62  size_t memcpy_to_iter(void *iter_to, size_t progress,
63  		      size_t len, void *from, void *priv2)
64  {
65  	memcpy(iter_to, from + progress, len);
66  	return 0;
67  }
68  
69  static __always_inline
70  size_t memcpy_from_iter(void *iter_from, size_t progress,
71  			size_t len, void *to, void *priv2)
72  {
73  	memcpy(to + progress, iter_from, len);
74  	return 0;
75  }
76  
77  /*
78   * fault_in_iov_iter_readable - fault in iov iterator for reading
79   * @i: iterator
80   * @size: maximum length
81   *
82   * Fault in one or more iovecs of the given iov_iter, to a maximum length of
83   * @size.  For each iovec, fault in each page that constitutes the iovec.
84   *
85   * Returns the number of bytes not faulted in (like copy_to_user() and
86   * copy_from_user()).
87   *
88   * Always returns 0 for non-userspace iterators.
89   */
90  size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
91  {
92  	if (iter_is_ubuf(i)) {
93  		size_t n = min(size, iov_iter_count(i));
94  		n -= fault_in_readable(i->ubuf + i->iov_offset, n);
95  		return size - n;
96  	} else if (iter_is_iovec(i)) {
97  		size_t count = min(size, iov_iter_count(i));
98  		const struct iovec *p;
99  		size_t skip;
100  
101  		size -= count;
102  		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
103  			size_t len = min(count, p->iov_len - skip);
104  			size_t ret;
105  
106  			if (unlikely(!len))
107  				continue;
108  			ret = fault_in_readable(p->iov_base + skip, len);
109  			count -= len - ret;
110  			if (ret)
111  				break;
112  		}
113  		return count + size;
114  	}
115  	return 0;
116  }
117  EXPORT_SYMBOL(fault_in_iov_iter_readable);
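
/*
 * Usage sketch (editorial addition, not in the original source): callers that
 * cannot service page faults during the actual copy (for instance because a
 * lock is held) typically prefault the user buffer first and bail out if
 * nothing at all could be faulted in:
 *
 *	if (unlikely(fault_in_iov_iter_readable(from, bytes) == bytes))
 *		return -EFAULT;
 *
 * A short copy later on is then handled by dropping the lock, prefaulting
 * again and retrying, as sketched after copy_page_from_iter_atomic() below.
 */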
118  
119  /*
120   * fault_in_iov_iter_writeable - fault in iov iterator for writing
121   * @i: iterator
122   * @size: maximum length
123   *
124   * Faults in the iterator using get_user_pages(), i.e., without triggering
125   * hardware page faults.  This is primarily useful when we already know that
126   * some or all of the pages in @i aren't in memory.
127   *
128   * Returns the number of bytes not faulted in, like copy_to_user() and
129   * copy_from_user().
130   *
131   * Always returns 0 for non-userspace iterators.
132   */
133  size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
134  {
135  	if (iter_is_ubuf(i)) {
136  		size_t n = min(size, iov_iter_count(i));
137  		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
138  		return size - n;
139  	} else if (iter_is_iovec(i)) {
140  		size_t count = min(size, iov_iter_count(i));
141  		const struct iovec *p;
142  		size_t skip;
143  
144  		size -= count;
145  		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
146  			size_t len = min(count, p->iov_len - skip);
147  			size_t ret;
148  
149  			if (unlikely(!len))
150  				continue;
151  			ret = fault_in_safe_writeable(p->iov_base + skip, len);
152  			count -= len - ret;
153  			if (ret)
154  				break;
155  		}
156  		return count + size;
157  	}
158  	return 0;
159  }
160  EXPORT_SYMBOL(fault_in_iov_iter_writeable);
161  
162  void iov_iter_init(struct iov_iter *i, unsigned int direction,
163  			const struct iovec *iov, unsigned long nr_segs,
164  			size_t count)
165  {
166  	WARN_ON(direction & ~(READ | WRITE));
167  	*i = (struct iov_iter) {
168  		.iter_type = ITER_IOVEC,
169  		.nofault = false,
170  		.data_source = direction,
171  		.__iov = iov,
172  		.nr_segs = nr_segs,
173  		.iov_offset = 0,
174  		.count = count
175  	};
176  }
177  EXPORT_SYMBOL(iov_iter_init);
178  
179  size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
180  {
181  	if (WARN_ON_ONCE(i->data_source))
182  		return 0;
183  	if (user_backed_iter(i))
184  		might_fault();
185  	return iterate_and_advance(i, bytes, (void *)addr,
186  				   copy_to_user_iter, memcpy_to_iter);
187  }
188  EXPORT_SYMBOL(_copy_to_iter);
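
/*
 * Usage sketch (editorial addition): a driver ->read_iter()-style path would
 * normally go through the copy_to_iter() wrapper from <linux/uio.h>; "msg"
 * and "msg_len" below are hypothetical:
 *
 *	size_t want = min(msg_len, iov_iter_count(to));
 *	size_t copied = copy_to_iter(msg, want, to);
 *
 *	if (copied != want)			// short copy: a user page faulted
 *		return copied ? copied : -EFAULT;
 */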
189  
190  #ifdef CONFIG_ARCH_HAS_COPY_MC
191  static __always_inline
192  size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
193  			    size_t len, void *from, void *priv2)
194  {
195  	if (access_ok(iter_to, len)) {
196  		from += progress;
197  		instrument_copy_to_user(iter_to, from, len);
198  		len = copy_mc_to_user(iter_to, from, len);
199  	}
200  	return len;
201  }
202  
203  static __always_inline
204  size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
205  			 size_t len, void *from, void *priv2)
206  {
207  	return copy_mc_to_kernel(iter_to, from + progress, len);
208  }
209  
210  /**
211   * _copy_mc_to_iter - copy to iter with source memory error exception handling
212   * @addr: source kernel address
213   * @bytes: total transfer length
214   * @i: destination iterator
215   *
216   * The pmem driver deploys this for the dax operation
217   * (dax_copy_to_iter()) for dax reads (bypassing the page cache and the
218   * block layer). Upon #MC, read(2) aborts and returns EIO or the number of
219   * bytes successfully copied.
220   *
221   * The main differences between this and a typical _copy_to_iter() are:
222   *
223   * * Typical tail/residue handling after a fault retries the copy
224   *   byte-by-byte until the fault happens again. Re-triggering machine
225   *   checks is potentially fatal so the implementation uses source
226   *   alignment and poison alignment assumptions to avoid re-triggering
227   *   hardware exceptions.
228   *
229   * * ITER_KVEC and ITER_BVEC can return short copies.  Compare to
230   *   copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
231   *
232   * Return: number of bytes copied (may be %0)
233   */
234  size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
235  {
236  	if (WARN_ON_ONCE(i->data_source))
237  		return 0;
238  	if (user_backed_iter(i))
239  		might_fault();
240  	return iterate_and_advance(i, bytes, (void *)addr,
241  				   copy_to_user_iter_mc, memcpy_to_iter_mc);
242  }
243  EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
244  #endif /* CONFIG_ARCH_HAS_COPY_MC */
245  
246  static __always_inline
247  size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
248  {
249  	return iterate_and_advance(i, bytes, addr,
250  				   copy_from_user_iter, memcpy_from_iter);
251  }
252  
253  size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
254  {
255  	if (WARN_ON_ONCE(!i->data_source))
256  		return 0;
257  
258  	if (user_backed_iter(i))
259  		might_fault();
260  	return __copy_from_iter(addr, bytes, i);
261  }
262  EXPORT_SYMBOL(_copy_from_iter);
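
/*
 * Usage sketch (editorial addition): the write-side counterpart normally uses
 * the copy_from_iter() wrapper; "kbuf" and "len" are hypothetical, and the
 * iterator is advanced by however much was actually copied:
 *
 *	if (copy_from_iter(kbuf, len, from) != len)
 *		return -EFAULT;			// tail of the user buffer faulted
 */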
263  
264  static __always_inline
265  size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
266  				   size_t len, void *to, void *priv2)
267  {
268  	return __copy_from_user_inatomic_nocache(to + progress, iter_from, len);
269  }
270  
271  size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
272  {
273  	if (WARN_ON_ONCE(!i->data_source))
274  		return 0;
275  
276  	return iterate_and_advance(i, bytes, addr,
277  				   copy_from_user_iter_nocache,
278  				   memcpy_from_iter);
279  }
280  EXPORT_SYMBOL(_copy_from_iter_nocache);
281  
282  #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
283  static __always_inline
284  size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
285  				      size_t len, void *to, void *priv2)
286  {
287  	return __copy_from_user_flushcache(to + progress, iter_from, len);
288  }
289  
290  static __always_inline
291  size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
292  				   size_t len, void *to, void *priv2)
293  {
294  	memcpy_flushcache(to + progress, iter_from, len);
295  	return 0;
296  }
297  
298  /**
299   * _copy_from_iter_flushcache - write destination through cpu cache
300   * @addr: destination kernel address
301   * @bytes: total transfer length
302   * @i: source iterator
303   *
304   * The pmem driver arranges for filesystem-dax to use this facility via
305   * dax_copy_from_iter() to ensure that writes to persistent memory
306   * are flushed through the CPU cache. It is differentiated from
307   * _copy_from_iter_nocache() in that it guarantees all data is flushed for
308   * all iterator types. _copy_from_iter_nocache() only attempts to
309   * bypass the cache for the ITER_IOVEC case, and on some archs may use
310   * instructions that strand dirty data in the cache.
311   *
312   * Return: number of bytes copied (may be %0)
313   */
314  size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
315  {
316  	if (WARN_ON_ONCE(!i->data_source))
317  		return 0;
318  
319  	return iterate_and_advance(i, bytes, addr,
320  				   copy_from_user_iter_flushcache,
321  				   memcpy_from_iter_flushcache);
322  }
323  EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
324  #endif
325  
326  static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
327  {
328  	struct page *head;
329  	size_t v = n + offset;
330  
331  	/*
332  	 * The general case needs to access the page order in order
333  	 * to compute the page size.
334  	 * However, we mostly deal with order-0 pages and thus can
335  	 * avoid a possible cache line miss for requests that fit all
336  	 * page orders.
337  	 */
338  	if (n <= v && v <= PAGE_SIZE)
339  		return true;
340  
341  	head = compound_head(page);
342  	v += (page - head) << PAGE_SHIFT;
343  
344  	if (WARN_ON(n > v || v > page_size(head)))
345  		return false;
346  	return true;
347  }
348  
349  size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
350  			 struct iov_iter *i)
351  {
352  	size_t res = 0;
353  	if (!page_copy_sane(page, offset, bytes))
354  		return 0;
355  	if (WARN_ON_ONCE(i->data_source))
356  		return 0;
357  	page += offset / PAGE_SIZE; // first subpage
358  	offset %= PAGE_SIZE;
359  	while (1) {
360  		void *kaddr = kmap_local_page(page);
361  		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
362  		n = _copy_to_iter(kaddr + offset, n, i);
363  		kunmap_local(kaddr);
364  		res += n;
365  		bytes -= n;
366  		if (!bytes || !n)
367  			break;
368  		offset += n;
369  		if (offset == PAGE_SIZE) {
370  			page++;
371  			offset = 0;
372  		}
373  	}
374  	return res;
375  }
376  EXPORT_SYMBOL(copy_page_to_iter);
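
/*
 * Usage sketch (editorial addition): a buffered-read path that already has a
 * page (e.g. from the page cache) hands it straight to the iterator instead
 * of kmapping it by hand; "pos" and "chunk" are hypothetical:
 *
 *	size_t n = copy_page_to_iter(page, offset_in_page(pos), chunk, to);
 *
 *	if (n < chunk)
 *		// stop the read loop: the user buffer faulted part way through
 */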
377  
378  size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
379  				 struct iov_iter *i)
380  {
381  	size_t res = 0;
382  
383  	if (!page_copy_sane(page, offset, bytes))
384  		return 0;
385  	if (WARN_ON_ONCE(i->data_source))
386  		return 0;
387  	page += offset / PAGE_SIZE; // first subpage
388  	offset %= PAGE_SIZE;
389  	while (1) {
390  		void *kaddr = kmap_local_page(page);
391  		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
392  
393  		n = iterate_and_advance(i, n, kaddr + offset,
394  					copy_to_user_iter_nofault,
395  					memcpy_to_iter);
396  		kunmap_local(kaddr);
397  		res += n;
398  		bytes -= n;
399  		if (!bytes || !n)
400  			break;
401  		offset += n;
402  		if (offset == PAGE_SIZE) {
403  			page++;
404  			offset = 0;
405  		}
406  	}
407  	return res;
408  }
409  EXPORT_SYMBOL(copy_page_to_iter_nofault);
410  
411  size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
412  			 struct iov_iter *i)
413  {
414  	size_t res = 0;
415  	if (!page_copy_sane(page, offset, bytes))
416  		return 0;
417  	page += offset / PAGE_SIZE; // first subpage
418  	offset %= PAGE_SIZE;
419  	while (1) {
420  		void *kaddr = kmap_local_page(page);
421  		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
422  		n = _copy_from_iter(kaddr + offset, n, i);
423  		kunmap_local(kaddr);
424  		res += n;
425  		bytes -= n;
426  		if (!bytes || !n)
427  			break;
428  		offset += n;
429  		if (offset == PAGE_SIZE) {
430  			page++;
431  			offset = 0;
432  		}
433  	}
434  	return res;
435  }
436  EXPORT_SYMBOL(copy_page_from_iter);
437  
438  static __always_inline
439  size_t zero_to_user_iter(void __user *iter_to, size_t progress,
440  			 size_t len, void *priv, void *priv2)
441  {
442  	return clear_user(iter_to, len);
443  }
444  
445  static __always_inline
446  size_t zero_to_iter(void *iter_to, size_t progress,
447  		    size_t len, void *priv, void *priv2)
448  {
449  	memset(iter_to, 0, len);
450  	return 0;
451  }
452  
453  size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
454  {
455  	return iterate_and_advance(i, bytes, NULL,
456  				   zero_to_user_iter, zero_to_iter);
457  }
458  EXPORT_SYMBOL(iov_iter_zero);
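
/*
 * Usage sketch (editorial addition): read paths use iov_iter_zero() to fill
 * holes (or the region past EOF) without materialising any pages; "hole_len"
 * is hypothetical:
 *
 *	if (iov_iter_zero(hole_len, to) != hole_len)
 *		return -EFAULT;
 */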
459  
460  size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
461  		size_t bytes, struct iov_iter *i)
462  {
463  	size_t n, copied = 0;
464  	bool uses_kmap = IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) ||
465  			 PageHighMem(page);
466  
467  	if (!page_copy_sane(page, offset, bytes))
468  		return 0;
469  	if (WARN_ON_ONCE(!i->data_source))
470  		return 0;
471  
472  	do {
473  		char *p;
474  
475  		n = bytes - copied;
476  		if (uses_kmap) {
477  			page += offset / PAGE_SIZE;
478  			offset %= PAGE_SIZE;
479  			n = min_t(size_t, n, PAGE_SIZE - offset);
480  		}
481  
482  		p = kmap_atomic(page) + offset;
483  		n = __copy_from_iter(p, n, i);
484  		kunmap_atomic(p);
485  		copied += n;
486  		offset += n;
487  	} while (uses_kmap && copied != bytes && n > 0);
488  
489  	return copied;
490  }
491  EXPORT_SYMBOL(copy_page_from_iter_atomic);
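
/*
 * Usage sketch (editorial addition): the classic buffered-write inner loop,
 * greatly simplified.  The fault-in has to happen before the page is locked,
 * since faulting with the page locked could deadlock on that very page; a
 * copy of zero bytes means the prefaulted data went away again and the step
 * is simply retried.  Everything except the lib/iov_iter.c helpers below is
 * hypothetical:
 *
 *	do {
 *		if (fault_in_iov_iter_readable(i, bytes) == bytes) {
 *			status = -EFAULT;
 *			break;
 *		}
 *		// get and lock the pagecache page for 'pos'
 *		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
 *		// unlock, dirty and release the page
 *		if (unlikely(copied == 0))
 *			continue;		// prefault again and retry
 *		pos += copied;
 *		written += copied;
 *	} while (iov_iter_count(i));
 */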
492  
493  static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
494  {
495  	const struct bio_vec *bvec, *end;
496  
497  	if (!i->count)
498  		return;
499  	i->count -= size;
500  
501  	size += i->iov_offset;
502  
503  	for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
504  		if (likely(size < bvec->bv_len))
505  			break;
506  		size -= bvec->bv_len;
507  	}
508  	i->iov_offset = size;
509  	i->nr_segs -= bvec - i->bvec;
510  	i->bvec = bvec;
511  }
512  
513  static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
514  {
515  	const struct iovec *iov, *end;
516  
517  	if (!i->count)
518  		return;
519  	i->count -= size;
520  
521  	size += i->iov_offset; // from beginning of current segment
522  	for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
523  		if (likely(size < iov->iov_len))
524  			break;
525  		size -= iov->iov_len;
526  	}
527  	i->iov_offset = size;
528  	i->nr_segs -= iov - iter_iov(i);
529  	i->__iov = iov;
530  }
531  
532  static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
533  {
534  	const struct folio_queue *folioq = i->folioq;
535  	unsigned int slot = i->folioq_slot;
536  
537  	if (!i->count)
538  		return;
539  	i->count -= size;
540  
541  	if (slot >= folioq_nr_slots(folioq)) {
542  		folioq = folioq->next;
543  		slot = 0;
544  	}
545  
546  	size += i->iov_offset; /* From beginning of current segment. */
547  	do {
548  		size_t fsize = folioq_folio_size(folioq, slot);
549  
550  		if (likely(size < fsize))
551  			break;
552  		size -= fsize;
553  		slot++;
554  		if (slot >= folioq_nr_slots(folioq) && folioq->next) {
555  			folioq = folioq->next;
556  			slot = 0;
557  		}
558  	} while (size);
559  
560  	i->iov_offset = size;
561  	i->folioq_slot = slot;
562  	i->folioq = folioq;
563  }
564  
565  void iov_iter_advance(struct iov_iter *i, size_t size)
566  {
567  	if (unlikely(i->count < size))
568  		size = i->count;
569  	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
570  		i->iov_offset += size;
571  		i->count -= size;
572  	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
573  		/* iovec and kvec have identical layouts */
574  		iov_iter_iovec_advance(i, size);
575  	} else if (iov_iter_is_bvec(i)) {
576  		iov_iter_bvec_advance(i, size);
577  	} else if (iov_iter_is_folioq(i)) {
578  		iov_iter_folioq_advance(i, size);
579  	} else if (iov_iter_is_discard(i)) {
580  		i->count -= size;
581  	}
582  }
583  EXPORT_SYMBOL(iov_iter_advance);
584  
585  static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
586  {
587  	const struct folio_queue *folioq = i->folioq;
588  	unsigned int slot = i->folioq_slot;
589  
590  	for (;;) {
591  		size_t fsize;
592  
593  		if (slot == 0) {
594  			folioq = folioq->prev;
595  			slot = folioq_nr_slots(folioq);
596  		}
597  		slot--;
598  
599  		fsize = folioq_folio_size(folioq, slot);
600  		if (unroll <= fsize) {
601  			i->iov_offset = fsize - unroll;
602  			break;
603  		}
604  		unroll -= fsize;
605  	}
606  
607  	i->folioq_slot = slot;
608  	i->folioq = folioq;
609  }
610  
611  void iov_iter_revert(struct iov_iter *i, size_t unroll)
612  {
613  	if (!unroll)
614  		return;
615  	if (WARN_ON(unroll > MAX_RW_COUNT))
616  		return;
617  	i->count += unroll;
618  	if (unlikely(iov_iter_is_discard(i)))
619  		return;
620  	if (unroll <= i->iov_offset) {
621  		i->iov_offset -= unroll;
622  		return;
623  	}
624  	unroll -= i->iov_offset;
625  	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
626  		BUG(); /* We should never go beyond the start of the specified
627  			* range since we might then be straying into pages that
628  			* aren't pinned.
629  			*/
630  	} else if (iov_iter_is_bvec(i)) {
631  		const struct bio_vec *bvec = i->bvec;
632  		while (1) {
633  			size_t n = (--bvec)->bv_len;
634  			i->nr_segs++;
635  			if (unroll <= n) {
636  				i->bvec = bvec;
637  				i->iov_offset = n - unroll;
638  				return;
639  			}
640  			unroll -= n;
641  		}
642  	} else if (iov_iter_is_folioq(i)) {
643  		i->iov_offset = 0;
644  		iov_iter_folioq_revert(i, unroll);
645  	} else { /* same logic for iovec and kvec */
646  		const struct iovec *iov = iter_iov(i);
647  		while (1) {
648  			size_t n = (--iov)->iov_len;
649  			i->nr_segs++;
650  			if (unroll <= n) {
651  				i->__iov = iov;
652  				i->iov_offset = n - unroll;
653  				return;
654  			}
655  			unroll -= n;
656  		}
657  	}
658  }
659  EXPORT_SYMBOL(iov_iter_revert);
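
/*
 * Usage sketch (editorial addition): since the copy helpers advance the
 * iterator as they go, a caller that has to abandon an operation after a
 * partially successful copy hands the bytes back with iov_iter_revert();
 * "pkt_buf" and "queue_packet" are hypothetical:
 *
 *	size_t copied = copy_from_iter(pkt_buf, len, from);
 *
 *	if (queue_packet(pkt_buf, copied) < 0) {
 *		iov_iter_revert(from, copied);	// undo the advance for a retry
 *		return -EAGAIN;
 *	}
 */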
660  
661  /*
662   * Return the count of just the current iov_iter segment.
663   */
664  size_t iov_iter_single_seg_count(const struct iov_iter *i)
665  {
666  	if (i->nr_segs > 1) {
667  		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
668  			return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
669  		if (iov_iter_is_bvec(i))
670  			return min(i->count, i->bvec->bv_len - i->iov_offset);
671  	}
672  	if (unlikely(iov_iter_is_folioq(i)))
673  		return !i->count ? 0 :
674  			umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
675  	return i->count;
676  }
677  EXPORT_SYMBOL(iov_iter_single_seg_count);
678  
679  void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
680  			const struct kvec *kvec, unsigned long nr_segs,
681  			size_t count)
682  {
683  	WARN_ON(direction & ~(READ | WRITE));
684  	*i = (struct iov_iter){
685  		.iter_type = ITER_KVEC,
686  		.data_source = direction,
687  		.kvec = kvec,
688  		.nr_segs = nr_segs,
689  		.iov_offset = 0,
690  		.count = count
691  	};
692  }
693  EXPORT_SYMBOL(iov_iter_kvec);
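
/*
 * Usage sketch (editorial addition): wrapping a plain kernel buffer so it can
 * be fed to an iterator-based interface.  ITER_DEST/ITER_SOURCE are the
 * <linux/uio.h> spellings of READ/WRITE; "kbuf" and "len" are hypothetical:
 *
 *	struct kvec kv = { .iov_base = kbuf, .iov_len = len };
 *	struct iov_iter iter;
 *
 *	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, len);
 *	// 'iter' can now be handed to e.g. a ->read_iter()-style consumer
 */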
694  
695  void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
696  			const struct bio_vec *bvec, unsigned long nr_segs,
697  			size_t count)
698  {
699  	WARN_ON(direction & ~(READ | WRITE));
700  	*i = (struct iov_iter){
701  		.iter_type = ITER_BVEC,
702  		.data_source = direction,
703  		.bvec = bvec,
704  		.nr_segs = nr_segs,
705  		.iov_offset = 0,
706  		.count = count
707  	};
708  }
709  EXPORT_SYMBOL(iov_iter_bvec);
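
/*
 * Usage sketch (editorial addition): describing page-backed data, as block
 * and networking code do.  bvec_set_page() is the <linux/bvec.h> initialiser;
 * "page", "len" and "offset" are hypothetical:
 *
 *	struct bio_vec bv;
 *	struct iov_iter iter;
 *
 *	bvec_set_page(&bv, page, len, offset);
 *	iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, len);
 */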
710  
711  /**
712   * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
713   * @i: The iterator to initialise.
714   * @direction: The direction of the transfer.
715   * @folioq: The starting point in the folio queue.
716   * @first_slot: The first slot in the folio queue to use.
717   * @offset: The offset into the folio in the first slot to start at.
718   * @count: The size of the I/O buffer in bytes.
719   *
720   * Set up an I/O iterator to either draw data out of the folios in the queue
721   * or to inject data into those folios.  The folios *must* be prevented from
722   * evaporation, either by the caller taking a ref on them or by locking
723   * them.
724   */
725  void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
726  			  const struct folio_queue *folioq, unsigned int first_slot,
727  			  unsigned int offset, size_t count)
728  {
729  	BUG_ON(direction & ~1);
730  	*i = (struct iov_iter) {
731  		.iter_type = ITER_FOLIOQ,
732  		.data_source = direction,
733  		.folioq = folioq,
734  		.folioq_slot = first_slot,
735  		.count = count,
736  		.iov_offset = offset,
737  	};
738  }
739  EXPORT_SYMBOL(iov_iter_folio_queue);
740  
741  /**
742   * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
743   * @i: The iterator to initialise.
744   * @direction: The direction of the transfer.
745   * @xarray: The xarray to access.
746   * @start: The start file position.
747   * @count: The size of the I/O buffer in bytes.
748   *
749   * Set up an I/O iterator to either draw data out of the pages attached to an
750   * inode or to inject data into those pages.  The pages *must* be prevented
751   * from evaporation, either by the caller taking a ref on them or by
752   * locking them.
753   */
754  void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
755  		     struct xarray *xarray, loff_t start, size_t count)
756  {
757  	BUG_ON(direction & ~1);
758  	*i = (struct iov_iter) {
759  		.iter_type = ITER_XARRAY,
760  		.data_source = direction,
761  		.xarray = xarray,
762  		.xarray_start = start,
763  		.count = count,
764  		.iov_offset = 0
765  	};
766  }
767  EXPORT_SYMBOL(iov_iter_xarray);
768  
769  /**
770   * iov_iter_discard - Initialise an I/O iterator that discards data
771   * @i: The iterator to initialise.
772   * @direction: The direction of the transfer.
773   * @count: The size of the I/O buffer in bytes.
774   *
775   * Set up an I/O iterator that just discards everything that's written to it.
776   * It's only available as a READ iterator.
777   */
778  void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
779  {
780  	BUG_ON(direction != READ);
781  	*i = (struct iov_iter){
782  		.iter_type = ITER_DISCARD,
783  		.data_source = false,
784  		.count = count,
785  		.iov_offset = 0
786  	};
787  }
788  EXPORT_SYMBOL(iov_iter_discard);
789  
790  static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
791  				   unsigned len_mask)
792  {
793  	const struct iovec *iov = iter_iov(i);
794  	size_t size = i->count;
795  	size_t skip = i->iov_offset;
796  
797  	do {
798  		size_t len = iov->iov_len - skip;
799  
800  		if (len > size)
801  			len = size;
802  		if (len & len_mask)
803  			return false;
804  		if ((unsigned long)(iov->iov_base + skip) & addr_mask)
805  			return false;
806  
807  		iov++;
808  		size -= len;
809  		skip = 0;
810  	} while (size);
811  
812  	return true;
813  }
814  
815  static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
816  				  unsigned len_mask)
817  {
818  	const struct bio_vec *bvec = i->bvec;
819  	unsigned skip = i->iov_offset;
820  	size_t size = i->count;
821  
822  	do {
823  		size_t len = bvec->bv_len;
824  
825  		if (len > size)
826  			len = size;
827  		if (len & len_mask)
828  			return false;
829  		if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
830  			return false;
831  
832  		bvec++;
833  		size -= len;
834  		skip = 0;
835  	} while (size);
836  
837  	return true;
838  }
839  
840  /**
841   * iov_iter_is_aligned() - Check if the addresses and lengths of each segment
842   * 	are aligned to the parameters.
843   *
844   * @i: &struct iov_iter to check
845   * @addr_mask: bit mask to check against the iov element's addresses
846   * @len_mask: bit mask to check against the iov element's lengths
847   *
848   * Return: false if any addresses or lengths intersect with the provided masks
849   */
850  bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
851  			 unsigned len_mask)
852  {
853  	if (likely(iter_is_ubuf(i))) {
854  		if (i->count & len_mask)
855  			return false;
856  		if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
857  			return false;
858  		return true;
859  	}
860  
861  	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
862  		return iov_iter_aligned_iovec(i, addr_mask, len_mask);
863  
864  	if (iov_iter_is_bvec(i))
865  		return iov_iter_aligned_bvec(i, addr_mask, len_mask);
866  
867  	/* With both xarray and folioq types, we're dealing with whole folios. */
868  	if (iov_iter_is_xarray(i)) {
869  		if (i->count & len_mask)
870  			return false;
871  		if ((i->xarray_start + i->iov_offset) & addr_mask)
872  			return false;
873  	}
874  	if (iov_iter_is_folioq(i)) {
875  		if (i->count & len_mask)
876  			return false;
877  		if (i->iov_offset & addr_mask)
878  			return false;
879  	}
880  
881  	return true;
882  }
883  EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
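
/*
 * Usage sketch (editorial addition): direct-I/O code commonly uses this to
 * reject requests that are not logical-block aligned; "bdev" is hypothetical
 * and the masks are size-minus-one of a power-of-two block size:
 *
 *	unsigned int mask = bdev_logical_block_size(bdev) - 1;
 *
 *	if (!iov_iter_is_aligned(iter, mask, mask))
 *		return -EINVAL;
 */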
884  
885  static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
886  {
887  	const struct iovec *iov = iter_iov(i);
888  	unsigned long res = 0;
889  	size_t size = i->count;
890  	size_t skip = i->iov_offset;
891  
892  	do {
893  		size_t len = iov->iov_len - skip;
894  		if (len) {
895  			res |= (unsigned long)iov->iov_base + skip;
896  			if (len > size)
897  				len = size;
898  			res |= len;
899  			size -= len;
900  		}
901  		iov++;
902  		skip = 0;
903  	} while (size);
904  	return res;
905  }
906  
907  static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
908  {
909  	const struct bio_vec *bvec = i->bvec;
910  	unsigned res = 0;
911  	size_t size = i->count;
912  	unsigned skip = i->iov_offset;
913  
914  	do {
915  		size_t len = bvec->bv_len - skip;
916  		res |= (unsigned long)bvec->bv_offset + skip;
917  		if (len > size)
918  			len = size;
919  		res |= len;
920  		bvec++;
921  		size -= len;
922  		skip = 0;
923  	} while (size);
924  
925  	return res;
926  }
927  
928  unsigned long iov_iter_alignment(const struct iov_iter *i)
929  {
930  	if (likely(iter_is_ubuf(i))) {
931  		size_t size = i->count;
932  		if (size)
933  			return ((unsigned long)i->ubuf + i->iov_offset) | size;
934  		return 0;
935  	}
936  
937  	/* iovec and kvec have identical layouts */
938  	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
939  		return iov_iter_alignment_iovec(i);
940  
941  	if (iov_iter_is_bvec(i))
942  		return iov_iter_alignment_bvec(i);
943  
944  	/* With both xarray and folioq types, we're dealing with whole folios. */
945  	if (iov_iter_is_folioq(i))
946  		return i->iov_offset | i->count;
947  	if (iov_iter_is_xarray(i))
948  		return (i->xarray_start + i->iov_offset) | i->count;
949  
950  	return 0;
951  }
952  EXPORT_SYMBOL(iov_iter_alignment);
953  
954  unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
955  {
956  	unsigned long res = 0;
957  	unsigned long v = 0;
958  	size_t size = i->count;
959  	unsigned k;
960  
961  	if (iter_is_ubuf(i))
962  		return 0;
963  
964  	if (WARN_ON(!iter_is_iovec(i)))
965  		return ~0U;
966  
967  	for (k = 0; k < i->nr_segs; k++) {
968  		const struct iovec *iov = iter_iov(i) + k;
969  		if (iov->iov_len) {
970  			unsigned long base = (unsigned long)iov->iov_base;
971  			if (v) // if not the first one
972  				res |= base | v; // this start | previous end
973  			v = base + iov->iov_len;
974  			if (size <= iov->iov_len)
975  				break;
976  			size -= iov->iov_len;
977  		}
978  	}
979  	return res;
980  }
981  EXPORT_SYMBOL(iov_iter_gap_alignment);
982  
983  static int want_pages_array(struct page ***res, size_t size,
984  			    size_t start, unsigned int maxpages)
985  {
986  	unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE);
987  
988  	if (count > maxpages)
989  		count = maxpages;
990  	WARN_ON(!count);	// caller should've prevented that
991  	if (!*res) {
992  		*res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
993  		if (!*res)
994  			return 0;
995  	}
996  	return count;
997  }
998  
999  static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
1000  				     struct page ***ppages, size_t maxsize,
1001  				     unsigned maxpages, size_t *_start_offset)
1002  {
1003  	const struct folio_queue *folioq = iter->folioq;
1004  	struct page **pages;
1005  	unsigned int slot = iter->folioq_slot;
1006  	size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset;
1007  
1008  	if (slot >= folioq_nr_slots(folioq)) {
1009  		folioq = folioq->next;
1010  		slot = 0;
1011  		if (WARN_ON(iov_offset != 0))
1012  			return -EIO;
1013  	}
1014  
1015  	maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages);
1016  	if (!maxpages)
1017  		return -ENOMEM;
1018  	*_start_offset = iov_offset & ~PAGE_MASK;
1019  	pages = *ppages;
1020  
1021  	for (;;) {
1022  		struct folio *folio = folioq_folio(folioq, slot);
1023  		size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot);
1024  		size_t part = PAGE_SIZE - offset % PAGE_SIZE;
1025  
1026  		if (offset < fsize) {
1027  			part = umin(part, umin(maxsize - extracted, fsize - offset));
1028  			count -= part;
1029  			iov_offset += part;
1030  			extracted += part;
1031  
1032  			*pages = folio_page(folio, offset / PAGE_SIZE);
1033  			get_page(*pages);
1034  			pages++;
1035  			maxpages--;
1036  		}
1037  
1038  		if (maxpages == 0 || extracted >= maxsize)
1039  			break;
1040  
1041  		if (iov_offset >= fsize) {
1042  			iov_offset = 0;
1043  			slot++;
1044  			if (slot == folioq_nr_slots(folioq) && folioq->next) {
1045  				folioq = folioq->next;
1046  				slot = 0;
1047  			}
1048  		}
1049  	}
1050  
1051  	iter->count = count;
1052  	iter->iov_offset = iov_offset;
1053  	iter->folioq = folioq;
1054  	iter->folioq_slot = slot;
1055  	return extracted;
1056  }
1057  
1058  static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1059  					  pgoff_t index, unsigned int nr_pages)
1060  {
1061  	XA_STATE(xas, xa, index);
1062  	struct page *page;
1063  	unsigned int ret = 0;
1064  
1065  	rcu_read_lock();
1066  	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1067  		if (xas_retry(&xas, page))
1068  			continue;
1069  
1070  		/* Has the page moved or been split? */
1071  		if (unlikely(page != xas_reload(&xas))) {
1072  			xas_reset(&xas);
1073  			continue;
1074  		}
1075  
1076  		pages[ret] = find_subpage(page, xas.xa_index);
1077  		get_page(pages[ret]);
1078  		if (++ret == nr_pages)
1079  			break;
1080  	}
1081  	rcu_read_unlock();
1082  	return ret;
1083  }
1084  
1085  static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1086  				     struct page ***pages, size_t maxsize,
1087  				     unsigned maxpages, size_t *_start_offset)
1088  {
1089  	unsigned nr, offset, count;
1090  	pgoff_t index;
1091  	loff_t pos;
1092  
1093  	pos = i->xarray_start + i->iov_offset;
1094  	index = pos >> PAGE_SHIFT;
1095  	offset = pos & ~PAGE_MASK;
1096  	*_start_offset = offset;
1097  
1098  	count = want_pages_array(pages, maxsize, offset, maxpages);
1099  	if (!count)
1100  		return -ENOMEM;
1101  	nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
1102  	if (nr == 0)
1103  		return 0;
1104  
1105  	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1106  	i->iov_offset += maxsize;
1107  	i->count -= maxsize;
1108  	return maxsize;
1109  }
1110  
1111  /* must be called on a non-empty ITER_UBUF or ITER_IOVEC iterator */
1112  static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
1113  {
1114  	size_t skip;
1115  	long k;
1116  
1117  	if (iter_is_ubuf(i))
1118  		return (unsigned long)i->ubuf + i->iov_offset;
1119  
1120  	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1121  		const struct iovec *iov = iter_iov(i) + k;
1122  		size_t len = iov->iov_len - skip;
1123  
1124  		if (unlikely(!len))
1125  			continue;
1126  		if (*size > len)
1127  			*size = len;
1128  		return (unsigned long)iov->iov_base + skip;
1129  	}
1130  	BUG(); // if it had been empty, we wouldn't get called
1131  }
1132  
1133  /* must be called on a non-empty ITER_BVEC iterator */
1134  static struct page *first_bvec_segment(const struct iov_iter *i,
1135  				       size_t *size, size_t *start)
1136  {
1137  	struct page *page;
1138  	size_t skip = i->iov_offset, len;
1139  
1140  	len = i->bvec->bv_len - skip;
1141  	if (*size > len)
1142  		*size = len;
1143  	skip += i->bvec->bv_offset;
1144  	page = i->bvec->bv_page + skip / PAGE_SIZE;
1145  	*start = skip % PAGE_SIZE;
1146  	return page;
1147  }
1148  
1149  static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
1150  		   struct page ***pages, size_t maxsize,
1151  		   unsigned int maxpages, size_t *start)
1152  {
1153  	unsigned int n, gup_flags = 0;
1154  
1155  	if (maxsize > i->count)
1156  		maxsize = i->count;
1157  	if (!maxsize)
1158  		return 0;
1159  	if (maxsize > MAX_RW_COUNT)
1160  		maxsize = MAX_RW_COUNT;
1161  
1162  	if (likely(user_backed_iter(i))) {
1163  		unsigned long addr;
1164  		int res;
1165  
1166  		if (iov_iter_rw(i) != WRITE)
1167  			gup_flags |= FOLL_WRITE;
1168  		if (i->nofault)
1169  			gup_flags |= FOLL_NOFAULT;
1170  
1171  		addr = first_iovec_segment(i, &maxsize);
1172  		*start = addr % PAGE_SIZE;
1173  		addr &= PAGE_MASK;
1174  		n = want_pages_array(pages, maxsize, *start, maxpages);
1175  		if (!n)
1176  			return -ENOMEM;
1177  		res = get_user_pages_fast(addr, n, gup_flags, *pages);
1178  		if (unlikely(res <= 0))
1179  			return res;
1180  		maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
1181  		iov_iter_advance(i, maxsize);
1182  		return maxsize;
1183  	}
1184  	if (iov_iter_is_bvec(i)) {
1185  		struct page **p;
1186  		struct page *page;
1187  
1188  		page = first_bvec_segment(i, &maxsize, start);
1189  		n = want_pages_array(pages, maxsize, *start, maxpages);
1190  		if (!n)
1191  			return -ENOMEM;
1192  		p = *pages;
1193  		for (int k = 0; k < n; k++)
1194  			get_page(p[k] = page + k);
1195  		maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
1196  		i->count -= maxsize;
1197  		i->iov_offset += maxsize;
1198  		if (i->iov_offset == i->bvec->bv_len) {
1199  			i->iov_offset = 0;
1200  			i->bvec++;
1201  			i->nr_segs--;
1202  		}
1203  		return maxsize;
1204  	}
1205  	if (iov_iter_is_folioq(i))
1206  		return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
1207  	if (iov_iter_is_xarray(i))
1208  		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1209  	return -EFAULT;
1210  }
1211  
1212  ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
1213  		size_t maxsize, unsigned maxpages, size_t *start)
1214  {
1215  	if (!maxpages)
1216  		return 0;
1217  	BUG_ON(!pages);
1218  
1219  	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
1220  }
1221  EXPORT_SYMBOL(iov_iter_get_pages2);
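
/*
 * Usage sketch (editorial addition): iov_iter_get_pages2() takes a reference
 * on each returned page and advances the iterator, so the caller drops the
 * references once the I/O is done.  The array size and "want" are arbitrary:
 *
 *	struct page *pages[16];
 *	size_t off;
 *	ssize_t bytes;
 *	int n;
 *
 *	bytes = iov_iter_get_pages2(i, pages, want, ARRAY_SIZE(pages), &off);
 *	if (bytes > 0) {
 *		// do the I/O against pages[], starting at offset 'off'
 *		for (n = 0; n < DIV_ROUND_UP(off + bytes, PAGE_SIZE); n++)
 *			put_page(pages[n]);
 *	}
 */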
1222  
1223  ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
1224  		struct page ***pages, size_t maxsize, size_t *start)
1225  {
1226  	ssize_t len;
1227  
1228  	*pages = NULL;
1229  
1230  	len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
1231  	if (len <= 0) {
1232  		kvfree(*pages);
1233  		*pages = NULL;
1234  	}
1235  	return len;
1236  }
1237  EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
1238  
1239  static int iov_npages(const struct iov_iter *i, int maxpages)
1240  {
1241  	size_t skip = i->iov_offset, size = i->count;
1242  	const struct iovec *p;
1243  	int npages = 0;
1244  
1245  	for (p = iter_iov(i); size; skip = 0, p++) {
1246  		unsigned offs = offset_in_page(p->iov_base + skip);
1247  		size_t len = min(p->iov_len - skip, size);
1248  
1249  		if (len) {
1250  			size -= len;
1251  			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1252  			if (unlikely(npages > maxpages))
1253  				return maxpages;
1254  		}
1255  	}
1256  	return npages;
1257  }
1258  
1259  static int bvec_npages(const struct iov_iter *i, int maxpages)
1260  {
1261  	size_t skip = i->iov_offset, size = i->count;
1262  	const struct bio_vec *p;
1263  	int npages = 0;
1264  
1265  	for (p = i->bvec; size; skip = 0, p++) {
1266  		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1267  		size_t len = min(p->bv_len - skip, size);
1268  
1269  		size -= len;
1270  		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1271  		if (unlikely(npages > maxpages))
1272  			return maxpages;
1273  	}
1274  	return npages;
1275  }
1276  
1277  int iov_iter_npages(const struct iov_iter *i, int maxpages)
1278  {
1279  	if (unlikely(!i->count))
1280  		return 0;
1281  	if (likely(iter_is_ubuf(i))) {
1282  		unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
1283  		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
1284  		return min(npages, maxpages);
1285  	}
1286  	/* iovec and kvec have identical layouts */
1287  	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1288  		return iov_npages(i, maxpages);
1289  	if (iov_iter_is_bvec(i))
1290  		return bvec_npages(i, maxpages);
1291  	if (iov_iter_is_folioq(i)) {
1292  		unsigned offset = i->iov_offset % PAGE_SIZE;
1293  		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1294  		return min(npages, maxpages);
1295  	}
1296  	if (iov_iter_is_xarray(i)) {
1297  		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1298  		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1299  		return min(npages, maxpages);
1300  	}
1301  	return 0;
1302  }
1303  EXPORT_SYMBOL(iov_iter_npages);
1304  
1305  const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1306  {
1307  	*new = *old;
1308  	if (iov_iter_is_bvec(new))
1309  		return new->bvec = kmemdup(new->bvec,
1310  				    new->nr_segs * sizeof(struct bio_vec),
1311  				    flags);
1312  	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
1313  		/* iovec and kvec have identical layout */
1314  		return new->__iov = kmemdup(new->__iov,
1315  				   new->nr_segs * sizeof(struct iovec),
1316  				   flags);
1317  	return NULL;
1318  }
1319  EXPORT_SYMBOL(dup_iter);
1320  
1321  static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
1322  		const struct iovec __user *uvec, u32 nr_segs)
1323  {
1324  	const struct compat_iovec __user *uiov =
1325  		(const struct compat_iovec __user *)uvec;
1326  	int ret = -EFAULT;
1327  	u32 i;
1328  
1329  	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1330  		return -EFAULT;
1331  
1332  	for (i = 0; i < nr_segs; i++) {
1333  		compat_uptr_t buf;
1334  		compat_ssize_t len;
1335  
1336  		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1337  		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1338  
1339  		/* check for compat_size_t not fitting in compat_ssize_t .. */
1340  		if (len < 0) {
1341  			ret = -EINVAL;
1342  			goto uaccess_end;
1343  		}
1344  		iov[i].iov_base = compat_ptr(buf);
1345  		iov[i].iov_len = len;
1346  	}
1347  
1348  	ret = 0;
1349  uaccess_end:
1350  	user_access_end();
1351  	return ret;
1352  }
1353  
1354  static __noclone int copy_iovec_from_user(struct iovec *iov,
1355  		const struct iovec __user *uiov, unsigned long nr_segs)
1356  {
1357  	int ret = -EFAULT;
1358  
1359  	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1360  		return -EFAULT;
1361  
1362  	do {
1363  		void __user *buf;
1364  		ssize_t len;
1365  
1366  		unsafe_get_user(len, &uiov->iov_len, uaccess_end);
1367  		unsafe_get_user(buf, &uiov->iov_base, uaccess_end);
1368  
1369  		/* check for size_t not fitting in ssize_t .. */
1370  		if (unlikely(len < 0)) {
1371  			ret = -EINVAL;
1372  			goto uaccess_end;
1373  		}
1374  		iov->iov_base = buf;
1375  		iov->iov_len = len;
1376  
1377  		uiov++; iov++;
1378  	} while (--nr_segs);
1379  
1380  	ret = 0;
1381  uaccess_end:
1382  	user_access_end();
1383  	return ret;
1384  }
1385  
1386  struct iovec *iovec_from_user(const struct iovec __user *uvec,
1387  		unsigned long nr_segs, unsigned long fast_segs,
1388  		struct iovec *fast_iov, bool compat)
1389  {
1390  	struct iovec *iov = fast_iov;
1391  	int ret;
1392  
1393  	/*
1394  	 * SuS says "The readv() function *may* fail if the iovcnt argument was
1395  	 * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1396  	 * traditionally returned zero for zero segments, so...
1397  	 */
1398  	if (nr_segs == 0)
1399  		return iov;
1400  	if (nr_segs > UIO_MAXIOV)
1401  		return ERR_PTR(-EINVAL);
1402  	if (nr_segs > fast_segs) {
1403  		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1404  		if (!iov)
1405  			return ERR_PTR(-ENOMEM);
1406  	}
1407  
1408  	if (unlikely(compat))
1409  		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1410  	else
1411  		ret = copy_iovec_from_user(iov, uvec, nr_segs);
1412  	if (ret) {
1413  		if (iov != fast_iov)
1414  			kfree(iov);
1415  		return ERR_PTR(ret);
1416  	}
1417  
1418  	return iov;
1419  }
1420  
1421  /*
1422   * Single segment iovec supplied by the user, import it as ITER_UBUF.
1423   */
1424  static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
1425  				   struct iovec **iovp, struct iov_iter *i,
1426  				   bool compat)
1427  {
1428  	struct iovec *iov = *iovp;
1429  	ssize_t ret;
1430  
1431  	if (compat)
1432  		ret = copy_compat_iovec_from_user(iov, uvec, 1);
1433  	else
1434  		ret = copy_iovec_from_user(iov, uvec, 1);
1435  	if (unlikely(ret))
1436  		return ret;
1437  
1438  	ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
1439  	if (unlikely(ret))
1440  		return ret;
1441  	*iovp = NULL;
1442  	return i->count;
1443  }
1444  
1445  ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1446  		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1447  		 struct iov_iter *i, bool compat)
1448  {
1449  	ssize_t total_len = 0;
1450  	unsigned long seg;
1451  	struct iovec *iov;
1452  
1453  	if (nr_segs == 1)
1454  		return __import_iovec_ubuf(type, uvec, iovp, i, compat);
1455  
1456  	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1457  	if (IS_ERR(iov)) {
1458  		*iovp = NULL;
1459  		return PTR_ERR(iov);
1460  	}
1461  
1462  	/*
1463  	 * According to the Single Unix Specification we should return EINVAL if
1464  	 * an element length is < 0 when cast to ssize_t or if the total length
1465  	 * would overflow the ssize_t return value of the system call.
1466  	 *
1467  	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1468  	 * overflow case.
1469  	 */
1470  	for (seg = 0; seg < nr_segs; seg++) {
1471  		ssize_t len = (ssize_t)iov[seg].iov_len;
1472  
1473  		if (!access_ok(iov[seg].iov_base, len)) {
1474  			if (iov != *iovp)
1475  				kfree(iov);
1476  			*iovp = NULL;
1477  			return -EFAULT;
1478  		}
1479  
1480  		if (len > MAX_RW_COUNT - total_len) {
1481  			len = MAX_RW_COUNT - total_len;
1482  			iov[seg].iov_len = len;
1483  		}
1484  		total_len += len;
1485  	}
1486  
1487  	iov_iter_init(i, type, iov, nr_segs, total_len);
1488  	if (iov == *iovp)
1489  		*iovp = NULL;
1490  	else
1491  		*iovp = iov;
1492  	return total_len;
1493  }
1494  
1495  /**
1496   * import_iovec() - Copy an array of &struct iovec from userspace
1497   *     into the kernel, check that it is valid, and initialize a new
1498   *     &struct iov_iter iterator to access it.
1499   *
1500   * @type: One of %READ or %WRITE.
1501   * @uvec: Pointer to the userspace array.
1502   * @nr_segs: Number of elements in userspace array.
1503   * @fast_segs: Number of elements in the array *@iovp points to.
1504   * @iovp: (input and output parameter) Pointer to pointer to (usually small
1505   *     on-stack) kernel array.
1506   * @i: Pointer to iterator that will be initialized on success.
1507   *
1508   * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1509   * then this function places %NULL in *@iovp on return. Otherwise, a new
1510   * array will be allocated and the result placed in *@iovp. This means that
1511   * the caller may call kfree() on *@iovp regardless of whether the small
1512   * on-stack array was used or not (and regardless of whether this function
1513   * returns an error or not).
1514   *
1515   * Return: Negative error code on error, bytes imported on success
1516   */
1517  ssize_t import_iovec(int type, const struct iovec __user *uvec,
1518  		 unsigned nr_segs, unsigned fast_segs,
1519  		 struct iovec **iovp, struct iov_iter *i)
1520  {
1521  	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1522  			      in_compat_syscall());
1523  }
1524  EXPORT_SYMBOL(import_iovec);
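
/*
 * Usage sketch (editorial addition): the usual readv()/writev()-style import,
 * with a small on-stack array for the common case; "uvec", "nr_segs" and
 * "do_the_read" are hypothetical:
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(ITER_DEST, uvec, nr_segs, ARRAY_SIZE(iovstack),
 *			   &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_the_read(&iter);
 *	kfree(iov);	// NULL if the on-stack array was used, so always safe
 *	return ret;
 */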
1525  
1526  int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
1527  {
1528  	if (len > MAX_RW_COUNT)
1529  		len = MAX_RW_COUNT;
1530  	if (unlikely(!access_ok(buf, len)))
1531  		return -EFAULT;
1532  
1533  	iov_iter_ubuf(i, rw, buf, len);
1534  	return 0;
1535  }
1536  EXPORT_SYMBOL_GPL(import_ubuf);
1537  
1538  /**
1539   * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1540   *     iov_iter_save_state() was called.
1541   *
1542   * @i: &struct iov_iter to restore
1543   * @state: state to restore from
1544   *
1545   * Used after iov_iter_save_state() to bring restore @i, if operations may
1546   * have advanced it.
1547   *
1548   * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
1549   */
1550  void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
1551  {
1552  	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
1553  			 !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
1554  		return;
1555  	i->iov_offset = state->iov_offset;
1556  	i->count = state->count;
1557  	if (iter_is_ubuf(i))
1558  		return;
1559  	/*
1560  	 * For the *vec iters, nr_segs + iov is constant - if we increment
1561  	 * the vec, then we also decrement the nr_segs count. Hence we don't
1562  	 * need to track both of these, just one is enough and we can deduct
1563  	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
1564   * size, so we can just increment the iov pointer as they are unionized.
1565  	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
1566  	 * not. Be safe and handle it separately.
1567  	 */
1568  	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
1569  	if (iov_iter_is_bvec(i))
1570  		i->bvec -= state->nr_segs - i->nr_segs;
1571  	else
1572  		i->__iov -= state->nr_segs - i->nr_segs;
1573  	i->nr_segs = state->nr_segs;
1574  }
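
/*
 * Usage sketch (editorial addition): pair this with iov_iter_save_state()
 * when an operation may consume part of the iterator and then need to be
 * redone, as io_uring does before retrying a request; "attempt_io" and
 * "queue_async_retry" are hypothetical:
 *
 *	struct iov_iter_state state;
 *
 *	iov_iter_save_state(iter, &state);
 *	ret = attempt_io(iter);			// may consume part of 'iter'
 *	if (ret == -EAGAIN) {
 *		iov_iter_restore(iter, &state);	// rewind for the retry
 *		return queue_async_retry(iter);
 *	}
 */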
1575  
1576  /*
1577   * Extract a list of contiguous pages from an ITER_FOLIOQ iterator.  This does
1578   * not get references on the pages, nor does it get a pin on them.
1579   */
1580  static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
1581  					     struct page ***pages, size_t maxsize,
1582  					     unsigned int maxpages,
1583  					     iov_iter_extraction_t extraction_flags,
1584  					     size_t *offset0)
1585  {
1586  	const struct folio_queue *folioq = i->folioq;
1587  	struct page **p;
1588  	unsigned int nr = 0;
1589  	size_t extracted = 0, offset, slot = i->folioq_slot;
1590  
1591  	if (slot >= folioq_nr_slots(folioq)) {
1592  		folioq = folioq->next;
1593  		slot = 0;
1594  		if (WARN_ON(i->iov_offset != 0))
1595  			return -EIO;
1596  	}
1597  
1598  	offset = i->iov_offset & ~PAGE_MASK;
1599  	*offset0 = offset;
1600  
1601  	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1602  	if (!maxpages)
1603  		return -ENOMEM;
1604  	p = *pages;
1605  
1606  	for (;;) {
1607  		struct folio *folio = folioq_folio(folioq, slot);
1608  		size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
1609  		size_t part = PAGE_SIZE - offset % PAGE_SIZE;
1610  
1611  		if (offset < fsize) {
1612  			part = umin(part, umin(maxsize - extracted, fsize - offset));
1613  			i->count -= part;
1614  			i->iov_offset += part;
1615  			extracted += part;
1616  
1617  			p[nr++] = folio_page(folio, offset / PAGE_SIZE);
1618  		}
1619  
1620  		if (nr >= maxpages || extracted >= maxsize)
1621  			break;
1622  
1623  		if (i->iov_offset >= fsize) {
1624  			i->iov_offset = 0;
1625  			slot++;
1626  			if (slot == folioq_nr_slots(folioq) && folioq->next) {
1627  				folioq = folioq->next;
1628  				slot = 0;
1629  			}
1630  		}
1631  	}
1632  
1633  	i->folioq = folioq;
1634  	i->folioq_slot = slot;
1635  	return extracted;
1636  }
1637  
1638  /*
1639   * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
1640   * get references on the pages, nor does it get a pin on them.
1641   */
1642  static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
1643  					     struct page ***pages, size_t maxsize,
1644  					     unsigned int maxpages,
1645  					     iov_iter_extraction_t extraction_flags,
1646  					     size_t *offset0)
1647  {
1648  	struct page *page, **p;
1649  	unsigned int nr = 0, offset;
1650  	loff_t pos = i->xarray_start + i->iov_offset;
1651  	pgoff_t index = pos >> PAGE_SHIFT;
1652  	XA_STATE(xas, i->xarray, index);
1653  
1654  	offset = pos & ~PAGE_MASK;
1655  	*offset0 = offset;
1656  
1657  	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1658  	if (!maxpages)
1659  		return -ENOMEM;
1660  	p = *pages;
1661  
1662  	rcu_read_lock();
1663  	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1664  		if (xas_retry(&xas, page))
1665  			continue;
1666  
1667  		/* Has the page moved or been split? */
1668  		if (unlikely(page != xas_reload(&xas))) {
1669  			xas_reset(&xas);
1670  			continue;
1671  		}
1672  
1673  		p[nr++] = find_subpage(page, xas.xa_index);
1674  		if (nr == maxpages)
1675  			break;
1676  	}
1677  	rcu_read_unlock();
1678  
1679  	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1680  	iov_iter_advance(i, maxsize);
1681  	return maxsize;
1682  }
1683  
1684  /*
1685   * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
1686   * This does not get references on the pages, nor does it get a pin on them.
1687   */
1688  static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
1689  					   struct page ***pages, size_t maxsize,
1690  					   unsigned int maxpages,
1691  					   iov_iter_extraction_t extraction_flags,
1692  					   size_t *offset0)
1693  {
1694  	size_t skip = i->iov_offset, size = 0;
1695  	struct bvec_iter bi;
1696  	int k = 0;
1697  
1698  	if (i->nr_segs == 0)
1699  		return 0;
1700  
1701  	if (i->iov_offset == i->bvec->bv_len) {
1702  		i->iov_offset = 0;
1703  		i->nr_segs--;
1704  		i->bvec++;
1705  		skip = 0;
1706  	}
1707  	bi.bi_idx = 0;
1708  	bi.bi_size = maxsize;
1709  	bi.bi_bvec_done = skip;
1710  
1711  	maxpages = want_pages_array(pages, maxsize, skip, maxpages);
1712  
1713  	while (bi.bi_size && bi.bi_idx < i->nr_segs) {
1714  		struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);
1715  
1716  		/*
1717  		 * The iov_iter_extract_pages interface only allows an offset
1718  		 * into the first page.  Break out of the loop if we see an
1719   * offset into subsequent pages; the caller will have to call
1720   * iov_iter_extract_pages again for the remainder.
1721  		 */
1722  		if (k) {
1723  			if (bv.bv_offset)
1724  				break;
1725  		} else {
1726  			*offset0 = bv.bv_offset;
1727  		}
1728  
1729  		(*pages)[k++] = bv.bv_page;
1730  		size += bv.bv_len;
1731  
1732  		if (k >= maxpages)
1733  			break;
1734  
1735  		/*
1736  		 * We are done when the end of the bvec doesn't align to a page
1737  		 * boundary as that would create a hole in the returned space.
1738  		 * The caller will handle this with another call to
1739  		 * iov_iter_extract_pages.
1740  		 */
1741  		if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
1742  			break;
1743  
1744  		bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
1745  	}
1746  
1747  	iov_iter_advance(i, size);
1748  	return size;
1749  }
1750  
1751  /*
1752   * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
1753   * This does not get references on the pages, nor does it get a pin on them.
1754   */
1755  static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
1756  					   struct page ***pages, size_t maxsize,
1757  					   unsigned int maxpages,
1758  					   iov_iter_extraction_t extraction_flags,
1759  					   size_t *offset0)
1760  {
1761  	struct page **p, *page;
1762  	const void *kaddr;
1763  	size_t skip = i->iov_offset, offset, len, size;
1764  	int k;
1765  
1766  	for (;;) {
1767  		if (i->nr_segs == 0)
1768  			return 0;
1769  		size = min(maxsize, i->kvec->iov_len - skip);
1770  		if (size)
1771  			break;
1772  		i->iov_offset = 0;
1773  		i->nr_segs--;
1774  		i->kvec++;
1775  		skip = 0;
1776  	}
1777  
1778  	kaddr = i->kvec->iov_base + skip;
1779  	offset = (unsigned long)kaddr & ~PAGE_MASK;
1780  	*offset0 = offset;
1781  
1782  	maxpages = want_pages_array(pages, size, offset, maxpages);
1783  	if (!maxpages)
1784  		return -ENOMEM;
1785  	p = *pages;
1786  
1787  	kaddr -= offset;
1788  	len = offset + size;
1789  	for (k = 0; k < maxpages; k++) {
1790  		size_t seg = min_t(size_t, len, PAGE_SIZE);
1791  
1792  		if (is_vmalloc_or_module_addr(kaddr))
1793  			page = vmalloc_to_page(kaddr);
1794  		else
1795  			page = virt_to_page(kaddr);
1796  
1797  		p[k] = page;
1798  		len -= seg;
1799  		kaddr += PAGE_SIZE;
1800  	}
1801  
1802  	size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
1803  	iov_iter_advance(i, size);
1804  	return size;
1805  }
1806  
1807  /*
1808   * Extract a list of contiguous pages from a user iterator and get a pin on
1809   * each of them.  This should only be used if the iterator is user-backed
1810   * (IOBUF/UBUF).
1811   *
1812   * It does not get refs on the pages, but the pages must be unpinned by the
1813   * caller once the transfer is complete.
1814   *
1815   * This is safe to be used where background IO/DMA *is* going to be modifying
1816   * the buffer; using a pin rather than a ref forces fork() to give the
1817   * child a copy of the page.
1818   */
1819  static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
1820  					   struct page ***pages,
1821  					   size_t maxsize,
1822  					   unsigned int maxpages,
1823  					   iov_iter_extraction_t extraction_flags,
1824  					   size_t *offset0)
1825  {
1826  	unsigned long addr;
1827  	unsigned int gup_flags = 0;
1828  	size_t offset;
1829  	int res;
1830  
1831  	if (i->data_source == ITER_DEST)
1832  		gup_flags |= FOLL_WRITE;
1833  	if (extraction_flags & ITER_ALLOW_P2PDMA)
1834  		gup_flags |= FOLL_PCI_P2PDMA;
1835  	if (i->nofault)
1836  		gup_flags |= FOLL_NOFAULT;
1837  
1838  	addr = first_iovec_segment(i, &maxsize);
1839  	*offset0 = offset = addr % PAGE_SIZE;
1840  	addr &= PAGE_MASK;
1841  	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
1842  	if (!maxpages)
1843  		return -ENOMEM;
1844  	res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
1845  	if (unlikely(res <= 0))
1846  		return res;
1847  	maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
1848  	iov_iter_advance(i, maxsize);
1849  	return maxsize;
1850  }
1851  
1852  /**
1853   * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
1854   * @i: The iterator to extract from
1855   * @pages: Where to return the list of pages
1856   * @maxsize: The maximum amount of iterator to extract
1857   * @maxpages: The maximum size of the list of pages
1858   * @extraction_flags: Flags to qualify request
1859   * @offset0: Where to return the starting offset into (*@pages)[0]
1860   *
1861   * Extract a list of contiguous pages from the current point of the iterator,
1862   * advancing the iterator.  The maximum number of pages and the maximum amount
1863   * of page contents can be set.
1864   *
1865   * If *@pages is NULL, a page list will be allocated to the required size and
1866   * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
1867   * that the caller allocated a page list at least @maxpages in size and this
1868   * will be filled in.
1869   *
1870   * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
1871   * be allowed on the pages extracted.
1872   *
1873   * The iov_iter_extract_will_pin() function can be used to query how cleanup
1874   * should be performed.
1875   *
1876   * Extra refs or pins on the pages may be obtained as follows:
1877   *
1878   *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
1879   *      added to the pages, but refs will not be taken.
1880   *      iov_iter_extract_will_pin() will return true.
1881   *
1882   *  (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
1883   *      pages are merely listed; no extra refs or pins are obtained.
1884   *      iov_iter_extract_will_pin() will return 0.
1885   *
1886   * Note also:
1887   *
1888   *  (*) Use with ITER_DISCARD is not supported as that has no content.
1889   *
1890   * On success, the function sets *@pages to the new pagelist, if allocated, and
1891   * sets *offset0 to the offset into the first page.
1892   *
1893   * It may also return -ENOMEM and -EFAULT.
1894   */
1895  ssize_t iov_iter_extract_pages(struct iov_iter *i,
1896  			       struct page ***pages,
1897  			       size_t maxsize,
1898  			       unsigned int maxpages,
1899  			       iov_iter_extraction_t extraction_flags,
1900  			       size_t *offset0)
1901  {
1902  	maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
1903  	if (!maxsize)
1904  		return 0;
1905  
1906  	if (likely(user_backed_iter(i)))
1907  		return iov_iter_extract_user_pages(i, pages, maxsize,
1908  						   maxpages, extraction_flags,
1909  						   offset0);
1910  	if (iov_iter_is_kvec(i))
1911  		return iov_iter_extract_kvec_pages(i, pages, maxsize,
1912  						   maxpages, extraction_flags,
1913  						   offset0);
1914  	if (iov_iter_is_bvec(i))
1915  		return iov_iter_extract_bvec_pages(i, pages, maxsize,
1916  						   maxpages, extraction_flags,
1917  						   offset0);
1918  	if (iov_iter_is_folioq(i))
1919  		return iov_iter_extract_folioq_pages(i, pages, maxsize,
1920  						     maxpages, extraction_flags,
1921  						     offset0);
1922  	if (iov_iter_is_xarray(i))
1923  		return iov_iter_extract_xarray_pages(i, pages, maxsize,
1924  						     maxpages, extraction_flags,
1925  						     offset0);
1926  	return -EFAULT;
1927  }
1928  EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
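
/*
 * Usage sketch (editorial addition): letting the function allocate the page
 * list, then cleaning up according to iov_iter_extract_will_pin(); only
 * user-backed iterators leave pins behind.  "maxsize" and "maxpages" are
 * hypothetical:
 *
 *	struct page **pages = NULL;
 *	size_t off;
 *	ssize_t len;
 *	int n;
 *
 *	len = iov_iter_extract_pages(i, &pages, maxsize, maxpages, 0, &off);
 *	if (len > 0) {
 *		// set up DMA against pages[], starting at offset 'off'
 *		if (iov_iter_extract_will_pin(i))
 *			for (n = 0; n < DIV_ROUND_UP(off + len, PAGE_SIZE); n++)
 *				unpin_user_page(pages[n]);
 *		kvfree(pages);		// the list was kvmalloc()ed for us
 *	}
 */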
1929