xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved.  Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

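/*
 * Move "n" bytes between "p" and a bvec-backed uio by mapping each
 * bio_vec page into the kernel and copying through the mapping.
 */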
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

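/*
 * Copy "cnt" bytes between the buffer "p" and a single bio_vec, starting
 * "skip" bytes into the bvec data; "rw" selects the copy direction.
 */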
static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Lookup what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to be
			 * copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment.  Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

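/*
 * Dispatch a bvec-backed uio either to the struct request based copy
 * routine or to the plain bio_vec implementation.
 */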
static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

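/*
 * Move "n" bytes between "p" and an iov_iter backed uio using the
 * kernel's copy_to_iter()/copy_from_iter() helpers.  When "revert" is
 * set the iov_iter is rewound so the caller's iterator is not consumed.
 */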
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (uio->uio_skip)
		iov_iter_advance(uio->uio_iter, uio->uio_skip);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed; in that
	 * case return EFAULT, which is converted to EAGAIN by the kernel's
	 * generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter.  This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}

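/*
 * Dispatch to the appropriate copy routine based on the uio segment type
 * (bvec, iov_iter, or kernel iovec).
 */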
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * One byte in each page is touched and the uio struct is unmodified. Any
 * error terminates prefaulting, as this is only a best-effort attempt to
 * get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages. Direct I/O write pages have been pinned, so a
		 * fault can never occur for these pages.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * Since at least the Linux 4.18 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove() but doesn't modify the uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request.  We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

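/*
 * Replace any user pages that reference the kernel's shared ZERO_PAGE()
 * with freshly allocated, marked zero pages so a Direct I/O write never
 * operates on the global zero page.
 */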
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page will just be allocated so the
			 * contents of the page cannot be changed by the user
			 * while a Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}

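/*
 * Release the pages held for a Direct I/O request, either by unpinning
 * them or by dropping their references (and freeing any marked zero-page
 * replacements), and free the page array itself.
 */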
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
	} else {
		for (long i = 0; i < uio->uio_dio.npages; i++) {
			struct page *p = uio->uio_dio.pages[i];

			if (IS_ZFS_MARKED_PAGE(p)) {
				zfs_unmark_page(p);
				__free_page(p);
				continue;
			}

			put_page(p);
		}
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
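/*
 * Pin the user pages backing the uio with pin_user_pages_unlocked() and
 * record them in uio_dio.pages; returns EFAULT if fewer pages than
 * expected could be pinned.
 */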
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_skip;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
	 * possibly be used here in the future to allow for P2P operations with
	 * user pages.
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

	uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
#endif

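/*
 * Get references on the pages backing the iov_iter with
 * iov_iter_get_pages() / iov_iter_get_pages2() and record them in
 * uio_dio.pages, leaving the iov_iter itself unmodified on return.
 */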
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does not
		 * advance the iov_iter on success, so advance it manually.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif
	}
	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}

/*
 * This function pins user pages. In the event that the user pages are not
 * successfully pinned an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		if (zfs_user_backed_iov_iter(uio->uio_iter))
			error = zfs_uio_pin_user_pages(uio, rw);
		else
			error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
			unpin_user_pages(uio->uio_dio.pages,
			    uio->uio_dio.npages);
#endif
		} else {
			for (long i = 0; i < uio->uio_dio.npages; i++)
				put_page(uio->uio_dio.pages[i]);
		}

		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE && !uio->uio_dio.pinned)
		zfs_uio_dio_check_for_zero_page(uio);

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */