xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c (revision 7a7741af18d6c8a804cc643cb7ecda9d730c6aa6)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved.  Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		switch (uio->uio_segflg) {
		case UIO_USERSPACE:
			/*
			 * p = kernel data pointer
			 * iov->iov_base = user data pointer
			 */
			if (rw == UIO_READ) {
				if (copy_to_user(iov->iov_base+skip, p, cnt))
					return (EFAULT);
			} else {
				unsigned long b_left = 0;
				if (uio->uio_fault_disable) {
					if (!zfs_access_ok(VERIFY_READ,
					    (iov->iov_base + skip), cnt)) {
						return (EFAULT);
					}
					pagefault_disable();
					b_left =
					    __copy_from_user_inatomic(p,
					    (iov->iov_base + skip), cnt);
					pagefault_enable();
				} else {
					b_left =
					    copy_from_user(p,
					    (iov->iov_base + skip), cnt);
				}
				if (b_left > 0) {
					unsigned long c_bytes =
					    cnt - b_left;
					uio->uio_skip += c_bytes;
					ASSERT3U(uio->uio_skip, <,
					    iov->iov_len);
					uio->uio_resid -= c_bytes;
					uio->uio_loffset += c_bytes;
					return (EFAULT);
				}
			}
			break;
		case UIO_SYSSPACE:
			if (rw == UIO_READ)
				memcpy(iov->iov_base + skip, p, cnt);
			else
				memcpy(p, iov->iov_base + skip, cnt);
			break;
		default:
			ASSERT(0);
		}
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

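/*
 * Copy "n" bytes between the kernel buffer "p" and the bio_vec segments
 * described by the uio, advancing uio_skip/uio_resid/uio_loffset as data
 * is copied.
 */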
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

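/*
 * Copy "cnt" bytes between the buffer "p" and a single bio_vec, starting
 * "skip" bytes into the bvec's data.
 */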
static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;		/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Look up the logical offset of the last byte of this
		 * segment.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to be
			 * copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment.  Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

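/*
 * Dispatch a UIO_BVEC move to either the request-based implementation or
 * the plain bio_vec implementation, depending on whether the uio is backed
 * by a struct request.
 */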
static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

#if defined(HAVE_VFS_IOV_ITER)
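/*
 * Copy "n" bytes between the buffer "p" and the uio's iov_iter.  When
 * "revert" is set, the iov_iter is rewound after the copy so the caller's
 * iterator is left unconsumed.
 */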
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (uio->uio_skip)
		iov_iter_advance(uio->uio_iter, uio->uio_skip);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed.  In that
	 * case return EFAULT, which is converted to EAGAIN by the kernel's
	 * generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter.  This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}
#endif

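/*
 * Move "n" bytes between "p" and the uio, dispatching to the copy routine
 * that matches the uio's segment type.
 */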
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
#if defined(HAVE_VFS_IOV_ITER)
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
#endif
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.  Any
 * error terminates the operation, as this is only a best-effort attempt to
 * get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages.  Direct I/O write pages have already been
		 * pinned, so a page fault can never occur for them.
		 */
		return (0);
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		/*
		 * Since at least the Linux 4.9 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
#endif
	} else {
		/* Fault in all user pages */
		ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE);
		const struct iovec *iov = uio->uio_iov;
		int iovcnt = uio->uio_iovcnt;
		size_t skip = uio->uio_skip;
		uint8_t tmp;
		caddr_t p;

		for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) {
			ulong_t cnt = MIN(iov->iov_len - skip, n);
			/* empty iov */
			if (cnt == 0)
				continue;
			n -= cnt;
			/* touch each page in this segment. */
			p = iov->iov_base + skip;
			while (cnt) {
				if (copy_from_user(&tmp, p, 1))
					return (EFAULT);
				ulong_t incr = MIN(cnt, PAGESIZE);
				p += incr;
				cnt -= incr;
			}
			/* touch the last byte in case it straddles a page. */
			p--;
			if (copy_from_user(&tmp, p, 1))
				return (EFAULT);
		}
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove() but doesn't modify the uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
#if defined(HAVE_VFS_IOV_ITER)
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
#endif
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request.  We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
#endif
	} else {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_USERSPACE ||
	    uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
#endif
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}


#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0))

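/* Take an extra reference on the page and tag it as a ZFS-allocated page. */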
static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

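/* Clear the ZFS page tag and drop the extra page reference. */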
static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

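/*
 * If any of the pinned Direct I/O write pages reference the kernel's shared
 * ZERO_PAGE(), replace them with private zero-filled pages so their contents
 * cannot be changed by the user while the write is in flight.
 */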
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page will just be allocated so the
			 * contents of the page can not be changed by the user
			 * while a Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}

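/*
 * Release the pages backing a Direct I/O request: pages that were allocated
 * to replace ZERO_PAGE() are unmarked and freed, all others are unpinned,
 * and the page array itself is freed.
 */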
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{

	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];

		if (IS_ZFS_MARKED_PAGE(p)) {
			zfs_unmark_page(p);
			__free_page(p);
			continue;
		}

		put_page(p);
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

/*
 * zfs_uio_iov_step() is just a modified version of the STEP function of
 * Linux's iov_iter_get_pages().
 */
static int
zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio,
    long *numpages)
{
	unsigned long addr = (unsigned long)(v.iov_base);
	size_t len = v.iov_len;
	unsigned long n = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * A Direct I/O read passes FOLL_WRITE because we are stating that
	 * the kernel will have write access to the user pages: to service
	 * the read, the kernel must write the data into the user pages.
	 */
	long res = get_user_pages_unlocked(
	    P2ALIGN_TYPED(addr, PAGE_SIZE, unsigned long), n,
	    &uio->uio_dio.pages[uio->uio_dio.npages],
	    rw == UIO_READ ? FOLL_WRITE : 0);
	if (res < 0) {
		return (SET_ERROR(-res));
	} else if (len != (res * PAGE_SIZE)) {
		return (SET_ERROR(EFAULT));
	}

	ASSERT3S(len, ==, res * PAGE_SIZE);
	*numpages = res;
	return (0);
}

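/*
 * Pin the user pages for every iovec in the uio, accumulating the number of
 * pinned pages in uio->uio_dio.npages.
 */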
static int
zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	const struct iovec *iovp = uio->uio_iov;
	size_t skip = uio->uio_skip;
	size_t len = uio->uio_resid - skip;

	ASSERT(uio->uio_segflg != UIO_SYSSPACE);

	for (int i = 0; i < uio->uio_iovcnt; i++) {
		struct iovec iov;
		long numpages = 0;

		if (iovp->iov_len == 0) {
			iovp++;
			skip = 0;
			continue;
		}
		iov.iov_len = MIN(len, iovp->iov_len - skip);
		iov.iov_base = iovp->iov_base + skip;
		int error = zfs_uio_iov_step(iov, rw, uio, &numpages);

		if (error)
			return (error);

		uio->uio_dio.npages += numpages;
		len -= iov.iov_len;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}

#if defined(HAVE_VFS_IOV_ITER)
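/*
 * Pin the user pages referenced by the uio's iov_iter.  The iterator is
 * reverted once all pages have been pinned so the caller's uio is left
 * unconsumed.
 */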
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t skip = uio->uio_skip;
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &skip);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &skip);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
		skip = 0;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does not
		 * advance the iov_iter on success, so advance it manually.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif

	}
	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}
#endif /* HAVE_VFS_IOV_ITER */

/*
 * This function pins user pages.  In the event that the user pages are not
 * successfully pinned an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_USERSPACE) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
		error = zfs_uio_get_dio_pages_iov(uio, rw);
#if defined(HAVE_VFS_IOV_ITER)
	} else if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		for (long i = 0; i < uio->uio_dio.npages; i++)
			put_page(uio->uio_dio.pages[i]);
		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE) {
		zfs_uio_dio_check_for_zero_page(uio);
	}

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */