xref: /linux/drivers/iommu/iommufd/io_pagetable.c (revision 8602018b1f17fbdaa5e5d79f4c8603ad20640c12)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The datastructure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and xarray.
10  */
11 #include <linux/dma-buf.h>
12 #include <linux/err.h>
13 #include <linux/errno.h>
14 #include <linux/file.h>
15 #include <linux/iommu.h>
16 #include <linux/iommufd.h>
17 #include <linux/lockdep.h>
18 #include <linux/sched/mm.h>
19 #include <linux/slab.h>
20 #include <uapi/linux/iommufd.h>
21 
22 #include "double_span.h"
23 #include "io_pagetable.h"
24 
/*
 * One element of a list of (pages, byte-range) slices queued to be mapped.
 * Used by iopt_map_pages() and friends to carry ownership of the pages and
 * the placeholder area until the mapping is committed.
 */
struct iopt_pages_list {
	struct iopt_pages *pages;	/* backing storage for this slice */
	struct iopt_area *area;		/* placeholder area, or NULL */
	struct list_head next;
	unsigned long start_byte;	/* byte offset of the slice in pages */
	unsigned long length;		/* slice length in bytes */
};
32 
33 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
34 					struct io_pagetable *iopt,
35 					unsigned long iova,
36 					unsigned long last_iova)
37 {
38 	lockdep_assert_held(&iopt->iova_rwsem);
39 
40 	iter->cur_iova = iova;
41 	iter->last_iova = last_iova;
42 	iter->area = iopt_area_iter_first(iopt, iova, iova);
43 	if (!iter->area)
44 		return NULL;
45 	if (!iter->area->pages) {
46 		iter->area = NULL;
47 		return NULL;
48 	}
49 	return iter->area;
50 }
51 
52 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
53 {
54 	unsigned long last_iova;
55 
56 	if (!iter->area)
57 		return NULL;
58 	last_iova = iopt_area_last_iova(iter->area);
59 	if (iter->last_iova <= last_iova)
60 		return NULL;
61 
62 	iter->cur_iova = last_iova + 1;
63 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
64 					 iter->last_iova);
65 	if (!iter->area)
66 		return NULL;
67 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
68 	    !iter->area->pages) {
69 		iter->area = NULL;
70 		return NULL;
71 	}
72 	return iter->area;
73 }
74 
75 static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
76 				     unsigned long length,
77 				     unsigned long iova_alignment,
78 				     unsigned long page_offset)
79 {
80 	unsigned long aligned_start;
81 
82 	/* ALIGN_UP() */
83 	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
84 		return false;
85 	aligned_start &= ~(iova_alignment - 1);
86 	aligned_start |= page_offset;
87 
88 	if (aligned_start >= last || last - aligned_start < length - 1)
89 		return false;
90 	*start = aligned_start;
91 	return true;
92 }
93 
94 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
95 				    unsigned long length,
96 				    unsigned long iova_alignment,
97 				    unsigned long page_offset)
98 {
99 	if (span->is_used)
100 		return false;
101 	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
102 					length, iova_alignment, page_offset);
103 }
104 
105 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
106 				    unsigned long length,
107 				    unsigned long iova_alignment,
108 				    unsigned long page_offset)
109 {
110 	if (span->is_hole)
111 		return false;
112 	return __alloc_iova_check_range(&span->start_used, span->last_used,
113 					length, iova_alignment, page_offset);
114 }
115 
/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long addr, unsigned long length)
{
	unsigned long page_offset = addr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in addr when building the IOVA, which
	 * increases the chance we can map a THP.
	 */
	if (!addr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(addr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	/* The pagetable demands more alignment than we can provide */
	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	/*
	 * Walk the allowed ranges looking for a hole that is neither reserved
	 * nor occupied by an existing area.
	 */
	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			/* No allowed ranges set: treat everything as allowed */
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}
182 
183 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
184 			   unsigned long length)
185 {
186 	unsigned long last;
187 
188 	lockdep_assert_held(&iopt->iova_rwsem);
189 
190 	if ((iova & (iopt->iova_alignment - 1)))
191 		return -EINVAL;
192 
193 	if (check_add_overflow(iova, length - 1, &last))
194 		return -EOVERFLOW;
195 
196 	/* No reserved IOVA intersects the range */
197 	if (iopt_reserved_iter_first(iopt, iova, last))
198 		return -EINVAL;
199 
200 	/* Check that there is not already a mapping in the range */
201 	if (iopt_area_iter_first(iopt, iova, last))
202 		return -EEXIST;
203 	return 0;
204 }
205 
/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	/* A writable mapping needs writable backing pages */
	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	/* Sub-page offset of the slice; must respect the table's alignment */
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	/* node indexes the area by IOVA in iopt->area_itree */
	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	/* pages_node indexes the area by page index within the pages object */
	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}
243 
244 static struct iopt_area *iopt_area_alloc(void)
245 {
246 	struct iopt_area *area;
247 
248 	area = kzalloc_obj(*area, GFP_KERNEL_ACCOUNT);
249 	if (!area)
250 		return NULL;
251 	RB_CLEAR_NODE(&area->node.rb);
252 	RB_CLEAR_NODE(&area->pages_node.rb);
253 	return area;
254 }
255 
256 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
257 				 struct list_head *pages_list,
258 				 unsigned long length, unsigned long *dst_iova,
259 				 int iommu_prot, unsigned int flags)
260 {
261 	struct iopt_pages_list *elm;
262 	unsigned long start;
263 	unsigned long iova;
264 	int rc = 0;
265 
266 	list_for_each_entry(elm, pages_list, next) {
267 		elm->area = iopt_area_alloc();
268 		if (!elm->area)
269 			return -ENOMEM;
270 	}
271 
272 	down_write(&iopt->iova_rwsem);
273 	if ((length & (iopt->iova_alignment - 1)) || !length) {
274 		rc = -EINVAL;
275 		goto out_unlock;
276 	}
277 
278 	if (flags & IOPT_ALLOC_IOVA) {
279 		/* Use the first entry to guess the ideal IOVA alignment */
280 		elm = list_first_entry(pages_list, struct iopt_pages_list,
281 				       next);
282 		switch (elm->pages->type) {
283 		case IOPT_ADDRESS_USER:
284 			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
285 			break;
286 		case IOPT_ADDRESS_FILE:
287 			start = elm->start_byte + elm->pages->start;
288 			break;
289 		case IOPT_ADDRESS_DMABUF:
290 			start = elm->start_byte + elm->pages->dmabuf.start;
291 			break;
292 		}
293 		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
294 		if (rc)
295 			goto out_unlock;
296 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
297 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
298 			rc = -EINVAL;
299 			goto out_unlock;
300 		}
301 	} else {
302 		rc = iopt_check_iova(iopt, *dst_iova, length);
303 		if (rc)
304 			goto out_unlock;
305 	}
306 
307 	/*
308 	 * Areas are created with a NULL pages so that the IOVA space is
309 	 * reserved and we can unlock the iova_rwsem.
310 	 */
311 	iova = *dst_iova;
312 	list_for_each_entry(elm, pages_list, next) {
313 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
314 				      elm->start_byte, elm->length, iommu_prot);
315 		if (rc)
316 			goto out_unlock;
317 		iova += elm->length;
318 	}
319 
320 out_unlock:
321 	up_write(&iopt->iova_rwsem);
322 	return rc;
323 }
324 
325 static void iopt_abort_area(struct iopt_area *area)
326 {
327 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
328 		WARN_ON(area->pages);
329 	if (area->iopt) {
330 		down_write(&area->iopt->iova_rwsem);
331 		interval_tree_remove(&area->node, &area->iopt->area_itree);
332 		up_write(&area->iopt->iova_rwsem);
333 	}
334 	kfree(area);
335 }
336 
337 void iopt_free_pages_list(struct list_head *pages_list)
338 {
339 	struct iopt_pages_list *elm;
340 
341 	while ((elm = list_first_entry_or_null(pages_list,
342 					       struct iopt_pages_list, next))) {
343 		if (elm->area)
344 			iopt_abort_area(elm->area);
345 		if (elm->pages)
346 			iopt_put_pages(elm->pages);
347 		list_del(&elm->next);
348 		kfree(elm);
349 	}
350 }
351 
/*
 * Fill all attached domains with the PFNs of every element on pages_list.
 * On failure the elements that were already filled are unfilled again, so
 * the domains are left unchanged.
 */
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	/* Unwind only up to (not including) the element that failed */
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}
373 
/*
 * Map every element of pages_list into the io page table. On success the
 * pages and area references move out of the list elements and into the iopt;
 * the caller still frees the list element structures themselves.
 */
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	/* First reserve the IOVA space with placeholder (NULL pages) areas */
	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}
407 
408 static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
409 			   struct iopt_pages *pages, unsigned long *iova,
410 			   unsigned long length, unsigned long start_byte,
411 			   int iommu_prot, unsigned int flags)
412 {
413 	struct iopt_pages_list elm = {};
414 	LIST_HEAD(pages_list);
415 	int rc;
416 
417 	elm.pages = pages;
418 	elm.start_byte = start_byte;
419 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
420 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
421 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
422 	elm.length = length;
423 	list_add(&elm.next, &pages_list);
424 
425 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
426 	if (rc) {
427 		if (elm.area)
428 			iopt_abort_area(elm.area);
429 		if (elm.pages)
430 			iopt_put_pages(elm.pages);
431 		return rc;
432 	}
433 	return 0;
434 }
435 
436 /**
437  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
438  * @ictx: iommufd_ctx the iopt is part of
439  * @iopt: io_pagetable to act on
440  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
441  *        the chosen iova on output. Otherwise is the iova to map to on input
442  * @uptr: User VA to map
443  * @length: Number of bytes to map
444  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
445  * @flags: IOPT_ALLOC_IOVA or zero
446  *
447  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
448  * page tables this will pin the pages and load them into the domain at iova.
449  * For non-domain page tables this will only setup a lazy reference and the
450  * caller must use iopt_access_pages() to touch them.
451  *
452  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
453  * destroyed.
454  */
455 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
456 			unsigned long *iova, void __user *uptr,
457 			unsigned long length, int iommu_prot,
458 			unsigned int flags)
459 {
460 	struct iopt_pages *pages;
461 
462 	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
463 	if (IS_ERR(pages))
464 		return PTR_ERR(pages);
465 
466 	return iopt_map_common(ictx, iopt, pages, iova, length,
467 			       uptr - pages->uptr, iommu_prot, flags);
468 }
469 
/**
 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @fd: fdno of a file to map
 * @start: map file starting at this byte offset
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 */
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, int fd, unsigned long start,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;
	struct dma_buf *dmabuf;
	unsigned long start_byte;
	unsigned long last;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(start, length - 1, &last))
		return -EOVERFLOW;

	/* Byte offset of start within its page */
	start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
	/* Try treating the fd as a dmabuf first, fall back to a plain file */
	if (IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		dmabuf = dma_buf_get(fd);
	else
		dmabuf = ERR_PTR(-ENXIO);

	if (!IS_ERR(dmabuf)) {
		pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
						length,
						iommu_prot & IOMMU_WRITE);
		if (IS_ERR(pages)) {
			/* On failure the dmabuf reference is still ours */
			dma_buf_put(dmabuf);
			return PTR_ERR(pages);
		}
	} else {
		struct file *file;

		file = fget(fd);
		if (!file)
			return -EBADF;

		/* iopt_alloc_file_pages() takes its own reference if needed */
		pages = iopt_alloc_file_pages(file, start_byte, start, length,
					      iommu_prot & IOMMU_WRITE);
		fput(file);
		if (IS_ERR(pages))
			return PTR_ERR(pages);
	}

	return iopt_map_common(ictx, iopt, pages, iova, length,
			       start_byte, iommu_prot, flags);
}
528 
/* Context handed to __iommu_read_and_clear_dirty() via iova_bitmap_for_each() */
struct iova_bitmap_fn_arg {
	unsigned long flags;			/* IOMMU_DIRTY_* flags */
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;	/* bitmap being populated */
};
535 
/*
 * iova_bitmap_for_each() callback: read (and optionally clear) the dirty
 * bits for [iova, iova + length - 1]. The range must be entirely covered by
 * contiguous areas or -EINVAL is returned.
 */
static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		/* Clamp to the requested range within this area */
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	/* The walk must have consumed the whole requested range */
	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}
564 
565 static int
566 iommu_read_and_clear_dirty(struct iommu_domain *domain,
567 			   struct io_pagetable *iopt, unsigned long flags,
568 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
569 {
570 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
571 	struct iommu_iotlb_gather gather;
572 	struct iommu_dirty_bitmap dirty;
573 	struct iova_bitmap_fn_arg arg;
574 	struct iova_bitmap *iter;
575 	int ret = 0;
576 
577 	if (!ops || !ops->read_and_clear_dirty)
578 		return -EOPNOTSUPP;
579 
580 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
581 				 bitmap->page_size,
582 				 u64_to_user_ptr(bitmap->data));
583 	if (IS_ERR(iter))
584 		return -ENOMEM;
585 
586 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
587 
588 	arg.flags = flags;
589 	arg.iopt = iopt;
590 	arg.domain = domain;
591 	arg.dirty = &dirty;
592 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
593 
594 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
595 		iommu_iotlb_sync(domain, &gather);
596 
597 	iova_bitmap_free(iter);
598 
599 	return ret;
600 }
601 
602 int iommufd_check_iova_range(struct io_pagetable *iopt,
603 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
604 {
605 	size_t iommu_pgsize = iopt->iova_alignment;
606 	u64 last_iova;
607 
608 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
609 		return -EOVERFLOW;
610 
611 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
612 		return -EOVERFLOW;
613 
614 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
615 	    ((last_iova + 1) & (iommu_pgsize - 1)))
616 		return -EINVAL;
617 
618 	if (!bitmap->page_size)
619 		return -EINVAL;
620 
621 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
622 	    ((last_iova + 1) & (bitmap->page_size - 1)))
623 		return -EINVAL;
624 
625 	return 0;
626 }
627 
628 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
629 				   struct iommu_domain *domain,
630 				   unsigned long flags,
631 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
632 {
633 	int ret;
634 
635 	ret = iommufd_check_iova_range(iopt, bitmap);
636 	if (ret)
637 		return ret;
638 
639 	down_read(&iopt->iova_rwsem);
640 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
641 	up_read(&iopt->iova_rwsem);
642 
643 	return ret;
644 }
645 
/*
 * Clear the dirty bits of every mapped area in the domain so that dirty
 * tracking starts from a clean snapshot. Uses a NULL iova_bitmap so the
 * cleared state is discarded rather than reported.
 */
static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	/* NULL bitmap: clear only, do not record the dirty state */
	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		/* Skip areas still being constructed or torn down */
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	/* Flush any invalidations gathered above, even on error */
	iommu_iotlb_sync(domain, &gather);
	return ret;
}
674 
675 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
676 			    struct iommu_domain *domain, bool enable)
677 {
678 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
679 	int ret = 0;
680 
681 	if (!ops)
682 		return -EOPNOTSUPP;
683 
684 	down_read(&iopt->iova_rwsem);
685 
686 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
687 	if (enable) {
688 		ret = iopt_clear_dirty_data(iopt, domain);
689 		if (ret)
690 			goto out_unlock;
691 	}
692 
693 	ret = ops->set_dirty_tracking(domain, enable);
694 
695 out_unlock:
696 	up_read(&iopt->iova_rwsem);
697 	return ret;
698 }
699 
/*
 * Collect references to the iopt_pages covering [iova, iova + length - 1],
 * one iopt_pages_list element per area, onto pages_list. The range must be
 * entirely covered by contiguous areas. The caller owns the resulting list
 * and must release it with iopt_free_pages_list().
 */
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		/* Clamp to the requested range within this area */
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc_obj(*elm, GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		/* Hold a reference; released by iopt_free_pages_list() */
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	/* The walk must have covered the whole requested range */
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}
740 
/*
 * Unmap every area fully contained in [start, last]. An area that only
 * partially overlaps the range is an error (-ENOENT); the caller must pass a
 * superset of the existing areas.
 */
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	/* If there are no mapped entries then success */
	int rc = 0;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* The area is locked by an object that has not been destroyed */
		if (area->num_locks) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* Refuse to split an area: it must lie entirely in the range */
		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		/* Progress was made on a different area; reset stall counter */
		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			/* Drop both locks before notifying the accesses */
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100)) {
				rc = -EDEADLOCK;
				goto out_unmapped;
			}
			goto again;
		}

		/* Take ownership of the pages and mark the area dying */
		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);

		/*
		 * After releasing the iova_rwsem concurrent allocation could
		 * place new areas at IOVAs we have already unmapped. Keep
		 * moving the start of the search forward to ignore the area
		 * already unmapped.
		 */
		if (area_last >= last)
			break;
		start = area_last + 1;
	}

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
out_unmapped:
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}
837 
838 /**
839  * iopt_unmap_iova() - Remove a range of iova
840  * @iopt: io_pagetable to act on
841  * @iova: Starting iova to unmap
842  * @length: Number of bytes to unmap
843  * @unmapped: Return number of bytes unmapped
844  *
845  * The requested range must be a superset of existing ranges.
846  * Splitting/truncating IOVA mappings is not allowed.
847  */
848 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
849 		    unsigned long length, unsigned long *unmapped)
850 {
851 	unsigned long iova_last;
852 
853 	if (!length)
854 		return -EINVAL;
855 
856 	if (check_add_overflow(iova, length - 1, &iova_last))
857 		return -EOVERFLOW;
858 
859 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
860 }
861 
/* Unmap every area in the io page table */
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	/* If the IOVAs are empty then unmap all succeeds */
	return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
}
867 
868 /* The caller must always free all the nodes in the allowed_iova rb_root. */
869 int iopt_set_allow_iova(struct io_pagetable *iopt,
870 			struct rb_root_cached *allowed_iova)
871 {
872 	struct iopt_allowed *allowed;
873 
874 	down_write(&iopt->iova_rwsem);
875 	swap(*allowed_iova, iopt->allowed_itree);
876 
877 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
878 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
879 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
880 					     allowed->node.last)) {
881 			swap(*allowed_iova, iopt->allowed_itree);
882 			up_write(&iopt->iova_rwsem);
883 			return -EADDRINUSE;
884 		}
885 	}
886 	up_write(&iopt->iova_rwsem);
887 	return 0;
888 }
889 
890 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
891 		      unsigned long last, void *owner)
892 {
893 	struct iopt_reserved *reserved;
894 
895 	lockdep_assert_held_write(&iopt->iova_rwsem);
896 
897 	if (iopt_area_iter_first(iopt, start, last) ||
898 	    iopt_allowed_iter_first(iopt, start, last))
899 		return -EADDRINUSE;
900 
901 	reserved = kzalloc_obj(*reserved, GFP_KERNEL_ACCOUNT);
902 	if (!reserved)
903 		return -ENOMEM;
904 	reserved->node.start = start;
905 	reserved->node.last = last;
906 	reserved->owner = owner;
907 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
908 	return 0;
909 }
910 
911 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
912 {
913 	struct iopt_reserved *reserved, *next;
914 
915 	lockdep_assert_held_write(&iopt->iova_rwsem);
916 
917 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
918 	     reserved = next) {
919 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
920 
921 		if (reserved->owner == owner) {
922 			interval_tree_remove(&reserved->node,
923 					     &iopt->reserved_itree);
924 			kfree(reserved);
925 		}
926 	}
927 }
928 
/* Locked wrapper to drop all of @owner's reserved ranges */
void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}
935 
/* Initialize an empty io_pagetable: locks, interval trees and xarrays */
void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopt's start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}
953 
/*
 * Tear down an io_pagetable. All domains, accesses and areas must already
 * have been removed by the callers; only the allowed ranges (and, under
 * IOMMUFD_TEST, NULL-owner reservations) remain to be freed here.
 */
void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	/* NOTE(review): presumably the selftest leaves NULL-owner reservations */
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	/* Everything else must be gone by now */
	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}
972 
/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from the
 * domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			/* Hand the storage role to the surviving domain */
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			if (iopt_is_dmabuf(pages)) {
				/* Revoked dmabufs are already unmapped */
				if (!iopt_dmabuf_revoked(pages))
					iopt_area_unmap_domain(area, domain);
				iopt_dmabuf_untrack_domain(pages, area, domain);
			}
			mutex_unlock(&pages->mutex);

			/* Non-dmabuf unmap does not need the pages mutex */
			if (!iopt_is_dmabuf(pages))
				iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	/* This was the last domain: fully unfill and unpin every area */
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		if (iopt_is_dmabuf(pages))
			iopt_dmabuf_untrack_domain(pages, area, domain);
		mutex_unlock(&pages->mutex);
	}
}
1041 
/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		guard(mutex)(&pages->mutex);
		/* dmabufs must register the domain before it can be filled */
		if (iopt_is_dmabuf(pages)) {
			rc = iopt_dmabuf_track_domain(pages, area, domain);
			if (rc)
				goto out_unfill;
		}
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			if (iopt_is_dmabuf(pages))
				iopt_dmabuf_untrack_domain(pages, area, domain);
			goto out_unfill;
		}
		/* First domain added becomes the storage domain */
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
	}
	return 0;

out_unfill:
	/* Unwind every area that was filled before the failing one */
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		if (iopt_is_dmabuf(pages))
			iopt_dmabuf_untrack_domain(pages, area, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}
1111 
1112 /* All existing area's conform to an increased page size */
1113 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1114 				     unsigned long new_iova_alignment)
1115 {
1116 	unsigned long align_mask = new_iova_alignment - 1;
1117 	struct iopt_area *area;
1118 
1119 	lockdep_assert_held(&iopt->iova_rwsem);
1120 	lockdep_assert_held(&iopt->domains_rwsem);
1121 
1122 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1123 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1124 		if ((iopt_area_iova(area) & align_mask) ||
1125 		    (iopt_area_length(area) & align_mask) ||
1126 		    (area->page_offset & align_mask))
1127 			return -EADDRINUSE;
1128 
1129 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1130 		struct iommufd_access *access;
1131 		unsigned long index;
1132 
1133 		xa_for_each(&iopt->access_list, index, access)
1134 			if (WARN_ON(access->iova_alignment >
1135 				    new_iova_alignment))
1136 				return -EADDRINUSE;
1137 	}
1138 	return 0;
1139 }
1140 
/**
 * iopt_table_add_domain() - Attach an iommu_domain to the io_pagetable
 * @iopt: io_pagetable to act on
 * @domain: domain to add
 *
 * Validates that the domain's page size and aperture are compatible with the
 * existing areas, reserves the IOVA outside the aperture, maps every area's
 * PFNs into the domain and then stores the domain in the domains xarray. On
 * failure the iopt is left unchanged.
 */
int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	/* The same domain must not be added twice */
	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * A iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	/* A stricter alignment requires every existing area to conform */
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	/* Pre-allocate the xarray slot so the final store cannot fail */
	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
1218 
1219 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1220 {
1221 	unsigned long new_iova_alignment;
1222 	struct iommufd_access *access;
1223 	struct iommu_domain *domain;
1224 	unsigned long index;
1225 
1226 	lockdep_assert_held_write(&iopt->iova_rwsem);
1227 	lockdep_assert_held(&iopt->domains_rwsem);
1228 
1229 	/* See batch_iommu_map_small() */
1230 	if (iopt->disable_large_pages)
1231 		new_iova_alignment = PAGE_SIZE;
1232 	else
1233 		new_iova_alignment = 1;
1234 
1235 	xa_for_each(&iopt->domains, index, domain)
1236 		new_iova_alignment = max_t(unsigned long,
1237 					   1UL << __ffs(domain->pgsize_bitmap),
1238 					   new_iova_alignment);
1239 	xa_for_each(&iopt->access_list, index, access)
1240 		new_iova_alignment = max_t(unsigned long,
1241 					   access->iova_alignment,
1242 					   new_iova_alignment);
1243 
1244 	if (new_iova_alignment > iopt->iova_alignment) {
1245 		int rc;
1246 
1247 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1248 		if (rc)
1249 			return rc;
1250 	}
1251 	iopt->iova_alignment = new_iova_alignment;
1252 	return 0;
1253 }
1254 
/**
 * iopt_table_remove_domain() - Detach an iommu_domain from the io_pagetable
 * @iopt: io_pagetable to act on
 * @domain: domain to remove
 *
 * Unmaps every area from @domain, removes the domain from the domains xarray
 * and drops the IOVA reservations that were made for its aperture. WARNs if
 * @domain was never added.
 */
void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	/* Find the index holding this domain */
	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	/* Removing a domain can only relax the alignment, so cannot fail */
	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}
1287 
1288 /**
1289  * iopt_area_split - Split an area into two parts at iova
1290  * @area: The area to split
1291  * @iova: Becomes the last of a new area
1292  *
1293  * This splits an area into two. It is part of the VFIO compatibility to allow
1294  * poking a hole in the mapping. The two areas continue to point at the same
1295  * iopt_pages, just with different starting bytes.
1296  */
1297 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1298 {
1299 	unsigned long alignment = area->iopt->iova_alignment;
1300 	unsigned long last_iova = iopt_area_last_iova(area);
1301 	unsigned long start_iova = iopt_area_iova(area);
1302 	unsigned long new_start = iova + 1;
1303 	struct io_pagetable *iopt = area->iopt;
1304 	struct iopt_pages *pages = area->pages;
1305 	struct iopt_area *lhs;
1306 	struct iopt_area *rhs;
1307 	int rc;
1308 
1309 	lockdep_assert_held_write(&iopt->iova_rwsem);
1310 
1311 	if (iova == start_iova || iova == last_iova)
1312 		return 0;
1313 
1314 	if (!pages || area->prevent_access)
1315 		return -EBUSY;
1316 
1317 	/* Maintaining the domains_itree below is a bit complicated */
1318 	if (iopt_is_dmabuf(pages))
1319 		return -EOPNOTSUPP;
1320 
1321 	if (new_start & (alignment - 1) ||
1322 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1323 		return -EINVAL;
1324 
1325 	lhs = iopt_area_alloc();
1326 	if (!lhs)
1327 		return -ENOMEM;
1328 
1329 	rhs = iopt_area_alloc();
1330 	if (!rhs) {
1331 		rc = -ENOMEM;
1332 		goto err_free_lhs;
1333 	}
1334 
1335 	mutex_lock(&pages->mutex);
1336 	/*
1337 	 * Splitting is not permitted if an access exists, we don't track enough
1338 	 * information to split existing accesses.
1339 	 */
1340 	if (area->num_accesses) {
1341 		rc = -EINVAL;
1342 		goto err_unlock;
1343 	}
1344 
1345 	/*
1346 	 * Splitting is not permitted if a domain could have been mapped with
1347 	 * huge pages.
1348 	 */
1349 	if (area->storage_domain && !iopt->disable_large_pages) {
1350 		rc = -EINVAL;
1351 		goto err_unlock;
1352 	}
1353 
1354 	interval_tree_remove(&area->node, &iopt->area_itree);
1355 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1356 			      iopt_area_start_byte(area, start_iova),
1357 			      (new_start - 1) - start_iova + 1,
1358 			      area->iommu_prot);
1359 	if (WARN_ON(rc))
1360 		goto err_insert;
1361 
1362 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1363 			      iopt_area_start_byte(area, new_start),
1364 			      last_iova - new_start + 1, area->iommu_prot);
1365 	if (WARN_ON(rc))
1366 		goto err_remove_lhs;
1367 
1368 	/*
1369 	 * If the original area has filled a domain, domains_itree has to be
1370 	 * updated.
1371 	 */
1372 	if (area->storage_domain) {
1373 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1374 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1375 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1376 	}
1377 
1378 	lhs->storage_domain = area->storage_domain;
1379 	lhs->pages = area->pages;
1380 	rhs->storage_domain = area->storage_domain;
1381 	rhs->pages = area->pages;
1382 	kref_get(&rhs->pages->kref);
1383 	kfree(area);
1384 	mutex_unlock(&pages->mutex);
1385 
1386 	/*
1387 	 * No change to domains or accesses because the pages hasn't been
1388 	 * changed
1389 	 */
1390 	return 0;
1391 
1392 err_remove_lhs:
1393 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1394 err_insert:
1395 	interval_tree_insert(&area->node, &iopt->area_itree);
1396 err_unlock:
1397 	mutex_unlock(&pages->mutex);
1398 	kfree(rhs);
1399 err_free_lhs:
1400 	kfree(lhs);
1401 	return rc;
1402 }
1403 
1404 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1405 		  size_t num_iovas)
1406 {
1407 	int rc = 0;
1408 	int i;
1409 
1410 	down_write(&iopt->iova_rwsem);
1411 	for (i = 0; i < num_iovas; i++) {
1412 		struct iopt_area *area;
1413 
1414 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1415 		if (!area)
1416 			continue;
1417 		rc = iopt_area_split(area, iovas[i]);
1418 		if (rc)
1419 			break;
1420 	}
1421 	up_write(&iopt->iova_rwsem);
1422 	return rc;
1423 }
1424 
1425 void iopt_enable_large_pages(struct io_pagetable *iopt)
1426 {
1427 	int rc;
1428 
1429 	down_write(&iopt->domains_rwsem);
1430 	down_write(&iopt->iova_rwsem);
1431 	WRITE_ONCE(iopt->disable_large_pages, false);
1432 	rc = iopt_calculate_iova_alignment(iopt);
1433 	WARN_ON(rc);
1434 	up_write(&iopt->iova_rwsem);
1435 	up_write(&iopt->domains_rwsem);
1436 }
1437 
1438 int iopt_disable_large_pages(struct io_pagetable *iopt)
1439 {
1440 	int rc = 0;
1441 
1442 	down_write(&iopt->domains_rwsem);
1443 	down_write(&iopt->iova_rwsem);
1444 	if (iopt->disable_large_pages)
1445 		goto out_unlock;
1446 
1447 	/* Won't do it if domains already have pages mapped in them */
1448 	if (!xa_empty(&iopt->domains) &&
1449 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1450 		rc = -EINVAL;
1451 		goto out_unlock;
1452 	}
1453 
1454 	WRITE_ONCE(iopt->disable_large_pages, true);
1455 	rc = iopt_calculate_iova_alignment(iopt);
1456 	if (rc)
1457 		WRITE_ONCE(iopt->disable_large_pages, false);
1458 out_unlock:
1459 	up_write(&iopt->iova_rwsem);
1460 	up_write(&iopt->domains_rwsem);
1461 	return rc;
1462 }
1463 
1464 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1465 {
1466 	u32 new_id;
1467 	int rc;
1468 
1469 	down_write(&iopt->domains_rwsem);
1470 	down_write(&iopt->iova_rwsem);
1471 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1472 		      GFP_KERNEL_ACCOUNT);
1473 
1474 	if (rc)
1475 		goto out_unlock;
1476 
1477 	rc = iopt_calculate_iova_alignment(iopt);
1478 	if (rc) {
1479 		xa_erase(&iopt->access_list, new_id);
1480 		goto out_unlock;
1481 	}
1482 	access->iopt_access_list_id = new_id;
1483 
1484 out_unlock:
1485 	up_write(&iopt->iova_rwsem);
1486 	up_write(&iopt->domains_rwsem);
1487 	return rc;
1488 }
1489 
1490 void iopt_remove_access(struct io_pagetable *iopt,
1491 			struct iommufd_access *access, u32 iopt_access_list_id)
1492 {
1493 	down_write(&iopt->domains_rwsem);
1494 	down_write(&iopt->iova_rwsem);
1495 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1496 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1497 	up_write(&iopt->iova_rwsem);
1498 	up_write(&iopt->domains_rwsem);
1499 }
1500 
/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		/* Relaxable direct regions do not need an IOVA reservation */
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		/* Count MSI region flavors; validity is checked below */
		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	/* Drop any reservations made before the failure */
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}
1552