xref: /linux/drivers/iommu/iommufd/io_pagetable.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The datastructure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and xarray.
10  */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18 #include <uapi/linux/iommufd.h>
19 
20 #include "io_pagetable.h"
21 #include "double_span.h"
22 
/*
 * One entry of a pages list: a slice of an iopt_pages, optionally paired with
 * the iopt_area that is being set up to map that slice.
 */
struct iopt_pages_list {
	/* Backing pages; iopt_free_pages_list() drops it with iopt_put_pages() */
	struct iopt_pages *pages;
	/* Area for this slice; iopt_free_pages_list() aborts it if still set */
	struct iopt_area *area;
	/* Linkage in the caller's list of entries */
	struct list_head next;
	/* Byte offset into @pages where the slice begins */
	unsigned long start_byte;
	/* Size of the slice in bytes */
	unsigned long length;
};
30 
31 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
32 					struct io_pagetable *iopt,
33 					unsigned long iova,
34 					unsigned long last_iova)
35 {
36 	lockdep_assert_held(&iopt->iova_rwsem);
37 
38 	iter->cur_iova = iova;
39 	iter->last_iova = last_iova;
40 	iter->area = iopt_area_iter_first(iopt, iova, iova);
41 	if (!iter->area)
42 		return NULL;
43 	if (!iter->area->pages) {
44 		iter->area = NULL;
45 		return NULL;
46 	}
47 	return iter->area;
48 }
49 
50 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
51 {
52 	unsigned long last_iova;
53 
54 	if (!iter->area)
55 		return NULL;
56 	last_iova = iopt_area_last_iova(iter->area);
57 	if (iter->last_iova <= last_iova)
58 		return NULL;
59 
60 	iter->cur_iova = last_iova + 1;
61 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
62 					 iter->last_iova);
63 	if (!iter->area)
64 		return NULL;
65 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
66 	    !iter->area->pages) {
67 		iter->area = NULL;
68 		return NULL;
69 	}
70 	return iter->area;
71 }
72 
73 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
74 				    unsigned long length,
75 				    unsigned long iova_alignment,
76 				    unsigned long page_offset)
77 {
78 	if (span->is_used || span->last_hole - span->start_hole < length - 1)
79 		return false;
80 
81 	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
82 			   page_offset;
83 	if (span->start_hole > span->last_hole ||
84 	    span->last_hole - span->start_hole < length - 1)
85 		return false;
86 	return true;
87 }
88 
89 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
90 				    unsigned long length,
91 				    unsigned long iova_alignment,
92 				    unsigned long page_offset)
93 {
94 	if (span->is_hole || span->last_used - span->start_used < length - 1)
95 		return false;
96 
97 	span->start_used = ALIGN(span->start_used, iova_alignment) |
98 			   page_offset;
99 	if (span->start_used > span->last_used ||
100 	    span->last_used - span->start_used < length - 1)
101 		return false;
102 	return true;
103 }
104 
105 /*
106  * Automatically find a block of IOVA that is not being used and not reserved.
107  * Does not return a 0 IOVA even if it is valid.
108  */
109 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
110 			   unsigned long uptr, unsigned long length)
111 {
112 	unsigned long page_offset = uptr % PAGE_SIZE;
113 	struct interval_tree_double_span_iter used_span;
114 	struct interval_tree_span_iter allowed_span;
115 	unsigned long iova_alignment;
116 
117 	lockdep_assert_held(&iopt->iova_rwsem);
118 
119 	/* Protect roundup_pow-of_two() from overflow */
120 	if (length == 0 || length >= ULONG_MAX / 2)
121 		return -EOVERFLOW;
122 
123 	/*
124 	 * Keep alignment present in the uptr when building the IOVA, this
125 	 * increases the chance we can map a THP.
126 	 */
127 	if (!uptr)
128 		iova_alignment = roundup_pow_of_two(length);
129 	else
130 		iova_alignment = min_t(unsigned long,
131 				       roundup_pow_of_two(length),
132 				       1UL << __ffs64(uptr));
133 
134 	if (iova_alignment < iopt->iova_alignment)
135 		return -EINVAL;
136 
137 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
138 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
139 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
140 			allowed_span.start_used = PAGE_SIZE;
141 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
142 			allowed_span.is_hole = false;
143 		}
144 
145 		if (!__alloc_iova_check_used(&allowed_span, length,
146 					     iova_alignment, page_offset))
147 			continue;
148 
149 		interval_tree_for_each_double_span(
150 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
151 			allowed_span.start_used, allowed_span.last_used) {
152 			if (!__alloc_iova_check_hole(&used_span, length,
153 						     iova_alignment,
154 						     page_offset))
155 				continue;
156 
157 			*iova = used_span.start_hole;
158 			return 0;
159 		}
160 	}
161 	return -ENOSPC;
162 }
163 
164 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
165 			   unsigned long length)
166 {
167 	unsigned long last;
168 
169 	lockdep_assert_held(&iopt->iova_rwsem);
170 
171 	if ((iova & (iopt->iova_alignment - 1)))
172 		return -EINVAL;
173 
174 	if (check_add_overflow(iova, length - 1, &last))
175 		return -EOVERFLOW;
176 
177 	/* No reserved IOVA intersects the range */
178 	if (iopt_reserved_iter_first(iopt, iova, last))
179 		return -EINVAL;
180 
181 	/* Check that there is not already a mapping in the range */
182 	if (iopt_area_iter_first(iopt, iova, last))
183 		return -EEXIST;
184 	return 0;
185 }
186 
187 /*
188  * The area takes a slice of the pages from start_bytes to start_byte + length
189  */
190 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
191 			    struct iopt_pages *pages, unsigned long iova,
192 			    unsigned long start_byte, unsigned long length,
193 			    int iommu_prot)
194 {
195 	lockdep_assert_held_write(&iopt->iova_rwsem);
196 
197 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
198 		return -EPERM;
199 
200 	area->iommu_prot = iommu_prot;
201 	area->page_offset = start_byte % PAGE_SIZE;
202 	if (area->page_offset & (iopt->iova_alignment - 1))
203 		return -EINVAL;
204 
205 	area->node.start = iova;
206 	if (check_add_overflow(iova, length - 1, &area->node.last))
207 		return -EOVERFLOW;
208 
209 	area->pages_node.start = start_byte / PAGE_SIZE;
210 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
211 		return -EOVERFLOW;
212 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
213 	if (WARN_ON(area->pages_node.last >= pages->npages))
214 		return -EOVERFLOW;
215 
216 	/*
217 	 * The area is inserted with a NULL pages indicating it is not fully
218 	 * initialized yet.
219 	 */
220 	area->iopt = iopt;
221 	interval_tree_insert(&area->node, &iopt->area_itree);
222 	return 0;
223 }
224 
225 static struct iopt_area *iopt_area_alloc(void)
226 {
227 	struct iopt_area *area;
228 
229 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
230 	if (!area)
231 		return NULL;
232 	RB_CLEAR_NODE(&area->node.rb);
233 	RB_CLEAR_NODE(&area->pages_node.rb);
234 	return area;
235 }
236 
237 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
238 				 struct list_head *pages_list,
239 				 unsigned long length, unsigned long *dst_iova,
240 				 int iommu_prot, unsigned int flags)
241 {
242 	struct iopt_pages_list *elm;
243 	unsigned long iova;
244 	int rc = 0;
245 
246 	list_for_each_entry(elm, pages_list, next) {
247 		elm->area = iopt_area_alloc();
248 		if (!elm->area)
249 			return -ENOMEM;
250 	}
251 
252 	down_write(&iopt->iova_rwsem);
253 	if ((length & (iopt->iova_alignment - 1)) || !length) {
254 		rc = -EINVAL;
255 		goto out_unlock;
256 	}
257 
258 	if (flags & IOPT_ALLOC_IOVA) {
259 		/* Use the first entry to guess the ideal IOVA alignment */
260 		elm = list_first_entry(pages_list, struct iopt_pages_list,
261 				       next);
262 		rc = iopt_alloc_iova(
263 			iopt, dst_iova,
264 			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
265 		if (rc)
266 			goto out_unlock;
267 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
268 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
269 			rc = -EINVAL;
270 			goto out_unlock;
271 		}
272 	} else {
273 		rc = iopt_check_iova(iopt, *dst_iova, length);
274 		if (rc)
275 			goto out_unlock;
276 	}
277 
278 	/*
279 	 * Areas are created with a NULL pages so that the IOVA space is
280 	 * reserved and we can unlock the iova_rwsem.
281 	 */
282 	iova = *dst_iova;
283 	list_for_each_entry(elm, pages_list, next) {
284 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
285 				      elm->start_byte, elm->length, iommu_prot);
286 		if (rc)
287 			goto out_unlock;
288 		iova += elm->length;
289 	}
290 
291 out_unlock:
292 	up_write(&iopt->iova_rwsem);
293 	return rc;
294 }
295 
296 static void iopt_abort_area(struct iopt_area *area)
297 {
298 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
299 		WARN_ON(area->pages);
300 	if (area->iopt) {
301 		down_write(&area->iopt->iova_rwsem);
302 		interval_tree_remove(&area->node, &area->iopt->area_itree);
303 		up_write(&area->iopt->iova_rwsem);
304 	}
305 	kfree(area);
306 }
307 
308 void iopt_free_pages_list(struct list_head *pages_list)
309 {
310 	struct iopt_pages_list *elm;
311 
312 	while ((elm = list_first_entry_or_null(pages_list,
313 					       struct iopt_pages_list, next))) {
314 		if (elm->area)
315 			iopt_abort_area(elm->area);
316 		if (elm->pages)
317 			iopt_put_pages(elm->pages);
318 		list_del(&elm->next);
319 		kfree(elm);
320 	}
321 }
322 
323 static int iopt_fill_domains_pages(struct list_head *pages_list)
324 {
325 	struct iopt_pages_list *undo_elm;
326 	struct iopt_pages_list *elm;
327 	int rc;
328 
329 	list_for_each_entry(elm, pages_list, next) {
330 		rc = iopt_area_fill_domains(elm->area, elm->pages);
331 		if (rc)
332 			goto err_undo;
333 	}
334 	return 0;
335 
336 err_undo:
337 	list_for_each_entry(undo_elm, pages_list, next) {
338 		if (undo_elm == elm)
339 			break;
340 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
341 	}
342 	return rc;
343 }
344 
345 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
346 		   unsigned long length, unsigned long *dst_iova,
347 		   int iommu_prot, unsigned int flags)
348 {
349 	struct iopt_pages_list *elm;
350 	int rc;
351 
352 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
353 				   iommu_prot, flags);
354 	if (rc)
355 		return rc;
356 
357 	down_read(&iopt->domains_rwsem);
358 	rc = iopt_fill_domains_pages(pages_list);
359 	if (rc)
360 		goto out_unlock_domains;
361 
362 	down_write(&iopt->iova_rwsem);
363 	list_for_each_entry(elm, pages_list, next) {
364 		/*
365 		 * area->pages must be set inside the domains_rwsem to ensure
366 		 * any newly added domains will get filled. Moves the reference
367 		 * in from the list.
368 		 */
369 		elm->area->pages = elm->pages;
370 		elm->pages = NULL;
371 		elm->area = NULL;
372 	}
373 	up_write(&iopt->iova_rwsem);
374 out_unlock_domains:
375 	up_read(&iopt->domains_rwsem);
376 	return rc;
377 }
378 
379 /**
380  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
381  * @ictx: iommufd_ctx the iopt is part of
382  * @iopt: io_pagetable to act on
383  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
384  *        the chosen iova on output. Otherwise is the iova to map to on input
385  * @uptr: User VA to map
386  * @length: Number of bytes to map
387  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
388  * @flags: IOPT_ALLOC_IOVA or zero
389  *
390  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
391  * page tables this will pin the pages and load them into the domain at iova.
392  * For non-domain page tables this will only setup a lazy reference and the
393  * caller must use iopt_access_pages() to touch them.
394  *
395  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
396  * destroyed.
397  */
398 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
399 			unsigned long *iova, void __user *uptr,
400 			unsigned long length, int iommu_prot,
401 			unsigned int flags)
402 {
403 	struct iopt_pages_list elm = {};
404 	LIST_HEAD(pages_list);
405 	int rc;
406 
407 	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
408 	if (IS_ERR(elm.pages))
409 		return PTR_ERR(elm.pages);
410 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
411 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
412 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
413 	elm.start_byte = uptr - elm.pages->uptr;
414 	elm.length = length;
415 	list_add(&elm.next, &pages_list);
416 
417 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
418 	if (rc) {
419 		if (elm.area)
420 			iopt_abort_area(elm.area);
421 		if (elm.pages)
422 			iopt_put_pages(elm.pages);
423 		return rc;
424 	}
425 	return 0;
426 }
427 
/*
 * Context handed through iova_bitmap_for_each() as the opaque pointer and
 * unpacked again by __iommu_read_and_clear_dirty().
 */
struct iova_bitmap_fn_arg {
	/* Passed unchanged as the flags argument of ops->read_and_clear_dirty() */
	unsigned long flags;
	/* Page table whose areas are walked for each bitmap chunk */
	struct io_pagetable *iopt;
	/* Domain whose dirty_ops are invoked */
	struct iommu_domain *domain;
	/* Dirty bitmap state passed to ops->read_and_clear_dirty() */
	struct iommu_dirty_bitmap *dirty;
};
434 
435 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
436 					unsigned long iova, size_t length,
437 					void *opaque)
438 {
439 	struct iopt_area *area;
440 	struct iopt_area_contig_iter iter;
441 	struct iova_bitmap_fn_arg *arg = opaque;
442 	struct iommu_domain *domain = arg->domain;
443 	struct iommu_dirty_bitmap *dirty = arg->dirty;
444 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
445 	unsigned long last_iova = iova + length - 1;
446 	unsigned long flags = arg->flags;
447 	int ret;
448 
449 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
450 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
451 
452 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
453 						last - iter.cur_iova + 1, flags,
454 						dirty);
455 		if (ret)
456 			return ret;
457 	}
458 
459 	if (!iopt_area_contig_done(&iter))
460 		return -EINVAL;
461 	return 0;
462 }
463 
464 static int
465 iommu_read_and_clear_dirty(struct iommu_domain *domain,
466 			   struct io_pagetable *iopt, unsigned long flags,
467 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
468 {
469 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
470 	struct iommu_iotlb_gather gather;
471 	struct iommu_dirty_bitmap dirty;
472 	struct iova_bitmap_fn_arg arg;
473 	struct iova_bitmap *iter;
474 	int ret = 0;
475 
476 	if (!ops || !ops->read_and_clear_dirty)
477 		return -EOPNOTSUPP;
478 
479 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
480 				 bitmap->page_size,
481 				 u64_to_user_ptr(bitmap->data));
482 	if (IS_ERR(iter))
483 		return -ENOMEM;
484 
485 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
486 
487 	arg.flags = flags;
488 	arg.iopt = iopt;
489 	arg.domain = domain;
490 	arg.dirty = &dirty;
491 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
492 
493 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
494 		iommu_iotlb_sync(domain, &gather);
495 
496 	iova_bitmap_free(iter);
497 
498 	return ret;
499 }
500 
501 int iommufd_check_iova_range(struct io_pagetable *iopt,
502 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
503 {
504 	size_t iommu_pgsize = iopt->iova_alignment;
505 	u64 last_iova;
506 
507 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
508 		return -EOVERFLOW;
509 
510 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
511 		return -EOVERFLOW;
512 
513 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
514 	    ((last_iova + 1) & (iommu_pgsize - 1)))
515 		return -EINVAL;
516 
517 	if (!bitmap->page_size)
518 		return -EINVAL;
519 
520 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
521 	    ((last_iova + 1) & (bitmap->page_size - 1)))
522 		return -EINVAL;
523 
524 	return 0;
525 }
526 
527 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
528 				   struct iommu_domain *domain,
529 				   unsigned long flags,
530 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
531 {
532 	int ret;
533 
534 	ret = iommufd_check_iova_range(iopt, bitmap);
535 	if (ret)
536 		return ret;
537 
538 	down_read(&iopt->iova_rwsem);
539 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
540 	up_read(&iopt->iova_rwsem);
541 
542 	return ret;
543 }
544 
545 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
546 				 struct iommu_domain *domain)
547 {
548 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
549 	struct iommu_iotlb_gather gather;
550 	struct iommu_dirty_bitmap dirty;
551 	struct iopt_area *area;
552 	int ret = 0;
553 
554 	lockdep_assert_held_read(&iopt->iova_rwsem);
555 
556 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
557 
558 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
559 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
560 		if (!area->pages)
561 			continue;
562 
563 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
564 						iopt_area_length(area), 0,
565 						&dirty);
566 		if (ret)
567 			break;
568 	}
569 
570 	iommu_iotlb_sync(domain, &gather);
571 	return ret;
572 }
573 
574 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
575 			    struct iommu_domain *domain, bool enable)
576 {
577 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
578 	int ret = 0;
579 
580 	if (!ops)
581 		return -EOPNOTSUPP;
582 
583 	down_read(&iopt->iova_rwsem);
584 
585 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
586 	if (enable) {
587 		ret = iopt_clear_dirty_data(iopt, domain);
588 		if (ret)
589 			goto out_unlock;
590 	}
591 
592 	ret = ops->set_dirty_tracking(domain, enable);
593 
594 out_unlock:
595 	up_read(&iopt->iova_rwsem);
596 	return ret;
597 }
598 
599 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
600 		   unsigned long length, struct list_head *pages_list)
601 {
602 	struct iopt_area_contig_iter iter;
603 	unsigned long last_iova;
604 	struct iopt_area *area;
605 	int rc;
606 
607 	if (!length)
608 		return -EINVAL;
609 	if (check_add_overflow(iova, length - 1, &last_iova))
610 		return -EOVERFLOW;
611 
612 	down_read(&iopt->iova_rwsem);
613 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
614 		struct iopt_pages_list *elm;
615 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
616 
617 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
618 		if (!elm) {
619 			rc = -ENOMEM;
620 			goto err_free;
621 		}
622 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
623 		elm->pages = area->pages;
624 		elm->length = (last - iter.cur_iova) + 1;
625 		kref_get(&elm->pages->kref);
626 		list_add_tail(&elm->next, pages_list);
627 	}
628 	if (!iopt_area_contig_done(&iter)) {
629 		rc = -ENOENT;
630 		goto err_free;
631 	}
632 	up_read(&iopt->iova_rwsem);
633 	return 0;
634 err_free:
635 	up_read(&iopt->iova_rwsem);
636 	iopt_free_pages_list(pages_list);
637 	return rc;
638 }
639 
640 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
641 				 unsigned long last, unsigned long *unmapped)
642 {
643 	struct iopt_area *area;
644 	unsigned long unmapped_bytes = 0;
645 	unsigned int tries = 0;
646 	int rc = -ENOENT;
647 
648 	/*
649 	 * The domains_rwsem must be held in read mode any time any area->pages
650 	 * is NULL. This prevents domain attach/detatch from running
651 	 * concurrently with cleaning up the area.
652 	 */
653 again:
654 	down_read(&iopt->domains_rwsem);
655 	down_write(&iopt->iova_rwsem);
656 	while ((area = iopt_area_iter_first(iopt, start, last))) {
657 		unsigned long area_last = iopt_area_last_iova(area);
658 		unsigned long area_first = iopt_area_iova(area);
659 		struct iopt_pages *pages;
660 
661 		/* Userspace should not race map/unmap's of the same area */
662 		if (!area->pages) {
663 			rc = -EBUSY;
664 			goto out_unlock_iova;
665 		}
666 
667 		if (area_first < start || area_last > last) {
668 			rc = -ENOENT;
669 			goto out_unlock_iova;
670 		}
671 
672 		if (area_first != start)
673 			tries = 0;
674 
675 		/*
676 		 * num_accesses writers must hold the iova_rwsem too, so we can
677 		 * safely read it under the write side of the iovam_rwsem
678 		 * without the pages->mutex.
679 		 */
680 		if (area->num_accesses) {
681 			size_t length = iopt_area_length(area);
682 
683 			start = area_first;
684 			area->prevent_access = true;
685 			up_write(&iopt->iova_rwsem);
686 			up_read(&iopt->domains_rwsem);
687 
688 			iommufd_access_notify_unmap(iopt, area_first, length);
689 			/* Something is not responding to unmap requests. */
690 			tries++;
691 			if (WARN_ON(tries > 100))
692 				return -EDEADLOCK;
693 			goto again;
694 		}
695 
696 		pages = area->pages;
697 		area->pages = NULL;
698 		up_write(&iopt->iova_rwsem);
699 
700 		iopt_area_unfill_domains(area, pages);
701 		iopt_abort_area(area);
702 		iopt_put_pages(pages);
703 
704 		unmapped_bytes += area_last - area_first + 1;
705 
706 		down_write(&iopt->iova_rwsem);
707 	}
708 	if (unmapped_bytes)
709 		rc = 0;
710 
711 out_unlock_iova:
712 	up_write(&iopt->iova_rwsem);
713 	up_read(&iopt->domains_rwsem);
714 	if (unmapped)
715 		*unmapped = unmapped_bytes;
716 	return rc;
717 }
718 
719 /**
720  * iopt_unmap_iova() - Remove a range of iova
721  * @iopt: io_pagetable to act on
722  * @iova: Starting iova to unmap
723  * @length: Number of bytes to unmap
724  * @unmapped: Return number of bytes unmapped
725  *
726  * The requested range must be a superset of existing ranges.
727  * Splitting/truncating IOVA mappings is not allowed.
728  */
729 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
730 		    unsigned long length, unsigned long *unmapped)
731 {
732 	unsigned long iova_last;
733 
734 	if (!length)
735 		return -EINVAL;
736 
737 	if (check_add_overflow(iova, length - 1, &iova_last))
738 		return -EOVERFLOW;
739 
740 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
741 }
742 
743 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
744 {
745 	int rc;
746 
747 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
748 	/* If the IOVAs are empty then unmap all succeeds */
749 	if (rc == -ENOENT)
750 		return 0;
751 	return rc;
752 }
753 
754 /* The caller must always free all the nodes in the allowed_iova rb_root. */
755 int iopt_set_allow_iova(struct io_pagetable *iopt,
756 			struct rb_root_cached *allowed_iova)
757 {
758 	struct iopt_allowed *allowed;
759 
760 	down_write(&iopt->iova_rwsem);
761 	swap(*allowed_iova, iopt->allowed_itree);
762 
763 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
764 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
765 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
766 					     allowed->node.last)) {
767 			swap(*allowed_iova, iopt->allowed_itree);
768 			up_write(&iopt->iova_rwsem);
769 			return -EADDRINUSE;
770 		}
771 	}
772 	up_write(&iopt->iova_rwsem);
773 	return 0;
774 }
775 
776 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
777 		      unsigned long last, void *owner)
778 {
779 	struct iopt_reserved *reserved;
780 
781 	lockdep_assert_held_write(&iopt->iova_rwsem);
782 
783 	if (iopt_area_iter_first(iopt, start, last) ||
784 	    iopt_allowed_iter_first(iopt, start, last))
785 		return -EADDRINUSE;
786 
787 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
788 	if (!reserved)
789 		return -ENOMEM;
790 	reserved->node.start = start;
791 	reserved->node.last = last;
792 	reserved->owner = owner;
793 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
794 	return 0;
795 }
796 
797 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
798 {
799 	struct iopt_reserved *reserved, *next;
800 
801 	lockdep_assert_held_write(&iopt->iova_rwsem);
802 
803 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
804 	     reserved = next) {
805 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
806 
807 		if (reserved->owner == owner) {
808 			interval_tree_remove(&reserved->node,
809 					     &iopt->reserved_itree);
810 			kfree(reserved);
811 		}
812 	}
813 }
814 
/*
 * Remove every reserved range registered with @owner. This is the locking
 * wrapper: it takes the iova_rwsem for write around
 * __iopt_remove_reserved_iova().
 */
void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}
821 
822 void iopt_init_table(struct io_pagetable *iopt)
823 {
824 	init_rwsem(&iopt->iova_rwsem);
825 	init_rwsem(&iopt->domains_rwsem);
826 	iopt->area_itree = RB_ROOT_CACHED;
827 	iopt->allowed_itree = RB_ROOT_CACHED;
828 	iopt->reserved_itree = RB_ROOT_CACHED;
829 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
830 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
831 
832 	/*
833 	 * iopt's start as SW tables that can use the entire size_t IOVA space
834 	 * due to the use of size_t in the APIs. They have no alignment
835 	 * restriction.
836 	 */
837 	iopt->iova_alignment = 1;
838 }
839 
840 void iopt_destroy_table(struct io_pagetable *iopt)
841 {
842 	struct interval_tree_node *node;
843 
844 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
845 		iopt_remove_reserved_iova(iopt, NULL);
846 
847 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
848 						ULONG_MAX))) {
849 		interval_tree_remove(node, &iopt->allowed_itree);
850 		kfree(container_of(node, struct iopt_allowed, node));
851 	}
852 
853 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
854 	WARN_ON(!xa_empty(&iopt->domains));
855 	WARN_ON(!xa_empty(&iopt->access_list));
856 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
857 }
858 
859 /**
860  * iopt_unfill_domain() - Unfill a domain with PFNs
861  * @iopt: io_pagetable to act on
862  * @domain: domain to unfill
863  *
864  * This is used when removing a domain from the iopt. Every area in the iopt
865  * will be unmapped from the domain. The domain must already be removed from the
866  * domains xarray.
867  */
868 static void iopt_unfill_domain(struct io_pagetable *iopt,
869 			       struct iommu_domain *domain)
870 {
871 	struct iopt_area *area;
872 
873 	lockdep_assert_held(&iopt->iova_rwsem);
874 	lockdep_assert_held_write(&iopt->domains_rwsem);
875 
876 	/*
877 	 * Some other domain is holding all the pfns still, rapidly unmap this
878 	 * domain.
879 	 */
880 	if (iopt->next_domain_id != 0) {
881 		/* Pick an arbitrary remaining domain to act as storage */
882 		struct iommu_domain *storage_domain =
883 			xa_load(&iopt->domains, 0);
884 
885 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
886 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
887 			struct iopt_pages *pages = area->pages;
888 
889 			if (!pages)
890 				continue;
891 
892 			mutex_lock(&pages->mutex);
893 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
894 				WARN_ON(!area->storage_domain);
895 			if (area->storage_domain == domain)
896 				area->storage_domain = storage_domain;
897 			mutex_unlock(&pages->mutex);
898 
899 			iopt_area_unmap_domain(area, domain);
900 		}
901 		return;
902 	}
903 
904 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
905 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
906 		struct iopt_pages *pages = area->pages;
907 
908 		if (!pages)
909 			continue;
910 
911 		mutex_lock(&pages->mutex);
912 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
913 		WARN_ON(area->storage_domain != domain);
914 		area->storage_domain = NULL;
915 		iopt_area_unfill_domain(area, pages, domain);
916 		mutex_unlock(&pages->mutex);
917 	}
918 }
919 
920 /**
921  * iopt_fill_domain() - Fill a domain with PFNs
922  * @iopt: io_pagetable to act on
923  * @domain: domain to fill
924  *
925  * Fill the domain with PFNs from every area in the iopt. On failure the domain
926  * is left unchanged.
927  */
928 static int iopt_fill_domain(struct io_pagetable *iopt,
929 			    struct iommu_domain *domain)
930 {
931 	struct iopt_area *end_area;
932 	struct iopt_area *area;
933 	int rc;
934 
935 	lockdep_assert_held(&iopt->iova_rwsem);
936 	lockdep_assert_held_write(&iopt->domains_rwsem);
937 
938 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
939 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
940 		struct iopt_pages *pages = area->pages;
941 
942 		if (!pages)
943 			continue;
944 
945 		mutex_lock(&pages->mutex);
946 		rc = iopt_area_fill_domain(area, domain);
947 		if (rc) {
948 			mutex_unlock(&pages->mutex);
949 			goto out_unfill;
950 		}
951 		if (!area->storage_domain) {
952 			WARN_ON(iopt->next_domain_id != 0);
953 			area->storage_domain = domain;
954 			interval_tree_insert(&area->pages_node,
955 					     &pages->domains_itree);
956 		}
957 		mutex_unlock(&pages->mutex);
958 	}
959 	return 0;
960 
961 out_unfill:
962 	end_area = area;
963 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
964 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
965 		struct iopt_pages *pages = area->pages;
966 
967 		if (area == end_area)
968 			break;
969 		if (!pages)
970 			continue;
971 		mutex_lock(&pages->mutex);
972 		if (iopt->next_domain_id == 0) {
973 			interval_tree_remove(&area->pages_node,
974 					     &pages->domains_itree);
975 			area->storage_domain = NULL;
976 		}
977 		iopt_area_unfill_domain(area, pages, domain);
978 		mutex_unlock(&pages->mutex);
979 	}
980 	return rc;
981 }
982 
983 /* All existing area's conform to an increased page size */
984 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
985 				     unsigned long new_iova_alignment)
986 {
987 	unsigned long align_mask = new_iova_alignment - 1;
988 	struct iopt_area *area;
989 
990 	lockdep_assert_held(&iopt->iova_rwsem);
991 	lockdep_assert_held(&iopt->domains_rwsem);
992 
993 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
994 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
995 		if ((iopt_area_iova(area) & align_mask) ||
996 		    (iopt_area_length(area) & align_mask) ||
997 		    (area->page_offset & align_mask))
998 			return -EADDRINUSE;
999 
1000 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1001 		struct iommufd_access *access;
1002 		unsigned long index;
1003 
1004 		xa_for_each(&iopt->access_list, index, access)
1005 			if (WARN_ON(access->iova_alignment >
1006 				    new_iova_alignment))
1007 				return -EADDRINUSE;
1008 	}
1009 	return 0;
1010 }
1011 
1012 int iopt_table_add_domain(struct io_pagetable *iopt,
1013 			  struct iommu_domain *domain)
1014 {
1015 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1016 	struct iommu_domain *iter_domain;
1017 	unsigned int new_iova_alignment;
1018 	unsigned long index;
1019 	int rc;
1020 
1021 	down_write(&iopt->domains_rwsem);
1022 	down_write(&iopt->iova_rwsem);
1023 
1024 	xa_for_each(&iopt->domains, index, iter_domain) {
1025 		if (WARN_ON(iter_domain == domain)) {
1026 			rc = -EEXIST;
1027 			goto out_unlock;
1028 		}
1029 	}
1030 
1031 	/*
1032 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1033 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1034 	 * objects into the iommu_domain.
1035 	 *
1036 	 * A iommu_domain must always be able to accept PAGE_SIZE to be
1037 	 * compatible as we can't guarantee higher contiguity.
1038 	 */
1039 	new_iova_alignment = max_t(unsigned long,
1040 				   1UL << __ffs(domain->pgsize_bitmap),
1041 				   iopt->iova_alignment);
1042 	if (new_iova_alignment > PAGE_SIZE) {
1043 		rc = -EINVAL;
1044 		goto out_unlock;
1045 	}
1046 	if (new_iova_alignment != iopt->iova_alignment) {
1047 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1048 		if (rc)
1049 			goto out_unlock;
1050 	}
1051 
1052 	/* No area exists that is outside the allowed domain aperture */
1053 	if (geometry->aperture_start != 0) {
1054 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1055 				       domain);
1056 		if (rc)
1057 			goto out_reserved;
1058 	}
1059 	if (geometry->aperture_end != ULONG_MAX) {
1060 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1061 				       ULONG_MAX, domain);
1062 		if (rc)
1063 			goto out_reserved;
1064 	}
1065 
1066 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1067 	if (rc)
1068 		goto out_reserved;
1069 
1070 	rc = iopt_fill_domain(iopt, domain);
1071 	if (rc)
1072 		goto out_release;
1073 
1074 	iopt->iova_alignment = new_iova_alignment;
1075 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1076 	iopt->next_domain_id++;
1077 	up_write(&iopt->iova_rwsem);
1078 	up_write(&iopt->domains_rwsem);
1079 	return 0;
1080 out_release:
1081 	xa_release(&iopt->domains, iopt->next_domain_id);
1082 out_reserved:
1083 	__iopt_remove_reserved_iova(iopt, domain);
1084 out_unlock:
1085 	up_write(&iopt->iova_rwsem);
1086 	up_write(&iopt->domains_rwsem);
1087 	return rc;
1088 }
1089 
1090 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1091 {
1092 	unsigned long new_iova_alignment;
1093 	struct iommufd_access *access;
1094 	struct iommu_domain *domain;
1095 	unsigned long index;
1096 
1097 	lockdep_assert_held_write(&iopt->iova_rwsem);
1098 	lockdep_assert_held(&iopt->domains_rwsem);
1099 
1100 	/* See batch_iommu_map_small() */
1101 	if (iopt->disable_large_pages)
1102 		new_iova_alignment = PAGE_SIZE;
1103 	else
1104 		new_iova_alignment = 1;
1105 
1106 	xa_for_each(&iopt->domains, index, domain)
1107 		new_iova_alignment = max_t(unsigned long,
1108 					   1UL << __ffs(domain->pgsize_bitmap),
1109 					   new_iova_alignment);
1110 	xa_for_each(&iopt->access_list, index, access)
1111 		new_iova_alignment = max_t(unsigned long,
1112 					   access->iova_alignment,
1113 					   new_iova_alignment);
1114 
1115 	if (new_iova_alignment > iopt->iova_alignment) {
1116 		int rc;
1117 
1118 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1119 		if (rc)
1120 			return rc;
1121 	}
1122 	iopt->iova_alignment = new_iova_alignment;
1123 	return 0;
1124 }
1125 
1126 void iopt_table_remove_domain(struct io_pagetable *iopt,
1127 			      struct iommu_domain *domain)
1128 {
1129 	struct iommu_domain *iter_domain = NULL;
1130 	unsigned long index;
1131 
1132 	down_write(&iopt->domains_rwsem);
1133 	down_write(&iopt->iova_rwsem);
1134 
1135 	xa_for_each(&iopt->domains, index, iter_domain)
1136 		if (iter_domain == domain)
1137 			break;
1138 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1139 		goto out_unlock;
1140 
1141 	/*
1142 	 * Compress the xarray to keep it linear by swapping the entry to erase
1143 	 * with the tail entry and shrinking the tail.
1144 	 */
1145 	iopt->next_domain_id--;
1146 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1147 	if (index != iopt->next_domain_id)
1148 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1149 
1150 	iopt_unfill_domain(iopt, domain);
1151 	__iopt_remove_reserved_iova(iopt, domain);
1152 
1153 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1154 out_unlock:
1155 	up_write(&iopt->iova_rwsem);
1156 	up_write(&iopt->domains_rwsem);
1157 }
1158 
1159 /**
1160  * iopt_area_split - Split an area into two parts at iova
1161  * @area: The area to split
1162  * @iova: Becomes the last of a new area
1163  *
1164  * This splits an area into two. It is part of the VFIO compatibility to allow
1165  * poking a hole in the mapping. The two areas continue to point at the same
1166  * iopt_pages, just with different starting bytes.
1167  */
1168 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1169 {
1170 	unsigned long alignment = area->iopt->iova_alignment;
1171 	unsigned long last_iova = iopt_area_last_iova(area);
1172 	unsigned long start_iova = iopt_area_iova(area);
1173 	unsigned long new_start = iova + 1;
1174 	struct io_pagetable *iopt = area->iopt;
1175 	struct iopt_pages *pages = area->pages;
1176 	struct iopt_area *lhs;
1177 	struct iopt_area *rhs;
1178 	int rc;
1179 
1180 	lockdep_assert_held_write(&iopt->iova_rwsem);
1181 
1182 	if (iova == start_iova || iova == last_iova)
1183 		return 0;
1184 
1185 	if (!pages || area->prevent_access)
1186 		return -EBUSY;
1187 
1188 	if (new_start & (alignment - 1) ||
1189 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1190 		return -EINVAL;
1191 
1192 	lhs = iopt_area_alloc();
1193 	if (!lhs)
1194 		return -ENOMEM;
1195 
1196 	rhs = iopt_area_alloc();
1197 	if (!rhs) {
1198 		rc = -ENOMEM;
1199 		goto err_free_lhs;
1200 	}
1201 
1202 	mutex_lock(&pages->mutex);
1203 	/*
1204 	 * Splitting is not permitted if an access exists, we don't track enough
1205 	 * information to split existing accesses.
1206 	 */
1207 	if (area->num_accesses) {
1208 		rc = -EINVAL;
1209 		goto err_unlock;
1210 	}
1211 
1212 	/*
1213 	 * Splitting is not permitted if a domain could have been mapped with
1214 	 * huge pages.
1215 	 */
1216 	if (area->storage_domain && !iopt->disable_large_pages) {
1217 		rc = -EINVAL;
1218 		goto err_unlock;
1219 	}
1220 
1221 	interval_tree_remove(&area->node, &iopt->area_itree);
1222 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1223 			      iopt_area_start_byte(area, start_iova),
1224 			      (new_start - 1) - start_iova + 1,
1225 			      area->iommu_prot);
1226 	if (WARN_ON(rc))
1227 		goto err_insert;
1228 
1229 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1230 			      iopt_area_start_byte(area, new_start),
1231 			      last_iova - new_start + 1, area->iommu_prot);
1232 	if (WARN_ON(rc))
1233 		goto err_remove_lhs;
1234 
1235 	/*
1236 	 * If the original area has filled a domain, domains_itree has to be
1237 	 * updated.
1238 	 */
1239 	if (area->storage_domain) {
1240 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1241 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1242 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1243 	}
1244 
1245 	lhs->storage_domain = area->storage_domain;
1246 	lhs->pages = area->pages;
1247 	rhs->storage_domain = area->storage_domain;
1248 	rhs->pages = area->pages;
1249 	kref_get(&rhs->pages->kref);
1250 	kfree(area);
1251 	mutex_unlock(&pages->mutex);
1252 
1253 	/*
1254 	 * No change to domains or accesses because the pages hasn't been
1255 	 * changed
1256 	 */
1257 	return 0;
1258 
1259 err_remove_lhs:
1260 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1261 err_insert:
1262 	interval_tree_insert(&area->node, &iopt->area_itree);
1263 err_unlock:
1264 	mutex_unlock(&pages->mutex);
1265 	kfree(rhs);
1266 err_free_lhs:
1267 	kfree(lhs);
1268 	return rc;
1269 }
1270 
1271 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1272 		  size_t num_iovas)
1273 {
1274 	int rc = 0;
1275 	int i;
1276 
1277 	down_write(&iopt->iova_rwsem);
1278 	for (i = 0; i < num_iovas; i++) {
1279 		struct iopt_area *area;
1280 
1281 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1282 		if (!area)
1283 			continue;
1284 		rc = iopt_area_split(area, iovas[i]);
1285 		if (rc)
1286 			break;
1287 	}
1288 	up_write(&iopt->iova_rwsem);
1289 	return rc;
1290 }
1291 
1292 void iopt_enable_large_pages(struct io_pagetable *iopt)
1293 {
1294 	int rc;
1295 
1296 	down_write(&iopt->domains_rwsem);
1297 	down_write(&iopt->iova_rwsem);
1298 	WRITE_ONCE(iopt->disable_large_pages, false);
1299 	rc = iopt_calculate_iova_alignment(iopt);
1300 	WARN_ON(rc);
1301 	up_write(&iopt->iova_rwsem);
1302 	up_write(&iopt->domains_rwsem);
1303 }
1304 
1305 int iopt_disable_large_pages(struct io_pagetable *iopt)
1306 {
1307 	int rc = 0;
1308 
1309 	down_write(&iopt->domains_rwsem);
1310 	down_write(&iopt->iova_rwsem);
1311 	if (iopt->disable_large_pages)
1312 		goto out_unlock;
1313 
1314 	/* Won't do it if domains already have pages mapped in them */
1315 	if (!xa_empty(&iopt->domains) &&
1316 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1317 		rc = -EINVAL;
1318 		goto out_unlock;
1319 	}
1320 
1321 	WRITE_ONCE(iopt->disable_large_pages, true);
1322 	rc = iopt_calculate_iova_alignment(iopt);
1323 	if (rc)
1324 		WRITE_ONCE(iopt->disable_large_pages, false);
1325 out_unlock:
1326 	up_write(&iopt->iova_rwsem);
1327 	up_write(&iopt->domains_rwsem);
1328 	return rc;
1329 }
1330 
1331 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1332 {
1333 	u32 new_id;
1334 	int rc;
1335 
1336 	down_write(&iopt->domains_rwsem);
1337 	down_write(&iopt->iova_rwsem);
1338 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1339 		      GFP_KERNEL_ACCOUNT);
1340 
1341 	if (rc)
1342 		goto out_unlock;
1343 
1344 	rc = iopt_calculate_iova_alignment(iopt);
1345 	if (rc) {
1346 		xa_erase(&iopt->access_list, new_id);
1347 		goto out_unlock;
1348 	}
1349 	access->iopt_access_list_id = new_id;
1350 
1351 out_unlock:
1352 	up_write(&iopt->iova_rwsem);
1353 	up_write(&iopt->domains_rwsem);
1354 	return rc;
1355 }
1356 
1357 void iopt_remove_access(struct io_pagetable *iopt,
1358 			struct iommufd_access *access,
1359 			u32 iopt_access_list_id)
1360 {
1361 	down_write(&iopt->domains_rwsem);
1362 	down_write(&iopt->iova_rwsem);
1363 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1364 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1365 	up_write(&iopt->iova_rwsem);
1366 	up_write(&iopt->domains_rwsem);
1367 }
1368 
1369 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1370 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1371 					struct device *dev,
1372 					phys_addr_t *sw_msi_start)
1373 {
1374 	struct iommu_resv_region *resv;
1375 	LIST_HEAD(resv_regions);
1376 	unsigned int num_hw_msi = 0;
1377 	unsigned int num_sw_msi = 0;
1378 	int rc;
1379 
1380 	if (iommufd_should_fail())
1381 		return -EINVAL;
1382 
1383 	down_write(&iopt->iova_rwsem);
1384 	/* FIXME: drivers allocate memory but there is no failure propogated */
1385 	iommu_get_resv_regions(dev, &resv_regions);
1386 
1387 	list_for_each_entry(resv, &resv_regions, list) {
1388 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1389 			continue;
1390 
1391 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1392 			num_hw_msi++;
1393 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1394 			*sw_msi_start = resv->start;
1395 			num_sw_msi++;
1396 		}
1397 
1398 		rc = iopt_reserve_iova(iopt, resv->start,
1399 				       resv->length - 1 + resv->start, dev);
1400 		if (rc)
1401 			goto out_reserved;
1402 	}
1403 
1404 	/* Drivers must offer sane combinations of regions */
1405 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1406 		rc = -EINVAL;
1407 		goto out_reserved;
1408 	}
1409 
1410 	rc = 0;
1411 	goto out_free_resv;
1412 
1413 out_reserved:
1414 	__iopt_remove_reserved_iova(iopt, dev);
1415 out_free_resv:
1416 	iommu_put_resv_regions(dev, &resv_regions);
1417 	up_write(&iopt->iova_rwsem);
1418 	return rc;
1419 }
1420