1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top-level data structure that maps IOVAs to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The data structure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and the xarray.
10  */
11 #include <linux/err.h>
12 #include <linux/errno.h>
13 #include <linux/iommu.h>
14 #include <linux/iommufd.h>
15 #include <linux/lockdep.h>
16 #include <linux/sched/mm.h>
17 #include <linux/slab.h>
18 #include <uapi/linux/iommufd.h>
19 
20 #include "double_span.h"
21 #include "io_pagetable.h"
22 
23 struct iopt_pages_list {
24 	struct iopt_pages *pages;
25 	struct iopt_area *area;
26 	struct list_head next;
27 	unsigned long start_byte;
28 	unsigned long length;
29 };
30 
31 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
32 					struct io_pagetable *iopt,
33 					unsigned long iova,
34 					unsigned long last_iova)
35 {
36 	lockdep_assert_held(&iopt->iova_rwsem);
37 
38 	iter->cur_iova = iova;
39 	iter->last_iova = last_iova;
40 	iter->area = iopt_area_iter_first(iopt, iova, iova);
41 	if (!iter->area)
42 		return NULL;
43 	if (!iter->area->pages) {
44 		iter->area = NULL;
45 		return NULL;
46 	}
47 	return iter->area;
48 }
49 
50 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
51 {
52 	unsigned long last_iova;
53 
54 	if (!iter->area)
55 		return NULL;
56 	last_iova = iopt_area_last_iova(iter->area);
57 	if (iter->last_iova <= last_iova)
58 		return NULL;
59 
60 	iter->cur_iova = last_iova + 1;
61 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
62 					 iter->last_iova);
63 	if (!iter->area)
64 		return NULL;
65 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
66 	    !iter->area->pages) {
67 		iter->area = NULL;
68 		return NULL;
69 	}
70 	return iter->area;
71 }
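
/*
 * Usage sketch (illustrative only; it mirrors the callers later in this
 * file): the iterator pair above is normally driven through
 * iopt_for_each_contig_area(), with iopt_area_contig_done() telling a fully
 * covered range apart from one that hit a gap:
 *
 *	struct iopt_area_contig_iter iter;
 *	struct iopt_area *area;
 *	int rc = 0;
 *
 *	down_read(&iopt->iova_rwsem);
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *		... act on [iter.cur_iova, last] inside this area ...
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		rc = -ENOENT;
 *	up_read(&iopt->iova_rwsem);
 */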
72 
73 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
74 				    unsigned long length,
75 				    unsigned long iova_alignment,
76 				    unsigned long page_offset)
77 {
78 	if (span->is_used || span->last_hole - span->start_hole < length - 1)
79 		return false;
80 
81 	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
82 			   page_offset;
83 	if (span->start_hole > span->last_hole ||
84 	    span->last_hole - span->start_hole < length - 1)
85 		return false;
86 	return true;
87 }
88 
89 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
90 				    unsigned long length,
91 				    unsigned long iova_alignment,
92 				    unsigned long page_offset)
93 {
94 	if (span->is_hole || span->last_used - span->start_used < length - 1)
95 		return false;
96 
97 	span->start_used = ALIGN(span->start_used, iova_alignment) |
98 			   page_offset;
99 	if (span->start_used > span->last_used ||
100 	    span->last_used - span->start_used < length - 1)
101 		return false;
102 	return true;
103 }
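
/*
 * Worked example of the checks above (numbers are purely illustrative):
 * for a hole spanning [0x12345, 0x40000] with length = 0x2000,
 * iova_alignment = 0x1000 and page_offset = 0x800:
 *
 *	start = ALIGN(0x12345, 0x1000) | 0x800 = 0x13000 | 0x800 = 0x13800
 *	last byte needed = 0x13800 + 0x2000 - 1 = 0x157ff <= 0x40000
 *
 * so the hole is accepted and the allocation starts at 0x13800, preserving
 * the sub-page offset of the user pointer.
 */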
104 
105 /*
106  * Automatically find a block of IOVA that is not being used and not reserved.
107  * Does not return a 0 IOVA even if it is valid.
108  */
109 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
110 			   unsigned long uptr, unsigned long length)
111 {
112 	unsigned long page_offset = uptr % PAGE_SIZE;
113 	struct interval_tree_double_span_iter used_span;
114 	struct interval_tree_span_iter allowed_span;
115 	unsigned long max_alignment = PAGE_SIZE;
116 	unsigned long iova_alignment;
117 
118 	lockdep_assert_held(&iopt->iova_rwsem);
119 
120 	/* Protect roundup_pow_of_two() from overflow */
121 	if (length == 0 || length >= ULONG_MAX / 2)
122 		return -EOVERFLOW;
123 
124 	/*
125 	 * Keep the alignment present in the uptr when building the IOVA; this
126 	 * increases the chance we can map a THP.
127 	 */
128 	if (!uptr)
129 		iova_alignment = roundup_pow_of_two(length);
130 	else
131 		iova_alignment = min_t(unsigned long,
132 				       roundup_pow_of_two(length),
133 				       1UL << __ffs64(uptr));
134 
135 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 	max_alignment = HPAGE_SIZE;
137 #endif
138 	/* Protect against ALIGN() overflow */
139 	if (iova_alignment >= max_alignment)
140 		iova_alignment = max_alignment;
141 
142 	if (iova_alignment < iopt->iova_alignment)
143 		return -EINVAL;
144 
145 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
146 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
147 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
148 			allowed_span.start_used = PAGE_SIZE;
149 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
150 			allowed_span.is_hole = false;
151 		}
152 
153 		if (!__alloc_iova_check_used(&allowed_span, length,
154 					     iova_alignment, page_offset))
155 			continue;
156 
157 		interval_tree_for_each_double_span(
158 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
159 			allowed_span.start_used, allowed_span.last_used) {
160 			if (!__alloc_iova_check_hole(&used_span, length,
161 						     iova_alignment,
162 						     page_offset))
163 				continue;
164 
165 			*iova = used_span.start_hole;
166 			return 0;
167 		}
168 	}
169 	return -ENOSPC;
170 }
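
/*
 * Illustrative example of the alignment selection above (hypothetical
 * values, assuming CONFIG_TRANSPARENT_HUGEPAGE): for uptr = 0x7f1200200000
 * (2MiB aligned) and length = 0x400000, roundup_pow_of_two(length) is
 * 0x400000 and 1UL << __ffs64(uptr) is 0x200000, so iova_alignment becomes
 * 0x200000, which the HPAGE_SIZE cap leaves unchanged. The returned IOVA
 * then shares the 2MiB alignment of the user VA, giving a THP-backed
 * mapping the chance to use huge IOPTEs in the iommu_domain.
 */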
171 
172 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
173 			   unsigned long length)
174 {
175 	unsigned long last;
176 
177 	lockdep_assert_held(&iopt->iova_rwsem);
178 
179 	if ((iova & (iopt->iova_alignment - 1)))
180 		return -EINVAL;
181 
182 	if (check_add_overflow(iova, length - 1, &last))
183 		return -EOVERFLOW;
184 
185 	/* No reserved IOVA intersects the range */
186 	if (iopt_reserved_iter_first(iopt, iova, last))
187 		return -EINVAL;
188 
189 	/* Check that there is not already a mapping in the range */
190 	if (iopt_area_iter_first(iopt, iova, last))
191 		return -EEXIST;
192 	return 0;
193 }
194 
195 /*
196  * The area takes a slice of the pages from start_byte to start_byte + length
197  */
198 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
199 			    struct iopt_pages *pages, unsigned long iova,
200 			    unsigned long start_byte, unsigned long length,
201 			    int iommu_prot)
202 {
203 	lockdep_assert_held_write(&iopt->iova_rwsem);
204 
205 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
206 		return -EPERM;
207 
208 	area->iommu_prot = iommu_prot;
209 	area->page_offset = start_byte % PAGE_SIZE;
210 	if (area->page_offset & (iopt->iova_alignment - 1))
211 		return -EINVAL;
212 
213 	area->node.start = iova;
214 	if (check_add_overflow(iova, length - 1, &area->node.last))
215 		return -EOVERFLOW;
216 
217 	area->pages_node.start = start_byte / PAGE_SIZE;
218 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
219 		return -EOVERFLOW;
220 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
221 	if (WARN_ON(area->pages_node.last >= pages->npages))
222 		return -EOVERFLOW;
223 
224 	/*
225 	 * The area is inserted with a NULL pages pointer, indicating it is not fully
226 	 * initialized yet.
227 	 */
228 	area->iopt = iopt;
229 	interval_tree_insert(&area->node, &iopt->area_itree);
230 	return 0;
231 }
232 
233 static struct iopt_area *iopt_area_alloc(void)
234 {
235 	struct iopt_area *area;
236 
237 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
238 	if (!area)
239 		return NULL;
240 	RB_CLEAR_NODE(&area->node.rb);
241 	RB_CLEAR_NODE(&area->pages_node.rb);
242 	return area;
243 }
244 
245 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
246 				 struct list_head *pages_list,
247 				 unsigned long length, unsigned long *dst_iova,
248 				 int iommu_prot, unsigned int flags)
249 {
250 	struct iopt_pages_list *elm;
251 	unsigned long iova;
252 	int rc = 0;
253 
254 	list_for_each_entry(elm, pages_list, next) {
255 		elm->area = iopt_area_alloc();
256 		if (!elm->area)
257 			return -ENOMEM;
258 	}
259 
260 	down_write(&iopt->iova_rwsem);
261 	if ((length & (iopt->iova_alignment - 1)) || !length) {
262 		rc = -EINVAL;
263 		goto out_unlock;
264 	}
265 
266 	if (flags & IOPT_ALLOC_IOVA) {
267 		/* Use the first entry to guess the ideal IOVA alignment */
268 		elm = list_first_entry(pages_list, struct iopt_pages_list,
269 				       next);
270 		rc = iopt_alloc_iova(
271 			iopt, dst_iova,
272 			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
273 		if (rc)
274 			goto out_unlock;
275 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
276 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
277 			rc = -EINVAL;
278 			goto out_unlock;
279 		}
280 	} else {
281 		rc = iopt_check_iova(iopt, *dst_iova, length);
282 		if (rc)
283 			goto out_unlock;
284 	}
285 
286 	/*
287 	 * Areas are created with a NULL pages pointer so that the IOVA space is
288 	 * reserved and we can unlock the iova_rwsem.
289 	 */
290 	iova = *dst_iova;
291 	list_for_each_entry(elm, pages_list, next) {
292 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
293 				      elm->start_byte, elm->length, iommu_prot);
294 		if (rc)
295 			goto out_unlock;
296 		iova += elm->length;
297 	}
298 
299 out_unlock:
300 	up_write(&iopt->iova_rwsem);
301 	return rc;
302 }
303 
304 static void iopt_abort_area(struct iopt_area *area)
305 {
306 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
307 		WARN_ON(area->pages);
308 	if (area->iopt) {
309 		down_write(&area->iopt->iova_rwsem);
310 		interval_tree_remove(&area->node, &area->iopt->area_itree);
311 		up_write(&area->iopt->iova_rwsem);
312 	}
313 	kfree(area);
314 }
315 
316 void iopt_free_pages_list(struct list_head *pages_list)
317 {
318 	struct iopt_pages_list *elm;
319 
320 	while ((elm = list_first_entry_or_null(pages_list,
321 					       struct iopt_pages_list, next))) {
322 		if (elm->area)
323 			iopt_abort_area(elm->area);
324 		if (elm->pages)
325 			iopt_put_pages(elm->pages);
326 		list_del(&elm->next);
327 		kfree(elm);
328 	}
329 }
330 
331 static int iopt_fill_domains_pages(struct list_head *pages_list)
332 {
333 	struct iopt_pages_list *undo_elm;
334 	struct iopt_pages_list *elm;
335 	int rc;
336 
337 	list_for_each_entry(elm, pages_list, next) {
338 		rc = iopt_area_fill_domains(elm->area, elm->pages);
339 		if (rc)
340 			goto err_undo;
341 	}
342 	return 0;
343 
344 err_undo:
345 	list_for_each_entry(undo_elm, pages_list, next) {
346 		if (undo_elm == elm)
347 			break;
348 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
349 	}
350 	return rc;
351 }
352 
353 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
354 		   unsigned long length, unsigned long *dst_iova,
355 		   int iommu_prot, unsigned int flags)
356 {
357 	struct iopt_pages_list *elm;
358 	int rc;
359 
360 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
361 				   iommu_prot, flags);
362 	if (rc)
363 		return rc;
364 
365 	down_read(&iopt->domains_rwsem);
366 	rc = iopt_fill_domains_pages(pages_list);
367 	if (rc)
368 		goto out_unlock_domains;
369 
370 	down_write(&iopt->iova_rwsem);
371 	list_for_each_entry(elm, pages_list, next) {
372 		/*
373 		 * area->pages must be set inside the domains_rwsem to ensure
374 		 * any newly added domains will get filled. Moves the reference
375 		 * in from the list.
376 		 */
377 		elm->area->pages = elm->pages;
378 		elm->pages = NULL;
379 		elm->area = NULL;
380 	}
381 	up_write(&iopt->iova_rwsem);
382 out_unlock_domains:
383 	up_read(&iopt->domains_rwsem);
384 	return rc;
385 }
386 
387 /**
388  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
389  * @ictx: iommufd_ctx the iopt is part of
390  * @iopt: io_pagetable to act on
391  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
392  *        the chosen iova on output. Otherwise it is the iova to map to on input
393  * @uptr: User VA to map
394  * @length: Number of bytes to map
395  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
396  * @flags: IOPT_ALLOC_IOVA or zero
397  *
398  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
399  * page tables this will pin the pages and load them into the domain at iova.
400  * For non-domain page tables this will only setup a lazy reference and the
401  * caller must use iopt_access_pages() to touch them.
402  *
403  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
404  * destroyed.
405  */
406 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
407 			unsigned long *iova, void __user *uptr,
408 			unsigned long length, int iommu_prot,
409 			unsigned int flags)
410 {
411 	struct iopt_pages_list elm = {};
412 	LIST_HEAD(pages_list);
413 	int rc;
414 
415 	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
416 	if (IS_ERR(elm.pages))
417 		return PTR_ERR(elm.pages);
418 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
419 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
420 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
421 	elm.start_byte = uptr - elm.pages->uptr;
422 	elm.length = length;
423 	list_add(&elm.next, &pages_list);
424 
425 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
426 	if (rc) {
427 		if (elm.area)
428 			iopt_abort_area(elm.area);
429 		if (elm.pages)
430 			iopt_put_pages(elm.pages);
431 		return rc;
432 	}
433 	return 0;
434 }
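
/*
 * Hypothetical caller sketch (not an existing iommufd code path; "ictx",
 * "iopt", "uptr" and "length" stand for whatever the caller already holds,
 * and error handling is abbreviated): map a user buffer at an automatically
 * chosen IOVA and tear it down again:
 *
 *	unsigned long iova;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	... DMA happens against iova ...
 *	rc = iopt_unmap_iova(iopt, iova, length, NULL);
 */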
435 
436 struct iova_bitmap_fn_arg {
437 	unsigned long flags;
438 	struct io_pagetable *iopt;
439 	struct iommu_domain *domain;
440 	struct iommu_dirty_bitmap *dirty;
441 };
442 
443 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
444 					unsigned long iova, size_t length,
445 					void *opaque)
446 {
447 	struct iopt_area *area;
448 	struct iopt_area_contig_iter iter;
449 	struct iova_bitmap_fn_arg *arg = opaque;
450 	struct iommu_domain *domain = arg->domain;
451 	struct iommu_dirty_bitmap *dirty = arg->dirty;
452 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
453 	unsigned long last_iova = iova + length - 1;
454 	unsigned long flags = arg->flags;
455 	int ret;
456 
457 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
458 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
459 
460 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
461 						last - iter.cur_iova + 1, flags,
462 						dirty);
463 		if (ret)
464 			return ret;
465 	}
466 
467 	if (!iopt_area_contig_done(&iter))
468 		return -EINVAL;
469 	return 0;
470 }
471 
472 static int
473 iommu_read_and_clear_dirty(struct iommu_domain *domain,
474 			   struct io_pagetable *iopt, unsigned long flags,
475 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
476 {
477 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
478 	struct iommu_iotlb_gather gather;
479 	struct iommu_dirty_bitmap dirty;
480 	struct iova_bitmap_fn_arg arg;
481 	struct iova_bitmap *iter;
482 	int ret = 0;
483 
484 	if (!ops || !ops->read_and_clear_dirty)
485 		return -EOPNOTSUPP;
486 
487 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
488 				 bitmap->page_size,
489 				 u64_to_user_ptr(bitmap->data));
490 	if (IS_ERR(iter))
491 		return -ENOMEM;
492 
493 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
494 
495 	arg.flags = flags;
496 	arg.iopt = iopt;
497 	arg.domain = domain;
498 	arg.dirty = &dirty;
499 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
500 
501 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
502 		iommu_iotlb_sync(domain, &gather);
503 
504 	iova_bitmap_free(iter);
505 
506 	return ret;
507 }
508 
509 int iommufd_check_iova_range(struct io_pagetable *iopt,
510 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
511 {
512 	size_t iommu_pgsize = iopt->iova_alignment;
513 	u64 last_iova;
514 
515 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
516 		return -EOVERFLOW;
517 
518 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
519 		return -EOVERFLOW;
520 
521 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
522 	    ((last_iova + 1) & (iommu_pgsize - 1)))
523 		return -EINVAL;
524 
525 	if (!bitmap->page_size)
526 		return -EINVAL;
527 
528 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
529 	    ((last_iova + 1) & (bitmap->page_size - 1)))
530 		return -EINVAL;
531 
532 	return 0;
533 }
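
/*
 * Worked example (hypothetical values): with iova_alignment = 0x1000 and
 * bitmap->page_size = 0x1000, a request of iova = 0x200000 and
 * length = 0x100000 passes: last_iova = 0x2fffff and both 0x200000 and
 * 0x300000 (last_iova + 1) are multiples of 0x1000. A length of 0x100800
 * would instead fail the alignment checks with -EINVAL.
 */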
534 
535 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
536 				   struct iommu_domain *domain,
537 				   unsigned long flags,
538 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
539 {
540 	int ret;
541 
542 	ret = iommufd_check_iova_range(iopt, bitmap);
543 	if (ret)
544 		return ret;
545 
546 	down_read(&iopt->iova_rwsem);
547 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
548 	up_read(&iopt->iova_rwsem);
549 
550 	return ret;
551 }
552 
553 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
554 				 struct iommu_domain *domain)
555 {
556 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
557 	struct iommu_iotlb_gather gather;
558 	struct iommu_dirty_bitmap dirty;
559 	struct iopt_area *area;
560 	int ret = 0;
561 
562 	lockdep_assert_held_read(&iopt->iova_rwsem);
563 
564 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
565 
566 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
567 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
568 		if (!area->pages)
569 			continue;
570 
571 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
572 						iopt_area_length(area), 0,
573 						&dirty);
574 		if (ret)
575 			break;
576 	}
577 
578 	iommu_iotlb_sync(domain, &gather);
579 	return ret;
580 }
581 
582 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
583 			    struct iommu_domain *domain, bool enable)
584 {
585 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
586 	int ret = 0;
587 
588 	if (!ops)
589 		return -EOPNOTSUPP;
590 
591 	down_read(&iopt->iova_rwsem);
592 
593 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
594 	if (enable) {
595 		ret = iopt_clear_dirty_data(iopt, domain);
596 		if (ret)
597 			goto out_unlock;
598 	}
599 
600 	ret = ops->set_dirty_tracking(domain, enable);
601 
602 out_unlock:
603 	up_read(&iopt->iova_rwsem);
604 	return ret;
605 }
606 
607 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
608 		   unsigned long length, struct list_head *pages_list)
609 {
610 	struct iopt_area_contig_iter iter;
611 	unsigned long last_iova;
612 	struct iopt_area *area;
613 	int rc;
614 
615 	if (!length)
616 		return -EINVAL;
617 	if (check_add_overflow(iova, length - 1, &last_iova))
618 		return -EOVERFLOW;
619 
620 	down_read(&iopt->iova_rwsem);
621 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
622 		struct iopt_pages_list *elm;
623 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
624 
625 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
626 		if (!elm) {
627 			rc = -ENOMEM;
628 			goto err_free;
629 		}
630 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
631 		elm->pages = area->pages;
632 		elm->length = (last - iter.cur_iova) + 1;
633 		kref_get(&elm->pages->kref);
634 		list_add_tail(&elm->next, pages_list);
635 	}
636 	if (!iopt_area_contig_done(&iter)) {
637 		rc = -ENOENT;
638 		goto err_free;
639 	}
640 	up_read(&iopt->iova_rwsem);
641 	return 0;
642 err_free:
643 	up_read(&iopt->iova_rwsem);
644 	iopt_free_pages_list(pages_list);
645 	return rc;
646 }
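
/*
 * Usage sketch (illustrative): an in-kernel user that wants the iopt_pages
 * backing a range collects them on a local list and must release every
 * element again with iopt_free_pages_list():
 *
 *	LIST_HEAD(pages_list);
 *	int rc;
 *
 *	rc = iopt_get_pages(iopt, iova, length, &pages_list);
 *	if (rc)
 *		return rc;
 *	... inspect the iopt_pages_list entries ...
 *	iopt_free_pages_list(&pages_list);
 */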
647 
648 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
649 				 unsigned long last, unsigned long *unmapped)
650 {
651 	struct iopt_area *area;
652 	unsigned long unmapped_bytes = 0;
653 	unsigned int tries = 0;
654 	int rc = -ENOENT;
655 
656 	/*
657 	 * The domains_rwsem must be held in read mode any time any area->pages
658 	 * is NULL. This prevents domain attach/detach from running
659 	 * concurrently with cleaning up the area.
660 	 */
661 again:
662 	down_read(&iopt->domains_rwsem);
663 	down_write(&iopt->iova_rwsem);
664 	while ((area = iopt_area_iter_first(iopt, start, last))) {
665 		unsigned long area_last = iopt_area_last_iova(area);
666 		unsigned long area_first = iopt_area_iova(area);
667 		struct iopt_pages *pages;
668 
669 		/* Userspace should not race map/unmap operations on the same area */
670 		if (!area->pages) {
671 			rc = -EBUSY;
672 			goto out_unlock_iova;
673 		}
674 
675 		if (area_first < start || area_last > last) {
676 			rc = -ENOENT;
677 			goto out_unlock_iova;
678 		}
679 
680 		if (area_first != start)
681 			tries = 0;
682 
683 		/*
684 		 * num_accesses writers must hold the iova_rwsem too, so we can
685 		 * safely read it under the write side of the iova_rwsem
686 		 * without the pages->mutex.
687 		 */
688 		if (area->num_accesses) {
689 			size_t length = iopt_area_length(area);
690 
691 			start = area_first;
692 			area->prevent_access = true;
693 			up_write(&iopt->iova_rwsem);
694 			up_read(&iopt->domains_rwsem);
695 
696 			iommufd_access_notify_unmap(iopt, area_first, length);
697 			/* Something is not responding to unmap requests. */
698 			tries++;
699 			if (WARN_ON(tries > 100))
700 				return -EDEADLOCK;
701 			goto again;
702 		}
703 
704 		pages = area->pages;
705 		area->pages = NULL;
706 		up_write(&iopt->iova_rwsem);
707 
708 		iopt_area_unfill_domains(area, pages);
709 		iopt_abort_area(area);
710 		iopt_put_pages(pages);
711 
712 		unmapped_bytes += area_last - area_first + 1;
713 
714 		down_write(&iopt->iova_rwsem);
715 	}
716 	if (unmapped_bytes)
717 		rc = 0;
718 
719 out_unlock_iova:
720 	up_write(&iopt->iova_rwsem);
721 	up_read(&iopt->domains_rwsem);
722 	if (unmapped)
723 		*unmapped = unmapped_bytes;
724 	return rc;
725 }
726 
727 /**
728  * iopt_unmap_iova() - Remove a range of iova
729  * @iopt: io_pagetable to act on
730  * @iova: Starting iova to unmap
731  * @length: Number of bytes to unmap
732  * @unmapped: Return number of bytes unmapped
733  *
734  * The requested range must be a superset of existing ranges.
735  * Splitting/truncating IOVA mappings is not allowed.
736  */
737 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
738 		    unsigned long length, unsigned long *unmapped)
739 {
740 	unsigned long iova_last;
741 
742 	if (!length)
743 		return -EINVAL;
744 
745 	if (check_add_overflow(iova, length - 1, &iova_last))
746 		return -EOVERFLOW;
747 
748 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
749 }
750 
751 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
752 {
753 	int rc;
754 
755 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
756 	/* If there are no mapped IOVAs then unmap all succeeds */
757 	if (rc == -ENOENT)
758 		return 0;
759 	return rc;
760 }
761 
762 /* The caller must always free all the nodes in the allowed_iova rb_root. */
763 int iopt_set_allow_iova(struct io_pagetable *iopt,
764 			struct rb_root_cached *allowed_iova)
765 {
766 	struct iopt_allowed *allowed;
767 
768 	down_write(&iopt->iova_rwsem);
769 	swap(*allowed_iova, iopt->allowed_itree);
770 
771 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
772 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
773 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
774 					     allowed->node.last)) {
775 			swap(*allowed_iova, iopt->allowed_itree);
776 			up_write(&iopt->iova_rwsem);
777 			return -EADDRINUSE;
778 		}
779 	}
780 	up_write(&iopt->iova_rwsem);
781 	return 0;
782 }
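
/*
 * Illustrative interaction of the allowed and reserved trees (hypothetical
 * numbers): if the caller asks to allow only [0x100000, 0x1fffff] while a
 * device has already reserved [0x180000, 0x18ffff], the overlap is caught
 * by iopt_reserved_iter_first() above and the swap is undone with
 * -EADDRINUSE. Conversely, iopt_reserve_iova() below refuses a range that
 * intersects an existing area or allowed node.
 */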
783 
784 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
785 		      unsigned long last, void *owner)
786 {
787 	struct iopt_reserved *reserved;
788 
789 	lockdep_assert_held_write(&iopt->iova_rwsem);
790 
791 	if (iopt_area_iter_first(iopt, start, last) ||
792 	    iopt_allowed_iter_first(iopt, start, last))
793 		return -EADDRINUSE;
794 
795 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
796 	if (!reserved)
797 		return -ENOMEM;
798 	reserved->node.start = start;
799 	reserved->node.last = last;
800 	reserved->owner = owner;
801 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
802 	return 0;
803 }
804 
805 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
806 {
807 	struct iopt_reserved *reserved, *next;
808 
809 	lockdep_assert_held_write(&iopt->iova_rwsem);
810 
811 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
812 	     reserved = next) {
813 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
814 
815 		if (reserved->owner == owner) {
816 			interval_tree_remove(&reserved->node,
817 					     &iopt->reserved_itree);
818 			kfree(reserved);
819 		}
820 	}
821 }
822 
823 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
824 {
825 	down_write(&iopt->iova_rwsem);
826 	__iopt_remove_reserved_iova(iopt, owner);
827 	up_write(&iopt->iova_rwsem);
828 }
829 
830 void iopt_init_table(struct io_pagetable *iopt)
831 {
832 	init_rwsem(&iopt->iova_rwsem);
833 	init_rwsem(&iopt->domains_rwsem);
834 	iopt->area_itree = RB_ROOT_CACHED;
835 	iopt->allowed_itree = RB_ROOT_CACHED;
836 	iopt->reserved_itree = RB_ROOT_CACHED;
837 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
838 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
839 
840 	/*
841 	 * iopts start as SW tables that can use the entire size_t IOVA space
842 	 * due to the use of size_t in the APIs. They have no alignment
843 	 * restriction.
844 	 */
845 	iopt->iova_alignment = 1;
846 }
847 
848 void iopt_destroy_table(struct io_pagetable *iopt)
849 {
850 	struct interval_tree_node *node;
851 
852 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
853 		iopt_remove_reserved_iova(iopt, NULL);
854 
855 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
856 						ULONG_MAX))) {
857 		interval_tree_remove(node, &iopt->allowed_itree);
858 		kfree(container_of(node, struct iopt_allowed, node));
859 	}
860 
861 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
862 	WARN_ON(!xa_empty(&iopt->domains));
863 	WARN_ON(!xa_empty(&iopt->access_list));
864 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
865 }
866 
867 /**
868  * iopt_unfill_domain() - Unfill a domain with PFNs
869  * @iopt: io_pagetable to act on
870  * @domain: domain to unfill
871  *
872  * This is used when removing a domain from the iopt. Every area in the iopt
873  * will be unmapped from the domain. The domain must already be removed from the
874  * domains xarray.
875  */
876 static void iopt_unfill_domain(struct io_pagetable *iopt,
877 			       struct iommu_domain *domain)
878 {
879 	struct iopt_area *area;
880 
881 	lockdep_assert_held(&iopt->iova_rwsem);
882 	lockdep_assert_held_write(&iopt->domains_rwsem);
883 
884 	/*
885 	 * Some other domain is still holding all the PFNs, so rapidly unmap this
886 	 * domain.
887 	 */
888 	if (iopt->next_domain_id != 0) {
889 		/* Pick an arbitrary remaining domain to act as storage */
890 		struct iommu_domain *storage_domain =
891 			xa_load(&iopt->domains, 0);
892 
893 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
894 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
895 			struct iopt_pages *pages = area->pages;
896 
897 			if (!pages)
898 				continue;
899 
900 			mutex_lock(&pages->mutex);
901 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
902 				WARN_ON(!area->storage_domain);
903 			if (area->storage_domain == domain)
904 				area->storage_domain = storage_domain;
905 			mutex_unlock(&pages->mutex);
906 
907 			iopt_area_unmap_domain(area, domain);
908 		}
909 		return;
910 	}
911 
912 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
913 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
914 		struct iopt_pages *pages = area->pages;
915 
916 		if (!pages)
917 			continue;
918 
919 		mutex_lock(&pages->mutex);
920 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
921 		WARN_ON(area->storage_domain != domain);
922 		area->storage_domain = NULL;
923 		iopt_area_unfill_domain(area, pages, domain);
924 		mutex_unlock(&pages->mutex);
925 	}
926 }
927 
928 /**
929  * iopt_fill_domain() - Fill a domain with PFNs
930  * @iopt: io_pagetable to act on
931  * @domain: domain to fill
932  *
933  * Fill the domain with PFNs from every area in the iopt. On failure the domain
934  * is left unchanged.
935  */
936 static int iopt_fill_domain(struct io_pagetable *iopt,
937 			    struct iommu_domain *domain)
938 {
939 	struct iopt_area *end_area;
940 	struct iopt_area *area;
941 	int rc;
942 
943 	lockdep_assert_held(&iopt->iova_rwsem);
944 	lockdep_assert_held_write(&iopt->domains_rwsem);
945 
946 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
947 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
948 		struct iopt_pages *pages = area->pages;
949 
950 		if (!pages)
951 			continue;
952 
953 		mutex_lock(&pages->mutex);
954 		rc = iopt_area_fill_domain(area, domain);
955 		if (rc) {
956 			mutex_unlock(&pages->mutex);
957 			goto out_unfill;
958 		}
959 		if (!area->storage_domain) {
960 			WARN_ON(iopt->next_domain_id != 0);
961 			area->storage_domain = domain;
962 			interval_tree_insert(&area->pages_node,
963 					     &pages->domains_itree);
964 		}
965 		mutex_unlock(&pages->mutex);
966 	}
967 	return 0;
968 
969 out_unfill:
970 	end_area = area;
971 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
972 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
973 		struct iopt_pages *pages = area->pages;
974 
975 		if (area == end_area)
976 			break;
977 		if (!pages)
978 			continue;
979 		mutex_lock(&pages->mutex);
980 		if (iopt->next_domain_id == 0) {
981 			interval_tree_remove(&area->pages_node,
982 					     &pages->domains_itree);
983 			area->storage_domain = NULL;
984 		}
985 		iopt_area_unfill_domain(area, pages, domain);
986 		mutex_unlock(&pages->mutex);
987 	}
988 	return rc;
989 }
990 
991 /* Check that all existing areas conform to an increased page size */
992 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
993 				     unsigned long new_iova_alignment)
994 {
995 	unsigned long align_mask = new_iova_alignment - 1;
996 	struct iopt_area *area;
997 
998 	lockdep_assert_held(&iopt->iova_rwsem);
999 	lockdep_assert_held(&iopt->domains_rwsem);
1000 
1001 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1002 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1003 		if ((iopt_area_iova(area) & align_mask) ||
1004 		    (iopt_area_length(area) & align_mask) ||
1005 		    (area->page_offset & align_mask))
1006 			return -EADDRINUSE;
1007 
1008 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1009 		struct iommufd_access *access;
1010 		unsigned long index;
1011 
1012 		xa_for_each(&iopt->access_list, index, access)
1013 			if (WARN_ON(access->iova_alignment >
1014 				    new_iova_alignment))
1015 				return -EADDRINUSE;
1016 	}
1017 	return 0;
1018 }
1019 
1020 int iopt_table_add_domain(struct io_pagetable *iopt,
1021 			  struct iommu_domain *domain)
1022 {
1023 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1024 	struct iommu_domain *iter_domain;
1025 	unsigned int new_iova_alignment;
1026 	unsigned long index;
1027 	int rc;
1028 
1029 	down_write(&iopt->domains_rwsem);
1030 	down_write(&iopt->iova_rwsem);
1031 
1032 	xa_for_each(&iopt->domains, index, iter_domain) {
1033 		if (WARN_ON(iter_domain == domain)) {
1034 			rc = -EEXIST;
1035 			goto out_unlock;
1036 		}
1037 	}
1038 
1039 	/*
1040 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1041 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1042 	 * objects into the iommu_domain.
1043 	 *
1044 	 * An iommu_domain must always be able to accept PAGE_SIZE to be
1045 	 * compatible as we can't guarantee higher contiguity.
1046 	 */
1047 	new_iova_alignment = max_t(unsigned long,
1048 				   1UL << __ffs(domain->pgsize_bitmap),
1049 				   iopt->iova_alignment);
1050 	if (new_iova_alignment > PAGE_SIZE) {
1051 		rc = -EINVAL;
1052 		goto out_unlock;
1053 	}
1054 	if (new_iova_alignment != iopt->iova_alignment) {
1055 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1056 		if (rc)
1057 			goto out_unlock;
1058 	}
1059 
1060 	/* Ensure no area exists outside the allowed domain aperture */
1061 	if (geometry->aperture_start != 0) {
1062 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1063 				       domain);
1064 		if (rc)
1065 			goto out_reserved;
1066 	}
1067 	if (geometry->aperture_end != ULONG_MAX) {
1068 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1069 				       ULONG_MAX, domain);
1070 		if (rc)
1071 			goto out_reserved;
1072 	}
1073 
1074 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1075 	if (rc)
1076 		goto out_reserved;
1077 
1078 	rc = iopt_fill_domain(iopt, domain);
1079 	if (rc)
1080 		goto out_release;
1081 
1082 	iopt->iova_alignment = new_iova_alignment;
1083 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1084 	iopt->next_domain_id++;
1085 	up_write(&iopt->iova_rwsem);
1086 	up_write(&iopt->domains_rwsem);
1087 	return 0;
1088 out_release:
1089 	xa_release(&iopt->domains, iopt->next_domain_id);
1090 out_reserved:
1091 	__iopt_remove_reserved_iova(iopt, domain);
1092 out_unlock:
1093 	up_write(&iopt->iova_rwsem);
1094 	up_write(&iopt->domains_rwsem);
1095 	return rc;
1096 }
1097 
1098 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1099 {
1100 	unsigned long new_iova_alignment;
1101 	struct iommufd_access *access;
1102 	struct iommu_domain *domain;
1103 	unsigned long index;
1104 
1105 	lockdep_assert_held_write(&iopt->iova_rwsem);
1106 	lockdep_assert_held(&iopt->domains_rwsem);
1107 
1108 	/* See batch_iommu_map_small() */
1109 	if (iopt->disable_large_pages)
1110 		new_iova_alignment = PAGE_SIZE;
1111 	else
1112 		new_iova_alignment = 1;
1113 
1114 	xa_for_each(&iopt->domains, index, domain)
1115 		new_iova_alignment = max_t(unsigned long,
1116 					   1UL << __ffs(domain->pgsize_bitmap),
1117 					   new_iova_alignment);
1118 	xa_for_each(&iopt->access_list, index, access)
1119 		new_iova_alignment = max_t(unsigned long,
1120 					   access->iova_alignment,
1121 					   new_iova_alignment);
1122 
1123 	if (new_iova_alignment > iopt->iova_alignment) {
1124 		int rc;
1125 
1126 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1127 		if (rc)
1128 			return rc;
1129 	}
1130 	iopt->iova_alignment = new_iova_alignment;
1131 	return 0;
1132 }
1133 
1134 void iopt_table_remove_domain(struct io_pagetable *iopt,
1135 			      struct iommu_domain *domain)
1136 {
1137 	struct iommu_domain *iter_domain = NULL;
1138 	unsigned long index;
1139 
1140 	down_write(&iopt->domains_rwsem);
1141 	down_write(&iopt->iova_rwsem);
1142 
1143 	xa_for_each(&iopt->domains, index, iter_domain)
1144 		if (iter_domain == domain)
1145 			break;
1146 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1147 		goto out_unlock;
1148 
1149 	/*
1150 	 * Compress the xarray to keep it linear by swapping the entry to erase
1151 	 * with the tail entry and shrinking the tail.
1152 	 */
1153 	iopt->next_domain_id--;
1154 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1155 	if (index != iopt->next_domain_id)
1156 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1157 
1158 	iopt_unfill_domain(iopt, domain);
1159 	__iopt_remove_reserved_iova(iopt, domain);
1160 
1161 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1162 out_unlock:
1163 	up_write(&iopt->iova_rwsem);
1164 	up_write(&iopt->domains_rwsem);
1165 }
1166 
1167 /**
1168  * iopt_area_split - Split an area into two parts at iova
1169  * @area: The area to split
1170  * @iova: Becomes the last of a new area
1171  *
1172  * This splits an area into two. It is part of the VFIO compatibility to allow
1173  * poking a hole in the mapping. The two areas continue to point at the same
1174  * iopt_pages, just with different starting bytes.
1175  */
1176 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1177 {
1178 	unsigned long alignment = area->iopt->iova_alignment;
1179 	unsigned long last_iova = iopt_area_last_iova(area);
1180 	unsigned long start_iova = iopt_area_iova(area);
1181 	unsigned long new_start = iova + 1;
1182 	struct io_pagetable *iopt = area->iopt;
1183 	struct iopt_pages *pages = area->pages;
1184 	struct iopt_area *lhs;
1185 	struct iopt_area *rhs;
1186 	int rc;
1187 
1188 	lockdep_assert_held_write(&iopt->iova_rwsem);
1189 
1190 	if (iova == start_iova || iova == last_iova)
1191 		return 0;
1192 
1193 	if (!pages || area->prevent_access)
1194 		return -EBUSY;
1195 
1196 	if (new_start & (alignment - 1) ||
1197 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1198 		return -EINVAL;
1199 
1200 	lhs = iopt_area_alloc();
1201 	if (!lhs)
1202 		return -ENOMEM;
1203 
1204 	rhs = iopt_area_alloc();
1205 	if (!rhs) {
1206 		rc = -ENOMEM;
1207 		goto err_free_lhs;
1208 	}
1209 
1210 	mutex_lock(&pages->mutex);
1211 	/*
1212 	 * Splitting is not permitted if an access exists; we don't track enough
1213 	 * information to split existing accesses.
1214 	 */
1215 	if (area->num_accesses) {
1216 		rc = -EINVAL;
1217 		goto err_unlock;
1218 	}
1219 
1220 	/*
1221 	 * Splitting is not permitted if a domain could have been mapped with
1222 	 * huge pages.
1223 	 */
1224 	if (area->storage_domain && !iopt->disable_large_pages) {
1225 		rc = -EINVAL;
1226 		goto err_unlock;
1227 	}
1228 
1229 	interval_tree_remove(&area->node, &iopt->area_itree);
1230 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1231 			      iopt_area_start_byte(area, start_iova),
1232 			      (new_start - 1) - start_iova + 1,
1233 			      area->iommu_prot);
1234 	if (WARN_ON(rc))
1235 		goto err_insert;
1236 
1237 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1238 			      iopt_area_start_byte(area, new_start),
1239 			      last_iova - new_start + 1, area->iommu_prot);
1240 	if (WARN_ON(rc))
1241 		goto err_remove_lhs;
1242 
1243 	/*
1244 	 * If the original area has filled a domain, domains_itree has to be
1245 	 * updated.
1246 	 */
1247 	if (area->storage_domain) {
1248 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1249 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1250 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1251 	}
1252 
1253 	lhs->storage_domain = area->storage_domain;
1254 	lhs->pages = area->pages;
1255 	rhs->storage_domain = area->storage_domain;
1256 	rhs->pages = area->pages;
1257 	kref_get(&rhs->pages->kref);
1258 	kfree(area);
1259 	mutex_unlock(&pages->mutex);
1260 
1261 	/*
1262 	 * No change to domains or accesses because the underlying iopt_pages
1263 	 * has not been changed.
1264 	 */
1265 	return 0;
1266 
1267 err_remove_lhs:
1268 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1269 err_insert:
1270 	interval_tree_insert(&area->node, &iopt->area_itree);
1271 err_unlock:
1272 	mutex_unlock(&pages->mutex);
1273 	kfree(rhs);
1274 err_free_lhs:
1275 	kfree(lhs);
1276 	return rc;
1277 }
1278 
1279 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1280 		  size_t num_iovas)
1281 {
1282 	int rc = 0;
1283 	int i;
1284 
1285 	down_write(&iopt->iova_rwsem);
1286 	for (i = 0; i < num_iovas; i++) {
1287 		struct iopt_area *area;
1288 
1289 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1290 		if (!area)
1291 			continue;
1292 		rc = iopt_area_split(area, iovas[i]);
1293 		if (rc)
1294 			break;
1295 	}
1296 	up_write(&iopt->iova_rwsem);
1297 	return rc;
1298 }
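
/*
 * Hypothetical VFIO-compat style usage ("hole_start"/"hole_last" stand for
 * the caller's range, assuming hole_start != 0): to punch a hole out of a
 * larger mapping, cut just below the hole and at its end so the hole becomes
 * whole area(s), then unmap it:
 *
 *	unsigned long cuts[] = { hole_start - 1, hole_last };
 *
 *	rc = iopt_cut_iova(iopt, cuts, ARRAY_SIZE(cuts));
 *	if (!rc)
 *		rc = iopt_unmap_iova(iopt, hole_start,
 *				     hole_last - hole_start + 1, NULL);
 */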
1299 
1300 void iopt_enable_large_pages(struct io_pagetable *iopt)
1301 {
1302 	int rc;
1303 
1304 	down_write(&iopt->domains_rwsem);
1305 	down_write(&iopt->iova_rwsem);
1306 	WRITE_ONCE(iopt->disable_large_pages, false);
1307 	rc = iopt_calculate_iova_alignment(iopt);
1308 	WARN_ON(rc);
1309 	up_write(&iopt->iova_rwsem);
1310 	up_write(&iopt->domains_rwsem);
1311 }
1312 
1313 int iopt_disable_large_pages(struct io_pagetable *iopt)
1314 {
1315 	int rc = 0;
1316 
1317 	down_write(&iopt->domains_rwsem);
1318 	down_write(&iopt->iova_rwsem);
1319 	if (iopt->disable_large_pages)
1320 		goto out_unlock;
1321 
1322 	/* Won't do it if domains already have pages mapped in them */
1323 	if (!xa_empty(&iopt->domains) &&
1324 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1325 		rc = -EINVAL;
1326 		goto out_unlock;
1327 	}
1328 
1329 	WRITE_ONCE(iopt->disable_large_pages, true);
1330 	rc = iopt_calculate_iova_alignment(iopt);
1331 	if (rc)
1332 		WRITE_ONCE(iopt->disable_large_pages, false);
1333 out_unlock:
1334 	up_write(&iopt->iova_rwsem);
1335 	up_write(&iopt->domains_rwsem);
1336 	return rc;
1337 }
1338 
1339 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1340 {
1341 	u32 new_id;
1342 	int rc;
1343 
1344 	down_write(&iopt->domains_rwsem);
1345 	down_write(&iopt->iova_rwsem);
1346 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1347 		      GFP_KERNEL_ACCOUNT);
1348 
1349 	if (rc)
1350 		goto out_unlock;
1351 
1352 	rc = iopt_calculate_iova_alignment(iopt);
1353 	if (rc) {
1354 		xa_erase(&iopt->access_list, new_id);
1355 		goto out_unlock;
1356 	}
1357 	access->iopt_access_list_id = new_id;
1358 
1359 out_unlock:
1360 	up_write(&iopt->iova_rwsem);
1361 	up_write(&iopt->domains_rwsem);
1362 	return rc;
1363 }
1364 
1365 void iopt_remove_access(struct io_pagetable *iopt,
1366 			struct iommufd_access *access,
1367 			u32 iopt_access_list_id)
1368 {
1369 	down_write(&iopt->domains_rwsem);
1370 	down_write(&iopt->iova_rwsem);
1371 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1372 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1373 	up_write(&iopt->iova_rwsem);
1374 	up_write(&iopt->domains_rwsem);
1375 }
1376 
1377 /* Narrow the usable IOVA space with the reserved ranges from a device. */
1378 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1379 					struct device *dev,
1380 					phys_addr_t *sw_msi_start)
1381 {
1382 	struct iommu_resv_region *resv;
1383 	LIST_HEAD(resv_regions);
1384 	unsigned int num_hw_msi = 0;
1385 	unsigned int num_sw_msi = 0;
1386 	int rc;
1387 
1388 	if (iommufd_should_fail())
1389 		return -EINVAL;
1390 
1391 	down_write(&iopt->iova_rwsem);
1392 	/* FIXME: drivers allocate memory but there is no failure propagated */
1393 	iommu_get_resv_regions(dev, &resv_regions);
1394 
1395 	list_for_each_entry(resv, &resv_regions, list) {
1396 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1397 			continue;
1398 
1399 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1400 			num_hw_msi++;
1401 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1402 			*sw_msi_start = resv->start;
1403 			num_sw_msi++;
1404 		}
1405 
1406 		rc = iopt_reserve_iova(iopt, resv->start,
1407 				       resv->length - 1 + resv->start, dev);
1408 		if (rc)
1409 			goto out_reserved;
1410 	}
1411 
1412 	/* Drivers must offer sane combinations of regions */
1413 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1414 		rc = -EINVAL;
1415 		goto out_reserved;
1416 	}
1417 
1418 	rc = 0;
1419 	goto out_free_resv;
1420 
1421 out_reserved:
1422 	__iopt_remove_reserved_iova(iopt, dev);
1423 out_free_resv:
1424 	iommu_put_resv_regions(dev, &resv_regions);
1425 	up_write(&iopt->iova_rwsem);
1426 	return rc;
1427 }
1428