xref: /linux/drivers/iommu/iommufd/io_pagetable.c (revision 79d2e1919a2728ef49d938eb20ebd5903c14dfb0)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
 * The io_pagetable is the top-level data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
7  *
8  * The datastructure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and xarray.
10  */
11 #include <linux/err.h>
12 #include <linux/errno.h>
13 #include <linux/iommu.h>
14 #include <linux/iommufd.h>
15 #include <linux/lockdep.h>
16 #include <linux/sched/mm.h>
17 #include <linux/slab.h>
18 #include <uapi/linux/iommufd.h>
19 
20 #include "double_span.h"
21 #include "io_pagetable.h"
22 
/*
 * One entry of a list describing slices of iopt_pages. Such lists are built
 * by iopt_get_pages() and consumed by iopt_map_pages(); leftovers are released
 * with iopt_free_pages_list().
 */
struct iopt_pages_list {
	/* Pages backing this slice; the entry holds a reference while non-NULL */
	struct iopt_pages *pages;
	/* Area allocated for this slice while a map is in flight, else NULL */
	struct iopt_area *area;
	/* Linkage into the caller's pages_list */
	struct list_head next;
	/* Byte offset into pages at which the slice starts */
	unsigned long start_byte;
	/* Size of the slice in bytes */
	unsigned long length;
};
30 
31 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
32 					struct io_pagetable *iopt,
33 					unsigned long iova,
34 					unsigned long last_iova)
35 {
36 	lockdep_assert_held(&iopt->iova_rwsem);
37 
38 	iter->cur_iova = iova;
39 	iter->last_iova = last_iova;
40 	iter->area = iopt_area_iter_first(iopt, iova, iova);
41 	if (!iter->area)
42 		return NULL;
43 	if (!iter->area->pages) {
44 		iter->area = NULL;
45 		return NULL;
46 	}
47 	return iter->area;
48 }
49 
50 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
51 {
52 	unsigned long last_iova;
53 
54 	if (!iter->area)
55 		return NULL;
56 	last_iova = iopt_area_last_iova(iter->area);
57 	if (iter->last_iova <= last_iova)
58 		return NULL;
59 
60 	iter->cur_iova = last_iova + 1;
61 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
62 					 iter->last_iova);
63 	if (!iter->area)
64 		return NULL;
65 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
66 	    !iter->area->pages) {
67 		iter->area = NULL;
68 		return NULL;
69 	}
70 	return iter->area;
71 }
72 
73 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
74 				    unsigned long length,
75 				    unsigned long iova_alignment,
76 				    unsigned long page_offset)
77 {
78 	if (span->is_used || span->last_hole - span->start_hole < length - 1)
79 		return false;
80 
81 	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
82 			   page_offset;
83 	if (span->start_hole > span->last_hole ||
84 	    span->last_hole - span->start_hole < length - 1)
85 		return false;
86 	return true;
87 }
88 
89 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
90 				    unsigned long length,
91 				    unsigned long iova_alignment,
92 				    unsigned long page_offset)
93 {
94 	if (span->is_hole || span->last_used - span->start_used < length - 1)
95 		return false;
96 
97 	span->start_used = ALIGN(span->start_used, iova_alignment) |
98 			   page_offset;
99 	if (span->start_used > span->last_used ||
100 	    span->last_used - span->start_used < length - 1)
101 		return false;
102 	return true;
103 }
104 
105 /*
106  * Automatically find a block of IOVA that is not being used and not reserved.
107  * Does not return a 0 IOVA even if it is valid.
108  */
109 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
110 			   unsigned long addr, unsigned long length)
111 {
112 	unsigned long page_offset = addr % PAGE_SIZE;
113 	struct interval_tree_double_span_iter used_span;
114 	struct interval_tree_span_iter allowed_span;
115 	unsigned long max_alignment = PAGE_SIZE;
116 	unsigned long iova_alignment;
117 
118 	lockdep_assert_held(&iopt->iova_rwsem);
119 
120 	/* Protect roundup_pow-of_two() from overflow */
121 	if (length == 0 || length >= ULONG_MAX / 2)
122 		return -EOVERFLOW;
123 
124 	/*
125 	 * Keep alignment present in addr when building the IOVA, which
126 	 * increases the chance we can map a THP.
127 	 */
128 	if (!addr)
129 		iova_alignment = roundup_pow_of_two(length);
130 	else
131 		iova_alignment = min_t(unsigned long,
132 				       roundup_pow_of_two(length),
133 				       1UL << __ffs64(addr));
134 
135 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
136 	max_alignment = HPAGE_SIZE;
137 #endif
138 	/* Protect against ALIGN() overflow */
139 	if (iova_alignment >= max_alignment)
140 		iova_alignment = max_alignment;
141 
142 	if (iova_alignment < iopt->iova_alignment)
143 		return -EINVAL;
144 
145 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
146 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
147 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
148 			allowed_span.start_used = PAGE_SIZE;
149 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
150 			allowed_span.is_hole = false;
151 		}
152 
153 		if (!__alloc_iova_check_used(&allowed_span, length,
154 					     iova_alignment, page_offset))
155 			continue;
156 
157 		interval_tree_for_each_double_span(
158 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
159 			allowed_span.start_used, allowed_span.last_used) {
160 			if (!__alloc_iova_check_hole(&used_span, length,
161 						     iova_alignment,
162 						     page_offset))
163 				continue;
164 
165 			*iova = used_span.start_hole;
166 			return 0;
167 		}
168 	}
169 	return -ENOSPC;
170 }
171 
172 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
173 			   unsigned long length)
174 {
175 	unsigned long last;
176 
177 	lockdep_assert_held(&iopt->iova_rwsem);
178 
179 	if ((iova & (iopt->iova_alignment - 1)))
180 		return -EINVAL;
181 
182 	if (check_add_overflow(iova, length - 1, &last))
183 		return -EOVERFLOW;
184 
185 	/* No reserved IOVA intersects the range */
186 	if (iopt_reserved_iter_first(iopt, iova, last))
187 		return -EINVAL;
188 
189 	/* Check that there is not already a mapping in the range */
190 	if (iopt_area_iter_first(iopt, iova, last))
191 		return -EEXIST;
192 	return 0;
193 }
194 
195 /*
196  * The area takes a slice of the pages from start_bytes to start_byte + length
197  */
198 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
199 			    struct iopt_pages *pages, unsigned long iova,
200 			    unsigned long start_byte, unsigned long length,
201 			    int iommu_prot)
202 {
203 	lockdep_assert_held_write(&iopt->iova_rwsem);
204 
205 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
206 		return -EPERM;
207 
208 	area->iommu_prot = iommu_prot;
209 	area->page_offset = start_byte % PAGE_SIZE;
210 	if (area->page_offset & (iopt->iova_alignment - 1))
211 		return -EINVAL;
212 
213 	area->node.start = iova;
214 	if (check_add_overflow(iova, length - 1, &area->node.last))
215 		return -EOVERFLOW;
216 
217 	area->pages_node.start = start_byte / PAGE_SIZE;
218 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
219 		return -EOVERFLOW;
220 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
221 	if (WARN_ON(area->pages_node.last >= pages->npages))
222 		return -EOVERFLOW;
223 
224 	/*
225 	 * The area is inserted with a NULL pages indicating it is not fully
226 	 * initialized yet.
227 	 */
228 	area->iopt = iopt;
229 	interval_tree_insert(&area->node, &iopt->area_itree);
230 	return 0;
231 }
232 
233 static struct iopt_area *iopt_area_alloc(void)
234 {
235 	struct iopt_area *area;
236 
237 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
238 	if (!area)
239 		return NULL;
240 	RB_CLEAR_NODE(&area->node.rb);
241 	RB_CLEAR_NODE(&area->pages_node.rb);
242 	return area;
243 }
244 
245 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
246 				 struct list_head *pages_list,
247 				 unsigned long length, unsigned long *dst_iova,
248 				 int iommu_prot, unsigned int flags)
249 {
250 	struct iopt_pages_list *elm;
251 	unsigned long start;
252 	unsigned long iova;
253 	int rc = 0;
254 
255 	list_for_each_entry(elm, pages_list, next) {
256 		elm->area = iopt_area_alloc();
257 		if (!elm->area)
258 			return -ENOMEM;
259 	}
260 
261 	down_write(&iopt->iova_rwsem);
262 	if ((length & (iopt->iova_alignment - 1)) || !length) {
263 		rc = -EINVAL;
264 		goto out_unlock;
265 	}
266 
267 	if (flags & IOPT_ALLOC_IOVA) {
268 		/* Use the first entry to guess the ideal IOVA alignment */
269 		elm = list_first_entry(pages_list, struct iopt_pages_list,
270 				       next);
271 		switch (elm->pages->type) {
272 		case IOPT_ADDRESS_USER:
273 			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
274 			break;
275 		case IOPT_ADDRESS_FILE:
276 			start = elm->start_byte + elm->pages->start;
277 			break;
278 		}
279 		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
280 		if (rc)
281 			goto out_unlock;
282 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
283 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
284 			rc = -EINVAL;
285 			goto out_unlock;
286 		}
287 	} else {
288 		rc = iopt_check_iova(iopt, *dst_iova, length);
289 		if (rc)
290 			goto out_unlock;
291 	}
292 
293 	/*
294 	 * Areas are created with a NULL pages so that the IOVA space is
295 	 * reserved and we can unlock the iova_rwsem.
296 	 */
297 	iova = *dst_iova;
298 	list_for_each_entry(elm, pages_list, next) {
299 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
300 				      elm->start_byte, elm->length, iommu_prot);
301 		if (rc)
302 			goto out_unlock;
303 		iova += elm->length;
304 	}
305 
306 out_unlock:
307 	up_write(&iopt->iova_rwsem);
308 	return rc;
309 }
310 
311 static void iopt_abort_area(struct iopt_area *area)
312 {
313 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
314 		WARN_ON(area->pages);
315 	if (area->iopt) {
316 		down_write(&area->iopt->iova_rwsem);
317 		interval_tree_remove(&area->node, &area->iopt->area_itree);
318 		up_write(&area->iopt->iova_rwsem);
319 	}
320 	kfree(area);
321 }
322 
323 void iopt_free_pages_list(struct list_head *pages_list)
324 {
325 	struct iopt_pages_list *elm;
326 
327 	while ((elm = list_first_entry_or_null(pages_list,
328 					       struct iopt_pages_list, next))) {
329 		if (elm->area)
330 			iopt_abort_area(elm->area);
331 		if (elm->pages)
332 			iopt_put_pages(elm->pages);
333 		list_del(&elm->next);
334 		kfree(elm);
335 	}
336 }
337 
338 static int iopt_fill_domains_pages(struct list_head *pages_list)
339 {
340 	struct iopt_pages_list *undo_elm;
341 	struct iopt_pages_list *elm;
342 	int rc;
343 
344 	list_for_each_entry(elm, pages_list, next) {
345 		rc = iopt_area_fill_domains(elm->area, elm->pages);
346 		if (rc)
347 			goto err_undo;
348 	}
349 	return 0;
350 
351 err_undo:
352 	list_for_each_entry(undo_elm, pages_list, next) {
353 		if (undo_elm == elm)
354 			break;
355 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
356 	}
357 	return rc;
358 }
359 
360 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
361 		   unsigned long length, unsigned long *dst_iova,
362 		   int iommu_prot, unsigned int flags)
363 {
364 	struct iopt_pages_list *elm;
365 	int rc;
366 
367 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
368 				   iommu_prot, flags);
369 	if (rc)
370 		return rc;
371 
372 	down_read(&iopt->domains_rwsem);
373 	rc = iopt_fill_domains_pages(pages_list);
374 	if (rc)
375 		goto out_unlock_domains;
376 
377 	down_write(&iopt->iova_rwsem);
378 	list_for_each_entry(elm, pages_list, next) {
379 		/*
380 		 * area->pages must be set inside the domains_rwsem to ensure
381 		 * any newly added domains will get filled. Moves the reference
382 		 * in from the list.
383 		 */
384 		elm->area->pages = elm->pages;
385 		elm->pages = NULL;
386 		elm->area = NULL;
387 	}
388 	up_write(&iopt->iova_rwsem);
389 out_unlock_domains:
390 	up_read(&iopt->domains_rwsem);
391 	return rc;
392 }
393 
394 static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
395 			   struct iopt_pages *pages, unsigned long *iova,
396 			   unsigned long length, unsigned long start_byte,
397 			   int iommu_prot, unsigned int flags)
398 {
399 	struct iopt_pages_list elm = {};
400 	LIST_HEAD(pages_list);
401 	int rc;
402 
403 	elm.pages = pages;
404 	elm.start_byte = start_byte;
405 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
406 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
407 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
408 	elm.length = length;
409 	list_add(&elm.next, &pages_list);
410 
411 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
412 	if (rc) {
413 		if (elm.area)
414 			iopt_abort_area(elm.area);
415 		if (elm.pages)
416 			iopt_put_pages(elm.pages);
417 		return rc;
418 	}
419 	return 0;
420 }
421 
422 /**
423  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
424  * @ictx: iommufd_ctx the iopt is part of
425  * @iopt: io_pagetable to act on
426  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
427  *        the chosen iova on output. Otherwise is the iova to map to on input
428  * @uptr: User VA to map
429  * @length: Number of bytes to map
430  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
431  * @flags: IOPT_ALLOC_IOVA or zero
432  *
433  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
434  * page tables this will pin the pages and load them into the domain at iova.
435  * For non-domain page tables this will only setup a lazy reference and the
436  * caller must use iopt_access_pages() to touch them.
437  *
438  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
439  * destroyed.
440  */
441 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
442 			unsigned long *iova, void __user *uptr,
443 			unsigned long length, int iommu_prot,
444 			unsigned int flags)
445 {
446 	struct iopt_pages *pages;
447 
448 	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
449 	if (IS_ERR(pages))
450 		return PTR_ERR(pages);
451 
452 	return iopt_map_common(ictx, iopt, pages, iova, length,
453 			       uptr - pages->uptr, iommu_prot, flags);
454 }
455 
456 /**
457  * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
458  * @ictx: iommufd_ctx the iopt is part of
459  * @iopt: io_pagetable to act on
460  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
461  *        the chosen iova on output. Otherwise is the iova to map to on input
462  * @file: file to map
463  * @start: map file starting at this byte offset
464  * @length: Number of bytes to map
465  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
466  * @flags: IOPT_ALLOC_IOVA or zero
467  */
468 int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
469 			unsigned long *iova, struct file *file,
470 			unsigned long start, unsigned long length,
471 			int iommu_prot, unsigned int flags)
472 {
473 	struct iopt_pages *pages;
474 
475 	pages = iopt_alloc_file_pages(file, start, length,
476 				      iommu_prot & IOMMU_WRITE);
477 	if (IS_ERR(pages))
478 		return PTR_ERR(pages);
479 	return iopt_map_common(ictx, iopt, pages, iova, length,
480 			       start - pages->start, iommu_prot, flags);
481 }
482 
/*
 * Context handed through iova_bitmap_for_each() as the opaque argument of
 * __iommu_read_and_clear_dirty().
 */
struct iova_bitmap_fn_arg {
	/* Flags forwarded to ops->read_and_clear_dirty() */
	unsigned long flags;
	/* io_pagetable whose areas are walked */
	struct io_pagetable *iopt;
	/* Domain whose dirty bits are read */
	struct iommu_domain *domain;
	/* Dirty bitmap state forwarded to ops->read_and_clear_dirty() */
	struct iommu_dirty_bitmap *dirty;
};
489 
490 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
491 					unsigned long iova, size_t length,
492 					void *opaque)
493 {
494 	struct iopt_area *area;
495 	struct iopt_area_contig_iter iter;
496 	struct iova_bitmap_fn_arg *arg = opaque;
497 	struct iommu_domain *domain = arg->domain;
498 	struct iommu_dirty_bitmap *dirty = arg->dirty;
499 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
500 	unsigned long last_iova = iova + length - 1;
501 	unsigned long flags = arg->flags;
502 	int ret;
503 
504 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
505 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
506 
507 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
508 						last - iter.cur_iova + 1, flags,
509 						dirty);
510 		if (ret)
511 			return ret;
512 	}
513 
514 	if (!iopt_area_contig_done(&iter))
515 		return -EINVAL;
516 	return 0;
517 }
518 
519 static int
520 iommu_read_and_clear_dirty(struct iommu_domain *domain,
521 			   struct io_pagetable *iopt, unsigned long flags,
522 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
523 {
524 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
525 	struct iommu_iotlb_gather gather;
526 	struct iommu_dirty_bitmap dirty;
527 	struct iova_bitmap_fn_arg arg;
528 	struct iova_bitmap *iter;
529 	int ret = 0;
530 
531 	if (!ops || !ops->read_and_clear_dirty)
532 		return -EOPNOTSUPP;
533 
534 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
535 				 bitmap->page_size,
536 				 u64_to_user_ptr(bitmap->data));
537 	if (IS_ERR(iter))
538 		return -ENOMEM;
539 
540 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
541 
542 	arg.flags = flags;
543 	arg.iopt = iopt;
544 	arg.domain = domain;
545 	arg.dirty = &dirty;
546 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
547 
548 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
549 		iommu_iotlb_sync(domain, &gather);
550 
551 	iova_bitmap_free(iter);
552 
553 	return ret;
554 }
555 
556 int iommufd_check_iova_range(struct io_pagetable *iopt,
557 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
558 {
559 	size_t iommu_pgsize = iopt->iova_alignment;
560 	u64 last_iova;
561 
562 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
563 		return -EOVERFLOW;
564 
565 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
566 		return -EOVERFLOW;
567 
568 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
569 	    ((last_iova + 1) & (iommu_pgsize - 1)))
570 		return -EINVAL;
571 
572 	if (!bitmap->page_size)
573 		return -EINVAL;
574 
575 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
576 	    ((last_iova + 1) & (bitmap->page_size - 1)))
577 		return -EINVAL;
578 
579 	return 0;
580 }
581 
582 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
583 				   struct iommu_domain *domain,
584 				   unsigned long flags,
585 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
586 {
587 	int ret;
588 
589 	ret = iommufd_check_iova_range(iopt, bitmap);
590 	if (ret)
591 		return ret;
592 
593 	down_read(&iopt->iova_rwsem);
594 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
595 	up_read(&iopt->iova_rwsem);
596 
597 	return ret;
598 }
599 
600 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
601 				 struct iommu_domain *domain)
602 {
603 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
604 	struct iommu_iotlb_gather gather;
605 	struct iommu_dirty_bitmap dirty;
606 	struct iopt_area *area;
607 	int ret = 0;
608 
609 	lockdep_assert_held_read(&iopt->iova_rwsem);
610 
611 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
612 
613 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
614 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
615 		if (!area->pages)
616 			continue;
617 
618 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
619 						iopt_area_length(area), 0,
620 						&dirty);
621 		if (ret)
622 			break;
623 	}
624 
625 	iommu_iotlb_sync(domain, &gather);
626 	return ret;
627 }
628 
629 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
630 			    struct iommu_domain *domain, bool enable)
631 {
632 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
633 	int ret = 0;
634 
635 	if (!ops)
636 		return -EOPNOTSUPP;
637 
638 	down_read(&iopt->iova_rwsem);
639 
640 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
641 	if (enable) {
642 		ret = iopt_clear_dirty_data(iopt, domain);
643 		if (ret)
644 			goto out_unlock;
645 	}
646 
647 	ret = ops->set_dirty_tracking(domain, enable);
648 
649 out_unlock:
650 	up_read(&iopt->iova_rwsem);
651 	return ret;
652 }
653 
654 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
655 		   unsigned long length, struct list_head *pages_list)
656 {
657 	struct iopt_area_contig_iter iter;
658 	unsigned long last_iova;
659 	struct iopt_area *area;
660 	int rc;
661 
662 	if (!length)
663 		return -EINVAL;
664 	if (check_add_overflow(iova, length - 1, &last_iova))
665 		return -EOVERFLOW;
666 
667 	down_read(&iopt->iova_rwsem);
668 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
669 		struct iopt_pages_list *elm;
670 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
671 
672 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
673 		if (!elm) {
674 			rc = -ENOMEM;
675 			goto err_free;
676 		}
677 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
678 		elm->pages = area->pages;
679 		elm->length = (last - iter.cur_iova) + 1;
680 		kref_get(&elm->pages->kref);
681 		list_add_tail(&elm->next, pages_list);
682 	}
683 	if (!iopt_area_contig_done(&iter)) {
684 		rc = -ENOENT;
685 		goto err_free;
686 	}
687 	up_read(&iopt->iova_rwsem);
688 	return 0;
689 err_free:
690 	up_read(&iopt->iova_rwsem);
691 	iopt_free_pages_list(pages_list);
692 	return rc;
693 }
694 
/*
 * Unmap and free every area that lies entirely inside [start, last].
 *
 * Returns 0 if at least one byte was unmapped, -ENOENT if nothing was found
 * or an area only partially overlaps the range, -EBUSY if an area without
 * pages is encountered, and -EDEADLOCK if an area still has accesses after
 * more than 100 notification attempts. Except on the -EDEADLOCK path, the
 * number of bytes unmapped so far is stored in *unmapped when it is non-NULL.
 */
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* Areas cannot be split; each must fit inside the range */
		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		/*
		 * start is moved to the blocked area before every retry, so a
		 * different first area means progress was made; restart the
		 * retry count.
		 */
		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			/* Drop both locks before notifying the access users */
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		/*
		 * Detach the pages under the lock, then tear the area down
		 * with only the domains_rwsem held.
		 */
		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}
773 
774 /**
775  * iopt_unmap_iova() - Remove a range of iova
776  * @iopt: io_pagetable to act on
777  * @iova: Starting iova to unmap
778  * @length: Number of bytes to unmap
779  * @unmapped: Return number of bytes unmapped
780  *
781  * The requested range must be a superset of existing ranges.
782  * Splitting/truncating IOVA mappings is not allowed.
783  */
784 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
785 		    unsigned long length, unsigned long *unmapped)
786 {
787 	unsigned long iova_last;
788 
789 	if (!length)
790 		return -EINVAL;
791 
792 	if (check_add_overflow(iova, length - 1, &iova_last))
793 		return -EOVERFLOW;
794 
795 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
796 }
797 
798 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
799 {
800 	int rc;
801 
802 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
803 	/* If the IOVAs are empty then unmap all succeeds */
804 	if (rc == -ENOENT)
805 		return 0;
806 	return rc;
807 }
808 
809 /* The caller must always free all the nodes in the allowed_iova rb_root. */
810 int iopt_set_allow_iova(struct io_pagetable *iopt,
811 			struct rb_root_cached *allowed_iova)
812 {
813 	struct iopt_allowed *allowed;
814 
815 	down_write(&iopt->iova_rwsem);
816 	swap(*allowed_iova, iopt->allowed_itree);
817 
818 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
819 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
820 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
821 					     allowed->node.last)) {
822 			swap(*allowed_iova, iopt->allowed_itree);
823 			up_write(&iopt->iova_rwsem);
824 			return -EADDRINUSE;
825 		}
826 	}
827 	up_write(&iopt->iova_rwsem);
828 	return 0;
829 }
830 
831 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
832 		      unsigned long last, void *owner)
833 {
834 	struct iopt_reserved *reserved;
835 
836 	lockdep_assert_held_write(&iopt->iova_rwsem);
837 
838 	if (iopt_area_iter_first(iopt, start, last) ||
839 	    iopt_allowed_iter_first(iopt, start, last))
840 		return -EADDRINUSE;
841 
842 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
843 	if (!reserved)
844 		return -ENOMEM;
845 	reserved->node.start = start;
846 	reserved->node.last = last;
847 	reserved->owner = owner;
848 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
849 	return 0;
850 }
851 
852 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
853 {
854 	struct iopt_reserved *reserved, *next;
855 
856 	lockdep_assert_held_write(&iopt->iova_rwsem);
857 
858 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
859 	     reserved = next) {
860 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
861 
862 		if (reserved->owner == owner) {
863 			interval_tree_remove(&reserved->node,
864 					     &iopt->reserved_itree);
865 			kfree(reserved);
866 		}
867 	}
868 }
869 
870 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
871 {
872 	down_write(&iopt->iova_rwsem);
873 	__iopt_remove_reserved_iova(iopt, owner);
874 	up_write(&iopt->iova_rwsem);
875 }
876 
877 void iopt_init_table(struct io_pagetable *iopt)
878 {
879 	init_rwsem(&iopt->iova_rwsem);
880 	init_rwsem(&iopt->domains_rwsem);
881 	iopt->area_itree = RB_ROOT_CACHED;
882 	iopt->allowed_itree = RB_ROOT_CACHED;
883 	iopt->reserved_itree = RB_ROOT_CACHED;
884 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
885 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
886 
887 	/*
888 	 * iopt's start as SW tables that can use the entire size_t IOVA space
889 	 * due to the use of size_t in the APIs. They have no alignment
890 	 * restriction.
891 	 */
892 	iopt->iova_alignment = 1;
893 }
894 
895 void iopt_destroy_table(struct io_pagetable *iopt)
896 {
897 	struct interval_tree_node *node;
898 
899 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
900 		iopt_remove_reserved_iova(iopt, NULL);
901 
902 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
903 						ULONG_MAX))) {
904 		interval_tree_remove(node, &iopt->allowed_itree);
905 		kfree(container_of(node, struct iopt_allowed, node));
906 	}
907 
908 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
909 	WARN_ON(!xa_empty(&iopt->domains));
910 	WARN_ON(!xa_empty(&iopt->access_list));
911 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
912 }
913 
914 /**
915  * iopt_unfill_domain() - Unfill a domain with PFNs
916  * @iopt: io_pagetable to act on
917  * @domain: domain to unfill
918  *
919  * This is used when removing a domain from the iopt. Every area in the iopt
920  * will be unmapped from the domain. The domain must already be removed from the
921  * domains xarray.
922  */
923 static void iopt_unfill_domain(struct io_pagetable *iopt,
924 			       struct iommu_domain *domain)
925 {
926 	struct iopt_area *area;
927 
928 	lockdep_assert_held(&iopt->iova_rwsem);
929 	lockdep_assert_held_write(&iopt->domains_rwsem);
930 
931 	/*
932 	 * Some other domain is holding all the pfns still, rapidly unmap this
933 	 * domain.
934 	 */
935 	if (iopt->next_domain_id != 0) {
936 		/* Pick an arbitrary remaining domain to act as storage */
937 		struct iommu_domain *storage_domain =
938 			xa_load(&iopt->domains, 0);
939 
940 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
941 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
942 			struct iopt_pages *pages = area->pages;
943 
944 			if (!pages)
945 				continue;
946 
947 			mutex_lock(&pages->mutex);
948 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
949 				WARN_ON(!area->storage_domain);
950 			if (area->storage_domain == domain)
951 				area->storage_domain = storage_domain;
952 			mutex_unlock(&pages->mutex);
953 
954 			iopt_area_unmap_domain(area, domain);
955 		}
956 		return;
957 	}
958 
959 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
960 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
961 		struct iopt_pages *pages = area->pages;
962 
963 		if (!pages)
964 			continue;
965 
966 		mutex_lock(&pages->mutex);
967 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
968 		WARN_ON(area->storage_domain != domain);
969 		area->storage_domain = NULL;
970 		iopt_area_unfill_domain(area, pages, domain);
971 		mutex_unlock(&pages->mutex);
972 	}
973 }
974 
975 /**
976  * iopt_fill_domain() - Fill a domain with PFNs
977  * @iopt: io_pagetable to act on
978  * @domain: domain to fill
979  *
980  * Fill the domain with PFNs from every area in the iopt. On failure the domain
981  * is left unchanged.
982  */
983 static int iopt_fill_domain(struct io_pagetable *iopt,
984 			    struct iommu_domain *domain)
985 {
986 	struct iopt_area *end_area;
987 	struct iopt_area *area;
988 	int rc;
989 
990 	lockdep_assert_held(&iopt->iova_rwsem);
991 	lockdep_assert_held_write(&iopt->domains_rwsem);
992 
993 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
994 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
995 		struct iopt_pages *pages = area->pages;
996 
997 		if (!pages)
998 			continue;
999 
1000 		mutex_lock(&pages->mutex);
1001 		rc = iopt_area_fill_domain(area, domain);
1002 		if (rc) {
1003 			mutex_unlock(&pages->mutex);
1004 			goto out_unfill;
1005 		}
1006 		if (!area->storage_domain) {
1007 			WARN_ON(iopt->next_domain_id != 0);
1008 			area->storage_domain = domain;
1009 			interval_tree_insert(&area->pages_node,
1010 					     &pages->domains_itree);
1011 		}
1012 		mutex_unlock(&pages->mutex);
1013 	}
1014 	return 0;
1015 
1016 out_unfill:
1017 	end_area = area;
1018 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1019 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1020 		struct iopt_pages *pages = area->pages;
1021 
1022 		if (area == end_area)
1023 			break;
1024 		if (!pages)
1025 			continue;
1026 		mutex_lock(&pages->mutex);
1027 		if (iopt->next_domain_id == 0) {
1028 			interval_tree_remove(&area->pages_node,
1029 					     &pages->domains_itree);
1030 			area->storage_domain = NULL;
1031 		}
1032 		iopt_area_unfill_domain(area, pages, domain);
1033 		mutex_unlock(&pages->mutex);
1034 	}
1035 	return rc;
1036 }
1037 
1038 /* All existing area's conform to an increased page size */
1039 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1040 				     unsigned long new_iova_alignment)
1041 {
1042 	unsigned long align_mask = new_iova_alignment - 1;
1043 	struct iopt_area *area;
1044 
1045 	lockdep_assert_held(&iopt->iova_rwsem);
1046 	lockdep_assert_held(&iopt->domains_rwsem);
1047 
1048 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1049 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1050 		if ((iopt_area_iova(area) & align_mask) ||
1051 		    (iopt_area_length(area) & align_mask) ||
1052 		    (area->page_offset & align_mask))
1053 			return -EADDRINUSE;
1054 
1055 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1056 		struct iommufd_access *access;
1057 		unsigned long index;
1058 
1059 		xa_for_each(&iopt->access_list, index, access)
1060 			if (WARN_ON(access->iova_alignment >
1061 				    new_iova_alignment))
1062 				return -EADDRINUSE;
1063 	}
1064 	return 0;
1065 }
1066 
1067 int iopt_table_add_domain(struct io_pagetable *iopt,
1068 			  struct iommu_domain *domain)
1069 {
1070 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1071 	struct iommu_domain *iter_domain;
1072 	unsigned int new_iova_alignment;
1073 	unsigned long index;
1074 	int rc;
1075 
1076 	down_write(&iopt->domains_rwsem);
1077 	down_write(&iopt->iova_rwsem);
1078 
1079 	xa_for_each(&iopt->domains, index, iter_domain) {
1080 		if (WARN_ON(iter_domain == domain)) {
1081 			rc = -EEXIST;
1082 			goto out_unlock;
1083 		}
1084 	}
1085 
1086 	/*
1087 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1088 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1089 	 * objects into the iommu_domain.
1090 	 *
1091 	 * A iommu_domain must always be able to accept PAGE_SIZE to be
1092 	 * compatible as we can't guarantee higher contiguity.
1093 	 */
1094 	new_iova_alignment = max_t(unsigned long,
1095 				   1UL << __ffs(domain->pgsize_bitmap),
1096 				   iopt->iova_alignment);
1097 	if (new_iova_alignment > PAGE_SIZE) {
1098 		rc = -EINVAL;
1099 		goto out_unlock;
1100 	}
1101 	if (new_iova_alignment != iopt->iova_alignment) {
1102 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1103 		if (rc)
1104 			goto out_unlock;
1105 	}
1106 
1107 	/* No area exists that is outside the allowed domain aperture */
1108 	if (geometry->aperture_start != 0) {
1109 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1110 				       domain);
1111 		if (rc)
1112 			goto out_reserved;
1113 	}
1114 	if (geometry->aperture_end != ULONG_MAX) {
1115 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1116 				       ULONG_MAX, domain);
1117 		if (rc)
1118 			goto out_reserved;
1119 	}
1120 
1121 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1122 	if (rc)
1123 		goto out_reserved;
1124 
1125 	rc = iopt_fill_domain(iopt, domain);
1126 	if (rc)
1127 		goto out_release;
1128 
1129 	iopt->iova_alignment = new_iova_alignment;
1130 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1131 	iopt->next_domain_id++;
1132 	up_write(&iopt->iova_rwsem);
1133 	up_write(&iopt->domains_rwsem);
1134 	return 0;
1135 out_release:
1136 	xa_release(&iopt->domains, iopt->next_domain_id);
1137 out_reserved:
1138 	__iopt_remove_reserved_iova(iopt, domain);
1139 out_unlock:
1140 	up_write(&iopt->iova_rwsem);
1141 	up_write(&iopt->domains_rwsem);
1142 	return rc;
1143 }
1144 
1145 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1146 {
1147 	unsigned long new_iova_alignment;
1148 	struct iommufd_access *access;
1149 	struct iommu_domain *domain;
1150 	unsigned long index;
1151 
1152 	lockdep_assert_held_write(&iopt->iova_rwsem);
1153 	lockdep_assert_held(&iopt->domains_rwsem);
1154 
1155 	/* See batch_iommu_map_small() */
1156 	if (iopt->disable_large_pages)
1157 		new_iova_alignment = PAGE_SIZE;
1158 	else
1159 		new_iova_alignment = 1;
1160 
1161 	xa_for_each(&iopt->domains, index, domain)
1162 		new_iova_alignment = max_t(unsigned long,
1163 					   1UL << __ffs(domain->pgsize_bitmap),
1164 					   new_iova_alignment);
1165 	xa_for_each(&iopt->access_list, index, access)
1166 		new_iova_alignment = max_t(unsigned long,
1167 					   access->iova_alignment,
1168 					   new_iova_alignment);
1169 
1170 	if (new_iova_alignment > iopt->iova_alignment) {
1171 		int rc;
1172 
1173 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1174 		if (rc)
1175 			return rc;
1176 	}
1177 	iopt->iova_alignment = new_iova_alignment;
1178 	return 0;
1179 }
1180 
1181 void iopt_table_remove_domain(struct io_pagetable *iopt,
1182 			      struct iommu_domain *domain)
1183 {
1184 	struct iommu_domain *iter_domain = NULL;
1185 	unsigned long index;
1186 
1187 	down_write(&iopt->domains_rwsem);
1188 	down_write(&iopt->iova_rwsem);
1189 
1190 	xa_for_each(&iopt->domains, index, iter_domain)
1191 		if (iter_domain == domain)
1192 			break;
1193 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1194 		goto out_unlock;
1195 
1196 	/*
1197 	 * Compress the xarray to keep it linear by swapping the entry to erase
1198 	 * with the tail entry and shrinking the tail.
1199 	 */
1200 	iopt->next_domain_id--;
1201 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1202 	if (index != iopt->next_domain_id)
1203 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1204 
1205 	iopt_unfill_domain(iopt, domain);
1206 	__iopt_remove_reserved_iova(iopt, domain);
1207 
1208 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1209 out_unlock:
1210 	up_write(&iopt->iova_rwsem);
1211 	up_write(&iopt->domains_rwsem);
1212 }
1213 
1214 /**
1215  * iopt_area_split - Split an area into two parts at iova
1216  * @area: The area to split
1217  * @iova: Becomes the last of a new area
1218  *
1219  * This splits an area into two. It is part of the VFIO compatibility to allow
1220  * poking a hole in the mapping. The two areas continue to point at the same
1221  * iopt_pages, just with different starting bytes.
1222  */
1223 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1224 {
1225 	unsigned long alignment = area->iopt->iova_alignment;
1226 	unsigned long last_iova = iopt_area_last_iova(area);
1227 	unsigned long start_iova = iopt_area_iova(area);
1228 	unsigned long new_start = iova + 1;
1229 	struct io_pagetable *iopt = area->iopt;
1230 	struct iopt_pages *pages = area->pages;
1231 	struct iopt_area *lhs;
1232 	struct iopt_area *rhs;
1233 	int rc;
1234 
1235 	lockdep_assert_held_write(&iopt->iova_rwsem);
1236 
1237 	if (iova == start_iova || iova == last_iova)
1238 		return 0;
1239 
1240 	if (!pages || area->prevent_access)
1241 		return -EBUSY;
1242 
1243 	if (new_start & (alignment - 1) ||
1244 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1245 		return -EINVAL;
1246 
1247 	lhs = iopt_area_alloc();
1248 	if (!lhs)
1249 		return -ENOMEM;
1250 
1251 	rhs = iopt_area_alloc();
1252 	if (!rhs) {
1253 		rc = -ENOMEM;
1254 		goto err_free_lhs;
1255 	}
1256 
1257 	mutex_lock(&pages->mutex);
1258 	/*
1259 	 * Splitting is not permitted if an access exists, we don't track enough
1260 	 * information to split existing accesses.
1261 	 */
1262 	if (area->num_accesses) {
1263 		rc = -EINVAL;
1264 		goto err_unlock;
1265 	}
1266 
1267 	/*
1268 	 * Splitting is not permitted if a domain could have been mapped with
1269 	 * huge pages.
1270 	 */
1271 	if (area->storage_domain && !iopt->disable_large_pages) {
1272 		rc = -EINVAL;
1273 		goto err_unlock;
1274 	}
1275 
1276 	interval_tree_remove(&area->node, &iopt->area_itree);
1277 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1278 			      iopt_area_start_byte(area, start_iova),
1279 			      (new_start - 1) - start_iova + 1,
1280 			      area->iommu_prot);
1281 	if (WARN_ON(rc))
1282 		goto err_insert;
1283 
1284 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1285 			      iopt_area_start_byte(area, new_start),
1286 			      last_iova - new_start + 1, area->iommu_prot);
1287 	if (WARN_ON(rc))
1288 		goto err_remove_lhs;
1289 
1290 	/*
1291 	 * If the original area has filled a domain, domains_itree has to be
1292 	 * updated.
1293 	 */
1294 	if (area->storage_domain) {
1295 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1296 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1297 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1298 	}
1299 
1300 	lhs->storage_domain = area->storage_domain;
1301 	lhs->pages = area->pages;
1302 	rhs->storage_domain = area->storage_domain;
1303 	rhs->pages = area->pages;
1304 	kref_get(&rhs->pages->kref);
1305 	kfree(area);
1306 	mutex_unlock(&pages->mutex);
1307 
1308 	/*
1309 	 * No change to domains or accesses because the pages hasn't been
1310 	 * changed
1311 	 */
1312 	return 0;
1313 
1314 err_remove_lhs:
1315 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1316 err_insert:
1317 	interval_tree_insert(&area->node, &iopt->area_itree);
1318 err_unlock:
1319 	mutex_unlock(&pages->mutex);
1320 	kfree(rhs);
1321 err_free_lhs:
1322 	kfree(lhs);
1323 	return rc;
1324 }
1325 
1326 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1327 		  size_t num_iovas)
1328 {
1329 	int rc = 0;
1330 	int i;
1331 
1332 	down_write(&iopt->iova_rwsem);
1333 	for (i = 0; i < num_iovas; i++) {
1334 		struct iopt_area *area;
1335 
1336 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1337 		if (!area)
1338 			continue;
1339 		rc = iopt_area_split(area, iovas[i]);
1340 		if (rc)
1341 			break;
1342 	}
1343 	up_write(&iopt->iova_rwsem);
1344 	return rc;
1345 }
1346 
1347 void iopt_enable_large_pages(struct io_pagetable *iopt)
1348 {
1349 	int rc;
1350 
1351 	down_write(&iopt->domains_rwsem);
1352 	down_write(&iopt->iova_rwsem);
1353 	WRITE_ONCE(iopt->disable_large_pages, false);
1354 	rc = iopt_calculate_iova_alignment(iopt);
1355 	WARN_ON(rc);
1356 	up_write(&iopt->iova_rwsem);
1357 	up_write(&iopt->domains_rwsem);
1358 }
1359 
1360 int iopt_disable_large_pages(struct io_pagetable *iopt)
1361 {
1362 	int rc = 0;
1363 
1364 	down_write(&iopt->domains_rwsem);
1365 	down_write(&iopt->iova_rwsem);
1366 	if (iopt->disable_large_pages)
1367 		goto out_unlock;
1368 
1369 	/* Won't do it if domains already have pages mapped in them */
1370 	if (!xa_empty(&iopt->domains) &&
1371 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1372 		rc = -EINVAL;
1373 		goto out_unlock;
1374 	}
1375 
1376 	WRITE_ONCE(iopt->disable_large_pages, true);
1377 	rc = iopt_calculate_iova_alignment(iopt);
1378 	if (rc)
1379 		WRITE_ONCE(iopt->disable_large_pages, false);
1380 out_unlock:
1381 	up_write(&iopt->iova_rwsem);
1382 	up_write(&iopt->domains_rwsem);
1383 	return rc;
1384 }
1385 
1386 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1387 {
1388 	u32 new_id;
1389 	int rc;
1390 
1391 	down_write(&iopt->domains_rwsem);
1392 	down_write(&iopt->iova_rwsem);
1393 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1394 		      GFP_KERNEL_ACCOUNT);
1395 
1396 	if (rc)
1397 		goto out_unlock;
1398 
1399 	rc = iopt_calculate_iova_alignment(iopt);
1400 	if (rc) {
1401 		xa_erase(&iopt->access_list, new_id);
1402 		goto out_unlock;
1403 	}
1404 	access->iopt_access_list_id = new_id;
1405 
1406 out_unlock:
1407 	up_write(&iopt->iova_rwsem);
1408 	up_write(&iopt->domains_rwsem);
1409 	return rc;
1410 }
1411 
1412 void iopt_remove_access(struct io_pagetable *iopt,
1413 			struct iommufd_access *access,
1414 			u32 iopt_access_list_id)
1415 {
1416 	down_write(&iopt->domains_rwsem);
1417 	down_write(&iopt->iova_rwsem);
1418 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1419 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1420 	up_write(&iopt->iova_rwsem);
1421 	up_write(&iopt->domains_rwsem);
1422 }
1423 
1424 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1425 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1426 					struct device *dev,
1427 					phys_addr_t *sw_msi_start)
1428 {
1429 	struct iommu_resv_region *resv;
1430 	LIST_HEAD(resv_regions);
1431 	unsigned int num_hw_msi = 0;
1432 	unsigned int num_sw_msi = 0;
1433 	int rc;
1434 
1435 	if (iommufd_should_fail())
1436 		return -EINVAL;
1437 
1438 	down_write(&iopt->iova_rwsem);
1439 	/* FIXME: drivers allocate memory but there is no failure propogated */
1440 	iommu_get_resv_regions(dev, &resv_regions);
1441 
1442 	list_for_each_entry(resv, &resv_regions, list) {
1443 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1444 			continue;
1445 
1446 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1447 			num_hw_msi++;
1448 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1449 			*sw_msi_start = resv->start;
1450 			num_sw_msi++;
1451 		}
1452 
1453 		rc = iopt_reserve_iova(iopt, resv->start,
1454 				       resv->length - 1 + resv->start, dev);
1455 		if (rc)
1456 			goto out_reserved;
1457 	}
1458 
1459 	/* Drivers must offer sane combinations of regions */
1460 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1461 		rc = -EINVAL;
1462 		goto out_reserved;
1463 	}
1464 
1465 	rc = 0;
1466 	goto out_free_resv;
1467 
1468 out_reserved:
1469 	__iopt_remove_reserved_iova(iopt, dev);
1470 out_free_resv:
1471 	iommu_put_resv_regions(dev, &resv_regions);
1472 	up_write(&iopt->iova_rwsem);
1473 	return rc;
1474 }
1475