xref: /linux/drivers/iommu/iommufd/io_pagetable.c (revision f85f5ae45ad945270a8884261de8249431e8b5a6)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The datastructure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and xarray.
10  */
11 #include <linux/iommufd.h>
12 #include <linux/lockdep.h>
13 #include <linux/iommu.h>
14 #include <linux/sched/mm.h>
15 #include <linux/err.h>
16 #include <linux/slab.h>
17 #include <linux/errno.h>
18 
19 #include "io_pagetable.h"
20 #include "double_span.h"
21 
22 struct iopt_pages_list {
23 	struct iopt_pages *pages;
24 	struct iopt_area *area;
25 	struct list_head next;
26 	unsigned long start_byte;
27 	unsigned long length;
28 };
29 
30 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
31 					struct io_pagetable *iopt,
32 					unsigned long iova,
33 					unsigned long last_iova)
34 {
35 	lockdep_assert_held(&iopt->iova_rwsem);
36 
37 	iter->cur_iova = iova;
38 	iter->last_iova = last_iova;
39 	iter->area = iopt_area_iter_first(iopt, iova, iova);
40 	if (!iter->area)
41 		return NULL;
42 	if (!iter->area->pages) {
43 		iter->area = NULL;
44 		return NULL;
45 	}
46 	return iter->area;
47 }
48 
49 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
50 {
51 	unsigned long last_iova;
52 
53 	if (!iter->area)
54 		return NULL;
55 	last_iova = iopt_area_last_iova(iter->area);
56 	if (iter->last_iova <= last_iova)
57 		return NULL;
58 
59 	iter->cur_iova = last_iova + 1;
60 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
61 					 iter->last_iova);
62 	if (!iter->area)
63 		return NULL;
64 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
65 	    !iter->area->pages) {
66 		iter->area = NULL;
67 		return NULL;
68 	}
69 	return iter->area;
70 }
71 
72 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
73 				    unsigned long length,
74 				    unsigned long iova_alignment,
75 				    unsigned long page_offset)
76 {
77 	if (span->is_used || span->last_hole - span->start_hole < length - 1)
78 		return false;
79 
80 	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
81 			   page_offset;
82 	if (span->start_hole > span->last_hole ||
83 	    span->last_hole - span->start_hole < length - 1)
84 		return false;
85 	return true;
86 }
87 
88 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
89 				    unsigned long length,
90 				    unsigned long iova_alignment,
91 				    unsigned long page_offset)
92 {
93 	if (span->is_hole || span->last_used - span->start_used < length - 1)
94 		return false;
95 
96 	span->start_used = ALIGN(span->start_used, iova_alignment) |
97 			   page_offset;
98 	if (span->start_used > span->last_used ||
99 	    span->last_used - span->start_used < length - 1)
100 		return false;
101 	return true;
102 }
103 
104 /*
105  * Automatically find a block of IOVA that is not being used and not reserved.
106  * Does not return a 0 IOVA even if it is valid.
107  */
108 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
109 			   unsigned long uptr, unsigned long length)
110 {
111 	unsigned long page_offset = uptr % PAGE_SIZE;
112 	struct interval_tree_double_span_iter used_span;
113 	struct interval_tree_span_iter allowed_span;
114 	unsigned long iova_alignment;
115 
116 	lockdep_assert_held(&iopt->iova_rwsem);
117 
118 	/* Protect roundup_pow-of_two() from overflow */
119 	if (length == 0 || length >= ULONG_MAX / 2)
120 		return -EOVERFLOW;
121 
122 	/*
123 	 * Keep alignment present in the uptr when building the IOVA, this
124 	 * increases the chance we can map a THP.
125 	 */
126 	if (!uptr)
127 		iova_alignment = roundup_pow_of_two(length);
128 	else
129 		iova_alignment = min_t(unsigned long,
130 				       roundup_pow_of_two(length),
131 				       1UL << __ffs64(uptr));
132 
133 	if (iova_alignment < iopt->iova_alignment)
134 		return -EINVAL;
135 
136 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
137 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
138 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
139 			allowed_span.start_used = PAGE_SIZE;
140 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
141 			allowed_span.is_hole = false;
142 		}
143 
144 		if (!__alloc_iova_check_used(&allowed_span, length,
145 					     iova_alignment, page_offset))
146 			continue;
147 
148 		interval_tree_for_each_double_span(
149 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
150 			allowed_span.start_used, allowed_span.last_used) {
151 			if (!__alloc_iova_check_hole(&used_span, length,
152 						     iova_alignment,
153 						     page_offset))
154 				continue;
155 
156 			*iova = used_span.start_hole;
157 			return 0;
158 		}
159 	}
160 	return -ENOSPC;
161 }
162 
163 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
164 			   unsigned long length)
165 {
166 	unsigned long last;
167 
168 	lockdep_assert_held(&iopt->iova_rwsem);
169 
170 	if ((iova & (iopt->iova_alignment - 1)))
171 		return -EINVAL;
172 
173 	if (check_add_overflow(iova, length - 1, &last))
174 		return -EOVERFLOW;
175 
176 	/* No reserved IOVA intersects the range */
177 	if (iopt_reserved_iter_first(iopt, iova, last))
178 		return -EINVAL;
179 
180 	/* Check that there is not already a mapping in the range */
181 	if (iopt_area_iter_first(iopt, iova, last))
182 		return -EEXIST;
183 	return 0;
184 }
185 
186 /*
187  * The area takes a slice of the pages from start_bytes to start_byte + length
188  */
189 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
190 			    struct iopt_pages *pages, unsigned long iova,
191 			    unsigned long start_byte, unsigned long length,
192 			    int iommu_prot)
193 {
194 	lockdep_assert_held_write(&iopt->iova_rwsem);
195 
196 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
197 		return -EPERM;
198 
199 	area->iommu_prot = iommu_prot;
200 	area->page_offset = start_byte % PAGE_SIZE;
201 	if (area->page_offset & (iopt->iova_alignment - 1))
202 		return -EINVAL;
203 
204 	area->node.start = iova;
205 	if (check_add_overflow(iova, length - 1, &area->node.last))
206 		return -EOVERFLOW;
207 
208 	area->pages_node.start = start_byte / PAGE_SIZE;
209 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
210 		return -EOVERFLOW;
211 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
212 	if (WARN_ON(area->pages_node.last >= pages->npages))
213 		return -EOVERFLOW;
214 
215 	/*
216 	 * The area is inserted with a NULL pages indicating it is not fully
217 	 * initialized yet.
218 	 */
219 	area->iopt = iopt;
220 	interval_tree_insert(&area->node, &iopt->area_itree);
221 	return 0;
222 }
223 
224 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
225 				 struct list_head *pages_list,
226 				 unsigned long length, unsigned long *dst_iova,
227 				 int iommu_prot, unsigned int flags)
228 {
229 	struct iopt_pages_list *elm;
230 	unsigned long iova;
231 	int rc = 0;
232 
233 	list_for_each_entry(elm, pages_list, next) {
234 		elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
235 		if (!elm->area)
236 			return -ENOMEM;
237 	}
238 
239 	down_write(&iopt->iova_rwsem);
240 	if ((length & (iopt->iova_alignment - 1)) || !length) {
241 		rc = -EINVAL;
242 		goto out_unlock;
243 	}
244 
245 	if (flags & IOPT_ALLOC_IOVA) {
246 		/* Use the first entry to guess the ideal IOVA alignment */
247 		elm = list_first_entry(pages_list, struct iopt_pages_list,
248 				       next);
249 		rc = iopt_alloc_iova(
250 			iopt, dst_iova,
251 			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
252 		if (rc)
253 			goto out_unlock;
254 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
255 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
256 			rc = -EINVAL;
257 			goto out_unlock;
258 		}
259 	} else {
260 		rc = iopt_check_iova(iopt, *dst_iova, length);
261 		if (rc)
262 			goto out_unlock;
263 	}
264 
265 	/*
266 	 * Areas are created with a NULL pages so that the IOVA space is
267 	 * reserved and we can unlock the iova_rwsem.
268 	 */
269 	iova = *dst_iova;
270 	list_for_each_entry(elm, pages_list, next) {
271 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
272 				      elm->start_byte, elm->length, iommu_prot);
273 		if (rc)
274 			goto out_unlock;
275 		iova += elm->length;
276 	}
277 
278 out_unlock:
279 	up_write(&iopt->iova_rwsem);
280 	return rc;
281 }
282 
283 static void iopt_abort_area(struct iopt_area *area)
284 {
285 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
286 		WARN_ON(area->pages);
287 	if (area->iopt) {
288 		down_write(&area->iopt->iova_rwsem);
289 		interval_tree_remove(&area->node, &area->iopt->area_itree);
290 		up_write(&area->iopt->iova_rwsem);
291 	}
292 	kfree(area);
293 }
294 
295 void iopt_free_pages_list(struct list_head *pages_list)
296 {
297 	struct iopt_pages_list *elm;
298 
299 	while ((elm = list_first_entry_or_null(pages_list,
300 					       struct iopt_pages_list, next))) {
301 		if (elm->area)
302 			iopt_abort_area(elm->area);
303 		if (elm->pages)
304 			iopt_put_pages(elm->pages);
305 		list_del(&elm->next);
306 		kfree(elm);
307 	}
308 }
309 
310 static int iopt_fill_domains_pages(struct list_head *pages_list)
311 {
312 	struct iopt_pages_list *undo_elm;
313 	struct iopt_pages_list *elm;
314 	int rc;
315 
316 	list_for_each_entry(elm, pages_list, next) {
317 		rc = iopt_area_fill_domains(elm->area, elm->pages);
318 		if (rc)
319 			goto err_undo;
320 	}
321 	return 0;
322 
323 err_undo:
324 	list_for_each_entry(undo_elm, pages_list, next) {
325 		if (undo_elm == elm)
326 			break;
327 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
328 	}
329 	return rc;
330 }
331 
332 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
333 		   unsigned long length, unsigned long *dst_iova,
334 		   int iommu_prot, unsigned int flags)
335 {
336 	struct iopt_pages_list *elm;
337 	int rc;
338 
339 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
340 				   iommu_prot, flags);
341 	if (rc)
342 		return rc;
343 
344 	down_read(&iopt->domains_rwsem);
345 	rc = iopt_fill_domains_pages(pages_list);
346 	if (rc)
347 		goto out_unlock_domains;
348 
349 	down_write(&iopt->iova_rwsem);
350 	list_for_each_entry(elm, pages_list, next) {
351 		/*
352 		 * area->pages must be set inside the domains_rwsem to ensure
353 		 * any newly added domains will get filled. Moves the reference
354 		 * in from the list.
355 		 */
356 		elm->area->pages = elm->pages;
357 		elm->pages = NULL;
358 		elm->area = NULL;
359 	}
360 	up_write(&iopt->iova_rwsem);
361 out_unlock_domains:
362 	up_read(&iopt->domains_rwsem);
363 	return rc;
364 }
365 
366 /**
367  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
368  * @ictx: iommufd_ctx the iopt is part of
369  * @iopt: io_pagetable to act on
370  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
371  *        the chosen iova on output. Otherwise is the iova to map to on input
372  * @uptr: User VA to map
373  * @length: Number of bytes to map
374  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
375  * @flags: IOPT_ALLOC_IOVA or zero
376  *
377  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
378  * page tables this will pin the pages and load them into the domain at iova.
379  * For non-domain page tables this will only setup a lazy reference and the
380  * caller must use iopt_access_pages() to touch them.
381  *
382  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
383  * destroyed.
384  */
385 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
386 			unsigned long *iova, void __user *uptr,
387 			unsigned long length, int iommu_prot,
388 			unsigned int flags)
389 {
390 	struct iopt_pages_list elm = {};
391 	LIST_HEAD(pages_list);
392 	int rc;
393 
394 	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
395 	if (IS_ERR(elm.pages))
396 		return PTR_ERR(elm.pages);
397 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
398 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
399 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
400 	elm.start_byte = uptr - elm.pages->uptr;
401 	elm.length = length;
402 	list_add(&elm.next, &pages_list);
403 
404 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
405 	if (rc) {
406 		if (elm.area)
407 			iopt_abort_area(elm.area);
408 		if (elm.pages)
409 			iopt_put_pages(elm.pages);
410 		return rc;
411 	}
412 	return 0;
413 }
414 
415 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
416 		   unsigned long length, struct list_head *pages_list)
417 {
418 	struct iopt_area_contig_iter iter;
419 	unsigned long last_iova;
420 	struct iopt_area *area;
421 	int rc;
422 
423 	if (!length)
424 		return -EINVAL;
425 	if (check_add_overflow(iova, length - 1, &last_iova))
426 		return -EOVERFLOW;
427 
428 	down_read(&iopt->iova_rwsem);
429 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
430 		struct iopt_pages_list *elm;
431 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
432 
433 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
434 		if (!elm) {
435 			rc = -ENOMEM;
436 			goto err_free;
437 		}
438 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
439 		elm->pages = area->pages;
440 		elm->length = (last - iter.cur_iova) + 1;
441 		kref_get(&elm->pages->kref);
442 		list_add_tail(&elm->next, pages_list);
443 	}
444 	if (!iopt_area_contig_done(&iter)) {
445 		rc = -ENOENT;
446 		goto err_free;
447 	}
448 	up_read(&iopt->iova_rwsem);
449 	return 0;
450 err_free:
451 	up_read(&iopt->iova_rwsem);
452 	iopt_free_pages_list(pages_list);
453 	return rc;
454 }
455 
456 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
457 				 unsigned long last, unsigned long *unmapped)
458 {
459 	struct iopt_area *area;
460 	unsigned long unmapped_bytes = 0;
461 	unsigned int tries = 0;
462 	int rc = -ENOENT;
463 
464 	/*
465 	 * The domains_rwsem must be held in read mode any time any area->pages
466 	 * is NULL. This prevents domain attach/detatch from running
467 	 * concurrently with cleaning up the area.
468 	 */
469 again:
470 	down_read(&iopt->domains_rwsem);
471 	down_write(&iopt->iova_rwsem);
472 	while ((area = iopt_area_iter_first(iopt, start, last))) {
473 		unsigned long area_last = iopt_area_last_iova(area);
474 		unsigned long area_first = iopt_area_iova(area);
475 		struct iopt_pages *pages;
476 
477 		/* Userspace should not race map/unmap's of the same area */
478 		if (!area->pages) {
479 			rc = -EBUSY;
480 			goto out_unlock_iova;
481 		}
482 
483 		if (area_first < start || area_last > last) {
484 			rc = -ENOENT;
485 			goto out_unlock_iova;
486 		}
487 
488 		if (area_first != start)
489 			tries = 0;
490 
491 		/*
492 		 * num_accesses writers must hold the iova_rwsem too, so we can
493 		 * safely read it under the write side of the iovam_rwsem
494 		 * without the pages->mutex.
495 		 */
496 		if (area->num_accesses) {
497 			size_t length = iopt_area_length(area);
498 
499 			start = area_first;
500 			area->prevent_access = true;
501 			up_write(&iopt->iova_rwsem);
502 			up_read(&iopt->domains_rwsem);
503 
504 			iommufd_access_notify_unmap(iopt, area_first, length);
505 			/* Something is not responding to unmap requests. */
506 			tries++;
507 			if (WARN_ON(tries > 100))
508 				return -EDEADLOCK;
509 			goto again;
510 		}
511 
512 		pages = area->pages;
513 		area->pages = NULL;
514 		up_write(&iopt->iova_rwsem);
515 
516 		iopt_area_unfill_domains(area, pages);
517 		iopt_abort_area(area);
518 		iopt_put_pages(pages);
519 
520 		unmapped_bytes += area_last - area_first + 1;
521 
522 		down_write(&iopt->iova_rwsem);
523 	}
524 	if (unmapped_bytes)
525 		rc = 0;
526 
527 out_unlock_iova:
528 	up_write(&iopt->iova_rwsem);
529 	up_read(&iopt->domains_rwsem);
530 	if (unmapped)
531 		*unmapped = unmapped_bytes;
532 	return rc;
533 }
534 
535 /**
536  * iopt_unmap_iova() - Remove a range of iova
537  * @iopt: io_pagetable to act on
538  * @iova: Starting iova to unmap
539  * @length: Number of bytes to unmap
540  * @unmapped: Return number of bytes unmapped
541  *
542  * The requested range must be a superset of existing ranges.
543  * Splitting/truncating IOVA mappings is not allowed.
544  */
545 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
546 		    unsigned long length, unsigned long *unmapped)
547 {
548 	unsigned long iova_last;
549 
550 	if (!length)
551 		return -EINVAL;
552 
553 	if (check_add_overflow(iova, length - 1, &iova_last))
554 		return -EOVERFLOW;
555 
556 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
557 }
558 
559 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
560 {
561 	int rc;
562 
563 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
564 	/* If the IOVAs are empty then unmap all succeeds */
565 	if (rc == -ENOENT)
566 		return 0;
567 	return rc;
568 }
569 
570 /* The caller must always free all the nodes in the allowed_iova rb_root. */
571 int iopt_set_allow_iova(struct io_pagetable *iopt,
572 			struct rb_root_cached *allowed_iova)
573 {
574 	struct iopt_allowed *allowed;
575 
576 	down_write(&iopt->iova_rwsem);
577 	swap(*allowed_iova, iopt->allowed_itree);
578 
579 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
580 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
581 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
582 					     allowed->node.last)) {
583 			swap(*allowed_iova, iopt->allowed_itree);
584 			up_write(&iopt->iova_rwsem);
585 			return -EADDRINUSE;
586 		}
587 	}
588 	up_write(&iopt->iova_rwsem);
589 	return 0;
590 }
591 
592 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
593 		      unsigned long last, void *owner)
594 {
595 	struct iopt_reserved *reserved;
596 
597 	lockdep_assert_held_write(&iopt->iova_rwsem);
598 
599 	if (iopt_area_iter_first(iopt, start, last) ||
600 	    iopt_allowed_iter_first(iopt, start, last))
601 		return -EADDRINUSE;
602 
603 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
604 	if (!reserved)
605 		return -ENOMEM;
606 	reserved->node.start = start;
607 	reserved->node.last = last;
608 	reserved->owner = owner;
609 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
610 	return 0;
611 }
612 
613 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
614 {
615 	struct iopt_reserved *reserved, *next;
616 
617 	lockdep_assert_held_write(&iopt->iova_rwsem);
618 
619 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
620 	     reserved = next) {
621 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
622 
623 		if (reserved->owner == owner) {
624 			interval_tree_remove(&reserved->node,
625 					     &iopt->reserved_itree);
626 			kfree(reserved);
627 		}
628 	}
629 }
630 
631 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
632 {
633 	down_write(&iopt->iova_rwsem);
634 	__iopt_remove_reserved_iova(iopt, owner);
635 	up_write(&iopt->iova_rwsem);
636 }
637 
638 void iopt_init_table(struct io_pagetable *iopt)
639 {
640 	init_rwsem(&iopt->iova_rwsem);
641 	init_rwsem(&iopt->domains_rwsem);
642 	iopt->area_itree = RB_ROOT_CACHED;
643 	iopt->allowed_itree = RB_ROOT_CACHED;
644 	iopt->reserved_itree = RB_ROOT_CACHED;
645 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
646 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
647 
648 	/*
649 	 * iopt's start as SW tables that can use the entire size_t IOVA space
650 	 * due to the use of size_t in the APIs. They have no alignment
651 	 * restriction.
652 	 */
653 	iopt->iova_alignment = 1;
654 }
655 
656 void iopt_destroy_table(struct io_pagetable *iopt)
657 {
658 	struct interval_tree_node *node;
659 
660 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
661 		iopt_remove_reserved_iova(iopt, NULL);
662 
663 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
664 						ULONG_MAX))) {
665 		interval_tree_remove(node, &iopt->allowed_itree);
666 		kfree(container_of(node, struct iopt_allowed, node));
667 	}
668 
669 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
670 	WARN_ON(!xa_empty(&iopt->domains));
671 	WARN_ON(!xa_empty(&iopt->access_list));
672 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
673 }
674 
675 /**
676  * iopt_unfill_domain() - Unfill a domain with PFNs
677  * @iopt: io_pagetable to act on
678  * @domain: domain to unfill
679  *
680  * This is used when removing a domain from the iopt. Every area in the iopt
681  * will be unmapped from the domain. The domain must already be removed from the
682  * domains xarray.
683  */
684 static void iopt_unfill_domain(struct io_pagetable *iopt,
685 			       struct iommu_domain *domain)
686 {
687 	struct iopt_area *area;
688 
689 	lockdep_assert_held(&iopt->iova_rwsem);
690 	lockdep_assert_held_write(&iopt->domains_rwsem);
691 
692 	/*
693 	 * Some other domain is holding all the pfns still, rapidly unmap this
694 	 * domain.
695 	 */
696 	if (iopt->next_domain_id != 0) {
697 		/* Pick an arbitrary remaining domain to act as storage */
698 		struct iommu_domain *storage_domain =
699 			xa_load(&iopt->domains, 0);
700 
701 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
702 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
703 			struct iopt_pages *pages = area->pages;
704 
705 			if (!pages)
706 				continue;
707 
708 			mutex_lock(&pages->mutex);
709 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
710 				WARN_ON(!area->storage_domain);
711 			if (area->storage_domain == domain)
712 				area->storage_domain = storage_domain;
713 			mutex_unlock(&pages->mutex);
714 
715 			iopt_area_unmap_domain(area, domain);
716 		}
717 		return;
718 	}
719 
720 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
721 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
722 		struct iopt_pages *pages = area->pages;
723 
724 		if (!pages)
725 			continue;
726 
727 		mutex_lock(&pages->mutex);
728 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
729 		WARN_ON(area->storage_domain != domain);
730 		area->storage_domain = NULL;
731 		iopt_area_unfill_domain(area, pages, domain);
732 		mutex_unlock(&pages->mutex);
733 	}
734 }
735 
736 /**
737  * iopt_fill_domain() - Fill a domain with PFNs
738  * @iopt: io_pagetable to act on
739  * @domain: domain to fill
740  *
741  * Fill the domain with PFNs from every area in the iopt. On failure the domain
742  * is left unchanged.
743  */
744 static int iopt_fill_domain(struct io_pagetable *iopt,
745 			    struct iommu_domain *domain)
746 {
747 	struct iopt_area *end_area;
748 	struct iopt_area *area;
749 	int rc;
750 
751 	lockdep_assert_held(&iopt->iova_rwsem);
752 	lockdep_assert_held_write(&iopt->domains_rwsem);
753 
754 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
755 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
756 		struct iopt_pages *pages = area->pages;
757 
758 		if (!pages)
759 			continue;
760 
761 		mutex_lock(&pages->mutex);
762 		rc = iopt_area_fill_domain(area, domain);
763 		if (rc) {
764 			mutex_unlock(&pages->mutex);
765 			goto out_unfill;
766 		}
767 		if (!area->storage_domain) {
768 			WARN_ON(iopt->next_domain_id != 0);
769 			area->storage_domain = domain;
770 			interval_tree_insert(&area->pages_node,
771 					     &pages->domains_itree);
772 		}
773 		mutex_unlock(&pages->mutex);
774 	}
775 	return 0;
776 
777 out_unfill:
778 	end_area = area;
779 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
780 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
781 		struct iopt_pages *pages = area->pages;
782 
783 		if (area == end_area)
784 			break;
785 		if (!pages)
786 			continue;
787 		mutex_lock(&pages->mutex);
788 		if (iopt->next_domain_id == 0) {
789 			interval_tree_remove(&area->pages_node,
790 					     &pages->domains_itree);
791 			area->storage_domain = NULL;
792 		}
793 		iopt_area_unfill_domain(area, pages, domain);
794 		mutex_unlock(&pages->mutex);
795 	}
796 	return rc;
797 }
798 
799 /* All existing area's conform to an increased page size */
800 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
801 				     unsigned long new_iova_alignment)
802 {
803 	unsigned long align_mask = new_iova_alignment - 1;
804 	struct iopt_area *area;
805 
806 	lockdep_assert_held(&iopt->iova_rwsem);
807 	lockdep_assert_held(&iopt->domains_rwsem);
808 
809 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
810 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
811 		if ((iopt_area_iova(area) & align_mask) ||
812 		    (iopt_area_length(area) & align_mask) ||
813 		    (area->page_offset & align_mask))
814 			return -EADDRINUSE;
815 
816 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
817 		struct iommufd_access *access;
818 		unsigned long index;
819 
820 		xa_for_each(&iopt->access_list, index, access)
821 			if (WARN_ON(access->iova_alignment >
822 				    new_iova_alignment))
823 				return -EADDRINUSE;
824 	}
825 	return 0;
826 }
827 
828 int iopt_table_add_domain(struct io_pagetable *iopt,
829 			  struct iommu_domain *domain)
830 {
831 	const struct iommu_domain_geometry *geometry = &domain->geometry;
832 	struct iommu_domain *iter_domain;
833 	unsigned int new_iova_alignment;
834 	unsigned long index;
835 	int rc;
836 
837 	down_write(&iopt->domains_rwsem);
838 	down_write(&iopt->iova_rwsem);
839 
840 	xa_for_each(&iopt->domains, index, iter_domain) {
841 		if (WARN_ON(iter_domain == domain)) {
842 			rc = -EEXIST;
843 			goto out_unlock;
844 		}
845 	}
846 
847 	/*
848 	 * The io page size drives the iova_alignment. Internally the iopt_pages
849 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
850 	 * objects into the iommu_domain.
851 	 *
852 	 * A iommu_domain must always be able to accept PAGE_SIZE to be
853 	 * compatible as we can't guarantee higher contiguity.
854 	 */
855 	new_iova_alignment = max_t(unsigned long,
856 				   1UL << __ffs(domain->pgsize_bitmap),
857 				   iopt->iova_alignment);
858 	if (new_iova_alignment > PAGE_SIZE) {
859 		rc = -EINVAL;
860 		goto out_unlock;
861 	}
862 	if (new_iova_alignment != iopt->iova_alignment) {
863 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
864 		if (rc)
865 			goto out_unlock;
866 	}
867 
868 	/* No area exists that is outside the allowed domain aperture */
869 	if (geometry->aperture_start != 0) {
870 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
871 				       domain);
872 		if (rc)
873 			goto out_reserved;
874 	}
875 	if (geometry->aperture_end != ULONG_MAX) {
876 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
877 				       ULONG_MAX, domain);
878 		if (rc)
879 			goto out_reserved;
880 	}
881 
882 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
883 	if (rc)
884 		goto out_reserved;
885 
886 	rc = iopt_fill_domain(iopt, domain);
887 	if (rc)
888 		goto out_release;
889 
890 	iopt->iova_alignment = new_iova_alignment;
891 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
892 	iopt->next_domain_id++;
893 	up_write(&iopt->iova_rwsem);
894 	up_write(&iopt->domains_rwsem);
895 	return 0;
896 out_release:
897 	xa_release(&iopt->domains, iopt->next_domain_id);
898 out_reserved:
899 	__iopt_remove_reserved_iova(iopt, domain);
900 out_unlock:
901 	up_write(&iopt->iova_rwsem);
902 	up_write(&iopt->domains_rwsem);
903 	return rc;
904 }
905 
906 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
907 {
908 	unsigned long new_iova_alignment;
909 	struct iommufd_access *access;
910 	struct iommu_domain *domain;
911 	unsigned long index;
912 
913 	lockdep_assert_held_write(&iopt->iova_rwsem);
914 	lockdep_assert_held(&iopt->domains_rwsem);
915 
916 	/* See batch_iommu_map_small() */
917 	if (iopt->disable_large_pages)
918 		new_iova_alignment = PAGE_SIZE;
919 	else
920 		new_iova_alignment = 1;
921 
922 	xa_for_each(&iopt->domains, index, domain)
923 		new_iova_alignment = max_t(unsigned long,
924 					   1UL << __ffs(domain->pgsize_bitmap),
925 					   new_iova_alignment);
926 	xa_for_each(&iopt->access_list, index, access)
927 		new_iova_alignment = max_t(unsigned long,
928 					   access->iova_alignment,
929 					   new_iova_alignment);
930 
931 	if (new_iova_alignment > iopt->iova_alignment) {
932 		int rc;
933 
934 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
935 		if (rc)
936 			return rc;
937 	}
938 	iopt->iova_alignment = new_iova_alignment;
939 	return 0;
940 }
941 
942 void iopt_table_remove_domain(struct io_pagetable *iopt,
943 			      struct iommu_domain *domain)
944 {
945 	struct iommu_domain *iter_domain = NULL;
946 	unsigned long index;
947 
948 	down_write(&iopt->domains_rwsem);
949 	down_write(&iopt->iova_rwsem);
950 
951 	xa_for_each(&iopt->domains, index, iter_domain)
952 		if (iter_domain == domain)
953 			break;
954 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
955 		goto out_unlock;
956 
957 	/*
958 	 * Compress the xarray to keep it linear by swapping the entry to erase
959 	 * with the tail entry and shrinking the tail.
960 	 */
961 	iopt->next_domain_id--;
962 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
963 	if (index != iopt->next_domain_id)
964 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
965 
966 	iopt_unfill_domain(iopt, domain);
967 	__iopt_remove_reserved_iova(iopt, domain);
968 
969 	WARN_ON(iopt_calculate_iova_alignment(iopt));
970 out_unlock:
971 	up_write(&iopt->iova_rwsem);
972 	up_write(&iopt->domains_rwsem);
973 }
974 
975 /**
976  * iopt_area_split - Split an area into two parts at iova
977  * @area: The area to split
978  * @iova: Becomes the last of a new area
979  *
980  * This splits an area into two. It is part of the VFIO compatibility to allow
981  * poking a hole in the mapping. The two areas continue to point at the same
982  * iopt_pages, just with different starting bytes.
983  */
984 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
985 {
986 	unsigned long alignment = area->iopt->iova_alignment;
987 	unsigned long last_iova = iopt_area_last_iova(area);
988 	unsigned long start_iova = iopt_area_iova(area);
989 	unsigned long new_start = iova + 1;
990 	struct io_pagetable *iopt = area->iopt;
991 	struct iopt_pages *pages = area->pages;
992 	struct iopt_area *lhs;
993 	struct iopt_area *rhs;
994 	int rc;
995 
996 	lockdep_assert_held_write(&iopt->iova_rwsem);
997 
998 	if (iova == start_iova || iova == last_iova)
999 		return 0;
1000 
1001 	if (!pages || area->prevent_access)
1002 		return -EBUSY;
1003 
1004 	if (new_start & (alignment - 1) ||
1005 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1006 		return -EINVAL;
1007 
1008 	lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
1009 	if (!lhs)
1010 		return -ENOMEM;
1011 
1012 	rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
1013 	if (!rhs) {
1014 		rc = -ENOMEM;
1015 		goto err_free_lhs;
1016 	}
1017 
1018 	mutex_lock(&pages->mutex);
1019 	/*
1020 	 * Splitting is not permitted if an access exists, we don't track enough
1021 	 * information to split existing accesses.
1022 	 */
1023 	if (area->num_accesses) {
1024 		rc = -EINVAL;
1025 		goto err_unlock;
1026 	}
1027 
1028 	/*
1029 	 * Splitting is not permitted if a domain could have been mapped with
1030 	 * huge pages.
1031 	 */
1032 	if (area->storage_domain && !iopt->disable_large_pages) {
1033 		rc = -EINVAL;
1034 		goto err_unlock;
1035 	}
1036 
1037 	interval_tree_remove(&area->node, &iopt->area_itree);
1038 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1039 			      iopt_area_start_byte(area, start_iova),
1040 			      (new_start - 1) - start_iova + 1,
1041 			      area->iommu_prot);
1042 	if (WARN_ON(rc))
1043 		goto err_insert;
1044 
1045 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1046 			      iopt_area_start_byte(area, new_start),
1047 			      last_iova - new_start + 1, area->iommu_prot);
1048 	if (WARN_ON(rc))
1049 		goto err_remove_lhs;
1050 
1051 	lhs->storage_domain = area->storage_domain;
1052 	lhs->pages = area->pages;
1053 	rhs->storage_domain = area->storage_domain;
1054 	rhs->pages = area->pages;
1055 	kref_get(&rhs->pages->kref);
1056 	kfree(area);
1057 	mutex_unlock(&pages->mutex);
1058 
1059 	/*
1060 	 * No change to domains or accesses because the pages hasn't been
1061 	 * changed
1062 	 */
1063 	return 0;
1064 
1065 err_remove_lhs:
1066 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1067 err_insert:
1068 	interval_tree_insert(&area->node, &iopt->area_itree);
1069 err_unlock:
1070 	mutex_unlock(&pages->mutex);
1071 	kfree(rhs);
1072 err_free_lhs:
1073 	kfree(lhs);
1074 	return rc;
1075 }
1076 
1077 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1078 		  size_t num_iovas)
1079 {
1080 	int rc = 0;
1081 	int i;
1082 
1083 	down_write(&iopt->iova_rwsem);
1084 	for (i = 0; i < num_iovas; i++) {
1085 		struct iopt_area *area;
1086 
1087 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1088 		if (!area)
1089 			continue;
1090 		rc = iopt_area_split(area, iovas[i]);
1091 		if (rc)
1092 			break;
1093 	}
1094 	up_write(&iopt->iova_rwsem);
1095 	return rc;
1096 }
1097 
1098 void iopt_enable_large_pages(struct io_pagetable *iopt)
1099 {
1100 	int rc;
1101 
1102 	down_write(&iopt->domains_rwsem);
1103 	down_write(&iopt->iova_rwsem);
1104 	WRITE_ONCE(iopt->disable_large_pages, false);
1105 	rc = iopt_calculate_iova_alignment(iopt);
1106 	WARN_ON(rc);
1107 	up_write(&iopt->iova_rwsem);
1108 	up_write(&iopt->domains_rwsem);
1109 }
1110 
1111 int iopt_disable_large_pages(struct io_pagetable *iopt)
1112 {
1113 	int rc = 0;
1114 
1115 	down_write(&iopt->domains_rwsem);
1116 	down_write(&iopt->iova_rwsem);
1117 	if (iopt->disable_large_pages)
1118 		goto out_unlock;
1119 
1120 	/* Won't do it if domains already have pages mapped in them */
1121 	if (!xa_empty(&iopt->domains) &&
1122 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1123 		rc = -EINVAL;
1124 		goto out_unlock;
1125 	}
1126 
1127 	WRITE_ONCE(iopt->disable_large_pages, true);
1128 	rc = iopt_calculate_iova_alignment(iopt);
1129 	if (rc)
1130 		WRITE_ONCE(iopt->disable_large_pages, false);
1131 out_unlock:
1132 	up_write(&iopt->iova_rwsem);
1133 	up_write(&iopt->domains_rwsem);
1134 	return rc;
1135 }
1136 
1137 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1138 {
1139 	int rc;
1140 
1141 	down_write(&iopt->domains_rwsem);
1142 	down_write(&iopt->iova_rwsem);
1143 	rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
1144 		      xa_limit_16b, GFP_KERNEL_ACCOUNT);
1145 	if (rc)
1146 		goto out_unlock;
1147 
1148 	rc = iopt_calculate_iova_alignment(iopt);
1149 	if (rc) {
1150 		xa_erase(&iopt->access_list, access->iopt_access_list_id);
1151 		goto out_unlock;
1152 	}
1153 
1154 out_unlock:
1155 	up_write(&iopt->iova_rwsem);
1156 	up_write(&iopt->domains_rwsem);
1157 	return rc;
1158 }
1159 
1160 void iopt_remove_access(struct io_pagetable *iopt,
1161 			struct iommufd_access *access,
1162 			u32 iopt_access_list_id)
1163 {
1164 	down_write(&iopt->domains_rwsem);
1165 	down_write(&iopt->iova_rwsem);
1166 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1167 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1168 	up_write(&iopt->iova_rwsem);
1169 	up_write(&iopt->domains_rwsem);
1170 }
1171 
1172 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1173 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1174 					struct device *dev,
1175 					phys_addr_t *sw_msi_start)
1176 {
1177 	struct iommu_resv_region *resv;
1178 	LIST_HEAD(resv_regions);
1179 	unsigned int num_hw_msi = 0;
1180 	unsigned int num_sw_msi = 0;
1181 	int rc;
1182 
1183 	if (iommufd_should_fail())
1184 		return -EINVAL;
1185 
1186 	down_write(&iopt->iova_rwsem);
1187 	/* FIXME: drivers allocate memory but there is no failure propogated */
1188 	iommu_get_resv_regions(dev, &resv_regions);
1189 
1190 	list_for_each_entry(resv, &resv_regions, list) {
1191 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1192 			continue;
1193 
1194 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1195 			num_hw_msi++;
1196 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1197 			*sw_msi_start = resv->start;
1198 			num_sw_msi++;
1199 		}
1200 
1201 		rc = iopt_reserve_iova(iopt, resv->start,
1202 				       resv->length - 1 + resv->start, dev);
1203 		if (rc)
1204 			goto out_reserved;
1205 	}
1206 
1207 	/* Drivers must offer sane combinations of regions */
1208 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1209 		rc = -EINVAL;
1210 		goto out_reserved;
1211 	}
1212 
1213 	rc = 0;
1214 	goto out_free_resv;
1215 
1216 out_reserved:
1217 	__iopt_remove_reserved_iova(iopt, dev);
1218 out_free_resv:
1219 	iommu_put_resv_regions(dev, &resv_regions);
1220 	up_write(&iopt->iova_rwsem);
1221 	return rc;
1222 }
1223