xref: /linux/drivers/iommu/iommufd/io_pagetable.c (revision a2e33fb92649f4efcc9e81c1d1a1905ca2a76d03)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of the data structure that maps IOVAs to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The data structure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and the xarray.
10  */
11 #include <linux/err.h>
12 #include <linux/errno.h>
13 #include <linux/iommu.h>
14 #include <linux/iommufd.h>
15 #include <linux/lockdep.h>
16 #include <linux/sched/mm.h>
17 #include <linux/slab.h>
18 #include <uapi/linux/iommufd.h>
19 
20 #include "double_span.h"
21 #include "io_pagetable.h"
22 
23 struct iopt_pages_list {
24 	struct iopt_pages *pages;
25 	struct iopt_area *area;
26 	struct list_head next;
27 	unsigned long start_byte;
28 	unsigned long length;
29 };
30 
31 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
32 					struct io_pagetable *iopt,
33 					unsigned long iova,
34 					unsigned long last_iova)
35 {
36 	lockdep_assert_held(&iopt->iova_rwsem);
37 
38 	iter->cur_iova = iova;
39 	iter->last_iova = last_iova;
40 	iter->area = iopt_area_iter_first(iopt, iova, iova);
41 	if (!iter->area)
42 		return NULL;
43 	if (!iter->area->pages) {
44 		iter->area = NULL;
45 		return NULL;
46 	}
47 	return iter->area;
48 }
49 
50 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
51 {
52 	unsigned long last_iova;
53 
54 	if (!iter->area)
55 		return NULL;
56 	last_iova = iopt_area_last_iova(iter->area);
57 	if (iter->last_iova <= last_iova)
58 		return NULL;
59 
60 	iter->cur_iova = last_iova + 1;
61 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
62 					 iter->last_iova);
63 	if (!iter->area)
64 		return NULL;
65 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
66 	    !iter->area->pages) {
67 		iter->area = NULL;
68 		return NULL;
69 	}
70 	return iter->area;
71 }
72 
73 static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
74 				     unsigned long length,
75 				     unsigned long iova_alignment,
76 				     unsigned long page_offset)
77 {
78 	unsigned long aligned_start;
79 
80 	/* ALIGN_UP() */
81 	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
82 		return false;
83 	aligned_start &= ~(iova_alignment - 1);
84 	aligned_start |= page_offset;
85 
86 	if (aligned_start >= last || last - aligned_start < length - 1)
87 		return false;
88 	*start = aligned_start;
89 	return true;
90 }
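/*
 * Worked example (editor's illustration): with *start = 0x10500,
 * iova_alignment = 0x1000 and page_offset = 0x500, the overflow-checked
 * round up gives 0x11000 and OR-ing in the page offset yields
 * aligned_start = 0x11500. Carrying the sub-page offset of the backing
 * memory into the IOVA keeps the allocation page-congruent with it.
 */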
91 
92 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
93 				    unsigned long length,
94 				    unsigned long iova_alignment,
95 				    unsigned long page_offset)
96 {
97 	if (span->is_used)
98 		return false;
99 	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
100 					length, iova_alignment, page_offset);
101 }
102 
103 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
104 				    unsigned long length,
105 				    unsigned long iova_alignment,
106 				    unsigned long page_offset)
107 {
108 	if (span->is_hole)
109 		return false;
110 	return __alloc_iova_check_range(&span->start_used, span->last_used,
111 					length, iova_alignment, page_offset);
112 }
113 
114 /*
115  * Automatically find a block of IOVA space that is not being used and not reserved.
116  * Does not return a 0 IOVA even if it is valid.
117  */
118 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
119 			   unsigned long addr, unsigned long length)
120 {
121 	unsigned long page_offset = addr % PAGE_SIZE;
122 	struct interval_tree_double_span_iter used_span;
123 	struct interval_tree_span_iter allowed_span;
124 	unsigned long max_alignment = PAGE_SIZE;
125 	unsigned long iova_alignment;
126 
127 	lockdep_assert_held(&iopt->iova_rwsem);
128 
129 	/* Protect roundup_pow_of_two() from overflow */
130 	if (length == 0 || length >= ULONG_MAX / 2)
131 		return -EOVERFLOW;
132 
133 	/*
134 	 * Keep alignment present in addr when building the IOVA, which
135 	 * increases the chance we can map a THP.
136 	 */
137 	if (!addr)
138 		iova_alignment = roundup_pow_of_two(length);
139 	else
140 		iova_alignment = min_t(unsigned long,
141 				       roundup_pow_of_two(length),
142 				       1UL << __ffs64(addr));
143 
144 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
145 	max_alignment = HPAGE_SIZE;
146 #endif
147 	/* Protect against ALIGN() overflow */
148 	if (iova_alignment >= max_alignment)
149 		iova_alignment = max_alignment;
150 
151 	if (iova_alignment < iopt->iova_alignment)
152 		return -EINVAL;
153 
154 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
155 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
156 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
157 			allowed_span.start_used = PAGE_SIZE;
158 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
159 			allowed_span.is_hole = false;
160 		}
161 
162 		if (!__alloc_iova_check_used(&allowed_span, length,
163 					     iova_alignment, page_offset))
164 			continue;
165 
166 		interval_tree_for_each_double_span(
167 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
168 			allowed_span.start_used, allowed_span.last_used) {
169 			if (!__alloc_iova_check_hole(&used_span, length,
170 						     iova_alignment,
171 						     page_offset))
172 				continue;
173 
174 			*iova = used_span.start_hole;
175 			return 0;
176 		}
177 	}
178 	return -ENOSPC;
179 }
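/*
 * Illustrative numbers for the alignment heuristic above (assumed values):
 * a 0x21000 byte map of user address 0x7f1234562000 gives
 * roundup_pow_of_two(0x21000) = 0x40000 and
 * 1UL << __ffs64(0x7f1234562000) = 0x2000, so iova_alignment = 0x2000. A
 * 0x400000 byte map of a 2MiB aligned address would get 0x200000 instead,
 * which the HPAGE_SIZE cap leaves intact on x86 and which preserves the
 * chance of a THP-sized IOMMU mapping.
 */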
180 
181 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
182 			   unsigned long length)
183 {
184 	unsigned long last;
185 
186 	lockdep_assert_held(&iopt->iova_rwsem);
187 
188 	if ((iova & (iopt->iova_alignment - 1)))
189 		return -EINVAL;
190 
191 	if (check_add_overflow(iova, length - 1, &last))
192 		return -EOVERFLOW;
193 
194 	/* No reserved IOVA intersects the range */
195 	if (iopt_reserved_iter_first(iopt, iova, last))
196 		return -EINVAL;
197 
198 	/* Check that there is not already a mapping in the range */
199 	if (iopt_area_iter_first(iopt, iova, last))
200 		return -EEXIST;
201 	return 0;
202 }
203 
204 /*
205  * The area takes a slice of the pages from start_byte to start_byte + length
206  */
207 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
208 			    struct iopt_pages *pages, unsigned long iova,
209 			    unsigned long start_byte, unsigned long length,
210 			    int iommu_prot)
211 {
212 	lockdep_assert_held_write(&iopt->iova_rwsem);
213 
214 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
215 		return -EPERM;
216 
217 	area->iommu_prot = iommu_prot;
218 	area->page_offset = start_byte % PAGE_SIZE;
219 	if (area->page_offset & (iopt->iova_alignment - 1))
220 		return -EINVAL;
221 
222 	area->node.start = iova;
223 	if (check_add_overflow(iova, length - 1, &area->node.last))
224 		return -EOVERFLOW;
225 
226 	area->pages_node.start = start_byte / PAGE_SIZE;
227 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
228 		return -EOVERFLOW;
229 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
230 	if (WARN_ON(area->pages_node.last >= pages->npages))
231 		return -EOVERFLOW;
232 
233 	/*
234 	 * The area is inserted with a NULL pages indicating it is not fully
235 	 * initialized yet.
236 	 */
237 	area->iopt = iopt;
238 	interval_tree_insert(&area->node, &iopt->area_itree);
239 	return 0;
240 }
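/*
 * Index arithmetic sketch (hypothetical values): for start_byte = 0x3000 and
 * length = 0x5000 with 4KiB pages, the area covers bytes 0x3000..0x7fff of
 * the iopt_pages, so pages_node spans page indexes 3 through 7 inclusive
 * once the area is filled into a domain and linked into domains_itree.
 */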
241 
242 static struct iopt_area *iopt_area_alloc(void)
243 {
244 	struct iopt_area *area;
245 
246 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
247 	if (!area)
248 		return NULL;
249 	RB_CLEAR_NODE(&area->node.rb);
250 	RB_CLEAR_NODE(&area->pages_node.rb);
251 	return area;
252 }
253 
254 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
255 				 struct list_head *pages_list,
256 				 unsigned long length, unsigned long *dst_iova,
257 				 int iommu_prot, unsigned int flags)
258 {
259 	struct iopt_pages_list *elm;
260 	unsigned long start;
261 	unsigned long iova;
262 	int rc = 0;
263 
264 	list_for_each_entry(elm, pages_list, next) {
265 		elm->area = iopt_area_alloc();
266 		if (!elm->area)
267 			return -ENOMEM;
268 	}
269 
270 	down_write(&iopt->iova_rwsem);
271 	if ((length & (iopt->iova_alignment - 1)) || !length) {
272 		rc = -EINVAL;
273 		goto out_unlock;
274 	}
275 
276 	if (flags & IOPT_ALLOC_IOVA) {
277 		/* Use the first entry to guess the ideal IOVA alignment */
278 		elm = list_first_entry(pages_list, struct iopt_pages_list,
279 				       next);
280 		switch (elm->pages->type) {
281 		case IOPT_ADDRESS_USER:
282 			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
283 			break;
284 		case IOPT_ADDRESS_FILE:
285 			start = elm->start_byte + elm->pages->start;
286 			break;
287 		}
288 		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
289 		if (rc)
290 			goto out_unlock;
291 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
292 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
293 			rc = -EINVAL;
294 			goto out_unlock;
295 		}
296 	} else {
297 		rc = iopt_check_iova(iopt, *dst_iova, length);
298 		if (rc)
299 			goto out_unlock;
300 	}
301 
302 	/*
303 	 * Areas are created with a NULL pages so that the IOVA space is
304 	 * reserved and we can unlock the iova_rwsem.
305 	 */
306 	iova = *dst_iova;
307 	list_for_each_entry(elm, pages_list, next) {
308 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
309 				      elm->start_byte, elm->length, iommu_prot);
310 		if (rc)
311 			goto out_unlock;
312 		iova += elm->length;
313 	}
314 
315 out_unlock:
316 	up_write(&iopt->iova_rwsem);
317 	return rc;
318 }
319 
320 static void iopt_abort_area(struct iopt_area *area)
321 {
322 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
323 		WARN_ON(area->pages);
324 	if (area->iopt) {
325 		down_write(&area->iopt->iova_rwsem);
326 		interval_tree_remove(&area->node, &area->iopt->area_itree);
327 		up_write(&area->iopt->iova_rwsem);
328 	}
329 	kfree(area);
330 }
331 
332 void iopt_free_pages_list(struct list_head *pages_list)
333 {
334 	struct iopt_pages_list *elm;
335 
336 	while ((elm = list_first_entry_or_null(pages_list,
337 					       struct iopt_pages_list, next))) {
338 		if (elm->area)
339 			iopt_abort_area(elm->area);
340 		if (elm->pages)
341 			iopt_put_pages(elm->pages);
342 		list_del(&elm->next);
343 		kfree(elm);
344 	}
345 }
346 
347 static int iopt_fill_domains_pages(struct list_head *pages_list)
348 {
349 	struct iopt_pages_list *undo_elm;
350 	struct iopt_pages_list *elm;
351 	int rc;
352 
353 	list_for_each_entry(elm, pages_list, next) {
354 		rc = iopt_area_fill_domains(elm->area, elm->pages);
355 		if (rc)
356 			goto err_undo;
357 	}
358 	return 0;
359 
360 err_undo:
361 	list_for_each_entry(undo_elm, pages_list, next) {
362 		if (undo_elm == elm)
363 			break;
364 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
365 	}
366 	return rc;
367 }
368 
369 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
370 		   unsigned long length, unsigned long *dst_iova,
371 		   int iommu_prot, unsigned int flags)
372 {
373 	struct iopt_pages_list *elm;
374 	int rc;
375 
376 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
377 				   iommu_prot, flags);
378 	if (rc)
379 		return rc;
380 
381 	down_read(&iopt->domains_rwsem);
382 	rc = iopt_fill_domains_pages(pages_list);
383 	if (rc)
384 		goto out_unlock_domains;
385 
386 	down_write(&iopt->iova_rwsem);
387 	list_for_each_entry(elm, pages_list, next) {
388 		/*
389 		 * area->pages must be set inside the domains_rwsem to ensure
390 		 * any newly added domains will get filled. Moves the reference
391 		 * in from the list.
392 		 */
393 		elm->area->pages = elm->pages;
394 		elm->pages = NULL;
395 		elm->area = NULL;
396 	}
397 	up_write(&iopt->iova_rwsem);
398 out_unlock_domains:
399 	up_read(&iopt->domains_rwsem);
400 	return rc;
401 }
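/*
 * Editor's summary of the two-phase flow above (no new behaviour implied):
 *
 *	iopt_alloc_area_pages()   - reserve the IOVA, area->pages stays NULL
 *	iopt_fill_domains_pages() - map the PFNs into every attached domain
 *	elm->area->pages = ...    - publish under iova_rwsem
 *
 * Holding domains_rwsem for read across the fill and publish steps
 * guarantees a newly attached domain either observes area->pages == NULL or
 * a fully filled area.
 */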
402 
403 static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
404 			   struct iopt_pages *pages, unsigned long *iova,
405 			   unsigned long length, unsigned long start_byte,
406 			   int iommu_prot, unsigned int flags)
407 {
408 	struct iopt_pages_list elm = {};
409 	LIST_HEAD(pages_list);
410 	int rc;
411 
412 	elm.pages = pages;
413 	elm.start_byte = start_byte;
414 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
415 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
416 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
417 	elm.length = length;
418 	list_add(&elm.next, &pages_list);
419 
420 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
421 	if (rc) {
422 		if (elm.area)
423 			iopt_abort_area(elm.area);
424 		if (elm.pages)
425 			iopt_put_pages(elm.pages);
426 		return rc;
427 	}
428 	return 0;
429 }
430 
431 /**
432  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
433  * @ictx: iommufd_ctx the iopt is part of
434  * @iopt: io_pagetable to act on
435  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
436  *        the chosen iova on output. Otherwise it is the iova to map to on input
437  * @uptr: User VA to map
438  * @length: Number of bytes to map
439  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
440  * @flags: IOPT_ALLOC_IOVA or zero
441  *
442  * iova, uptr, and length must be aligned to iova_alignment. For domain-backed
443  * page tables this will pin the pages and load them into the domain at iova.
444  * For non-domain page tables this will only set up a lazy reference and the
445  * caller must use iopt_access_pages() to touch them.
446  *
447  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
448  * destroyed.
449  */
450 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
451 			unsigned long *iova, void __user *uptr,
452 			unsigned long length, int iommu_prot,
453 			unsigned int flags)
454 {
455 	struct iopt_pages *pages;
456 
457 	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
458 	if (IS_ERR(pages))
459 		return PTR_ERR(pages);
460 
461 	return iopt_map_common(ictx, iopt, pages, iova, length,
462 			       uptr - pages->uptr, iommu_prot, flags);
463 }
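/*
 * Minimal usage sketch (editor's illustration; ictx, iopt, buf and length
 * are hypothetical caller state): letting the allocator pick the IOVA for a
 * writable user buffer.
 *
 *	unsigned long iova;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, buf, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *
 * On success iova holds the address chosen by iopt_alloc_iova(); the mapping
 * is undone with iopt_unmap_iova() before the io_pagetable is destroyed.
 */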
464 
465 /**
466  * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
467  * @ictx: iommufd_ctx the iopt is part of
468  * @iopt: io_pagetable to act on
469  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
470  *        the chosen iova on output. Otherwise it is the iova to map to on input
471  * @file: file to map
472  * @start: map file starting at this byte offset
473  * @length: Number of bytes to map
474  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
475  * @flags: IOPT_ALLOC_IOVA or zero
476  */
477 int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
478 			unsigned long *iova, struct file *file,
479 			unsigned long start, unsigned long length,
480 			int iommu_prot, unsigned int flags)
481 {
482 	struct iopt_pages *pages;
483 
484 	pages = iopt_alloc_file_pages(file, start, length,
485 				      iommu_prot & IOMMU_WRITE);
486 	if (IS_ERR(pages))
487 		return PTR_ERR(pages);
488 	return iopt_map_common(ictx, iopt, pages, iova, length,
489 			       start - pages->start, iommu_prot, flags);
490 }
491 
492 struct iova_bitmap_fn_arg {
493 	unsigned long flags;
494 	struct io_pagetable *iopt;
495 	struct iommu_domain *domain;
496 	struct iommu_dirty_bitmap *dirty;
497 };
498 
499 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
500 					unsigned long iova, size_t length,
501 					void *opaque)
502 {
503 	struct iopt_area *area;
504 	struct iopt_area_contig_iter iter;
505 	struct iova_bitmap_fn_arg *arg = opaque;
506 	struct iommu_domain *domain = arg->domain;
507 	struct iommu_dirty_bitmap *dirty = arg->dirty;
508 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
509 	unsigned long last_iova = iova + length - 1;
510 	unsigned long flags = arg->flags;
511 	int ret;
512 
513 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
514 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
515 
516 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
517 						last - iter.cur_iova + 1, flags,
518 						dirty);
519 		if (ret)
520 			return ret;
521 	}
522 
523 	if (!iopt_area_contig_done(&iter))
524 		return -EINVAL;
525 	return 0;
526 }
527 
528 static int
529 iommu_read_and_clear_dirty(struct iommu_domain *domain,
530 			   struct io_pagetable *iopt, unsigned long flags,
531 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
532 {
533 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
534 	struct iommu_iotlb_gather gather;
535 	struct iommu_dirty_bitmap dirty;
536 	struct iova_bitmap_fn_arg arg;
537 	struct iova_bitmap *iter;
538 	int ret = 0;
539 
540 	if (!ops || !ops->read_and_clear_dirty)
541 		return -EOPNOTSUPP;
542 
543 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
544 				 bitmap->page_size,
545 				 u64_to_user_ptr(bitmap->data));
546 	if (IS_ERR(iter))
547 		return -ENOMEM;
548 
549 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
550 
551 	arg.flags = flags;
552 	arg.iopt = iopt;
553 	arg.domain = domain;
554 	arg.dirty = &dirty;
555 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
556 
557 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
558 		iommu_iotlb_sync(domain, &gather);
559 
560 	iova_bitmap_free(iter);
561 
562 	return ret;
563 }
564 
565 int iommufd_check_iova_range(struct io_pagetable *iopt,
566 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
567 {
568 	size_t iommu_pgsize = iopt->iova_alignment;
569 	u64 last_iova;
570 
571 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
572 		return -EOVERFLOW;
573 
574 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
575 		return -EOVERFLOW;
576 
577 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
578 	    ((last_iova + 1) & (iommu_pgsize - 1)))
579 		return -EINVAL;
580 
581 	if (!bitmap->page_size)
582 		return -EINVAL;
583 
584 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
585 	    ((last_iova + 1) & (bitmap->page_size - 1)))
586 		return -EINVAL;
587 
588 	return 0;
589 }
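/*
 * Example of the checks above with assumed values: iova = 0x1ff000,
 * length = 0x200000, page_size = 0x1000 passes (last_iova + 1 = 0x3ff000 is
 * 4KiB aligned), while length = 0x200800 fails because last_iova + 1 =
 * 0x3ff800 is not a multiple of page_size.
 */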
590 
591 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
592 				   struct iommu_domain *domain,
593 				   unsigned long flags,
594 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
595 {
596 	int ret;
597 
598 	ret = iommufd_check_iova_range(iopt, bitmap);
599 	if (ret)
600 		return ret;
601 
602 	down_read(&iopt->iova_rwsem);
603 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
604 	up_read(&iopt->iova_rwsem);
605 
606 	return ret;
607 }
608 
609 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
610 				 struct iommu_domain *domain)
611 {
612 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
613 	struct iommu_iotlb_gather gather;
614 	struct iommu_dirty_bitmap dirty;
615 	struct iopt_area *area;
616 	int ret = 0;
617 
618 	lockdep_assert_held_read(&iopt->iova_rwsem);
619 
620 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
621 
622 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
623 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
624 		if (!area->pages)
625 			continue;
626 
627 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
628 						iopt_area_length(area), 0,
629 						&dirty);
630 		if (ret)
631 			break;
632 	}
633 
634 	iommu_iotlb_sync(domain, &gather);
635 	return ret;
636 }
637 
638 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
639 			    struct iommu_domain *domain, bool enable)
640 {
641 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
642 	int ret = 0;
643 
644 	if (!ops)
645 		return -EOPNOTSUPP;
646 
647 	down_read(&iopt->iova_rwsem);
648 
649 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
650 	if (enable) {
651 		ret = iopt_clear_dirty_data(iopt, domain);
652 		if (ret)
653 			goto out_unlock;
654 	}
655 
656 	ret = ops->set_dirty_tracking(domain, enable);
657 
658 out_unlock:
659 	up_read(&iopt->iova_rwsem);
660 	return ret;
661 }
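/*
 * Dirty-tracking flow sketch (illustrative; domain and bitmap are assumed to
 * come from iommufd's HWPT dirty tracking ioctls):
 *
 *	rc = iopt_set_dirty_tracking(iopt, domain, true);
 *	...
 *	rc = iopt_read_and_clear_dirty_data(iopt, domain, 0, &bitmap);
 *
 * Enabling tracking clears stale dirty bits first, so the first bitmap read
 * only reports writes that happened after tracking was switched on.
 */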
662 
663 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
664 		   unsigned long length, struct list_head *pages_list)
665 {
666 	struct iopt_area_contig_iter iter;
667 	unsigned long last_iova;
668 	struct iopt_area *area;
669 	int rc;
670 
671 	if (!length)
672 		return -EINVAL;
673 	if (check_add_overflow(iova, length - 1, &last_iova))
674 		return -EOVERFLOW;
675 
676 	down_read(&iopt->iova_rwsem);
677 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
678 		struct iopt_pages_list *elm;
679 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
680 
681 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
682 		if (!elm) {
683 			rc = -ENOMEM;
684 			goto err_free;
685 		}
686 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
687 		elm->pages = area->pages;
688 		elm->length = (last - iter.cur_iova) + 1;
689 		kref_get(&elm->pages->kref);
690 		list_add_tail(&elm->next, pages_list);
691 	}
692 	if (!iopt_area_contig_done(&iter)) {
693 		rc = -ENOENT;
694 		goto err_free;
695 	}
696 	up_read(&iopt->iova_rwsem);
697 	return 0;
698 err_free:
699 	up_read(&iopt->iova_rwsem);
700 	iopt_free_pages_list(pages_list);
701 	return rc;
702 }
703 
704 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
705 				 unsigned long last, unsigned long *unmapped)
706 {
707 	struct iopt_area *area;
708 	unsigned long unmapped_bytes = 0;
709 	unsigned int tries = 0;
710 	/* If there are no mapped entries then success */
711 	int rc = 0;
712 
713 	/*
714 	 * The domains_rwsem must be held in read mode any time any area->pages
715 	 * is NULL. This prevents domain attach/detach from running
716 	 * concurrently with cleaning up the area.
717 	 */
718 again:
719 	down_read(&iopt->domains_rwsem);
720 	down_write(&iopt->iova_rwsem);
721 	while ((area = iopt_area_iter_first(iopt, start, last))) {
722 		unsigned long area_last = iopt_area_last_iova(area);
723 		unsigned long area_first = iopt_area_iova(area);
724 		struct iopt_pages *pages;
725 
726 		/* Userspace should not race map/unmaps of the same area */
727 		if (!area->pages) {
728 			rc = -EBUSY;
729 			goto out_unlock_iova;
730 		}
731 
732 		/* The area is locked by an object that has not been destroyed */
733 		if (area->num_locks) {
734 			rc = -EBUSY;
735 			goto out_unlock_iova;
736 		}
737 
738 		if (area_first < start || area_last > last) {
739 			rc = -ENOENT;
740 			goto out_unlock_iova;
741 		}
742 
743 		if (area_first != start)
744 			tries = 0;
745 
746 		/*
747 		 * num_accesses writers must hold the iova_rwsem too, so we can
748 		 * safely read it under the write side of the iova_rwsem
749 		 * without the pages->mutex.
750 		 */
751 		if (area->num_accesses) {
752 			size_t length = iopt_area_length(area);
753 
754 			start = area_first;
755 			area->prevent_access = true;
756 			up_write(&iopt->iova_rwsem);
757 			up_read(&iopt->domains_rwsem);
758 
759 			iommufd_access_notify_unmap(iopt, area_first, length);
760 			/* Something is not responding to unmap requests. */
761 			tries++;
762 			if (WARN_ON(tries > 100)) {
763 				rc = -EDEADLOCK;
764 				goto out_unmapped;
765 			}
766 			goto again;
767 		}
768 
769 		pages = area->pages;
770 		area->pages = NULL;
771 		up_write(&iopt->iova_rwsem);
772 
773 		iopt_area_unfill_domains(area, pages);
774 		iopt_abort_area(area);
775 		iopt_put_pages(pages);
776 
777 		unmapped_bytes += area_last - area_first + 1;
778 
779 		down_write(&iopt->iova_rwsem);
780 	}
781 
782 out_unlock_iova:
783 	up_write(&iopt->iova_rwsem);
784 	up_read(&iopt->domains_rwsem);
785 out_unmapped:
786 	if (unmapped)
787 		*unmapped = unmapped_bytes;
788 	return rc;
789 }
790 
791 /**
792  * iopt_unmap_iova() - Remove a range of iova
793  * @iopt: io_pagetable to act on
794  * @iova: Starting iova to unmap
795  * @length: Number of bytes to unmap
796  * @unmapped: Return number of bytes unmapped
797  *
798  * The requested range must be a superset of existing ranges.
799  * Splitting/truncating IOVA mappings is not allowed.
800  */
801 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
802 		    unsigned long length, unsigned long *unmapped)
803 {
804 	unsigned long iova_last;
805 
806 	if (!length)
807 		return -EINVAL;
808 
809 	if (check_add_overflow(iova, length - 1, &iova_last))
810 		return -EOVERFLOW;
811 
812 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
813 }
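/*
 * Counterpart sketch to the mapping example above (hypothetical caller
 * state):
 *
 *	unsigned long unmapped;
 *
 *	rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 *
 * Because splitting is refused here, a request that covers only part of an
 * area fails with -ENOENT; VFIO-style hole punching has to go through
 * iopt_cut_iova() first.
 */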
814 
815 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
816 {
817 	/* If the IOVAs are empty then unmap all succeeds */
818 	return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
819 }
820 
821 /* The caller must always free all the nodes in the allowed_iova rb_root. */
822 int iopt_set_allow_iova(struct io_pagetable *iopt,
823 			struct rb_root_cached *allowed_iova)
824 {
825 	struct iopt_allowed *allowed;
826 
827 	down_write(&iopt->iova_rwsem);
828 	swap(*allowed_iova, iopt->allowed_itree);
829 
830 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
831 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
832 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
833 					     allowed->node.last)) {
834 			swap(*allowed_iova, iopt->allowed_itree);
835 			up_write(&iopt->iova_rwsem);
836 			return -EADDRINUSE;
837 		}
838 	}
839 	up_write(&iopt->iova_rwsem);
840 	return 0;
841 }
842 
843 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
844 		      unsigned long last, void *owner)
845 {
846 	struct iopt_reserved *reserved;
847 
848 	lockdep_assert_held_write(&iopt->iova_rwsem);
849 
850 	if (iopt_area_iter_first(iopt, start, last) ||
851 	    iopt_allowed_iter_first(iopt, start, last))
852 		return -EADDRINUSE;
853 
854 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
855 	if (!reserved)
856 		return -ENOMEM;
857 	reserved->node.start = start;
858 	reserved->node.last = last;
859 	reserved->owner = owner;
860 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
861 	return 0;
862 }
863 
864 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
865 {
866 	struct iopt_reserved *reserved, *next;
867 
868 	lockdep_assert_held_write(&iopt->iova_rwsem);
869 
870 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
871 	     reserved = next) {
872 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
873 
874 		if (reserved->owner == owner) {
875 			interval_tree_remove(&reserved->node,
876 					     &iopt->reserved_itree);
877 			kfree(reserved);
878 		}
879 	}
880 }
881 
882 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
883 {
884 	down_write(&iopt->iova_rwsem);
885 	__iopt_remove_reserved_iova(iopt, owner);
886 	up_write(&iopt->iova_rwsem);
887 }
888 
889 void iopt_init_table(struct io_pagetable *iopt)
890 {
891 	init_rwsem(&iopt->iova_rwsem);
892 	init_rwsem(&iopt->domains_rwsem);
893 	iopt->area_itree = RB_ROOT_CACHED;
894 	iopt->allowed_itree = RB_ROOT_CACHED;
895 	iopt->reserved_itree = RB_ROOT_CACHED;
896 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
897 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
898 
899 	/*
900 	 * iopts start as SW tables that can use the entire size_t IOVA space
901 	 * due to the use of size_t in the APIs. They have no alignment
902 	 * restriction.
903 	 */
904 	iopt->iova_alignment = 1;
905 }
906 
907 void iopt_destroy_table(struct io_pagetable *iopt)
908 {
909 	struct interval_tree_node *node;
910 
911 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
912 		iopt_remove_reserved_iova(iopt, NULL);
913 
914 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
915 						ULONG_MAX))) {
916 		interval_tree_remove(node, &iopt->allowed_itree);
917 		kfree(container_of(node, struct iopt_allowed, node));
918 	}
919 
920 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
921 	WARN_ON(!xa_empty(&iopt->domains));
922 	WARN_ON(!xa_empty(&iopt->access_list));
923 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
924 }
925 
926 /**
927  * iopt_unfill_domain() - Unfill a domain with PFNs
928  * @iopt: io_pagetable to act on
929  * @domain: domain to unfill
930  *
931  * This is used when removing a domain from the iopt. Every area in the iopt
932  * will be unmapped from the domain. The domain must already be removed from the
933  * domains xarray.
934  */
935 static void iopt_unfill_domain(struct io_pagetable *iopt,
936 			       struct iommu_domain *domain)
937 {
938 	struct iopt_area *area;
939 
940 	lockdep_assert_held(&iopt->iova_rwsem);
941 	lockdep_assert_held_write(&iopt->domains_rwsem);
942 
943 	/*
944 	 * Some other domain is still holding all the pfns, so rapidly unmap this
945 	 * domain.
946 	 */
947 	if (iopt->next_domain_id != 0) {
948 		/* Pick an arbitrary remaining domain to act as storage */
949 		struct iommu_domain *storage_domain =
950 			xa_load(&iopt->domains, 0);
951 
952 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
953 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
954 			struct iopt_pages *pages = area->pages;
955 
956 			if (!pages)
957 				continue;
958 
959 			mutex_lock(&pages->mutex);
960 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
961 				WARN_ON(!area->storage_domain);
962 			if (area->storage_domain == domain)
963 				area->storage_domain = storage_domain;
964 			mutex_unlock(&pages->mutex);
965 
966 			iopt_area_unmap_domain(area, domain);
967 		}
968 		return;
969 	}
970 
971 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
972 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
973 		struct iopt_pages *pages = area->pages;
974 
975 		if (!pages)
976 			continue;
977 
978 		mutex_lock(&pages->mutex);
979 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
980 		WARN_ON(area->storage_domain != domain);
981 		area->storage_domain = NULL;
982 		iopt_area_unfill_domain(area, pages, domain);
983 		mutex_unlock(&pages->mutex);
984 	}
985 }
986 
987 /**
988  * iopt_fill_domain() - Fill a domain with PFNs
989  * @iopt: io_pagetable to act on
990  * @domain: domain to fill
991  *
992  * Fill the domain with PFNs from every area in the iopt. On failure the domain
993  * is left unchanged.
994  */
995 static int iopt_fill_domain(struct io_pagetable *iopt,
996 			    struct iommu_domain *domain)
997 {
998 	struct iopt_area *end_area;
999 	struct iopt_area *area;
1000 	int rc;
1001 
1002 	lockdep_assert_held(&iopt->iova_rwsem);
1003 	lockdep_assert_held_write(&iopt->domains_rwsem);
1004 
1005 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1006 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1007 		struct iopt_pages *pages = area->pages;
1008 
1009 		if (!pages)
1010 			continue;
1011 
1012 		mutex_lock(&pages->mutex);
1013 		rc = iopt_area_fill_domain(area, domain);
1014 		if (rc) {
1015 			mutex_unlock(&pages->mutex);
1016 			goto out_unfill;
1017 		}
1018 		if (!area->storage_domain) {
1019 			WARN_ON(iopt->next_domain_id != 0);
1020 			area->storage_domain = domain;
1021 			interval_tree_insert(&area->pages_node,
1022 					     &pages->domains_itree);
1023 		}
1024 		mutex_unlock(&pages->mutex);
1025 	}
1026 	return 0;
1027 
1028 out_unfill:
1029 	end_area = area;
1030 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1031 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1032 		struct iopt_pages *pages = area->pages;
1033 
1034 		if (area == end_area)
1035 			break;
1036 		if (!pages)
1037 			continue;
1038 		mutex_lock(&pages->mutex);
1039 		if (iopt->next_domain_id == 0) {
1040 			interval_tree_remove(&area->pages_node,
1041 					     &pages->domains_itree);
1042 			area->storage_domain = NULL;
1043 		}
1044 		iopt_area_unfill_domain(area, pages, domain);
1045 		mutex_unlock(&pages->mutex);
1046 	}
1047 	return rc;
1048 }
1049 
1050 /* All existing areas conform to an increased page size */
1051 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1052 				     unsigned long new_iova_alignment)
1053 {
1054 	unsigned long align_mask = new_iova_alignment - 1;
1055 	struct iopt_area *area;
1056 
1057 	lockdep_assert_held(&iopt->iova_rwsem);
1058 	lockdep_assert_held(&iopt->domains_rwsem);
1059 
1060 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1061 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1062 		if ((iopt_area_iova(area) & align_mask) ||
1063 		    (iopt_area_length(area) & align_mask) ||
1064 		    (area->page_offset & align_mask))
1065 			return -EADDRINUSE;
1066 
1067 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1068 		struct iommufd_access *access;
1069 		unsigned long index;
1070 
1071 		xa_for_each(&iopt->access_list, index, access)
1072 			if (WARN_ON(access->iova_alignment >
1073 				    new_iova_alignment))
1074 				return -EADDRINUSE;
1075 	}
1076 	return 0;
1077 }
1078 
1079 int iopt_table_add_domain(struct io_pagetable *iopt,
1080 			  struct iommu_domain *domain)
1081 {
1082 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1083 	struct iommu_domain *iter_domain;
1084 	unsigned int new_iova_alignment;
1085 	unsigned long index;
1086 	int rc;
1087 
1088 	down_write(&iopt->domains_rwsem);
1089 	down_write(&iopt->iova_rwsem);
1090 
1091 	xa_for_each(&iopt->domains, index, iter_domain) {
1092 		if (WARN_ON(iter_domain == domain)) {
1093 			rc = -EEXIST;
1094 			goto out_unlock;
1095 		}
1096 	}
1097 
1098 	/*
1099 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1100 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1101 	 * objects into the iommu_domain.
1102 	 *
1103 	 * An iommu_domain must always be able to accept PAGE_SIZE to be
1104 	 * compatible as we can't guarantee higher contiguity.
1105 	 */
1106 	new_iova_alignment = max_t(unsigned long,
1107 				   1UL << __ffs(domain->pgsize_bitmap),
1108 				   iopt->iova_alignment);
1109 	if (new_iova_alignment > PAGE_SIZE) {
1110 		rc = -EINVAL;
1111 		goto out_unlock;
1112 	}
1113 	if (new_iova_alignment != iopt->iova_alignment) {
1114 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1115 		if (rc)
1116 			goto out_unlock;
1117 	}
1118 
1119 	/* No area exists that is outside the allowed domain aperture */
1120 	if (geometry->aperture_start != 0) {
1121 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1122 				       domain);
1123 		if (rc)
1124 			goto out_reserved;
1125 	}
1126 	if (geometry->aperture_end != ULONG_MAX) {
1127 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1128 				       ULONG_MAX, domain);
1129 		if (rc)
1130 			goto out_reserved;
1131 	}
1132 
1133 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1134 	if (rc)
1135 		goto out_reserved;
1136 
1137 	rc = iopt_fill_domain(iopt, domain);
1138 	if (rc)
1139 		goto out_release;
1140 
1141 	iopt->iova_alignment = new_iova_alignment;
1142 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1143 	iopt->next_domain_id++;
1144 	up_write(&iopt->iova_rwsem);
1145 	up_write(&iopt->domains_rwsem);
1146 	return 0;
1147 out_release:
1148 	xa_release(&iopt->domains, iopt->next_domain_id);
1149 out_reserved:
1150 	__iopt_remove_reserved_iova(iopt, domain);
1151 out_unlock:
1152 	up_write(&iopt->iova_rwsem);
1153 	up_write(&iopt->domains_rwsem);
1154 	return rc;
1155 }
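/*
 * Alignment example for the pgsize_bitmap handling above (assumed, typical
 * x86 values): a domain supporting 4KiB/2MiB/1GiB pages has pgsize_bitmap =
 * 0x40201000, __ffs() of that is 12, so the domain contributes an
 * iova_alignment of 4096 and passes the new_iova_alignment <= PAGE_SIZE
 * check.
 */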
1156 
1157 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1158 {
1159 	unsigned long new_iova_alignment;
1160 	struct iommufd_access *access;
1161 	struct iommu_domain *domain;
1162 	unsigned long index;
1163 
1164 	lockdep_assert_held_write(&iopt->iova_rwsem);
1165 	lockdep_assert_held(&iopt->domains_rwsem);
1166 
1167 	/* See batch_iommu_map_small() */
1168 	if (iopt->disable_large_pages)
1169 		new_iova_alignment = PAGE_SIZE;
1170 	else
1171 		new_iova_alignment = 1;
1172 
1173 	xa_for_each(&iopt->domains, index, domain)
1174 		new_iova_alignment = max_t(unsigned long,
1175 					   1UL << __ffs(domain->pgsize_bitmap),
1176 					   new_iova_alignment);
1177 	xa_for_each(&iopt->access_list, index, access)
1178 		new_iova_alignment = max_t(unsigned long,
1179 					   access->iova_alignment,
1180 					   new_iova_alignment);
1181 
1182 	if (new_iova_alignment > iopt->iova_alignment) {
1183 		int rc;
1184 
1185 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1186 		if (rc)
1187 			return rc;
1188 	}
1189 	iopt->iova_alignment = new_iova_alignment;
1190 	return 0;
1191 }
1192 
1193 void iopt_table_remove_domain(struct io_pagetable *iopt,
1194 			      struct iommu_domain *domain)
1195 {
1196 	struct iommu_domain *iter_domain = NULL;
1197 	unsigned long index;
1198 
1199 	down_write(&iopt->domains_rwsem);
1200 	down_write(&iopt->iova_rwsem);
1201 
1202 	xa_for_each(&iopt->domains, index, iter_domain)
1203 		if (iter_domain == domain)
1204 			break;
1205 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1206 		goto out_unlock;
1207 
1208 	/*
1209 	 * Compress the xarray to keep it linear by swapping the entry to erase
1210 	 * with the tail entry and shrinking the tail.
1211 	 */
1212 	iopt->next_domain_id--;
1213 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1214 	if (index != iopt->next_domain_id)
1215 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1216 
1217 	iopt_unfill_domain(iopt, domain);
1218 	__iopt_remove_reserved_iova(iopt, domain);
1219 
1220 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1221 out_unlock:
1222 	up_write(&iopt->iova_rwsem);
1223 	up_write(&iopt->domains_rwsem);
1224 }
1225 
1226 /**
1227  * iopt_area_split - Split an area into two parts at iova
1228  * @area: The area to split
1229  * @iova: Becomes the last of a new area
1230  *
1231  * This splits an area into two. It is part of the VFIO compatibility to allow
1232  * poking a hole in the mapping. The two areas continue to point at the same
1233  * iopt_pages, just with different starting bytes.
1234  */
1235 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1236 {
1237 	unsigned long alignment = area->iopt->iova_alignment;
1238 	unsigned long last_iova = iopt_area_last_iova(area);
1239 	unsigned long start_iova = iopt_area_iova(area);
1240 	unsigned long new_start = iova + 1;
1241 	struct io_pagetable *iopt = area->iopt;
1242 	struct iopt_pages *pages = area->pages;
1243 	struct iopt_area *lhs;
1244 	struct iopt_area *rhs;
1245 	int rc;
1246 
1247 	lockdep_assert_held_write(&iopt->iova_rwsem);
1248 
1249 	if (iova == start_iova || iova == last_iova)
1250 		return 0;
1251 
1252 	if (!pages || area->prevent_access)
1253 		return -EBUSY;
1254 
1255 	if (new_start & (alignment - 1) ||
1256 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1257 		return -EINVAL;
1258 
1259 	lhs = iopt_area_alloc();
1260 	if (!lhs)
1261 		return -ENOMEM;
1262 
1263 	rhs = iopt_area_alloc();
1264 	if (!rhs) {
1265 		rc = -ENOMEM;
1266 		goto err_free_lhs;
1267 	}
1268 
1269 	mutex_lock(&pages->mutex);
1270 	/*
1271 	 * Splitting is not permitted if an access exists; we don't track enough
1272 	 * information to split existing accesses.
1273 	 */
1274 	if (area->num_accesses) {
1275 		rc = -EINVAL;
1276 		goto err_unlock;
1277 	}
1278 
1279 	/*
1280 	 * Splitting is not permitted if a domain could have been mapped with
1281 	 * huge pages.
1282 	 */
1283 	if (area->storage_domain && !iopt->disable_large_pages) {
1284 		rc = -EINVAL;
1285 		goto err_unlock;
1286 	}
1287 
1288 	interval_tree_remove(&area->node, &iopt->area_itree);
1289 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1290 			      iopt_area_start_byte(area, start_iova),
1291 			      (new_start - 1) - start_iova + 1,
1292 			      area->iommu_prot);
1293 	if (WARN_ON(rc))
1294 		goto err_insert;
1295 
1296 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1297 			      iopt_area_start_byte(area, new_start),
1298 			      last_iova - new_start + 1, area->iommu_prot);
1299 	if (WARN_ON(rc))
1300 		goto err_remove_lhs;
1301 
1302 	/*
1303 	 * If the original area has filled a domain, domains_itree has to be
1304 	 * updated.
1305 	 */
1306 	if (area->storage_domain) {
1307 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1308 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1309 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1310 	}
1311 
1312 	lhs->storage_domain = area->storage_domain;
1313 	lhs->pages = area->pages;
1314 	rhs->storage_domain = area->storage_domain;
1315 	rhs->pages = area->pages;
1316 	kref_get(&rhs->pages->kref);
1317 	kfree(area);
1318 	mutex_unlock(&pages->mutex);
1319 
1320 	/*
1321 	 * No change to domains or accesses because the pages haven't been
1322 	 * changed
1323 	 */
1324 	return 0;
1325 
1326 err_remove_lhs:
1327 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1328 err_insert:
1329 	interval_tree_insert(&area->node, &iopt->area_itree);
1330 err_unlock:
1331 	mutex_unlock(&pages->mutex);
1332 	kfree(rhs);
1333 err_free_lhs:
1334 	kfree(lhs);
1335 	return rc;
1336 }
1337 
1338 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1339 		  size_t num_iovas)
1340 {
1341 	int rc = 0;
1342 	int i;
1343 
1344 	down_write(&iopt->iova_rwsem);
1345 	for (i = 0; i < num_iovas; i++) {
1346 		struct iopt_area *area;
1347 
1348 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1349 		if (!area)
1350 			continue;
1351 		rc = iopt_area_split(area, iovas[i]);
1352 		if (rc)
1353 			break;
1354 	}
1355 	up_write(&iopt->iova_rwsem);
1356 	return rc;
1357 }
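/*
 * Hole-punching sketch for the VFIO compat flow (hole_start and hole_last
 * are hypothetical, inside one existing mapping and suitably aligned):
 *
 *	unsigned long cuts[] = { hole_start - 1, hole_last };
 *
 *	rc = iopt_cut_iova(iopt, cuts, ARRAY_SIZE(cuts));
 *	if (!rc)
 *		rc = iopt_unmap_iova(iopt, hole_start,
 *				     hole_last - hole_start + 1, NULL);
 *
 * Each cut makes the given iova the last byte of a new area, so after both
 * cuts the middle piece is a standalone area that iopt_unmap_iova() can
 * remove without touching the surviving edges.
 */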
1358 
1359 void iopt_enable_large_pages(struct io_pagetable *iopt)
1360 {
1361 	int rc;
1362 
1363 	down_write(&iopt->domains_rwsem);
1364 	down_write(&iopt->iova_rwsem);
1365 	WRITE_ONCE(iopt->disable_large_pages, false);
1366 	rc = iopt_calculate_iova_alignment(iopt);
1367 	WARN_ON(rc);
1368 	up_write(&iopt->iova_rwsem);
1369 	up_write(&iopt->domains_rwsem);
1370 }
1371 
1372 int iopt_disable_large_pages(struct io_pagetable *iopt)
1373 {
1374 	int rc = 0;
1375 
1376 	down_write(&iopt->domains_rwsem);
1377 	down_write(&iopt->iova_rwsem);
1378 	if (iopt->disable_large_pages)
1379 		goto out_unlock;
1380 
1381 	/* Won't do it if domains already have pages mapped in them */
1382 	if (!xa_empty(&iopt->domains) &&
1383 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1384 		rc = -EINVAL;
1385 		goto out_unlock;
1386 	}
1387 
1388 	WRITE_ONCE(iopt->disable_large_pages, true);
1389 	rc = iopt_calculate_iova_alignment(iopt);
1390 	if (rc)
1391 		WRITE_ONCE(iopt->disable_large_pages, false);
1392 out_unlock:
1393 	up_write(&iopt->iova_rwsem);
1394 	up_write(&iopt->domains_rwsem);
1395 	return rc;
1396 }
1397 
1398 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1399 {
1400 	u32 new_id;
1401 	int rc;
1402 
1403 	down_write(&iopt->domains_rwsem);
1404 	down_write(&iopt->iova_rwsem);
1405 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1406 		      GFP_KERNEL_ACCOUNT);
1407 
1408 	if (rc)
1409 		goto out_unlock;
1410 
1411 	rc = iopt_calculate_iova_alignment(iopt);
1412 	if (rc) {
1413 		xa_erase(&iopt->access_list, new_id);
1414 		goto out_unlock;
1415 	}
1416 	access->iopt_access_list_id = new_id;
1417 
1418 out_unlock:
1419 	up_write(&iopt->iova_rwsem);
1420 	up_write(&iopt->domains_rwsem);
1421 	return rc;
1422 }
1423 
1424 void iopt_remove_access(struct io_pagetable *iopt,
1425 			struct iommufd_access *access, u32 iopt_access_list_id)
1426 {
1427 	down_write(&iopt->domains_rwsem);
1428 	down_write(&iopt->iova_rwsem);
1429 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1430 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1431 	up_write(&iopt->iova_rwsem);
1432 	up_write(&iopt->domains_rwsem);
1433 }
1434 
1435 /* Narrow the usable IOVA space by adding a device's reserved ranges to the reserved_itree. */
1436 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1437 					struct device *dev,
1438 					phys_addr_t *sw_msi_start)
1439 {
1440 	struct iommu_resv_region *resv;
1441 	LIST_HEAD(resv_regions);
1442 	unsigned int num_hw_msi = 0;
1443 	unsigned int num_sw_msi = 0;
1444 	int rc;
1445 
1446 	if (iommufd_should_fail())
1447 		return -EINVAL;
1448 
1449 	down_write(&iopt->iova_rwsem);
1450 	/* FIXME: drivers allocate memory but there is no failure propagated */
1451 	iommu_get_resv_regions(dev, &resv_regions);
1452 
1453 	list_for_each_entry(resv, &resv_regions, list) {
1454 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1455 			continue;
1456 
1457 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1458 			num_hw_msi++;
1459 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1460 			*sw_msi_start = resv->start;
1461 			num_sw_msi++;
1462 		}
1463 
1464 		rc = iopt_reserve_iova(iopt, resv->start,
1465 				       resv->length - 1 + resv->start, dev);
1466 		if (rc)
1467 			goto out_reserved;
1468 	}
1469 
1470 	/* Drivers must offer sane combinations of regions */
1471 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1472 		rc = -EINVAL;
1473 		goto out_reserved;
1474 	}
1475 
1476 	rc = 0;
1477 	goto out_free_resv;
1478 
1479 out_reserved:
1480 	__iopt_remove_reserved_iova(iopt, dev);
1481 out_free_resv:
1482 	iommu_put_resv_regions(dev, &resv_regions);
1483 	up_write(&iopt->iova_rwsem);
1484 	return rc;
1485 }
1486