1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
5  * The PFNs can be placed into an iommu_domain, or returned to the caller as a
6  * page list for access by an in-kernel user.
7  *
8  * The data structure uses iopt_pages to optimize the storage of the PFNs
9  * between the domains and the xarray.
10  */
11 #include <linux/dma-buf.h>
12 #include <linux/err.h>
13 #include <linux/errno.h>
14 #include <linux/file.h>
15 #include <linux/iommu.h>
16 #include <linux/iommufd.h>
17 #include <linux/lockdep.h>
18 #include <linux/sched/mm.h>
19 #include <linux/slab.h>
20 #include <uapi/linux/iommufd.h>
21 
22 #include "double_span.h"
23 #include "io_pagetable.h"
24 
25 struct iopt_pages_list {
26 	struct iopt_pages *pages;
27 	struct iopt_area *area;
28 	struct list_head next;
29 	unsigned long start_byte;
30 	unsigned long length;
31 };
32 
33 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
34 					struct io_pagetable *iopt,
35 					unsigned long iova,
36 					unsigned long last_iova)
37 {
38 	lockdep_assert_held(&iopt->iova_rwsem);
39 
40 	iter->cur_iova = iova;
41 	iter->last_iova = last_iova;
42 	iter->area = iopt_area_iter_first(iopt, iova, iova);
43 	if (!iter->area)
44 		return NULL;
45 	if (!iter->area->pages) {
46 		iter->area = NULL;
47 		return NULL;
48 	}
49 	return iter->area;
50 }
51 
52 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
53 {
54 	unsigned long last_iova;
55 
56 	if (!iter->area)
57 		return NULL;
58 	last_iova = iopt_area_last_iova(iter->area);
59 	if (iter->last_iova <= last_iova)
60 		return NULL;
61 
62 	iter->cur_iova = last_iova + 1;
63 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
64 					 iter->last_iova);
65 	if (!iter->area)
66 		return NULL;
67 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
68 	    !iter->area->pages) {
69 		iter->area = NULL;
70 		return NULL;
71 	}
72 	return iter->area;
73 }
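/*
 * A minimal sketch of how this contiguous-area iterator is consumed,
 * modelled on callers later in this file such as
 * __iommu_read_and_clear_dirty() and iopt_get_pages(). do_something() is a
 * hypothetical per-area operation; the caller must hold iopt->iova_rwsem,
 * which iopt_area_contig_init() asserts.
 *
 *   struct iopt_area_contig_iter iter;
 *   unsigned long last_iova = iova + length - 1;
 *   struct iopt_area *area;
 *   int rc;
 *
 *   iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *           unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *           rc = do_something(area, iter.cur_iova, last);
 *           if (rc)
 *                   return rc;
 *   }
 *   if (!iopt_area_contig_done(&iter))
 *           return -ENOENT;
 */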
74 
75 static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
76 				     unsigned long length,
77 				     unsigned long iova_alignment,
78 				     unsigned long page_offset)
79 {
80 	unsigned long aligned_start;
81 
82 	/* ALIGN_UP() */
83 	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
84 		return false;
85 	aligned_start &= ~(iova_alignment - 1);
86 	aligned_start |= page_offset;
87 
88 	if (aligned_start >= last || last - aligned_start < length - 1)
89 		return false;
90 	*start = aligned_start;
91 	return true;
92 }
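/*
 * A worked example of the calculation above, assuming a 4K PAGE_SIZE: with
 * *start == 0x5678, iova_alignment == 0x1000 and page_offset == 0x234, the
 * start is rounded up to 0x6000 and the page offset is OR'd back in, giving
 * a candidate IOVA of 0x6234. Keeping the IOVA's offset within a page equal
 * to the offset of the backing address is what lets the range be mapped
 * using whole, naturally aligned pages.
 */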
93 
94 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
95 				    unsigned long length,
96 				    unsigned long iova_alignment,
97 				    unsigned long page_offset)
98 {
99 	if (span->is_used)
100 		return false;
101 	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
102 					length, iova_alignment, page_offset);
103 }
104 
105 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
106 				    unsigned long length,
107 				    unsigned long iova_alignment,
108 				    unsigned long page_offset)
109 {
110 	if (span->is_hole)
111 		return false;
112 	return __alloc_iova_check_range(&span->start_used, span->last_used,
113 					length, iova_alignment, page_offset);
114 }
115 
116 /*
117  * Automatically find a block of IOVA that is not being used and not reserved.
118  * Does not return a 0 IOVA even if it is valid.
119  */
120 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
121 			   unsigned long addr, unsigned long length)
122 {
123 	unsigned long page_offset = addr % PAGE_SIZE;
124 	struct interval_tree_double_span_iter used_span;
125 	struct interval_tree_span_iter allowed_span;
126 	unsigned long max_alignment = PAGE_SIZE;
127 	unsigned long iova_alignment;
128 
129 	lockdep_assert_held(&iopt->iova_rwsem);
130 
131 	/* Protect roundup_pow_of_two() from overflow */
132 	if (length == 0 || length >= ULONG_MAX / 2)
133 		return -EOVERFLOW;
134 
135 	/*
136 	 * Keep alignment present in addr when building the IOVA, which
137 	 * increases the chance we can map a THP.
138 	 */
139 	if (!addr)
140 		iova_alignment = roundup_pow_of_two(length);
141 	else
142 		iova_alignment = min_t(unsigned long,
143 				       roundup_pow_of_two(length),
144 				       1UL << __ffs64(addr));
145 
146 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
147 	max_alignment = HPAGE_SIZE;
148 #endif
149 	/* Protect against ALIGN() overflow */
150 	if (iova_alignment >= max_alignment)
151 		iova_alignment = max_alignment;
152 
153 	if (iova_alignment < iopt->iova_alignment)
154 		return -EINVAL;
155 
156 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
157 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
158 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
159 			allowed_span.start_used = PAGE_SIZE;
160 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
161 			allowed_span.is_hole = false;
162 		}
163 
164 		if (!__alloc_iova_check_used(&allowed_span, length,
165 					     iova_alignment, page_offset))
166 			continue;
167 
168 		interval_tree_for_each_double_span(
169 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
170 			allowed_span.start_used, allowed_span.last_used) {
171 			if (!__alloc_iova_check_hole(&used_span, length,
172 						     iova_alignment,
173 						     page_offset))
174 				continue;
175 
176 			*iova = used_span.start_hole;
177 			return 0;
178 		}
179 	}
180 	return -ENOSPC;
181 }
182 
183 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
184 			   unsigned long length)
185 {
186 	unsigned long last;
187 
188 	lockdep_assert_held(&iopt->iova_rwsem);
189 
190 	if ((iova & (iopt->iova_alignment - 1)))
191 		return -EINVAL;
192 
193 	if (check_add_overflow(iova, length - 1, &last))
194 		return -EOVERFLOW;
195 
196 	/* No reserved IOVA intersects the range */
197 	if (iopt_reserved_iter_first(iopt, iova, last))
198 		return -EINVAL;
199 
200 	/* Check that there is not already a mapping in the range */
201 	if (iopt_area_iter_first(iopt, iova, last))
202 		return -EEXIST;
203 	return 0;
204 }
205 
206 /*
207  * The area takes a slice of the pages from start_byte to start_byte + length
208  */
209 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
210 			    struct iopt_pages *pages, unsigned long iova,
211 			    unsigned long start_byte, unsigned long length,
212 			    int iommu_prot)
213 {
214 	lockdep_assert_held_write(&iopt->iova_rwsem);
215 
216 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
217 		return -EPERM;
218 
219 	area->iommu_prot = iommu_prot;
220 	area->page_offset = start_byte % PAGE_SIZE;
221 	if (area->page_offset & (iopt->iova_alignment - 1))
222 		return -EINVAL;
223 
224 	area->node.start = iova;
225 	if (check_add_overflow(iova, length - 1, &area->node.last))
226 		return -EOVERFLOW;
227 
228 	area->pages_node.start = start_byte / PAGE_SIZE;
229 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
230 		return -EOVERFLOW;
231 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
232 	if (WARN_ON(area->pages_node.last >= pages->npages))
233 		return -EOVERFLOW;
234 
235 	/*
236 	 * The area is inserted with a NULL pages indicating it is not fully
237 	 * initialized yet.
238 	 */
239 	area->iopt = iopt;
240 	interval_tree_insert(&area->node, &iopt->area_itree);
241 	return 0;
242 }
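/*
 * A worked example of the pages_node indexing above, assuming a 4K
 * PAGE_SIZE: with start_byte == 0x3200 and length == 0x2000 the slice spans
 * bytes 0x3200..0x51ff of the iopt_pages, so pages_node covers the inclusive
 * page index range 3..5 and page_offset is 0x200, consistent with the
 * inclusive-last convention used by the interval trees in this file.
 */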
243 
244 static struct iopt_area *iopt_area_alloc(void)
245 {
246 	struct iopt_area *area;
247 
248 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
249 	if (!area)
250 		return NULL;
251 	RB_CLEAR_NODE(&area->node.rb);
252 	RB_CLEAR_NODE(&area->pages_node.rb);
253 	return area;
254 }
255 
256 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
257 				 struct list_head *pages_list,
258 				 unsigned long length, unsigned long *dst_iova,
259 				 int iommu_prot, unsigned int flags)
260 {
261 	struct iopt_pages_list *elm;
262 	unsigned long start;
263 	unsigned long iova;
264 	int rc = 0;
265 
266 	list_for_each_entry(elm, pages_list, next) {
267 		elm->area = iopt_area_alloc();
268 		if (!elm->area)
269 			return -ENOMEM;
270 	}
271 
272 	down_write(&iopt->iova_rwsem);
273 	if ((length & (iopt->iova_alignment - 1)) || !length) {
274 		rc = -EINVAL;
275 		goto out_unlock;
276 	}
277 
278 	if (flags & IOPT_ALLOC_IOVA) {
279 		/* Use the first entry to guess the ideal IOVA alignment */
280 		elm = list_first_entry(pages_list, struct iopt_pages_list,
281 				       next);
282 		switch (elm->pages->type) {
283 		case IOPT_ADDRESS_USER:
284 			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
285 			break;
286 		case IOPT_ADDRESS_FILE:
287 			start = elm->start_byte + elm->pages->start;
288 			break;
289 		case IOPT_ADDRESS_DMABUF:
290 			start = elm->start_byte + elm->pages->dmabuf.start;
291 			break;
292 		}
293 		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
294 		if (rc)
295 			goto out_unlock;
296 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
297 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
298 			rc = -EINVAL;
299 			goto out_unlock;
300 		}
301 	} else {
302 		rc = iopt_check_iova(iopt, *dst_iova, length);
303 		if (rc)
304 			goto out_unlock;
305 	}
306 
307 	/*
308 	 * Areas are created with a NULL pages so that the IOVA space is
309 	 * reserved and we can unlock the iova_rwsem.
310 	 */
311 	iova = *dst_iova;
312 	list_for_each_entry(elm, pages_list, next) {
313 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
314 				      elm->start_byte, elm->length, iommu_prot);
315 		if (rc)
316 			goto out_unlock;
317 		iova += elm->length;
318 	}
319 
320 out_unlock:
321 	up_write(&iopt->iova_rwsem);
322 	return rc;
323 }
324 
325 static void iopt_abort_area(struct iopt_area *area)
326 {
327 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
328 		WARN_ON(area->pages);
329 	if (area->iopt) {
330 		down_write(&area->iopt->iova_rwsem);
331 		interval_tree_remove(&area->node, &area->iopt->area_itree);
332 		up_write(&area->iopt->iova_rwsem);
333 	}
334 	kfree(area);
335 }
336 
337 void iopt_free_pages_list(struct list_head *pages_list)
338 {
339 	struct iopt_pages_list *elm;
340 
341 	while ((elm = list_first_entry_or_null(pages_list,
342 					       struct iopt_pages_list, next))) {
343 		if (elm->area)
344 			iopt_abort_area(elm->area);
345 		if (elm->pages)
346 			iopt_put_pages(elm->pages);
347 		list_del(&elm->next);
348 		kfree(elm);
349 	}
350 }
351 
352 static int iopt_fill_domains_pages(struct list_head *pages_list)
353 {
354 	struct iopt_pages_list *undo_elm;
355 	struct iopt_pages_list *elm;
356 	int rc;
357 
358 	list_for_each_entry(elm, pages_list, next) {
359 		rc = iopt_area_fill_domains(elm->area, elm->pages);
360 		if (rc)
361 			goto err_undo;
362 	}
363 	return 0;
364 
365 err_undo:
366 	list_for_each_entry(undo_elm, pages_list, next) {
367 		if (undo_elm == elm)
368 			break;
369 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
370 	}
371 	return rc;
372 }
373 
374 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
375 		   unsigned long length, unsigned long *dst_iova,
376 		   int iommu_prot, unsigned int flags)
377 {
378 	struct iopt_pages_list *elm;
379 	int rc;
380 
381 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
382 				   iommu_prot, flags);
383 	if (rc)
384 		return rc;
385 
386 	down_read(&iopt->domains_rwsem);
387 	rc = iopt_fill_domains_pages(pages_list);
388 	if (rc)
389 		goto out_unlock_domains;
390 
391 	down_write(&iopt->iova_rwsem);
392 	list_for_each_entry(elm, pages_list, next) {
393 		/*
394 		 * area->pages must be set inside the domains_rwsem to ensure
395 		 * any newly added domains will get filled. Moves the reference
396 		 * in from the list.
397 		 */
398 		elm->area->pages = elm->pages;
399 		elm->pages = NULL;
400 		elm->area = NULL;
401 	}
402 	up_write(&iopt->iova_rwsem);
403 out_unlock_domains:
404 	up_read(&iopt->domains_rwsem);
405 	return rc;
406 }
407 
408 static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
409 			   struct iopt_pages *pages, unsigned long *iova,
410 			   unsigned long length, unsigned long start_byte,
411 			   int iommu_prot, unsigned int flags)
412 {
413 	struct iopt_pages_list elm = {};
414 	LIST_HEAD(pages_list);
415 	int rc;
416 
417 	elm.pages = pages;
418 	elm.start_byte = start_byte;
419 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
420 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
421 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
422 	elm.length = length;
423 	list_add(&elm.next, &pages_list);
424 
425 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
426 	if (rc) {
427 		if (elm.area)
428 			iopt_abort_area(elm.area);
429 		if (elm.pages)
430 			iopt_put_pages(elm.pages);
431 		return rc;
432 	}
433 	return 0;
434 }
435 
436 /**
437  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
438  * @ictx: iommufd_ctx the iopt is part of
439  * @iopt: io_pagetable to act on
440  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
441  *        the chosen iova on output. Otherwise it is the iova to map to on input
442  * @uptr: User VA to map
443  * @length: Number of bytes to map
444  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
445  * @flags: IOPT_ALLOC_IOVA or zero
446  *
447  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
448  * page tables this will pin the pages and load them into the domain at iova.
449  * For non-domain page tables this will only set up a lazy reference and the
450  * caller must use iopt_access_pages() to touch them.
451  *
452  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
453  * destroyed.
454  */
455 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
456 			unsigned long *iova, void __user *uptr,
457 			unsigned long length, int iommu_prot,
458 			unsigned int flags)
459 {
460 	struct iopt_pages *pages;
461 
462 	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
463 	if (IS_ERR(pages))
464 		return PTR_ERR(pages);
465 
466 	return iopt_map_common(ictx, iopt, pages, iova, length,
467 			       uptr - pages->uptr, iommu_prot, flags);
468 }
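/*
 * A minimal calling sketch with hypothetical variable names, showing the
 * usual pairing with iopt_unmap_iova(). With IOPT_ALLOC_IOVA the iova is
 * chosen by iopt_alloc_iova() and returned; without it the caller supplies
 * a fixed, suitably aligned iova:
 *
 *   unsigned long iova, unmapped;
 *   int rc;
 *
 *   rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *                            IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *   if (rc)
 *           return rc;
 *   ...
 *   rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 */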
469 
470 /**
471  * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
472  * @ictx: iommufd_ctx the iopt is part of
473  * @iopt: io_pagetable to act on
474  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
475  *        the chosen iova on output. Otherwise it is the iova to map to on input
476  * @fd: fdno of a file to map
477  * @start: map file starting at this byte offset
478  * @length: Number of bytes to map
479  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
480  * @flags: IOPT_ALLOC_IOVA or zero
481  */
482 int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
483 			unsigned long *iova, int fd, unsigned long start,
484 			unsigned long length, int iommu_prot,
485 			unsigned int flags)
486 {
487 	struct iopt_pages *pages;
488 	struct dma_buf *dmabuf;
489 	unsigned long start_byte;
490 	unsigned long last;
491 
492 	if (!length)
493 		return -EINVAL;
494 	if (check_add_overflow(start, length - 1, &last))
495 		return -EOVERFLOW;
496 
497 	start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
498 	if (IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
499 		dmabuf = dma_buf_get(fd);
500 	else
501 		dmabuf = ERR_PTR(-ENXIO);
502 
503 	if (!IS_ERR(dmabuf)) {
504 		pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
505 						length,
506 						iommu_prot & IOMMU_WRITE);
507 		if (IS_ERR(pages)) {
508 			dma_buf_put(dmabuf);
509 			return PTR_ERR(pages);
510 		}
511 	} else {
512 		struct file *file;
513 
514 		file = fget(fd);
515 		if (!file)
516 			return -EBADF;
517 
518 		pages = iopt_alloc_file_pages(file, start_byte, start, length,
519 					      iommu_prot & IOMMU_WRITE);
520 		fput(file);
521 		if (IS_ERR(pages))
522 			return PTR_ERR(pages);
523 	}
524 
525 	return iopt_map_common(ictx, iopt, pages, iova, length,
526 			       start_byte, iommu_prot, flags);
527 }
528 
529 struct iova_bitmap_fn_arg {
530 	unsigned long flags;
531 	struct io_pagetable *iopt;
532 	struct iommu_domain *domain;
533 	struct iommu_dirty_bitmap *dirty;
534 };
535 
536 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
537 					unsigned long iova, size_t length,
538 					void *opaque)
539 {
540 	struct iopt_area *area;
541 	struct iopt_area_contig_iter iter;
542 	struct iova_bitmap_fn_arg *arg = opaque;
543 	struct iommu_domain *domain = arg->domain;
544 	struct iommu_dirty_bitmap *dirty = arg->dirty;
545 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
546 	unsigned long last_iova = iova + length - 1;
547 	unsigned long flags = arg->flags;
548 	int ret;
549 
550 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
551 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
552 
553 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
554 						last - iter.cur_iova + 1, flags,
555 						dirty);
556 		if (ret)
557 			return ret;
558 	}
559 
560 	if (!iopt_area_contig_done(&iter))
561 		return -EINVAL;
562 	return 0;
563 }
564 
565 static int
566 iommu_read_and_clear_dirty(struct iommu_domain *domain,
567 			   struct io_pagetable *iopt, unsigned long flags,
568 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
569 {
570 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
571 	struct iommu_iotlb_gather gather;
572 	struct iommu_dirty_bitmap dirty;
573 	struct iova_bitmap_fn_arg arg;
574 	struct iova_bitmap *iter;
575 	int ret = 0;
576 
577 	if (!ops || !ops->read_and_clear_dirty)
578 		return -EOPNOTSUPP;
579 
580 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
581 				 bitmap->page_size,
582 				 u64_to_user_ptr(bitmap->data));
583 	if (IS_ERR(iter))
584 		return -ENOMEM;
585 
586 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
587 
588 	arg.flags = flags;
589 	arg.iopt = iopt;
590 	arg.domain = domain;
591 	arg.dirty = &dirty;
592 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
593 
594 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
595 		iommu_iotlb_sync(domain, &gather);
596 
597 	iova_bitmap_free(iter);
598 
599 	return ret;
600 }
601 
602 int iommufd_check_iova_range(struct io_pagetable *iopt,
603 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
604 {
605 	size_t iommu_pgsize = iopt->iova_alignment;
606 	u64 last_iova;
607 
608 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
609 		return -EOVERFLOW;
610 
611 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
612 		return -EOVERFLOW;
613 
614 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
615 	    ((last_iova + 1) & (iommu_pgsize - 1)))
616 		return -EINVAL;
617 
618 	if (!bitmap->page_size)
619 		return -EINVAL;
620 
621 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
622 	    ((last_iova + 1) & (bitmap->page_size - 1)))
623 		return -EINVAL;
624 
625 	return 0;
626 }
627 
628 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
629 				   struct iommu_domain *domain,
630 				   unsigned long flags,
631 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
632 {
633 	int ret;
634 
635 	ret = iommufd_check_iova_range(iopt, bitmap);
636 	if (ret)
637 		return ret;
638 
639 	down_read(&iopt->iova_rwsem);
640 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
641 	up_read(&iopt->iova_rwsem);
642 
643 	return ret;
644 }
645 
646 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
647 				 struct iommu_domain *domain)
648 {
649 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
650 	struct iommu_iotlb_gather gather;
651 	struct iommu_dirty_bitmap dirty;
652 	struct iopt_area *area;
653 	int ret = 0;
654 
655 	lockdep_assert_held_read(&iopt->iova_rwsem);
656 
657 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
658 
659 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
660 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
661 		if (!area->pages)
662 			continue;
663 
664 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
665 						iopt_area_length(area), 0,
666 						&dirty);
667 		if (ret)
668 			break;
669 	}
670 
671 	iommu_iotlb_sync(domain, &gather);
672 	return ret;
673 }
674 
675 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
676 			    struct iommu_domain *domain, bool enable)
677 {
678 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
679 	int ret = 0;
680 
681 	if (!ops)
682 		return -EOPNOTSUPP;
683 
684 	down_read(&iopt->iova_rwsem);
685 
686 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
687 	if (enable) {
688 		ret = iopt_clear_dirty_data(iopt, domain);
689 		if (ret)
690 			goto out_unlock;
691 	}
692 
693 	ret = ops->set_dirty_tracking(domain, enable);
694 
695 out_unlock:
696 	up_read(&iopt->iova_rwsem);
697 	return ret;
698 }
699 
700 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
701 		   unsigned long length, struct list_head *pages_list)
702 {
703 	struct iopt_area_contig_iter iter;
704 	unsigned long last_iova;
705 	struct iopt_area *area;
706 	int rc;
707 
708 	if (!length)
709 		return -EINVAL;
710 	if (check_add_overflow(iova, length - 1, &last_iova))
711 		return -EOVERFLOW;
712 
713 	down_read(&iopt->iova_rwsem);
714 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
715 		struct iopt_pages_list *elm;
716 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
717 
718 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
719 		if (!elm) {
720 			rc = -ENOMEM;
721 			goto err_free;
722 		}
723 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
724 		elm->pages = area->pages;
725 		elm->length = (last - iter.cur_iova) + 1;
726 		kref_get(&elm->pages->kref);
727 		list_add_tail(&elm->next, pages_list);
728 	}
729 	if (!iopt_area_contig_done(&iter)) {
730 		rc = -ENOENT;
731 		goto err_free;
732 	}
733 	up_read(&iopt->iova_rwsem);
734 	return 0;
735 err_free:
736 	up_read(&iopt->iova_rwsem);
737 	iopt_free_pages_list(pages_list);
738 	return rc;
739 }
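/*
 * Callers of iopt_get_pages() own the returned list and the iopt_pages
 * references it holds. A brief sketch of the usual cleanup path, using the
 * iopt_free_pages_list() helper defined earlier in this file:
 *
 *   LIST_HEAD(pages_list);
 *   int rc;
 *
 *   rc = iopt_get_pages(iopt, iova, length, &pages_list);
 *   if (rc)
 *           return rc;
 *   ...
 *   iopt_free_pages_list(&pages_list);
 */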
740 
741 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
742 				 unsigned long last, unsigned long *unmapped)
743 {
744 	struct iopt_area *area;
745 	unsigned long unmapped_bytes = 0;
746 	unsigned int tries = 0;
747 	/* If there are no mapped entries then success */
748 	int rc = 0;
749 
750 	/*
751 	 * The domains_rwsem must be held in read mode any time any area->pages
752 	 * is NULL. This prevents domain attach/detach from running
753 	 * concurrently with cleaning up the area.
754 	 */
755 again:
756 	down_read(&iopt->domains_rwsem);
757 	down_write(&iopt->iova_rwsem);
758 	while ((area = iopt_area_iter_first(iopt, start, last))) {
759 		unsigned long area_last = iopt_area_last_iova(area);
760 		unsigned long area_first = iopt_area_iova(area);
761 		struct iopt_pages *pages;
762 
763 		/* Userspace should not race map/unmap's of the same area */
764 		if (!area->pages) {
765 			rc = -EBUSY;
766 			goto out_unlock_iova;
767 		}
768 
769 		/* The area is locked by an object that has not been destroyed */
770 		if (area->num_locks) {
771 			rc = -EBUSY;
772 			goto out_unlock_iova;
773 		}
774 
775 		if (area_first < start || area_last > last) {
776 			rc = -ENOENT;
777 			goto out_unlock_iova;
778 		}
779 
780 		if (area_first != start)
781 			tries = 0;
782 
783 		/*
784 		 * num_accesses writers must hold the iova_rwsem too, so we can
785 		 * safely read it under the write side of the iova_rwsem
786 		 * without the pages->mutex.
787 		 */
788 		if (area->num_accesses) {
789 			size_t length = iopt_area_length(area);
790 
791 			start = area_first;
792 			area->prevent_access = true;
793 			up_write(&iopt->iova_rwsem);
794 			up_read(&iopt->domains_rwsem);
795 
796 			iommufd_access_notify_unmap(iopt, area_first, length);
797 			/* Something is not responding to unmap requests. */
798 			tries++;
799 			if (WARN_ON(tries > 100)) {
800 				rc = -EDEADLOCK;
801 				goto out_unmapped;
802 			}
803 			goto again;
804 		}
805 
806 		pages = area->pages;
807 		area->pages = NULL;
808 		up_write(&iopt->iova_rwsem);
809 
810 		iopt_area_unfill_domains(area, pages);
811 		iopt_abort_area(area);
812 		iopt_put_pages(pages);
813 
814 		unmapped_bytes += area_last - area_first + 1;
815 
816 		down_write(&iopt->iova_rwsem);
817 	}
818 
819 out_unlock_iova:
820 	up_write(&iopt->iova_rwsem);
821 	up_read(&iopt->domains_rwsem);
822 out_unmapped:
823 	if (unmapped)
824 		*unmapped = unmapped_bytes;
825 	return rc;
826 }
827 
828 /**
829  * iopt_unmap_iova() - Remove a range of iova
830  * @iopt: io_pagetable to act on
831  * @iova: Starting iova to unmap
832  * @length: Number of bytes to unmap
833  * @unmapped: Return number of bytes unmapped
834  *
835  * The requested range must be a superset of existing ranges.
836  * Splitting/truncating IOVA mappings is not allowed.
837  */
838 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
839 		    unsigned long length, unsigned long *unmapped)
840 {
841 	unsigned long iova_last;
842 
843 	if (!length)
844 		return -EINVAL;
845 
846 	if (check_add_overflow(iova, length - 1, &iova_last))
847 		return -EOVERFLOW;
848 
849 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
850 }
851 
852 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
853 {
854 	/* If the IOVAs are empty then unmap all succeeds */
855 	return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
856 }
857 
858 /* The caller must always free all the nodes in the allowed_iova rb_root. */
859 int iopt_set_allow_iova(struct io_pagetable *iopt,
860 			struct rb_root_cached *allowed_iova)
861 {
862 	struct iopt_allowed *allowed;
863 
864 	down_write(&iopt->iova_rwsem);
865 	swap(*allowed_iova, iopt->allowed_itree);
866 
867 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
868 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
869 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
870 					     allowed->node.last)) {
871 			swap(*allowed_iova, iopt->allowed_itree);
872 			up_write(&iopt->iova_rwsem);
873 			return -EADDRINUSE;
874 		}
875 	}
876 	up_write(&iopt->iova_rwsem);
877 	return 0;
878 }
879 
880 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
881 		      unsigned long last, void *owner)
882 {
883 	struct iopt_reserved *reserved;
884 
885 	lockdep_assert_held_write(&iopt->iova_rwsem);
886 
887 	if (iopt_area_iter_first(iopt, start, last) ||
888 	    iopt_allowed_iter_first(iopt, start, last))
889 		return -EADDRINUSE;
890 
891 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
892 	if (!reserved)
893 		return -ENOMEM;
894 	reserved->node.start = start;
895 	reserved->node.last = last;
896 	reserved->owner = owner;
897 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
898 	return 0;
899 }
900 
901 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
902 {
903 	struct iopt_reserved *reserved, *next;
904 
905 	lockdep_assert_held_write(&iopt->iova_rwsem);
906 
907 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
908 	     reserved = next) {
909 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
910 
911 		if (reserved->owner == owner) {
912 			interval_tree_remove(&reserved->node,
913 					     &iopt->reserved_itree);
914 			kfree(reserved);
915 		}
916 	}
917 }
918 
919 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
920 {
921 	down_write(&iopt->iova_rwsem);
922 	__iopt_remove_reserved_iova(iopt, owner);
923 	up_write(&iopt->iova_rwsem);
924 }
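/*
 * Reservations are keyed by an opaque owner cookie so an entire set of
 * ranges can be dropped at once. A fragmentary sketch of the pairing,
 * assuming the caller's start/last/owner values; iopt_reserve_iova()
 * requires the iova_rwsem held for write, while the remove helper takes
 * it internally:
 *
 *   down_write(&iopt->iova_rwsem);
 *   rc = iopt_reserve_iova(iopt, start, last, owner);
 *   up_write(&iopt->iova_rwsem);
 *   ...
 *   iopt_remove_reserved_iova(iopt, owner);
 */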
925 
926 void iopt_init_table(struct io_pagetable *iopt)
927 {
928 	init_rwsem(&iopt->iova_rwsem);
929 	init_rwsem(&iopt->domains_rwsem);
930 	iopt->area_itree = RB_ROOT_CACHED;
931 	iopt->allowed_itree = RB_ROOT_CACHED;
932 	iopt->reserved_itree = RB_ROOT_CACHED;
933 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
934 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
935 
936 	/*
937 	 * iopts start as SW tables that can use the entire size_t IOVA space
938 	 * due to the use of size_t in the APIs. They have no alignment
939 	 * restriction.
940 	 */
941 	iopt->iova_alignment = 1;
942 }
943 
944 void iopt_destroy_table(struct io_pagetable *iopt)
945 {
946 	struct interval_tree_node *node;
947 
948 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
949 		iopt_remove_reserved_iova(iopt, NULL);
950 
951 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
952 						ULONG_MAX))) {
953 		interval_tree_remove(node, &iopt->allowed_itree);
954 		kfree(container_of(node, struct iopt_allowed, node));
955 	}
956 
957 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
958 	WARN_ON(!xa_empty(&iopt->domains));
959 	WARN_ON(!xa_empty(&iopt->access_list));
960 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
961 }
962 
963 /**
964  * iopt_unfill_domain() - Unfill a domain with PFNs
965  * @iopt: io_pagetable to act on
966  * @domain: domain to unfill
967  *
968  * This is used when removing a domain from the iopt. Every area in the iopt
969  * will be unmapped from the domain. The domain must already be removed from the
970  * domains xarray.
971  */
972 static void iopt_unfill_domain(struct io_pagetable *iopt,
973 			       struct iommu_domain *domain)
974 {
975 	struct iopt_area *area;
976 
977 	lockdep_assert_held(&iopt->iova_rwsem);
978 	lockdep_assert_held_write(&iopt->domains_rwsem);
979 
980 	/*
981 	 * Some other domain is holding all the pfns still, rapidly unmap this
982 	 * domain.
983 	 */
984 	if (iopt->next_domain_id != 0) {
985 		/* Pick an arbitrary remaining domain to act as storage */
986 		struct iommu_domain *storage_domain =
987 			xa_load(&iopt->domains, 0);
988 
989 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
990 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
991 			struct iopt_pages *pages = area->pages;
992 
993 			if (!pages)
994 				continue;
995 
996 			mutex_lock(&pages->mutex);
997 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
998 				WARN_ON(!area->storage_domain);
999 			if (area->storage_domain == domain)
1000 				area->storage_domain = storage_domain;
1001 			if (iopt_is_dmabuf(pages)) {
1002 				if (!iopt_dmabuf_revoked(pages))
1003 					iopt_area_unmap_domain(area, domain);
1004 				iopt_dmabuf_untrack_domain(pages, area, domain);
1005 			}
1006 			mutex_unlock(&pages->mutex);
1007 
1008 			if (!iopt_is_dmabuf(pages))
1009 				iopt_area_unmap_domain(area, domain);
1010 		}
1011 		return;
1012 	}
1013 
1014 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1015 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1016 		struct iopt_pages *pages = area->pages;
1017 
1018 		if (!pages)
1019 			continue;
1020 
1021 		mutex_lock(&pages->mutex);
1022 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1023 		WARN_ON(area->storage_domain != domain);
1024 		area->storage_domain = NULL;
1025 		iopt_area_unfill_domain(area, pages, domain);
1026 		if (iopt_is_dmabuf(pages))
1027 			iopt_dmabuf_untrack_domain(pages, area, domain);
1028 		mutex_unlock(&pages->mutex);
1029 	}
1030 }
1031 
1032 /**
1033  * iopt_fill_domain() - Fill a domain with PFNs
1034  * @iopt: io_pagetable to act on
1035  * @domain: domain to fill
1036  *
1037  * Fill the domain with PFNs from every area in the iopt. On failure the domain
1038  * is left unchanged.
1039  */
1040 static int iopt_fill_domain(struct io_pagetable *iopt,
1041 			    struct iommu_domain *domain)
1042 {
1043 	struct iopt_area *end_area;
1044 	struct iopt_area *area;
1045 	int rc;
1046 
1047 	lockdep_assert_held(&iopt->iova_rwsem);
1048 	lockdep_assert_held_write(&iopt->domains_rwsem);
1049 
1050 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1051 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1052 		struct iopt_pages *pages = area->pages;
1053 
1054 		if (!pages)
1055 			continue;
1056 
1057 		guard(mutex)(&pages->mutex);
1058 		if (iopt_is_dmabuf(pages)) {
1059 			rc = iopt_dmabuf_track_domain(pages, area, domain);
1060 			if (rc)
1061 				goto out_unfill;
1062 		}
1063 		rc = iopt_area_fill_domain(area, domain);
1064 		if (rc) {
1065 			if (iopt_is_dmabuf(pages))
1066 				iopt_dmabuf_untrack_domain(pages, area, domain);
1067 			goto out_unfill;
1068 		}
1069 		if (!area->storage_domain) {
1070 			WARN_ON(iopt->next_domain_id != 0);
1071 			area->storage_domain = domain;
1072 			interval_tree_insert(&area->pages_node,
1073 					     &pages->domains_itree);
1074 		}
1075 	}
1076 	return 0;
1077 
1078 out_unfill:
1079 	end_area = area;
1080 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1081 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1082 		struct iopt_pages *pages = area->pages;
1083 
1084 		if (area == end_area)
1085 			break;
1086 		if (!pages)
1087 			continue;
1088 		mutex_lock(&pages->mutex);
1089 		if (iopt->next_domain_id == 0) {
1090 			interval_tree_remove(&area->pages_node,
1091 					     &pages->domains_itree);
1092 			area->storage_domain = NULL;
1093 		}
1094 		iopt_area_unfill_domain(area, pages, domain);
1095 		if (iopt_is_dmabuf(pages))
1096 			iopt_dmabuf_untrack_domain(pages, area, domain);
1097 		mutex_unlock(&pages->mutex);
1098 	}
1099 	return rc;
1100 }
1101 
1102 /* Check that all existing areas conform to an increased page size */
1103 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1104 				     unsigned long new_iova_alignment)
1105 {
1106 	unsigned long align_mask = new_iova_alignment - 1;
1107 	struct iopt_area *area;
1108 
1109 	lockdep_assert_held(&iopt->iova_rwsem);
1110 	lockdep_assert_held(&iopt->domains_rwsem);
1111 
1112 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1113 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1114 		if ((iopt_area_iova(area) & align_mask) ||
1115 		    (iopt_area_length(area) & align_mask) ||
1116 		    (area->page_offset & align_mask))
1117 			return -EADDRINUSE;
1118 
1119 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1120 		struct iommufd_access *access;
1121 		unsigned long index;
1122 
1123 		xa_for_each(&iopt->access_list, index, access)
1124 			if (WARN_ON(access->iova_alignment >
1125 				    new_iova_alignment))
1126 				return -EADDRINUSE;
1127 	}
1128 	return 0;
1129 }
1130 
1131 int iopt_table_add_domain(struct io_pagetable *iopt,
1132 			  struct iommu_domain *domain)
1133 {
1134 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1135 	struct iommu_domain *iter_domain;
1136 	unsigned int new_iova_alignment;
1137 	unsigned long index;
1138 	int rc;
1139 
1140 	down_write(&iopt->domains_rwsem);
1141 	down_write(&iopt->iova_rwsem);
1142 
1143 	xa_for_each(&iopt->domains, index, iter_domain) {
1144 		if (WARN_ON(iter_domain == domain)) {
1145 			rc = -EEXIST;
1146 			goto out_unlock;
1147 		}
1148 	}
1149 
1150 	/*
1151 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1152 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1153 	 * objects into the iommu_domain.
1154 	 *
1155 	 * An iommu_domain must always be able to accept PAGE_SIZE to be
1156 	 * compatible as we can't guarantee higher contiguity.
1157 	 */
1158 	new_iova_alignment = max_t(unsigned long,
1159 				   1UL << __ffs(domain->pgsize_bitmap),
1160 				   iopt->iova_alignment);
1161 	if (new_iova_alignment > PAGE_SIZE) {
1162 		rc = -EINVAL;
1163 		goto out_unlock;
1164 	}
1165 	if (new_iova_alignment != iopt->iova_alignment) {
1166 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1167 		if (rc)
1168 			goto out_unlock;
1169 	}
1170 
1171 	/* No area exists that is outside the allowed domain aperture */
1172 	if (geometry->aperture_start != 0) {
1173 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1174 				       domain);
1175 		if (rc)
1176 			goto out_reserved;
1177 	}
1178 	if (geometry->aperture_end != ULONG_MAX) {
1179 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1180 				       ULONG_MAX, domain);
1181 		if (rc)
1182 			goto out_reserved;
1183 	}
1184 
1185 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1186 	if (rc)
1187 		goto out_reserved;
1188 
1189 	rc = iopt_fill_domain(iopt, domain);
1190 	if (rc)
1191 		goto out_release;
1192 
1193 	iopt->iova_alignment = new_iova_alignment;
1194 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1195 	iopt->next_domain_id++;
1196 	up_write(&iopt->iova_rwsem);
1197 	up_write(&iopt->domains_rwsem);
1198 	return 0;
1199 out_release:
1200 	xa_release(&iopt->domains, iopt->next_domain_id);
1201 out_reserved:
1202 	__iopt_remove_reserved_iova(iopt, domain);
1203 out_unlock:
1204 	up_write(&iopt->iova_rwsem);
1205 	up_write(&iopt->domains_rwsem);
1206 	return rc;
1207 }
1208 
1209 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1210 {
1211 	unsigned long new_iova_alignment;
1212 	struct iommufd_access *access;
1213 	struct iommu_domain *domain;
1214 	unsigned long index;
1215 
1216 	lockdep_assert_held_write(&iopt->iova_rwsem);
1217 	lockdep_assert_held(&iopt->domains_rwsem);
1218 
1219 	/* See batch_iommu_map_small() */
1220 	if (iopt->disable_large_pages)
1221 		new_iova_alignment = PAGE_SIZE;
1222 	else
1223 		new_iova_alignment = 1;
1224 
1225 	xa_for_each(&iopt->domains, index, domain)
1226 		new_iova_alignment = max_t(unsigned long,
1227 					   1UL << __ffs(domain->pgsize_bitmap),
1228 					   new_iova_alignment);
1229 	xa_for_each(&iopt->access_list, index, access)
1230 		new_iova_alignment = max_t(unsigned long,
1231 					   access->iova_alignment,
1232 					   new_iova_alignment);
1233 
1234 	if (new_iova_alignment > iopt->iova_alignment) {
1235 		int rc;
1236 
1237 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1238 		if (rc)
1239 			return rc;
1240 	}
1241 	iopt->iova_alignment = new_iova_alignment;
1242 	return 0;
1243 }
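/*
 * A worked example of the calculation above: a domain whose pgsize_bitmap
 * is SZ_4K | SZ_2M contributes 1UL << __ffs(pgsize_bitmap) == 4K, and an
 * access registered with iova_alignment == 8 contributes 8, so the
 * resulting iopt->iova_alignment is 4K. The alignment is only raised after
 * iopt_check_iova_alignment() has proven every existing area still
 * conforms.
 */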
1244 
1245 void iopt_table_remove_domain(struct io_pagetable *iopt,
1246 			      struct iommu_domain *domain)
1247 {
1248 	struct iommu_domain *iter_domain = NULL;
1249 	unsigned long index;
1250 
1251 	down_write(&iopt->domains_rwsem);
1252 	down_write(&iopt->iova_rwsem);
1253 
1254 	xa_for_each(&iopt->domains, index, iter_domain)
1255 		if (iter_domain == domain)
1256 			break;
1257 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1258 		goto out_unlock;
1259 
1260 	/*
1261 	 * Compress the xarray to keep it linear by swapping the entry to erase
1262 	 * with the tail entry and shrinking the tail.
1263 	 */
1264 	iopt->next_domain_id--;
1265 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1266 	if (index != iopt->next_domain_id)
1267 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1268 
1269 	iopt_unfill_domain(iopt, domain);
1270 	__iopt_remove_reserved_iova(iopt, domain);
1271 
1272 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1273 out_unlock:
1274 	up_write(&iopt->iova_rwsem);
1275 	up_write(&iopt->domains_rwsem);
1276 }
1277 
1278 /**
1279  * iopt_area_split - Split an area into two parts at iova
1280  * @area: The area to split
1281  * @iova: Becomes the last of a new area
1282  *
1283  * This splits an area into two. It is part of the VFIO compatibility to allow
1284  * poking a hole in the mapping. The two areas continue to point at the same
1285  * iopt_pages, just with different starting bytes.
1286  */
1287 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1288 {
1289 	unsigned long alignment = area->iopt->iova_alignment;
1290 	unsigned long last_iova = iopt_area_last_iova(area);
1291 	unsigned long start_iova = iopt_area_iova(area);
1292 	unsigned long new_start = iova + 1;
1293 	struct io_pagetable *iopt = area->iopt;
1294 	struct iopt_pages *pages = area->pages;
1295 	struct iopt_area *lhs;
1296 	struct iopt_area *rhs;
1297 	int rc;
1298 
1299 	lockdep_assert_held_write(&iopt->iova_rwsem);
1300 
1301 	if (iova == start_iova || iova == last_iova)
1302 		return 0;
1303 
1304 	if (!pages || area->prevent_access)
1305 		return -EBUSY;
1306 
1307 	/* Maintaining the domains_itree below is a bit complicated */
1308 	if (iopt_is_dmabuf(pages))
1309 		return -EOPNOTSUPP;
1310 
1311 	if (new_start & (alignment - 1) ||
1312 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1313 		return -EINVAL;
1314 
1315 	lhs = iopt_area_alloc();
1316 	if (!lhs)
1317 		return -ENOMEM;
1318 
1319 	rhs = iopt_area_alloc();
1320 	if (!rhs) {
1321 		rc = -ENOMEM;
1322 		goto err_free_lhs;
1323 	}
1324 
1325 	mutex_lock(&pages->mutex);
1326 	/*
1327 	 * Splitting is not permitted if an access exists; we don't track enough
1328 	 * information to split existing accesses.
1329 	 */
1330 	if (area->num_accesses) {
1331 		rc = -EINVAL;
1332 		goto err_unlock;
1333 	}
1334 
1335 	/*
1336 	 * Splitting is not permitted if a domain could have been mapped with
1337 	 * huge pages.
1338 	 */
1339 	if (area->storage_domain && !iopt->disable_large_pages) {
1340 		rc = -EINVAL;
1341 		goto err_unlock;
1342 	}
1343 
1344 	interval_tree_remove(&area->node, &iopt->area_itree);
1345 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1346 			      iopt_area_start_byte(area, start_iova),
1347 			      (new_start - 1) - start_iova + 1,
1348 			      area->iommu_prot);
1349 	if (WARN_ON(rc))
1350 		goto err_insert;
1351 
1352 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1353 			      iopt_area_start_byte(area, new_start),
1354 			      last_iova - new_start + 1, area->iommu_prot);
1355 	if (WARN_ON(rc))
1356 		goto err_remove_lhs;
1357 
1358 	/*
1359 	 * If the original area has filled a domain, domains_itree has to be
1360 	 * updated.
1361 	 */
1362 	if (area->storage_domain) {
1363 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1364 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1365 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1366 	}
1367 
1368 	lhs->storage_domain = area->storage_domain;
1369 	lhs->pages = area->pages;
1370 	rhs->storage_domain = area->storage_domain;
1371 	rhs->pages = area->pages;
1372 	kref_get(&rhs->pages->kref);
1373 	kfree(area);
1374 	mutex_unlock(&pages->mutex);
1375 
1376 	/*
1377 	 * No change to domains or accesses because the pages have not been
1378 	 * changed.
1379 	 */
1380 	return 0;
1381 
1382 err_remove_lhs:
1383 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1384 err_insert:
1385 	interval_tree_insert(&area->node, &iopt->area_itree);
1386 err_unlock:
1387 	mutex_unlock(&pages->mutex);
1388 	kfree(rhs);
1389 err_free_lhs:
1390 	kfree(lhs);
1391 	return rc;
1392 }
1393 
1394 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1395 		  size_t num_iovas)
1396 {
1397 	int rc = 0;
1398 	int i;
1399 
1400 	down_write(&iopt->iova_rwsem);
1401 	for (i = 0; i < num_iovas; i++) {
1402 		struct iopt_area *area;
1403 
1404 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1405 		if (!area)
1406 			continue;
1407 		rc = iopt_area_split(area, iovas[i]);
1408 		if (rc)
1409 			break;
1410 	}
1411 	up_write(&iopt->iova_rwsem);
1412 	return rc;
1413 }
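/*
 * A short example of the cut semantics: if an area covers IOVA
 * [0x0, 0x3fff] and iovas[] contains 0x1fff, the area is split into
 * [0x0, 0x1fff] and [0x2000, 0x3fff], both still referencing the same
 * iopt_pages. This is what allows the VFIO compatibility path to later
 * unmap just one of the halves.
 */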
1414 
1415 void iopt_enable_large_pages(struct io_pagetable *iopt)
1416 {
1417 	int rc;
1418 
1419 	down_write(&iopt->domains_rwsem);
1420 	down_write(&iopt->iova_rwsem);
1421 	WRITE_ONCE(iopt->disable_large_pages, false);
1422 	rc = iopt_calculate_iova_alignment(iopt);
1423 	WARN_ON(rc);
1424 	up_write(&iopt->iova_rwsem);
1425 	up_write(&iopt->domains_rwsem);
1426 }
1427 
1428 int iopt_disable_large_pages(struct io_pagetable *iopt)
1429 {
1430 	int rc = 0;
1431 
1432 	down_write(&iopt->domains_rwsem);
1433 	down_write(&iopt->iova_rwsem);
1434 	if (iopt->disable_large_pages)
1435 		goto out_unlock;
1436 
1437 	/* Won't do it if domains already have pages mapped in them */
1438 	if (!xa_empty(&iopt->domains) &&
1439 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1440 		rc = -EINVAL;
1441 		goto out_unlock;
1442 	}
1443 
1444 	WRITE_ONCE(iopt->disable_large_pages, true);
1445 	rc = iopt_calculate_iova_alignment(iopt);
1446 	if (rc)
1447 		WRITE_ONCE(iopt->disable_large_pages, false);
1448 out_unlock:
1449 	up_write(&iopt->iova_rwsem);
1450 	up_write(&iopt->domains_rwsem);
1451 	return rc;
1452 }
1453 
1454 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1455 {
1456 	u32 new_id;
1457 	int rc;
1458 
1459 	down_write(&iopt->domains_rwsem);
1460 	down_write(&iopt->iova_rwsem);
1461 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1462 		      GFP_KERNEL_ACCOUNT);
1463 
1464 	if (rc)
1465 		goto out_unlock;
1466 
1467 	rc = iopt_calculate_iova_alignment(iopt);
1468 	if (rc) {
1469 		xa_erase(&iopt->access_list, new_id);
1470 		goto out_unlock;
1471 	}
1472 	access->iopt_access_list_id = new_id;
1473 
1474 out_unlock:
1475 	up_write(&iopt->iova_rwsem);
1476 	up_write(&iopt->domains_rwsem);
1477 	return rc;
1478 }
1479 
1480 void iopt_remove_access(struct io_pagetable *iopt,
1481 			struct iommufd_access *access, u32 iopt_access_list_id)
1482 {
1483 	down_write(&iopt->domains_rwsem);
1484 	down_write(&iopt->iova_rwsem);
1485 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1486 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1487 	up_write(&iopt->iova_rwsem);
1488 	up_write(&iopt->domains_rwsem);
1489 }
1490 
1491 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1492 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1493 					struct device *dev,
1494 					phys_addr_t *sw_msi_start)
1495 {
1496 	struct iommu_resv_region *resv;
1497 	LIST_HEAD(resv_regions);
1498 	unsigned int num_hw_msi = 0;
1499 	unsigned int num_sw_msi = 0;
1500 	int rc;
1501 
1502 	if (iommufd_should_fail())
1503 		return -EINVAL;
1504 
1505 	down_write(&iopt->iova_rwsem);
1506 	/* FIXME: drivers allocate memory but there is no failure propagated */
1507 	iommu_get_resv_regions(dev, &resv_regions);
1508 
1509 	list_for_each_entry(resv, &resv_regions, list) {
1510 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1511 			continue;
1512 
1513 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1514 			num_hw_msi++;
1515 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1516 			*sw_msi_start = resv->start;
1517 			num_sw_msi++;
1518 		}
1519 
1520 		rc = iopt_reserve_iova(iopt, resv->start,
1521 				       resv->length - 1 + resv->start, dev);
1522 		if (rc)
1523 			goto out_reserved;
1524 	}
1525 
1526 	/* Drivers must offer sane combinations of regions */
1527 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1528 		rc = -EINVAL;
1529 		goto out_reserved;
1530 	}
1531 
1532 	rc = 0;
1533 	goto out_free_resv;
1534 
1535 out_reserved:
1536 	__iopt_remove_reserved_iova(iopt, dev);
1537 out_free_resv:
1538 	iommu_put_resv_regions(dev, &resv_regions);
1539 	up_write(&iopt->iova_rwsem);
1540 	return rc;
1541 }
1542