xref: /linux/drivers/iommu/iommufd/io_pagetable.c (revision 056daec2925dc200b22c30419bc7b9e01f7843c4)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top-level data structure that maps IOVAs to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The data structure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and the xarray.
10  */
11 #include <linux/dma-buf.h>
12 #include <linux/err.h>
13 #include <linux/errno.h>
14 #include <linux/file.h>
15 #include <linux/iommu.h>
16 #include <linux/iommufd.h>
17 #include <linux/lockdep.h>
18 #include <linux/sched/mm.h>
19 #include <linux/slab.h>
20 #include <uapi/linux/iommufd.h>
21 
22 #include "double_span.h"
23 #include "io_pagetable.h"
24 
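/*
 * Describes one slice of an iopt_pages while a map or get request is being
 * assembled (descriptive note, inferred from the users below):
 * iopt_alloc_area_pages() allocates ->area for every element and
 * iopt_map_pages() moves the ->pages and ->area references into the
 * io_pagetable once all elements have been inserted.
 */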
25 struct iopt_pages_list {
26 	struct iopt_pages *pages;
27 	struct iopt_area *area;
28 	struct list_head next;
29 	unsigned long start_byte;
30 	unsigned long length;
31 };
32 
33 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
34 					struct io_pagetable *iopt,
35 					unsigned long iova,
36 					unsigned long last_iova)
37 {
38 	lockdep_assert_held(&iopt->iova_rwsem);
39 
40 	iter->cur_iova = iova;
41 	iter->last_iova = last_iova;
42 	iter->area = iopt_area_iter_first(iopt, iova, iova);
43 	if (!iter->area)
44 		return NULL;
45 	if (!iter->area->pages) {
46 		iter->area = NULL;
47 		return NULL;
48 	}
49 	return iter->area;
50 }
51 
52 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
53 {
54 	unsigned long last_iova;
55 
56 	if (!iter->area)
57 		return NULL;
58 	last_iova = iopt_area_last_iova(iter->area);
59 	if (iter->last_iova <= last_iova)
60 		return NULL;
61 
62 	iter->cur_iova = last_iova + 1;
63 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
64 					 iter->last_iova);
65 	if (!iter->area)
66 		return NULL;
67 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
68 	    !iter->area->pages) {
69 		iter->area = NULL;
70 		return NULL;
71 	}
72 	return iter->area;
73 }
74 
75 static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
76 				     unsigned long length,
77 				     unsigned long iova_alignment,
78 				     unsigned long page_offset)
79 {
80 	unsigned long aligned_start;
81 
82 	/* ALIGN_UP() */
83 	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
84 		return false;
85 	aligned_start &= ~(iova_alignment - 1);
86 	aligned_start |= page_offset;
87 
88 	if (aligned_start >= last || last - aligned_start < length - 1)
89 		return false;
90 	*start = aligned_start;
91 	return true;
92 }
93 
94 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
95 				    unsigned long length,
96 				    unsigned long iova_alignment,
97 				    unsigned long page_offset)
98 {
99 	if (span->is_used)
100 		return false;
101 	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
102 					length, iova_alignment, page_offset);
103 }
104 
105 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
106 				    unsigned long length,
107 				    unsigned long iova_alignment,
108 				    unsigned long page_offset)
109 {
110 	if (span->is_hole)
111 		return false;
112 	return __alloc_iova_check_range(&span->start_used, span->last_used,
113 					length, iova_alignment, page_offset);
114 }
115 
116 /*
117  * Automatically find a block of IOVA that is not being used and not reserved.
118  * Does not return a 0 IOVA even if it is valid.
119  */
120 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
121 			   unsigned long addr, unsigned long length)
122 {
123 	unsigned long page_offset = addr % PAGE_SIZE;
124 	struct interval_tree_double_span_iter used_span;
125 	struct interval_tree_span_iter allowed_span;
126 	unsigned long max_alignment = PAGE_SIZE;
127 	unsigned long iova_alignment;
128 
129 	lockdep_assert_held(&iopt->iova_rwsem);
130 
131 	/* Protect roundup_pow_of_two() from overflow */
132 	if (length == 0 || length >= ULONG_MAX / 2)
133 		return -EOVERFLOW;
134 
135 	/*
136 	 * Keep alignment present in addr when building the IOVA, which
137 	 * increases the chance we can map a THP.
138 	 */
139 	if (!addr)
140 		iova_alignment = roundup_pow_of_two(length);
141 	else
142 		iova_alignment = min_t(unsigned long,
143 				       roundup_pow_of_two(length),
144 				       1UL << __ffs64(addr));
145 
146 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
147 	max_alignment = HPAGE_SIZE;
148 #endif
149 	/* Protect against ALIGN() overflow */
150 	if (iova_alignment >= max_alignment)
151 		iova_alignment = max_alignment;
152 
153 	if (iova_alignment < iopt->iova_alignment)
154 		return -EINVAL;
155 
156 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
157 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
158 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
159 			allowed_span.start_used = PAGE_SIZE;
160 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
161 			allowed_span.is_hole = false;
162 		}
163 
164 		if (!__alloc_iova_check_used(&allowed_span, length,
165 					     iova_alignment, page_offset))
166 			continue;
167 
168 		interval_tree_for_each_double_span(
169 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
170 			allowed_span.start_used, allowed_span.last_used) {
171 			if (!__alloc_iova_check_hole(&used_span, length,
172 						     iova_alignment,
173 						     page_offset))
174 				continue;
175 
176 			*iova = used_span.start_hole;
177 			return 0;
178 		}
179 	}
180 	return -ENOSPC;
181 }
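
/*
 * Illustrative example of the alignment heuristic above (invented values):
 * mapping a 2MiB-aligned user address with a 2MiB length gives
 * roundup_pow_of_two(length) == 2MiB and 1UL << __ffs64(addr) >= 2MiB, so
 * iova_alignment is min(2MiB, HPAGE_SIZE) when CONFIG_TRANSPARENT_HUGEPAGE is
 * enabled (2MiB on x86). The allocated IOVA keeps that alignment, which lets
 * a THP backing the buffer be mapped with a single huge IOPTE when the domain
 * supports that page size.
 */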
182 
183 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
184 			   unsigned long length)
185 {
186 	unsigned long last;
187 
188 	lockdep_assert_held(&iopt->iova_rwsem);
189 
190 	if ((iova & (iopt->iova_alignment - 1)))
191 		return -EINVAL;
192 
193 	if (check_add_overflow(iova, length - 1, &last))
194 		return -EOVERFLOW;
195 
196 	/* No reserved IOVA intersects the range */
197 	if (iopt_reserved_iter_first(iopt, iova, last))
198 		return -EINVAL;
199 
200 	/* Check that there is not already a mapping in the range */
201 	if (iopt_area_iter_first(iopt, iova, last))
202 		return -EEXIST;
203 	return 0;
204 }
205 
206 /*
207  * The area takes a slice of the pages from start_byte to start_byte + length
208  */
209 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
210 			    struct iopt_pages *pages, unsigned long iova,
211 			    unsigned long start_byte, unsigned long length,
212 			    int iommu_prot)
213 {
214 	lockdep_assert_held_write(&iopt->iova_rwsem);
215 
216 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
217 		return -EPERM;
218 
219 	area->iommu_prot = iommu_prot;
220 	area->page_offset = start_byte % PAGE_SIZE;
221 	if (area->page_offset & (iopt->iova_alignment - 1))
222 		return -EINVAL;
223 
224 	area->node.start = iova;
225 	if (check_add_overflow(iova, length - 1, &area->node.last))
226 		return -EOVERFLOW;
227 
228 	area->pages_node.start = start_byte / PAGE_SIZE;
229 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
230 		return -EOVERFLOW;
231 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
232 	if (WARN_ON(area->pages_node.last >= pages->npages))
233 		return -EOVERFLOW;
234 
235 	/*
236 	 * The area is inserted with a NULL pages indicating it is not fully
237 	 * initialized yet.
238 	 */
239 	area->iopt = iopt;
240 	interval_tree_insert(&area->node, &iopt->area_itree);
241 	return 0;
242 }
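
/*
 * Worked example of the index math above (illustrative values only): with
 * PAGE_SIZE == 4096, start_byte == 0x1800 and length == 0x2000 the area
 * covers bytes 0x1800..0x37ff of the iopt_pages, so pages_node spans page
 * indexes 1..3 and page_offset is 0x800 (assuming iova_alignment permits a
 * sub-page offset).
 */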
243 
244 static struct iopt_area *iopt_area_alloc(void)
245 {
246 	struct iopt_area *area;
247 
248 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
249 	if (!area)
250 		return NULL;
251 	RB_CLEAR_NODE(&area->node.rb);
252 	RB_CLEAR_NODE(&area->pages_node.rb);
253 	return area;
254 }
255 
256 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
257 				 struct list_head *pages_list,
258 				 unsigned long length, unsigned long *dst_iova,
259 				 int iommu_prot, unsigned int flags)
260 {
261 	struct iopt_pages_list *elm;
262 	unsigned long start;
263 	unsigned long iova;
264 	int rc = 0;
265 
266 	list_for_each_entry(elm, pages_list, next) {
267 		elm->area = iopt_area_alloc();
268 		if (!elm->area)
269 			return -ENOMEM;
270 	}
271 
272 	down_write(&iopt->iova_rwsem);
273 	if ((length & (iopt->iova_alignment - 1)) || !length) {
274 		rc = -EINVAL;
275 		goto out_unlock;
276 	}
277 
278 	if (flags & IOPT_ALLOC_IOVA) {
279 		/* Use the first entry to guess the ideal IOVA alignment */
280 		elm = list_first_entry(pages_list, struct iopt_pages_list,
281 				       next);
282 		switch (elm->pages->type) {
283 		case IOPT_ADDRESS_USER:
284 			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
285 			break;
286 		case IOPT_ADDRESS_FILE:
287 			start = elm->start_byte + elm->pages->start;
288 			break;
289 		case IOPT_ADDRESS_DMABUF:
290 			start = elm->start_byte + elm->pages->dmabuf.start;
291 			break;
292 		}
293 		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
294 		if (rc)
295 			goto out_unlock;
296 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
297 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
298 			rc = -EINVAL;
299 			goto out_unlock;
300 		}
301 	} else {
302 		rc = iopt_check_iova(iopt, *dst_iova, length);
303 		if (rc)
304 			goto out_unlock;
305 	}
306 
307 	/*
308 	 * Areas are created with a NULL pages so that the IOVA space is
309 	 * reserved and we can unlock the iova_rwsem.
310 	 */
311 	iova = *dst_iova;
312 	list_for_each_entry(elm, pages_list, next) {
313 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
314 				      elm->start_byte, elm->length, iommu_prot);
315 		if (rc)
316 			goto out_unlock;
317 		iova += elm->length;
318 	}
319 
320 out_unlock:
321 	up_write(&iopt->iova_rwsem);
322 	return rc;
323 }
324 
325 static void iopt_abort_area(struct iopt_area *area)
326 {
327 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
328 		WARN_ON(area->pages);
329 	if (area->iopt) {
330 		down_write(&area->iopt->iova_rwsem);
331 		interval_tree_remove(&area->node, &area->iopt->area_itree);
332 		up_write(&area->iopt->iova_rwsem);
333 	}
334 	kfree(area);
335 }
336 
337 void iopt_free_pages_list(struct list_head *pages_list)
338 {
339 	struct iopt_pages_list *elm;
340 
341 	while ((elm = list_first_entry_or_null(pages_list,
342 					       struct iopt_pages_list, next))) {
343 		if (elm->area)
344 			iopt_abort_area(elm->area);
345 		if (elm->pages)
346 			iopt_put_pages(elm->pages);
347 		list_del(&elm->next);
348 		kfree(elm);
349 	}
350 }
351 
352 static int iopt_fill_domains_pages(struct list_head *pages_list)
353 {
354 	struct iopt_pages_list *undo_elm;
355 	struct iopt_pages_list *elm;
356 	int rc;
357 
358 	list_for_each_entry(elm, pages_list, next) {
359 		rc = iopt_area_fill_domains(elm->area, elm->pages);
360 		if (rc)
361 			goto err_undo;
362 	}
363 	return 0;
364 
365 err_undo:
366 	list_for_each_entry(undo_elm, pages_list, next) {
367 		if (undo_elm == elm)
368 			break;
369 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
370 	}
371 	return rc;
372 }
373 
374 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
375 		   unsigned long length, unsigned long *dst_iova,
376 		   int iommu_prot, unsigned int flags)
377 {
378 	struct iopt_pages_list *elm;
379 	int rc;
380 
381 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
382 				   iommu_prot, flags);
383 	if (rc)
384 		return rc;
385 
386 	down_read(&iopt->domains_rwsem);
387 	rc = iopt_fill_domains_pages(pages_list);
388 	if (rc)
389 		goto out_unlock_domains;
390 
391 	down_write(&iopt->iova_rwsem);
392 	list_for_each_entry(elm, pages_list, next) {
393 		/*
394 		 * area->pages must be set inside the domains_rwsem to ensure
395 		 * any newly added domains will get filled. Moves the reference
396 		 * in from the list.
397 		 */
398 		elm->area->pages = elm->pages;
399 		elm->pages = NULL;
400 		elm->area = NULL;
401 	}
402 	up_write(&iopt->iova_rwsem);
403 out_unlock_domains:
404 	up_read(&iopt->domains_rwsem);
405 	return rc;
406 }
407 
408 static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
409 			   struct iopt_pages *pages, unsigned long *iova,
410 			   unsigned long length, unsigned long start_byte,
411 			   int iommu_prot, unsigned int flags)
412 {
413 	struct iopt_pages_list elm = {};
414 	LIST_HEAD(pages_list);
415 	int rc;
416 
417 	elm.pages = pages;
418 	elm.start_byte = start_byte;
419 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
420 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
421 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
422 	elm.length = length;
423 	list_add(&elm.next, &pages_list);
424 
425 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
426 	if (rc) {
427 		if (elm.area)
428 			iopt_abort_area(elm.area);
429 		if (elm.pages)
430 			iopt_put_pages(elm.pages);
431 		return rc;
432 	}
433 	return 0;
434 }
435 
436 /**
437  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
438  * @ictx: iommufd_ctx the iopt is part of
439  * @iopt: io_pagetable to act on
440  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
441  *        the chosen iova on output. Otherwise it is the iova to map to on input
442  * @uptr: User VA to map
443  * @length: Number of bytes to map
444  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
445  * @flags: IOPT_ALLOC_IOVA or zero
446  *
447  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
448  * page tables this will pin the pages and load them into the domain at iova.
449  * For non-domain page tables this will only set up a lazy reference and the
450  * caller must use iopt_access_pages() to touch them.
451  *
452  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
453  * destroyed.
454  */
455 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
456 			unsigned long *iova, void __user *uptr,
457 			unsigned long length, int iommu_prot,
458 			unsigned int flags)
459 {
460 	struct iopt_pages *pages;
461 
462 	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
463 	if (IS_ERR(pages))
464 		return PTR_ERR(pages);
465 
466 	return iopt_map_common(ictx, iopt, pages, iova, length,
467 			       uptr - pages->uptr, iommu_prot, flags);
468 }
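
/*
 * Minimal sketch of a hypothetical caller, based only on the signatures in
 * this file; ictx, iopt, uptr and length are assumed to come from elsewhere:
 *
 *	unsigned long iova, unmapped;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	... DMA through an attached domain or an access ...
 *	rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 */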
469 
470 /**
471  * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
472  * @ictx: iommufd_ctx the iopt is part of
473  * @iopt: io_pagetable to act on
474  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
475  *        the chosen iova on output. Otherwise it is the iova to map to on input
476  * @fd: fdno of a file to map
477  * @start: map file starting at this byte offset
478  * @length: Number of bytes to map
479  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
480  * @flags: IOPT_ALLOC_IOVA or zero
481  */
482 int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
483 			unsigned long *iova, int fd, unsigned long start,
484 			unsigned long length, int iommu_prot,
485 			unsigned int flags)
486 {
487 	struct iopt_pages *pages;
488 	struct dma_buf *dmabuf;
489 	unsigned long start_byte;
490 	unsigned long last;
491 
492 	if (!length)
493 		return -EINVAL;
494 	if (check_add_overflow(start, length - 1, &last))
495 		return -EOVERFLOW;
496 
497 	start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
498 	dmabuf = dma_buf_get(fd);
499 	if (!IS_ERR(dmabuf)) {
500 		pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
501 						length,
502 						iommu_prot & IOMMU_WRITE);
503 		if (IS_ERR(pages)) {
504 			dma_buf_put(dmabuf);
505 			return PTR_ERR(pages);
506 		}
507 	} else {
508 		struct file *file;
509 
510 		file = fget(fd);
511 		if (!file)
512 			return -EBADF;
513 
514 		pages = iopt_alloc_file_pages(file, start_byte, start, length,
515 					      iommu_prot & IOMMU_WRITE);
516 		fput(file);
517 		if (IS_ERR(pages))
518 			return PTR_ERR(pages);
519 	}
520 
521 	return iopt_map_common(ictx, iopt, pages, iova, length,
522 			       start_byte, iommu_prot, flags);
523 }
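
/*
 * Note on the fd handling above: dma_buf_get() is tried first and a plain
 * fget() is used only when the fd does not refer to a dma-buf. As a worked
 * example of the offset math, start == 0x1234 gives start_byte == 0x234,
 * i.e. the offset of start within its page.
 */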
524 
525 struct iova_bitmap_fn_arg {
526 	unsigned long flags;
527 	struct io_pagetable *iopt;
528 	struct iommu_domain *domain;
529 	struct iommu_dirty_bitmap *dirty;
530 };
531 
532 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
533 					unsigned long iova, size_t length,
534 					void *opaque)
535 {
536 	struct iopt_area *area;
537 	struct iopt_area_contig_iter iter;
538 	struct iova_bitmap_fn_arg *arg = opaque;
539 	struct iommu_domain *domain = arg->domain;
540 	struct iommu_dirty_bitmap *dirty = arg->dirty;
541 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
542 	unsigned long last_iova = iova + length - 1;
543 	unsigned long flags = arg->flags;
544 	int ret;
545 
546 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
547 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
548 
549 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
550 						last - iter.cur_iova + 1, flags,
551 						dirty);
552 		if (ret)
553 			return ret;
554 	}
555 
556 	if (!iopt_area_contig_done(&iter))
557 		return -EINVAL;
558 	return 0;
559 }
560 
561 static int
562 iommu_read_and_clear_dirty(struct iommu_domain *domain,
563 			   struct io_pagetable *iopt, unsigned long flags,
564 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
565 {
566 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
567 	struct iommu_iotlb_gather gather;
568 	struct iommu_dirty_bitmap dirty;
569 	struct iova_bitmap_fn_arg arg;
570 	struct iova_bitmap *iter;
571 	int ret = 0;
572 
573 	if (!ops || !ops->read_and_clear_dirty)
574 		return -EOPNOTSUPP;
575 
576 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
577 				 bitmap->page_size,
578 				 u64_to_user_ptr(bitmap->data));
579 	if (IS_ERR(iter))
580 		return -ENOMEM;
581 
582 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
583 
584 	arg.flags = flags;
585 	arg.iopt = iopt;
586 	arg.domain = domain;
587 	arg.dirty = &dirty;
588 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
589 
590 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
591 		iommu_iotlb_sync(domain, &gather);
592 
593 	iova_bitmap_free(iter);
594 
595 	return ret;
596 }
597 
598 int iommufd_check_iova_range(struct io_pagetable *iopt,
599 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
600 {
601 	size_t iommu_pgsize = iopt->iova_alignment;
602 	u64 last_iova;
603 
604 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
605 		return -EOVERFLOW;
606 
607 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
608 		return -EOVERFLOW;
609 
610 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
611 	    ((last_iova + 1) & (iommu_pgsize - 1)))
612 		return -EINVAL;
613 
614 	if (!bitmap->page_size)
615 		return -EINVAL;
616 
617 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
618 	    ((last_iova + 1) & (bitmap->page_size - 1)))
619 		return -EINVAL;
620 
621 	return 0;
622 }
623 
624 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
625 				   struct iommu_domain *domain,
626 				   unsigned long flags,
627 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
628 {
629 	int ret;
630 
631 	ret = iommufd_check_iova_range(iopt, bitmap);
632 	if (ret)
633 		return ret;
634 
635 	down_read(&iopt->iova_rwsem);
636 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
637 	up_read(&iopt->iova_rwsem);
638 
639 	return ret;
640 }
641 
642 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
643 				 struct iommu_domain *domain)
644 {
645 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
646 	struct iommu_iotlb_gather gather;
647 	struct iommu_dirty_bitmap dirty;
648 	struct iopt_area *area;
649 	int ret = 0;
650 
651 	lockdep_assert_held_read(&iopt->iova_rwsem);
652 
653 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
654 
655 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
656 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
657 		if (!area->pages)
658 			continue;
659 
660 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
661 						iopt_area_length(area), 0,
662 						&dirty);
663 		if (ret)
664 			break;
665 	}
666 
667 	iommu_iotlb_sync(domain, &gather);
668 	return ret;
669 }
670 
671 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
672 			    struct iommu_domain *domain, bool enable)
673 {
674 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
675 	int ret = 0;
676 
677 	if (!ops)
678 		return -EOPNOTSUPP;
679 
680 	down_read(&iopt->iova_rwsem);
681 
682 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
683 	if (enable) {
684 		ret = iopt_clear_dirty_data(iopt, domain);
685 		if (ret)
686 			goto out_unlock;
687 	}
688 
689 	ret = ops->set_dirty_tracking(domain, enable);
690 
691 out_unlock:
692 	up_read(&iopt->iova_rwsem);
693 	return ret;
694 }
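
/*
 * Sketch of how the two dirty tracking entry points in this file could be
 * used together by a hypothetical caller; only the bitmap fields this file
 * itself reads (iova, length, page_size, data) are filled in, and domain,
 * iopt and user_bitmap are assumed to exist:
 *
 *	struct iommu_hwpt_get_dirty_bitmap bitmap = {
 *		.iova = iova,
 *		.length = length,
 *		.page_size = PAGE_SIZE,
 *		.data = (uintptr_t)user_bitmap,
 *	};
 *
 *	rc = iopt_set_dirty_tracking(iopt, domain, true);
 *	...
 *	rc = iopt_read_and_clear_dirty_data(iopt, domain, 0, &bitmap);
 */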
695 
696 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
697 		   unsigned long length, struct list_head *pages_list)
698 {
699 	struct iopt_area_contig_iter iter;
700 	unsigned long last_iova;
701 	struct iopt_area *area;
702 	int rc;
703 
704 	if (!length)
705 		return -EINVAL;
706 	if (check_add_overflow(iova, length - 1, &last_iova))
707 		return -EOVERFLOW;
708 
709 	down_read(&iopt->iova_rwsem);
710 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
711 		struct iopt_pages_list *elm;
712 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
713 
714 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
715 		if (!elm) {
716 			rc = -ENOMEM;
717 			goto err_free;
718 		}
719 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
720 		elm->pages = area->pages;
721 		elm->length = (last - iter.cur_iova) + 1;
722 		kref_get(&elm->pages->kref);
723 		list_add_tail(&elm->next, pages_list);
724 	}
725 	if (!iopt_area_contig_done(&iter)) {
726 		rc = -ENOENT;
727 		goto err_free;
728 	}
729 	up_read(&iopt->iova_rwsem);
730 	return 0;
731 err_free:
732 	up_read(&iopt->iova_rwsem);
733 	iopt_free_pages_list(pages_list);
734 	return rc;
735 }
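
/*
 * Sketch of a hypothetical user of iopt_get_pages() (illustrative only). The
 * returned list holds a reference on each iopt_pages, so the caller either
 * hands the list to iopt_map_pages() or releases it with
 * iopt_free_pages_list():
 *
 *	LIST_HEAD(pages_list);
 *
 *	rc = iopt_get_pages(iopt, iova, length, &pages_list);
 *	if (rc)
 *		return rc;
 *	...
 *	iopt_free_pages_list(&pages_list);
 */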
736 
737 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
738 				 unsigned long last, unsigned long *unmapped)
739 {
740 	struct iopt_area *area;
741 	unsigned long unmapped_bytes = 0;
742 	unsigned int tries = 0;
743 	/* If there are no mapped entries then success */
744 	int rc = 0;
745 
746 	/*
747 	 * The domains_rwsem must be held in read mode any time any area->pages
748 	 * is NULL. This prevents domain attach/detach from running
749 	 * concurrently with cleaning up the area.
750 	 */
751 again:
752 	down_read(&iopt->domains_rwsem);
753 	down_write(&iopt->iova_rwsem);
754 	while ((area = iopt_area_iter_first(iopt, start, last))) {
755 		unsigned long area_last = iopt_area_last_iova(area);
756 		unsigned long area_first = iopt_area_iova(area);
757 		struct iopt_pages *pages;
758 
759 		/* Userspace should not race map/unmap's of the same area */
760 		if (!area->pages) {
761 			rc = -EBUSY;
762 			goto out_unlock_iova;
763 		}
764 
765 		/* The area is locked by an object that has not been destroyed */
766 		if (area->num_locks) {
767 			rc = -EBUSY;
768 			goto out_unlock_iova;
769 		}
770 
771 		if (area_first < start || area_last > last) {
772 			rc = -ENOENT;
773 			goto out_unlock_iova;
774 		}
775 
776 		if (area_first != start)
777 			tries = 0;
778 
779 		/*
780 		 * num_accesses writers must hold the iova_rwsem too, so we can
781 		 * safely read it under the write side of the iova_rwsem
782 		 * without the pages->mutex.
783 		 */
784 		if (area->num_accesses) {
785 			size_t length = iopt_area_length(area);
786 
787 			start = area_first;
788 			area->prevent_access = true;
789 			up_write(&iopt->iova_rwsem);
790 			up_read(&iopt->domains_rwsem);
791 
792 			iommufd_access_notify_unmap(iopt, area_first, length);
793 			/* Something is not responding to unmap requests. */
794 			tries++;
795 			if (WARN_ON(tries > 100)) {
796 				rc = -EDEADLOCK;
797 				goto out_unmapped;
798 			}
799 			goto again;
800 		}
801 
802 		pages = area->pages;
803 		area->pages = NULL;
804 		up_write(&iopt->iova_rwsem);
805 
806 		iopt_area_unfill_domains(area, pages);
807 		iopt_abort_area(area);
808 		iopt_put_pages(pages);
809 
810 		unmapped_bytes += area_last - area_first + 1;
811 
812 		down_write(&iopt->iova_rwsem);
813 	}
814 
815 out_unlock_iova:
816 	up_write(&iopt->iova_rwsem);
817 	up_read(&iopt->domains_rwsem);
818 out_unmapped:
819 	if (unmapped)
820 		*unmapped = unmapped_bytes;
821 	return rc;
822 }
823 
824 /**
825  * iopt_unmap_iova() - Remove a range of iova
826  * @iopt: io_pagetable to act on
827  * @iova: Starting iova to unmap
828  * @length: Number of bytes to unmap
829  * @unmapped: Return number of bytes unmapped
830  *
831  * The requested range must be a superset of existing ranges.
832  * Splitting/truncating IOVA mappings is not allowed.
833  */
834 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
835 		    unsigned long length, unsigned long *unmapped)
836 {
837 	unsigned long iova_last;
838 
839 	if (!length)
840 		return -EINVAL;
841 
842 	if (check_add_overflow(iova, length - 1, &iova_last))
843 		return -EOVERFLOW;
844 
845 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
846 }
847 
848 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
849 {
850 	/* If the IOVAs are empty then unmap all succeeds */
851 	return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
852 }
853 
854 /* The caller must always free all the nodes in the allowed_iova rb_root. */
855 int iopt_set_allow_iova(struct io_pagetable *iopt,
856 			struct rb_root_cached *allowed_iova)
857 {
858 	struct iopt_allowed *allowed;
859 
860 	down_write(&iopt->iova_rwsem);
861 	swap(*allowed_iova, iopt->allowed_itree);
862 
863 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
864 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
865 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
866 					     allowed->node.last)) {
867 			swap(*allowed_iova, iopt->allowed_itree);
868 			up_write(&iopt->iova_rwsem);
869 			return -EADDRINUSE;
870 		}
871 	}
872 	up_write(&iopt->iova_rwsem);
873 	return 0;
874 }
875 
876 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
877 		      unsigned long last, void *owner)
878 {
879 	struct iopt_reserved *reserved;
880 
881 	lockdep_assert_held_write(&iopt->iova_rwsem);
882 
883 	if (iopt_area_iter_first(iopt, start, last) ||
884 	    iopt_allowed_iter_first(iopt, start, last))
885 		return -EADDRINUSE;
886 
887 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
888 	if (!reserved)
889 		return -ENOMEM;
890 	reserved->node.start = start;
891 	reserved->node.last = last;
892 	reserved->owner = owner;
893 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
894 	return 0;
895 }
896 
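/*
 * Reserved ranges are keyed by @owner (the callers in this file pass a device
 * or an iommu_domain), which lets the helper below drop every range belonging
 * to one owner in a single pass. (Descriptive note based on the callers in
 * this file.)
 */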
897 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
898 {
899 	struct iopt_reserved *reserved, *next;
900 
901 	lockdep_assert_held_write(&iopt->iova_rwsem);
902 
903 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
904 	     reserved = next) {
905 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
906 
907 		if (reserved->owner == owner) {
908 			interval_tree_remove(&reserved->node,
909 					     &iopt->reserved_itree);
910 			kfree(reserved);
911 		}
912 	}
913 }
914 
915 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
916 {
917 	down_write(&iopt->iova_rwsem);
918 	__iopt_remove_reserved_iova(iopt, owner);
919 	up_write(&iopt->iova_rwsem);
920 }
921 
922 void iopt_init_table(struct io_pagetable *iopt)
923 {
924 	init_rwsem(&iopt->iova_rwsem);
925 	init_rwsem(&iopt->domains_rwsem);
926 	iopt->area_itree = RB_ROOT_CACHED;
927 	iopt->allowed_itree = RB_ROOT_CACHED;
928 	iopt->reserved_itree = RB_ROOT_CACHED;
929 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
930 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
931 
932 	/*
933 	 * iopts start as SW tables that can use the entire size_t IOVA space
934 	 * due to the use of size_t in the APIs. They have no alignment
935 	 * restriction.
936 	 */
937 	iopt->iova_alignment = 1;
938 }
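
/*
 * Rough sketch of how the entry points in this file could be sequenced by an
 * owning object (illustrative only, not taken from the real callers):
 *
 *	iopt_init_table(iopt);
 *	rc = iopt_table_add_domain(iopt, domain);
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length, prot, flags);
 *	rc = iopt_unmap_all(iopt, NULL);
 *	iopt_table_remove_domain(iopt, domain);
 *	iopt_destroy_table(iopt);
 */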
939 
940 void iopt_destroy_table(struct io_pagetable *iopt)
941 {
942 	struct interval_tree_node *node;
943 
944 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
945 		iopt_remove_reserved_iova(iopt, NULL);
946 
947 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
948 						ULONG_MAX))) {
949 		interval_tree_remove(node, &iopt->allowed_itree);
950 		kfree(container_of(node, struct iopt_allowed, node));
951 	}
952 
953 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
954 	WARN_ON(!xa_empty(&iopt->domains));
955 	WARN_ON(!xa_empty(&iopt->access_list));
956 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
957 }
958 
959 /**
960  * iopt_unfill_domain() - Unfill a domain with PFNs
961  * @iopt: io_pagetable to act on
962  * @domain: domain to unfill
963  *
964  * This is used when removing a domain from the iopt. Every area in the iopt
965  * will be unmapped from the domain. The domain must already be removed from the
966  * domains xarray.
967  */
968 static void iopt_unfill_domain(struct io_pagetable *iopt,
969 			       struct iommu_domain *domain)
970 {
971 	struct iopt_area *area;
972 
973 	lockdep_assert_held(&iopt->iova_rwsem);
974 	lockdep_assert_held_write(&iopt->domains_rwsem);
975 
976 	/*
977 	 * Some other domain still holds all the pfns, so just rapidly unmap this
978 	 * domain.
979 	 */
980 	if (iopt->next_domain_id != 0) {
981 		/* Pick an arbitrary remaining domain to act as storage */
982 		struct iommu_domain *storage_domain =
983 			xa_load(&iopt->domains, 0);
984 
985 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
986 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
987 			struct iopt_pages *pages = area->pages;
988 
989 			if (!pages)
990 				continue;
991 
992 			mutex_lock(&pages->mutex);
993 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
994 				WARN_ON(!area->storage_domain);
995 			if (area->storage_domain == domain)
996 				area->storage_domain = storage_domain;
997 			if (iopt_is_dmabuf(pages)) {
998 				if (!iopt_dmabuf_revoked(pages))
999 					iopt_area_unmap_domain(area, domain);
1000 				iopt_dmabuf_untrack_domain(pages, area, domain);
1001 			}
1002 			mutex_unlock(&pages->mutex);
1003 
1004 			if (!iopt_is_dmabuf(pages))
1005 				iopt_area_unmap_domain(area, domain);
1006 		}
1007 		return;
1008 	}
1009 
1010 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1011 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1012 		struct iopt_pages *pages = area->pages;
1013 
1014 		if (!pages)
1015 			continue;
1016 
1017 		mutex_lock(&pages->mutex);
1018 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1019 		WARN_ON(area->storage_domain != domain);
1020 		area->storage_domain = NULL;
1021 		iopt_area_unfill_domain(area, pages, domain);
1022 		if (iopt_is_dmabuf(pages))
1023 			iopt_dmabuf_untrack_domain(pages, area, domain);
1024 		mutex_unlock(&pages->mutex);
1025 	}
1026 }
1027 
1028 /**
1029  * iopt_fill_domain() - Fill a domain with PFNs
1030  * @iopt: io_pagetable to act on
1031  * @domain: domain to fill
1032  *
1033  * Fill the domain with PFNs from every area in the iopt. On failure the domain
1034  * is left unchanged.
1035  */
1036 static int iopt_fill_domain(struct io_pagetable *iopt,
1037 			    struct iommu_domain *domain)
1038 {
1039 	struct iopt_area *end_area;
1040 	struct iopt_area *area;
1041 	int rc;
1042 
1043 	lockdep_assert_held(&iopt->iova_rwsem);
1044 	lockdep_assert_held_write(&iopt->domains_rwsem);
1045 
1046 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1047 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1048 		struct iopt_pages *pages = area->pages;
1049 
1050 		if (!pages)
1051 			continue;
1052 
1053 		guard(mutex)(&pages->mutex);
1054 		if (iopt_is_dmabuf(pages)) {
1055 			rc = iopt_dmabuf_track_domain(pages, area, domain);
1056 			if (rc)
1057 				goto out_unfill;
1058 		}
1059 		rc = iopt_area_fill_domain(area, domain);
1060 		if (rc) {
1061 			if (iopt_is_dmabuf(pages))
1062 				iopt_dmabuf_untrack_domain(pages, area, domain);
1063 			goto out_unfill;
1064 		}
1065 		if (!area->storage_domain) {
1066 			WARN_ON(iopt->next_domain_id != 0);
1067 			area->storage_domain = domain;
1068 			interval_tree_insert(&area->pages_node,
1069 					     &pages->domains_itree);
1070 		}
1071 	}
1072 	return 0;
1073 
1074 out_unfill:
1075 	end_area = area;
1076 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1077 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1078 		struct iopt_pages *pages = area->pages;
1079 
1080 		if (area == end_area)
1081 			break;
1082 		if (!pages)
1083 			continue;
1084 		mutex_lock(&pages->mutex);
1085 		if (iopt->next_domain_id == 0) {
1086 			interval_tree_remove(&area->pages_node,
1087 					     &pages->domains_itree);
1088 			area->storage_domain = NULL;
1089 		}
1090 		iopt_area_unfill_domain(area, pages, domain);
1091 		if (iopt_is_dmabuf(pages))
1092 			iopt_dmabuf_untrack_domain(pages, area, domain);
1093 		mutex_unlock(&pages->mutex);
1094 	}
1095 	return rc;
1096 }
1097 
1098 /* All existing areas conform to an increased page size */
1099 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1100 				     unsigned long new_iova_alignment)
1101 {
1102 	unsigned long align_mask = new_iova_alignment - 1;
1103 	struct iopt_area *area;
1104 
1105 	lockdep_assert_held(&iopt->iova_rwsem);
1106 	lockdep_assert_held(&iopt->domains_rwsem);
1107 
1108 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1109 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1110 		if ((iopt_area_iova(area) & align_mask) ||
1111 		    (iopt_area_length(area) & align_mask) ||
1112 		    (area->page_offset & align_mask))
1113 			return -EADDRINUSE;
1114 
1115 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1116 		struct iommufd_access *access;
1117 		unsigned long index;
1118 
1119 		xa_for_each(&iopt->access_list, index, access)
1120 			if (WARN_ON(access->iova_alignment >
1121 				    new_iova_alignment))
1122 				return -EADDRINUSE;
1123 	}
1124 	return 0;
1125 }
1126 
1127 int iopt_table_add_domain(struct io_pagetable *iopt,
1128 			  struct iommu_domain *domain)
1129 {
1130 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1131 	struct iommu_domain *iter_domain;
1132 	unsigned int new_iova_alignment;
1133 	unsigned long index;
1134 	int rc;
1135 
1136 	down_write(&iopt->domains_rwsem);
1137 	down_write(&iopt->iova_rwsem);
1138 
1139 	xa_for_each(&iopt->domains, index, iter_domain) {
1140 		if (WARN_ON(iter_domain == domain)) {
1141 			rc = -EEXIST;
1142 			goto out_unlock;
1143 		}
1144 	}
1145 
1146 	/*
1147 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1148 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1149 	 * objects into the iommu_domain.
1150 	 *
1151 	 * An iommu_domain must always be able to accept PAGE_SIZE to be
1152 	 * compatible as we can't guarantee higher contiguity.
1153 	 */
1154 	new_iova_alignment = max_t(unsigned long,
1155 				   1UL << __ffs(domain->pgsize_bitmap),
1156 				   iopt->iova_alignment);
1157 	if (new_iova_alignment > PAGE_SIZE) {
1158 		rc = -EINVAL;
1159 		goto out_unlock;
1160 	}
1161 	if (new_iova_alignment != iopt->iova_alignment) {
1162 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1163 		if (rc)
1164 			goto out_unlock;
1165 	}
1166 
1167 	/* No area exists that is outside the allowed domain aperture */
1168 	if (geometry->aperture_start != 0) {
1169 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1170 				       domain);
1171 		if (rc)
1172 			goto out_reserved;
1173 	}
1174 	if (geometry->aperture_end != ULONG_MAX) {
1175 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1176 				       ULONG_MAX, domain);
1177 		if (rc)
1178 			goto out_reserved;
1179 	}
1180 
1181 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1182 	if (rc)
1183 		goto out_reserved;
1184 
1185 	rc = iopt_fill_domain(iopt, domain);
1186 	if (rc)
1187 		goto out_release;
1188 
1189 	iopt->iova_alignment = new_iova_alignment;
1190 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1191 	iopt->next_domain_id++;
1192 	up_write(&iopt->iova_rwsem);
1193 	up_write(&iopt->domains_rwsem);
1194 	return 0;
1195 out_release:
1196 	xa_release(&iopt->domains, iopt->next_domain_id);
1197 out_reserved:
1198 	__iopt_remove_reserved_iova(iopt, domain);
1199 out_unlock:
1200 	up_write(&iopt->iova_rwsem);
1201 	up_write(&iopt->domains_rwsem);
1202 	return rc;
1203 }
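
/*
 * Note on the aperture handling above: the domain's aperture is enforced by
 * reserving [0, aperture_start - 1] and [aperture_end + 1, ULONG_MAX] with
 * the domain itself as the owner, so iopt_table_remove_domain() can drop the
 * reservations again via __iopt_remove_reserved_iova().
 */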
1204 
1205 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1206 {
1207 	unsigned long new_iova_alignment;
1208 	struct iommufd_access *access;
1209 	struct iommu_domain *domain;
1210 	unsigned long index;
1211 
1212 	lockdep_assert_held_write(&iopt->iova_rwsem);
1213 	lockdep_assert_held(&iopt->domains_rwsem);
1214 
1215 	/* See batch_iommu_map_small() */
1216 	if (iopt->disable_large_pages)
1217 		new_iova_alignment = PAGE_SIZE;
1218 	else
1219 		new_iova_alignment = 1;
1220 
1221 	xa_for_each(&iopt->domains, index, domain)
1222 		new_iova_alignment = max_t(unsigned long,
1223 					   1UL << __ffs(domain->pgsize_bitmap),
1224 					   new_iova_alignment);
1225 	xa_for_each(&iopt->access_list, index, access)
1226 		new_iova_alignment = max_t(unsigned long,
1227 					   access->iova_alignment,
1228 					   new_iova_alignment);
1229 
1230 	if (new_iova_alignment > iopt->iova_alignment) {
1231 		int rc;
1232 
1233 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1234 		if (rc)
1235 			return rc;
1236 	}
1237 	iopt->iova_alignment = new_iova_alignment;
1238 	return 0;
1239 }
1240 
1241 void iopt_table_remove_domain(struct io_pagetable *iopt,
1242 			      struct iommu_domain *domain)
1243 {
1244 	struct iommu_domain *iter_domain = NULL;
1245 	unsigned long index;
1246 
1247 	down_write(&iopt->domains_rwsem);
1248 	down_write(&iopt->iova_rwsem);
1249 
1250 	xa_for_each(&iopt->domains, index, iter_domain)
1251 		if (iter_domain == domain)
1252 			break;
1253 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1254 		goto out_unlock;
1255 
1256 	/*
1257 	 * Compress the xarray to keep it linear by swapping the entry to erase
1258 	 * with the tail entry and shrinking the tail.
1259 	 */
1260 	iopt->next_domain_id--;
1261 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1262 	if (index != iopt->next_domain_id)
1263 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1264 
1265 	iopt_unfill_domain(iopt, domain);
1266 	__iopt_remove_reserved_iova(iopt, domain);
1267 
1268 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1269 out_unlock:
1270 	up_write(&iopt->iova_rwsem);
1271 	up_write(&iopt->domains_rwsem);
1272 }
1273 
1274 /**
1275  * iopt_area_split - Split an area into two parts at iova
1276  * @area: The area to split
1277  * @iova: Becomes the last iova of the lower of the two new areas
1278  *
1279  * This splits an area into two. It is part of the VFIO compatibility to allow
1280  * poking a hole in the mapping. The two areas continue to point at the same
1281  * iopt_pages, just with different starting bytes.
1282  */
1283 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1284 {
1285 	unsigned long alignment = area->iopt->iova_alignment;
1286 	unsigned long last_iova = iopt_area_last_iova(area);
1287 	unsigned long start_iova = iopt_area_iova(area);
1288 	unsigned long new_start = iova + 1;
1289 	struct io_pagetable *iopt = area->iopt;
1290 	struct iopt_pages *pages = area->pages;
1291 	struct iopt_area *lhs;
1292 	struct iopt_area *rhs;
1293 	int rc;
1294 
1295 	lockdep_assert_held_write(&iopt->iova_rwsem);
1296 
1297 	if (iova == start_iova || iova == last_iova)
1298 		return 0;
1299 
1300 	if (!pages || area->prevent_access)
1301 		return -EBUSY;
1302 
1303 	/* Maintaining the domains_itree below is a bit complicated */
1304 	if (iopt_is_dmabuf(pages))
1305 		return -EOPNOTSUPP;
1306 
1307 	if (new_start & (alignment - 1) ||
1308 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1309 		return -EINVAL;
1310 
1311 	lhs = iopt_area_alloc();
1312 	if (!lhs)
1313 		return -ENOMEM;
1314 
1315 	rhs = iopt_area_alloc();
1316 	if (!rhs) {
1317 		rc = -ENOMEM;
1318 		goto err_free_lhs;
1319 	}
1320 
1321 	mutex_lock(&pages->mutex);
1322 	/*
1323 	 * Splitting is not permitted if an access exists, since we don't track
1324 	 * enough information to split existing accesses.
1325 	 */
1326 	if (area->num_accesses) {
1327 		rc = -EINVAL;
1328 		goto err_unlock;
1329 	}
1330 
1331 	/*
1332 	 * Splitting is not permitted if a domain could have been mapped with
1333 	 * huge pages.
1334 	 */
1335 	if (area->storage_domain && !iopt->disable_large_pages) {
1336 		rc = -EINVAL;
1337 		goto err_unlock;
1338 	}
1339 
1340 	interval_tree_remove(&area->node, &iopt->area_itree);
1341 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1342 			      iopt_area_start_byte(area, start_iova),
1343 			      (new_start - 1) - start_iova + 1,
1344 			      area->iommu_prot);
1345 	if (WARN_ON(rc))
1346 		goto err_insert;
1347 
1348 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1349 			      iopt_area_start_byte(area, new_start),
1350 			      last_iova - new_start + 1, area->iommu_prot);
1351 	if (WARN_ON(rc))
1352 		goto err_remove_lhs;
1353 
1354 	/*
1355 	 * If the original area has filled a domain, domains_itree has to be
1356 	 * updated.
1357 	 */
1358 	if (area->storage_domain) {
1359 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1360 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1361 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1362 	}
1363 
1364 	lhs->storage_domain = area->storage_domain;
1365 	lhs->pages = area->pages;
1366 	rhs->storage_domain = area->storage_domain;
1367 	rhs->pages = area->pages;
1368 	kref_get(&rhs->pages->kref);
1369 	kfree(area);
1370 	mutex_unlock(&pages->mutex);
1371 
1372 	/*
1373 	 * No change to domains or accesses because the underlying iopt_pages
1374 	 * has not been changed.
1375 	 */
1376 	return 0;
1377 
1378 err_remove_lhs:
1379 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1380 err_insert:
1381 	interval_tree_insert(&area->node, &iopt->area_itree);
1382 err_unlock:
1383 	mutex_unlock(&pages->mutex);
1384 	kfree(rhs);
1385 err_free_lhs:
1386 	kfree(lhs);
1387 	return rc;
1388 }
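
/*
 * Illustrative example (values invented): splitting an area covering IOVA
 * 0x100000..0x103fff at iova == 0x101fff produces a left area covering
 * 0x100000..0x101fff and a right area covering 0x102000..0x103fff, both
 * still referring to the same iopt_pages.
 */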
1389 
1390 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1391 		  size_t num_iovas)
1392 {
1393 	int rc = 0;
1394 	int i;
1395 
1396 	down_write(&iopt->iova_rwsem);
1397 	for (i = 0; i < num_iovas; i++) {
1398 		struct iopt_area *area;
1399 
1400 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1401 		if (!area)
1402 			continue;
1403 		rc = iopt_area_split(area, iovas[i]);
1404 		if (rc)
1405 			break;
1406 	}
1407 	up_write(&iopt->iova_rwsem);
1408 	return rc;
1409 }
1410 
1411 void iopt_enable_large_pages(struct io_pagetable *iopt)
1412 {
1413 	int rc;
1414 
1415 	down_write(&iopt->domains_rwsem);
1416 	down_write(&iopt->iova_rwsem);
1417 	WRITE_ONCE(iopt->disable_large_pages, false);
1418 	rc = iopt_calculate_iova_alignment(iopt);
1419 	WARN_ON(rc);
1420 	up_write(&iopt->iova_rwsem);
1421 	up_write(&iopt->domains_rwsem);
1422 }
1423 
1424 int iopt_disable_large_pages(struct io_pagetable *iopt)
1425 {
1426 	int rc = 0;
1427 
1428 	down_write(&iopt->domains_rwsem);
1429 	down_write(&iopt->iova_rwsem);
1430 	if (iopt->disable_large_pages)
1431 		goto out_unlock;
1432 
1433 	/* Won't do it if domains already have pages mapped in them */
1434 	if (!xa_empty(&iopt->domains) &&
1435 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1436 		rc = -EINVAL;
1437 		goto out_unlock;
1438 	}
1439 
1440 	WRITE_ONCE(iopt->disable_large_pages, true);
1441 	rc = iopt_calculate_iova_alignment(iopt);
1442 	if (rc)
1443 		WRITE_ONCE(iopt->disable_large_pages, false);
1444 out_unlock:
1445 	up_write(&iopt->iova_rwsem);
1446 	up_write(&iopt->domains_rwsem);
1447 	return rc;
1448 }
1449 
1450 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1451 {
1452 	u32 new_id;
1453 	int rc;
1454 
1455 	down_write(&iopt->domains_rwsem);
1456 	down_write(&iopt->iova_rwsem);
1457 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1458 		      GFP_KERNEL_ACCOUNT);
1459 
1460 	if (rc)
1461 		goto out_unlock;
1462 
1463 	rc = iopt_calculate_iova_alignment(iopt);
1464 	if (rc) {
1465 		xa_erase(&iopt->access_list, new_id);
1466 		goto out_unlock;
1467 	}
1468 	access->iopt_access_list_id = new_id;
1469 
1470 out_unlock:
1471 	up_write(&iopt->iova_rwsem);
1472 	up_write(&iopt->domains_rwsem);
1473 	return rc;
1474 }
1475 
1476 void iopt_remove_access(struct io_pagetable *iopt,
1477 			struct iommufd_access *access, u32 iopt_access_list_id)
1478 {
1479 	down_write(&iopt->domains_rwsem);
1480 	down_write(&iopt->iova_rwsem);
1481 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1482 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1483 	up_write(&iopt->iova_rwsem);
1484 	up_write(&iopt->domains_rwsem);
1485 }
1486 
1487 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1488 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1489 					struct device *dev,
1490 					phys_addr_t *sw_msi_start)
1491 {
1492 	struct iommu_resv_region *resv;
1493 	LIST_HEAD(resv_regions);
1494 	unsigned int num_hw_msi = 0;
1495 	unsigned int num_sw_msi = 0;
1496 	int rc;
1497 
1498 	if (iommufd_should_fail())
1499 		return -EINVAL;
1500 
1501 	down_write(&iopt->iova_rwsem);
1502 	/* FIXME: drivers allocate memory but there is no failure propagated */
1503 	iommu_get_resv_regions(dev, &resv_regions);
1504 
1505 	list_for_each_entry(resv, &resv_regions, list) {
1506 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1507 			continue;
1508 
1509 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1510 			num_hw_msi++;
1511 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1512 			*sw_msi_start = resv->start;
1513 			num_sw_msi++;
1514 		}
1515 
1516 		rc = iopt_reserve_iova(iopt, resv->start,
1517 				       resv->length - 1 + resv->start, dev);
1518 		if (rc)
1519 			goto out_reserved;
1520 	}
1521 
1522 	/* Drivers must offer sane combinations of regions */
1523 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1524 		rc = -EINVAL;
1525 		goto out_reserved;
1526 	}
1527 
1528 	rc = 0;
1529 	goto out_free_resv;
1530 
1531 out_reserved:
1532 	__iopt_remove_reserved_iova(iopt, dev);
1533 out_free_resv:
1534 	iommu_put_resv_regions(dev, &resv_regions);
1535 	up_write(&iopt->iova_rwsem);
1536 	return rc;
1537 }
1538