xref: /linux/drivers/vfio/vfio_iommu_type1.c (revision 3536049822060347c8cb5a923186a8d65a8f7a48)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  *
12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14  * VT-d, but that makes it harder to re-use as theoretically anyone
15  * implementing a similar IOMMU could make use of this.  We expect the
16  * IOMMU to support the IOMMU API and have few to no restrictions around
17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
18  * optimized for relatively static mappings of a userspace process with
19  * userspace pages pinned into memory.  We also assume devices and IOMMU
20  * domains are PCI based as the IOMMU API is still centered around a
21  * device/bus interface rather than a group interface.
22  */
23 
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/highmem.h>
28 #include <linux/iommu.h>
29 #include <linux/module.h>
30 #include <linux/mm.h>
31 #include <linux/kthread.h>
32 #include <linux/rbtree.h>
33 #include <linux/sched/signal.h>
34 #include <linux/sched/mm.h>
35 #include <linux/slab.h>
36 #include <linux/uaccess.h>
37 #include <linux/vfio.h>
38 #include <linux/workqueue.h>
39 #include <linux/notifier.h>
40 #include "vfio.h"
41 
42 #define DRIVER_VERSION  "0.2"
43 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
45 
46 static bool allow_unsafe_interrupts;
47 module_param_named(allow_unsafe_interrupts,
48 		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
49 MODULE_PARM_DESC(allow_unsafe_interrupts,
50 		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");
51 
52 static bool disable_hugepages;
53 module_param_named(disable_hugepages,
54 		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(disable_hugepages,
56 		 "Disable VFIO IOMMU support for IOMMU hugepages.");
57 
58 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
59 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
60 MODULE_PARM_DESC(dma_entry_limit,
61 		 "Maximum number of user DMA mappings per container (default 65535).");
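
/*
 * A minimal usage sketch (not part of the driver): all three parameters are
 * writable by root, so they can be set at load time or changed afterwards
 * through sysfs.  The values below are purely illustrative.
 *
 *   modprobe vfio_iommu_type1 allow_unsafe_interrupts=1 dma_entry_limit=131072
 *   echo 1 > /sys/module/vfio_iommu_type1/parameters/disable_hugepages
 */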
62 
63 struct vfio_iommu {
64 	struct list_head	domain_list;
65 	struct list_head	iova_list;
66 	struct mutex		lock;
67 	struct rb_root		dma_list;
68 	struct list_head	device_list;
69 	struct mutex		device_list_lock;
70 	unsigned int		dma_avail;
71 	unsigned int		vaddr_invalid_count;
72 	uint64_t		pgsize_bitmap;
73 	uint64_t		num_non_pinned_groups;
74 	bool			v2;
75 	bool			dirty_page_tracking;
76 	struct list_head	emulated_iommu_groups;
77 };
78 
79 struct vfio_domain {
80 	struct iommu_domain	*domain;
81 	struct list_head	next;
82 	struct list_head	group_list;
83 	bool			enforce_cache_coherency : 1;
84 };
85 
86 struct vfio_dma {
87 	struct rb_node		node;
88 	dma_addr_t		iova;		/* Device address */
89 	unsigned long		vaddr;		/* Process virtual addr */
90 	size_t			size;		/* Map size (bytes) */
91 	int			prot;		/* IOMMU_READ/WRITE */
92 	bool			iommu_mapped;
93 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
94 	bool			vaddr_invalid;
95 	struct task_struct	*task;
96 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
97 	unsigned long		*bitmap;
98 	struct mm_struct	*mm;
99 	size_t			locked_vm;
100 };
101 
102 struct vfio_batch {
103 	struct page		**pages;	/* for pin_user_pages_remote */
104 	struct page		*fallback_page; /* if pages alloc fails */
105 	unsigned int		capacity;	/* length of pages array */
106 	unsigned int		size;		/* of batch currently */
107 	unsigned int		offset;		/* of next entry in pages */
108 };
109 
110 struct vfio_iommu_group {
111 	struct iommu_group	*iommu_group;
112 	struct list_head	next;
113 	bool			pinned_page_dirty_scope;
114 };
115 
116 struct vfio_iova {
117 	struct list_head	list;
118 	dma_addr_t		start;
119 	dma_addr_t		end;
120 };
121 
122 /*
123  * Guest RAM pinning working set or DMA target
124  */
125 struct vfio_pfn {
126 	struct rb_node		node;
127 	dma_addr_t		iova;		/* Device address */
128 	unsigned long		pfn;		/* Host pfn */
129 	unsigned int		ref_count;
130 };
131 
132 struct vfio_regions {
133 	struct list_head list;
134 	dma_addr_t iova;
135 	phys_addr_t phys;
136 	size_t len;
137 };
138 
139 #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
140 
141 /*
142  * The number-of-bits argument to bitmap_set() is an unsigned integer, which
143  * is further cast to a signed integer by the unaligned multi-bit helper,
144  * __bitmap_set().
145  * The maximum supported bitmap size is therefore 2^31 bits, which at 2^3
146  * bits/byte is 2^28 bytes (256 MB) and maps 2^31 * 2^12 = 2^43 (8 TB) of
147  * memory on a 4K page system.
148  */
149 #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
150 #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
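
/*
 * Worked example of the sizing above, assuming 4K pages: a 1 GiB vfio_dma
 * covers 2^18 pages, so DIRTY_BITMAP_BYTES(2^18) = 2^18 / 8 = 32 KiB of
 * bitmap (plus the extra u64 of slack allocated by vfio_dma_bitmap_alloc()).
 * At the DIRTY_BITMAP_PAGES_MAX limit of roughly 2^31 pages the bitmap is
 * DIRTY_BITMAP_SIZE_MAX, i.e. 256 MB, matching the comment above.
 */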
151 
152 static int put_pfn(unsigned long pfn, int prot);
153 
154 static struct vfio_iommu_group*
155 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
156 			    struct iommu_group *iommu_group);
157 
158 /*
159  * This code handles mapping and unmapping of user data buffers
160  * into DMA'ble space using the IOMMU
161  */
162 
163 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
164 				      dma_addr_t start, size_t size)
165 {
166 	struct rb_node *node = iommu->dma_list.rb_node;
167 
168 	while (node) {
169 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
170 
171 		if (start + size <= dma->iova)
172 			node = node->rb_left;
173 		else if (start >= dma->iova + dma->size)
174 			node = node->rb_right;
175 		else
176 			return dma;
177 	}
178 
179 	return NULL;
180 }
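
/*
 * Example of the overlap test above: with a tracked vfio_dma covering
 * [0x100000, 0x500000), a lookup of (start = 0x4ff000, size = 0x2000)
 * intersects the range and returns that entry, while a lookup starting at
 * 0x500000 falls entirely to the right and descends toward rb_right.
 */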
181 
182 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
183 						dma_addr_t start, u64 size)
184 {
185 	struct rb_node *res = NULL;
186 	struct rb_node *node = iommu->dma_list.rb_node;
187 	struct vfio_dma *dma_res = NULL;
188 
189 	while (node) {
190 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
191 
192 		if (start < dma->iova + dma->size) {
193 			res = node;
194 			dma_res = dma;
195 			if (start >= dma->iova)
196 				break;
197 			node = node->rb_left;
198 		} else {
199 			node = node->rb_right;
200 		}
201 	}
202 	if (res && size && dma_res->iova >= start + size)
203 		res = NULL;
204 	return res;
205 }
206 
207 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
208 {
209 	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
210 	struct vfio_dma *dma;
211 
212 	while (*link) {
213 		parent = *link;
214 		dma = rb_entry(parent, struct vfio_dma, node);
215 
216 		if (new->iova + new->size <= dma->iova)
217 			link = &(*link)->rb_left;
218 		else
219 			link = &(*link)->rb_right;
220 	}
221 
222 	rb_link_node(&new->node, parent, link);
223 	rb_insert_color(&new->node, &iommu->dma_list);
224 }
225 
226 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
227 {
228 	rb_erase(&old->node, &iommu->dma_list);
229 }
230 
231 
232 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
233 {
234 	uint64_t npages = dma->size / pgsize;
235 
236 	if (npages > DIRTY_BITMAP_PAGES_MAX)
237 		return -EINVAL;
238 
239 	/*
240 	 * Allocate extra 64 bits that are used to calculate shift required for
241 	 * bitmap_shift_left() to manipulate and club unaligned number of pages
242 	 * in adjacent vfio_dma ranges.
243 	 */
244 	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
245 			       GFP_KERNEL);
246 	if (!dma->bitmap)
247 		return -ENOMEM;
248 
249 	return 0;
250 }
251 
252 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
253 {
254 	kvfree(dma->bitmap);
255 	dma->bitmap = NULL;
256 }
257 
258 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
259 {
260 	struct rb_node *p;
261 	unsigned long pgshift = __ffs(pgsize);
262 
263 	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
264 		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
265 
266 		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
267 	}
268 }
269 
270 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
271 {
272 	struct rb_node *n;
273 	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
274 
275 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
276 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
277 
278 		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
279 	}
280 }
281 
282 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
283 {
284 	struct rb_node *n;
285 
286 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
287 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
288 		int ret;
289 
290 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
291 		if (ret) {
292 			struct rb_node *p;
293 
294 			for (p = rb_prev(n); p; p = rb_prev(p)) {
295 				struct vfio_dma *dma = rb_entry(p,
296 							struct vfio_dma, node);
297 
298 				vfio_dma_bitmap_free(dma);
299 			}
300 			return ret;
301 		}
302 		vfio_dma_populate_bitmap(dma, pgsize);
303 	}
304 	return 0;
305 }
306 
307 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
308 {
309 	struct rb_node *n;
310 
311 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
312 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
313 
314 		vfio_dma_bitmap_free(dma);
315 	}
316 }
317 
318 /*
319  * Helper Functions for host iova-pfn list
320  */
321 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
322 {
323 	struct vfio_pfn *vpfn;
324 	struct rb_node *node = dma->pfn_list.rb_node;
325 
326 	while (node) {
327 		vpfn = rb_entry(node, struct vfio_pfn, node);
328 
329 		if (iova < vpfn->iova)
330 			node = node->rb_left;
331 		else if (iova > vpfn->iova)
332 			node = node->rb_right;
333 		else
334 			return vpfn;
335 	}
336 	return NULL;
337 }
338 
339 static void vfio_link_pfn(struct vfio_dma *dma,
340 			  struct vfio_pfn *new)
341 {
342 	struct rb_node **link, *parent = NULL;
343 	struct vfio_pfn *vpfn;
344 
345 	link = &dma->pfn_list.rb_node;
346 	while (*link) {
347 		parent = *link;
348 		vpfn = rb_entry(parent, struct vfio_pfn, node);
349 
350 		if (new->iova < vpfn->iova)
351 			link = &(*link)->rb_left;
352 		else
353 			link = &(*link)->rb_right;
354 	}
355 
356 	rb_link_node(&new->node, parent, link);
357 	rb_insert_color(&new->node, &dma->pfn_list);
358 }
359 
360 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
361 {
362 	rb_erase(&old->node, &dma->pfn_list);
363 }
364 
365 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
366 				unsigned long pfn)
367 {
368 	struct vfio_pfn *vpfn;
369 
370 	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
371 	if (!vpfn)
372 		return -ENOMEM;
373 
374 	vpfn->iova = iova;
375 	vpfn->pfn = pfn;
376 	vpfn->ref_count = 1;
377 	vfio_link_pfn(dma, vpfn);
378 	return 0;
379 }
380 
381 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
382 				      struct vfio_pfn *vpfn)
383 {
384 	vfio_unlink_pfn(dma, vpfn);
385 	kfree(vpfn);
386 }
387 
388 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
389 					       unsigned long iova)
390 {
391 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
392 
393 	if (vpfn)
394 		vpfn->ref_count++;
395 	return vpfn;
396 }
397 
398 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
399 {
400 	int ret = 0;
401 
402 	vpfn->ref_count--;
403 	if (!vpfn->ref_count) {
404 		ret = put_pfn(vpfn->pfn, dma->prot);
405 		vfio_remove_from_pfn_list(dma, vpfn);
406 	}
407 	return ret;
408 }
409 
410 static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
411 			bool lock_cap, long npage)
412 {
413 	int ret = mmap_write_lock_killable(mm);
414 
415 	if (ret)
416 		return ret;
417 
418 	ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
419 	mmap_write_unlock(mm);
420 	return ret;
421 }
422 
423 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
424 {
425 	struct mm_struct *mm;
426 	int ret;
427 
428 	if (!npage)
429 		return 0;
430 
431 	mm = dma->mm;
432 	if (async && !mmget_not_zero(mm))
433 		return -ESRCH; /* process exited */
434 
435 	ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
436 	if (!ret)
437 		dma->locked_vm += npage;
438 
439 	if (async)
440 		mmput(mm);
441 
442 	return ret;
443 }
444 
445 /*
446  * Some mappings aren't backed by a struct page, for example an mmap'd
447  * MMIO range for our own or another device.  These use a different
448  * pfn conversion and shouldn't be tracked as locked pages.
449  * For compound pages, any driver that sets the reserved bit in the head
450  * page needs to set the reserved bit in all subpages to be safe.
451  */
452 static bool is_invalid_reserved_pfn(unsigned long pfn)
453 {
454 	if (pfn_valid(pfn))
455 		return PageReserved(pfn_to_page(pfn));
456 
457 	return true;
458 }
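
/*
 * For instance, a userspace mmap of another device's PCI BAR is typically a
 * VM_PFNMAP VMA whose pfns have no struct page, so pfn_valid() fails,
 * is_invalid_reserved_pfn() returns true, and put_pfn() below treats such
 * pfns as having nothing to release or account.
 */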
459 
460 static int put_pfn(unsigned long pfn, int prot)
461 {
462 	if (!is_invalid_reserved_pfn(pfn)) {
463 		struct page *page = pfn_to_page(pfn);
464 
465 		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
466 		return 1;
467 	}
468 	return 0;
469 }
470 
471 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
472 
473 static void __vfio_batch_init(struct vfio_batch *batch, bool single)
474 {
475 	batch->size = 0;
476 	batch->offset = 0;
477 
478 	if (single || unlikely(disable_hugepages))
479 		goto fallback;
480 
481 	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
482 	if (!batch->pages)
483 		goto fallback;
484 
485 	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
486 	return;
487 
488 fallback:
489 	batch->pages = &batch->fallback_page;
490 	batch->capacity = 1;
491 }
492 
493 static void vfio_batch_init(struct vfio_batch *batch)
494 {
495 	__vfio_batch_init(batch, false);
496 }
497 
498 static void vfio_batch_init_single(struct vfio_batch *batch)
499 {
500 	__vfio_batch_init(batch, true);
501 }
502 
503 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
504 {
505 	while (batch->size) {
506 		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
507 
508 		put_pfn(pfn, dma->prot);
509 		batch->offset++;
510 		batch->size--;
511 	}
512 }
513 
514 static void vfio_batch_fini(struct vfio_batch *batch)
515 {
516 	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
517 		free_page((unsigned long)batch->pages);
518 }
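
/*
 * Sizing note: VFIO_BATCH_MAX_CAPACITY is PAGE_SIZE / sizeof(struct page *),
 * i.e. 4096 / 8 = 512 entries on a typical 64-bit, 4K-page configuration, so
 * a full batch covers up to 2 MB of user memory per pin_user_pages_remote()
 * call.  If the page-array allocation fails, __vfio_batch_init() falls back
 * to the single fallback_page slot.
 */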
519 
520 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
521 			    unsigned long vaddr, unsigned long *pfn,
522 			    unsigned long *addr_mask, bool write_fault)
523 {
524 	struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
525 	int ret;
526 
527 	ret = follow_pfnmap_start(&args);
528 	if (ret) {
529 		bool unlocked = false;
530 
531 		ret = fixup_user_fault(mm, vaddr,
532 				       FAULT_FLAG_REMOTE |
533 				       (write_fault ?  FAULT_FLAG_WRITE : 0),
534 				       &unlocked);
535 		if (unlocked)
536 			return -EAGAIN;
537 
538 		if (ret)
539 			return ret;
540 
541 		ret = follow_pfnmap_start(&args);
542 		if (ret)
543 			return ret;
544 	}
545 
546 	if (write_fault && !args.writable) {
547 		ret = -EFAULT;
548 	} else {
549 		*pfn = args.pfn;
550 		*addr_mask = args.addr_mask;
551 	}
552 
553 	follow_pfnmap_end(&args);
554 	return ret;
555 }
556 
557 /*
558  * Returns the positive number of pfns successfully obtained or a negative
559  * error code.  The initial pfn is stored in the pfn arg.  For page-backed
560  * pfns, the provided batch is also updated to indicate the filled pages and
561  * initial offset.  For VM_PFNMAP pfns, only the returned number of pfns and
562  * returned initial pfn are provided; subsequent pfns are contiguous.
563  */
564 static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
565 			   unsigned long npages, int prot, unsigned long *pfn,
566 			   struct vfio_batch *batch)
567 {
568 	unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity);
569 	struct vm_area_struct *vma;
570 	unsigned int flags = 0;
571 	long ret;
572 
573 	if (prot & IOMMU_WRITE)
574 		flags |= FOLL_WRITE;
575 
576 	mmap_read_lock(mm);
577 	ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM,
578 				    batch->pages, NULL);
579 	if (ret > 0) {
580 		*pfn = page_to_pfn(batch->pages[0]);
581 		batch->size = ret;
582 		batch->offset = 0;
583 		goto done;
584 	} else if (!ret) {
585 		ret = -EFAULT;
586 	}
587 
588 	vaddr = untagged_addr_remote(mm, vaddr);
589 
590 retry:
591 	vma = vma_lookup(mm, vaddr);
592 
593 	if (vma && vma->vm_flags & VM_PFNMAP) {
594 		unsigned long addr_mask;
595 
596 		ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask,
597 				       prot & IOMMU_WRITE);
598 		if (ret == -EAGAIN)
599 			goto retry;
600 
601 		if (!ret) {
602 			if (is_invalid_reserved_pfn(*pfn)) {
603 				unsigned long epfn;
604 
605 				epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1;
606 				ret = min_t(long, npages, epfn - *pfn);
607 			} else {
608 				ret = -EFAULT;
609 			}
610 		}
611 	}
612 done:
613 	mmap_read_unlock(mm);
614 	return ret;
615 }
616 
617 /*
618  * Attempt to pin pages.  We really don't want to track all the pfns and
619  * the iommu can only map chunks of consecutive pfns anyway, so get the
620  * first page and all consecutive pages with the same locking.
621  */
622 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
623 				  unsigned long npage, unsigned long *pfn_base,
624 				  unsigned long limit, struct vfio_batch *batch)
625 {
626 	unsigned long pfn;
627 	struct mm_struct *mm = current->mm;
628 	long ret, pinned = 0, lock_acct = 0;
629 	bool rsvd;
630 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
631 
632 	/* This code path is only user initiated */
633 	if (!mm)
634 		return -ENODEV;
635 
636 	if (batch->size) {
637 		/* Leftover pages in batch from an earlier call. */
638 		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
639 		pfn = *pfn_base;
640 		rsvd = is_invalid_reserved_pfn(*pfn_base);
641 	} else {
642 		*pfn_base = 0;
643 	}
644 
645 	if (unlikely(disable_hugepages))
646 		npage = 1;
647 
648 	while (npage) {
649 		if (!batch->size) {
650 			/* Empty batch, so refill it. */
651 			ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot,
652 					     &pfn, batch);
653 			if (ret < 0)
654 				goto unpin_out;
655 
656 			if (!*pfn_base) {
657 				*pfn_base = pfn;
658 				rsvd = is_invalid_reserved_pfn(*pfn_base);
659 			}
660 
661 			/* Handle pfnmap */
662 			if (!batch->size) {
663 				if (pfn != *pfn_base + pinned || !rsvd)
664 					goto out;
665 
666 				pinned += ret;
667 				npage -= ret;
668 				vaddr += (PAGE_SIZE * ret);
669 				iova += (PAGE_SIZE * ret);
670 				continue;
671 			}
672 		}
673 
674 		/*
675 		 * pfn is preset for the first iteration of this inner loop
676 		 * due to the fact that vaddr_get_pfns() needs to provide the
677 		 * initial pfn for pfnmaps.  Therefore to reduce redundancy,
678 		 * the next pfn is fetched at the end of the loop.
679 		 * A PageReserved() page could still qualify as page backed
680 		 * and rsvd here, and therefore continues to use the batch.
681 		 */
682 		while (true) {
683 			if (pfn != *pfn_base + pinned ||
684 			    rsvd != is_invalid_reserved_pfn(pfn))
685 				goto out;
686 
687 			/*
688 			 * Reserved pages aren't counted against the user,
689 			 * externally pinned pages are already counted against
690 			 * the user.
691 			 */
692 			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
693 				if (!dma->lock_cap &&
694 				    mm->locked_vm + lock_acct + 1 > limit) {
695 					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
696 						__func__, limit << PAGE_SHIFT);
697 					ret = -ENOMEM;
698 					goto unpin_out;
699 				}
700 				lock_acct++;
701 			}
702 
703 			pinned++;
704 			npage--;
705 			vaddr += PAGE_SIZE;
706 			iova += PAGE_SIZE;
707 			batch->offset++;
708 			batch->size--;
709 
710 			if (!batch->size)
711 				break;
712 
713 			pfn = page_to_pfn(batch->pages[batch->offset]);
714 		}
715 	}
716 
717 out:
718 	ret = vfio_lock_acct(dma, lock_acct, false);
719 
720 unpin_out:
721 	if (ret < 0) {
722 		if (pinned && !rsvd) {
723 			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
724 				put_pfn(pfn, dma->prot);
725 		}
726 		vfio_batch_unpin(batch, dma);
727 
728 		return ret;
729 	}
730 
731 	return pinned;
732 }
733 
734 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
735 				    unsigned long pfn, unsigned long npage,
736 				    bool do_accounting)
737 {
738 	long unlocked = 0, locked = 0;
739 	long i;
740 
741 	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
742 		if (put_pfn(pfn++, dma->prot)) {
743 			unlocked++;
744 			if (vfio_find_vpfn(dma, iova))
745 				locked++;
746 		}
747 	}
748 
749 	if (do_accounting)
750 		vfio_lock_acct(dma, locked - unlocked, true);
751 
752 	return unlocked;
753 }
754 
755 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
756 				  unsigned long *pfn_base, bool do_accounting)
757 {
758 	struct vfio_batch batch;
759 	struct mm_struct *mm;
760 	int ret;
761 
762 	mm = dma->mm;
763 	if (!mmget_not_zero(mm))
764 		return -ENODEV;
765 
766 	vfio_batch_init_single(&batch);
767 
768 	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch);
769 	if (ret != 1)
770 		goto out;
771 
772 	ret = 0;
773 
774 	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
775 		ret = vfio_lock_acct(dma, 1, false);
776 		if (ret) {
777 			put_pfn(*pfn_base, dma->prot);
778 			if (ret == -ENOMEM)
779 				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
780 					"(%ld) exceeded\n", __func__,
781 					dma->task->comm, task_pid_nr(dma->task),
782 					task_rlimit(dma->task, RLIMIT_MEMLOCK));
783 		}
784 	}
785 
786 out:
787 	vfio_batch_fini(&batch);
788 	mmput(mm);
789 	return ret;
790 }
791 
792 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
793 				    bool do_accounting)
794 {
795 	int unlocked;
796 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
797 
798 	if (!vpfn)
799 		return 0;
800 
801 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
802 
803 	if (do_accounting)
804 		vfio_lock_acct(dma, -unlocked, true);
805 
806 	return unlocked;
807 }
808 
809 static int vfio_iommu_type1_pin_pages(void *iommu_data,
810 				      struct iommu_group *iommu_group,
811 				      dma_addr_t user_iova,
812 				      int npage, int prot,
813 				      struct page **pages)
814 {
815 	struct vfio_iommu *iommu = iommu_data;
816 	struct vfio_iommu_group *group;
817 	int i, j, ret;
818 	unsigned long remote_vaddr;
819 	struct vfio_dma *dma;
820 	bool do_accounting;
821 
822 	if (!iommu || !pages)
823 		return -EINVAL;
824 
825 	/* Supported for v2 version only */
826 	if (!iommu->v2)
827 		return -EACCES;
828 
829 	mutex_lock(&iommu->lock);
830 
831 	if (WARN_ONCE(iommu->vaddr_invalid_count,
832 		      "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
833 		ret = -EBUSY;
834 		goto pin_done;
835 	}
836 
837 	/* Fail if no dma_unmap notifier is registered */
838 	if (list_empty(&iommu->device_list)) {
839 		ret = -EINVAL;
840 		goto pin_done;
841 	}
842 
843 	/*
844 	 * If an IOMMU-capable domain exists in the container then all pages are
845 	 * already pinned and accounted for.  Accounting should only be done if
846 	 * there is no IOMMU-capable domain in the container.
847 	 */
848 	do_accounting = list_empty(&iommu->domain_list);
849 
850 	for (i = 0; i < npage; i++) {
851 		unsigned long phys_pfn;
852 		dma_addr_t iova;
853 		struct vfio_pfn *vpfn;
854 
855 		iova = user_iova + PAGE_SIZE * i;
856 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
857 		if (!dma) {
858 			ret = -EINVAL;
859 			goto pin_unwind;
860 		}
861 
862 		if ((dma->prot & prot) != prot) {
863 			ret = -EPERM;
864 			goto pin_unwind;
865 		}
866 
867 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
868 		if (vpfn) {
869 			pages[i] = pfn_to_page(vpfn->pfn);
870 			continue;
871 		}
872 
873 		remote_vaddr = dma->vaddr + (iova - dma->iova);
874 		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
875 					     do_accounting);
876 		if (ret)
877 			goto pin_unwind;
878 
879 		if (!pfn_valid(phys_pfn)) {
880 			ret = -EINVAL;
881 			goto pin_unwind;
882 		}
883 
884 		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
885 		if (ret) {
886 			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
887 				vfio_lock_acct(dma, -1, true);
888 			goto pin_unwind;
889 		}
890 
891 		pages[i] = pfn_to_page(phys_pfn);
892 
893 		if (iommu->dirty_page_tracking) {
894 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
895 
896 			/*
897 			 * Bitmap populated with the smallest supported page
898 			 * size
899 			 */
900 			bitmap_set(dma->bitmap,
901 				   (iova - dma->iova) >> pgshift, 1);
902 		}
903 	}
904 	ret = i;
905 
906 	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
907 	if (!group->pinned_page_dirty_scope) {
908 		group->pinned_page_dirty_scope = true;
909 		iommu->num_non_pinned_groups--;
910 	}
911 
912 	goto pin_done;
913 
914 pin_unwind:
915 	pages[i] = NULL;
916 	for (j = 0; j < i; j++) {
917 		dma_addr_t iova;
918 
919 		iova = user_iova + PAGE_SIZE * j;
920 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
921 		vfio_unpin_page_external(dma, iova, do_accounting);
922 		pages[j] = NULL;
923 	}
924 pin_done:
925 	mutex_unlock(&iommu->lock);
926 	return ret;
927 }
928 
929 static void vfio_iommu_type1_unpin_pages(void *iommu_data,
930 					 dma_addr_t user_iova, int npage)
931 {
932 	struct vfio_iommu *iommu = iommu_data;
933 	bool do_accounting;
934 	int i;
935 
936 	/* Supported for v2 version only */
937 	if (WARN_ON(!iommu->v2))
938 		return;
939 
940 	mutex_lock(&iommu->lock);
941 
942 	do_accounting = list_empty(&iommu->domain_list);
943 	for (i = 0; i < npage; i++) {
944 		dma_addr_t iova = user_iova + PAGE_SIZE * i;
945 		struct vfio_dma *dma;
946 
947 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
948 		if (!dma)
949 			break;
950 
951 		vfio_unpin_page_external(dma, iova, do_accounting);
952 	}
953 
954 	mutex_unlock(&iommu->lock);
955 
956 	WARN_ON(i != npage);
957 }
958 
959 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
960 			    struct list_head *regions,
961 			    struct iommu_iotlb_gather *iotlb_gather)
962 {
963 	long unlocked = 0;
964 	struct vfio_regions *entry, *next;
965 
966 	iommu_iotlb_sync(domain->domain, iotlb_gather);
967 
968 	list_for_each_entry_safe(entry, next, regions, list) {
969 		unlocked += vfio_unpin_pages_remote(dma,
970 						    entry->iova,
971 						    entry->phys >> PAGE_SHIFT,
972 						    entry->len >> PAGE_SHIFT,
973 						    false);
974 		list_del(&entry->list);
975 		kfree(entry);
976 	}
977 
978 	cond_resched();
979 
980 	return unlocked;
981 }
982 
983 /*
984  * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
985  * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
986  * track of these regions (currently using a list).
987  *
988  * This value specifies maximum number of regions for each IOTLB flush sync.
989  */
990 #define VFIO_IOMMU_TLB_SYNC_MAX		512
991 
992 static size_t unmap_unpin_fast(struct vfio_domain *domain,
993 			       struct vfio_dma *dma, dma_addr_t *iova,
994 			       size_t len, phys_addr_t phys, long *unlocked,
995 			       struct list_head *unmapped_list,
996 			       int *unmapped_cnt,
997 			       struct iommu_iotlb_gather *iotlb_gather)
998 {
999 	size_t unmapped = 0;
1000 	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1001 
1002 	if (entry) {
1003 		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
1004 					    iotlb_gather);
1005 
1006 		if (!unmapped) {
1007 			kfree(entry);
1008 		} else {
1009 			entry->iova = *iova;
1010 			entry->phys = phys;
1011 			entry->len  = unmapped;
1012 			list_add_tail(&entry->list, unmapped_list);
1013 
1014 			*iova += unmapped;
1015 			(*unmapped_cnt)++;
1016 		}
1017 	}
1018 
1019 	/*
1020 	 * Sync if the number of fast-unmap regions hits the limit
1021 	 * or in case of errors.
1022 	 */
1023 	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
1024 		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
1025 					     iotlb_gather);
1026 		*unmapped_cnt = 0;
1027 	}
1028 
1029 	return unmapped;
1030 }
1031 
1032 static size_t unmap_unpin_slow(struct vfio_domain *domain,
1033 			       struct vfio_dma *dma, dma_addr_t *iova,
1034 			       size_t len, phys_addr_t phys,
1035 			       long *unlocked)
1036 {
1037 	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1038 
1039 	if (unmapped) {
1040 		*unlocked += vfio_unpin_pages_remote(dma, *iova,
1041 						     phys >> PAGE_SHIFT,
1042 						     unmapped >> PAGE_SHIFT,
1043 						     false);
1044 		*iova += unmapped;
1045 		cond_resched();
1046 	}
1047 	return unmapped;
1048 }
1049 
1050 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
1051 			     bool do_accounting)
1052 {
1053 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
1054 	struct vfio_domain *domain, *d;
1055 	LIST_HEAD(unmapped_region_list);
1056 	struct iommu_iotlb_gather iotlb_gather;
1057 	int unmapped_region_cnt = 0;
1058 	long unlocked = 0;
1059 
1060 	if (!dma->size)
1061 		return 0;
1062 
1063 	if (list_empty(&iommu->domain_list))
1064 		return 0;
1065 
1066 	/*
1067 	 * We use the IOMMU to track the physical addresses, otherwise we'd
1068 	 * need a much more complicated tracking system.  Unfortunately that
1069 	 * means we need to use one of the iommu domains to figure out the
1070 	 * pfns to unpin.  The rest need to be unmapped in advance so we have
1071 	 * no iommu translations remaining when the pages are unpinned.
1072 	 */
1073 	domain = d = list_first_entry(&iommu->domain_list,
1074 				      struct vfio_domain, next);
1075 
1076 	list_for_each_entry_continue(d, &iommu->domain_list, next) {
1077 		iommu_unmap(d->domain, dma->iova, dma->size);
1078 		cond_resched();
1079 	}
1080 
1081 	iommu_iotlb_gather_init(&iotlb_gather);
1082 	while (iova < end) {
1083 		size_t unmapped, len;
1084 		phys_addr_t phys, next;
1085 
1086 		phys = iommu_iova_to_phys(domain->domain, iova);
1087 		if (WARN_ON(!phys)) {
1088 			iova += PAGE_SIZE;
1089 			continue;
1090 		}
1091 
1092 		/*
1093 		 * To optimize for fewer iommu_unmap() calls, each of which
1094 		 * may require hardware cache flushing, try to find the
1095 		 * largest contiguous physical memory chunk to unmap.
1096 		 */
1097 		for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
1098 			next = iommu_iova_to_phys(domain->domain, iova + len);
1099 			if (next != phys + len)
1100 				break;
1101 		}
1102 
1103 		/*
1104 		 * First, try to use fast unmap/unpin. In case of failure,
1105 		 * switch to slow unmap/unpin path.
1106 		 */
1107 		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
1108 					    &unlocked, &unmapped_region_list,
1109 					    &unmapped_region_cnt,
1110 					    &iotlb_gather);
1111 		if (!unmapped) {
1112 			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
1113 						    phys, &unlocked);
1114 			if (WARN_ON(!unmapped))
1115 				break;
1116 		}
1117 	}
1118 
1119 	dma->iommu_mapped = false;
1120 
1121 	if (unmapped_region_cnt) {
1122 		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
1123 					    &iotlb_gather);
1124 	}
1125 
1126 	if (do_accounting) {
1127 		vfio_lock_acct(dma, -unlocked, true);
1128 		return 0;
1129 	}
1130 	return unlocked;
1131 }
1132 
1133 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
1134 {
1135 	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1136 	vfio_unmap_unpin(iommu, dma, true);
1137 	vfio_unlink_dma(iommu, dma);
1138 	put_task_struct(dma->task);
1139 	mmdrop(dma->mm);
1140 	vfio_dma_bitmap_free(dma);
1141 	if (dma->vaddr_invalid)
1142 		iommu->vaddr_invalid_count--;
1143 	kfree(dma);
1144 	iommu->dma_avail++;
1145 }
1146 
1147 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1148 {
1149 	struct vfio_domain *domain;
1150 
1151 	iommu->pgsize_bitmap = ULONG_MAX;
1152 
1153 	list_for_each_entry(domain, &iommu->domain_list, next)
1154 		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1155 
1156 	/*
1157 	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1158 	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1159 	 * That way the user will be able to map/unmap buffers whose size/
1160 	 * start address is aligned with PAGE_SIZE. Pinning code uses that
1161  * granularity while the IOMMU driver can use the sub-PAGE_SIZE size
1162 	 * to map the buffer.
1163 	 */
1164 	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1165 		iommu->pgsize_bitmap &= PAGE_MASK;
1166 		iommu->pgsize_bitmap |= PAGE_SIZE;
1167 	}
1168 }
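
/*
 * Worked example of the masking above, with 4K kernel pages: a domain
 * advertising pgsize_bitmap = 4K | 2M | 1G is left untouched, whereas a
 * hypothetical domain that also advertises sub-4K sizes (e.g. 1K) has those
 * low bits cleared by &= PAGE_MASK and PAGE_SIZE or'd back in, so userspace
 * only ever sees PAGE_SIZE granularity or larger.
 */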
1169 
1170 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1171 			      struct vfio_dma *dma, dma_addr_t base_iova,
1172 			      size_t pgsize)
1173 {
1174 	unsigned long pgshift = __ffs(pgsize);
1175 	unsigned long nbits = dma->size >> pgshift;
1176 	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1177 	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1178 	unsigned long shift = bit_offset % BITS_PER_LONG;
1179 	unsigned long leftover;
1180 
1181 	/*
1182 	 * mark all pages dirty if any IOMMU capable device is not able
1183 	 * to report dirty pages and all pages are pinned and mapped.
1184 	 */
1185 	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1186 		bitmap_set(dma->bitmap, 0, nbits);
1187 
1188 	if (shift) {
1189 		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1190 				  nbits + shift);
1191 
1192 		if (copy_from_user(&leftover,
1193 				   (void __user *)(bitmap + copy_offset),
1194 				   sizeof(leftover)))
1195 			return -EFAULT;
1196 
1197 		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1198 	}
1199 
1200 	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1201 			 DIRTY_BITMAP_BYTES(nbits + shift)))
1202 		return -EFAULT;
1203 
1204 	return 0;
1205 }
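
/*
 * Worked example of the shift handling above, with 4K pages: for a vfio_dma
 * starting at base_iova + 0x6000, bit_offset is 6, so copy_offset is 0 and
 * shift is 6.  The dma bitmap is shifted up by 6 bits, the user's existing
 * first u64 is read back as "leftover" and or'd into those low 6 bits, and
 * the merged words are copied out, so neighbouring mappings that share a u64
 * in the user bitmap are not clobbered.
 */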
1206 
1207 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1208 				  dma_addr_t iova, size_t size, size_t pgsize)
1209 {
1210 	struct vfio_dma *dma;
1211 	struct rb_node *n;
1212 	unsigned long pgshift = __ffs(pgsize);
1213 	int ret;
1214 
1215 	/*
1216 	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1217 	 * vfio_dma mappings may be clubbed by specifying large ranges, but
1218 	 * there must not be any previous mappings bisected by the range.
1219 	 * An error will be returned if these conditions are not met.
1220 	 */
1221 	dma = vfio_find_dma(iommu, iova, 1);
1222 	if (dma && dma->iova != iova)
1223 		return -EINVAL;
1224 
1225 	dma = vfio_find_dma(iommu, iova + size - 1, 0);
1226 	if (dma && dma->iova + dma->size != iova + size)
1227 		return -EINVAL;
1228 
1229 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1230 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1231 
1232 		if (dma->iova < iova)
1233 			continue;
1234 
1235 		if (dma->iova > iova + size - 1)
1236 			break;
1237 
1238 		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1239 		if (ret)
1240 			return ret;
1241 
1242 		/*
1243 		 * Re-populate bitmap to include all pinned pages which are
1244 		 * considered as dirty but exclude pages which are unpinned and
1245 		 * pages which are marked dirty by vfio_dma_rw()
1246 		 */
1247 		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1248 		vfio_dma_populate_bitmap(dma, pgsize);
1249 	}
1250 	return 0;
1251 }
1252 
1253 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1254 {
1255 	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1256 	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1257 		return -EINVAL;
1258 
1259 	return 0;
1260 }
1261 
1262 /*
1263  * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
1264  * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
1265  * pages in response to an invalidation.
1266  */
1267 static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
1268 				  struct vfio_dma *dma)
1269 {
1270 	struct vfio_device *device;
1271 
1272 	if (list_empty(&iommu->device_list))
1273 		return;
1274 
1275 	/*
1276 	 * The device is expected to call vfio_unpin_pages() for any IOVA it has
1277 	 * pinned within the range. Since vfio_unpin_pages() will eventually
1278 	 * call back down to this code and try to obtain the iommu->lock we must
1279 	 * drop it.
1280 	 */
1281 	mutex_lock(&iommu->device_list_lock);
1282 	mutex_unlock(&iommu->lock);
1283 
1284 	list_for_each_entry(device, &iommu->device_list, iommu_entry)
1285 		device->ops->dma_unmap(device, dma->iova, dma->size);
1286 
1287 	mutex_unlock(&iommu->device_list_lock);
1288 	mutex_lock(&iommu->lock);
1289 }
1290 
1291 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1292 			     struct vfio_iommu_type1_dma_unmap *unmap,
1293 			     struct vfio_bitmap *bitmap)
1294 {
1295 	struct vfio_dma *dma, *dma_last = NULL;
1296 	size_t unmapped = 0, pgsize;
1297 	int ret = -EINVAL, retries = 0;
1298 	unsigned long pgshift;
1299 	dma_addr_t iova = unmap->iova;
1300 	u64 size = unmap->size;
1301 	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
1302 	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
1303 	struct rb_node *n, *first_n;
1304 
1305 	mutex_lock(&iommu->lock);
1306 
1307 	/* Cannot update vaddr if mdev is present. */
1308 	if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
1309 		ret = -EBUSY;
1310 		goto unlock;
1311 	}
1312 
1313 	pgshift = __ffs(iommu->pgsize_bitmap);
1314 	pgsize = (size_t)1 << pgshift;
1315 
1316 	if (iova & (pgsize - 1))
1317 		goto unlock;
1318 
1319 	if (unmap_all) {
1320 		if (iova || size)
1321 			goto unlock;
1322 		size = U64_MAX;
1323 	} else if (!size || size & (pgsize - 1) ||
1324 		   iova + size - 1 < iova || size > SIZE_MAX) {
1325 		goto unlock;
1326 	}
1327 
1328 	/* When dirty tracking is enabled, allow only min supported pgsize */
1329 	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1330 	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1331 		goto unlock;
1332 	}
1333 
1334 	WARN_ON((pgsize - 1) & PAGE_MASK);
1335 again:
1336 	/*
1337 	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1338 	 * avoid tracking individual mappings.  This means that the granularity
1339 	 * of the original mapping was lost and the user was allowed to attempt
1340 	 * to unmap any range.  Depending on the contiguousness of physical
1341 	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1342 	 * or may not have worked.  We only guaranteed unmap granularity
1343 	 * matching the original mapping; even though it was untracked here,
1344 	 * the original mappings are reflected in IOMMU mappings.  This
1345 	 * resulted in a couple unusual behaviors.  First, if a range is not
1346 	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
1347 	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1348 	 * a zero sized unmap.  Also, if an unmap request overlaps the first
1349 	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
1350 	 * This also returns success and the returned unmap size reflects the
1351 	 * actual size unmapped.
1352 	 *
1353 	 * We attempt to maintain compatibility with this "v1" interface, but
1354 	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
1355 	 * request offset from the beginning of the original mapping will
1356 	 * return success with zero sized unmap.  And an unmap request covering
1357 	 * the first iova of mapping will unmap the entire range.
1358 	 *
1359 	 * The v2 version of this interface intends to be more deterministic.
1360 	 * Unmap requests must fully cover previous mappings.  Multiple
1361 	 * mappings may still be unmapped by specifying large ranges, but there
1362 	 * must not be any previous mappings bisected by the range.  An error
1363 	 * will be returned if these conditions are not met.  The v2 interface
1364 	 * will only return success and a size of zero if there were no
1365 	 * mappings within the range.
1366 	 */
1367 	if (iommu->v2 && !unmap_all) {
1368 		dma = vfio_find_dma(iommu, iova, 1);
1369 		if (dma && dma->iova != iova)
1370 			goto unlock;
1371 
1372 		dma = vfio_find_dma(iommu, iova + size - 1, 0);
1373 		if (dma && dma->iova + dma->size != iova + size)
1374 			goto unlock;
1375 	}
1376 
1377 	ret = 0;
1378 	n = first_n = vfio_find_dma_first_node(iommu, iova, size);
1379 
1380 	while (n) {
1381 		dma = rb_entry(n, struct vfio_dma, node);
1382 		if (dma->iova >= iova + size)
1383 			break;
1384 
1385 		if (!iommu->v2 && iova > dma->iova)
1386 			break;
1387 
1388 		if (invalidate_vaddr) {
1389 			if (dma->vaddr_invalid) {
1390 				struct rb_node *last_n = n;
1391 
1392 				for (n = first_n; n != last_n; n = rb_next(n)) {
1393 					dma = rb_entry(n,
1394 						       struct vfio_dma, node);
1395 					dma->vaddr_invalid = false;
1396 					iommu->vaddr_invalid_count--;
1397 				}
1398 				ret = -EINVAL;
1399 				unmapped = 0;
1400 				break;
1401 			}
1402 			dma->vaddr_invalid = true;
1403 			iommu->vaddr_invalid_count++;
1404 			unmapped += dma->size;
1405 			n = rb_next(n);
1406 			continue;
1407 		}
1408 
1409 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1410 			if (dma_last == dma) {
1411 				BUG_ON(++retries > 10);
1412 			} else {
1413 				dma_last = dma;
1414 				retries = 0;
1415 			}
1416 
1417 			vfio_notify_dma_unmap(iommu, dma);
1418 			goto again;
1419 		}
1420 
1421 		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1422 			ret = update_user_bitmap(bitmap->data, iommu, dma,
1423 						 iova, pgsize);
1424 			if (ret)
1425 				break;
1426 		}
1427 
1428 		unmapped += dma->size;
1429 		n = rb_next(n);
1430 		vfio_remove_dma(iommu, dma);
1431 	}
1432 
1433 unlock:
1434 	mutex_unlock(&iommu->lock);
1435 
1436 	/* Report how much was unmapped */
1437 	unmap->size = unmapped;
1438 
1439 	return ret;
1440 }
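
/*
 * A minimal userspace sketch of the v2 unmap semantics handled above
 * ("container_fd" is a hypothetical, already configured VFIO container fd;
 * error handling omitted).  The range may span several mappings but must not
 * bisect one; VFIO_DMA_UNMAP_FLAG_ALL with iova and size of zero unmaps
 * everything instead.
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000,
 *		.size  = 0x200000,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *	(unmap.size then reports how much was actually unmapped)
 */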
1441 
1442 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1443 			  unsigned long pfn, long npage, int prot)
1444 {
1445 	struct vfio_domain *d;
1446 	int ret;
1447 
1448 	list_for_each_entry(d, &iommu->domain_list, next) {
1449 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1450 				npage << PAGE_SHIFT, prot | IOMMU_CACHE,
1451 				GFP_KERNEL_ACCOUNT);
1452 		if (ret)
1453 			goto unwind;
1454 
1455 		cond_resched();
1456 	}
1457 
1458 	return 0;
1459 
1460 unwind:
1461 	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1462 		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1463 		cond_resched();
1464 	}
1465 
1466 	return ret;
1467 }
1468 
1469 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1470 			    size_t map_size)
1471 {
1472 	dma_addr_t iova = dma->iova;
1473 	unsigned long vaddr = dma->vaddr;
1474 	struct vfio_batch batch;
1475 	size_t size = map_size;
1476 	long npage;
1477 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1478 	int ret = 0;
1479 
1480 	vfio_batch_init(&batch);
1481 
1482 	while (size) {
1483 		/* Pin a contiguous chunk of memory */
1484 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1485 					      size >> PAGE_SHIFT, &pfn, limit,
1486 					      &batch);
1487 		if (npage <= 0) {
1488 			WARN_ON(!npage);
1489 			ret = (int)npage;
1490 			break;
1491 		}
1492 
1493 		/* Map it! */
1494 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1495 				     dma->prot);
1496 		if (ret) {
1497 			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1498 						npage, true);
1499 			vfio_batch_unpin(&batch, dma);
1500 			break;
1501 		}
1502 
1503 		size -= npage << PAGE_SHIFT;
1504 		dma->size += npage << PAGE_SHIFT;
1505 	}
1506 
1507 	vfio_batch_fini(&batch);
1508 	dma->iommu_mapped = true;
1509 
1510 	if (ret)
1511 		vfio_remove_dma(iommu, dma);
1512 
1513 	return ret;
1514 }
1515 
1516 /*
1517  * Check dma map request is within a valid iova range
1518  */
1519 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1520 				      dma_addr_t start, dma_addr_t end)
1521 {
1522 	struct list_head *iova = &iommu->iova_list;
1523 	struct vfio_iova *node;
1524 
1525 	list_for_each_entry(node, iova, list) {
1526 		if (start >= node->start && end <= node->end)
1527 			return true;
1528 	}
1529 
1530 	/*
1531 	 * Check for list_empty() as well since a container with
1532 	 * a single mdev device will have an empty list.
1533 	 */
1534 	return list_empty(iova);
1535 }
1536 
1537 static int vfio_change_dma_owner(struct vfio_dma *dma)
1538 {
1539 	struct task_struct *task = current->group_leader;
1540 	struct mm_struct *mm = current->mm;
1541 	long npage = dma->locked_vm;
1542 	bool lock_cap;
1543 	int ret;
1544 
1545 	if (mm == dma->mm)
1546 		return 0;
1547 
1548 	lock_cap = capable(CAP_IPC_LOCK);
1549 	ret = mm_lock_acct(task, mm, lock_cap, npage);
1550 	if (ret)
1551 		return ret;
1552 
1553 	if (mmget_not_zero(dma->mm)) {
1554 		mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
1555 		mmput(dma->mm);
1556 	}
1557 
1558 	if (dma->task != task) {
1559 		put_task_struct(dma->task);
1560 		dma->task = get_task_struct(task);
1561 	}
1562 	mmdrop(dma->mm);
1563 	dma->mm = mm;
1564 	mmgrab(dma->mm);
1565 	dma->lock_cap = lock_cap;
1566 	return 0;
1567 }
1568 
1569 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1570 			   struct vfio_iommu_type1_dma_map *map)
1571 {
1572 	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
1573 	dma_addr_t iova = map->iova;
1574 	unsigned long vaddr = map->vaddr;
1575 	size_t size = map->size;
1576 	int ret = 0, prot = 0;
1577 	size_t pgsize;
1578 	struct vfio_dma *dma;
1579 
1580 	/* Verify that none of our __u64 fields overflow */
1581 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1582 		return -EINVAL;
1583 
1584 	/* READ/WRITE from device perspective */
1585 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1586 		prot |= IOMMU_WRITE;
1587 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1588 		prot |= IOMMU_READ;
1589 
1590 	if ((prot && set_vaddr) || (!prot && !set_vaddr))
1591 		return -EINVAL;
1592 
1593 	mutex_lock(&iommu->lock);
1594 
1595 	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1596 
1597 	WARN_ON((pgsize - 1) & PAGE_MASK);
1598 
1599 	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
1600 		ret = -EINVAL;
1601 		goto out_unlock;
1602 	}
1603 
1604 	/* Don't allow IOVA or virtual address wrap */
1605 	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1606 		ret = -EINVAL;
1607 		goto out_unlock;
1608 	}
1609 
1610 	dma = vfio_find_dma(iommu, iova, size);
1611 	if (set_vaddr) {
1612 		if (!dma) {
1613 			ret = -ENOENT;
1614 		} else if (!dma->vaddr_invalid || dma->iova != iova ||
1615 			   dma->size != size) {
1616 			ret = -EINVAL;
1617 		} else {
1618 			ret = vfio_change_dma_owner(dma);
1619 			if (ret)
1620 				goto out_unlock;
1621 			dma->vaddr = vaddr;
1622 			dma->vaddr_invalid = false;
1623 			iommu->vaddr_invalid_count--;
1624 		}
1625 		goto out_unlock;
1626 	} else if (dma) {
1627 		ret = -EEXIST;
1628 		goto out_unlock;
1629 	}
1630 
1631 	if (!iommu->dma_avail) {
1632 		ret = -ENOSPC;
1633 		goto out_unlock;
1634 	}
1635 
1636 	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1637 		ret = -EINVAL;
1638 		goto out_unlock;
1639 	}
1640 
1641 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1642 	if (!dma) {
1643 		ret = -ENOMEM;
1644 		goto out_unlock;
1645 	}
1646 
1647 	iommu->dma_avail--;
1648 	dma->iova = iova;
1649 	dma->vaddr = vaddr;
1650 	dma->prot = prot;
1651 
1652 	/*
1653 	 * We need to be able to both add to a task's locked memory and test
1654 	 * against the locked memory limit and we need to be able to do both
1655 	 * outside of this call path as pinning can be asynchronous via the
1656 	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1657 	 * task_struct. Save the group_leader so that all DMA tracking uses
1658 	 * the same task, to make debugging easier.  VM locked pages requires
1659 	 * an mm_struct, so grab the mm in case the task dies.
1660 	 */
1661 	get_task_struct(current->group_leader);
1662 	dma->task = current->group_leader;
1663 	dma->lock_cap = capable(CAP_IPC_LOCK);
1664 	dma->mm = current->mm;
1665 	mmgrab(dma->mm);
1666 
1667 	dma->pfn_list = RB_ROOT;
1668 
1669 	/* Insert zero-sized and grow as we map chunks of it */
1670 	vfio_link_dma(iommu, dma);
1671 
1672 	/* Don't pin and map if container doesn't contain IOMMU capable domain */
1673 	if (list_empty(&iommu->domain_list))
1674 		dma->size = size;
1675 	else
1676 		ret = vfio_pin_map_dma(iommu, dma, size);
1677 
1678 	if (!ret && iommu->dirty_page_tracking) {
1679 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
1680 		if (ret)
1681 			vfio_remove_dma(iommu, dma);
1682 	}
1683 
1684 out_unlock:
1685 	mutex_unlock(&iommu->lock);
1686 	return ret;
1687 }
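
/*
 * A minimal userspace sketch of the map path above ("container_fd" and "buf"
 * are hypothetical; buf must be page aligned and error handling is omitted).
 * The flags select device-side access and translate to IOMMU_READ/IOMMU_WRITE
 * at the top of vfio_dma_do_map():
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = 0x200000,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */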
1688 
1689 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1690 			     struct vfio_domain *domain)
1691 {
1692 	struct vfio_batch batch;
1693 	struct vfio_domain *d = NULL;
1694 	struct rb_node *n;
1695 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1696 	int ret;
1697 
1698 	/* Arbitrarily pick the first domain in the list for lookups */
1699 	if (!list_empty(&iommu->domain_list))
1700 		d = list_first_entry(&iommu->domain_list,
1701 				     struct vfio_domain, next);
1702 
1703 	vfio_batch_init(&batch);
1704 
1705 	n = rb_first(&iommu->dma_list);
1706 
1707 	for (; n; n = rb_next(n)) {
1708 		struct vfio_dma *dma;
1709 		dma_addr_t iova;
1710 
1711 		dma = rb_entry(n, struct vfio_dma, node);
1712 		iova = dma->iova;
1713 
1714 		while (iova < dma->iova + dma->size) {
1715 			phys_addr_t phys;
1716 			size_t size;
1717 
1718 			if (dma->iommu_mapped) {
1719 				phys_addr_t p;
1720 				dma_addr_t i;
1721 
1722 				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1723 					ret = -EINVAL;
1724 					goto unwind;
1725 				}
1726 
1727 				phys = iommu_iova_to_phys(d->domain, iova);
1728 
1729 				if (WARN_ON(!phys)) {
1730 					iova += PAGE_SIZE;
1731 					continue;
1732 				}
1733 
1734 				size = PAGE_SIZE;
1735 				p = phys + size;
1736 				i = iova + size;
1737 				while (i < dma->iova + dma->size &&
1738 				       p == iommu_iova_to_phys(d->domain, i)) {
1739 					size += PAGE_SIZE;
1740 					p += PAGE_SIZE;
1741 					i += PAGE_SIZE;
1742 				}
1743 			} else {
1744 				unsigned long pfn;
1745 				unsigned long vaddr = dma->vaddr +
1746 						     (iova - dma->iova);
1747 				size_t n = dma->iova + dma->size - iova;
1748 				long npage;
1749 
1750 				npage = vfio_pin_pages_remote(dma, vaddr,
1751 							      n >> PAGE_SHIFT,
1752 							      &pfn, limit,
1753 							      &batch);
1754 				if (npage <= 0) {
1755 					WARN_ON(!npage);
1756 					ret = (int)npage;
1757 					goto unwind;
1758 				}
1759 
1760 				phys = pfn << PAGE_SHIFT;
1761 				size = npage << PAGE_SHIFT;
1762 			}
1763 
1764 			ret = iommu_map(domain->domain, iova, phys, size,
1765 					dma->prot | IOMMU_CACHE,
1766 					GFP_KERNEL_ACCOUNT);
1767 			if (ret) {
1768 				if (!dma->iommu_mapped) {
1769 					vfio_unpin_pages_remote(dma, iova,
1770 							phys >> PAGE_SHIFT,
1771 							size >> PAGE_SHIFT,
1772 							true);
1773 					vfio_batch_unpin(&batch, dma);
1774 				}
1775 				goto unwind;
1776 			}
1777 
1778 			iova += size;
1779 		}
1780 	}
1781 
1782 	/* All dmas are now mapped, defer to second tree walk for unwind */
1783 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1784 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1785 
1786 		dma->iommu_mapped = true;
1787 	}
1788 
1789 	vfio_batch_fini(&batch);
1790 	return 0;
1791 
1792 unwind:
1793 	for (; n; n = rb_prev(n)) {
1794 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1795 		dma_addr_t iova;
1796 
1797 		if (dma->iommu_mapped) {
1798 			iommu_unmap(domain->domain, dma->iova, dma->size);
1799 			continue;
1800 		}
1801 
1802 		iova = dma->iova;
1803 		while (iova < dma->iova + dma->size) {
1804 			phys_addr_t phys, p;
1805 			size_t size;
1806 			dma_addr_t i;
1807 
1808 			phys = iommu_iova_to_phys(domain->domain, iova);
1809 			if (!phys) {
1810 				iova += PAGE_SIZE;
1811 				continue;
1812 			}
1813 
1814 			size = PAGE_SIZE;
1815 			p = phys + size;
1816 			i = iova + size;
1817 			while (i < dma->iova + dma->size &&
1818 			       p == iommu_iova_to_phys(domain->domain, i)) {
1819 				size += PAGE_SIZE;
1820 				p += PAGE_SIZE;
1821 				i += PAGE_SIZE;
1822 			}
1823 
1824 			iommu_unmap(domain->domain, iova, size);
1825 			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1826 						size >> PAGE_SHIFT, true);
1827 		}
1828 	}
1829 
1830 	vfio_batch_fini(&batch);
1831 	return ret;
1832 }
1833 
1834 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
1835 						 struct iommu_group *iommu_group)
1836 {
1837 	struct vfio_iommu_group *g;
1838 
1839 	list_for_each_entry(g, &domain->group_list, next) {
1840 		if (g->iommu_group == iommu_group)
1841 			return g;
1842 	}
1843 
1844 	return NULL;
1845 }
1846 
1847 static struct vfio_iommu_group*
1848 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1849 			    struct iommu_group *iommu_group)
1850 {
1851 	struct vfio_iommu_group *group;
1852 	struct vfio_domain *domain;
1853 
1854 	list_for_each_entry(domain, &iommu->domain_list, next) {
1855 		group = find_iommu_group(domain, iommu_group);
1856 		if (group)
1857 			return group;
1858 	}
1859 
1860 	list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
1861 		if (group->iommu_group == iommu_group)
1862 			return group;
1863 	return NULL;
1864 }
1865 
1866 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1867 				  phys_addr_t *base)
1868 {
1869 	struct iommu_resv_region *region;
1870 	bool ret = false;
1871 
1872 	list_for_each_entry(region, group_resv_regions, list) {
1873 		/*
1874 		 * The presence of any 'real' MSI regions should take
1875 		 * precedence over the software-managed one if the
1876 		 * IOMMU driver happens to advertise both types.
1877 		 */
1878 		if (region->type == IOMMU_RESV_MSI) {
1879 			ret = false;
1880 			break;
1881 		}
1882 
1883 		if (region->type == IOMMU_RESV_SW_MSI) {
1884 			*base = region->start;
1885 			ret = true;
1886 		}
1887 	}
1888 
1889 	return ret;
1890 }
1891 
1892 /*
1893  * This is a helper function to insert an address range into the iova list.
1894  * The list is initially created with a single entry corresponding to
1895  * the IOMMU domain geometry to which the device group is attached.
1896  * The list aperture gets modified when a new domain is added to the
1897  * container if the new aperture doesn't conflict with the current one
1898  * or with any existing dma mappings. The list is also modified to
1899  * exclude any reserved regions associated with the device group.
1900  */
1901 static int vfio_iommu_iova_insert(struct list_head *head,
1902 				  dma_addr_t start, dma_addr_t end)
1903 {
1904 	struct vfio_iova *region;
1905 
1906 	region = kmalloc(sizeof(*region), GFP_KERNEL);
1907 	if (!region)
1908 		return -ENOMEM;
1909 
1910 	INIT_LIST_HEAD(&region->list);
1911 	region->start = start;
1912 	region->end = end;
1913 
1914 	list_add_tail(&region->list, head);
1915 	return 0;
1916 }
1917 
1918 /*
1919  * Check whether the new iommu aperture conflicts with the existing
1920  * aperture or with any existing dma mappings.
1921  */
1922 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1923 				     dma_addr_t start, dma_addr_t end)
1924 {
1925 	struct vfio_iova *first, *last;
1926 	struct list_head *iova = &iommu->iova_list;
1927 
1928 	if (list_empty(iova))
1929 		return false;
1930 
1931 	/* Disjoint sets, return conflict */
1932 	first = list_first_entry(iova, struct vfio_iova, list);
1933 	last = list_last_entry(iova, struct vfio_iova, list);
1934 	if (start > last->end || end < first->start)
1935 		return true;
1936 
1937 	/* Check for any existing dma mappings below the new start */
1938 	if (start > first->start) {
1939 		if (vfio_find_dma(iommu, first->start, start - first->start))
1940 			return true;
1941 	}
1942 
1943 	/* Check for any existing dma mappings beyond the new end */
1944 	if (end < last->end) {
1945 		if (vfio_find_dma(iommu, end + 1, last->end - end))
1946 			return true;
1947 	}
1948 
1949 	return false;
1950 }
1951 
1952 /*
1953  * Resize iommu iova aperture window. This is called only if the new
1954  * aperture has no conflict with existing aperture and dma mappings.
1955  */
1956 static int vfio_iommu_aper_resize(struct list_head *iova,
1957 				  dma_addr_t start, dma_addr_t end)
1958 {
1959 	struct vfio_iova *node, *next;
1960 
1961 	if (list_empty(iova))
1962 		return vfio_iommu_iova_insert(iova, start, end);
1963 
1964 	/* Adjust iova list start */
1965 	list_for_each_entry_safe(node, next, iova, list) {
1966 		if (start < node->start)
1967 			break;
1968 		if (start >= node->start && start < node->end) {
1969 			node->start = start;
1970 			break;
1971 		}
1972 		/* Delete nodes before new start */
1973 		list_del(&node->list);
1974 		kfree(node);
1975 	}
1976 
1977 	/* Adjust iova list end */
1978 	list_for_each_entry_safe(node, next, iova, list) {
1979 		if (end > node->end)
1980 			continue;
1981 		if (end > node->start && end <= node->end) {
1982 			node->end = end;
1983 			continue;
1984 		}
1985 		/* Delete nodes after new end */
1986 		list_del(&node->list);
1987 		kfree(node);
1988 	}
1989 
1990 	return 0;
1991 }
1992 
1993 /*
1994  * Check reserved region conflicts with existing dma mappings
1995  */
1996 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
1997 				     struct list_head *resv_regions)
1998 {
1999 	struct iommu_resv_region *region;
2000 
2001 	/* Check for conflict with existing dma mappings */
2002 	list_for_each_entry(region, resv_regions, list) {
2003 		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2004 			continue;
2005 
2006 		if (vfio_find_dma(iommu, region->start, region->length))
2007 			return true;
2008 	}
2009 
2010 	return false;
2011 }
2012 
2013 /*
2014  * Check whether the iova regions overlap with reserved regions and
2015  * exclude those overlaps from the iommu iova range.
2016  */
2017 static int vfio_iommu_resv_exclude(struct list_head *iova,
2018 				   struct list_head *resv_regions)
2019 {
2020 	struct iommu_resv_region *resv;
2021 	struct vfio_iova *n, *next;
2022 
2023 	list_for_each_entry(resv, resv_regions, list) {
2024 		phys_addr_t start, end;
2025 
2026 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2027 			continue;
2028 
2029 		start = resv->start;
2030 		end = resv->start + resv->length - 1;
2031 
2032 		list_for_each_entry_safe(n, next, iova, list) {
2033 			int ret = 0;
2034 
2035 			/* No overlap */
2036 			if (start > n->end || end < n->start)
2037 				continue;
2038 			/*
2039 			 * Insert new nodes if the current node overlaps with
2040 			 * the reserved region, to exclude the overlap from the
2041 			 * valid iova range. Note that new nodes are inserted
2042 			 * before the current node and the current node is
2043 			 * finally deleted, keeping the list updated and sorted.
2044 			 */
2045 			if (start > n->start)
2046 				ret = vfio_iommu_iova_insert(&n->list, n->start,
2047 							     start - 1);
2048 			if (!ret && end < n->end)
2049 				ret = vfio_iommu_iova_insert(&n->list, end + 1,
2050 							     n->end);
2051 			if (ret)
2052 				return ret;
2053 
2054 			list_del(&n->list);
2055 			kfree(n);
2056 		}
2057 	}
2058 
2059 	if (list_empty(iova))
2060 		return -EINVAL;
2061 
2062 	return 0;
2063 }
2064 
2065 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2066 {
2067 	struct iommu_resv_region *n, *next;
2068 
2069 	list_for_each_entry_safe(n, next, resv_regions, list) {
2070 		list_del(&n->list);
2071 		kfree(n);
2072 	}
2073 }
2074 
2075 static void vfio_iommu_iova_free(struct list_head *iova)
2076 {
2077 	struct vfio_iova *n, *next;
2078 
2079 	list_for_each_entry_safe(n, next, iova, list) {
2080 		list_del(&n->list);
2081 		kfree(n);
2082 	}
2083 }
2084 
2085 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2086 				    struct list_head *iova_copy)
2087 {
2088 	struct list_head *iova = &iommu->iova_list;
2089 	struct vfio_iova *n;
2090 	int ret;
2091 
2092 	list_for_each_entry(n, iova, list) {
2093 		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2094 		if (ret)
2095 			goto out_free;
2096 	}
2097 
2098 	return 0;
2099 
2100 out_free:
2101 	vfio_iommu_iova_free(iova_copy);
2102 	return ret;
2103 }
2104 
2105 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2106 					struct list_head *iova_copy)
2107 {
2108 	struct list_head *iova = &iommu->iova_list;
2109 
2110 	vfio_iommu_iova_free(iova);
2111 
2112 	list_splice_tail(iova_copy, iova);
2113 }
2114 
2115 static int vfio_iommu_domain_alloc(struct device *dev, void *data)
2116 {
2117 	struct iommu_domain **domain = data;
2118 
2119 	*domain = iommu_paging_domain_alloc(dev);
2120 	return 1; /* Don't iterate */
2121 }
2122 
2123 static int vfio_iommu_type1_attach_group(void *iommu_data,
2124 		struct iommu_group *iommu_group, enum vfio_group_type type)
2125 {
2126 	struct vfio_iommu *iommu = iommu_data;
2127 	struct vfio_iommu_group *group;
2128 	struct vfio_domain *domain, *d;
2129 	bool resv_msi;
2130 	phys_addr_t resv_msi_base = 0;
2131 	struct iommu_domain_geometry *geo;
2132 	LIST_HEAD(iova_copy);
2133 	LIST_HEAD(group_resv_regions);
2134 	int ret = -EBUSY;
2135 
2136 	mutex_lock(&iommu->lock);
2137 
2138 	/* Attach could require pinning, so disallow while vaddr is invalid. */
2139 	if (iommu->vaddr_invalid_count)
2140 		goto out_unlock;
2141 
2142 	/* Check for duplicates */
2143 	ret = -EINVAL;
2144 	if (vfio_iommu_find_iommu_group(iommu, iommu_group))
2145 		goto out_unlock;
2146 
2147 	ret = -ENOMEM;
2148 	group = kzalloc(sizeof(*group), GFP_KERNEL);
2149 	if (!group)
2150 		goto out_unlock;
2151 	group->iommu_group = iommu_group;
2152 
2153 	if (type == VFIO_EMULATED_IOMMU) {
2154 		list_add(&group->next, &iommu->emulated_iommu_groups);
2155 		/*
2156 		 * An emulated IOMMU group cannot dirty memory directly; it can
2157 		 * only use interfaces that provide dirty tracking.
2158 		 * The iommu scope can only be promoted with the addition of a
2159 		 * dirty tracking group.
2160 		 */
2161 		group->pinned_page_dirty_scope = true;
2162 		ret = 0;
2163 		goto out_unlock;
2164 	}
2165 
2166 	ret = -ENOMEM;
2167 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2168 	if (!domain)
2169 		goto out_free_group;
2170 
2171 	/*
2172 	 * Going via the iommu_group iterator avoids races, and trivially gives
2173 	 * us a representative device for the IOMMU API call. We don't actually
2174 	 * want to iterate beyond the first device (if any).
2175 	 */
2176 	iommu_group_for_each_dev(iommu_group, &domain->domain,
2177 				 vfio_iommu_domain_alloc);
2178 	if (IS_ERR(domain->domain)) {
2179 		ret = PTR_ERR(domain->domain);
2180 		goto out_free_domain;
2181 	}
2182 
2183 	ret = iommu_attach_group(domain->domain, group->iommu_group);
2184 	if (ret)
2185 		goto out_domain;
2186 
2187 	/* Get aperture info */
2188 	geo = &domain->domain->geometry;
2189 	if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
2190 				     geo->aperture_end)) {
2191 		ret = -EINVAL;
2192 		goto out_detach;
2193 	}
2194 
2195 	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2196 	if (ret)
2197 		goto out_detach;
2198 
2199 	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2200 		ret = -EINVAL;
2201 		goto out_detach;
2202 	}
2203 
2204 	/*
2205 	 * We don't want to work on the original iova list as the list
2206 	 * gets modified and in case of failure we have to retain the
2207 	 * original list. Get a copy here.
2208 	 */
2209 	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2210 	if (ret)
2211 		goto out_detach;
2212 
2213 	ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
2214 				     geo->aperture_end);
2215 	if (ret)
2216 		goto out_detach;
2217 
2218 	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2219 	if (ret)
2220 		goto out_detach;
2221 
2222 	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2223 
2224 	INIT_LIST_HEAD(&domain->group_list);
2225 	list_add(&group->next, &domain->group_list);
2226 
2227 	if (!allow_unsafe_interrupts &&
2228 	    !iommu_group_has_isolated_msi(iommu_group)) {
2229 		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2230 		       __func__);
2231 		ret = -EPERM;
2232 		goto out_detach;
2233 	}
2234 
2235 	/*
2236 	 * If the IOMMU can block non-coherent operations (i.e. PCIe TLPs with
2237 	 * no-snoop set) then VFIO always turns this feature on, because on
2238 	 * Intel platforms it allows KVM to disable wbinvd emulation.
2239 	 */
2240 	if (domain->domain->ops->enforce_cache_coherency)
2241 		domain->enforce_cache_coherency =
2242 			domain->domain->ops->enforce_cache_coherency(
2243 				domain->domain);
2244 
2245 	/*
2246 	 * Try to match an existing compatible domain.  We don't want to
2247 	 * preclude an IOMMU driver supporting multiple bus_types and being
2248 	 * able to include different bus_types in the same IOMMU domain, so
2249 	 * we test whether the domains use the same iommu_ops rather than
2250 	 * testing if they're on the same bus_type.
2251 	 */
2252 	list_for_each_entry(d, &iommu->domain_list, next) {
2253 		if (d->domain->ops == domain->domain->ops &&
2254 		    d->enforce_cache_coherency ==
2255 			    domain->enforce_cache_coherency) {
2256 			iommu_detach_group(domain->domain, group->iommu_group);
2257 			if (!iommu_attach_group(d->domain,
2258 						group->iommu_group)) {
2259 				list_add(&group->next, &d->group_list);
2260 				iommu_domain_free(domain->domain);
2261 				kfree(domain);
2262 				goto done;
2263 			}
2264 
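			/*
			 * Attaching to the compatible domain failed; re-attach
			 * to the newly allocated domain so it can still be
			 * used below.
			 */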
2265 			ret = iommu_attach_group(domain->domain,
2266 						 group->iommu_group);
2267 			if (ret)
2268 				goto out_domain;
2269 		}
2270 	}
2271 
2272 	/* replay mappings on new domains */
2273 	ret = vfio_iommu_replay(iommu, domain);
2274 	if (ret)
2275 		goto out_detach;
2276 
2277 	if (resv_msi) {
2278 		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2279 		if (ret && ret != -ENODEV)
2280 			goto out_detach;
2281 	}
2282 
2283 	list_add(&domain->next, &iommu->domain_list);
2284 	vfio_update_pgsize_bitmap(iommu);
2285 done:
2286 	/* Delete the old one and insert new iova list */
2287 	/* Delete the old iova list and insert the new one */
2288 
2289 	/*
2290 	 * An iommu backed group can dirty memory directly and therefore
2291 	 * demotes the iommu scope until it declares itself dirty tracking
2292 	 * capable via the page pinning interface.
2293 	 */
2294 	iommu->num_non_pinned_groups++;
2295 	mutex_unlock(&iommu->lock);
2296 	vfio_iommu_resv_free(&group_resv_regions);
2297 
2298 	return 0;
2299 
2300 out_detach:
2301 	iommu_detach_group(domain->domain, group->iommu_group);
2302 out_domain:
2303 	iommu_domain_free(domain->domain);
2304 	vfio_iommu_iova_free(&iova_copy);
2305 	vfio_iommu_resv_free(&group_resv_regions);
2306 out_free_domain:
2307 	kfree(domain);
2308 out_free_group:
2309 	kfree(group);
2310 out_unlock:
2311 	mutex_unlock(&iommu->lock);
2312 	return ret;
2313 }
2314 
2315 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2316 {
2317 	struct rb_node *node;
2318 
2319 	while ((node = rb_first(&iommu->dma_list)))
2320 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2321 }
2322 
2323 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2324 {
2325 	struct rb_node *n, *p;
2326 
2327 	n = rb_first(&iommu->dma_list);
2328 	for (; n; n = rb_next(n)) {
2329 		struct vfio_dma *dma;
2330 		long locked = 0, unlocked = 0;
2331 
2332 		dma = rb_entry(n, struct vfio_dma, node);
2333 		unlocked += vfio_unmap_unpin(iommu, dma, false);
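		/*
		 * Pages still pinned via the pfn_list (external page pinning)
		 * remain locked; count them so the locked memory accounting
		 * can be corrected below.
		 */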
2334 		p = rb_first(&dma->pfn_list);
2335 		for (; p; p = rb_next(p)) {
2336 			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2337 							 node);
2338 
2339 			if (!is_invalid_reserved_pfn(vpfn->pfn))
2340 				locked++;
2341 		}
2342 		vfio_lock_acct(dma, locked - unlocked, true);
2343 	}
2344 }
2345 
2346 /*
2347  * Called when a domain is removed during detach. The removed domain
2348  * may have constrained the iova aperture window, so recompute the
2349  * aperture as the smallest window among the remaining domains.
2350  */
2351 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2352 				   struct list_head *iova_copy)
2353 {
2354 	struct vfio_domain *domain;
2355 	struct vfio_iova *node;
2356 	dma_addr_t start = 0;
2357 	dma_addr_t end = (dma_addr_t)~0;
2358 
2359 	if (list_empty(iova_copy))
2360 		return;
2361 
2362 	list_for_each_entry(domain, &iommu->domain_list, next) {
2363 		struct iommu_domain_geometry *geo = &domain->domain->geometry;
2364 
2365 		if (geo->aperture_start > start)
2366 			start = geo->aperture_start;
2367 		if (geo->aperture_end < end)
2368 			end = geo->aperture_end;
2369 	}
2370 
2371 	/* Modify aperture limits. The new aperture is either the same or bigger */
2372 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2373 	node->start = start;
2374 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2375 	node->end = end;
2376 }
2377 
2378 /*
2379  * Called when a group is detached. The reserved regions for that
2380  * group can be part of valid iova now. But since reserved regions
2381  * may be duplicated among groups, populate the iova valid regions
2382  * list again.
2383  */
2384 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2385 				   struct list_head *iova_copy)
2386 {
2387 	struct vfio_domain *d;
2388 	struct vfio_iommu_group *g;
2389 	struct vfio_iova *node;
2390 	dma_addr_t start, end;
2391 	LIST_HEAD(resv_regions);
2392 	int ret;
2393 
2394 	if (list_empty(iova_copy))
2395 		return -EINVAL;
2396 
2397 	list_for_each_entry(d, &iommu->domain_list, next) {
2398 		list_for_each_entry(g, &d->group_list, next) {
2399 			ret = iommu_get_group_resv_regions(g->iommu_group,
2400 							   &resv_regions);
2401 			if (ret)
2402 				goto done;
2403 		}
2404 	}
2405 
2406 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2407 	start = node->start;
2408 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2409 	end = node->end;
2410 
2411 	/* purge the iova list and create new one */
2412 	vfio_iommu_iova_free(iova_copy);
2413 
2414 	ret = vfio_iommu_aper_resize(iova_copy, start, end);
2415 	if (ret)
2416 		goto done;
2417 
2418 	/* Exclude current reserved regions from iova ranges */
2419 	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2420 done:
2421 	vfio_iommu_resv_free(&resv_regions);
2422 	return ret;
2423 }
2424 
2425 static void vfio_iommu_type1_detach_group(void *iommu_data,
2426 					  struct iommu_group *iommu_group)
2427 {
2428 	struct vfio_iommu *iommu = iommu_data;
2429 	struct vfio_domain *domain;
2430 	struct vfio_iommu_group *group;
2431 	bool update_dirty_scope = false;
2432 	LIST_HEAD(iova_copy);
2433 
2434 	mutex_lock(&iommu->lock);
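	/*
	 * Emulated IOMMU groups sit on their own list and have no hardware
	 * domain to detach from; just drop the group and clean up if it was
	 * the last one.
	 */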
2435 	list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
2436 		if (group->iommu_group != iommu_group)
2437 			continue;
2438 		update_dirty_scope = !group->pinned_page_dirty_scope;
2439 		list_del(&group->next);
2440 		kfree(group);
2441 
2442 		if (list_empty(&iommu->emulated_iommu_groups) &&
2443 		    list_empty(&iommu->domain_list)) {
2444 			WARN_ON(!list_empty(&iommu->device_list));
2445 			vfio_iommu_unmap_unpin_all(iommu);
2446 		}
2447 		goto detach_group_done;
2448 	}
2449 
2450 	/*
2451 	 * Get a copy of iova list. This will be used to update
2452 	 * and to replace the current one later. Please note that
2453 	 * we will leave the original list as it is if update fails.
2454 	 */
2455 	vfio_iommu_iova_get_copy(iommu, &iova_copy);
2456 
2457 	list_for_each_entry(domain, &iommu->domain_list, next) {
2458 		group = find_iommu_group(domain, iommu_group);
2459 		if (!group)
2460 			continue;
2461 
2462 		iommu_detach_group(domain->domain, group->iommu_group);
2463 		update_dirty_scope = !group->pinned_page_dirty_scope;
2464 		list_del(&group->next);
2465 		kfree(group);
2466 		/*
2467 		 * Group ownership provides privilege; if the group list is
2468 		 * empty, the domain goes away. If it's the last iommu-backed
2469 		 * domain and no emulated groups exist, all the mappings go
2470 		 * away too. If it's the last iommu-backed domain and emulated
2471 		 * groups do exist, update the pinned-page accounting.
2472 		 */
2473 		if (list_empty(&domain->group_list)) {
2474 			if (list_is_singular(&iommu->domain_list)) {
2475 				if (list_empty(&iommu->emulated_iommu_groups)) {
2476 					WARN_ON(!list_empty(
2477 						&iommu->device_list));
2478 					vfio_iommu_unmap_unpin_all(iommu);
2479 				} else {
2480 					vfio_iommu_unmap_unpin_reaccount(iommu);
2481 				}
2482 			}
2483 			iommu_domain_free(domain->domain);
2484 			list_del(&domain->next);
2485 			kfree(domain);
2486 			vfio_iommu_aper_expand(iommu, &iova_copy);
2487 			vfio_update_pgsize_bitmap(iommu);
2488 		}
2489 		break;
2490 	}
2491 
2492 	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2493 		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2494 	else
2495 		vfio_iommu_iova_free(&iova_copy);
2496 
2497 detach_group_done:
2498 	/*
2499 	 * Removal of a group without dirty tracking may allow the iommu scope
2500 	 * to be promoted.
2501 	 */
2502 	if (update_dirty_scope) {
2503 		iommu->num_non_pinned_groups--;
2504 		if (iommu->dirty_page_tracking)
2505 			vfio_iommu_populate_bitmap_full(iommu);
2506 	}
2507 	mutex_unlock(&iommu->lock);
2508 }
2509 
2510 static void *vfio_iommu_type1_open(unsigned long arg)
2511 {
2512 	struct vfio_iommu *iommu;
2513 
2514 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2515 	if (!iommu)
2516 		return ERR_PTR(-ENOMEM);
2517 
2518 	switch (arg) {
2519 	case VFIO_TYPE1_IOMMU:
2520 		break;
2521 	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
2522 	case VFIO_TYPE1v2_IOMMU:
2523 		iommu->v2 = true;
2524 		break;
2525 	default:
2526 		kfree(iommu);
2527 		return ERR_PTR(-EINVAL);
2528 	}
2529 
2530 	INIT_LIST_HEAD(&iommu->domain_list);
2531 	INIT_LIST_HEAD(&iommu->iova_list);
2532 	iommu->dma_list = RB_ROOT;
2533 	iommu->dma_avail = dma_entry_limit;
2534 	mutex_init(&iommu->lock);
2535 	mutex_init(&iommu->device_list_lock);
2536 	INIT_LIST_HEAD(&iommu->device_list);
2537 	iommu->pgsize_bitmap = PAGE_MASK;
2538 	INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
2539 
2540 	return iommu;
2541 }
2542 
2543 static void vfio_release_domain(struct vfio_domain *domain)
2544 {
2545 	struct vfio_iommu_group *group, *group_tmp;
2546 
2547 	list_for_each_entry_safe(group, group_tmp,
2548 				 &domain->group_list, next) {
2549 		iommu_detach_group(domain->domain, group->iommu_group);
2550 		list_del(&group->next);
2551 		kfree(group);
2552 	}
2553 
2554 	iommu_domain_free(domain->domain);
2555 }
2556 
2557 static void vfio_iommu_type1_release(void *iommu_data)
2558 {
2559 	struct vfio_iommu *iommu = iommu_data;
2560 	struct vfio_domain *domain, *domain_tmp;
2561 	struct vfio_iommu_group *group, *next_group;
2562 
2563 	list_for_each_entry_safe(group, next_group,
2564 			&iommu->emulated_iommu_groups, next) {
2565 		list_del(&group->next);
2566 		kfree(group);
2567 	}
2568 
2569 	vfio_iommu_unmap_unpin_all(iommu);
2570 
2571 	list_for_each_entry_safe(domain, domain_tmp,
2572 				 &iommu->domain_list, next) {
2573 		vfio_release_domain(domain);
2574 		list_del(&domain->next);
2575 		kfree(domain);
2576 	}
2577 
2578 	vfio_iommu_iova_free(&iommu->iova_list);
2579 
2580 	kfree(iommu);
2581 }
2582 
2583 static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
2584 {
2585 	struct vfio_domain *domain;
2586 	int ret = 1;
2587 
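	/* Coherency can only be claimed if every domain enforces it. */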
2588 	mutex_lock(&iommu->lock);
2589 	list_for_each_entry(domain, &iommu->domain_list, next) {
2590 		if (!(domain->enforce_cache_coherency)) {
2591 			ret = 0;
2592 			break;
2593 		}
2594 	}
2595 	mutex_unlock(&iommu->lock);
2596 
2597 	return ret;
2598 }
2599 
2600 static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
2601 {
2602 	bool ret;
2603 
2604 	mutex_lock(&iommu->lock);
2605 	ret = !list_empty(&iommu->emulated_iommu_groups);
2606 	mutex_unlock(&iommu->lock);
2607 	return ret;
2608 }
2609 
2610 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2611 					    unsigned long arg)
2612 {
2613 	switch (arg) {
2614 	case VFIO_TYPE1_IOMMU:
2615 	case VFIO_TYPE1v2_IOMMU:
2616 	case VFIO_UNMAP_ALL:
2617 		return 1;
2618 	case VFIO_UPDATE_VADDR:
2619 		/*
2620 		 * Disable this feature if mdevs are present.  They cannot
2621 		 * safely pin/unpin/rw while vaddrs are being updated.
2622 		 */
2623 		return iommu && !vfio_iommu_has_emulated(iommu);
2624 	case VFIO_DMA_CC_IOMMU:
2625 		if (!iommu)
2626 			return 0;
2627 		return vfio_domains_have_enforce_cache_coherency(iommu);
2628 	default:
2629 		return 0;
2630 	}
2631 }
2632 
2633 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2634 		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2635 		 size_t size)
2636 {
2637 	struct vfio_info_cap_header *header;
2638 	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2639 
2640 	header = vfio_info_cap_add(caps, size,
2641 				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2642 	if (IS_ERR(header))
2643 		return PTR_ERR(header);
2644 
2645 	iova_cap = container_of(header,
2646 				struct vfio_iommu_type1_info_cap_iova_range,
2647 				header);
2648 	iova_cap->nr_iovas = cap_iovas->nr_iovas;
2649 	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2650 	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2651 	return 0;
2652 }
2653 
2654 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2655 				      struct vfio_info_cap *caps)
2656 {
2657 	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2658 	struct vfio_iova *iova;
2659 	size_t size;
2660 	int iovas = 0, i = 0, ret;
2661 
2662 	list_for_each_entry(iova, &iommu->iova_list, list)
2663 		iovas++;
2664 
2665 	if (!iovas) {
2666 		/*
2667 		 * Return 0 as a container with a single mdev device
2668 		 * will have an empty list
2669 		 */
2670 		return 0;
2671 	}
2672 
2673 	size = struct_size(cap_iovas, iova_ranges, iovas);
2674 
2675 	cap_iovas = kzalloc(size, GFP_KERNEL);
2676 	if (!cap_iovas)
2677 		return -ENOMEM;
2678 
2679 	cap_iovas->nr_iovas = iovas;
2680 
2681 	list_for_each_entry(iova, &iommu->iova_list, list) {
2682 		cap_iovas->iova_ranges[i].start = iova->start;
2683 		cap_iovas->iova_ranges[i].end = iova->end;
2684 		i++;
2685 	}
2686 
2687 	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2688 
2689 	kfree(cap_iovas);
2690 	return ret;
2691 }
2692 
2693 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2694 					   struct vfio_info_cap *caps)
2695 {
2696 	struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2697 
2698 	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2699 	cap_mig.header.version = 1;
2700 
2701 	cap_mig.flags = 0;
2702 	/* support minimum pgsize */
2703 	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2704 	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2705 
2706 	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2707 }
2708 
2709 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2710 					   struct vfio_info_cap *caps)
2711 {
2712 	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2713 
2714 	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2715 	cap_dma_avail.header.version = 1;
2716 
2717 	cap_dma_avail.avail = iommu->dma_avail;
2718 
2719 	return vfio_info_add_capability(caps, &cap_dma_avail.header,
2720 					sizeof(cap_dma_avail));
2721 }
2722 
2723 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2724 				     unsigned long arg)
2725 {
2726 	struct vfio_iommu_type1_info info = {};
2727 	unsigned long minsz;
2728 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2729 	int ret;
2730 
2731 	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2732 
2733 	if (copy_from_user(&info, (void __user *)arg, minsz))
2734 		return -EFAULT;
2735 
2736 	if (info.argsz < minsz)
2737 		return -EINVAL;
2738 
2739 	minsz = min_t(size_t, info.argsz, sizeof(info));
2740 
2741 	mutex_lock(&iommu->lock);
2742 	info.flags = VFIO_IOMMU_INFO_PGSIZES;
2743 
2744 	info.iova_pgsizes = iommu->pgsize_bitmap;
2745 
2746 	ret = vfio_iommu_migration_build_caps(iommu, &caps);
2747 
2748 	if (!ret)
2749 		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2750 
2751 	if (!ret)
2752 		ret = vfio_iommu_iova_build_caps(iommu, &caps);
2753 
2754 	mutex_unlock(&iommu->lock);
2755 
2756 	if (ret)
2757 		return ret;
2758 
2759 	if (caps.size) {
2760 		info.flags |= VFIO_IOMMU_INFO_CAPS;
2761 
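		/*
		 * If the user buffer is too small for the capability chain,
		 * only report the required argsz; otherwise copy the chain
		 * after the fixed-size info structure.
		 */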
2762 		if (info.argsz < sizeof(info) + caps.size) {
2763 			info.argsz = sizeof(info) + caps.size;
2764 		} else {
2765 			vfio_info_cap_shift(&caps, sizeof(info));
2766 			if (copy_to_user((void __user *)arg +
2767 					sizeof(info), caps.buf,
2768 					caps.size)) {
2769 				kfree(caps.buf);
2770 				return -EFAULT;
2771 			}
2772 			info.cap_offset = sizeof(info);
2773 		}
2774 
2775 		kfree(caps.buf);
2776 	}
2777 
2778 	return copy_to_user((void __user *)arg, &info, minsz) ?
2779 			-EFAULT : 0;
2780 }
2781 
2782 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2783 				    unsigned long arg)
2784 {
2785 	struct vfio_iommu_type1_dma_map map;
2786 	unsigned long minsz;
2787 	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2788 			VFIO_DMA_MAP_FLAG_VADDR;
2789 
2790 	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2791 
2792 	if (copy_from_user(&map, (void __user *)arg, minsz))
2793 		return -EFAULT;
2794 
2795 	if (map.argsz < minsz || map.flags & ~mask)
2796 		return -EINVAL;
2797 
2798 	return vfio_dma_do_map(iommu, &map);
2799 }
2800 
2801 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2802 				      unsigned long arg)
2803 {
2804 	struct vfio_iommu_type1_dma_unmap unmap;
2805 	struct vfio_bitmap bitmap = { 0 };
2806 	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2807 			VFIO_DMA_UNMAP_FLAG_VADDR |
2808 			VFIO_DMA_UNMAP_FLAG_ALL;
2809 	unsigned long minsz;
2810 	int ret;
2811 
2812 	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2813 
2814 	if (copy_from_user(&unmap, (void __user *)arg, minsz))
2815 		return -EFAULT;
2816 
2817 	if (unmap.argsz < minsz || unmap.flags & ~mask)
2818 		return -EINVAL;
2819 
2820 	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
2821 	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
2822 			    VFIO_DMA_UNMAP_FLAG_VADDR)))
2823 		return -EINVAL;
2824 
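	/* Validate the user-supplied dirty bitmap before unmapping. */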
2825 	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2826 		unsigned long pgshift;
2827 
2828 		if (unmap.argsz < (minsz + sizeof(bitmap)))
2829 			return -EINVAL;
2830 
2831 		if (copy_from_user(&bitmap,
2832 				   (void __user *)(arg + minsz),
2833 				   sizeof(bitmap)))
2834 			return -EFAULT;
2835 
2836 		if (!access_ok((void __user *)bitmap.data, bitmap.size))
2837 			return -EINVAL;
2838 
2839 		pgshift = __ffs(bitmap.pgsize);
2840 		ret = verify_bitmap_size(unmap.size >> pgshift,
2841 					 bitmap.size);
2842 		if (ret)
2843 			return ret;
2844 	}
2845 
2846 	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2847 	if (ret)
2848 		return ret;
2849 
2850 	return copy_to_user((void __user *)arg, &unmap, minsz) ?
2851 			-EFAULT : 0;
2852 }
2853 
2854 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2855 					unsigned long arg)
2856 {
2857 	struct vfio_iommu_type1_dirty_bitmap dirty;
2858 	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2859 			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2860 			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2861 	unsigned long minsz;
2862 	int ret = 0;
2863 
2864 	if (!iommu->v2)
2865 		return -EACCES;
2866 
2867 	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2868 
2869 	if (copy_from_user(&dirty, (void __user *)arg, minsz))
2870 		return -EFAULT;
2871 
2872 	if (dirty.argsz < minsz || dirty.flags & ~mask)
2873 		return -EINVAL;
2874 
2875 	/* only one flag should be set at a time */
2876 	if (__ffs(dirty.flags) != __fls(dirty.flags))
2877 		return -EINVAL;
2878 
2879 	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2880 		size_t pgsize;
2881 
2882 		mutex_lock(&iommu->lock);
2883 		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2884 		if (!iommu->dirty_page_tracking) {
2885 			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2886 			if (!ret)
2887 				iommu->dirty_page_tracking = true;
2888 		}
2889 		mutex_unlock(&iommu->lock);
2890 		return ret;
2891 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2892 		mutex_lock(&iommu->lock);
2893 		if (iommu->dirty_page_tracking) {
2894 			iommu->dirty_page_tracking = false;
2895 			vfio_dma_bitmap_free_all(iommu);
2896 		}
2897 		mutex_unlock(&iommu->lock);
2898 		return 0;
2899 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2900 		struct vfio_iommu_type1_dirty_bitmap_get range;
2901 		unsigned long pgshift;
2902 		size_t data_size = dirty.argsz - minsz;
2903 		size_t iommu_pgsize;
2904 
2905 		if (!data_size || data_size < sizeof(range))
2906 			return -EINVAL;
2907 
2908 		if (copy_from_user(&range, (void __user *)(arg + minsz),
2909 				   sizeof(range)))
2910 			return -EFAULT;
2911 
2912 		if (range.iova + range.size < range.iova)
2913 			return -EINVAL;
2914 		if (!access_ok((void __user *)range.bitmap.data,
2915 			       range.bitmap.size))
2916 			return -EINVAL;
2917 
2918 		pgshift = __ffs(range.bitmap.pgsize);
2919 		ret = verify_bitmap_size(range.size >> pgshift,
2920 					 range.bitmap.size);
2921 		if (ret)
2922 			return ret;
2923 
2924 		mutex_lock(&iommu->lock);
2925 
2926 		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2927 
2928 		/* allow only smallest supported pgsize */
2929 		if (range.bitmap.pgsize != iommu_pgsize) {
2930 			ret = -EINVAL;
2931 			goto out_unlock;
2932 		}
2933 		if (range.iova & (iommu_pgsize - 1)) {
2934 			ret = -EINVAL;
2935 			goto out_unlock;
2936 		}
2937 		if (!range.size || range.size & (iommu_pgsize - 1)) {
2938 			ret = -EINVAL;
2939 			goto out_unlock;
2940 		}
2941 
2942 		if (iommu->dirty_page_tracking)
2943 			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2944 						     iommu, range.iova,
2945 						     range.size,
2946 						     range.bitmap.pgsize);
2947 		else
2948 			ret = -EINVAL;
2949 out_unlock:
2950 		mutex_unlock(&iommu->lock);
2951 
2952 		return ret;
2953 	}
2954 
2955 	return -EINVAL;
2956 }
2957 
2958 static long vfio_iommu_type1_ioctl(void *iommu_data,
2959 				   unsigned int cmd, unsigned long arg)
2960 {
2961 	struct vfio_iommu *iommu = iommu_data;
2962 
2963 	switch (cmd) {
2964 	case VFIO_CHECK_EXTENSION:
2965 		return vfio_iommu_type1_check_extension(iommu, arg);
2966 	case VFIO_IOMMU_GET_INFO:
2967 		return vfio_iommu_type1_get_info(iommu, arg);
2968 	case VFIO_IOMMU_MAP_DMA:
2969 		return vfio_iommu_type1_map_dma(iommu, arg);
2970 	case VFIO_IOMMU_UNMAP_DMA:
2971 		return vfio_iommu_type1_unmap_dma(iommu, arg);
2972 	case VFIO_IOMMU_DIRTY_PAGES:
2973 		return vfio_iommu_type1_dirty_pages(iommu, arg);
2974 	default:
2975 		return -ENOTTY;
2976 	}
2977 }
2978 
2979 static void vfio_iommu_type1_register_device(void *iommu_data,
2980 					     struct vfio_device *vdev)
2981 {
2982 	struct vfio_iommu *iommu = iommu_data;
2983 
2984 	if (!vdev->ops->dma_unmap)
2985 		return;
2986 
2987 	/*
2988 	 * list_empty(&iommu->device_list) is tested under the iommu->lock,
2989 	 * while iteration for dma_unmap must be done under the
2990 	 * device_list_lock. Holding both locks here allows avoiding the
2991 	 * device_list_lock in several fast paths. See vfio_notify_dma_unmap().
2992 	 */
2993 	mutex_lock(&iommu->lock);
2994 	mutex_lock(&iommu->device_list_lock);
2995 	list_add(&vdev->iommu_entry, &iommu->device_list);
2996 	mutex_unlock(&iommu->device_list_lock);
2997 	mutex_unlock(&iommu->lock);
2998 }
2999 
3000 static void vfio_iommu_type1_unregister_device(void *iommu_data,
3001 					       struct vfio_device *vdev)
3002 {
3003 	struct vfio_iommu *iommu = iommu_data;
3004 
3005 	if (!vdev->ops->dma_unmap)
3006 		return;
3007 
3008 	mutex_lock(&iommu->lock);
3009 	mutex_lock(&iommu->device_list_lock);
3010 	list_del(&vdev->iommu_entry);
3011 	mutex_unlock(&iommu->device_list_lock);
3012 	mutex_unlock(&iommu->lock);
3013 }
3014 
3015 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3016 					 dma_addr_t user_iova, void *data,
3017 					 size_t count, bool write,
3018 					 size_t *copied)
3019 {
3020 	struct mm_struct *mm;
3021 	unsigned long vaddr;
3022 	struct vfio_dma *dma;
3023 	bool kthread = current->mm == NULL;
3024 	size_t offset;
3025 
3026 	*copied = 0;
3027 
3028 	dma = vfio_find_dma(iommu, user_iova, 1);
3029 	if (!dma)
3030 		return -EINVAL;
3031 
3032 	if ((write && !(dma->prot & IOMMU_WRITE)) ||
3033 			!(dma->prot & IOMMU_READ))
3034 		return -EPERM;
3035 
3036 	mm = dma->mm;
3037 	if (!mmget_not_zero(mm))
3038 		return -EPERM;
3039 
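	/*
	 * Kernel threads temporarily adopt the mapping task's mm; other
	 * tasks may only access mappings created under their own mm.
	 */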
3040 	if (kthread)
3041 		kthread_use_mm(mm);
3042 	else if (current->mm != mm)
3043 		goto out;
3044 
3045 	offset = user_iova - dma->iova;
3046 
3047 	if (count > dma->size - offset)
3048 		count = dma->size - offset;
3049 
3050 	vaddr = dma->vaddr + offset;
3051 
3052 	if (write) {
3053 		*copied = copy_to_user((void __user *)vaddr, data,
3054 					 count) ? 0 : count;
3055 		if (*copied && iommu->dirty_page_tracking) {
3056 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3057 			/*
3058 			 * Bitmap populated with the smallest supported page
3059 			 * size
3060 			 */
3061 			bitmap_set(dma->bitmap, offset >> pgshift,
3062 				   ((offset + *copied - 1) >> pgshift) -
3063 				   (offset >> pgshift) + 1);
3064 		}
3065 	} else
3066 		*copied = copy_from_user(data, (void __user *)vaddr,
3067 					   count) ? 0 : count;
3068 	if (kthread)
3069 		kthread_unuse_mm(mm);
3070 out:
3071 	mmput(mm);
3072 	return *copied ? 0 : -EFAULT;
3073 }
3074 
3075 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3076 				   void *data, size_t count, bool write)
3077 {
3078 	struct vfio_iommu *iommu = iommu_data;
3079 	int ret = 0;
3080 	size_t done;
3081 
3082 	mutex_lock(&iommu->lock);
3083 
3084 	if (WARN_ONCE(iommu->vaddr_invalid_count,
3085 		      "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
3086 		ret = -EBUSY;
3087 		goto out;
3088 	}
3089 
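	/*
	 * Split the transfer into chunks, each bounded by the vfio_dma
	 * mapping that contains the current iova.
	 */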
3090 	while (count > 0) {
3091 		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3092 						    count, write, &done);
3093 		if (ret)
3094 			break;
3095 
3096 		count -= done;
3097 		data += done;
3098 		user_iova += done;
3099 	}
3100 
3101 out:
3102 	mutex_unlock(&iommu->lock);
3103 	return ret;
3104 }
3105 
3106 static struct iommu_domain *
3107 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3108 				    struct iommu_group *iommu_group)
3109 {
3110 	struct iommu_domain *domain = ERR_PTR(-ENODEV);
3111 	struct vfio_iommu *iommu = iommu_data;
3112 	struct vfio_domain *d;
3113 
3114 	if (!iommu || !iommu_group)
3115 		return ERR_PTR(-EINVAL);
3116 
3117 	mutex_lock(&iommu->lock);
3118 	list_for_each_entry(d, &iommu->domain_list, next) {
3119 		if (find_iommu_group(d, iommu_group)) {
3120 			domain = d->domain;
3121 			break;
3122 		}
3123 	}
3124 	mutex_unlock(&iommu->lock);
3125 
3126 	return domain;
3127 }
3128 
3129 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3130 	.name			= "vfio-iommu-type1",
3131 	.owner			= THIS_MODULE,
3132 	.open			= vfio_iommu_type1_open,
3133 	.release		= vfio_iommu_type1_release,
3134 	.ioctl			= vfio_iommu_type1_ioctl,
3135 	.attach_group		= vfio_iommu_type1_attach_group,
3136 	.detach_group		= vfio_iommu_type1_detach_group,
3137 	.pin_pages		= vfio_iommu_type1_pin_pages,
3138 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
3139 	.register_device	= vfio_iommu_type1_register_device,
3140 	.unregister_device	= vfio_iommu_type1_unregister_device,
3141 	.dma_rw			= vfio_iommu_type1_dma_rw,
3142 	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
3143 };
3144 
3145 static int __init vfio_iommu_type1_init(void)
3146 {
3147 	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3148 }
3149 
3150 static void __exit vfio_iommu_type1_cleanup(void)
3151 {
3152 	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3153 }
3154 
3155 module_init(vfio_iommu_type1_init);
3156 module_exit(vfio_iommu_type1_cleanup);
3157 
3158 MODULE_VERSION(DRIVER_VERSION);
3159 MODULE_LICENSE("GPL v2");
3160 MODULE_AUTHOR(DRIVER_AUTHOR);
3161 MODULE_DESCRIPTION(DRIVER_DESC);
3162