// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/dma-buf.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};
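
/*
 * Illustrative sketch (not compiled): a minimal in-kernel use of the
 * io_pagetable API defined in this file. The caller, the iommufd_ctx, the
 * user pointer and the error handling here are hypothetical; the real users
 * are the IOAS and access/device paths elsewhere in iommufd.
 *
 *	struct io_pagetable iopt;
 *	unsigned long iova, unmapped;
 *	int rc;
 *
 *	iopt_init_table(&iopt);
 *	// Let the allocator pick the IOVA and map a user VA range
 *	rc = iopt_map_user_pages(ictx, &iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	// ... attach domains, register accesses, etc ...
 *	rc = iopt_unmap_iova(&iopt, iova, length, &unmapped);
 *	iopt_destroy_table(&iopt);
 */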

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
				     unsigned long length,
				     unsigned long iova_alignment,
				     unsigned long page_offset)
{
	unsigned long aligned_start;

	/* ALIGN_UP() */
	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
		return false;
	aligned_start &= ~(iova_alignment - 1);
	aligned_start |= page_offset;

	if (aligned_start >= last || last - aligned_start < length - 1)
		return false;
	*start = aligned_start;
	return true;
}
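
/*
 * Worked example of the overflow-safe ALIGN_UP above (illustrative values,
 * not part of the driver): with *start = 0x10001000, iova_alignment =
 * 0x200000 (2MiB) and page_offset = 0x123, check_add_overflow() yields
 * 0x10200FFF, masking with ~(0x200000 - 1) gives 0x10200000, and OR-ing in
 * the page offset produces 0x10200123. A *start near ULONG_MAX would instead
 * make the addition wrap, which check_add_overflow() reports so the candidate
 * hole is rejected rather than silently misplaced.
 */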

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used)
		return false;
	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
					length, iova_alignment, page_offset);
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole)
		return false;
	return __alloc_iova_check_range(&span->start_used, span->last_used,
					length, iova_alignment, page_offset);
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long addr, unsigned long length)
{
	unsigned long page_offset = addr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in addr when building the IOVA, which
	 * increases the chance we can map a THP.
	 */
	if (!addr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(addr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}
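
/*
 * Example of the alignment heuristic above (illustrative numbers only):
 * mapping length = 0x300000 starting at user address addr = 0x7f1234456000
 * gives roundup_pow_of_two(length) = 0x400000 and 1UL << __ffs64(addr) =
 * 0x2000, so the candidate IOVA is aligned to 0x2000. Matching the alignment
 * already present in the VA keeps THP-sized IOMMU mappings possible when the
 * VA itself is suitably aligned, and with CONFIG_TRANSPARENT_HUGEPAGE the
 * alignment is capped at HPAGE_SIZE so ALIGN() cannot overflow.
 */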

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}
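
/*
 * Sketch of the slice math above (illustrative values, assuming PAGE_SIZE =
 * 4096 and an iova_alignment of 1 so a sub-page offset is accepted): with
 * start_byte = 0x3100 and length = 0x2000 the area covers bytes
 * 0x3100..0x50FF of the iopt_pages, so page_offset = 0x100,
 * pages_node.start = 3 and pages_node.last = 0x50FF / 4096 = 5. That
 * page interval [3, 5] is what domains_itree later indexes once the area is
 * filled into a domain.
 */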

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long start;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		switch (elm->pages->type) {
		case IOPT_ADDRESS_USER:
			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
			break;
		case IOPT_ADDRESS_FILE:
			start = elm->start_byte + elm->pages->start;
			break;
		case IOPT_ADDRESS_DMABUF:
			start = elm->start_byte + elm->pages->dmabuf.start;
			break;
		}
		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			   struct iopt_pages *pages, unsigned long *iova,
			   unsigned long length, unsigned long start_byte,
			   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = pages;
	elm.start_byte = start_byte;
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	return iopt_map_common(ictx, iopt, pages, iova, length,
			       uptr - pages->uptr, iommu_prot, flags);
}
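
/*
 * Illustrative caller sketch (hypothetical, error handling trimmed): map a
 * user buffer at a caller-chosen IOVA instead of letting the allocator pick.
 *
 *	unsigned long iova = 0x100000;	// must satisfy iova_alignment
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, PAGE_SIZE * 16,
 *				 IOMMU_READ | IOMMU_WRITE, 0);
 *	// With flags == 0 the iova is an input; pass IOPT_ALLOC_IOVA instead
 *	// to have iopt_alloc_iova() select a free, suitably aligned range.
 */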

/**
 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @fd: fdno of a file to map
 * @start: map file starting at this byte offset
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 */
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, int fd, unsigned long start,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;
	struct dma_buf *dmabuf;
	unsigned long start_byte;
	unsigned long last;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(start, length - 1, &last))
		return -EOVERFLOW;

	start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
	dmabuf = dma_buf_get(fd);
	if (!IS_ERR(dmabuf)) {
		pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
						length,
						iommu_prot & IOMMU_WRITE);
		if (IS_ERR(pages)) {
			dma_buf_put(dmabuf);
			return PTR_ERR(pages);
		}
	} else {
		struct file *file;

		file = fget(fd);
		if (!file)
			return -EBADF;

		pages = iopt_alloc_file_pages(file, start_byte, start, length,
					      iommu_prot & IOMMU_WRITE);
		fput(file);
		if (IS_ERR(pages))
			return PTR_ERR(pages);
	}

	return iopt_map_common(ictx, iopt, pages, iova, length,
			       start_byte, iommu_prot, flags);
}
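
/*
 * Note on the dispatch above: the fd is probed as a dma-buf first and only
 * falls back to the regular file path when dma_buf_get() fails, so one call
 * covers both backing types. Hypothetical caller sketch:
 *
 *	// fd may be a memfd or a dma-buf fd supplied by userspace
 *	rc = iopt_map_file_pages(ictx, iopt, &iova, fd, offset, length,
 *				 IOMMU_READ, IOPT_ALLOC_IOVA);
 */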

struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};

static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}

int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}
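
/*
 * Typical dirty tracking flow built from the helpers above (a sketch; the
 * real callers are the HWPT dirty tracking ioctls, and the bitmap struct is
 * provided by userspace):
 *
 *	rc = iopt_set_dirty_tracking(iopt, domain, true);  // clears PTEs first
 *	// ... devices DMA into the domain for a while ...
 *	rc = iopt_read_and_clear_dirty_data(iopt, domain, 0, &bitmap);
 *	// pass IOMMU_DIRTY_NO_CLEAR in flags to snapshot without clearing
 */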

static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}
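
/*
 * Illustrative use of iopt_get_pages() (hypothetical caller): take a
 * reference on the iopt_pages backing a fully-mapped IOVA range, for example
 * before copying the mappings into another io_pagetable.
 *
 *	LIST_HEAD(pages_list);
 *
 *	rc = iopt_get_pages(iopt, iova, length, &pages_list);
 *	if (rc)
 *		return rc;
 *	// ... consume the iopt_pages_list entries ...
 *	iopt_free_pages_list(&pages_list);	// drops the kref on each pages
 */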

static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	/* If there are no mapped entries then success */
	int rc = 0;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmaps of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* The area is locked by an object that has not been destroyed */
		if (area->num_locks) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100)) {
				rc = -EDEADLOCK;
				goto out_unmapped;
			}
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
out_unmapped:
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}
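
/*
 * Example of the superset rule (illustrative IOVAs): with areas at
 * 0x0-0xFFFF and 0x10000-0x1FFFF, iopt_unmap_iova(iopt, 0, 0x20000, &unmapped)
 * removes both and reports unmapped == 0x20000, while
 * iopt_unmap_iova(iopt, 0x8000, 0x8000, &unmapped) fails with -ENOENT because
 * it would have to truncate the first area.
 */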

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	/* If the IOVAs are empty then unmap all succeeds */
	return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			if (iopt_is_dmabuf(pages)) {
				if (!iopt_dmabuf_revoked(pages))
					iopt_area_unmap_domain(area, domain);
				iopt_dmabuf_untrack_domain(pages, area, domain);
			}
			mutex_unlock(&pages->mutex);

			if (!iopt_is_dmabuf(pages))
				iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		if (iopt_is_dmabuf(pages))
			iopt_dmabuf_untrack_domain(pages, area, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		guard(mutex)(&pages->mutex);
		if (iopt_is_dmabuf(pages)) {
			rc = iopt_dmabuf_track_domain(pages, area, domain);
			if (rc)
				goto out_unfill;
		}
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			if (iopt_is_dmabuf(pages))
				iopt_dmabuf_untrack_domain(pages, area, domain);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		if (iopt_is_dmabuf(pages))
			iopt_dmabuf_untrack_domain(pages, area, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
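
/*
 * Example of the alignment negotiation above (illustrative): a domain whose
 * pgsize_bitmap is SZ_4K | SZ_2M | SZ_1G has __ffs(pgsize_bitmap) = 12, so it
 * contributes an iova_alignment of 4096. Attaching it to an iopt that so far
 * only had accesses (alignment 1) raises iopt->iova_alignment to PAGE_SIZE,
 * provided every existing area already sits on 4KiB boundaries; otherwise
 * iopt_check_iova_alignment() fails the attach with -EADDRINUSE.
 */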

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	/* Maintaining the domains_itree below is a bit complicated */
	if (iopt_is_dmabuf(pages))
		return -EOPNOTSUPP;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages haven't been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}
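
/*
 * Sketch of the VFIO-compatibility flow these helpers serve (hypothetical
 * values): to punch a page-sized hole at 0x5000 inside a mapping covering
 * 0x0-0xFFFF, the compat layer cuts at both edges of the hole so the middle
 * becomes a standalone area, then unmaps just that piece:
 *
 *	unsigned long iovas[] = { 0x4FFF, 0x5FFF };
 *
 *	rc = iopt_cut_iova(iopt, iovas, ARRAY_SIZE(iovas));
 *	if (!rc)
 *		rc = iopt_unmap_iova(iopt, 0x5000, 0x1000, NULL);
 */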

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access, u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}