1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3 *
4 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs. The
5 * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6 * list for access by an in-kernel user.
7 *
8 * The data structure uses the iopt_pages to optimize the storage of the PFNs
9 * between the domains and the xarray.
10 */
11 #include <linux/dma-buf.h>
12 #include <linux/err.h>
13 #include <linux/errno.h>
14 #include <linux/file.h>
15 #include <linux/iommu.h>
16 #include <linux/iommufd.h>
17 #include <linux/lockdep.h>
18 #include <linux/sched/mm.h>
19 #include <linux/slab.h>
20 #include <uapi/linux/iommufd.h>
21
22 #include "double_span.h"
23 #include "io_pagetable.h"
24
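/*
 * Each iopt_pages_list element describes one slice of an iopt_pages that is to
 * become an iopt_area: @pages is the backing storage, @start_byte and @length
 * select the slice, @area is filled in while the mapping is being built, and
 * @next links the element into the caller's pages_list. See iopt_map_pages().
 */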
25 struct iopt_pages_list {
26 struct iopt_pages *pages;
27 struct iopt_area *area;
28 struct list_head next;
29 unsigned long start_byte;
30 unsigned long length;
31 };
32
33 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
34 struct io_pagetable *iopt,
35 unsigned long iova,
36 unsigned long last_iova)
37 {
38 lockdep_assert_held(&iopt->iova_rwsem);
39
40 iter->cur_iova = iova;
41 iter->last_iova = last_iova;
42 iter->area = iopt_area_iter_first(iopt, iova, iova);
43 if (!iter->area)
44 return NULL;
45 if (!iter->area->pages) {
46 iter->area = NULL;
47 return NULL;
48 }
49 return iter->area;
50 }
51
52 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
53 {
54 unsigned long last_iova;
55
56 if (!iter->area)
57 return NULL;
58 last_iova = iopt_area_last_iova(iter->area);
59 if (iter->last_iova <= last_iova)
60 return NULL;
61
62 iter->cur_iova = last_iova + 1;
63 iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
64 iter->last_iova);
65 if (!iter->area)
66 return NULL;
67 if (iter->cur_iova != iopt_area_iova(iter->area) ||
68 !iter->area->pages) {
69 iter->area = NULL;
70 return NULL;
71 }
72 return iter->area;
73 }
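/*
 * Illustrative only: the two helpers above are normally driven through
 * iopt_for_each_contig_area(), following the pattern used elsewhere in this
 * file (see iopt_get_pages()):
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *		// operate on [iter.cur_iova, last] within this area
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		return -ENOENT;	// the requested range had a hole
 */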
74
75 static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
76 unsigned long length,
77 unsigned long iova_alignment,
78 unsigned long page_offset)
79 {
80 unsigned long aligned_start;
81
82 /* ALIGN_UP() */
83 if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
84 return false;
85 aligned_start &= ~(iova_alignment - 1);
86 aligned_start |= page_offset;
87
88 if (aligned_start >= last || last - aligned_start < length - 1)
89 return false;
90 *start = aligned_start;
91 return true;
92 }
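/*
 * Worked example (illustrative): with *start = 0x12345, iova_alignment =
 * 0x10000 and page_offset = 0x800, the ALIGN_UP() above yields 0x20000 and
 * OR'ing in the page offset gives aligned_start = 0x20800. That value is
 * written back to *start only if [aligned_start, aligned_start + length - 1]
 * still fits before last.
 */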
93
94 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
95 unsigned long length,
96 unsigned long iova_alignment,
97 unsigned long page_offset)
98 {
99 if (span->is_used)
100 return false;
101 return __alloc_iova_check_range(&span->start_hole, span->last_hole,
102 length, iova_alignment, page_offset);
103 }
104
105 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
106 unsigned long length,
107 unsigned long iova_alignment,
108 unsigned long page_offset)
109 {
110 if (span->is_hole)
111 return false;
112 return __alloc_iova_check_range(&span->start_used, span->last_used,
113 length, iova_alignment, page_offset);
114 }
115
116 /*
117 * Automatically find a block of IOVA that is not being used and not reserved.
118 * Does not return a 0 IOVA even if it is valid.
119 */
120 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
121 unsigned long addr, unsigned long length)
122 {
123 unsigned long page_offset = addr % PAGE_SIZE;
124 struct interval_tree_double_span_iter used_span;
125 struct interval_tree_span_iter allowed_span;
126 unsigned long max_alignment = PAGE_SIZE;
127 unsigned long iova_alignment;
128
129 lockdep_assert_held(&iopt->iova_rwsem);
130
131 /* Protect roundup_pow_of_two() from overflow */
132 if (length == 0 || length >= ULONG_MAX / 2)
133 return -EOVERFLOW;
134
135 /*
136 * Keep alignment present in addr when building the IOVA, which
137 * increases the chance we can map a THP.
138 */
139 if (!addr)
140 iova_alignment = roundup_pow_of_two(length);
141 else
142 iova_alignment = min_t(unsigned long,
143 roundup_pow_of_two(length),
144 1UL << __ffs64(addr));
145
146 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
147 max_alignment = HPAGE_SIZE;
148 #endif
149 /* Protect against ALIGN() overflow */
150 if (iova_alignment >= max_alignment)
151 iova_alignment = max_alignment;
152
153 if (iova_alignment < iopt->iova_alignment)
154 return -EINVAL;
155
156 interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
157 PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
158 if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
159 allowed_span.start_used = PAGE_SIZE;
160 allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
161 allowed_span.is_hole = false;
162 }
163
164 if (!__alloc_iova_check_used(&allowed_span, length,
165 iova_alignment, page_offset))
166 continue;
167
168 interval_tree_for_each_double_span(
169 &used_span, &iopt->reserved_itree, &iopt->area_itree,
170 allowed_span.start_used, allowed_span.last_used) {
171 if (!__alloc_iova_check_hole(&used_span, length,
172 iova_alignment,
173 page_offset))
174 continue;
175
176 *iova = used_span.start_hole;
177 return 0;
178 }
179 }
180 return -ENOSPC;
181 }
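/*
 * Example of the alignment heuristic above (illustrative): mapping
 * length = 2MiB from a 2MiB aligned addr gives
 * min(roundup_pow_of_two(2MiB), 1UL << __ffs64(addr)) = 2MiB, capped at
 * HPAGE_SIZE, so the chosen IOVA shares the 2MiB alignment of the VA and the
 * domain has a chance to map a THP, as the comment above notes. page_offset
 * additionally preserves the sub-page offset of addr in the IOVA.
 */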
182
183 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
184 unsigned long length)
185 {
186 unsigned long last;
187
188 lockdep_assert_held(&iopt->iova_rwsem);
189
190 if ((iova & (iopt->iova_alignment - 1)))
191 return -EINVAL;
192
193 if (check_add_overflow(iova, length - 1, &last))
194 return -EOVERFLOW;
195
196 /* No reserved IOVA intersects the range */
197 if (iopt_reserved_iter_first(iopt, iova, last))
198 return -EINVAL;
199
200 /* Check that there is not already a mapping in the range */
201 if (iopt_area_iter_first(iopt, iova, last))
202 return -EEXIST;
203 return 0;
204 }
205
206 /*
207 * The area takes a slice of the pages from start_byte to start_byte + length
208 */
209 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
210 struct iopt_pages *pages, unsigned long iova,
211 unsigned long start_byte, unsigned long length,
212 int iommu_prot)
213 {
214 lockdep_assert_held_write(&iopt->iova_rwsem);
215
216 if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
217 return -EPERM;
218
219 area->iommu_prot = iommu_prot;
220 area->page_offset = start_byte % PAGE_SIZE;
221 if (area->page_offset & (iopt->iova_alignment - 1))
222 return -EINVAL;
223
224 area->node.start = iova;
225 if (check_add_overflow(iova, length - 1, &area->node.last))
226 return -EOVERFLOW;
227
228 area->pages_node.start = start_byte / PAGE_SIZE;
229 if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
230 return -EOVERFLOW;
231 area->pages_node.last = area->pages_node.last / PAGE_SIZE;
232 if (WARN_ON(area->pages_node.last >= pages->npages))
233 return -EOVERFLOW;
234
235 /*
236 * The area is inserted with a NULL pages indicating it is not fully
237 * initialized yet.
238 */
239 area->iopt = iopt;
240 interval_tree_insert(&area->node, &iopt->area_itree);
241 return 0;
242 }
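/*
 * Worked example (illustrative, assuming a 4KiB PAGE_SIZE and an
 * iova_alignment of 1): start_byte = 0x3200 and length = 0x2000 give
 * page_offset = 0x200, pages_node.start = 3 and pages_node.last =
 * (0x3200 + 0x2000 - 1) / PAGE_SIZE = 5, so the area spans pages 3..5 of the
 * iopt_pages.
 */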
243
244 static struct iopt_area *iopt_area_alloc(void)
245 {
246 struct iopt_area *area;
247
248 area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
249 if (!area)
250 return NULL;
251 RB_CLEAR_NODE(&area->node.rb);
252 RB_CLEAR_NODE(&area->pages_node.rb);
253 return area;
254 }
255
256 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
257 struct list_head *pages_list,
258 unsigned long length, unsigned long *dst_iova,
259 int iommu_prot, unsigned int flags)
260 {
261 struct iopt_pages_list *elm;
262 unsigned long start;
263 unsigned long iova;
264 int rc = 0;
265
266 list_for_each_entry(elm, pages_list, next) {
267 elm->area = iopt_area_alloc();
268 if (!elm->area)
269 return -ENOMEM;
270 }
271
272 down_write(&iopt->iova_rwsem);
273 if ((length & (iopt->iova_alignment - 1)) || !length) {
274 rc = -EINVAL;
275 goto out_unlock;
276 }
277
278 if (flags & IOPT_ALLOC_IOVA) {
279 /* Use the first entry to guess the ideal IOVA alignment */
280 elm = list_first_entry(pages_list, struct iopt_pages_list,
281 next);
282 switch (elm->pages->type) {
283 case IOPT_ADDRESS_USER:
284 start = elm->start_byte + (uintptr_t)elm->pages->uptr;
285 break;
286 case IOPT_ADDRESS_FILE:
287 start = elm->start_byte + elm->pages->start;
288 break;
289 case IOPT_ADDRESS_DMABUF:
290 start = elm->start_byte + elm->pages->dmabuf.start;
291 break;
292 }
293 rc = iopt_alloc_iova(iopt, dst_iova, start, length);
294 if (rc)
295 goto out_unlock;
296 if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
297 WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
298 rc = -EINVAL;
299 goto out_unlock;
300 }
301 } else {
302 rc = iopt_check_iova(iopt, *dst_iova, length);
303 if (rc)
304 goto out_unlock;
305 }
306
307 /*
308 * Areas are created with a NULL pages so that the IOVA space is
309 * reserved and we can unlock the iova_rwsem.
310 */
311 iova = *dst_iova;
312 list_for_each_entry(elm, pages_list, next) {
313 rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
314 elm->start_byte, elm->length, iommu_prot);
315 if (rc)
316 goto out_unlock;
317 iova += elm->length;
318 }
319
320 out_unlock:
321 up_write(&iopt->iova_rwsem);
322 return rc;
323 }
324
325 static void iopt_abort_area(struct iopt_area *area)
326 {
327 if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
328 WARN_ON(area->pages);
329 if (area->iopt) {
330 down_write(&area->iopt->iova_rwsem);
331 interval_tree_remove(&area->node, &area->iopt->area_itree);
332 up_write(&area->iopt->iova_rwsem);
333 }
334 kfree(area);
335 }
336
337 void iopt_free_pages_list(struct list_head *pages_list)
338 {
339 struct iopt_pages_list *elm;
340
341 while ((elm = list_first_entry_or_null(pages_list,
342 struct iopt_pages_list, next))) {
343 if (elm->area)
344 iopt_abort_area(elm->area);
345 if (elm->pages)
346 iopt_put_pages(elm->pages);
347 list_del(&elm->next);
348 kfree(elm);
349 }
350 }
351
352 static int iopt_fill_domains_pages(struct list_head *pages_list)
353 {
354 struct iopt_pages_list *undo_elm;
355 struct iopt_pages_list *elm;
356 int rc;
357
358 list_for_each_entry(elm, pages_list, next) {
359 rc = iopt_area_fill_domains(elm->area, elm->pages);
360 if (rc)
361 goto err_undo;
362 }
363 return 0;
364
365 err_undo:
366 list_for_each_entry(undo_elm, pages_list, next) {
367 if (undo_elm == elm)
368 break;
369 iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
370 }
371 return rc;
372 }
373
374 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
375 unsigned long length, unsigned long *dst_iova,
376 int iommu_prot, unsigned int flags)
377 {
378 struct iopt_pages_list *elm;
379 int rc;
380
381 rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
382 iommu_prot, flags);
383 if (rc)
384 return rc;
385
386 down_read(&iopt->domains_rwsem);
387 rc = iopt_fill_domains_pages(pages_list);
388 if (rc)
389 goto out_unlock_domains;
390
391 down_write(&iopt->iova_rwsem);
392 list_for_each_entry(elm, pages_list, next) {
393 /*
394 * area->pages must be set inside the domains_rwsem to ensure
395 * any newly added domains will get filled. Moves the reference
396 * in from the list.
397 */
398 elm->area->pages = elm->pages;
399 elm->pages = NULL;
400 elm->area = NULL;
401 }
402 up_write(&iopt->iova_rwsem);
403 out_unlock_domains:
404 up_read(&iopt->domains_rwsem);
405 return rc;
406 }
407
408 static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
409 struct iopt_pages *pages, unsigned long *iova,
410 unsigned long length, unsigned long start_byte,
411 int iommu_prot, unsigned int flags)
412 {
413 struct iopt_pages_list elm = {};
414 LIST_HEAD(pages_list);
415 int rc;
416
417 elm.pages = pages;
418 elm.start_byte = start_byte;
419 if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
420 elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
421 elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
422 elm.length = length;
423 list_add(&elm.next, &pages_list);
424
425 rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
426 if (rc) {
427 if (elm.area)
428 iopt_abort_area(elm.area);
429 if (elm.pages)
430 iopt_put_pages(elm.pages);
431 return rc;
432 }
433 return 0;
434 }
435
436 /**
437 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
438 * @ictx: iommufd_ctx the iopt is part of
439 * @iopt: io_pagetable to act on
440 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
441 * the chosen iova on output. Otherwise is the iova to map to on input
442 * @uptr: User VA to map
443 * @length: Number of bytes to map
444 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
445 * @flags: IOPT_ALLOC_IOVA or zero
446 *
447 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
448 * page tables this will pin the pages and load them into the domain at iova.
449 * For non-domain page tables this will only set up a lazy reference and the
450 * caller must use iopt_access_pages() to touch them.
451 *
452 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
453 * destroyed.
454 */
455 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
456 unsigned long *iova, void __user *uptr,
457 unsigned long length, int iommu_prot,
458 unsigned int flags)
459 {
460 struct iopt_pages *pages;
461
462 pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
463 if (IS_ERR(pages))
464 return PTR_ERR(pages);
465
466 return iopt_map_common(ictx, iopt, pages, iova, length,
467 uptr - pages->uptr, iommu_prot, flags);
468 }
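/*
 * Illustrative use only (hypothetical caller, not part of this file):
 *
 *	unsigned long iova;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, SZ_2M,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (!rc) {
 *		// ... use the mapping at iova ...
 *		iopt_unmap_iova(iopt, iova, SZ_2M, NULL);
 *	}
 */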
469
470 /**
471 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
472 * @ictx: iommufd_ctx the iopt is part of
473 * @iopt: io_pagetable to act on
474 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
475 * the chosen iova on output. Otherwise is the iova to map to on input
476 * @fd: fdno of a file to map
477 * @start: map file starting at this byte offset
478 * @length: Number of bytes to map
479 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
480 * @flags: IOPT_ALLOC_IOVA or zero
481 */
482 int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
483 unsigned long *iova, int fd, unsigned long start,
484 unsigned long length, int iommu_prot,
485 unsigned int flags)
486 {
487 struct iopt_pages *pages;
488 struct dma_buf *dmabuf;
489 unsigned long start_byte;
490 unsigned long last;
491
492 if (!length)
493 return -EINVAL;
494 if (check_add_overflow(start, length - 1, &last))
495 return -EOVERFLOW;
496
497 start_byte = start - ALIGN_DOWN(start, PAGE_SIZE);
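/*
 * Note (illustrative): start_byte above is the sub-page offset of start,
 * e.g. start = 0x12345 yields start_byte = 0x345 with a 4KiB PAGE_SIZE.
 */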
498 if (IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
499 dmabuf = dma_buf_get(fd);
500 else
501 dmabuf = ERR_PTR(-ENXIO);
502
503 if (!IS_ERR(dmabuf)) {
504 pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start,
505 length,
506 iommu_prot & IOMMU_WRITE);
507 if (IS_ERR(pages)) {
508 dma_buf_put(dmabuf);
509 return PTR_ERR(pages);
510 }
511 } else {
512 struct file *file;
513
514 file = fget(fd);
515 if (!file)
516 return -EBADF;
517
518 pages = iopt_alloc_file_pages(file, start_byte, start, length,
519 iommu_prot & IOMMU_WRITE);
520 fput(file);
521 if (IS_ERR(pages))
522 return PTR_ERR(pages);
523 }
524
525 return iopt_map_common(ictx, iopt, pages, iova, length,
526 start_byte, iommu_prot, flags);
527 }
528
529 struct iova_bitmap_fn_arg {
530 unsigned long flags;
531 struct io_pagetable *iopt;
532 struct iommu_domain *domain;
533 struct iommu_dirty_bitmap *dirty;
534 };
535
536 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
537 unsigned long iova, size_t length,
538 void *opaque)
539 {
540 struct iopt_area *area;
541 struct iopt_area_contig_iter iter;
542 struct iova_bitmap_fn_arg *arg = opaque;
543 struct iommu_domain *domain = arg->domain;
544 struct iommu_dirty_bitmap *dirty = arg->dirty;
545 const struct iommu_dirty_ops *ops = domain->dirty_ops;
546 unsigned long last_iova = iova + length - 1;
547 unsigned long flags = arg->flags;
548 int ret;
549
550 iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
551 unsigned long last = min(last_iova, iopt_area_last_iova(area));
552
553 ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
554 last - iter.cur_iova + 1, flags,
555 dirty);
556 if (ret)
557 return ret;
558 }
559
560 if (!iopt_area_contig_done(&iter))
561 return -EINVAL;
562 return 0;
563 }
564
565 static int
566 iommu_read_and_clear_dirty(struct iommu_domain *domain,
567 struct io_pagetable *iopt, unsigned long flags,
568 struct iommu_hwpt_get_dirty_bitmap *bitmap)
569 {
570 const struct iommu_dirty_ops *ops = domain->dirty_ops;
571 struct iommu_iotlb_gather gather;
572 struct iommu_dirty_bitmap dirty;
573 struct iova_bitmap_fn_arg arg;
574 struct iova_bitmap *iter;
575 int ret = 0;
576
577 if (!ops || !ops->read_and_clear_dirty)
578 return -EOPNOTSUPP;
579
580 iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
581 bitmap->page_size,
582 u64_to_user_ptr(bitmap->data));
583 if (IS_ERR(iter))
584 return -ENOMEM;
585
586 iommu_dirty_bitmap_init(&dirty, iter, &gather);
587
588 arg.flags = flags;
589 arg.iopt = iopt;
590 arg.domain = domain;
591 arg.dirty = &dirty;
592 iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
593
594 if (!(flags & IOMMU_DIRTY_NO_CLEAR))
595 iommu_iotlb_sync(domain, &gather);
596
597 iova_bitmap_free(iter);
598
599 return ret;
600 }
601
602 int iommufd_check_iova_range(struct io_pagetable *iopt,
603 struct iommu_hwpt_get_dirty_bitmap *bitmap)
604 {
605 size_t iommu_pgsize = iopt->iova_alignment;
606 u64 last_iova;
607
608 if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
609 return -EOVERFLOW;
610
611 if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
612 return -EOVERFLOW;
613
614 if ((bitmap->iova & (iommu_pgsize - 1)) ||
615 ((last_iova + 1) & (iommu_pgsize - 1)))
616 return -EINVAL;
617
618 if (!bitmap->page_size)
619 return -EINVAL;
620
621 if ((bitmap->iova & (bitmap->page_size - 1)) ||
622 ((last_iova + 1) & (bitmap->page_size - 1)))
623 return -EINVAL;
624
625 return 0;
626 }
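/*
 * Example of the checks above (illustrative): with iova_alignment = 4KiB and
 * bitmap->page_size = 4KiB, a request for iova = 0x1000, length = 0x3000
 * passes, while length = 0x2800 fails because last_iova + 1 = 0x3800 is not
 * page aligned.
 */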
627
628 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
629 struct iommu_domain *domain,
630 unsigned long flags,
631 struct iommu_hwpt_get_dirty_bitmap *bitmap)
632 {
633 int ret;
634
635 ret = iommufd_check_iova_range(iopt, bitmap);
636 if (ret)
637 return ret;
638
639 down_read(&iopt->iova_rwsem);
640 ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
641 up_read(&iopt->iova_rwsem);
642
643 return ret;
644 }
645
646 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
647 struct iommu_domain *domain)
648 {
649 const struct iommu_dirty_ops *ops = domain->dirty_ops;
650 struct iommu_iotlb_gather gather;
651 struct iommu_dirty_bitmap dirty;
652 struct iopt_area *area;
653 int ret = 0;
654
655 lockdep_assert_held_read(&iopt->iova_rwsem);
656
657 iommu_dirty_bitmap_init(&dirty, NULL, &gather);
658
659 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
660 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
661 if (!area->pages)
662 continue;
663
664 ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
665 iopt_area_length(area), 0,
666 &dirty);
667 if (ret)
668 break;
669 }
670
671 iommu_iotlb_sync(domain, &gather);
672 return ret;
673 }
674
675 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
676 struct iommu_domain *domain, bool enable)
677 {
678 const struct iommu_dirty_ops *ops = domain->dirty_ops;
679 int ret = 0;
680
681 if (!ops)
682 return -EOPNOTSUPP;
683
684 down_read(&iopt->iova_rwsem);
685
686 /* Clear dirty bits from PTEs to ensure a clean snapshot */
687 if (enable) {
688 ret = iopt_clear_dirty_data(iopt, domain);
689 if (ret)
690 goto out_unlock;
691 }
692
693 ret = ops->set_dirty_tracking(domain, enable);
694
695 out_unlock:
696 up_read(&iopt->iova_rwsem);
697 return ret;
698 }
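/*
 * Illustrative flow only (hypothetical caller, field values are examples):
 * enable tracking, which first scrubs the existing dirty bits, then
 * periodically harvest the per-IOVA dirty bitmap:
 *
 *	struct iommu_hwpt_get_dirty_bitmap bm = {
 *		.iova = iova,
 *		.length = SZ_1G,
 *		.page_size = SZ_4K,
 *		.data = user_bitmap,	// __u64 encoding a user pointer
 *	};
 *
 *	iopt_set_dirty_tracking(iopt, domain, true);
 *	...
 *	iopt_read_and_clear_dirty_data(iopt, domain, 0, &bm);
 */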
699
700 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
701 unsigned long length, struct list_head *pages_list)
702 {
703 struct iopt_area_contig_iter iter;
704 unsigned long last_iova;
705 struct iopt_area *area;
706 int rc;
707
708 if (!length)
709 return -EINVAL;
710 if (check_add_overflow(iova, length - 1, &last_iova))
711 return -EOVERFLOW;
712
713 down_read(&iopt->iova_rwsem);
714 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
715 struct iopt_pages_list *elm;
716 unsigned long last = min(last_iova, iopt_area_last_iova(area));
717
718 elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
719 if (!elm) {
720 rc = -ENOMEM;
721 goto err_free;
722 }
723 elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
724 elm->pages = area->pages;
725 elm->length = (last - iter.cur_iova) + 1;
726 kref_get(&elm->pages->kref);
727 list_add_tail(&elm->next, pages_list);
728 }
729 if (!iopt_area_contig_done(&iter)) {
730 rc = -ENOENT;
731 goto err_free;
732 }
733 up_read(&iopt->iova_rwsem);
734 return 0;
735 err_free:
736 up_read(&iopt->iova_rwsem);
737 iopt_free_pages_list(pages_list);
738 return rc;
739 }
740
741 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
742 unsigned long last, unsigned long *unmapped)
743 {
744 struct iopt_area *area;
745 unsigned long unmapped_bytes = 0;
746 unsigned int tries = 0;
747 /* If there are no mapped entries then success */
748 int rc = 0;
749
750 /*
751 * The domains_rwsem must be held in read mode any time any area->pages
752 * is NULL. This prevents domain attach/detach from running
753 * concurrently with cleaning up the area.
754 */
755 again:
756 down_read(&iopt->domains_rwsem);
757 down_write(&iopt->iova_rwsem);
758 while ((area = iopt_area_iter_first(iopt, start, last))) {
759 unsigned long area_last = iopt_area_last_iova(area);
760 unsigned long area_first = iopt_area_iova(area);
761 struct iopt_pages *pages;
762
763 /* Userspace should not race map/unmap's of the same area */
764 if (!area->pages) {
765 rc = -EBUSY;
766 goto out_unlock_iova;
767 }
768
769 /* The area is locked by an object that has not been destroyed */
770 if (area->num_locks) {
771 rc = -EBUSY;
772 goto out_unlock_iova;
773 }
774
775 if (area_first < start || area_last > last) {
776 rc = -ENOENT;
777 goto out_unlock_iova;
778 }
779
780 if (area_first != start)
781 tries = 0;
782
783 /*
784 * num_accesses writers must hold the iova_rwsem too, so we can
785 * safely read it under the write side of the iova_rwsem
786 * without the pages->mutex.
787 */
788 if (area->num_accesses) {
789 size_t length = iopt_area_length(area);
790
791 start = area_first;
792 area->prevent_access = true;
793 up_write(&iopt->iova_rwsem);
794 up_read(&iopt->domains_rwsem);
795
796 iommufd_access_notify_unmap(iopt, area_first, length);
797 /* Something is not responding to unmap requests. */
798 tries++;
799 if (WARN_ON(tries > 100)) {
800 rc = -EDEADLOCK;
801 goto out_unmapped;
802 }
803 goto again;
804 }
805
806 pages = area->pages;
807 area->pages = NULL;
808 up_write(&iopt->iova_rwsem);
809
810 iopt_area_unfill_domains(area, pages);
811 iopt_abort_area(area);
812 iopt_put_pages(pages);
813
814 unmapped_bytes += area_last - area_first + 1;
815
816 down_write(&iopt->iova_rwsem);
817 }
818
819 out_unlock_iova:
820 up_write(&iopt->iova_rwsem);
821 up_read(&iopt->domains_rwsem);
822 out_unmapped:
823 if (unmapped)
824 *unmapped = unmapped_bytes;
825 return rc;
826 }
827
828 /**
829 * iopt_unmap_iova() - Remove a range of iova
830 * @iopt: io_pagetable to act on
831 * @iova: Starting iova to unmap
832 * @length: Number of bytes to unmap
833 * @unmapped: Return number of bytes unmapped
834 *
835 * The requested range must be a superset of existing ranges.
836 * Splitting/truncating IOVA mappings is not allowed.
837 */
838 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
839 unsigned long length, unsigned long *unmapped)
840 {
841 unsigned long iova_last;
842
843 if (!length)
844 return -EINVAL;
845
846 if (check_add_overflow(iova, length - 1, &iova_last))
847 return -EOVERFLOW;
848
849 return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
850 }
851
852 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
853 {
854 /* If the IOVAs are empty then unmap all succeeds */
855 return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
856 }
857
858 /* The caller must always free all the nodes in the allowed_iova rb_root. */
859 int iopt_set_allow_iova(struct io_pagetable *iopt,
860 struct rb_root_cached *allowed_iova)
861 {
862 struct iopt_allowed *allowed;
863
864 down_write(&iopt->iova_rwsem);
865 swap(*allowed_iova, iopt->allowed_itree);
866
867 for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
868 allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
869 if (iopt_reserved_iter_first(iopt, allowed->node.start,
870 allowed->node.last)) {
871 swap(*allowed_iova, iopt->allowed_itree);
872 up_write(&iopt->iova_rwsem);
873 return -EADDRINUSE;
874 }
875 }
876 up_write(&iopt->iova_rwsem);
877 return 0;
878 }
879
880 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
881 unsigned long last, void *owner)
882 {
883 struct iopt_reserved *reserved;
884
885 lockdep_assert_held_write(&iopt->iova_rwsem);
886
887 if (iopt_area_iter_first(iopt, start, last) ||
888 iopt_allowed_iter_first(iopt, start, last))
889 return -EADDRINUSE;
890
891 reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
892 if (!reserved)
893 return -ENOMEM;
894 reserved->node.start = start;
895 reserved->node.last = last;
896 reserved->owner = owner;
897 interval_tree_insert(&reserved->node, &iopt->reserved_itree);
898 return 0;
899 }
900
901 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
902 {
903 struct iopt_reserved *reserved, *next;
904
905 lockdep_assert_held_write(&iopt->iova_rwsem);
906
907 for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
908 reserved = next) {
909 next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
910
911 if (reserved->owner == owner) {
912 interval_tree_remove(&reserved->node,
913 &iopt->reserved_itree);
914 kfree(reserved);
915 }
916 }
917 }
918
919 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
920 {
921 down_write(&iopt->iova_rwsem);
922 __iopt_remove_reserved_iova(iopt, owner);
923 up_write(&iopt->iova_rwsem);
924 }
925
926 void iopt_init_table(struct io_pagetable *iopt)
927 {
928 init_rwsem(&iopt->iova_rwsem);
929 init_rwsem(&iopt->domains_rwsem);
930 iopt->area_itree = RB_ROOT_CACHED;
931 iopt->allowed_itree = RB_ROOT_CACHED;
932 iopt->reserved_itree = RB_ROOT_CACHED;
933 xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
934 xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
935
936 /*
937 * iopts start as SW tables that can use the entire size_t IOVA space
938 * due to the use of size_t in the APIs. They have no alignment
939 * restriction.
940 */
941 iopt->iova_alignment = 1;
942 }
943
944 void iopt_destroy_table(struct io_pagetable *iopt)
945 {
946 struct interval_tree_node *node;
947
948 if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
949 iopt_remove_reserved_iova(iopt, NULL);
950
951 while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
952 ULONG_MAX))) {
953 interval_tree_remove(node, &iopt->allowed_itree);
954 kfree(container_of(node, struct iopt_allowed, node));
955 }
956
957 WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
958 WARN_ON(!xa_empty(&iopt->domains));
959 WARN_ON(!xa_empty(&iopt->access_list));
960 WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
961 }
962
963 /**
964 * iopt_unfill_domain() - Unfill a domain with PFNs
965 * @iopt: io_pagetable to act on
966 * @domain: domain to unfill
967 *
968 * This is used when removing a domain from the iopt. Every area in the iopt
969 * will be unmapped from the domain. The domain must already be removed from the
970 * domains xarray.
971 */
972 static void iopt_unfill_domain(struct io_pagetable *iopt,
973 struct iommu_domain *domain)
974 {
975 struct iopt_area *area;
976
977 lockdep_assert_held(&iopt->iova_rwsem);
978 lockdep_assert_held_write(&iopt->domains_rwsem);
979
980 /*
981 * Some other domain still holds all the pfns, so rapidly unmap this
982 * domain.
983 */
984 if (iopt->next_domain_id != 0) {
985 /* Pick an arbitrary remaining domain to act as storage */
986 struct iommu_domain *storage_domain =
987 xa_load(&iopt->domains, 0);
988
989 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
990 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
991 struct iopt_pages *pages = area->pages;
992
993 if (!pages)
994 continue;
995
996 mutex_lock(&pages->mutex);
997 if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
998 WARN_ON(!area->storage_domain);
999 if (area->storage_domain == domain)
1000 area->storage_domain = storage_domain;
1001 if (iopt_is_dmabuf(pages)) {
1002 if (!iopt_dmabuf_revoked(pages))
1003 iopt_area_unmap_domain(area, domain);
1004 iopt_dmabuf_untrack_domain(pages, area, domain);
1005 }
1006 mutex_unlock(&pages->mutex);
1007
1008 if (!iopt_is_dmabuf(pages))
1009 iopt_area_unmap_domain(area, domain);
1010 }
1011 return;
1012 }
1013
1014 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1015 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1016 struct iopt_pages *pages = area->pages;
1017
1018 if (!pages)
1019 continue;
1020
1021 mutex_lock(&pages->mutex);
1022 interval_tree_remove(&area->pages_node, &pages->domains_itree);
1023 WARN_ON(area->storage_domain != domain);
1024 area->storage_domain = NULL;
1025 iopt_area_unfill_domain(area, pages, domain);
1026 if (iopt_is_dmabuf(pages))
1027 iopt_dmabuf_untrack_domain(pages, area, domain);
1028 mutex_unlock(&pages->mutex);
1029 }
1030 }
1031
1032 /**
1033 * iopt_fill_domain() - Fill a domain with PFNs
1034 * @iopt: io_pagetable to act on
1035 * @domain: domain to fill
1036 *
1037 * Fill the domain with PFNs from every area in the iopt. On failure the domain
1038 * is left unchanged.
1039 */
1040 static int iopt_fill_domain(struct io_pagetable *iopt,
1041 struct iommu_domain *domain)
1042 {
1043 struct iopt_area *end_area;
1044 struct iopt_area *area;
1045 int rc;
1046
1047 lockdep_assert_held(&iopt->iova_rwsem);
1048 lockdep_assert_held_write(&iopt->domains_rwsem);
1049
1050 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1051 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1052 struct iopt_pages *pages = area->pages;
1053
1054 if (!pages)
1055 continue;
1056
1057 guard(mutex)(&pages->mutex);
1058 if (iopt_is_dmabuf(pages)) {
1059 rc = iopt_dmabuf_track_domain(pages, area, domain);
1060 if (rc)
1061 goto out_unfill;
1062 }
1063 rc = iopt_area_fill_domain(area, domain);
1064 if (rc) {
1065 if (iopt_is_dmabuf(pages))
1066 iopt_dmabuf_untrack_domain(pages, area, domain);
1067 goto out_unfill;
1068 }
1069 if (!area->storage_domain) {
1070 WARN_ON(iopt->next_domain_id != 0);
1071 area->storage_domain = domain;
1072 interval_tree_insert(&area->pages_node,
1073 &pages->domains_itree);
1074 }
1075 }
1076 return 0;
1077
1078 out_unfill:
1079 end_area = area;
1080 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1081 area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1082 struct iopt_pages *pages = area->pages;
1083
1084 if (area == end_area)
1085 break;
1086 if (!pages)
1087 continue;
1088 mutex_lock(&pages->mutex);
1089 if (iopt->next_domain_id == 0) {
1090 interval_tree_remove(&area->pages_node,
1091 &pages->domains_itree);
1092 area->storage_domain = NULL;
1093 }
1094 iopt_area_unfill_domain(area, pages, domain);
1095 if (iopt_is_dmabuf(pages))
1096 iopt_dmabuf_untrack_domain(pages, area, domain);
1097 mutex_unlock(&pages->mutex);
1098 }
1099 return rc;
1100 }
1101
1102 /* Check that all existing areas conform to an increased page size */
1103 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1104 unsigned long new_iova_alignment)
1105 {
1106 unsigned long align_mask = new_iova_alignment - 1;
1107 struct iopt_area *area;
1108
1109 lockdep_assert_held(&iopt->iova_rwsem);
1110 lockdep_assert_held(&iopt->domains_rwsem);
1111
1112 for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1113 area = iopt_area_iter_next(area, 0, ULONG_MAX))
1114 if ((iopt_area_iova(area) & align_mask) ||
1115 (iopt_area_length(area) & align_mask) ||
1116 (area->page_offset & align_mask))
1117 return -EADDRINUSE;
1118
1119 if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1120 struct iommufd_access *access;
1121 unsigned long index;
1122
1123 xa_for_each(&iopt->access_list, index, access)
1124 if (WARN_ON(access->iova_alignment >
1125 new_iova_alignment))
1126 return -EADDRINUSE;
1127 }
1128 return 0;
1129 }
1130
1131 int iopt_table_add_domain(struct io_pagetable *iopt,
1132 struct iommu_domain *domain)
1133 {
1134 const struct iommu_domain_geometry *geometry = &domain->geometry;
1135 struct iommu_domain *iter_domain;
1136 unsigned int new_iova_alignment;
1137 unsigned long index;
1138 int rc;
1139
1140 down_write(&iopt->domains_rwsem);
1141 down_write(&iopt->iova_rwsem);
1142
1143 xa_for_each(&iopt->domains, index, iter_domain) {
1144 if (WARN_ON(iter_domain == domain)) {
1145 rc = -EEXIST;
1146 goto out_unlock;
1147 }
1148 }
1149
1150 /*
1151 * The io page size drives the iova_alignment. Internally the iopt_pages
1152 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1153 * objects into the iommu_domain.
1154 *
1155 * An iommu_domain must always be able to accept PAGE_SIZE to be
1156 * compatible as we can't guarantee higher contiguity.
1157 */
1158 new_iova_alignment = max_t(unsigned long,
1159 1UL << __ffs(domain->pgsize_bitmap),
1160 iopt->iova_alignment);
1161 if (new_iova_alignment > PAGE_SIZE) {
1162 rc = -EINVAL;
1163 goto out_unlock;
1164 }
1165 if (new_iova_alignment != iopt->iova_alignment) {
1166 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1167 if (rc)
1168 goto out_unlock;
1169 }
1170
1171 /* No area exists that is outside the allowed domain aperture */
1172 if (geometry->aperture_start != 0) {
1173 rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1174 domain);
1175 if (rc)
1176 goto out_reserved;
1177 }
1178 if (geometry->aperture_end != ULONG_MAX) {
1179 rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1180 ULONG_MAX, domain);
1181 if (rc)
1182 goto out_reserved;
1183 }
1184
1185 rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1186 if (rc)
1187 goto out_reserved;
1188
1189 rc = iopt_fill_domain(iopt, domain);
1190 if (rc)
1191 goto out_release;
1192
1193 iopt->iova_alignment = new_iova_alignment;
1194 xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1195 iopt->next_domain_id++;
1196 up_write(&iopt->iova_rwsem);
1197 up_write(&iopt->domains_rwsem);
1198 return 0;
1199 out_release:
1200 xa_release(&iopt->domains, iopt->next_domain_id);
1201 out_reserved:
1202 __iopt_remove_reserved_iova(iopt, domain);
1203 out_unlock:
1204 up_write(&iopt->iova_rwsem);
1205 up_write(&iopt->domains_rwsem);
1206 return rc;
1207 }
1208
1209 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1210 {
1211 unsigned long new_iova_alignment;
1212 struct iommufd_access *access;
1213 struct iommu_domain *domain;
1214 unsigned long index;
1215
1216 lockdep_assert_held_write(&iopt->iova_rwsem);
1217 lockdep_assert_held(&iopt->domains_rwsem);
1218
1219 /* See batch_iommu_map_small() */
1220 if (iopt->disable_large_pages)
1221 new_iova_alignment = PAGE_SIZE;
1222 else
1223 new_iova_alignment = 1;
1224
1225 xa_for_each(&iopt->domains, index, domain)
1226 new_iova_alignment = max_t(unsigned long,
1227 1UL << __ffs(domain->pgsize_bitmap),
1228 new_iova_alignment);
1229 xa_for_each(&iopt->access_list, index, access)
1230 new_iova_alignment = max_t(unsigned long,
1231 access->iova_alignment,
1232 new_iova_alignment);
1233
1234 if (new_iova_alignment > iopt->iova_alignment) {
1235 int rc;
1236
1237 rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1238 if (rc)
1239 return rc;
1240 }
1241 iopt->iova_alignment = new_iova_alignment;
1242 return 0;
1243 }
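/*
 * Example (illustrative): a domain whose pgsize_bitmap is SZ_4K | SZ_2M |
 * SZ_1G contributes 1UL << __ffs(pgsize_bitmap) = 4KiB, i.e. the smallest IO
 * page size it supports, and the final iova_alignment is the maximum of this
 * across all attached domains and accesses.
 */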
1244
1245 void iopt_table_remove_domain(struct io_pagetable *iopt,
1246 struct iommu_domain *domain)
1247 {
1248 struct iommu_domain *iter_domain = NULL;
1249 unsigned long index;
1250
1251 down_write(&iopt->domains_rwsem);
1252 down_write(&iopt->iova_rwsem);
1253
1254 xa_for_each(&iopt->domains, index, iter_domain)
1255 if (iter_domain == domain)
1256 break;
1257 if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1258 goto out_unlock;
1259
1260 /*
1261 * Compress the xarray to keep it linear by swapping the entry to erase
1262 * with the tail entry and shrinking the tail.
1263 */
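/*
 * For example (illustrative): with domains {0:A, 1:B, 2:C} and
 * next_domain_id = 3, erasing B at index 1 shrinks next_domain_id to 2,
 * removes C from index 2 and stores it at index 1, leaving the compact set
 * {0:A, 1:C}.
 */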
1264 iopt->next_domain_id--;
1265 iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1266 if (index != iopt->next_domain_id)
1267 xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1268
1269 iopt_unfill_domain(iopt, domain);
1270 __iopt_remove_reserved_iova(iopt, domain);
1271
1272 WARN_ON(iopt_calculate_iova_alignment(iopt));
1273 out_unlock:
1274 up_write(&iopt->iova_rwsem);
1275 up_write(&iopt->domains_rwsem);
1276 }
1277
1278 /**
1279 * iopt_area_split - Split an area into two parts at iova
1280 * @area: The area to split
1281 * @iova: Becomes the last of a new area
1282 *
1283 * This splits an area into two. It is part of the VFIO compatibility to allow
1284 * poking a hole in the mapping. The two areas continue to point at the same
1285 * iopt_pages, just with different starting bytes.
1286 */
1287 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1288 {
1289 unsigned long alignment = area->iopt->iova_alignment;
1290 unsigned long last_iova = iopt_area_last_iova(area);
1291 unsigned long start_iova = iopt_area_iova(area);
1292 unsigned long new_start = iova + 1;
1293 struct io_pagetable *iopt = area->iopt;
1294 struct iopt_pages *pages = area->pages;
1295 struct iopt_area *lhs;
1296 struct iopt_area *rhs;
1297 int rc;
1298
1299 lockdep_assert_held_write(&iopt->iova_rwsem);
1300
1301 if (iova == start_iova || iova == last_iova)
1302 return 0;
1303
1304 if (!pages || area->prevent_access)
1305 return -EBUSY;
1306
1307 /* Maintaining the domains_itree below is a bit complicated */
1308 if (iopt_is_dmabuf(pages))
1309 return -EOPNOTSUPP;
1310
1311 if (new_start & (alignment - 1) ||
1312 iopt_area_start_byte(area, new_start) & (alignment - 1))
1313 return -EINVAL;
1314
1315 lhs = iopt_area_alloc();
1316 if (!lhs)
1317 return -ENOMEM;
1318
1319 rhs = iopt_area_alloc();
1320 if (!rhs) {
1321 rc = -ENOMEM;
1322 goto err_free_lhs;
1323 }
1324
1325 mutex_lock(&pages->mutex);
1326 /*
1327 * Splitting is not permitted if an access exists; we don't track enough
1328 * information to split existing accesses.
1329 */
1330 if (area->num_accesses) {
1331 rc = -EINVAL;
1332 goto err_unlock;
1333 }
1334
1335 /*
1336 * Splitting is not permitted if a domain could have been mapped with
1337 * huge pages.
1338 */
1339 if (area->storage_domain && !iopt->disable_large_pages) {
1340 rc = -EINVAL;
1341 goto err_unlock;
1342 }
1343
1344 interval_tree_remove(&area->node, &iopt->area_itree);
1345 rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1346 iopt_area_start_byte(area, start_iova),
1347 (new_start - 1) - start_iova + 1,
1348 area->iommu_prot);
1349 if (WARN_ON(rc))
1350 goto err_insert;
1351
1352 rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1353 iopt_area_start_byte(area, new_start),
1354 last_iova - new_start + 1, area->iommu_prot);
1355 if (WARN_ON(rc))
1356 goto err_remove_lhs;
1357
1358 /*
1359 * If the original area has filled a domain, domains_itree has to be
1360 * updated.
1361 */
1362 if (area->storage_domain) {
1363 interval_tree_remove(&area->pages_node, &pages->domains_itree);
1364 interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1365 interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1366 }
1367
1368 lhs->storage_domain = area->storage_domain;
1369 lhs->pages = area->pages;
1370 rhs->storage_domain = area->storage_domain;
1371 rhs->pages = area->pages;
1372 kref_get(&rhs->pages->kref);
1373 kfree(area);
1374 mutex_unlock(&pages->mutex);
1375
1376 /*
1377 * No change to domains or accesses is needed because the underlying
1378 * pages have not changed
1379 */
1380 return 0;
1381
1382 err_remove_lhs:
1383 interval_tree_remove(&lhs->node, &iopt->area_itree);
1384 err_insert:
1385 interval_tree_insert(&area->node, &iopt->area_itree);
1386 err_unlock:
1387 mutex_unlock(&pages->mutex);
1388 kfree(rhs);
1389 err_free_lhs:
1390 kfree(lhs);
1391 return rc;
1392 }
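/*
 * Worked example (illustrative): splitting an area covering IOVA
 * [0x100000, 0x1fffff] at iova = 0x17ffff produces lhs [0x100000, 0x17ffff]
 * and rhs [0x180000, 0x1fffff]. Both new areas reference the same iopt_pages;
 * rhs simply starts 0x80000 bytes further into it.
 */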
1393
1394 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1395 size_t num_iovas)
1396 {
1397 int rc = 0;
1398 int i;
1399
1400 down_write(&iopt->iova_rwsem);
1401 for (i = 0; i < num_iovas; i++) {
1402 struct iopt_area *area;
1403
1404 area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1405 if (!area)
1406 continue;
1407 rc = iopt_area_split(area, iovas[i]);
1408 if (rc)
1409 break;
1410 }
1411 up_write(&iopt->iova_rwsem);
1412 return rc;
1413 }
1414
1415 void iopt_enable_large_pages(struct io_pagetable *iopt)
1416 {
1417 int rc;
1418
1419 down_write(&iopt->domains_rwsem);
1420 down_write(&iopt->iova_rwsem);
1421 WRITE_ONCE(iopt->disable_large_pages, false);
1422 rc = iopt_calculate_iova_alignment(iopt);
1423 WARN_ON(rc);
1424 up_write(&iopt->iova_rwsem);
1425 up_write(&iopt->domains_rwsem);
1426 }
1427
1428 int iopt_disable_large_pages(struct io_pagetable *iopt)
1429 {
1430 int rc = 0;
1431
1432 down_write(&iopt->domains_rwsem);
1433 down_write(&iopt->iova_rwsem);
1434 if (iopt->disable_large_pages)
1435 goto out_unlock;
1436
1437 /* Won't do it if domains already have pages mapped in them */
1438 if (!xa_empty(&iopt->domains) &&
1439 !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1440 rc = -EINVAL;
1441 goto out_unlock;
1442 }
1443
1444 WRITE_ONCE(iopt->disable_large_pages, true);
1445 rc = iopt_calculate_iova_alignment(iopt);
1446 if (rc)
1447 WRITE_ONCE(iopt->disable_large_pages, false);
1448 out_unlock:
1449 up_write(&iopt->iova_rwsem);
1450 up_write(&iopt->domains_rwsem);
1451 return rc;
1452 }
1453
1454 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1455 {
1456 u32 new_id;
1457 int rc;
1458
1459 down_write(&iopt->domains_rwsem);
1460 down_write(&iopt->iova_rwsem);
1461 rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1462 GFP_KERNEL_ACCOUNT);
1463
1464 if (rc)
1465 goto out_unlock;
1466
1467 rc = iopt_calculate_iova_alignment(iopt);
1468 if (rc) {
1469 xa_erase(&iopt->access_list, new_id);
1470 goto out_unlock;
1471 }
1472 access->iopt_access_list_id = new_id;
1473
1474 out_unlock:
1475 up_write(&iopt->iova_rwsem);
1476 up_write(&iopt->domains_rwsem);
1477 return rc;
1478 }
1479
1480 void iopt_remove_access(struct io_pagetable *iopt,
1481 struct iommufd_access *access, u32 iopt_access_list_id)
1482 {
1483 down_write(&iopt->domains_rwsem);
1484 down_write(&iopt->iova_rwsem);
1485 WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1486 WARN_ON(iopt_calculate_iova_alignment(iopt));
1487 up_write(&iopt->iova_rwsem);
1488 up_write(&iopt->domains_rwsem);
1489 }
1490
1491 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1492 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1493 struct device *dev,
1494 phys_addr_t *sw_msi_start)
1495 {
1496 struct iommu_resv_region *resv;
1497 LIST_HEAD(resv_regions);
1498 unsigned int num_hw_msi = 0;
1499 unsigned int num_sw_msi = 0;
1500 int rc;
1501
1502 if (iommufd_should_fail())
1503 return -EINVAL;
1504
1505 down_write(&iopt->iova_rwsem);
1506 /* FIXME: drivers allocate memory but there is no failure propagated */
1507 iommu_get_resv_regions(dev, &resv_regions);
1508
1509 list_for_each_entry(resv, &resv_regions, list) {
1510 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1511 continue;
1512
1513 if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1514 num_hw_msi++;
1515 if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1516 *sw_msi_start = resv->start;
1517 num_sw_msi++;
1518 }
1519
1520 rc = iopt_reserve_iova(iopt, resv->start,
1521 resv->length - 1 + resv->start, dev);
1522 if (rc)
1523 goto out_reserved;
1524 }
1525
1526 /* Drivers must offer sane combinations of regions */
1527 if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1528 rc = -EINVAL;
1529 goto out_reserved;
1530 }
1531
1532 rc = 0;
1533 goto out_free_resv;
1534
1535 out_reserved:
1536 __iopt_remove_reserved_iova(iopt, dev);
1537 out_free_resv:
1538 iommu_put_resv_regions(dev, &resv_regions);
1539 up_write(&iopt->iova_rwsem);
1540 return rc;
1541 }
1542