// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

/* One contiguous slice of an iopt_pages that is being turned into an area */
struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

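/*
 * These two helpers back the iopt_for_each_contig_area() iterator used later
 * in this file. A minimal usage sketch (illustrative only; the caller must
 * already hold iova_rwsem):
 *
 *	struct iopt_area_contig_iter iter;
 *	struct iopt_area *area;
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *		do_something(iter.cur_iova, last);
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		return -EINVAL;
 *
 * do_something() is a stand-in for whatever per-area work the caller performs;
 * the final check catches a hole or an unpopulated area inside the range.
 */
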
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}

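/*
 * The two checks above share the same arithmetic: the candidate start is
 * rounded up to iova_alignment and the sub-page offset of the source address
 * is OR'd back in so the IOVA and the backing address agree modulo PAGE_SIZE.
 * A small worked example (assumed values, illustration only): with
 * start_hole = 0x11000, iova_alignment = 0x10000 and page_offset = 0x800,
 * ALIGN() gives 0x20000 and the final candidate becomes 0x20800. The span is
 * rejected if this pushes the start past the end or leaves fewer than length
 * bytes.
 */
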
/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long addr, unsigned long length)
{
	unsigned long page_offset = addr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in addr when building the IOVA, which
	 * increases the chance we can map a THP.
	 */
	if (!addr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(addr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

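/*
 * To make the index math above concrete (assumed values, illustration only):
 * with PAGE_SIZE = 4096, start_byte = 0x1800 and length = 0x3000, the area
 * covers bytes 0x1800..0x47ff of the iopt_pages, so page_offset = 0x800,
 * pages_node.start = 1 and pages_node.last = 0x47ff / 0x1000 = 4. node.start
 * and node.last describe the same slice in IOVA space at the requested iova.
 */
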
static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long start;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		switch (elm->pages->type) {
		case IOPT_ADDRESS_USER:
			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
			break;
		case IOPT_ADDRESS_FILE:
			start = elm->start_byte + elm->pages->start;
			break;
		}
		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			   struct iopt_pages *pages, unsigned long *iova,
			   unsigned long length, unsigned long start_byte,
			   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = pages;
	elm.start_byte = start_byte;
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	return iopt_map_common(ictx, iopt, pages, iova, length,
			       uptr - pages->uptr, iommu_prot, flags);
}

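/*
 * Minimal calling sketch for the mapping entry points (illustrative only;
 * the ictx, iopt and uptr values are assumed to come from the caller):
 *
 *	unsigned long iova = 0;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, SZ_2M,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *
 * iova now holds the allocated IOVA; the mapping is undone later with
 * iopt_unmap_iova().
 */
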
/**
 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input
 * @file: file to map
 * @start: map file starting at this byte offset
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 */
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, struct file *file,
			unsigned long start, unsigned long length,
			int iommu_prot, unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_file_pages(file, start, length,
				      iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	return iopt_map_common(ictx, iopt, pages, iova, length,
			       start - pages->start, iommu_prot, flags);
}

struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};

static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}

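/*
 * Worked example of the checks above (assumed numbers, illustration only):
 * with iova_alignment = 0x1000, a request of iova = 0x10000, length = 0x4000
 * and page_size = 0x1000 passes because both 0x10000 and the end + 1
 * (0x14000) are multiples of the IOMMU page size and of page_size. The same
 * request with length = 0x3800 fails the (last_iova + 1) alignment test.
 */
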
int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}

static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

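/*
 * Sketch of the expected pairing for iopt_get_pages() (illustrative only):
 *
 *	LIST_HEAD(pages_list);
 *	int rc;
 *
 *	rc = iopt_get_pages(iopt, iova, length, &pages_list);
 *	if (rc)
 *		return rc;
 *
 * The caller uses the iopt_pages_list entries and then releases the
 * references it was given with iopt_free_pages_list(&pages_list).
 */
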
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmaps of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

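/*
 * Sketch of the map/unmap pairing (illustrative only; iova and length are
 * assumed to come from an earlier iopt_map_user_pages() call):
 *
 *	unsigned long unmapped = 0;
 *
 *	rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 *
 * Because splitting/truncating is not allowed, the range must cover whole
 * areas; iopt_unmap_all() removes every mapping instead.
 */
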
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

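/*
 * Sketch of the reserve/unreserve pairing (illustrative only; the owner
 * argument is just a cookie pointer identifying who made the reservation):
 *
 *	down_write(&iopt->iova_rwsem);
 *	rc = iopt_reserve_iova(iopt, start, last, owner);
 *	up_write(&iopt->iova_rwsem);
 *	...
 *	iopt_remove_reserved_iova(iopt, owner);
 *
 * Note that iopt_reserve_iova() expects the caller to already hold iova_rwsem
 * for write, while iopt_remove_reserved_iova() takes the lock itself.
 */
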
void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

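/*
 * Sketch of the expected pairing for attaching and detaching an iommu_domain
 * (illustrative only):
 *
 *	rc = iopt_table_add_domain(iopt, domain);
 *	if (rc)
 *		return rc;
 *	...
 *	iopt_table_remove_domain(iopt, domain);
 *
 * Both functions take domains_rwsem and iova_rwsem themselves, so the caller
 * must not already hold them.
 */
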
/**
 * iopt_area_split() - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the iopt_pages has not
	 * been changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

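/*
 * Sketch of how iopt_cut_iova() can be used to poke a hole for the VFIO
 * compatibility path (assumed numbers, illustration only): to remove the
 * middle page of a mapping covering 0x100000..0x102fff, cut just before and
 * just after the hole and then unmap it:
 *
 *	unsigned long cuts[] = { 0x100fff, 0x101fff };
 *
 *	rc = iopt_cut_iova(iopt, cuts, ARRAY_SIZE(cuts));
 *	if (!rc)
 *		rc = iopt_unmap_iova(iopt, 0x101000, 0x1000, NULL);
 */
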
void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

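/*
 * Sketch of the add/remove pairing for an in-kernel access (illustrative
 * only): iopt_add_access() stores the allocated id in
 * access->iopt_access_list_id, which the caller later hands back:
 *
 *	rc = iopt_add_access(iopt, access);
 *	if (rc)
 *		return rc;
 *	...
 *	iopt_remove_access(iopt, access, access->iopt_access_list_id);
 */
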
/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}