// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top-level data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
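
/*
 * A rough sketch of how the pieces used throughout this file fit together:
 *
 *   io_pagetable
 *     ->area_itree      interval tree of iopt_area keyed by IOVA
 *     ->allowed_itree   IOVA ranges allocations are permitted from
 *     ->reserved_itree  IOVA ranges that must never be allocated
 *     ->domains         xarray of attached iommu_domains
 *     ->access_list     xarray of in-kernel iommufd_access users
 *
 *   iopt_area            one contiguous IOVA slice of an iopt_pages
 *     ->pages            the backing iopt_pages (NULL while initializing)
 *     ->pages_node       links the area into pages->domains_itree
 *
 * Each iopt_area covers bytes [start_byte, start_byte + length) of its
 * iopt_pages, so several areas may share one iopt_pages object.
 */
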
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
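
/*
 * A minimal sketch of how these two helpers are consumed, via the
 * iopt_for_each_contig_area() macro (see the dirty tracking and
 * iopt_get_pages() callers below):
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *		// operate on [iter.cur_iova, last] within this area
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		return -EINVAL;	// the range had a hole or an unready area
 */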

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
				     unsigned long length,
				     unsigned long iova_alignment,
				     unsigned long page_offset)
{
	unsigned long aligned_start;

	/* ALIGN_UP() */
	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
		return false;
	aligned_start &= ~(iova_alignment - 1);
	aligned_start |= page_offset;

	if (aligned_start >= last || last - aligned_start < length - 1)
		return false;
	*start = aligned_start;
	return true;
}
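
/*
 * A small worked example of the round-up above, assuming a 4K alignment and
 * page_offset = 0x800: for *start = 0x10100, aligned_start becomes 0x11000
 * after the ALIGN_UP() step and 0x11800 after OR-ing in page_offset, so the
 * candidate range starts 0x800 into a 4K-aligned IOVA page, matching the
 * sub-page offset of the backing memory.
 */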

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used)
		return false;
	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
					length, iova_alignment, page_offset);
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole)
		return false;
	return __alloc_iova_check_range(&span->start_used, span->last_used,
					length, iova_alignment, page_offset);
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long addr, unsigned long length)
{
	unsigned long page_offset = addr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in addr when building the IOVA, which
	 * increases the chance we can map a THP.
	 */
	if (!addr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(addr));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}
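
/*
 * A worked example of the alignment selection above, assuming a 4K PAGE_SIZE
 * and a 2M HPAGE_SIZE: for addr = 0x7f1234561000 and length = 0x30000,
 * roundup_pow_of_two(length) is 0x40000 and 1UL << __ffs64(addr) is 0x1000,
 * so iova_alignment becomes 0x1000 and page_offset is 0. The chosen IOVA is
 * therefore 4K aligned and shares the low bits of addr, which is what gives a
 * later map a chance to reuse any THP contiguity in the VA.
 */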

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages pointer indicating it is not
	 * fully initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}
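
/*
 * The index math above, as a small worked example assuming a 4K PAGE_SIZE and
 * the default iova_alignment of 1: for start_byte = 0x1800 and
 * length = 0x3000, page_offset is 0x800, pages_node.start is 1 and
 * pages_node.last is (0x1800 + 0x3000 - 1) / 0x1000 = 4, i.e. the area covers
 * pages 1 through 4 of the iopt_pages even though it begins mid-page.
 */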

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long start;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		switch (elm->pages->type) {
		case IOPT_ADDRESS_USER:
			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
			break;
		case IOPT_ADDRESS_FILE:
			start = elm->start_byte + elm->pages->start;
			break;
		}
		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			   struct iopt_pages *pages, unsigned long *iova,
			   unsigned long length, unsigned long start_byte,
			   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = pages;
	elm.start_byte = start_byte;
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	return iopt_map_common(ictx, iopt, pages, iova, length,
			       uptr - pages->uptr, iommu_prot, flags);
}
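
/*
 * A minimal usage sketch for the above (error handling elided, the names of
 * the surrounding objects are illustrative only):
 *
 *	unsigned long iova, unmapped;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	...
 *	rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 */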

/**
 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise is the iova to map to on input
 * @file: file to map
 * @start: map file starting at this byte offset
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 */
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, struct file *file,
			unsigned long start, unsigned long length,
			int iommu_prot, unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_file_pages(file, start, length,
				      iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	return iopt_map_common(ictx, iopt, pages, iova, length,
			       start - pages->start, iommu_prot, flags);
}
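
/*
 * In both map helpers the start_byte handed to iopt_map_common() is the
 * offset of the requested range inside the freshly allocated iopt_pages:
 * uptr - pages->uptr for user memory, start - pages->start for a file.
 */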

struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};

static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}
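
/*
 * For example, with a 4K iova_alignment and a bitmap->page_size of 4K, a
 * request of iova = 0x1000 and length = 0x2000 passes the checks above, while
 * length = 0x1800 fails because last_iova + 1 (0x2800) is not 4K aligned.
 */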

int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}

static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	/* If there are no mapped entries then success */
	int rc = 0;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* The area is locked by an object that has not been destroyed */
		if (area->num_locks) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100)) {
				rc = -EDEADLOCK;
				goto out_unmapped;
			}
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
out_unmapped:
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}
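
/*
 * As an example of the superset rule: with areas at [0x0, 0xfff] and
 * [0x1000, 0x2fff], iopt_unmap_iova(iopt, 0x1000, 0x2000, ...) removes the
 * second area, but a length of 0x1000 returns -ENOENT because it would have
 * to truncate that area. The VFIO compatibility path uses iopt_cut_iova()
 * below to split an area first when a partial unmap is required.
 */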

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	/* If the IOVAs are empty then unmap all succeeds */
	return iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}
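
/*
 * A worked example of the alignment rule above, assuming a 4K PAGE_SIZE
 * kernel: a domain with pgsize_bitmap = 0x40201000 (4K | 2M | 1G) has
 * 1UL << __ffs(pgsize_bitmap) = 4K, so new_iova_alignment stays at PAGE_SIZE
 * and the domain is accepted. A domain that only supports 16K IO pages
 * (pgsize_bitmap = 0x4000) would need a 16K alignment and is rejected by the
 * new_iova_alignment > PAGE_SIZE check.
 */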

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages have not been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}
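
/*
 * A minimal sketch of the hole-punching flow these splits enable: to remove
 * [hole_iova, hole_iova + hole_len) from the middle of a larger mapping, a
 * caller can cut at both edges and then unmap only the middle piece:
 *
 *	unsigned long cuts[] = { hole_iova - 1, hole_iova + hole_len - 1 };
 *
 *	rc = iopt_cut_iova(iopt, cuts, ARRAY_SIZE(cuts));
 *	if (!rc)
 *		rc = iopt_unmap_iova(iopt, hole_iova, hole_len, &unmapped);
 *
 * (hole_iova, hole_len and unmapped are illustrative names, and the first cut
 * point assumes the hole does not start at IOVA 0.)
 */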

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access, u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}