/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * "Templated C code" for implementing the iommu operations for page tables.
 * This is compiled multiple times, once per page table format, to pick up
 * the per-format definitions.
 */
#ifndef __GENERIC_PT_IOMMU_PT_H
#define __GENERIC_PT_IOMMU_PT_H

#include "pt_iter.h"

#include <linux/export.h>
#include <linux/iommu.h>
#include "../iommu-pages.h"
#include <linux/cleanup.h>
#include <linux/dma-mapping.h>

enum {
        /*
         * Set with release semantics once a newly installed incoherent
         * table's cache flush has completed. See pt_iommu_new_table() and
         * __map_range().
         */
        SW_BIT_CACHE_FLUSH_DONE = 0,
};
23

static void flush_writes_range(const struct pt_state *pts,
                               unsigned int start_index,
                               unsigned int end_index)
{
        if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
                iommu_pages_flush_incoherent(
                        iommu_from_common(pts->range->common)->iommu_device,
                        pts->table, start_index * PT_ITEM_WORD_SIZE,
                        (end_index - start_index) * PT_ITEM_WORD_SIZE);
}

static void flush_writes_item(const struct pt_state *pts)
{
        if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
                iommu_pages_flush_incoherent(
                        iommu_from_common(pts->range->common)->iommu_device,
                        pts->table, pts->index * PT_ITEM_WORD_SIZE,
                        PT_ITEM_WORD_SIZE);
}

static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
                               struct pt_iommu *iommu_table, pt_vaddr_t iova,
                               pt_vaddr_t len,
                               struct iommu_pages_list *free_list)
{
        struct pt_common *common = common_from_iommu(iommu_table);

        if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
                iommu_pages_stop_incoherent_list(free_list,
                                                 iommu_table->iommu_device);

        /*
         * If running in DMA-FQ mode then the unmap will be followed by a full
         * IOTLB flush, so optimize by never flushing the IOTLB here.
         *
         * For NO_GAPS the user gets to pick whether flushing everything or
         * doing micro flushes is better for their workload by choosing DMA vs
         * DMA-FQ operation. Drivers should also see shadow_on_flush.
         */
        if (!iommu_iotlb_gather_queued(iotlb_gather)) {
                if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
                    iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
                        iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
                        /*
                         * Note that the sync frees the gather's free list, so
                         * we must not have any pages on that list that are
                         * covered by iova/len.
                         */
                }
                iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
        }

        iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
}

#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)
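
/*
 * As an illustration of the name templating (the format prefix here is
 * hypothetical): building this file with PTPFX defined to myfmt_ expands
 *
 *      DOMAIN_NS(iova_to_phys) -> pt_iommu_myfmt_iova_to_phys
 *
 * so each page table format exports its own copy of every operation.
 */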
79

static int make_range_ul(struct pt_common *common, struct pt_range *range,
                         unsigned long iova, unsigned long len)
{
        unsigned long last;

        if (unlikely(len == 0))
                return -EINVAL;

        if (check_add_overflow(iova, len - 1, &last))
                return -EOVERFLOW;

        *range = pt_make_range(common, iova, last);
        if (sizeof(iova) > sizeof(range->va)) {
                if (unlikely(range->va != iova || range->last_va != last))
                        return -EOVERFLOW;
        }
        return 0;
}
98

static __maybe_unused int make_range_u64(struct pt_common *common,
                                         struct pt_range *range, u64 iova,
                                         u64 len)
{
        if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
                return -EOVERFLOW;
        return make_range_ul(common, range, iova, len);
}

/*
 * Some APIs use unsigned long, while others use dma_addr_t as the type.
 * Dispatch to the correct validation based on the type.
 */
#define make_range_no_check(common, range, iova, len)                     \
        ({                                                                \
                int ret;                                                  \
                if (sizeof(iova) > sizeof(unsigned long) ||               \
                    sizeof(len) > sizeof(unsigned long))                  \
                        ret = make_range_u64(common, range, iova, len);   \
                else                                                      \
                        ret = make_range_ul(common, range, iova, len);    \
                ret;                                                      \
        })

#define make_range(common, range, iova, len)                              \
        ({                                                                \
                int ret = make_range_no_check(common, range, iova, len);  \
                if (!ret)                                                 \
                        ret = pt_check_range(range);                      \
                ret;                                                      \
        })
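
/*
 * A minimal usage sketch of the dispatch above: on a configuration where
 * dma_addr_t is wider than unsigned long, the expression below statically
 * selects make_range_u64(), which checks that the values fit in an
 * unsigned long before falling through to make_range_ul().
 *
 *      struct pt_range range;
 *      int ret = make_range(common, &range, iova, len);
 *      if (ret)
 *              return ret;
 */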
130

static inline unsigned int compute_best_pgsize(struct pt_state *pts,
                                               pt_oaddr_t oa)
{
        struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);

        if (!pt_can_have_leaf(pts))
                return 0;

        /*
         * The page size is limited by the domain's bitmap. This allows the
         * core code to reduce the supported page sizes by changing the bitmap.
         */
        return pt_compute_best_pgsize(pt_possible_sizes(pts) &
                                              iommu_table->domain.pgsize_bitmap,
                                      pts->range->va, pts->range->last_va, oa);
}
147

static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
                                             unsigned int level,
                                             struct pt_table_p *table,
                                             pt_level_fn_t descend_fn)
{
        struct pt_state pts = pt_init(range, level, table);
        pt_oaddr_t *res = arg;

        switch (pt_load_single_entry(&pts)) {
        case PT_ENTRY_EMPTY:
                return -ENOENT;
        case PT_ENTRY_TABLE:
                return pt_descend(&pts, arg, descend_fn);
        case PT_ENTRY_OA:
                *res = pt_entry_oa_exact(&pts);
                return 0;
        }
        return -ENOENT;
}
PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys);

/**
 * iova_to_phys() - Return the output address for the given IOVA
 * @domain: Table to query
 * @iova: IO virtual address to query
 *
 * Determine the output address from the given IOVA. @iova may have any
 * alignment, the returned physical address is adjusted with any sub-page
 * offset.
 *
 * Context: The caller must hold a read range lock that includes @iova.
 *
 * Return: 0 if there is no translation for the given iova.
 */
phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
                                    dma_addr_t iova)
{
        struct pt_iommu *iommu_table =
                container_of(domain, struct pt_iommu, domain);
        struct pt_range range;
        pt_oaddr_t res;
        int ret;

        ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
        if (ret)
                return ret;

        ret = pt_walk_range(&range, __iova_to_phys, &res);
        /* PHYS_ADDR_MAX would be a better error code */
        if (ret)
                return 0;
        return res;
}
EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");

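/*
 * A wiring sketch (hypothetical prefix myfmt): the exported function has the
 * iommu_domain_ops signature, so a driver can plug it in directly:
 *
 *      static const struct iommu_domain_ops myfmt_domain_ops = {
 *              .iova_to_phys = pt_iommu_myfmt_iova_to_phys,
 *              ...
 *      };
 */
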
struct pt_iommu_dirty_args {
        struct iommu_dirty_bitmap *dirty;
        unsigned int flags;
};
206

static void record_dirty(struct pt_state *pts,
                         struct pt_iommu_dirty_args *dirty,
                         unsigned int num_contig_lg2)
{
        pt_vaddr_t dirty_len;

        if (num_contig_lg2 != ilog2(1)) {
                unsigned int index = pts->index;
                unsigned int end_index = log2_set_mod_max_t(
                        unsigned int, pts->index, num_contig_lg2);

                /* Adjust for being contained inside a contiguous page */
                end_index = min(end_index, pts->end_index);
                dirty_len = (end_index - index) *
                            log2_to_int(pt_table_item_lg2sz(pts));
        } else {
                dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
        }

        if (dirty->dirty->bitmap)
                iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
                                dirty_len);

        if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
                /*
                 * No write log required because DMA incoherence and atomic
                 * dirty tracking bits can't work together
                 */
                pt_entry_make_write_clean(pts);
                iommu_iotlb_gather_add_range(dirty->dirty->gather,
                                             pts->range->va, dirty_len);
        }
}

static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
                                         unsigned int level,
                                         struct pt_table_p *table)
{
        struct pt_state pts = pt_init(range, level, table);
        struct pt_iommu_dirty_args *dirty = arg;
        int ret;

        for_each_pt_level_entry(&pts) {
                if (pts.type == PT_ENTRY_TABLE) {
                        ret = pt_descend(&pts, arg, __read_and_clear_dirty);
                        if (ret)
                                return ret;
                        continue;
                }
                if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
                        record_dirty(&pts, dirty,
                                     pt_entry_num_contig_lg2(&pts));
        }
        return 0;
}

/**
 * read_and_clear_dirty() - Manipulate the HW set write dirty state
 * @domain: Domain to manipulate
 * @iova: IO virtual address to start
 * @size: Length of the IOVA
 * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
 * @dirty: Place to store the dirty bits
 *
 * Iterate over all the entries in the mapped range and record their write
 * dirty status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is specified
 * then the entries will be left dirty, otherwise they are returned to being
 * not write dirty.
 *
 * Context: The caller must hold a read range lock that includes @iova.
 *
 * Returns: -ERRNO on failure, 0 on success.
 */
int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
                                    unsigned long iova, size_t size,
                                    unsigned long flags,
                                    struct iommu_dirty_bitmap *dirty)
{
        struct pt_iommu *iommu_table =
                container_of(domain, struct pt_iommu, domain);
        struct pt_iommu_dirty_args dirty_args = {
                .dirty = dirty,
                .flags = flags,
        };
        struct pt_range range;
        int ret;

#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
        return -EOPNOTSUPP;
#endif

        ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
        if (ret)
                return ret;

        ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
        PT_WARN_ON(ret);
        return ret;
}
EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");

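/*
 * A sketch of how a driver might expose this through the core dirty tracking
 * interface (hypothetical prefix myfmt; the remaining ops wiring is assumed):
 *
 *      static const struct iommu_dirty_ops myfmt_dirty_ops = {
 *              .read_and_clear_dirty = pt_iommu_myfmt_read_and_clear_dirty,
 *              ...
 *      };
 */
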
static inline int __set_dirty(struct pt_range *range, void *arg,
                              unsigned int level, struct pt_table_p *table)
{
        struct pt_state pts = pt_init(range, level, table);

        switch (pt_load_single_entry(&pts)) {
        case PT_ENTRY_EMPTY:
                return -ENOENT;
        case PT_ENTRY_TABLE:
                return pt_descend(&pts, arg, __set_dirty);
        case PT_ENTRY_OA:
                if (!pt_entry_make_write_dirty(&pts))
                        return -EAGAIN;
                return 0;
        }
        return -ENOENT;
}

static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
                                        dma_addr_t iova)
{
        struct pt_range range;
        int ret;

        ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
        if (ret)
                return ret;

        /*
         * Note: There is no locking here yet; if the test suite races this it
         * can crash. It should use RCU locking eventually.
         */
        return pt_walk_range(&range, __set_dirty, NULL);
}

struct pt_iommu_collect_args {
        struct iommu_pages_list free_list;
        /* Fail if any OAs are within the range */
        u8 check_mapped : 1;
};
348

static int __collect_tables(struct pt_range *range, void *arg,
                            unsigned int level, struct pt_table_p *table)
{
        struct pt_state pts = pt_init(range, level, table);
        struct pt_iommu_collect_args *collect = arg;
        int ret;

        if (!collect->check_mapped && !pt_can_have_table(&pts))
                return 0;

        for_each_pt_level_entry(&pts) {
                if (pts.type == PT_ENTRY_TABLE) {
                        iommu_pages_list_add(&collect->free_list,
                                             pts.table_lower);
                        ret = pt_descend(&pts, arg, __collect_tables);
                        if (ret)
                                return ret;
                        continue;
                }
                if (pts.type == PT_ENTRY_OA && collect->check_mapped)
                        return -EADDRINUSE;
        }
        return 0;
}

enum alloc_mode { ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH };

/* Allocate a table, the empty table will be ready to be installed. */
static inline struct pt_table_p *_table_alloc(struct pt_common *common,
                                              size_t lg2sz, gfp_t gfp,
                                              enum alloc_mode mode)
{
        struct pt_iommu *iommu_table = iommu_from_common(common);
        struct pt_table_p *table_mem;

        table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
                                              log2_to_int(lg2sz));
        if (!table_mem)
                return ERR_PTR(-ENOMEM);

        if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
            mode == ALLOC_NORMAL) {
                int ret = iommu_pages_start_incoherent(
                        table_mem, iommu_table->iommu_device);

                if (ret) {
                        iommu_free_pages(table_mem);
                        return ERR_PTR(ret);
                }
        }
        return table_mem;
}
399

static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
                                                 uintptr_t top_of_table,
                                                 gfp_t gfp,
                                                 enum alloc_mode mode)
{
        /*
         * The top table doesn't need the free list or the rest of the iommu
         * pages machinery, so it technically doesn't have to use iommu pages.
         * Use the API anyhow to keep things simple, as the top is usually not
         * smaller than PAGE_SIZE.
         */
        return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
                            gfp, mode);
}

/* Allocate an interior table */
static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
                                             gfp_t gfp, enum alloc_mode mode)
{
        struct pt_state child_pts =
                pt_init(parent_pts->range, parent_pts->level - 1, NULL);

        return _table_alloc(parent_pts->range->common,
                            pt_num_items_lg2(&child_pts) +
                                    ilog2(PT_ITEM_WORD_SIZE),
                            gfp, mode);
}

static inline int pt_iommu_new_table(struct pt_state *pts,
                                     struct pt_write_attrs *attrs)
{
        struct pt_table_p *table_mem;
        phys_addr_t phys;

        /* The given PA/VA/length can't be represented */
        if (PT_WARN_ON(!pt_can_have_table(pts)))
                return -ENXIO;

        table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
        if (IS_ERR(table_mem))
                return PTR_ERR(table_mem);

        phys = virt_to_phys(table_mem);
        if (!pt_install_table(pts, phys, attrs)) {
                iommu_pages_free_incoherent(
                        table_mem,
                        iommu_from_common(pts->range->common)->iommu_device);
                return -EAGAIN;
        }

        if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
                flush_writes_item(pts);
                pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
        }

        if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
                /*
                 * The underlying table can't store the physical table address.
                 * This can happen when kunit testing tables outside their
                 * normal environment where the CPU might be limited.
                 */
                pt_load_single_entry(pts);
                if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
                        pt_clear_entries(pts, ilog2(1));
                        iommu_pages_free_incoherent(
                                table_mem,
                                iommu_from_common(pts->range->common)
                                        ->iommu_device);
                        return -EINVAL;
                }
        }

        pts->table_lower = table_mem;
        return 0;
}

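/*
 * Publication handshake sketch for incoherent walkers (summarizing the logic
 * above and in __map_range()): the thread that installs a new table flushes
 * it and then sets SW_BIT_CACHE_FLUSH_DONE with release semantics; a
 * concurrent mapper that loads the entry but does not observe the bit
 * (acquire) must flush the item itself before relying on it.
 */
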
struct pt_iommu_map_args {
        struct iommu_iotlb_gather *iotlb_gather;
        struct pt_write_attrs attrs;
        pt_oaddr_t oa;
        unsigned int leaf_pgsize_lg2;
        unsigned int leaf_level;
        pt_vaddr_t num_leaves;
};

/*
 * This will recursively check any tables in the block to validate they are
 * empty and then free them through the gather.
 */
static int clear_contig(const struct pt_state *start_pts,
                        struct iommu_iotlb_gather *iotlb_gather,
                        unsigned int step, unsigned int pgsize_lg2)
{
        struct pt_iommu *iommu_table =
                iommu_from_common(start_pts->range->common);
        struct pt_range range = *start_pts->range;
        struct pt_state pts =
                pt_init(&range, start_pts->level, start_pts->table);
        struct pt_iommu_collect_args collect = { .check_mapped = true };
        int ret;

        pts.index = start_pts->index;
        pts.end_index = start_pts->index + step;
        for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
                if (pts.type == PT_ENTRY_TABLE) {
                        collect.free_list =
                                IOMMU_PAGES_LIST_INIT(collect.free_list);
                        ret = pt_walk_descend_all(&pts, __collect_tables,
                                                  &collect);
                        if (ret)
                                return ret;

                        /*
                         * The table item must be cleared before we can update
                         * the gather
                         */
                        pt_clear_entries(&pts, ilog2(1));
                        flush_writes_item(&pts);

                        iommu_pages_list_add(&collect.free_list,
                                             pt_table_ptr(&pts));
                        gather_range_pages(
                                iotlb_gather, iommu_table, range.va,
                                log2_to_int(pt_table_item_lg2sz(&pts)),
                                &collect.free_list);
                } else if (pts.type != PT_ENTRY_EMPTY) {
                        return -EADDRINUSE;
                }
        }
        return 0;
}

static int __map_range_leaf(struct pt_range *range, void *arg,
                            unsigned int level, struct pt_table_p *table)
{
        struct pt_iommu *iommu_table = iommu_from_common(range->common);
        struct pt_state pts = pt_init(range, level, table);
        struct pt_iommu_map_args *map = arg;
        unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
        unsigned int leaves_avail;
        unsigned int start_index;
        pt_oaddr_t oa = map->oa;
        pt_vaddr_t num_leaves;
        unsigned int orig_end;
        unsigned int step_lg2;
        pt_vaddr_t last_va;
        unsigned int step;
        bool need_contig;
        int ret = 0;

        PT_WARN_ON(map->leaf_level != level);
        PT_WARN_ON(!pt_can_have_leaf(&pts));

        step_lg2 = leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts);
        step = log2_to_int_t(unsigned int, step_lg2);
        need_contig = step_lg2 != 0;

        _pt_iter_first(&pts);
        start_index = pts.index;
        orig_end = pts.end_index;
        leaves_avail =
                log2_div_t(unsigned int, pts.end_index - pts.index, step_lg2);
        if (map->num_leaves <= leaves_avail) {
                /* Need to stop in the middle of the table to change sizes */
                pts.end_index = pts.index + log2_mul(map->num_leaves, step_lg2);
                num_leaves = 0;
        } else {
                num_leaves = map->num_leaves - leaves_avail;
        }

        PT_WARN_ON(
                log2_mod_t(unsigned int, pts.end_index - pts.index, step_lg2));
        do {
                pts.type = pt_load_entry_raw(&pts);
                if (pts.type != PT_ENTRY_EMPTY || need_contig) {
                        if (pts.index != start_index)
                                pt_index_to_va(&pts);
                        ret = clear_contig(&pts, map->iotlb_gather, step,
                                           leaf_pgsize_lg2);
                        if (ret)
                                break;
                }

                if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
                        pt_index_to_va(&pts);
                        PT_WARN_ON(compute_best_pgsize(&pts, oa) !=
                                   leaf_pgsize_lg2);
                }
                pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);

                oa += log2_to_int(leaf_pgsize_lg2);
                pts.index += step;
        } while (pts.index < pts.end_index);

        flush_writes_range(&pts, start_index, pts.index);

        map->oa = oa;
        map->num_leaves = num_leaves;
        if (ret || num_leaves)
                return ret;

        /* range->va is not valid if we reached the end of the table */
        pts.index -= step;
        pt_index_to_va(&pts);
        pts.index += step;
        last_va = range->va + log2_to_int(leaf_pgsize_lg2);

        if (last_va - 1 == range->last_va) {
                PT_WARN_ON(pts.index != orig_end);
                return 0;
        }

        /*
         * Reached a point where the page size changed, compute the new
         * parameters.
         */
        map->leaf_pgsize_lg2 = pt_compute_best_pgsize(
                iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa);
        map->leaf_level =
                pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2);
        map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap,
                                        last_va, range->last_va, oa,
                                        map->leaf_pgsize_lg2);

        /* Didn't finish this table level, caller will repeat it */
        if (pts.index != orig_end) {
                if (pts.index != start_index)
                        pt_index_to_va(&pts);
                return -EAGAIN;
        }
        return 0;
}

static int __map_range(struct pt_range *range, void *arg, unsigned int level,
                       struct pt_table_p *table)
{
        struct pt_state pts = pt_init(range, level, table);
        struct pt_iommu_map_args *map = arg;
        int ret;

        PT_WARN_ON(map->leaf_level == level);
        PT_WARN_ON(!pt_can_have_table(&pts));

        _pt_iter_first(&pts);

        /* Descend to a child table */
        do {
                pts.type = pt_load_entry_raw(&pts);

                if (pts.type != PT_ENTRY_TABLE) {
                        if (pts.type != PT_ENTRY_EMPTY)
                                return -EADDRINUSE;
                        ret = pt_iommu_new_table(&pts, &map->attrs);
                        /* -EAGAIN on a race will loop again */
                        if (ret)
                                return ret;
                } else {
                        pts.table_lower = pt_table_ptr(&pts);
                        /*
                         * Racing with a shared pt_iommu_new_table()? The other
                         * thread is still flushing the cache, so we have to
                         * also flush it to ensure that when our thread's map
                         * completes all the table items leading to our mapping
                         * are visible.
                         *
                         * This requires the pt_set_sw_bit_release() to be a
                         * release of the cache flush so that this can acquire
                         * visibility at the iommu.
                         */
                        if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
                            !pt_test_sw_bit_acquire(&pts,
                                                    SW_BIT_CACHE_FLUSH_DONE))
                                flush_writes_item(&pts);
                }

                /*
                 * The already present table can possibly be shared with
                 * another concurrent map.
                 */
                do {
                        if (map->leaf_level == level - 1)
                                ret = pt_descend(&pts, arg, __map_range_leaf);
                        else
                                ret = pt_descend(&pts, arg, __map_range);
                } while (ret == -EAGAIN);
                if (ret)
                        return ret;

                pts.index++;
                pt_index_to_va(&pts);
                if (pts.index >= pts.end_index)
                        break;

                /*
                 * This level is currently running __map_range(), which is not
                 * correct if the target level has been updated to this level.
                 * Have the caller invoke __map_range_leaf() instead.
                 */
                if (map->leaf_level == level)
                        return -EAGAIN;
        } while (true);
        return 0;
}

/*
 * Fast path for the easy case of mapping a 4K page to an already allocated
 * table. This is a common workload. If it returns -EAGAIN run the full
 * algorithm instead.
 */
static __always_inline int __do_map_single_page(struct pt_range *range,
                                                void *arg, unsigned int level,
                                                struct pt_table_p *table,
                                                pt_level_fn_t descend_fn)
{
        struct pt_state pts = pt_init(range, level, table);
        struct pt_iommu_map_args *map = arg;

        pts.type = pt_load_single_entry(&pts);
        if (pts.level == 0) {
                if (pts.type != PT_ENTRY_EMPTY)
                        return -EADDRINUSE;
                pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
                                      &map->attrs);
                /* No flush, this path is not used when incoherent */
                map->oa += PAGE_SIZE;
                return 0;
        }
        if (pts.type == PT_ENTRY_TABLE)
                return pt_descend(&pts, arg, descend_fn);
        /* Something else, use the slow path */
        return -EAGAIN;
}
PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);

/*
 * Add a table to the top, increasing the top level as much as necessary to
 * encompass the range.
 */
static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
                        struct pt_iommu_map_args *map)
{
        struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
        struct pt_common *common = common_from_iommu(iommu_table);
        uintptr_t top_of_table = READ_ONCE(common->top_of_table);
        uintptr_t new_top_of_table = top_of_table;
        struct pt_table_p *table_mem;
        unsigned int new_level;
        spinlock_t *domain_lock;
        unsigned long flags;
        int ret;

        while (true) {
                struct pt_range top_range =
                        _pt_top_range(common, new_top_of_table);
                struct pt_state pts = pt_init_top(&top_range);

                top_range.va = range->va;
                top_range.last_va = range->last_va;

                if (!pt_check_range(&top_range) &&
                    map->leaf_level <= pts.level) {
                        new_level = pts.level;
                        break;
                }

                pts.level++;
                if (pts.level > PT_MAX_TOP_LEVEL ||
                    pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
                        ret = -ERANGE;
                        goto err_free;
                }

                table_mem =
                        table_alloc_top(common, _pt_top_set(NULL, pts.level),
                                        map->attrs.gfp,
                                        ALLOC_DEFER_COHERENT_FLUSH);
                if (IS_ERR(table_mem)) {
                        ret = PTR_ERR(table_mem);
                        goto err_free;
                }
                iommu_pages_list_add(&free_list, table_mem);

                /* The new table always links to the lower table at index 0 */
                top_range.va = 0;
                top_range.top_level = pts.level;
                pts.table_lower = pts.table;
                pts.table = table_mem;
                pt_load_single_entry(&pts);
                PT_WARN_ON(pts.index != 0);
                pt_install_table(&pts, virt_to_phys(pts.table_lower),
                                 &map->attrs);
                new_top_of_table = _pt_top_set(pts.table, pts.level);
        }

        /*
         * Avoid double flushing; flush once after all the pt_install_table()
         * calls.
         */
        if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
                ret = iommu_pages_start_incoherent_list(
                        &free_list, iommu_table->iommu_device);
                if (ret)
                        goto err_free;
        }

        /*
         * top_of_table is write locked by the spinlock, but readers can use
         * READ_ONCE() to get the value. Since we encode both the level and the
         * pointer in one value the lockless reader will always see something
         * valid. The HW must be updated to the new level under the spinlock
         * before top_of_table is updated so that concurrent readers don't map
         * into the new level until it is fully functional. If another thread
         * already updated it while we were working then throw everything away
         * and try again.
         */
        domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table);
        spin_lock_irqsave(domain_lock, flags);
        if (common->top_of_table != top_of_table ||
            top_of_table == new_top_of_table) {
                spin_unlock_irqrestore(domain_lock, flags);
                ret = -EAGAIN;
                goto err_free;
        }

        /*
         * We do not issue any flushes for change_top on the expectation that
         * any walk cache will not become a problem by adding another layer to
         * the tree. Misses will rewalk from the updated top pointer, hits
         * continue to be correct. Negative caching is fine too since all the
         * new IOVA added by the new top is non-present.
         */
        iommu_table->driver_ops->change_top(
                iommu_table, virt_to_phys(table_mem), new_level);
        WRITE_ONCE(common->top_of_table, new_top_of_table);
        spin_unlock_irqrestore(domain_lock, flags);
        return 0;

err_free:
        if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
                iommu_pages_stop_incoherent_list(&free_list,
                                                 iommu_table->iommu_device);
        iommu_put_pages_list(&free_list);
        return ret;
}

static int check_map_range(struct pt_iommu *iommu_table,
                           struct pt_range *range,
                           struct pt_iommu_map_args *map)
{
        struct pt_common *common = common_from_iommu(iommu_table);
        int ret;

        do {
                ret = pt_check_range(range);
                if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
                        return ret;

                if (!ret && map->leaf_level <= range->top_level)
                        break;

                ret = increase_top(iommu_table, range, map);
                if (ret && ret != -EAGAIN)
                        return ret;

                /* Reload the new top */
                *range = pt_make_range(common, range->va, range->last_va);
        } while (ret);
        PT_WARN_ON(pt_check_range(range));
        return 0;
}

static int do_map(struct pt_range *range, struct pt_common *common,
                  bool single_page, struct pt_iommu_map_args *map)
{
        int ret;

        /*
         * The __map_single_page() fast path does not support DMA_INCOHERENT
         * flushing to keep its .text small.
         */
        if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
                ret = pt_walk_range(range, __map_single_page, map);
                if (ret != -EAGAIN)
                        return ret;
                /* -EAGAIN falls through to the full path */
        }

        do {
                if (map->leaf_level == range->top_level)
                        ret = pt_walk_range(range, __map_range_leaf, map);
                else
                        ret = pt_walk_range(range, __map_range, map);
        } while (ret == -EAGAIN);
        return ret;
}

static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova,
                         phys_addr_t paddr, dma_addr_t len, unsigned int prot,
                         gfp_t gfp, size_t *mapped)
{
        pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
        struct pt_common *common = common_from_iommu(iommu_table);
        struct iommu_iotlb_gather iotlb_gather;
        struct pt_iommu_map_args map = {
                .iotlb_gather = &iotlb_gather,
                .oa = paddr,
        };
        bool single_page = false;
        struct pt_range range;
        int ret;

        iommu_iotlb_gather_init(&iotlb_gather);

        if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
                return -EINVAL;

        /* Check the paddr doesn't exceed what the table can store */
        if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
             (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
            (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
             oalog2_div(paddr, common->max_oasz_lg2)))
                return -ERANGE;

        ret = pt_iommu_set_prot(common, &map.attrs, prot);
        if (ret)
                return ret;
        map.attrs.gfp = gfp;

        ret = make_range_no_check(common, &range, iova, len);
        if (ret)
                return ret;

        /* Calculate target page size and level for the leaves */
        if (pt_has_system_page_size(common) && len == PAGE_SIZE &&
            likely(pgsize_bitmap & PAGE_SIZE)) {
                if (log2_mod(iova | paddr, PAGE_SHIFT))
                        return -ENXIO;
                map.leaf_pgsize_lg2 = PAGE_SHIFT;
                map.leaf_level = 0;
                map.num_leaves = 1;
                single_page = true;
        } else {
                map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
                        pgsize_bitmap, range.va, range.last_va, paddr);
                if (!map.leaf_pgsize_lg2)
                        return -ENXIO;
                map.leaf_level =
                        pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
                map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va,
                                               range.last_va, paddr,
                                               map.leaf_pgsize_lg2);
        }

        ret = check_map_range(iommu_table, &range, &map);
        if (ret)
                return ret;

        PT_WARN_ON(map.leaf_level > range.top_level);

        ret = do_map(&range, common, single_page, &map);

        /*
         * Table levels were freed and replaced with large items; flush any
         * walk cache that may refer to the freed levels.
         */
        if (!iommu_pages_list_empty(&iotlb_gather.freelist))
                iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);

        /* Bytes successfully mapped */
        PT_WARN_ON(!ret && map.oa - paddr != len);
        *mapped += map.oa - paddr;
        return ret;
}

struct pt_unmap_args {
        struct iommu_pages_list free_list;
        pt_vaddr_t unmapped;
};

static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
                                        unsigned int level,
                                        struct pt_table_p *table)
{
        struct pt_state pts = pt_init(range, level, table);
        unsigned int flush_start_index = UINT_MAX;
        unsigned int flush_end_index = UINT_MAX;
        struct pt_unmap_args *unmap = arg;
        unsigned int num_oas = 0;
        unsigned int start_index;
        int ret = 0;

        _pt_iter_first(&pts);
        start_index = pts.index;
        pts.type = pt_load_entry_raw(&pts);
        /*
         * The starting index is in the middle of a contiguous entry.
         *
         * The IOMMU API does not require drivers to support unmapping parts of
         * large pages. Long ago VFIO would try to split maps but the current
         * version never does.
         *
         * Instead when unmap reaches a partial unmap of the start of a large
         * IOPTE it should remove the entire IOPTE and return that size to the
         * caller.
         */
        if (pts.type == PT_ENTRY_OA) {
                if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
                        return -EINVAL;
                /* Micro optimization */
                goto start_oa;
        }

        do {
                if (pts.type != PT_ENTRY_OA) {
                        bool fully_covered;

                        if (pts.type != PT_ENTRY_TABLE) {
                                ret = -EINVAL;
                                break;
                        }

                        if (pts.index != start_index)
                                pt_index_to_va(&pts);
                        pts.table_lower = pt_table_ptr(&pts);

                        fully_covered = pt_entry_fully_covered(
                                &pts, pt_table_item_lg2sz(&pts));

                        ret = pt_descend(&pts, arg, __unmap_range);
                        if (ret)
                                break;

                        /*
                         * If the unmapping range fully covers the table then
                         * we can free it as well. The clear is delayed until
                         * we succeed in clearing the lower table levels.
                         */
                        if (fully_covered) {
                                iommu_pages_list_add(&unmap->free_list,
                                                     pts.table_lower);
                                pt_clear_entries(&pts, ilog2(1));
                                if (pts.index < flush_start_index)
                                        flush_start_index = pts.index;
                                flush_end_index = pts.index + 1;
                        }
                        pts.index++;
                } else {
                        unsigned int num_contig_lg2;
start_oa:
                        /*
                         * If the caller requested a last address that falls
                         * within a single entry then the entire entry is
                         * unmapped and the length returned will be larger
                         * than requested.
                         */
                        num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
                        pt_clear_entries(&pts, num_contig_lg2);
                        num_oas += log2_to_int(num_contig_lg2);
                        if (pts.index < flush_start_index)
                                flush_start_index = pts.index;
                        pts.index += log2_to_int(num_contig_lg2);
                        flush_end_index = pts.index;
                }
                if (pts.index >= pts.end_index)
                        break;
                pts.type = pt_load_entry_raw(&pts);
        } while (true);

        unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
        if (flush_start_index != flush_end_index)
                flush_writes_range(&pts, flush_start_index, flush_end_index);

        return ret;
}

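/*
 * Caller-visible sketch of the partial-unmap rule above (sizes illustrative):
 * after mapping a single 2M IOPTE, unmapping any 4K sub-range removes the
 * whole IOPTE and reports the full size back to the caller:
 *
 *      iommu_map(domain, iova, paddr, SZ_2M, prot, GFP_KERNEL);
 *      unmapped = iommu_unmap(domain, iova, SZ_4K);
 *      // unmapped == SZ_2M
 */
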
static size_t NS(unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova,
                              dma_addr_t len,
                              struct iommu_iotlb_gather *iotlb_gather)
{
        struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT(
                                               unmap.free_list) };
        struct pt_range range;
        int ret;

        ret = make_range(common_from_iommu(iommu_table), &range, iova, len);
        if (ret)
                return 0;

        pt_walk_range(&range, __unmap_range, &unmap);

        gather_range_pages(iotlb_gather, iommu_table, iova, unmap.unmapped,
                           &unmap.free_list);

        return unmap.unmapped;
}

static void NS(get_info)(struct pt_iommu *iommu_table,
                         struct pt_iommu_info *info)
{
        struct pt_common *common = common_from_iommu(iommu_table);
        struct pt_range range = pt_top_range(common);
        struct pt_state pts = pt_init_top(&range);
        pt_vaddr_t pgsize_bitmap = 0;

        if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) {
                for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL;
                     pts.level++) {
                        if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2)
                                break;
                        pgsize_bitmap |= pt_possible_sizes(&pts);
                }
        } else {
                for (pts.level = 0; pts.level <= range.top_level; pts.level++)
                        pgsize_bitmap |= pt_possible_sizes(&pts);
        }

        /* Hide page sizes larger than the maximum OA */
        info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2);
}

static void NS(deinit)(struct pt_iommu *iommu_table)
{
        struct pt_common *common = common_from_iommu(iommu_table);
        struct pt_range range = pt_all_range(common);
        struct pt_iommu_collect_args collect = {
                .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
        };

        iommu_pages_list_add(&collect.free_list, range.top_table);
        pt_walk_range(&range, __collect_tables, &collect);

        /*
         * The driver must have already fenced the HW access to the page table
         * and invalidated any caching referring to this memory.
         */
        if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
                iommu_pages_stop_incoherent_list(&collect.free_list,
                                                 iommu_table->iommu_device);
        iommu_put_pages_list(&collect.free_list);
}

static const struct pt_iommu_ops NS(ops) = {
        .map_range = NS(map_range),
        .unmap_range = NS(unmap_range),
#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
        IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
        .set_dirty = NS(set_dirty),
#endif
        .get_info = NS(get_info),
        .deinit = NS(deinit),
};

static int pt_init_common(struct pt_common *common)
{
        struct pt_range top_range = pt_top_range(common);

        if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
                return -EINVAL;

        if (top_range.top_level == PT_MAX_TOP_LEVEL ||
            common->max_vasz_lg2 == top_range.max_vasz_lg2)
                common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);

        if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
                common->features |= BIT(PT_FEAT_FULL_VA);

        /* Requested features must match features compiled into this format */
        if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
            (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
             (common->features & PT_FORCE_ENABLED_FEATURES) !=
                     PT_FORCE_ENABLED_FEATURES))
                return -EOPNOTSUPP;

        /*
         * Check if the top level of the page table is too small to hold the
         * specified max_vasz_lg2.
         */
        if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
            top_range.top_level != PT_MAX_TOP_LEVEL) {
                struct pt_state pts = { .range = &top_range,
                                        .level = top_range.top_level };

                if (common->max_vasz_lg2 >
                    pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts))
                        return -EOPNOTSUPP;
        }

        if (common->max_oasz_lg2 == 0)
                common->max_oasz_lg2 = pt_max_oa_lg2(common);
        else
                common->max_oasz_lg2 = min(common->max_oasz_lg2,
                                           pt_max_oa_lg2(common));
        return 0;
}

static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
                                struct iommu_domain *domain)
{
        struct pt_common *common = common_from_iommu(iommu_table);
        struct pt_iommu_info info;
        struct pt_range range;

        NS(get_info)(iommu_table, &info);

        domain->type = __IOMMU_DOMAIN_PAGING;
        domain->pgsize_bitmap = info.pgsize_bitmap;
        domain->is_iommupt = true;

        if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
                range = _pt_top_range(common,
                                      _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
        else
                range = pt_top_range(common);

        /* A 64-bit high address space table on a 32-bit system cannot work. */
        domain->geometry.aperture_start = (unsigned long)range.va;
        if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
                return -EOVERFLOW;

        /*
         * The aperture is limited to what the API can do after considering all
         * the different types dma_addr_t/unsigned long/pt_vaddr_t that are
         * used to store a VA. Set the aperture to something that is valid for
         * all cases. Saturate instead of truncate the end if the types are
         * smaller than the top range. aperture_end should be called
         * aperture_last.
         */
        domain->geometry.aperture_end = (unsigned long)range.last_va;
        if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
                domain->geometry.aperture_end = ULONG_MAX;
                domain->pgsize_bitmap &= ULONG_MAX;
        }
        domain->geometry.force_aperture = true;

        return 0;
}

static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
{
        struct pt_iommu *iommu_table = &fmt_table->iommu;
        struct pt_iommu cfg = *iommu_table;

        static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
        memset_after(fmt_table, 0, iommu.domain);

        /* The caller can initialize some of these values */
        iommu_table->iommu_device = cfg.iommu_device;
        iommu_table->driver_ops = cfg.driver_ops;
        iommu_table->nid = cfg.nid;
}

#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
1247

int pt_iommu_init(struct pt_iommu_table *fmt_table,
                  const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
{
        struct pt_iommu *iommu_table = &fmt_table->iommu;
        struct pt_common *common = common_from_iommu(iommu_table);
        struct pt_table_p *table_mem;
        int ret;

        if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
            !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
                return -EINVAL;

        pt_iommu_zero(fmt_table);
        common->features = cfg->common.features;
        common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
        common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
        ret = pt_iommu_fmt_init(fmt_table, cfg);
        if (ret)
                return ret;

        if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
                return -EINVAL;

        ret = pt_init_common(common);
        if (ret)
                return ret;

        if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
            WARN_ON(!iommu_table->driver_ops ||
                    !iommu_table->driver_ops->change_top ||
                    !iommu_table->driver_ops->get_top_lock))
                return -EINVAL;

        if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
            (pt_feature(common, PT_FEAT_FULL_VA) ||
             pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
                return -EINVAL;

        if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
            WARN_ON(!iommu_table->iommu_device))
                return -EINVAL;

        ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
        if (ret)
                return ret;

        table_mem = table_alloc_top(common, common->top_of_table, gfp,
                                    ALLOC_NORMAL);
        if (IS_ERR(table_mem))
                return PTR_ERR(table_mem);
        pt_top_set(common, table_mem, pt_top_get_level(common));

        /* Must be last, see pt_iommu_deinit() */
        iommu_table->ops = &NS(ops);
        return 0;
}
EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");

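/*
 * A minimal driver-side initialization sketch (the struct and field names
 * other than the cfg->common members are hypothetical): the driver fills in
 * the HW limits and requested features, then initializes the table embedded
 * in its domain:
 *
 *      struct pt_iommu_myfmt_cfg cfg = {
 *              .common = {
 *                      .hw_max_vasz_lg2 = 48,
 *                      .hw_max_oasz_lg2 = 52,
 *                      .features = BIT(PT_FEAT_DYNAMIC_TOP),
 *              },
 *      };
 *      int ret = pt_iommu_myfmt_init(&mydomain->fmt_table, &cfg, GFP_KERNEL);
 *      if (ret)
 *              goto err_free_domain;
 */
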
#ifdef pt_iommu_fmt_hw_info
#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
                      struct pt_iommu_table_hw_info *info)
{
        struct pt_iommu *iommu_table = &fmt_table->iommu;
        struct pt_common *common = common_from_iommu(iommu_table);
        struct pt_range top_range = pt_top_range(common);

        pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
}
EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
#endif

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
MODULE_IMPORT_NS("GENERIC_PT");
/* For iommu_dirty_bitmap_record() */
MODULE_IMPORT_NS("IOMMUFD");

#endif /* __GENERIC_PT_IOMMU_PT_H */