1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
56
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59
60 /*
61 * set to 1 to panic kernel if can't successfully enable VT-d
62 * (used when kernel is launched w/ TXT)
63 */
64 static int force_on = 0;
65 static int intel_iommu_tboot_noforce;
66 static int no_platform_optin;
67
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69
70 /*
71 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72 * if marked present.
73 */
74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 if (!(re->lo & 1))
77 return 0;
78
79 return re->lo & VTD_PAGE_MASK;
80 }
81
82 /*
83 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84 * if marked present.
85 */
86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 if (!(re->hi & 1))
89 return 0;
90
91 return re->hi & VTD_PAGE_MASK;
92 }
93
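/*
 * Comparison helpers for the per-IOMMU device rbtree. The tree is keyed
 * by the PCI request ID (bus number in the high byte, devfn in the low
 * byte) of each probed device, so rb_find()/rb_find_add() can locate a
 * device_domain_info from a source ID reported by the hardware.
 */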
94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 struct device_domain_info *info =
97 rb_entry(node, struct device_domain_info, node);
98 const u16 *rid_lhs = key;
99
100 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 return -1;
102
103 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 return 1;
105
106 return 0;
107 }
108
109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 struct device_domain_info *info =
112 rb_entry(lhs, struct device_domain_info, node);
113 u16 key = PCI_DEVID(info->bus, info->devfn);
114
115 return device_rid_cmp_key(&key, rhs);
116 }
117
118 /*
119 * Looks up an IOMMU-probed device using its source ID.
120 *
121 * Returns the pointer to the device if there is a match. Otherwise,
122 * returns NULL.
123 *
124 * Note that this helper doesn't guarantee that the device won't be
125 * released by the iommu subsystem after being returned. The caller
126 * should use its own synchronization mechanism to avoid the device
127 * being released during its use if that is a possibility.
128 */
129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 struct device_domain_info *info = NULL;
132 struct rb_node *node;
133 unsigned long flags;
134
135 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 if (node)
138 info = rb_entry(node, struct device_domain_info, node);
139 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140
141 return info ? info->dev : NULL;
142 }
143
144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 struct device_domain_info *info)
146 {
147 struct rb_node *curr;
148 unsigned long flags;
149
150 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 if (WARN_ON(curr))
154 return -EEXIST;
155
156 return 0;
157 }
158
159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 struct intel_iommu *iommu = info->iommu;
162 unsigned long flags;
163
164 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 rb_erase(&info->node, &iommu->device_rbtree);
166 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168
169 struct dmar_rmrr_unit {
170 struct list_head list; /* list of rmrr units */
171 struct acpi_dmar_header *hdr; /* ACPI header */
172 u64 base_address; /* reserved base address*/
173 u64 end_address; /* reserved end address */
174 struct dmar_dev_scope *devices; /* target devices */
175 int devices_cnt; /* target device count */
176 };
177
178 struct dmar_atsr_unit {
179 struct list_head list; /* list of ATSR units */
180 struct acpi_dmar_header *hdr; /* ACPI header */
181 struct dmar_dev_scope *devices; /* target devices */
182 int devices_cnt; /* target device count */
183 u8 include_all:1; /* include all ports */
184 };
185
186 struct dmar_satc_unit {
187 struct list_head list; /* list of SATC units */
188 struct acpi_dmar_header *hdr; /* ACPI header */
189 struct dmar_dev_scope *devices; /* target devices */
190 struct intel_iommu *iommu; /* the corresponding iommu */
191 int devices_cnt; /* target device count */
192 u8 atc_required:1; /* ATS is required */
193 };
194
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198
199 #define for_each_rmrr_units(rmrr) \
200 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
213 static int disable_igfx_iommu;
214
215 #define IDENTMAP_AZALIA 4
216
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219
220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224
225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229
230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 u32 gsts;
233
234 gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 if (gsts & DMA_GSTS_TES)
236 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238
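/*
 * Parse the comma-separated options of the "intel_iommu=" kernel command
 * line parameter. Unknown options are reported and skipped.
 */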
239 static int __init intel_iommu_setup(char *str)
240 {
241 if (!str)
242 return -EINVAL;
243
244 while (*str) {
245 if (!strncmp(str, "on", 2)) {
246 dmar_disabled = 0;
247 pr_info("IOMMU enabled\n");
248 } else if (!strncmp(str, "off", 3)) {
249 dmar_disabled = 1;
250 no_platform_optin = 1;
251 pr_info("IOMMU disabled\n");
252 } else if (!strncmp(str, "igfx_off", 8)) {
253 disable_igfx_iommu = 1;
254 pr_info("Disable GFX device mapping\n");
255 } else if (!strncmp(str, "forcedac", 8)) {
256 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 iommu_dma_forcedac = true;
258 } else if (!strncmp(str, "strict", 6)) {
259 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 iommu_set_dma_strict();
261 } else if (!strncmp(str, "sp_off", 6)) {
262 pr_info("Disable supported super page\n");
263 intel_iommu_superpage = 0;
264 } else if (!strncmp(str, "sm_on", 5)) {
265 pr_info("Enable scalable mode if hardware supports\n");
266 intel_iommu_sm = 1;
267 } else if (!strncmp(str, "sm_off", 6)) {
268 pr_info("Scalable mode is disallowed\n");
269 intel_iommu_sm = 0;
270 } else if (!strncmp(str, "tboot_noforce", 13)) {
271 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 intel_iommu_tboot_noforce = 1;
273 } else {
274 pr_notice("Unknown option - '%s'\n", str);
275 }
276
277 str += strcspn(str, ",");
278 while (*str == ',')
279 str++;
280 }
281
282 return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
285
286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289
290 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
292
293 /*
294 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296 * the returned SAGAW.
297 */
298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 unsigned long fl_sagaw, sl_sagaw;
301
302 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 sl_sagaw = cap_sagaw(iommu->cap);
304
305 /* Second level only. */
306 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 return sl_sagaw;
308
309 /* First level only. */
310 if (!ecap_slts(iommu->ecap))
311 return fl_sagaw;
312
313 return fl_sagaw & sl_sagaw;
314 }
315
316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
317 {
318 unsigned long sagaw;
319 int agaw;
320
321 sagaw = __iommu_calculate_sagaw(iommu);
322 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
323 if (test_bit(agaw, &sagaw))
324 break;
325 }
326
327 return agaw;
328 }
329
330 /*
331 * Calculate max SAGAW for each iommu.
332 */
333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337
338 /*
339 * Calculate the agaw for each iommu.
340 * "SAGAW" may be different across iommus; use a default agaw, and
341 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
342 */
343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
347
348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 return sm_supported(iommu) ?
351 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353
354 /* Return the super pagesize bitmap if supported. */
355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 unsigned long bitmap = 0;
358
359 /*
360 * 1-level super page supports page size of 2MiB, 2-level super page
361 * supports page size of both 2MiB and 1GiB.
362 */
363 if (domain->iommu_superpage == 1)
364 bitmap |= SZ_2M;
365 else if (domain->iommu_superpage == 2)
366 bitmap |= SZ_2M | SZ_1G;
367
368 return bitmap;
369 }
370
371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 u8 devfn, int alloc)
373 {
374 struct root_entry *root = &iommu->root_entry[bus];
375 struct context_entry *context;
376 u64 *entry;
377
378 /*
379 * Unless the caller requested to allocate a new entry,
380 * returning a copied context entry makes no sense.
381 */
382 if (!alloc && context_copied(iommu, bus, devfn))
383 return NULL;
384
385 entry = &root->lo;
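/*
 * In scalable mode a root entry points to two context tables: the lower
 * pointer covers devfn 0x00-0x7f and the upper pointer covers devfn
 * 0x80-0xff. Each device function then occupies two 128-bit context-entry
 * slots (a 256-bit scalable-mode entry), hence devfn is doubled below.
 */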
386 if (sm_supported(iommu)) {
387 if (devfn >= 0x80) {
388 devfn -= 0x80;
389 entry = &root->hi;
390 }
391 devfn *= 2;
392 }
393 if (*entry & 1)
394 context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 else {
396 unsigned long phy_addr;
397 if (!alloc)
398 return NULL;
399
400 context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
401 SZ_4K);
402 if (!context)
403 return NULL;
404
405 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
406 phy_addr = virt_to_phys((void *)context);
407 *entry = phy_addr | 1;
408 __iommu_flush_cache(iommu, entry, sizeof(*entry));
409 }
410 return &context[devfn];
411 }
412
413 /**
414 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
415 * sub-hierarchy of a candidate PCI-PCI bridge
416 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
417 * @bridge: the candidate PCI-PCI bridge
418 *
419 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
420 */
421 static bool
422 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
423 {
424 struct pci_dev *pdev, *pbridge;
425
426 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
427 return false;
428
429 pdev = to_pci_dev(dev);
430 pbridge = to_pci_dev(bridge);
431
432 if (pbridge->subordinate &&
433 pbridge->subordinate->number <= pdev->bus->number &&
434 pbridge->subordinate->busn_res.end >= pdev->bus->number)
435 return true;
436
437 return false;
438 }
439
440 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
441 {
442 struct dmar_drhd_unit *drhd;
443 u32 vtbar;
444 int rc;
445
446 /* We know that this device on this chipset has its own IOMMU.
447 * If we find it under a different IOMMU, then the BIOS is lying
448 * to us. Hope that the IOMMU for this device is actually
449 * disabled, and it needs no translation...
450 */
451 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
452 if (rc) {
453 /* "can't" happen */
454 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
455 return false;
456 }
457 vtbar &= 0xffff0000;
458
459 /* we know that this iommu should be at offset 0xa000 from vtbar */
460 drhd = dmar_find_matched_drhd_unit(pdev);
461 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
462 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
463 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
464 return true;
465 }
466
467 return false;
468 }
469
470 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
471 {
472 if (!iommu || iommu->drhd->ignored)
473 return true;
474
475 if (dev_is_pci(dev)) {
476 struct pci_dev *pdev = to_pci_dev(dev);
477
478 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
479 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
480 quirk_ioat_snb_local_iommu(pdev))
481 return true;
482 }
483
484 return false;
485 }
486
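/*
 * Walk the DRHD device scope tables to find the IOMMU unit that serves
 * @dev. PCI VFs are looked up through their PF, and ACPI devices through
 * their ACPI companion. On success, @bus and @devfn return the source ID
 * the IOMMU will see for @dev.
 */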
487 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
488 {
489 struct dmar_drhd_unit *drhd = NULL;
490 struct pci_dev *pdev = NULL;
491 struct intel_iommu *iommu;
492 struct device *tmp;
493 u16 segment = 0;
494 int i;
495
496 if (!dev)
497 return NULL;
498
499 if (dev_is_pci(dev)) {
500 struct pci_dev *pf_pdev;
501
502 pdev = pci_real_dma_dev(to_pci_dev(dev));
503
504 /* VFs aren't listed in scope tables; we need to look up
505 * the PF instead to find the IOMMU. */
506 pf_pdev = pci_physfn(pdev);
507 dev = &pf_pdev->dev;
508 segment = pci_domain_nr(pdev->bus);
509 } else if (has_acpi_companion(dev))
510 dev = &ACPI_COMPANION(dev)->dev;
511
512 rcu_read_lock();
513 for_each_iommu(iommu, drhd) {
514 if (pdev && segment != drhd->segment)
515 continue;
516
517 for_each_active_dev_scope(drhd->devices,
518 drhd->devices_cnt, i, tmp) {
519 if (tmp == dev) {
520 /* For a VF use its original BDF# not that of the PF
521 * which we used for the IOMMU lookup. Strictly speaking
522 * we could do this for all PCI devices; we only need to
523 * get the BDF# from the scope table for ACPI matches. */
524 if (pdev && pdev->is_virtfn)
525 goto got_pdev;
526
527 if (bus && devfn) {
528 *bus = drhd->devices[i].bus;
529 *devfn = drhd->devices[i].devfn;
530 }
531 goto out;
532 }
533
534 if (is_downstream_to_pci_bridge(dev, tmp))
535 goto got_pdev;
536 }
537
538 if (pdev && drhd->include_all) {
539 got_pdev:
540 if (bus && devfn) {
541 *bus = pdev->bus->number;
542 *devfn = pdev->devfn;
543 }
544 goto out;
545 }
546 }
547 iommu = NULL;
548 out:
549 if (iommu_is_dummy(iommu, dev))
550 iommu = NULL;
551
552 rcu_read_unlock();
553
554 return iommu;
555 }
556
557 static void domain_flush_cache(struct dmar_domain *domain,
558 void *addr, int size)
559 {
560 if (!domain->iommu_coherency)
561 clflush_cache_range(addr, size);
562 }
563
564 static void free_context_table(struct intel_iommu *iommu)
565 {
566 struct context_entry *context;
567 int i;
568
569 if (!iommu->root_entry)
570 return;
571
572 for (i = 0; i < ROOT_ENTRY_NR; i++) {
573 context = iommu_context_addr(iommu, i, 0, 0);
574 if (context)
575 iommu_free_pages(context);
576
577 if (!sm_supported(iommu))
578 continue;
579
580 context = iommu_context_addr(iommu, i, 0x80, 0);
581 if (context)
582 iommu_free_pages(context);
583 }
584
585 iommu_free_pages(iommu->root_entry);
586 iommu->root_entry = NULL;
587 }
588
589 #ifdef CONFIG_DMAR_DEBUG
590 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
591 u8 bus, u8 devfn, struct dma_pte *parent, int level)
592 {
593 struct dma_pte *pte;
594 int offset;
595
596 while (1) {
597 offset = pfn_level_offset(pfn, level);
598 pte = &parent[offset];
599
600 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
601
602 if (!dma_pte_present(pte)) {
603 pr_info("page table not present at level %d\n", level - 1);
604 break;
605 }
606
607 if (level == 1 || dma_pte_superpage(pte))
608 break;
609
610 parent = phys_to_virt(dma_pte_addr(pte));
611 level--;
612 }
613 }
614
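/*
 * Dump the root, context, PASID and page-table entries that translate the
 * faulting address for the given source ID, to help diagnose DMAR faults.
 */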
615 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
616 unsigned long long addr, u32 pasid)
617 {
618 struct pasid_dir_entry *dir, *pde;
619 struct pasid_entry *entries, *pte;
620 struct context_entry *ctx_entry;
621 struct root_entry *rt_entry;
622 int i, dir_index, index, level;
623 u8 devfn = source_id & 0xff;
624 u8 bus = source_id >> 8;
625 struct dma_pte *pgtable;
626
627 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
628
629 /* root entry dump */
630 if (!iommu->root_entry) {
631 pr_info("root table is not present\n");
632 return;
633 }
634 rt_entry = &iommu->root_entry[bus];
635
636 if (sm_supported(iommu))
637 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
638 rt_entry->hi, rt_entry->lo);
639 else
640 pr_info("root entry: 0x%016llx", rt_entry->lo);
641
642 /* context entry dump */
643 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
644 if (!ctx_entry) {
645 pr_info("context table is not present\n");
646 return;
647 }
648
649 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
650 ctx_entry->hi, ctx_entry->lo);
651
652 /* legacy mode does not require PASID entries */
653 if (!sm_supported(iommu)) {
654 if (!context_present(ctx_entry)) {
655 pr_info("legacy mode page table is not present\n");
656 return;
657 }
658 level = agaw_to_level(ctx_entry->hi & 7);
659 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
660 goto pgtable_walk;
661 }
662
663 if (!context_present(ctx_entry)) {
664 pr_info("pasid directory table is not present\n");
665 return;
666 }
667
668 /* get the pointer to pasid directory entry */
669 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
670
671 /* For request-without-pasid, get the pasid from context entry */
672 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
673 pasid = IOMMU_NO_PASID;
674
675 dir_index = pasid >> PASID_PDE_SHIFT;
676 pde = &dir[dir_index];
677 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
678
679 /* get the pointer to the pasid table entry */
680 entries = get_pasid_table_from_pde(pde);
681 if (!entries) {
682 pr_info("pasid table is not present\n");
683 return;
684 }
685 index = pasid & PASID_PTE_MASK;
686 pte = &entries[index];
687 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
688 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
689
690 if (!pasid_pte_is_present(pte)) {
691 pr_info("scalable mode page table is not present\n");
692 return;
693 }
694
695 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
696 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
697 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
698 } else {
699 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
700 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
701 }
702
703 pgtable_walk:
704 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
705 }
706 #endif
707
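/*
 * Walk the page table for @pfn, allocating missing intermediate levels
 * with @gfp, and return the PTE at *target_level. If *target_level is 0,
 * descend until a superpage, a non-present entry or the last level is
 * reached (allocating nothing), and report the level actually reached
 * back through *target_level. Returns NULL if @pfn is beyond the
 * domain's address width or a table allocation fails.
 */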
708 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
709 unsigned long pfn, int *target_level,
710 gfp_t gfp)
711 {
712 struct dma_pte *parent, *pte;
713 int level = agaw_to_level(domain->agaw);
714 int offset;
715
716 if (!domain_pfn_supported(domain, pfn))
717 /* Address beyond IOMMU's addressing capabilities. */
718 return NULL;
719
720 parent = domain->pgd;
721
722 while (1) {
723 void *tmp_page;
724
725 offset = pfn_level_offset(pfn, level);
726 pte = &parent[offset];
727 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
728 break;
729 if (level == *target_level)
730 break;
731
732 if (!dma_pte_present(pte)) {
733 uint64_t pteval, tmp;
734
735 tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
736 SZ_4K);
737
738 if (!tmp_page)
739 return NULL;
740
741 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
742 pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
743 DMA_PTE_WRITE;
744 if (domain->use_first_level)
745 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
746
747 tmp = 0ULL;
748 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
749 /* Someone else set it while we were thinking; use theirs. */
750 iommu_free_pages(tmp_page);
751 else
752 domain_flush_cache(domain, pte, sizeof(*pte));
753 }
754 if (level == 1)
755 break;
756
757 parent = phys_to_virt(dma_pte_addr(pte));
758 level--;
759 }
760
761 if (!*target_level)
762 *target_level = level;
763
764 return pte;
765 }
766
767 /* return address's pte at specific level */
768 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
769 unsigned long pfn,
770 int level, int *large_page)
771 {
772 struct dma_pte *parent, *pte;
773 int total = agaw_to_level(domain->agaw);
774 int offset;
775
776 parent = domain->pgd;
777 while (level <= total) {
778 offset = pfn_level_offset(pfn, total);
779 pte = &parent[offset];
780 if (level == total)
781 return pte;
782
783 if (!dma_pte_present(pte)) {
784 *large_page = total;
785 break;
786 }
787
788 if (dma_pte_superpage(pte)) {
789 *large_page = total;
790 return pte;
791 }
792
793 parent = phys_to_virt(dma_pte_addr(pte));
794 total--;
795 }
796 return NULL;
797 }
798
799 /* clear last level pte, a tlb flush should be followed */
800 static void dma_pte_clear_range(struct dmar_domain *domain,
801 unsigned long start_pfn,
802 unsigned long last_pfn)
803 {
804 unsigned int large_page;
805 struct dma_pte *first_pte, *pte;
806
807 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
808 WARN_ON(start_pfn > last_pfn))
809 return;
810
811 /* we don't need lock here; nobody else touches the iova range */
812 do {
813 large_page = 1;
814 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
815 if (!pte) {
816 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
817 continue;
818 }
819 do {
820 dma_clear_pte(pte);
821 start_pfn += lvl_to_nr_pages(large_page);
822 pte++;
823 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
824
825 domain_flush_cache(domain, first_pte,
826 (void *)pte - (void *)first_pte);
827
828 } while (start_pfn && start_pfn <= last_pfn);
829 }
830
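/*
 * Recursively walk [start_pfn, last_pfn] at @level and free page-table
 * pages below @retain_level whenever the range covers an entire table.
 */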
831 static void dma_pte_free_level(struct dmar_domain *domain, int level,
832 int retain_level, struct dma_pte *pte,
833 unsigned long pfn, unsigned long start_pfn,
834 unsigned long last_pfn)
835 {
836 pfn = max(start_pfn, pfn);
837 pte = &pte[pfn_level_offset(pfn, level)];
838
839 do {
840 unsigned long level_pfn;
841 struct dma_pte *level_pte;
842
843 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
844 goto next;
845
846 level_pfn = pfn & level_mask(level);
847 level_pte = phys_to_virt(dma_pte_addr(pte));
848
849 if (level > 2) {
850 dma_pte_free_level(domain, level - 1, retain_level,
851 level_pte, level_pfn, start_pfn,
852 last_pfn);
853 }
854
855 /*
856 * Free the page table if we're below the level we want to
857 * retain and the range covers the entire table.
858 */
859 if (level < retain_level && !(start_pfn > level_pfn ||
860 last_pfn < level_pfn + level_size(level) - 1)) {
861 dma_clear_pte(pte);
862 domain_flush_cache(domain, pte, sizeof(*pte));
863 iommu_free_pages(level_pte);
864 }
865 next:
866 pfn += level_size(level);
867 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
868 }
869
870 /*
871 * clear last level (leaf) ptes and free page table pages below the
872 * level we wish to keep intact.
873 */
874 static void dma_pte_free_pagetable(struct dmar_domain *domain,
875 unsigned long start_pfn,
876 unsigned long last_pfn,
877 int retain_level)
878 {
879 dma_pte_clear_range(domain, start_pfn, last_pfn);
880
881 /* We don't need lock here; nobody else touches the iova range */
882 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
883 domain->pgd, 0, start_pfn, last_pfn);
884
885 /* free pgd */
886 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
887 iommu_free_pages(domain->pgd);
888 domain->pgd = NULL;
889 }
890 }
891
892 /* When a page at a given level is being unlinked from its parent, we don't
893 need to *modify* it at all. All we need to do is make a list of all the
894 pages which can be freed just as soon as we've flushed the IOTLB and we
895 know the hardware page-walk will no longer touch them.
896 The 'pte' argument is the *parent* PTE, pointing to the page that is to
897 be freed. */
898 static void dma_pte_list_pagetables(struct dmar_domain *domain,
899 int level, struct dma_pte *parent_pte,
900 struct iommu_pages_list *freelist)
901 {
902 struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
903
904 iommu_pages_list_add(freelist, pte);
905
906 if (level == 1)
907 return;
908
909 do {
910 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 pte++;
913 } while (!first_pte_in_page(pte));
914 }
915
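/*
 * Clear the PTEs covering [start_pfn, last_pfn] at @level. Page-table
 * pages that become entirely unused are not freed here; they are added
 * to @freelist and released only after the IOTLB has been flushed.
 */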
916 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
917 struct dma_pte *pte, unsigned long pfn,
918 unsigned long start_pfn, unsigned long last_pfn,
919 struct iommu_pages_list *freelist)
920 {
921 struct dma_pte *first_pte = NULL, *last_pte = NULL;
922
923 pfn = max(start_pfn, pfn);
924 pte = &pte[pfn_level_offset(pfn, level)];
925
926 do {
927 unsigned long level_pfn = pfn & level_mask(level);
928
929 if (!dma_pte_present(pte))
930 goto next;
931
932 /* If range covers entire pagetable, free it */
933 if (start_pfn <= level_pfn &&
934 last_pfn >= level_pfn + level_size(level) - 1) {
935 /* These subordinate page tables are going away entirely. Don't
936 bother to clear them; we're just going to *free* them. */
937 if (level > 1 && !dma_pte_superpage(pte))
938 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
939
940 dma_clear_pte(pte);
941 if (!first_pte)
942 first_pte = pte;
943 last_pte = pte;
944 } else if (level > 1) {
945 /* Recurse down into a level that isn't *entirely* obsolete */
946 dma_pte_clear_level(domain, level - 1,
947 phys_to_virt(dma_pte_addr(pte)),
948 level_pfn, start_pfn, last_pfn,
949 freelist);
950 }
951 next:
952 pfn = level_pfn + level_size(level);
953 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954
955 if (first_pte)
956 domain_flush_cache(domain, first_pte,
957 (void *)++last_pte - (void *)first_pte);
958 }
959
960 /* We can't just free the pages because the IOMMU may still be walking
961 the page tables, and may have cached the intermediate levels. The
962 pages can only be freed after the IOTLB flush has been done. */
963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 unsigned long last_pfn,
965 struct iommu_pages_list *freelist)
966 {
967 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
968 WARN_ON(start_pfn > last_pfn))
969 return;
970
971 /* we don't need lock here; nobody else touches the iova range */
972 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
973 domain->pgd, 0, start_pfn, last_pfn, freelist);
974
975 /* free pgd */
976 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
977 iommu_pages_list_add(freelist, domain->pgd);
978 domain->pgd = NULL;
979 }
980 }
981
982 /* iommu handling */
983 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984 {
985 struct root_entry *root;
986
987 root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
988 if (!root) {
989 pr_err("Allocating root entry for %s failed\n",
990 iommu->name);
991 return -ENOMEM;
992 }
993
994 __iommu_flush_cache(iommu, root, ROOT_SIZE);
995 iommu->root_entry = root;
996
997 return 0;
998 }
999
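/*
 * Program the root table address into DMAR_RTADDR_REG and latch it with
 * the SRTP command. Unless the hardware flushes its caches as part of
 * SRTP (ESRTPS), explicitly invalidate the context, PASID (scalable mode)
 * and IOTLB caches afterwards.
 */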
1000 static void iommu_set_root_entry(struct intel_iommu *iommu)
1001 {
1002 u64 addr;
1003 u32 sts;
1004 unsigned long flag;
1005
1006 addr = virt_to_phys(iommu->root_entry);
1007 if (sm_supported(iommu))
1008 addr |= DMA_RTADDR_SMT;
1009
1010 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012
1013 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014
1015 /* Make sure hardware complete it */
1016 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 readl, (sts & DMA_GSTS_RTPS), sts);
1018
1019 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020
1021 /*
1022 * Hardware invalidates all DMA remapping hardware translation
1023 * caches as part of SRTP flow.
1024 */
1025 if (cap_esrtps(iommu->cap))
1026 return;
1027
1028 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 if (sm_supported(iommu))
1030 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032 }
1033
1034 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035 {
1036 u32 val;
1037 unsigned long flag;
1038
1039 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 return;
1041
1042 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044
1045 /* Make sure hardware complete it */
1046 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 readl, (!(val & DMA_GSTS_WBFS)), val);
1048
1049 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 }
1051
1052 /* return value determines if we need a write buffer flush */
1053 static void __iommu_flush_context(struct intel_iommu *iommu,
1054 u16 did, u16 source_id, u8 function_mask,
1055 u64 type)
1056 {
1057 u64 val = 0;
1058 unsigned long flag;
1059
1060 switch (type) {
1061 case DMA_CCMD_GLOBAL_INVL:
1062 val = DMA_CCMD_GLOBAL_INVL;
1063 break;
1064 case DMA_CCMD_DOMAIN_INVL:
1065 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 break;
1067 case DMA_CCMD_DEVICE_INVL:
1068 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 break;
1071 default:
1072 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 iommu->name, type);
1074 return;
1075 }
1076 val |= DMA_CCMD_ICC;
1077
1078 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080
1081 /* Make sure hardware complete it */
1082 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084
1085 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086 }
1087
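/*
 * Register-based IOTLB invalidation using the IOTLB registers located at
 * the offset advertised in the extended capability register. Supports
 * global, domain-selective and page-selective granularities.
 */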
1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 unsigned int size_order, u64 type)
1090 {
1091 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 u64 val = 0, val_iva = 0;
1093 unsigned long flag;
1094
1095 switch (type) {
1096 case DMA_TLB_GLOBAL_FLUSH:
1097 /* global flush doesn't need to set IVA_REG */
1098 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 break;
1100 case DMA_TLB_DSI_FLUSH:
1101 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 break;
1103 case DMA_TLB_PSI_FLUSH:
1104 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 /* IH bit is passed in as part of address */
1106 val_iva = size_order | addr;
1107 break;
1108 default:
1109 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 iommu->name, type);
1111 return;
1112 }
1113
1114 if (cap_write_drain(iommu->cap))
1115 val |= DMA_TLB_WRITE_DRAIN;
1116
1117 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 /* Note: Only uses first TLB reg currently */
1119 if (val_iva)
1120 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122
1123 /* Make sure hardware complete it */
1124 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126
1127 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128
1129 /* check IOTLB invalidation granularity */
1130 if (DMA_TLB_IAIG(val) == 0)
1131 pr_err("Flush IOTLB failed\n");
1132 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 (unsigned long long)DMA_TLB_IIRG(type),
1135 (unsigned long long)DMA_TLB_IAIG(val));
1136 }
1137
1138 static struct device_domain_info *
1139 domain_lookup_dev_info(struct dmar_domain *domain,
1140 struct intel_iommu *iommu, u8 bus, u8 devfn)
1141 {
1142 struct device_domain_info *info;
1143 unsigned long flags;
1144
1145 spin_lock_irqsave(&domain->lock, flags);
1146 list_for_each_entry(info, &domain->devices, link) {
1147 if (info->iommu == iommu && info->bus == bus &&
1148 info->devfn == devfn) {
1149 spin_unlock_irqrestore(&domain->lock, flags);
1150 return info;
1151 }
1152 }
1153 spin_unlock_irqrestore(&domain->lock, flags);
1154
1155 return NULL;
1156 }
1157
1158 /*
1159 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161 * check because it applies only to the built-in QAT devices and it doesn't
1162 * grant additional privileges.
1163 */
1164 #define BUGGY_QAT_DEVID_MASK 0x4940
1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166 {
1167 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 return false;
1169
1170 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 return false;
1172
1173 return true;
1174 }
1175
1176 static void iommu_enable_pci_ats(struct device_domain_info *info)
1177 {
1178 struct pci_dev *pdev;
1179
1180 if (!info->ats_supported)
1181 return;
1182
1183 pdev = to_pci_dev(info->dev);
1184 if (!pci_ats_page_aligned(pdev))
1185 return;
1186
1187 if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1188 info->ats_enabled = 1;
1189 }
1190
1191 static void iommu_disable_pci_ats(struct device_domain_info *info)
1192 {
1193 if (!info->ats_enabled)
1194 return;
1195
1196 pci_disable_ats(to_pci_dev(info->dev));
1197 info->ats_enabled = 0;
1198 }
1199
1200 static void iommu_enable_pci_pri(struct device_domain_info *info)
1201 {
1202 struct pci_dev *pdev;
1203
1204 if (!info->ats_enabled || !info->pri_supported)
1205 return;
1206
1207 pdev = to_pci_dev(info->dev);
1208 /* PASID is required in PRG Response Message. */
1209 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1210 return;
1211
1212 if (pci_reset_pri(pdev))
1213 return;
1214
1215 if (!pci_enable_pri(pdev, PRQ_DEPTH))
1216 info->pri_enabled = 1;
1217 }
1218
1219 static void iommu_disable_pci_pri(struct device_domain_info *info)
1220 {
1221 if (!info->pri_enabled)
1222 return;
1223
1224 if (WARN_ON(info->iopf_refcount))
1225 iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
1226
1227 pci_disable_pri(to_pci_dev(info->dev));
1228 info->pri_enabled = 0;
1229 }
1230
1231 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1232 {
1233 cache_tag_flush_all(to_dmar_domain(domain));
1234 }
1235
1236 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1237 {
1238 u32 pmen;
1239 unsigned long flags;
1240
1241 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1242 return;
1243
1244 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1245 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1246 pmen &= ~DMA_PMEN_EPM;
1247 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1248
1249 /* wait for the protected region status bit to clear */
1250 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1251 readl, !(pmen & DMA_PMEN_PRS), pmen);
1252
1253 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1254 }
1255
1256 static void iommu_enable_translation(struct intel_iommu *iommu)
1257 {
1258 u32 sts;
1259 unsigned long flags;
1260
1261 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1262 iommu->gcmd |= DMA_GCMD_TE;
1263 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1264
1265 /* Make sure hardware complete it */
1266 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1267 readl, (sts & DMA_GSTS_TES), sts);
1268
1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1270 }
1271
1272 static void iommu_disable_translation(struct intel_iommu *iommu)
1273 {
1274 u32 sts;
1275 unsigned long flag;
1276
1277 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1278 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1279 return;
1280
1281 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 iommu->gcmd &= ~DMA_GCMD_TE;
1283 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1284
1285 /* Make sure hardware complete it */
1286 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1287 readl, (!(sts & DMA_GSTS_TES)), sts);
1288
1289 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1290 }
1291
1292 static void disable_dmar_iommu(struct intel_iommu *iommu)
1293 {
1294 /*
1295 * All iommu domains must have been detached from the devices,
1296 * hence there should be no domain IDs in use.
1297 */
1298 if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
1299 return;
1300
1301 if (iommu->gcmd & DMA_GCMD_TE)
1302 iommu_disable_translation(iommu);
1303 }
1304
1305 static void free_dmar_iommu(struct intel_iommu *iommu)
1306 {
1307 if (iommu->copied_tables) {
1308 bitmap_free(iommu->copied_tables);
1309 iommu->copied_tables = NULL;
1310 }
1311
1312 /* free context mapping */
1313 free_context_table(iommu);
1314
1315 if (ecap_prs(iommu->ecap))
1316 intel_iommu_finish_prq(iommu);
1317 }
1318
1319 /*
1320 * Check and return whether first level is used by default for
1321 * DMA translation.
1322 */
1323 static bool first_level_by_default(struct intel_iommu *iommu)
1324 {
1325 /* Only SL is available in legacy mode */
1326 if (!sm_supported(iommu))
1327 return false;
1328
1329 /* Only one level (either FL or SL) is available, just use it */
1330 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1331 return ecap_flts(iommu->ecap);
1332
1333 return true;
1334 }
1335
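/*
 * Bind @domain to @iommu: allocate a domain ID on that IOMMU the first
 * time the domain is attached to it, and track the attachment with a
 * per-IOMMU refcount in domain->iommu_array.
 */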
1336 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1337 {
1338 struct iommu_domain_info *info, *curr;
1339 int num, ret = -ENOSPC;
1340
1341 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1342 return 0;
1343
1344 info = kzalloc(sizeof(*info), GFP_KERNEL);
1345 if (!info)
1346 return -ENOMEM;
1347
1348 guard(mutex)(&iommu->did_lock);
1349 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1350 if (curr) {
1351 curr->refcnt++;
1352 kfree(info);
1353 return 0;
1354 }
1355
1356 num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
1357 cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
1358 if (num < 0) {
1359 pr_err("%s: No free domain ids\n", iommu->name);
1360 goto err_unlock;
1361 }
1362
1363 info->refcnt = 1;
1364 info->did = num;
1365 info->iommu = iommu;
1366 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1367 NULL, info, GFP_KERNEL);
1368 if (curr) {
1369 ret = xa_err(curr) ? : -EBUSY;
1370 goto err_clear;
1371 }
1372
1373 return 0;
1374
1375 err_clear:
1376 ida_free(&iommu->domain_ida, info->did);
1377 err_unlock:
1378 kfree(info);
1379 return ret;
1380 }
1381
1382 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1383 {
1384 struct iommu_domain_info *info;
1385
1386 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1387 return;
1388
1389 guard(mutex)(&iommu->did_lock);
1390 info = xa_load(&domain->iommu_array, iommu->seq_id);
1391 if (--info->refcnt == 0) {
1392 ida_free(&iommu->domain_ida, info->did);
1393 xa_erase(&domain->iommu_array, iommu->seq_id);
1394 domain->nid = NUMA_NO_NODE;
1395 kfree(info);
1396 }
1397 }
1398
1399 static void domain_exit(struct dmar_domain *domain)
1400 {
1401 if (domain->pgd) {
1402 struct iommu_pages_list freelist =
1403 IOMMU_PAGES_LIST_INIT(freelist);
1404
1405 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1406 iommu_put_pages_list(&freelist);
1407 }
1408
1409 if (WARN_ON(!list_empty(&domain->devices)))
1410 return;
1411
1412 kfree(domain->qi_batch);
1413 kfree(domain);
1414 }
1415
1416 /*
1417 * For kdump cases, old valid entries may be cached due to the
1418 * in-flight DMA and copied pgtable, but there is no unmapping
1419 * behaviour for them, thus we need an explicit cache flush for
1420 * the newly-mapped device. For kdump, at this point, the device
1421 * is supposed to have finished reset at its driver probe stage, so no
1422 * in-flight DMA will exist, and we don't need to worry about it
1423 * hereafter.
1424 */
1425 static void copied_context_tear_down(struct intel_iommu *iommu,
1426 struct context_entry *context,
1427 u8 bus, u8 devfn)
1428 {
1429 u16 did_old;
1430
1431 if (!context_copied(iommu, bus, devfn))
1432 return;
1433
1434 assert_spin_locked(&iommu->lock);
1435
1436 did_old = context_domain_id(context);
1437 context_clear_entry(context);
1438
1439 if (did_old < cap_ndoms(iommu->cap)) {
1440 iommu->flush.flush_context(iommu, did_old,
1441 PCI_DEVID(bus, devfn),
1442 DMA_CCMD_MASK_NOBIT,
1443 DMA_CCMD_DEVICE_INVL);
1444 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1445 DMA_TLB_DSI_FLUSH);
1446 }
1447
1448 clear_context_copied(iommu, bus, devfn);
1449 }
1450
1451 /*
1452 * It's a non-present to present mapping. If hardware doesn't cache
1453 * non-present entries we only need to flush the write-buffer. If it
1454 * _does_ cache non-present entries, then it does so in the special
1455 * domain #0, which we have to flush:
1456 */
1457 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1458 u8 bus, u8 devfn)
1459 {
1460 if (cap_caching_mode(iommu->cap)) {
1461 iommu->flush.flush_context(iommu, 0,
1462 PCI_DEVID(bus, devfn),
1463 DMA_CCMD_MASK_NOBIT,
1464 DMA_CCMD_DEVICE_INVL);
1465 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1466 } else {
1467 iommu_flush_write_buffer(iommu);
1468 }
1469 }
1470
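/*
 * Install a legacy-mode context entry for (@bus, @devfn) that points to
 * the domain's second-level page table. The device-IOTLB translation
 * type is used when the device supports ATS, multi-level translation
 * otherwise.
 */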
1471 static int domain_context_mapping_one(struct dmar_domain *domain,
1472 struct intel_iommu *iommu,
1473 u8 bus, u8 devfn)
1474 {
1475 struct device_domain_info *info =
1476 domain_lookup_dev_info(domain, iommu, bus, devfn);
1477 u16 did = domain_id_iommu(domain, iommu);
1478 int translation = CONTEXT_TT_MULTI_LEVEL;
1479 struct dma_pte *pgd = domain->pgd;
1480 struct context_entry *context;
1481 int ret;
1482
1483 pr_debug("Set context mapping for %02x:%02x.%d\n",
1484 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1485
1486 spin_lock(&iommu->lock);
1487 ret = -ENOMEM;
1488 context = iommu_context_addr(iommu, bus, devfn, 1);
1489 if (!context)
1490 goto out_unlock;
1491
1492 ret = 0;
1493 if (context_present(context) && !context_copied(iommu, bus, devfn))
1494 goto out_unlock;
1495
1496 copied_context_tear_down(iommu, context, bus, devfn);
1497 context_clear_entry(context);
1498 context_set_domain_id(context, did);
1499
1500 if (info && info->ats_supported)
1501 translation = CONTEXT_TT_DEV_IOTLB;
1502 else
1503 translation = CONTEXT_TT_MULTI_LEVEL;
1504
1505 context_set_address_root(context, virt_to_phys(pgd));
1506 context_set_address_width(context, domain->agaw);
1507 context_set_translation_type(context, translation);
1508 context_set_fault_enable(context);
1509 context_set_present(context);
1510 if (!ecap_coherent(iommu->ecap))
1511 clflush_cache_range(context, sizeof(*context));
1512 context_present_cache_flush(iommu, did, bus, devfn);
1513 ret = 0;
1514
1515 out_unlock:
1516 spin_unlock(&iommu->lock);
1517
1518 return ret;
1519 }
1520
1521 static int domain_context_mapping_cb(struct pci_dev *pdev,
1522 u16 alias, void *opaque)
1523 {
1524 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1525 struct intel_iommu *iommu = info->iommu;
1526 struct dmar_domain *domain = opaque;
1527
1528 return domain_context_mapping_one(domain, iommu,
1529 PCI_BUS_NUM(alias), alias & 0xff);
1530 }
1531
1532 static int
1533 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1534 {
1535 struct device_domain_info *info = dev_iommu_priv_get(dev);
1536 struct intel_iommu *iommu = info->iommu;
1537 u8 bus = info->bus, devfn = info->devfn;
1538 int ret;
1539
1540 if (!dev_is_pci(dev))
1541 return domain_context_mapping_one(domain, iommu, bus, devfn);
1542
1543 ret = pci_for_each_dma_alias(to_pci_dev(dev),
1544 domain_context_mapping_cb, domain);
1545 if (ret)
1546 return ret;
1547
1548 iommu_enable_pci_ats(info);
1549
1550 return 0;
1551 }
1552
1553 /* Return largest possible superpage level for a given mapping */
1554 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1555 unsigned long phy_pfn, unsigned long pages)
1556 {
1557 int support, level = 1;
1558 unsigned long pfnmerge;
1559
1560 support = domain->iommu_superpage;
1561
1562 /* To use a large page, the virtual *and* physical addresses
1563 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1564 of them will mean we have to use smaller pages. So just
1565 merge them and check both at once. */
1566 pfnmerge = iov_pfn | phy_pfn;
1567
1568 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1569 pages >>= VTD_STRIDE_SHIFT;
1570 if (!pages)
1571 break;
1572 pfnmerge >>= VTD_STRIDE_SHIFT;
1573 level++;
1574 support--;
1575 }
1576 return level;
1577 }
1578
1579 /*
1580 * Ensure that old small page tables are removed to make room for superpage(s).
1581 * We're going to add new large pages, so make sure we don't remove their parent
1582 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1583 */
1584 static void switch_to_super_page(struct dmar_domain *domain,
1585 unsigned long start_pfn,
1586 unsigned long end_pfn, int level)
1587 {
1588 unsigned long lvl_pages = lvl_to_nr_pages(level);
1589 struct dma_pte *pte = NULL;
1590
1591 while (start_pfn <= end_pfn) {
1592 if (!pte)
1593 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1594 GFP_ATOMIC);
1595
1596 if (dma_pte_present(pte)) {
1597 dma_pte_free_pagetable(domain, start_pfn,
1598 start_pfn + lvl_pages - 1,
1599 level + 1);
1600
1601 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1602 end_pfn << VTD_PAGE_SHIFT, 0);
1603 }
1604
1605 pte++;
1606 start_pfn += lvl_pages;
1607 if (first_pte_in_page(pte))
1608 pte = NULL;
1609 }
1610 }
1611
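/*
 * Create the PTEs that map @nr_pages pages of IOVA starting at @iov_pfn
 * to the physical range starting at @phys_pfn. 2MiB/1GiB superpages are
 * used whenever both addresses and the remaining size are suitably
 * aligned and the hardware supports them.
 */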
1612 static int
1613 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1614 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1615 gfp_t gfp)
1616 {
1617 struct dma_pte *first_pte = NULL, *pte = NULL;
1618 unsigned int largepage_lvl = 0;
1619 unsigned long lvl_pages = 0;
1620 phys_addr_t pteval;
1621 u64 attr;
1622
1623 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1624 return -EINVAL;
1625
1626 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1627 return -EINVAL;
1628
1629 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1630 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1631 return -EINVAL;
1632 }
1633
1634 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1635 if (domain->use_first_level) {
1636 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1637 if (prot & DMA_PTE_WRITE)
1638 attr |= DMA_FL_PTE_DIRTY;
1639 }
1640
1641 domain->has_mappings = true;
1642
1643 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1644
1645 while (nr_pages > 0) {
1646 uint64_t tmp;
1647
1648 if (!pte) {
1649 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1650 phys_pfn, nr_pages);
1651
1652 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1653 gfp);
1654 if (!pte)
1655 return -ENOMEM;
1656 first_pte = pte;
1657
1658 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1659
1660 /* It is a large page */
1661 if (largepage_lvl > 1) {
1662 unsigned long end_pfn;
1663 unsigned long pages_to_remove;
1664
1665 pteval |= DMA_PTE_LARGE_PAGE;
1666 pages_to_remove = min_t(unsigned long, nr_pages,
1667 nr_pte_to_next_page(pte) * lvl_pages);
1668 end_pfn = iov_pfn + pages_to_remove - 1;
1669 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1670 } else {
1671 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1672 }
1673
1674 }
1675 /* We don't need lock here, nobody else
1676 * touches the iova range
1677 */
1678 tmp = 0ULL;
1679 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1680 static int dumps = 5;
1681 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1682 iov_pfn, tmp, (unsigned long long)pteval);
1683 if (dumps) {
1684 dumps--;
1685 debug_dma_dump_mappings(NULL);
1686 }
1687 WARN_ON(1);
1688 }
1689
1690 nr_pages -= lvl_pages;
1691 iov_pfn += lvl_pages;
1692 phys_pfn += lvl_pages;
1693 pteval += lvl_pages * VTD_PAGE_SIZE;
1694
1695 /* If the next PTE would be the first in a new page, then we
1696 * need to flush the cache on the entries we've just written.
1697 * And then we'll need to recalculate 'pte', so clear it and
1698 * let it get set again in the if (!pte) block above.
1699 *
1700 * If we're done (!nr_pages) we need to flush the cache too.
1701 *
1702 * Also if we've been setting superpages, we may need to
1703 * recalculate 'pte' and switch back to smaller pages for the
1704 * end of the mapping, if the trailing size is not enough to
1705 * use another superpage (i.e. nr_pages < lvl_pages).
1706 */
1707 pte++;
1708 if (!nr_pages || first_pte_in_page(pte) ||
1709 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1710 domain_flush_cache(domain, first_pte,
1711 (void *)pte - (void *)first_pte);
1712 pte = NULL;
1713 }
1714 }
1715
1716 return 0;
1717 }
1718
1719 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1720 {
1721 struct intel_iommu *iommu = info->iommu;
1722 struct context_entry *context;
1723 u16 did;
1724
1725 spin_lock(&iommu->lock);
1726 context = iommu_context_addr(iommu, bus, devfn, 0);
1727 if (!context) {
1728 spin_unlock(&iommu->lock);
1729 return;
1730 }
1731
1732 did = context_domain_id(context);
1733 context_clear_entry(context);
1734 __iommu_flush_cache(iommu, context, sizeof(*context));
1735 spin_unlock(&iommu->lock);
1736 intel_context_flush_no_pasid(info, context, did);
1737 }
1738
1739 int __domain_setup_first_level(struct intel_iommu *iommu,
1740 struct device *dev, ioasid_t pasid,
1741 u16 did, pgd_t *pgd, int flags,
1742 struct iommu_domain *old)
1743 {
1744 if (!old)
1745 return intel_pasid_setup_first_level(iommu, dev, pgd,
1746 pasid, did, flags);
1747 return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1748 iommu_domain_did(old, iommu),
1749 flags);
1750 }
1751
1752 static int domain_setup_second_level(struct intel_iommu *iommu,
1753 struct dmar_domain *domain,
1754 struct device *dev, ioasid_t pasid,
1755 struct iommu_domain *old)
1756 {
1757 if (!old)
1758 return intel_pasid_setup_second_level(iommu, domain,
1759 dev, pasid);
1760 return intel_pasid_replace_second_level(iommu, domain, dev,
1761 iommu_domain_did(old, iommu),
1762 pasid);
1763 }
1764
1765 static int domain_setup_passthrough(struct intel_iommu *iommu,
1766 struct device *dev, ioasid_t pasid,
1767 struct iommu_domain *old)
1768 {
1769 if (!old)
1770 return intel_pasid_setup_pass_through(iommu, dev, pasid);
1771 return intel_pasid_replace_pass_through(iommu, dev,
1772 iommu_domain_did(old, iommu),
1773 pasid);
1774 }
1775
1776 static int domain_setup_first_level(struct intel_iommu *iommu,
1777 struct dmar_domain *domain,
1778 struct device *dev,
1779 u32 pasid, struct iommu_domain *old)
1780 {
1781 struct dma_pte *pgd = domain->pgd;
1782 int level, flags = 0;
1783
1784 level = agaw_to_level(domain->agaw);
1785 if (level != 4 && level != 5)
1786 return -EINVAL;
1787
1788 if (level == 5)
1789 flags |= PASID_FLAG_FL5LP;
1790
1791 if (domain->force_snooping)
1792 flags |= PASID_FLAG_PAGE_SNOOP;
1793
1794 return __domain_setup_first_level(iommu, dev, pasid,
1795 domain_id_iommu(domain, iommu),
1796 (pgd_t *)pgd, flags, old);
1797 }
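/*
 * Illustrative note on the level check above: a 4-level first-stage
 * page table covers a 48-bit input address space and a 5-level table
 * covers 57 bits, which is why PASID_FLAG_FL5LP is set only for
 * level == 5; any other level cannot be expressed in the PASID entry
 * and is rejected with -EINVAL.
 */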
1798
1799 static int dmar_domain_attach_device(struct dmar_domain *domain,
1800 struct device *dev)
1801 {
1802 struct device_domain_info *info = dev_iommu_priv_get(dev);
1803 struct intel_iommu *iommu = info->iommu;
1804 unsigned long flags;
1805 int ret;
1806
1807 ret = domain_attach_iommu(domain, iommu);
1808 if (ret)
1809 return ret;
1810
1811 info->domain = domain;
1812 info->domain_attached = true;
1813 spin_lock_irqsave(&domain->lock, flags);
1814 list_add(&info->link, &domain->devices);
1815 spin_unlock_irqrestore(&domain->lock, flags);
1816
1817 if (dev_is_real_dma_subdevice(dev))
1818 return 0;
1819
1820 if (!sm_supported(iommu))
1821 ret = domain_context_mapping(domain, dev);
1822 else if (domain->use_first_level)
1823 ret = domain_setup_first_level(iommu, domain, dev,
1824 IOMMU_NO_PASID, NULL);
1825 else
1826 ret = domain_setup_second_level(iommu, domain, dev,
1827 IOMMU_NO_PASID, NULL);
1828
1829 if (ret)
1830 goto out_block_translation;
1831
1832 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1833 if (ret)
1834 goto out_block_translation;
1835
1836 return 0;
1837
1838 out_block_translation:
1839 device_block_translation(dev);
1840 return ret;
1841 }
1842
1843 /**
1844 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1845 * is relaxable (ie. is allowed to be not enforced under some conditions)
1846 * @dev: device handle
1847 *
1848 * We assume that PCI USB devices with RMRRs have them largely
1849 * for historical reasons and that the RMRR space is not actively used post
1850 * boot. This exclusion may change if vendors begin to abuse it.
1851 *
1852 * The same exception is made for graphics devices, with the requirement that
1853 * any use of the RMRR regions will be torn down before assigning the device
1854 * to a guest.
1855 *
1856 * Return: true if the RMRR is relaxable, false otherwise
1857 */
1858 static bool device_rmrr_is_relaxable(struct device *dev)
1859 {
1860 struct pci_dev *pdev;
1861
1862 if (!dev_is_pci(dev))
1863 return false;
1864
1865 pdev = to_pci_dev(dev);
1866 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1867 return true;
1868 else
1869 return false;
1870 }
1871
1872 static int device_def_domain_type(struct device *dev)
1873 {
1874 struct device_domain_info *info = dev_iommu_priv_get(dev);
1875 struct intel_iommu *iommu = info->iommu;
1876
1877 /*
1878 * Hardware does not support the passthrough translation mode.
1879 * Always use a dynamic mapping domain.
1880 */
1881 if (!ecap_pass_through(iommu->ecap))
1882 return IOMMU_DOMAIN_DMA;
1883
1884 if (dev_is_pci(dev)) {
1885 struct pci_dev *pdev = to_pci_dev(dev);
1886
1887 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1888 return IOMMU_DOMAIN_IDENTITY;
1889 }
1890
1891 return 0;
1892 }
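/*
 * Illustration (assumed behaviour of the generic def_domain_type
 * contract, not spelled out here): returning IOMMU_DOMAIN_DMA or
 * IOMMU_DOMAIN_IDENTITY forces that domain type for the device, while
 * returning 0 expresses no preference and lets the IOMMU core fall
 * back to the system-wide default domain type.
 */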
1893
1894 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1895 {
1896 /*
1897 * Start from a sane IOMMU hardware state.
1898 * If queued invalidation was already initialized by us
1899 * (for example, while enabling interrupt remapping), then
1900 * things are already rolling from a sane state.
1901 */
1902 if (!iommu->qi) {
1903 /*
1904 * Clear any previous faults.
1905 */
1906 dmar_fault(-1, iommu);
1907 /*
1908 * Disable queued invalidation if supported and already enabled
1909 * before OS handover.
1910 */
1911 dmar_disable_qi(iommu);
1912 }
1913
1914 if (dmar_enable_qi(iommu)) {
1915 /*
1916 * Queued Invalidate not enabled, use Register Based Invalidate
1917 */
1918 iommu->flush.flush_context = __iommu_flush_context;
1919 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1920 pr_info("%s: Using Register based invalidation\n",
1921 iommu->name);
1922 } else {
1923 iommu->flush.flush_context = qi_flush_context;
1924 iommu->flush.flush_iotlb = qi_flush_iotlb;
1925 pr_info("%s: Using Queued invalidation\n", iommu->name);
1926 }
1927 }
1928
1929 static int copy_context_table(struct intel_iommu *iommu,
1930 struct root_entry *old_re,
1931 struct context_entry **tbl,
1932 int bus, bool ext)
1933 {
1934 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1935 struct context_entry *new_ce = NULL, ce;
1936 struct context_entry *old_ce = NULL;
1937 struct root_entry re;
1938 phys_addr_t old_ce_phys;
1939
1940 tbl_idx = ext ? bus * 2 : bus;
1941 memcpy(&re, old_re, sizeof(re));
1942
1943 for (devfn = 0; devfn < 256; devfn++) {
1944 /* First calculate the correct index */
1945 idx = (ext ? devfn * 2 : devfn) % 256;
1946
1947 if (idx == 0) {
1948 /* First save what we may have and clean up */
1949 if (new_ce) {
1950 tbl[tbl_idx] = new_ce;
1951 __iommu_flush_cache(iommu, new_ce,
1952 VTD_PAGE_SIZE);
1953 pos = 1;
1954 }
1955
1956 if (old_ce)
1957 memunmap(old_ce);
1958
1959 ret = 0;
1960 if (devfn < 0x80)
1961 old_ce_phys = root_entry_lctp(&re);
1962 else
1963 old_ce_phys = root_entry_uctp(&re);
1964
1965 if (!old_ce_phys) {
1966 if (ext && devfn == 0) {
1967 /* No LCTP, try UCTP */
1968 devfn = 0x7f;
1969 continue;
1970 } else {
1971 goto out;
1972 }
1973 }
1974
1975 ret = -ENOMEM;
1976 old_ce = memremap(old_ce_phys, PAGE_SIZE,
1977 MEMREMAP_WB);
1978 if (!old_ce)
1979 goto out;
1980
1981 new_ce = iommu_alloc_pages_node_sz(iommu->node,
1982 GFP_KERNEL, SZ_4K);
1983 if (!new_ce)
1984 goto out_unmap;
1985
1986 ret = 0;
1987 }
1988
1989 /* Now copy the context entry */
1990 memcpy(&ce, old_ce + idx, sizeof(ce));
1991
1992 if (!context_present(&ce))
1993 continue;
1994
1995 did = context_domain_id(&ce);
1996 if (did >= 0 && did < cap_ndoms(iommu->cap))
1997 ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
1998
1999 set_context_copied(iommu, bus, devfn);
2000 new_ce[idx] = ce;
2001 }
2002
2003 tbl[tbl_idx + pos] = new_ce;
2004
2005 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2006
2007 out_unmap:
2008 memunmap(old_ce);
2009
2010 out:
2011 return ret;
2012 }
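/*
 * Worked example of the index math above: in extended mode (ext), each
 * bus gets two context tables, so tbl_idx = bus * 2 and the per-entry
 * index is (devfn * 2) % 256. Devfns 0x00-0x7f land in the lower table
 * referenced by root_entry_lctp(), while devfns 0x80-0xff wrap around
 * into the upper table referenced by root_entry_uctp(); e.g. devfn 0x85
 * becomes entry (0x85 * 2) % 256 = 0x0a of the upper table
 * (tbl_idx + 1).
 */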
2013
2014 static int copy_translation_tables(struct intel_iommu *iommu)
2015 {
2016 struct context_entry **ctxt_tbls;
2017 struct root_entry *old_rt;
2018 phys_addr_t old_rt_phys;
2019 int ctxt_table_entries;
2020 u64 rtaddr_reg;
2021 int bus, ret;
2022 bool new_ext, ext;
2023
2024 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2025 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2026 new_ext = !!sm_supported(iommu);
2027
2028 /*
2029 * The RTT bit can only be changed when translation is disabled,
2030 * but disabling translation means to open a window for data
2031 * corruption. So bail out and don't copy anything if we would
2032 * have to change the bit.
2033 */
2034 if (new_ext != ext)
2035 return -EINVAL;
2036
2037 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2038 if (!iommu->copied_tables)
2039 return -ENOMEM;
2040
2041 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2042 if (!old_rt_phys)
2043 return -EINVAL;
2044
2045 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2046 if (!old_rt)
2047 return -ENOMEM;
2048
2049 /* This is too big for the stack - allocate it from slab */
2050 ctxt_table_entries = ext ? 512 : 256;
2051 ret = -ENOMEM;
2052 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2053 if (!ctxt_tbls)
2054 goto out_unmap;
2055
2056 for (bus = 0; bus < 256; bus++) {
2057 ret = copy_context_table(iommu, &old_rt[bus],
2058 ctxt_tbls, bus, ext);
2059 if (ret) {
2060 pr_err("%s: Failed to copy context table for bus %d\n",
2061 iommu->name, bus);
2062 continue;
2063 }
2064 }
2065
2066 spin_lock(&iommu->lock);
2067
2068 /* Context tables are copied, now write them to the root_entry table */
2069 for (bus = 0; bus < 256; bus++) {
2070 int idx = ext ? bus * 2 : bus;
2071 u64 val;
2072
2073 if (ctxt_tbls[idx]) {
2074 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2075 iommu->root_entry[bus].lo = val;
2076 }
2077
2078 if (!ext || !ctxt_tbls[idx + 1])
2079 continue;
2080
2081 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2082 iommu->root_entry[bus].hi = val;
2083 }
2084
2085 spin_unlock(&iommu->lock);
2086
2087 kfree(ctxt_tbls);
2088
2089 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2090
2091 ret = 0;
2092
2093 out_unmap:
2094 memunmap(old_rt);
2095
2096 return ret;
2097 }
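/*
 * Illustration of the root-entry update above: bit 0 of both the 'lo'
 * and 'hi' words is the Present bit, so "virt_to_phys(table) | 1"
 * installs the copied context table and marks the entry present in a
 * single 64-bit store; 'hi' is only populated in the extended/scalable
 * root-entry format, where it points to the upper context table.
 */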
2098
2099 static int __init init_dmars(void)
2100 {
2101 struct dmar_drhd_unit *drhd;
2102 struct intel_iommu *iommu;
2103 int ret;
2104
2105 for_each_iommu(iommu, drhd) {
2106 if (drhd->ignored) {
2107 iommu_disable_translation(iommu);
2108 continue;
2109 }
2110
2111 /*
2112 * Find the smallest maximum PASID size supported by any IOMMU
2113 * in the system; the system-wide PASID table must be no bigger
2114 * than the smallest size any IOMMU supports.
2115 */
2116 if (pasid_supported(iommu)) {
2117 u32 temp = 2 << ecap_pss(iommu->ecap);
2118
2119 intel_pasid_max_id = min_t(u32, temp,
2120 intel_pasid_max_id);
2121 }
2122
2123 intel_iommu_init_qi(iommu);
2124 init_translation_status(iommu);
2125
2126 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2127 iommu_disable_translation(iommu);
2128 clear_translation_pre_enabled(iommu);
2129 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2130 iommu->name);
2131 }
2132
2133 /*
2134 * TBD:
2135 * we could share the same root & context tables
2136 * among all IOMMUs; this needs to be split out later.
2137 */
2138 ret = iommu_alloc_root_entry(iommu);
2139 if (ret)
2140 goto free_iommu;
2141
2142 if (translation_pre_enabled(iommu)) {
2143 pr_info("Translation already enabled - trying to copy translation structures\n");
2144
2145 ret = copy_translation_tables(iommu);
2146 if (ret) {
2147 /*
2148 * We found the IOMMU with translation
2149 * enabled - but failed to copy over the
2150 * old root-entry table. Try to proceed
2151 * by disabling translation now and
2152 * allocating a clean root-entry table.
2153 * This might cause DMAR faults, but
2154 * probably the dump will still succeed.
2155 */
2156 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2157 iommu->name);
2158 iommu_disable_translation(iommu);
2159 clear_translation_pre_enabled(iommu);
2160 } else {
2161 pr_info("Copied translation tables from previous kernel for %s\n",
2162 iommu->name);
2163 }
2164 }
2165
2166 intel_svm_check(iommu);
2167 }
2168
2169 /*
2170 * Now that qi is enabled on all iommus, set the root entry and flush
2171 * caches. This is required on some Intel X58 chipsets, otherwise the
2172 * flush_context function will loop forever and the boot hangs.
2173 */
2174 for_each_active_iommu(iommu, drhd) {
2175 iommu_flush_write_buffer(iommu);
2176 iommu_set_root_entry(iommu);
2177 }
2178
2179 check_tylersburg_isoch();
2180
2181 /*
2182 * for each drhd
2183 * enable fault log
2184 * global invalidate context cache
2185 * global invalidate iotlb
2186 * enable translation
2187 */
2188 for_each_iommu(iommu, drhd) {
2189 if (drhd->ignored) {
2190 /*
2191 * we always have to disable PMRs or DMA may fail on
2192 * this device
2193 */
2194 if (force_on)
2195 iommu_disable_protect_mem_regions(iommu);
2196 continue;
2197 }
2198
2199 iommu_flush_write_buffer(iommu);
2200
2201 if (ecap_prs(iommu->ecap)) {
2202 /*
2203 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2204 * could cause a lock ordering problem, so drop the lock around it.
2205 */
2206 up_write(&dmar_global_lock);
2207 ret = intel_iommu_enable_prq(iommu);
2208 down_write(&dmar_global_lock);
2209 if (ret)
2210 goto free_iommu;
2211 }
2212
2213 ret = dmar_set_interrupt(iommu);
2214 if (ret)
2215 goto free_iommu;
2216 }
2217
2218 return 0;
2219
2220 free_iommu:
2221 for_each_active_iommu(iommu, drhd) {
2222 disable_dmar_iommu(iommu);
2223 free_dmar_iommu(iommu);
2224 }
2225
2226 return ret;
2227 }
2228
2229 static void __init init_no_remapping_devices(void)
2230 {
2231 struct dmar_drhd_unit *drhd;
2232 struct device *dev;
2233 int i;
2234
2235 for_each_drhd_unit(drhd) {
2236 if (!drhd->include_all) {
2237 for_each_active_dev_scope(drhd->devices,
2238 drhd->devices_cnt, i, dev)
2239 break;
2240 /* ignore DMAR unit if no devices exist */
2241 if (i == drhd->devices_cnt)
2242 drhd->ignored = 1;
2243 }
2244 }
2245
2246 for_each_active_drhd_unit(drhd) {
2247 if (drhd->include_all)
2248 continue;
2249
2250 for_each_active_dev_scope(drhd->devices,
2251 drhd->devices_cnt, i, dev)
2252 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2253 break;
2254 if (i < drhd->devices_cnt)
2255 continue;
2256
2257 /* This IOMMU has *only* gfx devices. Mark it as dedicated to
2258 graphics and, if the integrated graphics IOMMU is disabled, bypass it. */
2259 drhd->gfx_dedicated = 1;
2260 if (disable_igfx_iommu)
2261 drhd->ignored = 1;
2262 }
2263 }
2264
2265 #ifdef CONFIG_SUSPEND
2266 static int init_iommu_hw(void)
2267 {
2268 struct dmar_drhd_unit *drhd;
2269 struct intel_iommu *iommu = NULL;
2270 int ret;
2271
2272 for_each_active_iommu(iommu, drhd) {
2273 if (iommu->qi) {
2274 ret = dmar_reenable_qi(iommu);
2275 if (ret)
2276 return ret;
2277 }
2278 }
2279
2280 for_each_iommu(iommu, drhd) {
2281 if (drhd->ignored) {
2282 /*
2283 * we always have to disable PMRs or DMA may fail on
2284 * this device
2285 */
2286 if (force_on)
2287 iommu_disable_protect_mem_regions(iommu);
2288 continue;
2289 }
2290
2291 iommu_flush_write_buffer(iommu);
2292 iommu_set_root_entry(iommu);
2293 iommu_enable_translation(iommu);
2294 iommu_disable_protect_mem_regions(iommu);
2295 }
2296
2297 return 0;
2298 }
2299
2300 static void iommu_flush_all(void)
2301 {
2302 struct dmar_drhd_unit *drhd;
2303 struct intel_iommu *iommu;
2304
2305 for_each_active_iommu(iommu, drhd) {
2306 iommu->flush.flush_context(iommu, 0, 0, 0,
2307 DMA_CCMD_GLOBAL_INVL);
2308 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2309 DMA_TLB_GLOBAL_FLUSH);
2310 }
2311 }
2312
2313 static int iommu_suspend(void)
2314 {
2315 struct dmar_drhd_unit *drhd;
2316 struct intel_iommu *iommu = NULL;
2317 unsigned long flag;
2318
2319 iommu_flush_all();
2320
2321 for_each_active_iommu(iommu, drhd) {
2322 iommu_disable_translation(iommu);
2323
2324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2325
2326 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2327 readl(iommu->reg + DMAR_FECTL_REG);
2328 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2329 readl(iommu->reg + DMAR_FEDATA_REG);
2330 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2331 readl(iommu->reg + DMAR_FEADDR_REG);
2332 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2333 readl(iommu->reg + DMAR_FEUADDR_REG);
2334
2335 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2336 }
2337 return 0;
2338 }
2339
2340 static void iommu_resume(void)
2341 {
2342 struct dmar_drhd_unit *drhd;
2343 struct intel_iommu *iommu = NULL;
2344 unsigned long flag;
2345
2346 if (init_iommu_hw()) {
2347 if (force_on)
2348 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2349 else
2350 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2351 return;
2352 }
2353
2354 for_each_active_iommu(iommu, drhd) {
2355
2356 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2357
2358 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2359 iommu->reg + DMAR_FECTL_REG);
2360 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2361 iommu->reg + DMAR_FEDATA_REG);
2362 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2363 iommu->reg + DMAR_FEADDR_REG);
2364 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2365 iommu->reg + DMAR_FEUADDR_REG);
2366
2367 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2368 }
2369 }
2370
2371 static struct syscore_ops iommu_syscore_ops = {
2372 .resume = iommu_resume,
2373 .suspend = iommu_suspend,
2374 };
2375
2376 static void __init init_iommu_pm_ops(void)
2377 {
2378 register_syscore_ops(&iommu_syscore_ops);
2379 }
2380
2381 #else
2382 static inline void init_iommu_pm_ops(void) {}
2383 #endif /* CONFIG_SUSPEND */
2384
2385 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2386 {
2387 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2388 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2389 rmrr->end_address <= rmrr->base_address ||
2390 arch_rmrr_sanity_check(rmrr))
2391 return -EINVAL;
2392
2393 return 0;
2394 }
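/*
 * Worked example for the check above (addresses are made up for
 * illustration): an RMRR with base 0x7c000000 and end 0x7cffffff
 * passes, because the base is page aligned, end + 1 (0x7d000000) is
 * page aligned, and end > base. A firmware table that reported
 * end == 0x7d000000 instead of the last byte of the region would be
 * rejected here and flagged as a firmware bug by the caller.
 */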
2395
2396 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2397 {
2398 struct acpi_dmar_reserved_memory *rmrr;
2399 struct dmar_rmrr_unit *rmrru;
2400
2401 rmrr = (struct acpi_dmar_reserved_memory *)header;
2402 if (rmrr_sanity_check(rmrr)) {
2403 pr_warn(FW_BUG
2404 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2405 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2406 rmrr->base_address, rmrr->end_address,
2407 dmi_get_system_info(DMI_BIOS_VENDOR),
2408 dmi_get_system_info(DMI_BIOS_VERSION),
2409 dmi_get_system_info(DMI_PRODUCT_VERSION));
2410 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2411 }
2412
2413 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2414 if (!rmrru)
2415 goto out;
2416
2417 rmrru->hdr = header;
2418
2419 rmrru->base_address = rmrr->base_address;
2420 rmrru->end_address = rmrr->end_address;
2421
2422 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2423 ((void *)rmrr) + rmrr->header.length,
2424 &rmrru->devices_cnt);
2425 if (rmrru->devices_cnt && rmrru->devices == NULL)
2426 goto free_rmrru;
2427
2428 list_add(&rmrru->list, &dmar_rmrr_units);
2429
2430 return 0;
2431 free_rmrru:
2432 kfree(rmrru);
2433 out:
2434 return -ENOMEM;
2435 }
2436
2437 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2438 {
2439 struct dmar_atsr_unit *atsru;
2440 struct acpi_dmar_atsr *tmp;
2441
2442 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2443 dmar_rcu_check()) {
2444 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2445 if (atsr->segment != tmp->segment)
2446 continue;
2447 if (atsr->header.length != tmp->header.length)
2448 continue;
2449 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2450 return atsru;
2451 }
2452
2453 return NULL;
2454 }
2455
2456 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2457 {
2458 struct acpi_dmar_atsr *atsr;
2459 struct dmar_atsr_unit *atsru;
2460
2461 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2462 return 0;
2463
2464 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2465 atsru = dmar_find_atsr(atsr);
2466 if (atsru)
2467 return 0;
2468
2469 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2470 if (!atsru)
2471 return -ENOMEM;
2472
2473 /*
2474 * If memory is allocated from slab by ACPI _DSM method, we need to
2475 * copy the memory content because the memory buffer will be freed
2476 * on return.
2477 */
2478 atsru->hdr = (void *)(atsru + 1);
2479 memcpy(atsru->hdr, hdr, hdr->length);
2480 atsru->include_all = atsr->flags & 0x1;
2481 if (!atsru->include_all) {
2482 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2483 (void *)atsr + atsr->header.length,
2484 &atsru->devices_cnt);
2485 if (atsru->devices_cnt && atsru->devices == NULL) {
2486 kfree(atsru);
2487 return -ENOMEM;
2488 }
2489 }
2490
2491 list_add_rcu(&atsru->list, &dmar_atsr_units);
2492
2493 return 0;
2494 }
2495
2496 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2497 {
2498 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2499 kfree(atsru);
2500 }
2501
2502 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2503 {
2504 struct acpi_dmar_atsr *atsr;
2505 struct dmar_atsr_unit *atsru;
2506
2507 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2508 atsru = dmar_find_atsr(atsr);
2509 if (atsru) {
2510 list_del_rcu(&atsru->list);
2511 synchronize_rcu();
2512 intel_iommu_free_atsr(atsru);
2513 }
2514
2515 return 0;
2516 }
2517
2518 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2519 {
2520 int i;
2521 struct device *dev;
2522 struct acpi_dmar_atsr *atsr;
2523 struct dmar_atsr_unit *atsru;
2524
2525 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2526 atsru = dmar_find_atsr(atsr);
2527 if (!atsru)
2528 return 0;
2529
2530 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2531 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2532 i, dev)
2533 return -EBUSY;
2534 }
2535
2536 return 0;
2537 }
2538
2539 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2540 {
2541 struct dmar_satc_unit *satcu;
2542 struct acpi_dmar_satc *tmp;
2543
2544 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2545 dmar_rcu_check()) {
2546 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2547 if (satc->segment != tmp->segment)
2548 continue;
2549 if (satc->header.length != tmp->header.length)
2550 continue;
2551 if (memcmp(satc, tmp, satc->header.length) == 0)
2552 return satcu;
2553 }
2554
2555 return NULL;
2556 }
2557
2558 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2559 {
2560 struct acpi_dmar_satc *satc;
2561 struct dmar_satc_unit *satcu;
2562
2563 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2564 return 0;
2565
2566 satc = container_of(hdr, struct acpi_dmar_satc, header);
2567 satcu = dmar_find_satc(satc);
2568 if (satcu)
2569 return 0;
2570
2571 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2572 if (!satcu)
2573 return -ENOMEM;
2574
2575 satcu->hdr = (void *)(satcu + 1);
2576 memcpy(satcu->hdr, hdr, hdr->length);
2577 satcu->atc_required = satc->flags & 0x1;
2578 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2579 (void *)satc + satc->header.length,
2580 &satcu->devices_cnt);
2581 if (satcu->devices_cnt && !satcu->devices) {
2582 kfree(satcu);
2583 return -ENOMEM;
2584 }
2585 list_add_rcu(&satcu->list, &dmar_satc_units);
2586
2587 return 0;
2588 }
2589
2590 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2591 {
2592 struct intel_iommu *iommu = dmaru->iommu;
2593 int ret;
2594
2595 /*
2596 * Disable translation if already enabled prior to OS handover.
2597 */
2598 if (iommu->gcmd & DMA_GCMD_TE)
2599 iommu_disable_translation(iommu);
2600
2601 ret = iommu_alloc_root_entry(iommu);
2602 if (ret)
2603 goto out;
2604
2605 intel_svm_check(iommu);
2606
2607 if (dmaru->ignored) {
2608 /*
2609 * we always have to disable PMRs or DMA may fail on this device
2610 */
2611 if (force_on)
2612 iommu_disable_protect_mem_regions(iommu);
2613 return 0;
2614 }
2615
2616 intel_iommu_init_qi(iommu);
2617 iommu_flush_write_buffer(iommu);
2618
2619 if (ecap_prs(iommu->ecap)) {
2620 ret = intel_iommu_enable_prq(iommu);
2621 if (ret)
2622 goto disable_iommu;
2623 }
2624
2625 ret = dmar_set_interrupt(iommu);
2626 if (ret)
2627 goto disable_iommu;
2628
2629 iommu_set_root_entry(iommu);
2630 iommu_enable_translation(iommu);
2631
2632 iommu_disable_protect_mem_regions(iommu);
2633 return 0;
2634
2635 disable_iommu:
2636 disable_dmar_iommu(iommu);
2637 out:
2638 free_dmar_iommu(iommu);
2639 return ret;
2640 }
2641
2642 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2643 {
2644 int ret = 0;
2645 struct intel_iommu *iommu = dmaru->iommu;
2646
2647 if (!intel_iommu_enabled)
2648 return 0;
2649 if (iommu == NULL)
2650 return -EINVAL;
2651
2652 if (insert) {
2653 ret = intel_iommu_add(dmaru);
2654 } else {
2655 disable_dmar_iommu(iommu);
2656 free_dmar_iommu(iommu);
2657 }
2658
2659 return ret;
2660 }
2661
2662 static void intel_iommu_free_dmars(void)
2663 {
2664 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2665 struct dmar_atsr_unit *atsru, *atsr_n;
2666 struct dmar_satc_unit *satcu, *satc_n;
2667
2668 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2669 list_del(&rmrru->list);
2670 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2671 kfree(rmrru);
2672 }
2673
2674 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2675 list_del(&atsru->list);
2676 intel_iommu_free_atsr(atsru);
2677 }
2678 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2679 list_del(&satcu->list);
2680 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2681 kfree(satcu);
2682 }
2683 }
2684
2685 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2686 {
2687 struct dmar_satc_unit *satcu;
2688 struct acpi_dmar_satc *satc;
2689 struct device *tmp;
2690 int i;
2691
2692 rcu_read_lock();
2693
2694 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2695 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2696 if (satc->segment != pci_domain_nr(dev->bus))
2697 continue;
2698 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2699 if (to_pci_dev(tmp) == dev)
2700 goto out;
2701 }
2702 satcu = NULL;
2703 out:
2704 rcu_read_unlock();
2705 return satcu;
2706 }
2707
2708 static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2709 {
2710 struct pci_dev *bridge = NULL;
2711 struct dmar_atsr_unit *atsru;
2712 struct dmar_satc_unit *satcu;
2713 struct acpi_dmar_atsr *atsr;
2714 bool supported = true;
2715 struct pci_bus *bus;
2716 struct device *tmp;
2717 int i;
2718
2719 dev = pci_physfn(dev);
2720 satcu = dmar_find_matched_satc_unit(dev);
2721 if (satcu)
2722 /*
2723 * This device supports ATS because it is listed in a SATC table.
2724 * When the IOMMU is in legacy mode, ATS is enabled
2725 * automatically by the hardware for any device that requires
2726 * it, so the OS should not enable ATS on the device as well,
2727 * to avoid duplicated TLB invalidations.
2728 */
2729 return !(satcu->atc_required && !sm_supported(iommu));
2730
2731 for (bus = dev->bus; bus; bus = bus->parent) {
2732 bridge = bus->self;
2733 /* If it's an integrated device, allow ATS */
2734 if (!bridge)
2735 return true;
2736 /* Connected via non-PCIe: no ATS */
2737 if (!pci_is_pcie(bridge) ||
2738 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2739 return false;
2740 /* If we found the root port, look it up in the ATSR */
2741 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2742 break;
2743 }
2744
2745 rcu_read_lock();
2746 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2747 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2748 if (atsr->segment != pci_domain_nr(dev->bus))
2749 continue;
2750
2751 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2752 if (tmp == &bridge->dev)
2753 goto out;
2754
2755 if (atsru->include_all)
2756 goto out;
2757 }
2758 supported = false;
2759 out:
2760 rcu_read_unlock();
2761
2762 return supported;
2763 }
2764
2765 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2766 {
2767 int ret;
2768 struct dmar_rmrr_unit *rmrru;
2769 struct dmar_atsr_unit *atsru;
2770 struct dmar_satc_unit *satcu;
2771 struct acpi_dmar_atsr *atsr;
2772 struct acpi_dmar_reserved_memory *rmrr;
2773 struct acpi_dmar_satc *satc;
2774
2775 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2776 return 0;
2777
2778 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2779 rmrr = container_of(rmrru->hdr,
2780 struct acpi_dmar_reserved_memory, header);
2781 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2782 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2783 ((void *)rmrr) + rmrr->header.length,
2784 rmrr->segment, rmrru->devices,
2785 rmrru->devices_cnt);
2786 if (ret < 0)
2787 return ret;
2788 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2789 dmar_remove_dev_scope(info, rmrr->segment,
2790 rmrru->devices, rmrru->devices_cnt);
2791 }
2792 }
2793
2794 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2795 if (atsru->include_all)
2796 continue;
2797
2798 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2799 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2800 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2801 (void *)atsr + atsr->header.length,
2802 atsr->segment, atsru->devices,
2803 atsru->devices_cnt);
2804 if (ret > 0)
2805 break;
2806 else if (ret < 0)
2807 return ret;
2808 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2809 if (dmar_remove_dev_scope(info, atsr->segment,
2810 atsru->devices, atsru->devices_cnt))
2811 break;
2812 }
2813 }
2814 list_for_each_entry(satcu, &dmar_satc_units, list) {
2815 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2816 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2817 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2818 (void *)satc + satc->header.length,
2819 satc->segment, satcu->devices,
2820 satcu->devices_cnt);
2821 if (ret > 0)
2822 break;
2823 else if (ret < 0)
2824 return ret;
2825 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2826 if (dmar_remove_dev_scope(info, satc->segment,
2827 satcu->devices, satcu->devices_cnt))
2828 break;
2829 }
2830 }
2831
2832 return 0;
2833 }
2834
2835 static void intel_disable_iommus(void)
2836 {
2837 struct intel_iommu *iommu = NULL;
2838 struct dmar_drhd_unit *drhd;
2839
2840 for_each_iommu(iommu, drhd)
2841 iommu_disable_translation(iommu);
2842 }
2843
2844 void intel_iommu_shutdown(void)
2845 {
2846 struct dmar_drhd_unit *drhd;
2847 struct intel_iommu *iommu = NULL;
2848
2849 if (no_iommu || dmar_disabled)
2850 return;
2851
2852 /*
2853 * All other CPUs were brought down, hotplug interrupts were disabled,
2854 * so no locking or RCU checking is needed anymore
2855 */
2856 list_for_each_entry(drhd, &dmar_drhd_units, list) {
2857 iommu = drhd->iommu;
2858
2859 /* Disable PMRs explicitly here. */
2860 iommu_disable_protect_mem_regions(iommu);
2861
2862 /* Make sure the IOMMUs are switched off */
2863 iommu_disable_translation(iommu);
2864 }
2865 }
2866
2867 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2868 {
2869 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2870
2871 return container_of(iommu_dev, struct intel_iommu, iommu);
2872 }
2873
2874 static ssize_t version_show(struct device *dev,
2875 struct device_attribute *attr, char *buf)
2876 {
2877 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2878 u32 ver = readl(iommu->reg + DMAR_VER_REG);
2879 return sysfs_emit(buf, "%d:%d\n",
2880 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2881 }
2882 static DEVICE_ATTR_RO(version);
2883
2884 static ssize_t address_show(struct device *dev,
2885 struct device_attribute *attr, char *buf)
2886 {
2887 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2888 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2889 }
2890 static DEVICE_ATTR_RO(address);
2891
2892 static ssize_t cap_show(struct device *dev,
2893 struct device_attribute *attr, char *buf)
2894 {
2895 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2896 return sysfs_emit(buf, "%llx\n", iommu->cap);
2897 }
2898 static DEVICE_ATTR_RO(cap);
2899
2900 static ssize_t ecap_show(struct device *dev,
2901 struct device_attribute *attr, char *buf)
2902 {
2903 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2904 return sysfs_emit(buf, "%llx\n", iommu->ecap);
2905 }
2906 static DEVICE_ATTR_RO(ecap);
2907
2908 static ssize_t domains_supported_show(struct device *dev,
2909 struct device_attribute *attr, char *buf)
2910 {
2911 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2912 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2913 }
2914 static DEVICE_ATTR_RO(domains_supported);
2915
2916 static ssize_t domains_used_show(struct device *dev,
2917 struct device_attribute *attr, char *buf)
2918 {
2919 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2920 unsigned int count = 0;
2921 int id;
2922
2923 for (id = 0; id < cap_ndoms(iommu->cap); id++)
2924 if (ida_exists(&iommu->domain_ida, id))
2925 count++;
2926
2927 return sysfs_emit(buf, "%d\n", count);
2928 }
2929 static DEVICE_ATTR_RO(domains_used);
2930
2931 static struct attribute *intel_iommu_attrs[] = {
2932 &dev_attr_version.attr,
2933 &dev_attr_address.attr,
2934 &dev_attr_cap.attr,
2935 &dev_attr_ecap.attr,
2936 &dev_attr_domains_supported.attr,
2937 &dev_attr_domains_used.attr,
2938 NULL,
2939 };
2940
2941 static struct attribute_group intel_iommu_group = {
2942 .name = "intel-iommu",
2943 .attrs = intel_iommu_attrs,
2944 };
2945
2946 const struct attribute_group *intel_iommu_groups[] = {
2947 &intel_iommu_group,
2948 NULL,
2949 };
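/*
 * Illustration (the exact path layout is an assumption based on how
 * iommu_device_sysfs_add() is called later in this file): the
 * attributes above typically show up under
 * /sys/class/iommu/<iommu-name>/intel-iommu/, e.g.
 *
 *	# cat /sys/class/iommu/dmar0/intel-iommu/version
 *	1:0
 *
 * where "dmar0" and the reported version are example values.
 */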
2950
2951 static bool has_external_pci(void)
2952 {
2953 struct pci_dev *pdev = NULL;
2954
2955 for_each_pci_dev(pdev)
2956 if (pdev->external_facing) {
2957 pci_dev_put(pdev);
2958 return true;
2959 }
2960
2961 return false;
2962 }
2963
2964 static int __init platform_optin_force_iommu(void)
2965 {
2966 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2967 return 0;
2968
2969 if (no_iommu || dmar_disabled)
2970 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2971
2972 /*
2973 * If Intel-IOMMU is disabled by default, we will apply identity
2974 * map for all devices except those marked as being untrusted.
2975 */
2976 if (dmar_disabled)
2977 iommu_set_default_passthrough(false);
2978
2979 dmar_disabled = 0;
2980 no_iommu = 0;
2981
2982 return 1;
2983 }
2984
2985 static int __init probe_acpi_namespace_devices(void)
2986 {
2987 struct dmar_drhd_unit *drhd;
2988 /* To avoid a -Wunused-but-set-variable warning. */
2989 struct intel_iommu *iommu __maybe_unused;
2990 struct device *dev;
2991 int i, ret = 0;
2992
2993 for_each_active_iommu(iommu, drhd) {
2994 for_each_active_dev_scope(drhd->devices,
2995 drhd->devices_cnt, i, dev) {
2996 struct acpi_device_physical_node *pn;
2997 struct acpi_device *adev;
2998
2999 if (dev->bus != &acpi_bus_type)
3000 continue;
3001
3002 up_read(&dmar_global_lock);
3003 adev = to_acpi_device(dev);
3004 mutex_lock(&adev->physical_node_lock);
3005 list_for_each_entry(pn,
3006 &adev->physical_node_list, node) {
3007 ret = iommu_probe_device(pn->dev);
3008 if (ret)
3009 break;
3010 }
3011 mutex_unlock(&adev->physical_node_lock);
3012 down_read(&dmar_global_lock);
3013
3014 if (ret)
3015 return ret;
3016 }
3017 }
3018
3019 return 0;
3020 }
3021
3022 static __init int tboot_force_iommu(void)
3023 {
3024 if (!tboot_enabled())
3025 return 0;
3026
3027 if (no_iommu || dmar_disabled)
3028 pr_warn("Forcing Intel-IOMMU to enabled\n");
3029
3030 dmar_disabled = 0;
3031 no_iommu = 0;
3032
3033 return 1;
3034 }
3035
3036 int __init intel_iommu_init(void)
3037 {
3038 int ret = -ENODEV;
3039 struct dmar_drhd_unit *drhd;
3040 struct intel_iommu *iommu;
3041
3042 /*
3043 * Intel IOMMU is required for a TXT/tboot launch or platform
3044 * opt in, so enforce that.
3045 */
3046 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3047 platform_optin_force_iommu();
3048
3049 down_write(&dmar_global_lock);
3050 if (dmar_table_init()) {
3051 if (force_on)
3052 panic("tboot: Failed to initialize DMAR table\n");
3053 goto out_free_dmar;
3054 }
3055
3056 if (dmar_dev_scope_init() < 0) {
3057 if (force_on)
3058 panic("tboot: Failed to initialize DMAR device scope\n");
3059 goto out_free_dmar;
3060 }
3061
3062 up_write(&dmar_global_lock);
3063
3064 /*
3065 * The bus notifier takes the dmar_global_lock, so lockdep will
3066 * complain later when we register it under the lock.
3067 */
3068 dmar_register_bus_notifier();
3069
3070 down_write(&dmar_global_lock);
3071
3072 if (!no_iommu)
3073 intel_iommu_debugfs_init();
3074
3075 if (no_iommu || dmar_disabled) {
3076 /*
3077 * We exit the function here to ensure IOMMU's remapping and
3078 * mempool aren't setup, which means that the IOMMU's PMRs
3079 * won't be disabled via the call to init_dmars(). So disable
3080 * it explicitly here. The PMRs were setup by tboot prior to
3081 * calling SENTER, but the kernel is expected to reset/tear
3082 * down the PMRs.
3083 */
3084 if (intel_iommu_tboot_noforce) {
3085 for_each_iommu(iommu, drhd)
3086 iommu_disable_protect_mem_regions(iommu);
3087 }
3088
3089 /*
3090 * Make sure the IOMMUs are switched off, even when we
3091 * boot into a kexec kernel and the previous kernel left
3092 * them enabled
3093 */
3094 intel_disable_iommus();
3095 goto out_free_dmar;
3096 }
3097
3098 if (list_empty(&dmar_rmrr_units))
3099 pr_info("No RMRR found\n");
3100
3101 if (list_empty(&dmar_atsr_units))
3102 pr_info("No ATSR found\n");
3103
3104 if (list_empty(&dmar_satc_units))
3105 pr_info("No SATC found\n");
3106
3107 init_no_remapping_devices();
3108
3109 ret = init_dmars();
3110 if (ret) {
3111 if (force_on)
3112 panic("tboot: Failed to initialize DMARs\n");
3113 pr_err("Initialization failed\n");
3114 goto out_free_dmar;
3115 }
3116 up_write(&dmar_global_lock);
3117
3118 init_iommu_pm_ops();
3119
3120 down_read(&dmar_global_lock);
3121 for_each_active_iommu(iommu, drhd) {
3122 /*
3123 * The flush queue implementation does not perform
3124 * page-selective invalidations that are required for efficient
3125 * TLB flushes in virtual environments. The benefit of batching
3126 * is likely to be much lower than the overhead of synchronizing
3127 * the virtual and physical IOMMU page-tables.
3128 */
3129 if (cap_caching_mode(iommu->cap) &&
3130 !first_level_by_default(iommu)) {
3131 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3132 iommu_set_dma_strict();
3133 }
3134 iommu_device_sysfs_add(&iommu->iommu, NULL,
3135 intel_iommu_groups,
3136 "%s", iommu->name);
3137 /*
3138 * The iommu device probe is protected by the iommu_probe_device_lock.
3139 * Release the dmar_global_lock before entering the device probe path
3140 * to avoid unnecessary lock order splat.
3141 */
3142 up_read(&dmar_global_lock);
3143 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3144 down_read(&dmar_global_lock);
3145
3146 iommu_pmu_register(iommu);
3147 }
3148
3149 if (probe_acpi_namespace_devices())
3150 pr_warn("ACPI name space devices didn't probe correctly\n");
3151
3152 /* Finally, we enable the DMA remapping hardware. */
3153 for_each_iommu(iommu, drhd) {
3154 if (!drhd->ignored && !translation_pre_enabled(iommu))
3155 iommu_enable_translation(iommu);
3156
3157 iommu_disable_protect_mem_regions(iommu);
3158 }
3159 up_read(&dmar_global_lock);
3160
3161 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3162
3163 intel_iommu_enabled = 1;
3164
3165 return 0;
3166
3167 out_free_dmar:
3168 intel_iommu_free_dmars();
3169 up_write(&dmar_global_lock);
3170 return ret;
3171 }
3172
3173 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3174 {
3175 struct device_domain_info *info = opaque;
3176
3177 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3178 return 0;
3179 }
3180
3181 /*
3182 * NB - intel-iommu lacks any sort of reference counting for the users of
3183 * dependent devices. If multiple endpoints have intersecting dependent
3184 * devices, unbinding the driver from any one of them will possibly leave
3185 * the others unable to operate.
3186 */
3187 static void domain_context_clear(struct device_domain_info *info)
3188 {
3189 if (!dev_is_pci(info->dev)) {
3190 domain_context_clear_one(info, info->bus, info->devfn);
3191 return;
3192 }
3193
3194 pci_for_each_dma_alias(to_pci_dev(info->dev),
3195 &domain_context_clear_one_cb, info);
3196 iommu_disable_pci_ats(info);
3197 }
3198
3199 /*
3200 * Clear the page table pointer in context or pasid table entries so that
3201 * all DMA requests without PASID from the device are blocked. If the page
3202 * table has been set, clean up the data structures.
3203 */
3204 void device_block_translation(struct device *dev)
3205 {
3206 struct device_domain_info *info = dev_iommu_priv_get(dev);
3207 struct intel_iommu *iommu = info->iommu;
3208 unsigned long flags;
3209
3210 /* Device in DMA blocking state. Nothing to do. */
3211 if (!info->domain_attached)
3212 return;
3213
3214 if (info->domain)
3215 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3216
3217 if (!dev_is_real_dma_subdevice(dev)) {
3218 if (sm_supported(iommu))
3219 intel_pasid_tear_down_entry(iommu, dev,
3220 IOMMU_NO_PASID, false);
3221 else
3222 domain_context_clear(info);
3223 }
3224
3225 /* Device now in DMA blocking state. */
3226 info->domain_attached = false;
3227
3228 if (!info->domain)
3229 return;
3230
3231 spin_lock_irqsave(&info->domain->lock, flags);
3232 list_del(&info->link);
3233 spin_unlock_irqrestore(&info->domain->lock, flags);
3234
3235 domain_detach_iommu(info->domain, iommu);
3236 info->domain = NULL;
3237 }
3238
3239 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3240 struct device *dev)
3241 {
3242 struct device_domain_info *info = dev_iommu_priv_get(dev);
3243
3244 iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev);
3245 device_block_translation(dev);
3246 return 0;
3247 }
3248
3249 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3250 struct device *dev, ioasid_t pasid,
3251 struct iommu_domain *old);
3252
3253 static struct iommu_domain blocking_domain = {
3254 .type = IOMMU_DOMAIN_BLOCKED,
3255 .ops = &(const struct iommu_domain_ops) {
3256 .attach_dev = blocking_domain_attach_dev,
3257 .set_dev_pasid = blocking_domain_set_dev_pasid,
3258 }
3259 };
3260
3261 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3262 {
3263 if (!intel_iommu_superpage)
3264 return 0;
3265
3266 if (first_stage)
3267 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3268
3269 return fls(cap_super_page_val(iommu->cap));
3270 }
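/*
 * Illustration of the return value above: 0 means 4 KiB pages only,
 * 1 adds 2 MiB superpages and 2 adds 1 GiB superpages. For the
 * second-stage case the value is fls() of the hardware's super-page
 * support bitmap, so e.g. a capability value of 0x3 (2 MiB and 1 GiB
 * both supported) also yields 2.
 */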
3271
3272 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3273 {
3274 struct device_domain_info *info = dev_iommu_priv_get(dev);
3275 struct intel_iommu *iommu = info->iommu;
3276 struct dmar_domain *domain;
3277 int addr_width;
3278
3279 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3280 if (!domain)
3281 return ERR_PTR(-ENOMEM);
3282
3283 INIT_LIST_HEAD(&domain->devices);
3284 INIT_LIST_HEAD(&domain->dev_pasids);
3285 INIT_LIST_HEAD(&domain->cache_tags);
3286 spin_lock_init(&domain->lock);
3287 spin_lock_init(&domain->cache_lock);
3288 xa_init(&domain->iommu_array);
3289
3290 domain->nid = dev_to_node(dev);
3291 domain->use_first_level = first_stage;
3292
3293 /* calculate the address width */
3294 addr_width = agaw_to_width(iommu->agaw);
3295 if (addr_width > cap_mgaw(iommu->cap))
3296 addr_width = cap_mgaw(iommu->cap);
3297 domain->gaw = addr_width;
3298 domain->agaw = iommu->agaw;
3299 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3300
3301 /* iommu memory access coherency */
3302 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3303
3304 /* pagesize bitmap */
3305 domain->domain.pgsize_bitmap = SZ_4K;
3306 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3307 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3308
3309 /*
3310 * IOVA aperture: First-level translation restricts the input-address
3311 * to a canonical address (i.e., address bits 63:N have the same value
3312 * as address bit [N-1], where N is 48-bits with 4-level paging and
3313 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3314 */
3315 domain->domain.geometry.force_aperture = true;
3316 domain->domain.geometry.aperture_start = 0;
3317 if (first_stage)
3318 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3319 else
3320 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3321
3322 /* always allocate the top pgd */
3323 domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
3324 if (!domain->pgd) {
3325 kfree(domain);
3326 return ERR_PTR(-ENOMEM);
3327 }
3328 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3329
3330 return domain;
3331 }
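/*
 * Worked example for the aperture setup above (widths are
 * illustrative): with gaw == 48, a second-stage domain gets
 * aperture_end == __DOMAIN_MAX_ADDR(48) == 2^48 - 1, while a
 * first-stage domain drops the top bit to stay canonical and gets
 * __DOMAIN_MAX_ADDR(47) == 2^47 - 1.
 */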
3332
3333 static struct iommu_domain *
3334 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3335 const struct iommu_user_data *user_data)
3336 {
3337 struct device_domain_info *info = dev_iommu_priv_get(dev);
3338 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3339 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3340 struct intel_iommu *iommu = info->iommu;
3341 struct dmar_domain *dmar_domain;
3342 struct iommu_domain *domain;
3343 bool first_stage;
3344
3345 if (flags &
3346 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
3347 IOMMU_HWPT_ALLOC_PASID)))
3348 return ERR_PTR(-EOPNOTSUPP);
3349 if (nested_parent && !nested_supported(iommu))
3350 return ERR_PTR(-EOPNOTSUPP);
3351 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3352 return ERR_PTR(-EOPNOTSUPP);
3353
3354 /*
3355 * Always allocate the guest compatible page table unless
3356 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3357 * is specified.
3358 */
3359 if (nested_parent || dirty_tracking) {
3360 if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3361 return ERR_PTR(-EOPNOTSUPP);
3362 first_stage = false;
3363 } else {
3364 first_stage = first_level_by_default(iommu);
3365 }
3366
3367 dmar_domain = paging_domain_alloc(dev, first_stage);
3368 if (IS_ERR(dmar_domain))
3369 return ERR_CAST(dmar_domain);
3370 domain = &dmar_domain->domain;
3371 domain->type = IOMMU_DOMAIN_UNMANAGED;
3372 domain->owner = &intel_iommu_ops;
3373 domain->ops = intel_iommu_ops.default_domain_ops;
3374
3375 if (nested_parent) {
3376 dmar_domain->nested_parent = true;
3377 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3378 spin_lock_init(&dmar_domain->s1_lock);
3379 }
3380
3381 if (dirty_tracking) {
3382 if (dmar_domain->use_first_level) {
3383 iommu_domain_free(domain);
3384 return ERR_PTR(-EOPNOTSUPP);
3385 }
3386 domain->dirty_ops = &intel_dirty_ops;
3387 }
3388
3389 return domain;
3390 }
3391
3392 static void intel_iommu_domain_free(struct iommu_domain *domain)
3393 {
3394 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3395
3396 WARN_ON(dmar_domain->nested_parent &&
3397 !list_empty(&dmar_domain->s1_domains));
3398 domain_exit(dmar_domain);
3399 }
3400
3401 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3402 {
3403 struct device_domain_info *info = dev_iommu_priv_get(dev);
3404 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3405 struct intel_iommu *iommu = info->iommu;
3406 int addr_width;
3407
3408 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3409 return -EPERM;
3410
3411 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3412 return -EINVAL;
3413
3414 if (domain->dirty_ops && !ssads_supported(iommu))
3415 return -EINVAL;
3416
3417 if (dmar_domain->iommu_coherency !=
3418 iommu_paging_structure_coherency(iommu))
3419 return -EINVAL;
3420
3421 if (dmar_domain->iommu_superpage !=
3422 iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3423 return -EINVAL;
3424
3425 if (dmar_domain->use_first_level &&
3426 (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3427 return -EINVAL;
3428
3429 /* check if this iommu agaw is sufficient for max mapped address */
3430 addr_width = agaw_to_width(iommu->agaw);
3431 if (addr_width > cap_mgaw(iommu->cap))
3432 addr_width = cap_mgaw(iommu->cap);
3433
3434 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3435 return -EINVAL;
3436
3437 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3438 context_copied(iommu, info->bus, info->devfn))
3439 return intel_pasid_setup_sm_context(dev);
3440
3441 return 0;
3442 }
3443
3444 static int intel_iommu_attach_device(struct iommu_domain *domain,
3445 struct device *dev)
3446 {
3447 int ret;
3448
3449 device_block_translation(dev);
3450
3451 ret = paging_domain_compatible(domain, dev);
3452 if (ret)
3453 return ret;
3454
3455 ret = iopf_for_domain_set(domain, dev);
3456 if (ret)
3457 return ret;
3458
3459 ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);
3460 if (ret)
3461 iopf_for_domain_remove(domain, dev);
3462
3463 return ret;
3464 }
3465
3466 static int intel_iommu_map(struct iommu_domain *domain,
3467 unsigned long iova, phys_addr_t hpa,
3468 size_t size, int iommu_prot, gfp_t gfp)
3469 {
3470 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3471 u64 max_addr;
3472 int prot = 0;
3473
3474 if (iommu_prot & IOMMU_READ)
3475 prot |= DMA_PTE_READ;
3476 if (iommu_prot & IOMMU_WRITE)
3477 prot |= DMA_PTE_WRITE;
3478 if (dmar_domain->set_pte_snp)
3479 prot |= DMA_PTE_SNP;
3480
3481 max_addr = iova + size;
3482 if (dmar_domain->max_addr < max_addr) {
3483 u64 end;
3484
3485 /* check if minimum agaw is sufficient for mapped address */
3486 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3487 if (end < max_addr) {
3488 pr_err("%s: iommu width (%d) is not "
3489 "sufficient for the mapped address (%llx)\n",
3490 __func__, dmar_domain->gaw, max_addr);
3491 return -EFAULT;
3492 }
3493 dmar_domain->max_addr = max_addr;
3494 }
3495 /* Round up size to next multiple of PAGE_SIZE, if it and
3496 the low bits of hpa would take us onto the next page */
3497 size = aligned_nrpages(hpa, size);
3498 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3499 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3500 }
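/*
 * Worked example for the rounding above (values are illustrative): a
 * request with hpa == 0x1ff0 and size == 0x20 straddles a page
 * boundary, so aligned_nrpages() yields 2 and two 4 KiB PTEs are
 * written even though the requested size is only 32 bytes.
 */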
3501
3502 static int intel_iommu_map_pages(struct iommu_domain *domain,
3503 unsigned long iova, phys_addr_t paddr,
3504 size_t pgsize, size_t pgcount,
3505 int prot, gfp_t gfp, size_t *mapped)
3506 {
3507 unsigned long pgshift = __ffs(pgsize);
3508 size_t size = pgcount << pgshift;
3509 int ret;
3510
3511 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3512 return -EINVAL;
3513
3514 if (!IS_ALIGNED(iova | paddr, pgsize))
3515 return -EINVAL;
3516
3517 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3518 if (!ret && mapped)
3519 *mapped = size;
3520
3521 return ret;
3522 }
3523
3524 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3525 unsigned long iova, size_t size,
3526 struct iommu_iotlb_gather *gather)
3527 {
3528 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3529 unsigned long start_pfn, last_pfn;
3530 int level = 0;
3531
3532 /* Cope with horrid API which requires us to unmap more than the
3533 size argument if it happens to be a large-page mapping. */
3534 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3535 &level, GFP_ATOMIC)))
3536 return 0;
3537
3538 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3539 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3540
3541 start_pfn = iova >> VTD_PAGE_SHIFT;
3542 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3543
3544 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3545
3546 if (dmar_domain->max_addr == iova + size)
3547 dmar_domain->max_addr = iova;
3548
3549 /*
3550 * We do not use page-selective IOTLB invalidation in flush queue,
3551 * so there is no need to track page and sync iotlb.
3552 */
3553 if (!iommu_iotlb_gather_queued(gather))
3554 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3555
3556 return size;
3557 }
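/*
 * Illustration of the size adjustment above: if the caller asks to
 * unmap 4 KiB at an IOVA that is covered by a 2 MiB superpage PTE,
 * pfn_to_dma_pte() reports level 2, the size is bumped to 2 MiB, and
 * the whole superpage is torn down; the returned size tells the
 * caller how much was actually unmapped.
 */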
3558
3559 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3560 unsigned long iova,
3561 size_t pgsize, size_t pgcount,
3562 struct iommu_iotlb_gather *gather)
3563 {
3564 unsigned long pgshift = __ffs(pgsize);
3565 size_t size = pgcount << pgshift;
3566
3567 return intel_iommu_unmap(domain, iova, size, gather);
3568 }
3569
3570 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3571 struct iommu_iotlb_gather *gather)
3572 {
3573 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3574 gather->end,
3575 iommu_pages_list_empty(&gather->freelist));
3576 iommu_put_pages_list(&gather->freelist);
3577 }
3578
3579 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3580 dma_addr_t iova)
3581 {
3582 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3583 struct dma_pte *pte;
3584 int level = 0;
3585 u64 phys = 0;
3586
3587 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3588 GFP_ATOMIC);
3589 if (pte && dma_pte_present(pte))
3590 phys = dma_pte_addr(pte) +
3591 (iova & (BIT_MASK(level_to_offset_bits(level) +
3592 VTD_PAGE_SHIFT) - 1));
3593
3594 return phys;
3595 }
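/*
 * Worked example for the offset math above: if the lookup lands on a
 * 2 MiB PTE, level == 2 and level_to_offset_bits(2) + VTD_PAGE_SHIFT
 * == 21, so the low 21 bits of the IOVA are added to the superpage
 * base address taken from the PTE; for an ordinary 4 KiB PTE only the
 * low 12 bits are kept.
 */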
3596
3597 static bool domain_support_force_snooping(struct dmar_domain *domain)
3598 {
3599 struct device_domain_info *info;
3600 bool support = true;
3601
3602 assert_spin_locked(&domain->lock);
3603 list_for_each_entry(info, &domain->devices, link) {
3604 if (!ecap_sc_support(info->iommu->ecap)) {
3605 support = false;
3606 break;
3607 }
3608 }
3609
3610 return support;
3611 }
3612
3613 static void domain_set_force_snooping(struct dmar_domain *domain)
3614 {
3615 struct device_domain_info *info;
3616
3617 assert_spin_locked(&domain->lock);
3618 /*
3619 * Second level page table supports per-PTE snoop control. The
3620 * iommu_map() interface will handle this by setting SNP bit.
3621 */
3622 if (!domain->use_first_level) {
3623 domain->set_pte_snp = true;
3624 return;
3625 }
3626
3627 list_for_each_entry(info, &domain->devices, link)
3628 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3629 IOMMU_NO_PASID);
3630 }
3631
3632 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3633 {
3634 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3635 unsigned long flags;
3636
3637 if (dmar_domain->force_snooping)
3638 return true;
3639
3640 spin_lock_irqsave(&dmar_domain->lock, flags);
3641 if (!domain_support_force_snooping(dmar_domain) ||
3642 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3643 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3644 return false;
3645 }
3646
3647 domain_set_force_snooping(dmar_domain);
3648 dmar_domain->force_snooping = true;
3649 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3650
3651 return true;
3652 }
3653
3654 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3655 {
3656 struct device_domain_info *info = dev_iommu_priv_get(dev);
3657
3658 switch (cap) {
3659 case IOMMU_CAP_CACHE_COHERENCY:
3660 case IOMMU_CAP_DEFERRED_FLUSH:
3661 return true;
3662 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3663 return dmar_platform_optin();
3664 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3665 return ecap_sc_support(info->iommu->ecap);
3666 case IOMMU_CAP_DIRTY_TRACKING:
3667 return ssads_supported(info->iommu);
3668 default:
3669 return false;
3670 }
3671 }
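/*
 * A minimal usage sketch: consumers normally reach this callback through
 * the IOMMU core rather than calling it directly, e.g.
 *
 *	if (device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY))
 *		(the domain can be made to force-snoop DMA from this device)
 */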
3672
3673 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3674 {
3675 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3676 struct device_domain_info *info;
3677 struct intel_iommu *iommu;
3678 u8 bus, devfn;
3679 int ret;
3680
3681 iommu = device_lookup_iommu(dev, &bus, &devfn);
3682 if (!iommu || !iommu->iommu.ops)
3683 return ERR_PTR(-ENODEV);
3684
3685 info = kzalloc(sizeof(*info), GFP_KERNEL);
3686 if (!info)
3687 return ERR_PTR(-ENOMEM);
3688
3689 if (dev_is_real_dma_subdevice(dev)) {
3690 info->bus = pdev->bus->number;
3691 info->devfn = pdev->devfn;
3692 info->segment = pci_domain_nr(pdev->bus);
3693 } else {
3694 info->bus = bus;
3695 info->devfn = devfn;
3696 info->segment = iommu->segment;
3697 }
3698
3699 info->dev = dev;
3700 info->iommu = iommu;
3701 if (dev_is_pci(dev)) {
3702 if (ecap_dev_iotlb_support(iommu->ecap) &&
3703 pci_ats_supported(pdev) &&
3704 dmar_ats_supported(pdev, iommu)) {
3705 info->ats_supported = 1;
3706 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3707
3708 /*
3709 * For an IOMMU that supports device IOTLB throttling
3710 * (DIT), we assign the PFSID in a VF's invalidation
3711 * descriptors so that the IOMMU hardware can gauge queue
3712 * depth at the PF level. If DIT is not supported, the PFSID
3713 * field is treated as reserved and must be set to 0.
3714 */
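/*
 * For illustration (hypothetical addresses): for a VF at 0000:03:10.2
 * whose PF is 0000:03:00.0, pci_physfn() returns the PF and
 * pci_dev_id(pci_physfn(pdev)) yields the PF's requester ID, which is
 * what lands in info->pfsid below.
 */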
3715 if (ecap_dit(iommu->ecap))
3716 info->pfsid = pci_dev_id(pci_physfn(pdev));
3717 info->ats_qdep = pci_ats_queue_depth(pdev);
3718 }
3719 if (sm_supported(iommu)) {
3720 if (pasid_supported(iommu)) {
3721 int features = pci_pasid_features(pdev);
3722
3723 if (features >= 0)
3724 info->pasid_supported = features | 1;
3725 }
3726
3727 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3728 pci_pri_supported(pdev))
3729 info->pri_supported = 1;
3730 }
3731 }
3732
3733 dev_iommu_priv_set(dev, info);
3734 if (pdev && pci_ats_supported(pdev)) {
3735 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3736 ret = device_rbtree_insert(iommu, info);
3737 if (ret)
3738 goto free;
3739 }
3740
3741 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3742 ret = intel_pasid_alloc_table(dev);
3743 if (ret) {
3744 dev_err(dev, "PASID table allocation failed\n");
3745 goto clear_rbtree;
3746 }
3747
3748 if (!context_copied(iommu, info->bus, info->devfn)) {
3749 ret = intel_pasid_setup_sm_context(dev);
3750 if (ret)
3751 goto free_table;
3752 }
3753 }
3754
3755 intel_iommu_debugfs_create_dev(info);
3756
3757 return &iommu->iommu;
3758 free_table:
3759 intel_pasid_free_table(dev);
3760 clear_rbtree:
3761 device_rbtree_remove(info);
3762 free:
3763 kfree(info);
3764
3765 return ERR_PTR(ret);
3766 }
3767
3768 static void intel_iommu_probe_finalize(struct device *dev)
3769 {
3770 struct device_domain_info *info = dev_iommu_priv_get(dev);
3771 struct intel_iommu *iommu = info->iommu;
3772
3773 /*
3774 * The PCIe spec, in its wisdom, declares that the behaviour of the
3775 * device is undefined if you enable PASID support after ATS support.
3776 * So always enable PASID support on devices which have it, even if
3777 * we can't yet know if we're ever going to use it.
3778 */
3779 if (info->pasid_supported &&
3780 !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
3781 info->pasid_enabled = 1;
3782
3783 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev))
3784 iommu_enable_pci_ats(info);
3785 iommu_enable_pci_pri(info);
3786 }
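/*
 * A minimal sketch of the PASID bookkeeping above: bit 0 of
 * info->pasid_supported is a "supported" marker so that a device whose
 * PASID feature bits are all zero still reads as non-zero. For a
 * hypothetical device advertising execute and privileged-mode PASID
 * support:
 *
 *	features        = pci_pasid_features(pdev);     => 0x6
 *	pasid_supported = features | 1;                 => 0x7
 *	pci_enable_pasid(pdev, pasid_supported & ~1);   => enables with 0x6
 */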
3787
3788 static void intel_iommu_release_device(struct device *dev)
3789 {
3790 struct device_domain_info *info = dev_iommu_priv_get(dev);
3791 struct intel_iommu *iommu = info->iommu;
3792
3793 iommu_disable_pci_pri(info);
3794 iommu_disable_pci_ats(info);
3795
3796 if (info->pasid_enabled) {
3797 pci_disable_pasid(to_pci_dev(dev));
3798 info->pasid_enabled = 0;
3799 }
3800
3801 mutex_lock(&iommu->iopf_lock);
3802 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3803 device_rbtree_remove(info);
3804 mutex_unlock(&iommu->iopf_lock);
3805
3806 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3807 !context_copied(iommu, info->bus, info->devfn))
3808 intel_pasid_teardown_sm_context(dev);
3809
3810 intel_pasid_free_table(dev);
3811 intel_iommu_debugfs_remove_dev(info);
3812 kfree(info);
3813 }
3814
3815 static void intel_iommu_get_resv_regions(struct device *device,
3816 struct list_head *head)
3817 {
3818 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3819 struct iommu_resv_region *reg;
3820 struct dmar_rmrr_unit *rmrr;
3821 struct device *i_dev;
3822 int i;
3823
3824 rcu_read_lock();
3825 for_each_rmrr_units(rmrr) {
3826 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3827 i, i_dev) {
3828 struct iommu_resv_region *resv;
3829 enum iommu_resv_type type;
3830 size_t length;
3831
3832 if (i_dev != device &&
3833 !is_downstream_to_pci_bridge(device, i_dev))
3834 continue;
3835
3836 length = rmrr->end_address - rmrr->base_address + 1;
3837
3838 type = device_rmrr_is_relaxable(device) ?
3839 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3840
3841 resv = iommu_alloc_resv_region(rmrr->base_address,
3842 length, prot, type,
3843 GFP_ATOMIC);
3844 if (!resv)
3845 break;
3846
3847 list_add_tail(&resv->list, head);
3848 }
3849 }
3850 rcu_read_unlock();
3851
3852 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3853 if (dev_is_pci(device)) {
3854 struct pci_dev *pdev = to_pci_dev(device);
3855
3856 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3857 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3858 IOMMU_RESV_DIRECT_RELAXABLE,
3859 GFP_KERNEL);
3860 if (reg)
3861 list_add_tail(&reg->list, head);
3862 }
3863 }
3864 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3865
3866 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3867 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3868 0, IOMMU_RESV_MSI, GFP_KERNEL);
3869 if (!reg)
3870 return;
3871 list_add_tail(&reg->list, head);
3872 }
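/*
 * A minimal usage sketch: callers collect these regions through the IOMMU
 * core, roughly
 *
 *	LIST_HEAD(resv_regions);
 *
 *	iommu_get_resv_regions(dev, &resv_regions);
 *	(walk the list: RMRRs, the optional ISA range, the MSI window)
 *	iommu_put_resv_regions(dev, &resv_regions);
 */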
3873
3874 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3875 {
3876 if (dev_is_pci(dev))
3877 return pci_device_group(dev);
3878 return generic_device_group(dev);
3879 }
3880
3881 int intel_iommu_enable_iopf(struct device *dev)
3882 {
3883 struct device_domain_info *info = dev_iommu_priv_get(dev);
3884 struct intel_iommu *iommu = info->iommu;
3885 int ret;
3886
3887 if (!info->pri_enabled)
3888 return -ENODEV;
3889
3890 /* pri_enabled is protected by the group mutex. */
3891 iommu_group_mutex_assert(dev);
3892 if (info->iopf_refcount) {
3893 info->iopf_refcount++;
3894 return 0;
3895 }
3896
3897 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3898 if (ret)
3899 return ret;
3900
3901 info->iopf_refcount = 1;
3902
3903 return 0;
3904 }
3905
3906 void intel_iommu_disable_iopf(struct device *dev)
3907 {
3908 struct device_domain_info *info = dev_iommu_priv_get(dev);
3909 struct intel_iommu *iommu = info->iommu;
3910
3911 if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
3912 return;
3913
3914 iommu_group_mutex_assert(dev);
3915 if (--info->iopf_refcount)
3916 return;
3917
3918 iopf_queue_remove_device(iommu->iopf_queue, dev);
3919 }
3920
3921 static bool intel_iommu_is_attach_deferred(struct device *dev)
3922 {
3923 struct device_domain_info *info = dev_iommu_priv_get(dev);
3924
3925 return translation_pre_enabled(info->iommu) && !info->domain;
3926 }
3927
3928 /*
3929 * Check that the device does not live on an external facing PCI port that is
3930 * marked as untrusted. Such devices should not be able to apply quirks and
3931 * thus not be able to bypass the IOMMU restrictions.
3932 */
3933 static bool risky_device(struct pci_dev *pdev)
3934 {
3935 if (pdev->untrusted) {
3936 pci_info(pdev,
3937 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
3938 pdev->vendor, pdev->device);
3939 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
3940 return true;
3941 }
3942 return false;
3943 }
3944
3945 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
3946 unsigned long iova, size_t size)
3947 {
3948 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
3949
3950 return 0;
3951 }
3952
3953 void domain_remove_dev_pasid(struct iommu_domain *domain,
3954 struct device *dev, ioasid_t pasid)
3955 {
3956 struct device_domain_info *info = dev_iommu_priv_get(dev);
3957 struct dev_pasid_info *curr, *dev_pasid = NULL;
3958 struct intel_iommu *iommu = info->iommu;
3959 struct dmar_domain *dmar_domain;
3960 unsigned long flags;
3961
3962 if (!domain)
3963 return;
3964
3965 /* Identity domain has no meta data for pasid. */
3966 if (domain->type == IOMMU_DOMAIN_IDENTITY)
3967 return;
3968
3969 dmar_domain = to_dmar_domain(domain);
3970 spin_lock_irqsave(&dmar_domain->lock, flags);
3971 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
3972 if (curr->dev == dev && curr->pasid == pasid) {
3973 list_del(&curr->link_domain);
3974 dev_pasid = curr;
3975 break;
3976 }
3977 }
3978 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3979
3980 cache_tag_unassign_domain(dmar_domain, dev, pasid);
3981 domain_detach_iommu(dmar_domain, iommu);
3982 if (!WARN_ON_ONCE(!dev_pasid)) {
3983 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
3984 kfree(dev_pasid);
3985 }
3986 }
3987
3988 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3989 struct device *dev, ioasid_t pasid,
3990 struct iommu_domain *old)
3991 {
3992 struct device_domain_info *info = dev_iommu_priv_get(dev);
3993
3994 iopf_for_domain_remove(old, dev);
3995 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
3996 domain_remove_dev_pasid(old, dev, pasid);
3997
3998 return 0;
3999 }
4000
4001 struct dev_pasid_info *
4002 domain_add_dev_pasid(struct iommu_domain *domain,
4003 struct device *dev, ioasid_t pasid)
4004 {
4005 struct device_domain_info *info = dev_iommu_priv_get(dev);
4006 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4007 struct intel_iommu *iommu = info->iommu;
4008 struct dev_pasid_info *dev_pasid;
4009 unsigned long flags;
4010 int ret;
4011
4012 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4013 if (!dev_pasid)
4014 return ERR_PTR(-ENOMEM);
4015
4016 ret = domain_attach_iommu(dmar_domain, iommu);
4017 if (ret)
4018 goto out_free;
4019
4020 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4021 if (ret)
4022 goto out_detach_iommu;
4023
4024 dev_pasid->dev = dev;
4025 dev_pasid->pasid = pasid;
4026 spin_lock_irqsave(&dmar_domain->lock, flags);
4027 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4028 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4029
4030 return dev_pasid;
4031 out_detach_iommu:
4032 domain_detach_iommu(dmar_domain, iommu);
4033 out_free:
4034 kfree(dev_pasid);
4035 return ERR_PTR(ret);
4036 }
4037
4038 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4039 struct device *dev, ioasid_t pasid,
4040 struct iommu_domain *old)
4041 {
4042 struct device_domain_info *info = dev_iommu_priv_get(dev);
4043 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4044 struct intel_iommu *iommu = info->iommu;
4045 struct dev_pasid_info *dev_pasid;
4046 int ret;
4047
4048 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4049 return -EINVAL;
4050
4051 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4052 return -EOPNOTSUPP;
4053
4054 if (domain->dirty_ops)
4055 return -EINVAL;
4056
4057 if (context_copied(iommu, info->bus, info->devfn))
4058 return -EBUSY;
4059
4060 ret = paging_domain_compatible(domain, dev);
4061 if (ret)
4062 return ret;
4063
4064 dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4065 if (IS_ERR(dev_pasid))
4066 return PTR_ERR(dev_pasid);
4067
4068 ret = iopf_for_domain_replace(domain, old, dev);
4069 if (ret)
4070 goto out_remove_dev_pasid;
4071
4072 if (dmar_domain->use_first_level)
4073 ret = domain_setup_first_level(iommu, dmar_domain,
4074 dev, pasid, old);
4075 else
4076 ret = domain_setup_second_level(iommu, dmar_domain,
4077 dev, pasid, old);
4078 if (ret)
4079 goto out_unwind_iopf;
4080
4081 domain_remove_dev_pasid(old, dev, pasid);
4082
4083 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4084
4085 return 0;
4086
4087 out_unwind_iopf:
4088 iopf_for_domain_replace(old, domain, dev);
4089 out_remove_dev_pasid:
4090 domain_remove_dev_pasid(domain, dev, pasid);
4091 return ret;
4092 }
4093
4094 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4095 {
4096 struct device_domain_info *info = dev_iommu_priv_get(dev);
4097 struct intel_iommu *iommu = info->iommu;
4098 struct iommu_hw_info_vtd *vtd;
4099
4100 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4101 if (!vtd)
4102 return ERR_PTR(-ENOMEM);
4103
4104 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4105 vtd->cap_reg = iommu->cap;
4106 vtd->ecap_reg = iommu->ecap;
4107 *length = sizeof(*vtd);
4108 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4109 return vtd;
4110 }
4111
4112 /*
4113 * Set dirty tracking for the device list of a domain. The caller must
4114 * hold the domain->lock when calling it.
4115 */
4116 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4117 {
4118 struct device_domain_info *info;
4119 int ret = 0;
4120
4121 list_for_each_entry(info, devices, link) {
4122 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4123 IOMMU_NO_PASID, enable);
4124 if (ret)
4125 break;
4126 }
4127
4128 return ret;
4129 }
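/*
 * A minimal usage sketch of the locking contract described above:
 *
 *	spin_lock(&dmar_domain->lock);
 *	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
 *	spin_unlock(&dmar_domain->lock);
 *
 * as intel_iommu_set_dirty_tracking() below does.
 */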
4130
4131 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4132 bool enable)
4133 {
4134 struct dmar_domain *s1_domain;
4135 unsigned long flags;
4136 int ret;
4137
4138 spin_lock(&domain->s1_lock);
4139 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4140 spin_lock_irqsave(&s1_domain->lock, flags);
4141 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4142 spin_unlock_irqrestore(&s1_domain->lock, flags);
4143 if (ret)
4144 goto err_unwind;
4145 }
4146 spin_unlock(&domain->s1_lock);
4147 return 0;
4148
4149 err_unwind:
4150 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4151 spin_lock_irqsave(&s1_domain->lock, flags);
4152 device_set_dirty_tracking(&s1_domain->devices,
4153 domain->dirty_tracking);
4154 spin_unlock_irqrestore(&s1_domain->lock, flags);
4155 }
4156 spin_unlock(&domain->s1_lock);
4157 return ret;
4158 }
4159
4160 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4161 bool enable)
4162 {
4163 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4164 int ret;
4165
4166 spin_lock(&dmar_domain->lock);
4167 if (dmar_domain->dirty_tracking == enable)
4168 goto out_unlock;
4169
4170 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4171 if (ret)
4172 goto err_unwind;
4173
4174 if (dmar_domain->nested_parent) {
4175 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4176 if (ret)
4177 goto err_unwind;
4178 }
4179
4180 dmar_domain->dirty_tracking = enable;
4181 out_unlock:
4182 spin_unlock(&dmar_domain->lock);
4183
4184 return 0;
4185
4186 err_unwind:
4187 device_set_dirty_tracking(&dmar_domain->devices,
4188 dmar_domain->dirty_tracking);
4189 spin_unlock(&dmar_domain->lock);
4190 return ret;
4191 }
4192
4193 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4194 unsigned long iova, size_t size,
4195 unsigned long flags,
4196 struct iommu_dirty_bitmap *dirty)
4197 {
4198 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4199 unsigned long end = iova + size - 1;
4200 unsigned long pgsize;
4201
4202 /*
4203 * The IOMMUFD core calls into a domain with dirty tracking disabled and
4204 * no IOVA bitmap set in order to clear any dirty bits that may have been
4205 * set in the PTEs while dirty tracking was being stopped. This ensures
4206 * that we never inherit dirtied bits from a previous cycle.
4207 */
4208 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4209 return -EINVAL;
4210
4211 do {
4212 struct dma_pte *pte;
4213 int lvl = 0;
4214
4215 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4216 GFP_ATOMIC);
4217 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4218 if (!pte || !dma_pte_present(pte)) {
4219 iova += pgsize;
4220 continue;
4221 }
4222
4223 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4224 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4225 iova += pgsize;
4226 } while (iova < end);
4227
4228 return 0;
4229 }
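/*
 * A minimal sketch of the walk above: iova advances by the size of whatever
 * PTE it lands on, so a single dirty 2MiB second-level PTE records one 2MiB
 * span in the bitmap:
 *
 *	pgsize = level_size(2) << VTD_PAGE_SHIFT;   => 512 * 4KiB = 2MiB
 *	iommu_dirty_bitmap_record(dirty, iova, pgsize);
 *	iova += pgsize;
 */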
4230
4231 static const struct iommu_dirty_ops intel_dirty_ops = {
4232 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4233 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4234 };
4235
4236 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4237 {
4238 struct device_domain_info *info = dev_iommu_priv_get(dev);
4239 struct intel_iommu *iommu = info->iommu;
4240 struct context_entry *context;
4241
4242 spin_lock(&iommu->lock);
4243 context = iommu_context_addr(iommu, bus, devfn, 1);
4244 if (!context) {
4245 spin_unlock(&iommu->lock);
4246 return -ENOMEM;
4247 }
4248
4249 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4250 spin_unlock(&iommu->lock);
4251 return 0;
4252 }
4253
4254 copied_context_tear_down(iommu, context, bus, devfn);
4255 context_clear_entry(context);
4256 context_set_domain_id(context, FLPT_DEFAULT_DID);
4257
4258 /*
4259 * In pass-through mode, AW must be programmed to indicate the largest
4260 * AGAW value supported by the hardware, and ASR is ignored by hardware.
4261 */
4262 context_set_address_width(context, iommu->msagaw);
4263 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4264 context_set_fault_enable(context);
4265 context_set_present(context);
4266 if (!ecap_coherent(iommu->ecap))
4267 clflush_cache_range(context, sizeof(*context));
4268 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4269 spin_unlock(&iommu->lock);
4270
4271 return 0;
4272 }
4273
4274 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4275 {
4276 struct device *dev = data;
4277
4278 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4279 }
4280
4281 static int device_setup_pass_through(struct device *dev)
4282 {
4283 struct device_domain_info *info = dev_iommu_priv_get(dev);
4284
4285 if (!dev_is_pci(dev))
4286 return context_setup_pass_through(dev, info->bus, info->devfn);
4287
4288 return pci_for_each_dma_alias(to_pci_dev(dev),
4289 context_setup_pass_through_cb, dev);
4290 }
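/*
 * For illustration (hypothetical topology): pci_for_each_dma_alias() runs
 * the callback for the device's own requester ID and every alias it may
 * use, so a device at 0000:04:00.0 behind a PCIe-to-PCI bridge at
 * 0000:03:00.0 gets pass-through context entries programmed for both RIDs.
 */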
4291
4292 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4293 {
4294 struct device_domain_info *info = dev_iommu_priv_get(dev);
4295 struct intel_iommu *iommu = info->iommu;
4296 int ret;
4297
4298 device_block_translation(dev);
4299
4300 if (dev_is_real_dma_subdevice(dev))
4301 return 0;
4302
4303 /*
4304 * No PRI support with the global identity domain. No need to enable or
4305 * disable PRI in this path as the iommu has been put in the blocking
4306 * state.
4307 */
4308 if (sm_supported(iommu))
4309 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4310 else
4311 ret = device_setup_pass_through(dev);
4312
4313 if (!ret)
4314 info->domain_attached = true;
4315
4316 return ret;
4317 }
4318
4319 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4320 struct device *dev, ioasid_t pasid,
4321 struct iommu_domain *old)
4322 {
4323 struct device_domain_info *info = dev_iommu_priv_get(dev);
4324 struct intel_iommu *iommu = info->iommu;
4325 int ret;
4326
4327 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4328 return -EOPNOTSUPP;
4329
4330 ret = iopf_for_domain_replace(domain, old, dev);
4331 if (ret)
4332 return ret;
4333
4334 ret = domain_setup_passthrough(iommu, dev, pasid, old);
4335 if (ret) {
4336 iopf_for_domain_replace(old, domain, dev);
4337 return ret;
4338 }
4339
4340 domain_remove_dev_pasid(old, dev, pasid);
4341 return 0;
4342 }
4343
4344 static struct iommu_domain identity_domain = {
4345 .type = IOMMU_DOMAIN_IDENTITY,
4346 .ops = &(const struct iommu_domain_ops) {
4347 .attach_dev = identity_domain_attach_dev,
4348 .set_dev_pasid = identity_domain_set_dev_pasid,
4349 },
4350 };
4351
4352 const struct iommu_ops intel_iommu_ops = {
4353 .blocked_domain = &blocking_domain,
4354 .release_domain = &blocking_domain,
4355 .identity_domain = &identity_domain,
4356 .capable = intel_iommu_capable,
4357 .hw_info = intel_iommu_hw_info,
4358 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4359 .domain_alloc_sva = intel_svm_domain_alloc,
4360 .domain_alloc_nested = intel_iommu_domain_alloc_nested,
4361 .probe_device = intel_iommu_probe_device,
4362 .probe_finalize = intel_iommu_probe_finalize,
4363 .release_device = intel_iommu_release_device,
4364 .get_resv_regions = intel_iommu_get_resv_regions,
4365 .device_group = intel_iommu_device_group,
4366 .is_attach_deferred = intel_iommu_is_attach_deferred,
4367 .def_domain_type = device_def_domain_type,
4368 .pgsize_bitmap = SZ_4K,
4369 .page_response = intel_iommu_page_response,
4370 .default_domain_ops = &(const struct iommu_domain_ops) {
4371 .attach_dev = intel_iommu_attach_device,
4372 .set_dev_pasid = intel_iommu_set_dev_pasid,
4373 .map_pages = intel_iommu_map_pages,
4374 .unmap_pages = intel_iommu_unmap_pages,
4375 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4376 .flush_iotlb_all = intel_flush_iotlb_all,
4377 .iotlb_sync = intel_iommu_tlb_sync,
4378 .iova_to_phys = intel_iommu_iova_to_phys,
4379 .free = intel_iommu_domain_free,
4380 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4381 }
4382 };
4383
4384 static void quirk_iommu_igfx(struct pci_dev *dev)
4385 {
4386 if (risky_device(dev))
4387 return;
4388
4389 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4390 disable_igfx_iommu = 1;
4391 }
4392
4393 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4394 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4395 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4396 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4397 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4398 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4399 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4400 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4401
4402 /* QM57/QS57 integrated gfx malfunctions with dmar */
4403 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4404
4405 /* Broadwell igfx malfunctions with dmar */
4406 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4407 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4408 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4409 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4410 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4411 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4416 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4418 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4419 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4420 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4421 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4422 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4423 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4424 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4425 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4426 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4427 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4428 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4430
4431 static void quirk_iommu_rwbf(struct pci_dev *dev)
4432 {
4433 if (risky_device(dev))
4434 return;
4435
4436 /*
4437 * Mobile 4 Series Chipset neglects to set RWBF capability,
4438 * but needs it. Same seems to hold for the desktop versions.
4439 */
4440 pci_info(dev, "Forcing write-buffer flush capability\n");
4441 rwbf_quirk = 1;
4442 }
4443
4444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4451
4452 #define GGC 0x52
4453 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4454 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4455 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4456 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4457 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4458 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4459 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4460 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4461
4462 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4463 {
4464 unsigned short ggc;
4465
4466 if (risky_device(dev))
4467 return;
4468
4469 if (pci_read_config_word(dev, GGC, &ggc))
4470 return;
4471
4472 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4473 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4474 disable_igfx_iommu = 1;
4475 } else if (!disable_igfx_iommu) {
4476 /* we have to ensure the gfx device is idle before we flush */
4477 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4478 iommu_set_dma_strict();
4479 }
4480 }
4481 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4482 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4483 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4484
4485 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4486 {
4487 unsigned short ver;
4488
4489 if (!IS_GFX_DEVICE(dev))
4490 return;
4491
4492 ver = (dev->device >> 8) & 0xff;
4493 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4494 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4495 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4496 return;
4497
4498 if (risky_device(dev))
4499 return;
4500
4501 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4502 iommu_skip_te_disable = 1;
4503 }
4504 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
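/*
 * For illustration (hypothetical device ID): the quirk above keys off bits
 * 15:8 of the PCI device ID, so an iGFX ID of 0x9a49 gives
 * ver = (0x9a49 >> 8) & 0xff = 0x9a, which is in the list and therefore
 * sets iommu_skip_te_disable.
 */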
4505
4506 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4507 ISOCH DMAR unit for the Azalia sound device, but not give it any
4508 TLB entries, which causes it to deadlock. Check for that. We do
4509 this in a function called from init_dmars(), instead of in a PCI
4510 quirk, because we don't want to print the obnoxious "BIOS broken"
4511 message if VT-d is actually disabled.
4512 */
4513 static void __init check_tylersburg_isoch(void)
4514 {
4515 struct pci_dev *pdev;
4516 uint32_t vtisochctrl;
4517
4518 /* If there's no Azalia in the system anyway, forget it. */
4519 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4520 if (!pdev)
4521 return;
4522
4523 if (risky_device(pdev)) {
4524 pci_dev_put(pdev);
4525 return;
4526 }
4527
4528 pci_dev_put(pdev);
4529
4530 /* System Management Registers. Might be hidden, in which case
4531 we can't do the sanity check. But that's OK, because the
4532 known-broken BIOSes _don't_ actually hide it, so far. */
4533 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4534 if (!pdev)
4535 return;
4536
4537 if (risky_device(pdev)) {
4538 pci_dev_put(pdev);
4539 return;
4540 }
4541
4542 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4543 pci_dev_put(pdev);
4544 return;
4545 }
4546
4547 pci_dev_put(pdev);
4548
4549 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4550 if (vtisochctrl & 1)
4551 return;
4552
4553 /* Drop all bits other than the number of TLB entries */
4554 vtisochctrl &= 0x1c;
4555
4556 /* If we have the recommended number of TLB entries (16), fine. */
4557 if (vtisochctrl == 0x10)
4558 return;
4559
4560 /* Zero TLB entries? You get to ride the short bus to school. */
4561 if (!vtisochctrl) {
4562 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4563 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4564 dmi_get_system_info(DMI_BIOS_VENDOR),
4565 dmi_get_system_info(DMI_BIOS_VERSION),
4566 dmi_get_system_info(DMI_PRODUCT_VERSION));
4567 iommu_identity_mapping |= IDENTMAP_AZALIA;
4568 return;
4569 }
4570
4571 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4572 vtisochctrl);
4573 }
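/*
 * For illustration: after the masking above, vtisochctrl holds the isoch
 * TLB entry count directly. A BIOS that programmed only 8 entries yields
 * (vtisochctrl & 0x1c) == 0x08, which is neither zero nor the recommended
 * 0x10 (16 entries), so only the informational warning above is printed.
 */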
4574
4575 /*
4576 * Here we deal with a device TLB defect where the device may inadvertently
4577 * issue an ATS invalidation completion before posted writes that were
4578 * initiated with a translated address and used translations matching the
4579 * invalidation address range, violating the invalidation completion ordering.
4580 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4581 * is vulnerable to this defect. In other words, any dTLB invalidation that is
4582 * not initiated under the control of the trusted/privileged host device
4583 * driver must use this quirk.
4584 * Device TLBs are invalidated under the following six conditions:
4585 * 1. Device driver does DMA API unmap IOVA
4586 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4587 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4588 * exit_mmap() due to crash
4589 * 4. Under SVA usage, called from mmu_notifier.invalidate_range() when
4590 * the VM has to free pages that were unmapped
4591 * 5. Userspace driver unmaps a DMA buffer
4592 * 6. Cache invalidation in vSVA usage (upcoming)
4593 *
4594 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4595 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
4596 * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
4597 * The dTLB invalidation after PASID cache flush does not need this quirk.
4598 *
4599 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4600 */
4601 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4602 unsigned long address, unsigned long mask,
4603 u32 pasid, u16 qdep)
4604 {
4605 u16 sid;
4606
4607 if (likely(!info->dtlb_extra_inval))
4608 return;
4609
4610 sid = PCI_DEVID(info->bus, info->devfn);
4611 if (pasid == IOMMU_NO_PASID) {
4612 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4613 qdep, address, mask);
4614 } else {
4615 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4616 pasid, qdep, address, mask);
4617 }
4618 }
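/*
 * A minimal usage sketch (not the exact caller): on the invalidation path
 * the extra flush is issued right after the normal device-TLB invalidation,
 * e.g.
 *
 *	qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, addr, mask);
 *	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID,
 *				  info->ats_qdep);
 */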
4619
4620 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4621
4622 /*
4623 * Function to submit a command to the enhanced command interface. The
4624 * valid enhanced command descriptions are defined in Table 47 of the
4625 * VT-d spec. The VT-d hardware implementation may support some but not
4626 * all commands, which can be determined by checking the Enhanced
4627 * Command Capability Register.
4628 *
4629 * Return values:
4630 * - 0: Command successful without any error;
4631 * - Negative: software error value;
4632 * - Nonzero positive: failure status code defined in Table 48.
4633 */
4634 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4635 {
4636 unsigned long flags;
4637 u64 res;
4638 int ret;
4639
4640 if (!cap_ecmds(iommu->cap))
4641 return -ENODEV;
4642
4643 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4644
4645 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4646 if (res & DMA_ECMD_ECRSP_IP) {
4647 ret = -EBUSY;
4648 goto err;
4649 }
4650
4651 /*
4652 * Unconditionally write operand B, because:
4653 * - There is no side effect if an ecmd doesn't require
4654 * operand B but we still set the register to some value.
4655 * - This is not invoked in any critical path, so the extra
4656 * MMIO write raises no performance concerns.
4657 */
4658 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4659 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4660
4661 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4662 !(res & DMA_ECMD_ECRSP_IP), res);
4663
4664 if (res & DMA_ECMD_ECRSP_IP) {
4665 ret = -ETIMEDOUT;
4666 goto err;
4667 }
4668
4669 ret = ecmd_get_status_code(res);
4670 err:
4671 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4672
4673 return ret;
4674 }
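/*
 * A minimal usage sketch (command name assumed for illustration): a caller
 * such as the perfmon code might freeze all counters with
 *
 *	ret = ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
 *	if (ret)
 *		return ret;     => negative: software error; positive: Table 48 status
 */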
4675