1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
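/*
 * Worked example (assuming VTD_PAGE_SHIFT == 12): for gaw == 48,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1, and DOMAIN_MAX_ADDR(48) ==
 * (1ULL << 48) - VTD_PAGE_SIZE, i.e. the base address of the last 4KiB
 * page in a 48-bit address space.
 */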
56
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59
60 /*
61 * set to 1 to panic the kernel if VT-d can't be successfully enabled
62 * (used when kernel is launched w/ TXT)
63 */
64 static int force_on = 0;
65 static int intel_iommu_tboot_noforce;
66 static int no_platform_optin;
67
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69
70 /*
71 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72 * if marked present.
73 */
74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 if (!(re->lo & 1))
77 return 0;
78
79 return re->lo & VTD_PAGE_MASK;
80 }
81
82 /*
83 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84 * if marked present.
85 */
86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 if (!(re->hi & 1))
89 return 0;
90
91 return re->hi & VTD_PAGE_MASK;
92 }
93
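/*
 * The rbtree comparators below key each device_domain_info by its PCI
 * request ID (PCI_DEVID(bus, devfn)) and follow the usual rb_find()/
 * rb_find_add() convention: return a negative value, zero or a positive
 * value for less-than, equal and greater-than respectively.
 */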
94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 struct device_domain_info *info =
97 rb_entry(node, struct device_domain_info, node);
98 const u16 *rid_lhs = key;
99
100 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 return -1;
102
103 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 return 1;
105
106 return 0;
107 }
108
109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 struct device_domain_info *info =
112 rb_entry(lhs, struct device_domain_info, node);
113 u16 key = PCI_DEVID(info->bus, info->devfn);
114
115 return device_rid_cmp_key(&key, rhs);
116 }
117
118 /*
119 * Looks up an IOMMU-probed device using its source ID.
120 *
121 * Returns the pointer to the device if there is a match. Otherwise,
122 * returns NULL.
123 *
124 * Note that this helper doesn't guarantee that the device won't be
125 * released by the iommu subsystem after being returned. The caller
126 * should use its own synchronization mechanism to avoid the device
127 * being released during its use if that is possibly the case.
128 */
129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 struct device_domain_info *info = NULL;
132 struct rb_node *node;
133 unsigned long flags;
134
135 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 if (node)
138 info = rb_entry(node, struct device_domain_info, node);
139 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140
141 return info ? info->dev : NULL;
142 }
143
144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 struct device_domain_info *info)
146 {
147 struct rb_node *curr;
148 unsigned long flags;
149
150 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 if (WARN_ON(curr))
154 return -EEXIST;
155
156 return 0;
157 }
158
159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 struct intel_iommu *iommu = info->iommu;
162 unsigned long flags;
163
164 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 rb_erase(&info->node, &iommu->device_rbtree);
166 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168
169 struct dmar_rmrr_unit {
170 struct list_head list; /* list of rmrr units */
171 struct acpi_dmar_header *hdr; /* ACPI header */
172 u64 base_address; /* reserved base address*/
173 u64 end_address; /* reserved end address */
174 struct dmar_dev_scope *devices; /* target devices */
175 int devices_cnt; /* target device count */
176 };
177
178 struct dmar_atsr_unit {
179 struct list_head list; /* list of ATSR units */
180 struct acpi_dmar_header *hdr; /* ACPI header */
181 struct dmar_dev_scope *devices; /* target devices */
182 int devices_cnt; /* target device count */
183 u8 include_all:1; /* include all ports */
184 };
185
186 struct dmar_satc_unit {
187 struct list_head list; /* list of SATC units */
188 struct acpi_dmar_header *hdr; /* ACPI header */
189 struct dmar_dev_scope *devices; /* target devices */
190 struct intel_iommu *iommu; /* the corresponding iommu */
191 int devices_cnt; /* target device count */
192 u8 atc_required:1; /* ATS is required */
193 };
194
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198
199 #define for_each_rmrr_units(rmrr) \
200 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
213 static int disable_igfx_iommu;
214
215 #define IDENTMAP_AZALIA 4
216
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219
220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224
225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229
230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 u32 gsts;
233
234 gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 if (gsts & DMA_GSTS_TES)
236 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238
239 static int __init intel_iommu_setup(char *str)
240 {
241 if (!str)
242 return -EINVAL;
243
244 while (*str) {
245 if (!strncmp(str, "on", 2)) {
246 dmar_disabled = 0;
247 pr_info("IOMMU enabled\n");
248 } else if (!strncmp(str, "off", 3)) {
249 dmar_disabled = 1;
250 no_platform_optin = 1;
251 pr_info("IOMMU disabled\n");
252 } else if (!strncmp(str, "igfx_off", 8)) {
253 disable_igfx_iommu = 1;
254 pr_info("Disable GFX device mapping\n");
255 } else if (!strncmp(str, "forcedac", 8)) {
256 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 iommu_dma_forcedac = true;
258 } else if (!strncmp(str, "strict", 6)) {
259 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 iommu_set_dma_strict();
261 } else if (!strncmp(str, "sp_off", 6)) {
262 pr_info("Disable supported super page\n");
263 intel_iommu_superpage = 0;
264 } else if (!strncmp(str, "sm_on", 5)) {
265 pr_info("Enable scalable mode if hardware supports\n");
266 intel_iommu_sm = 1;
267 } else if (!strncmp(str, "sm_off", 6)) {
268 pr_info("Scalable mode is disallowed\n");
269 intel_iommu_sm = 0;
270 } else if (!strncmp(str, "tboot_noforce", 13)) {
271 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 intel_iommu_tboot_noforce = 1;
273 } else {
274 pr_notice("Unknown option - '%s'\n", str);
275 }
276
277 str += strcspn(str, ",");
278 while (*str == ',')
279 str++;
280 }
281
282 return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
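/*
 * Options may be combined on the kernel command line, separated by
 * commas; an illustrative example:
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * enables the IOMMU, requests scalable mode if the hardware supports it
 * and disables GFX device mapping.
 */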
285
286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289
290 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
292
293 /*
294 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296 * the returned SAGAW.
297 */
298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 unsigned long fl_sagaw, sl_sagaw;
301
302 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 sl_sagaw = cap_sagaw(iommu->cap);
304
305 /* Second level only. */
306 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 return sl_sagaw;
308
309 /* First level only. */
310 if (!ecap_slts(iommu->ecap))
311 return fl_sagaw;
312
313 return fl_sagaw & sl_sagaw;
314 }
315
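/*
 * Pick the largest supported AGAW not exceeding max_gaw. As a rough
 * sketch of the encoding used above (per the SAGAW description the
 * comment references): bit 2 corresponds to a 4-level (48-bit) table
 * and bit 3 to a 5-level (57-bit) table, so with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH == 57 the search starts at the 5-level
 * AGAW and falls back to 4-level when bit 3 is not set in sagaw.
 */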
316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
317 {
318 unsigned long sagaw;
319 int agaw;
320
321 sagaw = __iommu_calculate_sagaw(iommu);
322 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
323 if (test_bit(agaw, &sagaw))
324 break;
325 }
326
327 return agaw;
328 }
329
330 /*
331 * Calculate max SAGAW for each iommu.
332 */
333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337
338 /*
339 * Calculate the agaw for each iommu.
340 * "SAGAW" may differ across iommus, so use a default agaw and fall back
341 * to a smaller supported agaw for iommus that don't support the default.
342 */
343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
347
348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 return sm_supported(iommu) ?
351 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353
354 /* Return the super pagesize bitmap if supported. */
355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 unsigned long bitmap = 0;
358
359 /*
360 * 1-level super page supports page size of 2MiB, 2-level super page
361 * supports page size of both 2MiB and 1GiB.
362 */
363 if (domain->iommu_superpage == 1)
364 bitmap |= SZ_2M;
365 else if (domain->iommu_superpage == 2)
366 bitmap |= SZ_2M | SZ_1G;
367
368 return bitmap;
369 }
370
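/*
 * Return the context entry for (bus, devfn), optionally allocating the
 * context table page on demand. In scalable mode each root entry is
 * split into a lower half (devfn 0x00-0x7f) and an upper half (devfn
 * 0x80-0xff), and each context entry occupies two 64-bit slots, hence
 * the devfn adjustment below.
 */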
371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 u8 devfn, int alloc)
373 {
374 struct root_entry *root = &iommu->root_entry[bus];
375 struct context_entry *context;
376 u64 *entry;
377
378 /*
379 * Unless the caller requested to allocate a new entry,
380 * returning a copied context entry makes no sense.
381 */
382 if (!alloc && context_copied(iommu, bus, devfn))
383 return NULL;
384
385 entry = &root->lo;
386 if (sm_supported(iommu)) {
387 if (devfn >= 0x80) {
388 devfn -= 0x80;
389 entry = &root->hi;
390 }
391 devfn *= 2;
392 }
393 if (*entry & 1)
394 context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 else {
396 unsigned long phy_addr;
397 if (!alloc)
398 return NULL;
399
400 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
401 if (!context)
402 return NULL;
403
404 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
405 phy_addr = virt_to_phys((void *)context);
406 *entry = phy_addr | 1;
407 __iommu_flush_cache(iommu, entry, sizeof(*entry));
408 }
409 return &context[devfn];
410 }
411
412 /**
413 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
414 * sub-hierarchy of a candidate PCI-PCI bridge
415 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
416 * @bridge: the candidate PCI-PCI bridge
417 *
418 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
419 */
420 static bool
421 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
422 {
423 struct pci_dev *pdev, *pbridge;
424
425 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
426 return false;
427
428 pdev = to_pci_dev(dev);
429 pbridge = to_pci_dev(bridge);
430
431 if (pbridge->subordinate &&
432 pbridge->subordinate->number <= pdev->bus->number &&
433 pbridge->subordinate->busn_res.end >= pdev->bus->number)
434 return true;
435
436 return false;
437 }
438
439 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
440 {
441 struct dmar_drhd_unit *drhd;
442 u32 vtbar;
443 int rc;
444
445 /* We know that this device on this chipset has its own IOMMU.
446 * If we find it under a different IOMMU, then the BIOS is lying
447 * to us. Hope that the IOMMU for this device is actually
448 * disabled, and it needs no translation...
449 */
450 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
451 if (rc) {
452 /* "can't" happen */
453 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
454 return false;
455 }
456 vtbar &= 0xffff0000;
457
458 /* we know that this iommu should be at offset 0xa000 from vtbar */
459 drhd = dmar_find_matched_drhd_unit(pdev);
460 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
461 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
462 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
463 return true;
464 }
465
466 return false;
467 }
468
469 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
470 {
471 if (!iommu || iommu->drhd->ignored)
472 return true;
473
474 if (dev_is_pci(dev)) {
475 struct pci_dev *pdev = to_pci_dev(dev);
476
477 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
478 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
479 quirk_ioat_snb_local_iommu(pdev))
480 return true;
481 }
482
483 return false;
484 }
485
486 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
487 {
488 struct dmar_drhd_unit *drhd = NULL;
489 struct pci_dev *pdev = NULL;
490 struct intel_iommu *iommu;
491 struct device *tmp;
492 u16 segment = 0;
493 int i;
494
495 if (!dev)
496 return NULL;
497
498 if (dev_is_pci(dev)) {
499 struct pci_dev *pf_pdev;
500
501 pdev = pci_real_dma_dev(to_pci_dev(dev));
502
503 /* VFs aren't listed in scope tables; we need to look up
504 * the PF instead to find the IOMMU. */
505 pf_pdev = pci_physfn(pdev);
506 dev = &pf_pdev->dev;
507 segment = pci_domain_nr(pdev->bus);
508 } else if (has_acpi_companion(dev))
509 dev = &ACPI_COMPANION(dev)->dev;
510
511 rcu_read_lock();
512 for_each_iommu(iommu, drhd) {
513 if (pdev && segment != drhd->segment)
514 continue;
515
516 for_each_active_dev_scope(drhd->devices,
517 drhd->devices_cnt, i, tmp) {
518 if (tmp == dev) {
519 /* For a VF use its original BDF# not that of the PF
520 * which we used for the IOMMU lookup. Strictly speaking
521 * we could do this for all PCI devices; we only need to
522 * get the BDF# from the scope table for ACPI matches. */
523 if (pdev && pdev->is_virtfn)
524 goto got_pdev;
525
526 if (bus && devfn) {
527 *bus = drhd->devices[i].bus;
528 *devfn = drhd->devices[i].devfn;
529 }
530 goto out;
531 }
532
533 if (is_downstream_to_pci_bridge(dev, tmp))
534 goto got_pdev;
535 }
536
537 if (pdev && drhd->include_all) {
538 got_pdev:
539 if (bus && devfn) {
540 *bus = pdev->bus->number;
541 *devfn = pdev->devfn;
542 }
543 goto out;
544 }
545 }
546 iommu = NULL;
547 out:
548 if (iommu_is_dummy(iommu, dev))
549 iommu = NULL;
550
551 rcu_read_unlock();
552
553 return iommu;
554 }
555
556 static void domain_flush_cache(struct dmar_domain *domain,
557 void *addr, int size)
558 {
559 if (!domain->iommu_coherency)
560 clflush_cache_range(addr, size);
561 }
562
563 static void free_context_table(struct intel_iommu *iommu)
564 {
565 struct context_entry *context;
566 int i;
567
568 if (!iommu->root_entry)
569 return;
570
571 for (i = 0; i < ROOT_ENTRY_NR; i++) {
572 context = iommu_context_addr(iommu, i, 0, 0);
573 if (context)
574 iommu_free_page(context);
575
576 if (!sm_supported(iommu))
577 continue;
578
579 context = iommu_context_addr(iommu, i, 0x80, 0);
580 if (context)
581 iommu_free_page(context);
582 }
583
584 iommu_free_page(iommu->root_entry);
585 iommu->root_entry = NULL;
586 }
587
588 #ifdef CONFIG_DMAR_DEBUG
589 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
590 u8 bus, u8 devfn, struct dma_pte *parent, int level)
591 {
592 struct dma_pte *pte;
593 int offset;
594
595 while (1) {
596 offset = pfn_level_offset(pfn, level);
597 pte = &parent[offset];
598
599 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
600
601 if (!dma_pte_present(pte)) {
602 pr_info("page table not present at level %d\n", level - 1);
603 break;
604 }
605
606 if (level == 1 || dma_pte_superpage(pte))
607 break;
608
609 parent = phys_to_virt(dma_pte_addr(pte));
610 level--;
611 }
612 }
613
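/*
 * Dump the translation structures for a faulting (source_id, addr,
 * pasid) tuple: the root and context entries, and in scalable mode the
 * PASID directory and PASID table entries, before walking the I/O page
 * table itself via pgtable_walk().
 */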
614 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
615 unsigned long long addr, u32 pasid)
616 {
617 struct pasid_dir_entry *dir, *pde;
618 struct pasid_entry *entries, *pte;
619 struct context_entry *ctx_entry;
620 struct root_entry *rt_entry;
621 int i, dir_index, index, level;
622 u8 devfn = source_id & 0xff;
623 u8 bus = source_id >> 8;
624 struct dma_pte *pgtable;
625
626 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
627
628 /* root entry dump */
629 if (!iommu->root_entry) {
630 pr_info("root table is not present\n");
631 return;
632 }
633 rt_entry = &iommu->root_entry[bus];
634
635 if (sm_supported(iommu))
636 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
637 rt_entry->hi, rt_entry->lo);
638 else
639 pr_info("root entry: 0x%016llx", rt_entry->lo);
640
641 /* context entry dump */
642 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
643 if (!ctx_entry) {
644 pr_info("context table is not present\n");
645 return;
646 }
647
648 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
649 ctx_entry->hi, ctx_entry->lo);
650
651 /* legacy mode does not require PASID entries */
652 if (!sm_supported(iommu)) {
653 if (!context_present(ctx_entry)) {
654 pr_info("legacy mode page table is not present\n");
655 return;
656 }
657 level = agaw_to_level(ctx_entry->hi & 7);
658 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
659 goto pgtable_walk;
660 }
661
662 if (!context_present(ctx_entry)) {
663 pr_info("pasid directory table is not present\n");
664 return;
665 }
666
667 /* get the pointer to pasid directory entry */
668 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
669
670 /* For request-without-pasid, get the pasid from context entry */
671 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
672 pasid = IOMMU_NO_PASID;
673
674 dir_index = pasid >> PASID_PDE_SHIFT;
675 pde = &dir[dir_index];
676 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
677
678 /* get the pointer to the pasid table entry */
679 entries = get_pasid_table_from_pde(pde);
680 if (!entries) {
681 pr_info("pasid table is not present\n");
682 return;
683 }
684 index = pasid & PASID_PTE_MASK;
685 pte = &entries[index];
686 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
687 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
688
689 if (!pasid_pte_is_present(pte)) {
690 pr_info("scalable mode page table is not present\n");
691 return;
692 }
693
694 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
695 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
696 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
697 } else {
698 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
699 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
700 }
701
702 pgtable_walk:
703 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
704 }
705 #endif
706
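/*
 * Walk (and, if needed, build) the I/O page table down to *target_level
 * for the given pfn. Missing intermediate tables are allocated and
 * installed with try_cmpxchg64() so that concurrent mappers never
 * install duplicates; if *target_level is 0, the walk stops at the
 * first superpage or non-present entry and reports the level it ended
 * on back through *target_level.
 */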
707 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
708 unsigned long pfn, int *target_level,
709 gfp_t gfp)
710 {
711 struct dma_pte *parent, *pte;
712 int level = agaw_to_level(domain->agaw);
713 int offset;
714
715 if (!domain_pfn_supported(domain, pfn))
716 /* Address beyond IOMMU's addressing capabilities. */
717 return NULL;
718
719 parent = domain->pgd;
720
721 while (1) {
722 void *tmp_page;
723
724 offset = pfn_level_offset(pfn, level);
725 pte = &parent[offset];
726 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
727 break;
728 if (level == *target_level)
729 break;
730
731 if (!dma_pte_present(pte)) {
732 uint64_t pteval, tmp;
733
734 tmp_page = iommu_alloc_page_node(domain->nid, gfp);
735
736 if (!tmp_page)
737 return NULL;
738
739 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
740 pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
741 DMA_PTE_WRITE;
742 if (domain->use_first_level)
743 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
744
745 tmp = 0ULL;
746 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
747 /* Someone else set it while we were thinking; use theirs. */
748 iommu_free_page(tmp_page);
749 else
750 domain_flush_cache(domain, pte, sizeof(*pte));
751 }
752 if (level == 1)
753 break;
754
755 parent = phys_to_virt(dma_pte_addr(pte));
756 level--;
757 }
758
759 if (!*target_level)
760 *target_level = level;
761
762 return pte;
763 }
764
765 /* return address's pte at specific level */
766 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
767 unsigned long pfn,
768 int level, int *large_page)
769 {
770 struct dma_pte *parent, *pte;
771 int total = agaw_to_level(domain->agaw);
772 int offset;
773
774 parent = domain->pgd;
775 while (level <= total) {
776 offset = pfn_level_offset(pfn, total);
777 pte = &parent[offset];
778 if (level == total)
779 return pte;
780
781 if (!dma_pte_present(pte)) {
782 *large_page = total;
783 break;
784 }
785
786 if (dma_pte_superpage(pte)) {
787 *large_page = total;
788 return pte;
789 }
790
791 parent = phys_to_virt(dma_pte_addr(pte));
792 total--;
793 }
794 return NULL;
795 }
796
797 /* clear last level pte, a tlb flush should be followed */
798 static void dma_pte_clear_range(struct dmar_domain *domain,
799 unsigned long start_pfn,
800 unsigned long last_pfn)
801 {
802 unsigned int large_page;
803 struct dma_pte *first_pte, *pte;
804
805 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
806 WARN_ON(start_pfn > last_pfn))
807 return;
808
809 /* we don't need lock here; nobody else touches the iova range */
810 do {
811 large_page = 1;
812 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
813 if (!pte) {
814 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
815 continue;
816 }
817 do {
818 dma_clear_pte(pte);
819 start_pfn += lvl_to_nr_pages(large_page);
820 pte++;
821 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
822
823 domain_flush_cache(domain, first_pte,
824 (void *)pte - (void *)first_pte);
825
826 } while (start_pfn && start_pfn <= last_pfn);
827 }
828
829 static void dma_pte_free_level(struct dmar_domain *domain, int level,
830 int retain_level, struct dma_pte *pte,
831 unsigned long pfn, unsigned long start_pfn,
832 unsigned long last_pfn)
833 {
834 pfn = max(start_pfn, pfn);
835 pte = &pte[pfn_level_offset(pfn, level)];
836
837 do {
838 unsigned long level_pfn;
839 struct dma_pte *level_pte;
840
841 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
842 goto next;
843
844 level_pfn = pfn & level_mask(level);
845 level_pte = phys_to_virt(dma_pte_addr(pte));
846
847 if (level > 2) {
848 dma_pte_free_level(domain, level - 1, retain_level,
849 level_pte, level_pfn, start_pfn,
850 last_pfn);
851 }
852
853 /*
854 * Free the page table if we're below the level we want to
855 * retain and the range covers the entire table.
856 */
857 if (level < retain_level && !(start_pfn > level_pfn ||
858 last_pfn < level_pfn + level_size(level) - 1)) {
859 dma_clear_pte(pte);
860 domain_flush_cache(domain, pte, sizeof(*pte));
861 iommu_free_page(level_pte);
862 }
863 next:
864 pfn += level_size(level);
865 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
866 }
867
868 /*
869 * clear last level (leaf) ptes and free page table pages below the
870 * level we wish to keep intact.
871 */
872 static void dma_pte_free_pagetable(struct dmar_domain *domain,
873 unsigned long start_pfn,
874 unsigned long last_pfn,
875 int retain_level)
876 {
877 dma_pte_clear_range(domain, start_pfn, last_pfn);
878
879 /* We don't need lock here; nobody else touches the iova range */
880 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
881 domain->pgd, 0, start_pfn, last_pfn);
882
883 /* free pgd */
884 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
885 iommu_free_page(domain->pgd);
886 domain->pgd = NULL;
887 }
888 }
889
890 /* When a page at a given level is being unlinked from its parent, we don't
891 need to *modify* it at all. All we need to do is make a list of all the
892 pages which can be freed just as soon as we've flushed the IOTLB and we
893 know the hardware page-walk will no longer touch them.
894 The 'pte' argument is the *parent* PTE, pointing to the page that is to
895 be freed. */
896 static void dma_pte_list_pagetables(struct dmar_domain *domain,
897 int level, struct dma_pte *pte,
898 struct list_head *freelist)
899 {
900 struct page *pg;
901
902 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
903 list_add_tail(&pg->lru, freelist);
904
905 if (level == 1)
906 return;
907
908 pte = page_address(pg);
909 do {
910 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 pte++;
913 } while (!first_pte_in_page(pte));
914 }
915
916 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
917 struct dma_pte *pte, unsigned long pfn,
918 unsigned long start_pfn, unsigned long last_pfn,
919 struct list_head *freelist)
920 {
921 struct dma_pte *first_pte = NULL, *last_pte = NULL;
922
923 pfn = max(start_pfn, pfn);
924 pte = &pte[pfn_level_offset(pfn, level)];
925
926 do {
927 unsigned long level_pfn = pfn & level_mask(level);
928
929 if (!dma_pte_present(pte))
930 goto next;
931
932 /* If range covers entire pagetable, free it */
933 if (start_pfn <= level_pfn &&
934 last_pfn >= level_pfn + level_size(level) - 1) {
935 /* These subordinate page tables are going away entirely. Don't
936 bother to clear them; we're just going to *free* them. */
937 if (level > 1 && !dma_pte_superpage(pte))
938 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
939
940 dma_clear_pte(pte);
941 if (!first_pte)
942 first_pte = pte;
943 last_pte = pte;
944 } else if (level > 1) {
945 /* Recurse down into a level that isn't *entirely* obsolete */
946 dma_pte_clear_level(domain, level - 1,
947 phys_to_virt(dma_pte_addr(pte)),
948 level_pfn, start_pfn, last_pfn,
949 freelist);
950 }
951 next:
952 pfn = level_pfn + level_size(level);
953 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954
955 if (first_pte)
956 domain_flush_cache(domain, first_pte,
957 (void *)++last_pte - (void *)first_pte);
958 }
959
960 /* We can't just free the pages because the IOMMU may still be walking
961 the page tables, and may have cached the intermediate levels. The
962 pages can only be freed after the IOTLB flush has been done. */
963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 unsigned long last_pfn, struct list_head *freelist)
965 {
966 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
967 WARN_ON(start_pfn > last_pfn))
968 return;
969
970 /* we don't need lock here; nobody else touches the iova range */
971 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
972 domain->pgd, 0, start_pfn, last_pfn, freelist);
973
974 /* free pgd */
975 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
976 struct page *pgd_page = virt_to_page(domain->pgd);
977 list_add_tail(&pgd_page->lru, freelist);
978 domain->pgd = NULL;
979 }
980 }
981
982 /* iommu handling */
983 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984 {
985 struct root_entry *root;
986
987 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
988 if (!root) {
989 pr_err("Allocating root entry for %s failed\n",
990 iommu->name);
991 return -ENOMEM;
992 }
993
994 __iommu_flush_cache(iommu, root, ROOT_SIZE);
995 iommu->root_entry = root;
996
997 return 0;
998 }
999
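/*
 * Program the root (or scalable-mode root) table address and latch it
 * with the SRTP command, waiting for the RTPS status bit. Unless the
 * hardware advertises ESRTPS (self-invalidation as part of SRTP),
 * explicitly flush the context cache, the PASID cache (scalable mode)
 * and the IOTLB afterwards.
 */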
1000 static void iommu_set_root_entry(struct intel_iommu *iommu)
1001 {
1002 u64 addr;
1003 u32 sts;
1004 unsigned long flag;
1005
1006 addr = virt_to_phys(iommu->root_entry);
1007 if (sm_supported(iommu))
1008 addr |= DMA_RTADDR_SMT;
1009
1010 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012
1013 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014
1015 /* Make sure hardware completes it */
1016 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 readl, (sts & DMA_GSTS_RTPS), sts);
1018
1019 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020
1021 /*
1022 * Hardware invalidates all DMA remapping hardware translation
1023 * caches as part of SRTP flow.
1024 */
1025 if (cap_esrtps(iommu->cap))
1026 return;
1027
1028 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 if (sm_supported(iommu))
1030 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032 }
1033
1034 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035 {
1036 u32 val;
1037 unsigned long flag;
1038
1039 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 return;
1041
1042 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044
1045 /* Make sure hardware completes it */
1046 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 readl, (!(val & DMA_GSTS_WBFS)), val);
1048
1049 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 }
1051
1052 /* return value determines if we need a write buffer flush */
1053 static void __iommu_flush_context(struct intel_iommu *iommu,
1054 u16 did, u16 source_id, u8 function_mask,
1055 u64 type)
1056 {
1057 u64 val = 0;
1058 unsigned long flag;
1059
1060 switch (type) {
1061 case DMA_CCMD_GLOBAL_INVL:
1062 val = DMA_CCMD_GLOBAL_INVL;
1063 break;
1064 case DMA_CCMD_DOMAIN_INVL:
1065 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 break;
1067 case DMA_CCMD_DEVICE_INVL:
1068 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 break;
1071 default:
1072 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 iommu->name, type);
1074 return;
1075 }
1076 val |= DMA_CCMD_ICC;
1077
1078 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080
1081 /* Make sure hardware completes it */
1082 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084
1085 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086 }
1087
1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 unsigned int size_order, u64 type)
1090 {
1091 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 u64 val = 0, val_iva = 0;
1093 unsigned long flag;
1094
1095 switch (type) {
1096 case DMA_TLB_GLOBAL_FLUSH:
1097 /* global flush doesn't need to set IVA_REG */
1098 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 break;
1100 case DMA_TLB_DSI_FLUSH:
1101 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 break;
1103 case DMA_TLB_PSI_FLUSH:
1104 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 /* IH bit is passed in as part of address */
1106 val_iva = size_order | addr;
1107 break;
1108 default:
1109 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 iommu->name, type);
1111 return;
1112 }
1113
1114 if (cap_write_drain(iommu->cap))
1115 val |= DMA_TLB_WRITE_DRAIN;
1116
1117 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 /* Note: Only uses first TLB reg currently */
1119 if (val_iva)
1120 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122
1123 /* Make sure hardware completes it */
1124 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126
1127 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128
1129 /* check IOTLB invalidation granularity */
1130 if (DMA_TLB_IAIG(val) == 0)
1131 pr_err("Flush IOTLB failed\n");
1132 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 (unsigned long long)DMA_TLB_IIRG(type),
1135 (unsigned long long)DMA_TLB_IAIG(val));
1136 }
1137
1138 static struct device_domain_info *
1139 domain_lookup_dev_info(struct dmar_domain *domain,
1140 struct intel_iommu *iommu, u8 bus, u8 devfn)
1141 {
1142 struct device_domain_info *info;
1143 unsigned long flags;
1144
1145 spin_lock_irqsave(&domain->lock, flags);
1146 list_for_each_entry(info, &domain->devices, link) {
1147 if (info->iommu == iommu && info->bus == bus &&
1148 info->devfn == devfn) {
1149 spin_unlock_irqrestore(&domain->lock, flags);
1150 return info;
1151 }
1152 }
1153 spin_unlock_irqrestore(&domain->lock, flags);
1154
1155 return NULL;
1156 }
1157
1158 /*
1159 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161 * check because it applies only to the built-in QAT devices and it doesn't
1162 * grant additional privileges.
1163 */
1164 #define BUGGY_QAT_DEVID_MASK 0x4940
1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166 {
1167 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 return false;
1169
1170 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 return false;
1172
1173 return true;
1174 }
1175
1176 static void iommu_enable_pci_ats(struct device_domain_info *info)
1177 {
1178 struct pci_dev *pdev;
1179
1180 if (!info->ats_supported)
1181 return;
1182
1183 pdev = to_pci_dev(info->dev);
1184 if (!pci_ats_page_aligned(pdev))
1185 return;
1186
1187 if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1188 info->ats_enabled = 1;
1189 }
1190
1191 static void iommu_disable_pci_ats(struct device_domain_info *info)
1192 {
1193 if (!info->ats_enabled)
1194 return;
1195
1196 pci_disable_ats(to_pci_dev(info->dev));
1197 info->ats_enabled = 0;
1198 }
1199
1200 static void iommu_enable_pci_pri(struct device_domain_info *info)
1201 {
1202 struct pci_dev *pdev;
1203
1204 if (!info->ats_enabled || !info->pri_supported)
1205 return;
1206
1207 pdev = to_pci_dev(info->dev);
1208 /* PASID is required in PRG Response Message. */
1209 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1210 return;
1211
1212 if (pci_reset_pri(pdev))
1213 return;
1214
1215 if (!pci_enable_pri(pdev, PRQ_DEPTH))
1216 info->pri_enabled = 1;
1217 }
1218
1219 static void iommu_disable_pci_pri(struct device_domain_info *info)
1220 {
1221 if (!info->pri_enabled)
1222 return;
1223
1224 if (WARN_ON(info->iopf_refcount))
1225 iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
1226
1227 pci_disable_pri(to_pci_dev(info->dev));
1228 info->pri_enabled = 0;
1229 }
1230
1231 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1232 {
1233 cache_tag_flush_all(to_dmar_domain(domain));
1234 }
1235
1236 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1237 {
1238 u32 pmen;
1239 unsigned long flags;
1240
1241 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1242 return;
1243
1244 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1245 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1246 pmen &= ~DMA_PMEN_EPM;
1247 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1248
1249 /* wait for the protected region status bit to clear */
1250 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1251 readl, !(pmen & DMA_PMEN_PRS), pmen);
1252
1253 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1254 }
1255
1256 static void iommu_enable_translation(struct intel_iommu *iommu)
1257 {
1258 u32 sts;
1259 unsigned long flags;
1260
1261 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1262 iommu->gcmd |= DMA_GCMD_TE;
1263 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1264
1265 /* Make sure hardware completes it */
1266 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1267 readl, (sts & DMA_GSTS_TES), sts);
1268
1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1270 }
1271
1272 static void iommu_disable_translation(struct intel_iommu *iommu)
1273 {
1274 u32 sts;
1275 unsigned long flag;
1276
1277 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1278 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1279 return;
1280
1281 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 iommu->gcmd &= ~DMA_GCMD_TE;
1283 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1284
1285 /* Make sure hardware completes it */
1286 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1287 readl, (!(sts & DMA_GSTS_TES)), sts);
1288
1289 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1290 }
1291
1292 static int iommu_init_domains(struct intel_iommu *iommu)
1293 {
1294 u32 ndomains;
1295
1296 ndomains = cap_ndoms(iommu->cap);
1297 pr_debug("%s: Number of Domains supported <%d>\n",
1298 iommu->name, ndomains);
1299
1300 spin_lock_init(&iommu->lock);
1301
1302 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1303 if (!iommu->domain_ids)
1304 return -ENOMEM;
1305
1306 /*
1307 * If Caching mode is set, then invalid translations are tagged
1308 * with domain-id 0, hence we need to pre-allocate it. We also
1309 * use domain-id 0 as a marker for non-allocated domain-id, so
1310 * make sure it is not used for a real domain.
1311 */
1312 set_bit(0, iommu->domain_ids);
1313
1314 /*
1315 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1316 * entry for first-level or pass-through translation modes should
1317 * be programmed with a domain id different from those used for
1318 * second-level or nested translation. We reserve a domain id for
1319 * this purpose. This domain id is also used for identity domain
1320 * in legacy mode.
1321 */
1322 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1323
1324 return 0;
1325 }
1326
1327 static void disable_dmar_iommu(struct intel_iommu *iommu)
1328 {
1329 if (!iommu->domain_ids)
1330 return;
1331
1332 /*
1333 * All iommu domains must have been detached from the devices,
1334 * hence there should be no domain IDs in use.
1335 */
1336 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1337 > NUM_RESERVED_DID))
1338 return;
1339
1340 if (iommu->gcmd & DMA_GCMD_TE)
1341 iommu_disable_translation(iommu);
1342 }
1343
1344 static void free_dmar_iommu(struct intel_iommu *iommu)
1345 {
1346 if (iommu->domain_ids) {
1347 bitmap_free(iommu->domain_ids);
1348 iommu->domain_ids = NULL;
1349 }
1350
1351 if (iommu->copied_tables) {
1352 bitmap_free(iommu->copied_tables);
1353 iommu->copied_tables = NULL;
1354 }
1355
1356 /* free context mapping */
1357 free_context_table(iommu);
1358
1359 if (ecap_prs(iommu->ecap))
1360 intel_iommu_finish_prq(iommu);
1361 }
1362
1363 /*
1364 * Check and return whether first level is used by default for
1365 * DMA translation.
1366 */
1367 static bool first_level_by_default(struct intel_iommu *iommu)
1368 {
1369 /* Only SL is available in legacy mode */
1370 if (!sm_supported(iommu))
1371 return false;
1372
1373 /* Only one level (either FL or SL) is available; just use it */
1374 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1375 return ecap_flts(iommu->ecap);
1376
1377 return true;
1378 }
1379
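/*
 * Attach a domain to an IOMMU: allocate (or refcount) the per-IOMMU
 * domain ID and record it in domain->iommu_array, indexed by the
 * IOMMU's seq_id. SVA domains are not given a domain ID here and
 * return early.
 */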
1380 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1381 {
1382 struct iommu_domain_info *info, *curr;
1383 unsigned long ndomains;
1384 int num, ret = -ENOSPC;
1385
1386 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1387 return 0;
1388
1389 info = kzalloc(sizeof(*info), GFP_KERNEL);
1390 if (!info)
1391 return -ENOMEM;
1392
1393 spin_lock(&iommu->lock);
1394 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1395 if (curr) {
1396 curr->refcnt++;
1397 spin_unlock(&iommu->lock);
1398 kfree(info);
1399 return 0;
1400 }
1401
1402 ndomains = cap_ndoms(iommu->cap);
1403 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1404 if (num >= ndomains) {
1405 pr_err("%s: No free domain ids\n", iommu->name);
1406 goto err_unlock;
1407 }
1408
1409 set_bit(num, iommu->domain_ids);
1410 info->refcnt = 1;
1411 info->did = num;
1412 info->iommu = iommu;
1413 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1414 NULL, info, GFP_ATOMIC);
1415 if (curr) {
1416 ret = xa_err(curr) ? : -EBUSY;
1417 goto err_clear;
1418 }
1419
1420 spin_unlock(&iommu->lock);
1421 return 0;
1422
1423 err_clear:
1424 clear_bit(info->did, iommu->domain_ids);
1425 err_unlock:
1426 spin_unlock(&iommu->lock);
1427 kfree(info);
1428 return ret;
1429 }
1430
1431 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1432 {
1433 struct iommu_domain_info *info;
1434
1435 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1436 return;
1437
1438 spin_lock(&iommu->lock);
1439 info = xa_load(&domain->iommu_array, iommu->seq_id);
1440 if (--info->refcnt == 0) {
1441 clear_bit(info->did, iommu->domain_ids);
1442 xa_erase(&domain->iommu_array, iommu->seq_id);
1443 domain->nid = NUMA_NO_NODE;
1444 kfree(info);
1445 }
1446 spin_unlock(&iommu->lock);
1447 }
1448
1449 static void domain_exit(struct dmar_domain *domain)
1450 {
1451 if (domain->pgd) {
1452 LIST_HEAD(freelist);
1453
1454 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1455 iommu_put_pages_list(&freelist);
1456 }
1457
1458 if (WARN_ON(!list_empty(&domain->devices)))
1459 return;
1460
1461 kfree(domain->qi_batch);
1462 kfree(domain);
1463 }
1464
1465 /*
1466 * For kdump cases, old valid entries may be cached due to the
1467 * in-flight DMA and copied pgtable, but there is no unmapping
1468 * behaviour for them, thus we need an explicit cache flush for
1469 * the newly-mapped device. For kdump, at this point, the device
1470 * is supposed to finish reset at its driver probe stage, so no
1471 * in-flight DMA will exist, and we don't need to worry anymore
1472 * hereafter.
1473 */
1474 static void copied_context_tear_down(struct intel_iommu *iommu,
1475 struct context_entry *context,
1476 u8 bus, u8 devfn)
1477 {
1478 u16 did_old;
1479
1480 if (!context_copied(iommu, bus, devfn))
1481 return;
1482
1483 assert_spin_locked(&iommu->lock);
1484
1485 did_old = context_domain_id(context);
1486 context_clear_entry(context);
1487
1488 if (did_old < cap_ndoms(iommu->cap)) {
1489 iommu->flush.flush_context(iommu, did_old,
1490 PCI_DEVID(bus, devfn),
1491 DMA_CCMD_MASK_NOBIT,
1492 DMA_CCMD_DEVICE_INVL);
1493 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1494 DMA_TLB_DSI_FLUSH);
1495 }
1496
1497 clear_context_copied(iommu, bus, devfn);
1498 }
1499
1500 /*
1501 * It's a non-present to present mapping. If hardware doesn't cache
1502 * non-present entries we only need to flush the write-buffer. If it
1503 * _does_ cache non-present entries, then it does so in the special
1504 * domain #0, which we have to flush:
1505 */
1506 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1507 u8 bus, u8 devfn)
1508 {
1509 if (cap_caching_mode(iommu->cap)) {
1510 iommu->flush.flush_context(iommu, 0,
1511 PCI_DEVID(bus, devfn),
1512 DMA_CCMD_MASK_NOBIT,
1513 DMA_CCMD_DEVICE_INVL);
1514 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1515 } else {
1516 iommu_flush_write_buffer(iommu);
1517 }
1518 }
1519
1520 static int domain_context_mapping_one(struct dmar_domain *domain,
1521 struct intel_iommu *iommu,
1522 u8 bus, u8 devfn)
1523 {
1524 struct device_domain_info *info =
1525 domain_lookup_dev_info(domain, iommu, bus, devfn);
1526 u16 did = domain_id_iommu(domain, iommu);
1527 int translation = CONTEXT_TT_MULTI_LEVEL;
1528 struct dma_pte *pgd = domain->pgd;
1529 struct context_entry *context;
1530 int ret;
1531
1532 pr_debug("Set context mapping for %02x:%02x.%d\n",
1533 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1534
1535 spin_lock(&iommu->lock);
1536 ret = -ENOMEM;
1537 context = iommu_context_addr(iommu, bus, devfn, 1);
1538 if (!context)
1539 goto out_unlock;
1540
1541 ret = 0;
1542 if (context_present(context) && !context_copied(iommu, bus, devfn))
1543 goto out_unlock;
1544
1545 copied_context_tear_down(iommu, context, bus, devfn);
1546 context_clear_entry(context);
1547 context_set_domain_id(context, did);
1548
1549 if (info && info->ats_supported)
1550 translation = CONTEXT_TT_DEV_IOTLB;
1551 else
1552 translation = CONTEXT_TT_MULTI_LEVEL;
1553
1554 context_set_address_root(context, virt_to_phys(pgd));
1555 context_set_address_width(context, domain->agaw);
1556 context_set_translation_type(context, translation);
1557 context_set_fault_enable(context);
1558 context_set_present(context);
1559 if (!ecap_coherent(iommu->ecap))
1560 clflush_cache_range(context, sizeof(*context));
1561 context_present_cache_flush(iommu, did, bus, devfn);
1562 ret = 0;
1563
1564 out_unlock:
1565 spin_unlock(&iommu->lock);
1566
1567 return ret;
1568 }
1569
1570 static int domain_context_mapping_cb(struct pci_dev *pdev,
1571 u16 alias, void *opaque)
1572 {
1573 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1574 struct intel_iommu *iommu = info->iommu;
1575 struct dmar_domain *domain = opaque;
1576
1577 return domain_context_mapping_one(domain, iommu,
1578 PCI_BUS_NUM(alias), alias & 0xff);
1579 }
1580
1581 static int
1582 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1583 {
1584 struct device_domain_info *info = dev_iommu_priv_get(dev);
1585 struct intel_iommu *iommu = info->iommu;
1586 u8 bus = info->bus, devfn = info->devfn;
1587 int ret;
1588
1589 if (!dev_is_pci(dev))
1590 return domain_context_mapping_one(domain, iommu, bus, devfn);
1591
1592 ret = pci_for_each_dma_alias(to_pci_dev(dev),
1593 domain_context_mapping_cb, domain);
1594 if (ret)
1595 return ret;
1596
1597 iommu_enable_pci_ats(info);
1598
1599 return 0;
1600 }
1601
1602 /* Return largest possible superpage level for a given mapping */
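/*
 * Illustrative example: with iov_pfn == phy_pfn == 0x200 (both 2MiB
 * aligned) and pages >= 512, the merged pfn has its low 9 bits clear,
 * so the loop below advances one stride and returns level 2 (a 2MiB
 * superpage), assuming domain->iommu_superpage allows it.
 */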
1603 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1604 unsigned long phy_pfn, unsigned long pages)
1605 {
1606 int support, level = 1;
1607 unsigned long pfnmerge;
1608
1609 support = domain->iommu_superpage;
1610
1611 /* To use a large page, the virtual *and* physical addresses
1612 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1613 of them will mean we have to use smaller pages. So just
1614 merge them and check both at once. */
1615 pfnmerge = iov_pfn | phy_pfn;
1616
1617 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1618 pages >>= VTD_STRIDE_SHIFT;
1619 if (!pages)
1620 break;
1621 pfnmerge >>= VTD_STRIDE_SHIFT;
1622 level++;
1623 support--;
1624 }
1625 return level;
1626 }
1627
1628 /*
1629 * Ensure that old small page tables are removed to make room for superpage(s).
1630 * We're going to add new large pages, so make sure we don't remove their parent
1631 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1632 */
1633 static void switch_to_super_page(struct dmar_domain *domain,
1634 unsigned long start_pfn,
1635 unsigned long end_pfn, int level)
1636 {
1637 unsigned long lvl_pages = lvl_to_nr_pages(level);
1638 struct dma_pte *pte = NULL;
1639
1640 while (start_pfn <= end_pfn) {
1641 if (!pte)
1642 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1643 GFP_ATOMIC);
1644
1645 if (dma_pte_present(pte)) {
1646 dma_pte_free_pagetable(domain, start_pfn,
1647 start_pfn + lvl_pages - 1,
1648 level + 1);
1649
1650 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1651 end_pfn << VTD_PAGE_SHIFT, 0);
1652 }
1653
1654 pte++;
1655 start_pfn += lvl_pages;
1656 if (first_pte_in_page(pte))
1657 pte = NULL;
1658 }
1659 }
1660
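/*
 * Install the page table entries that map nr_pages pages starting at
 * iov_pfn to phys_pfn with the given protection bits, switching to
 * superpages whenever the alignment and remaining length allow it.
 */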
1661 static int
1662 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1663 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1664 gfp_t gfp)
1665 {
1666 struct dma_pte *first_pte = NULL, *pte = NULL;
1667 unsigned int largepage_lvl = 0;
1668 unsigned long lvl_pages = 0;
1669 phys_addr_t pteval;
1670 u64 attr;
1671
1672 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1673 return -EINVAL;
1674
1675 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1676 return -EINVAL;
1677
1678 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1679 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1680 return -EINVAL;
1681 }
1682
1683 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1684 attr |= DMA_FL_PTE_PRESENT;
1685 if (domain->use_first_level) {
1686 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1687 if (prot & DMA_PTE_WRITE)
1688 attr |= DMA_FL_PTE_DIRTY;
1689 }
1690
1691 domain->has_mappings = true;
1692
1693 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1694
1695 while (nr_pages > 0) {
1696 uint64_t tmp;
1697
1698 if (!pte) {
1699 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1700 phys_pfn, nr_pages);
1701
1702 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1703 gfp);
1704 if (!pte)
1705 return -ENOMEM;
1706 first_pte = pte;
1707
1708 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1709
1710 /* It is a large page */
1711 if (largepage_lvl > 1) {
1712 unsigned long end_pfn;
1713 unsigned long pages_to_remove;
1714
1715 pteval |= DMA_PTE_LARGE_PAGE;
1716 pages_to_remove = min_t(unsigned long, nr_pages,
1717 nr_pte_to_next_page(pte) * lvl_pages);
1718 end_pfn = iov_pfn + pages_to_remove - 1;
1719 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1720 } else {
1721 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1722 }
1723
1724 }
1725 /* We don't need lock here, nobody else
1726 * touches the iova range
1727 */
1728 tmp = 0ULL;
1729 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1730 static int dumps = 5;
1731 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1732 iov_pfn, tmp, (unsigned long long)pteval);
1733 if (dumps) {
1734 dumps--;
1735 debug_dma_dump_mappings(NULL);
1736 }
1737 WARN_ON(1);
1738 }
1739
1740 nr_pages -= lvl_pages;
1741 iov_pfn += lvl_pages;
1742 phys_pfn += lvl_pages;
1743 pteval += lvl_pages * VTD_PAGE_SIZE;
1744
1745 /* If the next PTE would be the first in a new page, then we
1746 * need to flush the cache on the entries we've just written.
1747 * And then we'll need to recalculate 'pte', so clear it and
1748 * let it get set again in the if (!pte) block above.
1749 *
1750 * If we're done (!nr_pages) we need to flush the cache too.
1751 *
1752 * Also if we've been setting superpages, we may need to
1753 * recalculate 'pte' and switch back to smaller pages for the
1754 * end of the mapping, if the trailing size is not enough to
1755 * use another superpage (i.e. nr_pages < lvl_pages).
1756 */
1757 pte++;
1758 if (!nr_pages || first_pte_in_page(pte) ||
1759 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1760 domain_flush_cache(domain, first_pte,
1761 (void *)pte - (void *)first_pte);
1762 pte = NULL;
1763 }
1764 }
1765
1766 return 0;
1767 }
1768
1769 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1770 {
1771 struct intel_iommu *iommu = info->iommu;
1772 struct context_entry *context;
1773 u16 did;
1774
1775 spin_lock(&iommu->lock);
1776 context = iommu_context_addr(iommu, bus, devfn, 0);
1777 if (!context) {
1778 spin_unlock(&iommu->lock);
1779 return;
1780 }
1781
1782 did = context_domain_id(context);
1783 context_clear_entry(context);
1784 __iommu_flush_cache(iommu, context, sizeof(*context));
1785 spin_unlock(&iommu->lock);
1786 intel_context_flush_no_pasid(info, context, did);
1787 }
1788
1789 int __domain_setup_first_level(struct intel_iommu *iommu,
1790 struct device *dev, ioasid_t pasid,
1791 u16 did, pgd_t *pgd, int flags,
1792 struct iommu_domain *old)
1793 {
1794 if (!old)
1795 return intel_pasid_setup_first_level(iommu, dev, pgd,
1796 pasid, did, flags);
1797 return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1798 iommu_domain_did(old, iommu),
1799 flags);
1800 }
1801
1802 static int domain_setup_second_level(struct intel_iommu *iommu,
1803 struct dmar_domain *domain,
1804 struct device *dev, ioasid_t pasid,
1805 struct iommu_domain *old)
1806 {
1807 if (!old)
1808 return intel_pasid_setup_second_level(iommu, domain,
1809 dev, pasid);
1810 return intel_pasid_replace_second_level(iommu, domain, dev,
1811 iommu_domain_did(old, iommu),
1812 pasid);
1813 }
1814
1815 static int domain_setup_passthrough(struct intel_iommu *iommu,
1816 struct device *dev, ioasid_t pasid,
1817 struct iommu_domain *old)
1818 {
1819 if (!old)
1820 return intel_pasid_setup_pass_through(iommu, dev, pasid);
1821 return intel_pasid_replace_pass_through(iommu, dev,
1822 iommu_domain_did(old, iommu),
1823 pasid);
1824 }
1825
1826 static int domain_setup_first_level(struct intel_iommu *iommu,
1827 struct dmar_domain *domain,
1828 struct device *dev,
1829 u32 pasid, struct iommu_domain *old)
1830 {
1831 struct dma_pte *pgd = domain->pgd;
1832 int level, flags = 0;
1833
1834 level = agaw_to_level(domain->agaw);
1835 if (level != 4 && level != 5)
1836 return -EINVAL;
1837
1838 if (level == 5)
1839 flags |= PASID_FLAG_FL5LP;
1840
1841 if (domain->force_snooping)
1842 flags |= PASID_FLAG_PAGE_SNOOP;
1843
1844 return __domain_setup_first_level(iommu, dev, pasid,
1845 domain_id_iommu(domain, iommu),
1846 (pgd_t *)pgd, flags, old);
1847 }
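
/*
 * Illustrative note on the domain_setup_*() helpers above: a NULL 'old'
 * domain means an initial PASID setup, while a non-NULL 'old' requests a
 * hitless replacement that carries the previous domain ID. For example,
 * dmar_domain_attach_device() below passes NULL for the first attach of
 * IOMMU_NO_PASID.
 */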
1848
1849 static int dmar_domain_attach_device(struct dmar_domain *domain,
1850 struct device *dev)
1851 {
1852 struct device_domain_info *info = dev_iommu_priv_get(dev);
1853 struct intel_iommu *iommu = info->iommu;
1854 unsigned long flags;
1855 int ret;
1856
1857 ret = domain_attach_iommu(domain, iommu);
1858 if (ret)
1859 return ret;
1860
1861 info->domain = domain;
1862 spin_lock_irqsave(&domain->lock, flags);
1863 list_add(&info->link, &domain->devices);
1864 spin_unlock_irqrestore(&domain->lock, flags);
1865
1866 if (dev_is_real_dma_subdevice(dev))
1867 return 0;
1868
1869 if (!sm_supported(iommu))
1870 ret = domain_context_mapping(domain, dev);
1871 else if (domain->use_first_level)
1872 ret = domain_setup_first_level(iommu, domain, dev,
1873 IOMMU_NO_PASID, NULL);
1874 else
1875 ret = domain_setup_second_level(iommu, domain, dev,
1876 IOMMU_NO_PASID, NULL);
1877
1878 if (ret)
1879 goto out_block_translation;
1880
1881 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1882 if (ret)
1883 goto out_block_translation;
1884
1885 return 0;
1886
1887 out_block_translation:
1888 device_block_translation(dev);
1889 return ret;
1890 }
1891
1892 /**
1893 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1894 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
1895 * @dev: device handle
1896 *
1897 * We assume that PCI USB devices with RMRRs have them largely
1898 * for historical reasons and that the RMRR space is not actively used post
1899 * boot. This exclusion may change if vendors begin to abuse it.
1900 *
1901 * The same exception is made for graphics devices, with the requirement that
1902 * any use of the RMRR regions will be torn down before assigning the device
1903 * to a guest.
1904 *
1905 * Return: true if the RMRR is relaxable, false otherwise
1906 */
1907 static bool device_rmrr_is_relaxable(struct device *dev)
1908 {
1909 struct pci_dev *pdev;
1910
1911 if (!dev_is_pci(dev))
1912 return false;
1913
1914 pdev = to_pci_dev(dev);
1915 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1916 return true;
1917 else
1918 return false;
1919 }
1920
1921 static int device_def_domain_type(struct device *dev)
1922 {
1923 struct device_domain_info *info = dev_iommu_priv_get(dev);
1924 struct intel_iommu *iommu = info->iommu;
1925
1926 /*
1927 * Hardware does not support the passthrough translation mode.
1928 * Always use a dynamic mapping domain.
1929 */
1930 if (!ecap_pass_through(iommu->ecap))
1931 return IOMMU_DOMAIN_DMA;
1932
1933 if (dev_is_pci(dev)) {
1934 struct pci_dev *pdev = to_pci_dev(dev);
1935
1936 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1937 return IOMMU_DOMAIN_IDENTITY;
1938 }
1939
1940 return 0;
1941 }
1942
1943 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1944 {
1945 /*
1946 * Start from a sane IOMMU hardware state.
1947 * If queued invalidation was already initialized by us
1948 * (for example, while enabling interrupt remapping), then
1949 * things are already rolling from a sane state.
1950 */
1951 if (!iommu->qi) {
1952 /*
1953 * Clear any previous faults.
1954 */
1955 dmar_fault(-1, iommu);
1956 /*
1957 * Disable queued invalidation if supported and already enabled
1958 * before OS handover.
1959 */
1960 dmar_disable_qi(iommu);
1961 }
1962
1963 if (dmar_enable_qi(iommu)) {
1964 /*
1965 * Queued Invalidate not enabled, use Register Based Invalidate
1966 */
1967 iommu->flush.flush_context = __iommu_flush_context;
1968 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1969 pr_info("%s: Using Register based invalidation\n",
1970 iommu->name);
1971 } else {
1972 iommu->flush.flush_context = qi_flush_context;
1973 iommu->flush.flush_iotlb = qi_flush_iotlb;
1974 pr_info("%s: Using Queued invalidation\n", iommu->name);
1975 }
1976 }
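
/*
 * Illustrative sketch (hypothetical helper): once intel_iommu_init_qi() has
 * installed the flush callbacks, callers can issue invalidations without
 * knowing whether queued or register-based invalidation backs them, e.g. a
 * global flush like the one iommu_flush_all() performs below.
 */
#if 0
static void example_global_flush(struct intel_iommu *iommu)
{
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
#endif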
1977
1978 static int copy_context_table(struct intel_iommu *iommu,
1979 struct root_entry *old_re,
1980 struct context_entry **tbl,
1981 int bus, bool ext)
1982 {
1983 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1984 struct context_entry *new_ce = NULL, ce;
1985 struct context_entry *old_ce = NULL;
1986 struct root_entry re;
1987 phys_addr_t old_ce_phys;
1988
1989 tbl_idx = ext ? bus * 2 : bus;
1990 memcpy(&re, old_re, sizeof(re));
1991
1992 for (devfn = 0; devfn < 256; devfn++) {
1993 /* First calculate the correct index */
1994 idx = (ext ? devfn * 2 : devfn) % 256;
1995
1996 if (idx == 0) {
1997 /* First save what we may have and clean up */
1998 if (new_ce) {
1999 tbl[tbl_idx] = new_ce;
2000 __iommu_flush_cache(iommu, new_ce,
2001 VTD_PAGE_SIZE);
2002 pos = 1;
2003 }
2004
2005 if (old_ce)
2006 memunmap(old_ce);
2007
2008 ret = 0;
2009 if (devfn < 0x80)
2010 old_ce_phys = root_entry_lctp(&re);
2011 else
2012 old_ce_phys = root_entry_uctp(&re);
2013
2014 if (!old_ce_phys) {
2015 if (ext && devfn == 0) {
2016 /* No LCTP, try UCTP */
2017 devfn = 0x7f;
2018 continue;
2019 } else {
2020 goto out;
2021 }
2022 }
2023
2024 ret = -ENOMEM;
2025 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2026 MEMREMAP_WB);
2027 if (!old_ce)
2028 goto out;
2029
2030 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2031 if (!new_ce)
2032 goto out_unmap;
2033
2034 ret = 0;
2035 }
2036
2037 /* Now copy the context entry */
2038 memcpy(&ce, old_ce + idx, sizeof(ce));
2039
2040 if (!context_present(&ce))
2041 continue;
2042
2043 did = context_domain_id(&ce);
2044 if (did >= 0 && did < cap_ndoms(iommu->cap))
2045 set_bit(did, iommu->domain_ids);
2046
2047 set_context_copied(iommu, bus, devfn);
2048 new_ce[idx] = ce;
2049 }
2050
2051 tbl[tbl_idx + pos] = new_ce;
2052
2053 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2054
2055 out_unmap:
2056 memunmap(old_ce);
2057
2058 out:
2059 return ret;
2060 }
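
/*
 * Worked example of the indexing above with the extended layout (ext ==
 * true): for bus 0x10 and devfn 0x05, tbl_idx is 0x20 and idx is
 * (0x05 * 2) % 256 = 0x0a; because devfn < 0x80 the lower context-table
 * pointer (LCTP) of the old root entry is used. Device functions 0x80 and
 * above wrap back to index 0 of a second table taken from the upper
 * context-table pointer (UCTP), which ends up at tbl[tbl_idx + 1].
 */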
2061
2062 static int copy_translation_tables(struct intel_iommu *iommu)
2063 {
2064 struct context_entry **ctxt_tbls;
2065 struct root_entry *old_rt;
2066 phys_addr_t old_rt_phys;
2067 int ctxt_table_entries;
2068 u64 rtaddr_reg;
2069 int bus, ret;
2070 bool new_ext, ext;
2071
2072 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2073 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2074 new_ext = !!sm_supported(iommu);
2075
2076 /*
2077 * The RTT bit can only be changed when translation is disabled,
2078 * but disabling translation would open a window for data
2079 * corruption. So bail out and don't copy anything if we would
2080 * have to change the bit.
2081 */
2082 if (new_ext != ext)
2083 return -EINVAL;
2084
2085 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2086 if (!iommu->copied_tables)
2087 return -ENOMEM;
2088
2089 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2090 if (!old_rt_phys)
2091 return -EINVAL;
2092
2093 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2094 if (!old_rt)
2095 return -ENOMEM;
2096
2097 /* This is too big for the stack - allocate it from slab */
2098 ctxt_table_entries = ext ? 512 : 256;
2099 ret = -ENOMEM;
2100 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2101 if (!ctxt_tbls)
2102 goto out_unmap;
2103
2104 for (bus = 0; bus < 256; bus++) {
2105 ret = copy_context_table(iommu, &old_rt[bus],
2106 ctxt_tbls, bus, ext);
2107 if (ret) {
2108 pr_err("%s: Failed to copy context table for bus %d\n",
2109 iommu->name, bus);
2110 continue;
2111 }
2112 }
2113
2114 spin_lock(&iommu->lock);
2115
2116 /* Context tables are copied, now write them to the root_entry table */
2117 for (bus = 0; bus < 256; bus++) {
2118 int idx = ext ? bus * 2 : bus;
2119 u64 val;
2120
2121 if (ctxt_tbls[idx]) {
2122 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2123 iommu->root_entry[bus].lo = val;
2124 }
2125
2126 if (!ext || !ctxt_tbls[idx + 1])
2127 continue;
2128
2129 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2130 iommu->root_entry[bus].hi = val;
2131 }
2132
2133 spin_unlock(&iommu->lock);
2134
2135 kfree(ctxt_tbls);
2136
2137 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2138
2139 ret = 0;
2140
2141 out_unmap:
2142 memunmap(old_rt);
2143
2144 return ret;
2145 }
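
/*
 * Worked example for the root-table rewrite above with the extended layout
 * (ext == true): bus 3 uses ctxt_tbls[6] for the lower half and
 * ctxt_tbls[7] for the upper half; each table's physical address, or'ed
 * with bit 0 as the present bit, is written into root_entry[3].lo and .hi
 * respectively before the whole root table is flushed to memory.
 */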
2146
2147 static int __init init_dmars(void)
2148 {
2149 struct dmar_drhd_unit *drhd;
2150 struct intel_iommu *iommu;
2151 int ret;
2152
2153 for_each_iommu(iommu, drhd) {
2154 if (drhd->ignored) {
2155 iommu_disable_translation(iommu);
2156 continue;
2157 }
2158
2159 /*
2160 * Find the smallest maximum PASID size supported by any IOMMU
2161 * in the system. The system-wide PASID table must be no bigger
2162 * than the smallest supported limit.
2163 */
2164 if (pasid_supported(iommu)) {
2165 u32 temp = 2 << ecap_pss(iommu->ecap);
2166
2167 intel_pasid_max_id = min_t(u32, temp,
2168 intel_pasid_max_id);
2169 }
2170
2171 intel_iommu_init_qi(iommu);
2172
2173 ret = iommu_init_domains(iommu);
2174 if (ret)
2175 goto free_iommu;
2176
2177 init_translation_status(iommu);
2178
2179 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2180 iommu_disable_translation(iommu);
2181 clear_translation_pre_enabled(iommu);
2182 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2183 iommu->name);
2184 }
2185
2186 /*
2187 * TBD:
2188 * we could share the same root & context tables
2189 * among all IOMMU's. Need to Split it later.
2190 */
2191 ret = iommu_alloc_root_entry(iommu);
2192 if (ret)
2193 goto free_iommu;
2194
2195 if (translation_pre_enabled(iommu)) {
2196 pr_info("Translation already enabled - trying to copy translation structures\n");
2197
2198 ret = copy_translation_tables(iommu);
2199 if (ret) {
2200 /*
2201 * We found the IOMMU with translation
2202 * enabled - but failed to copy over the
2203 * old root-entry table. Try to proceed
2204 * by disabling translation now and
2205 * allocating a clean root-entry table.
2206 * This might cause DMAR faults, but
2207 * probably the dump will still succeed.
2208 */
2209 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2210 iommu->name);
2211 iommu_disable_translation(iommu);
2212 clear_translation_pre_enabled(iommu);
2213 } else {
2214 pr_info("Copied translation tables from previous kernel for %s\n",
2215 iommu->name);
2216 }
2217 }
2218
2219 intel_svm_check(iommu);
2220 }
2221
2222 /*
2223 * Now that qi is enabled on all iommus, set the root entry and flush
2224 * caches. This is required on some Intel X58 chipsets, otherwise the
2225 * flush_context function will loop forever and the boot hangs.
2226 */
2227 for_each_active_iommu(iommu, drhd) {
2228 iommu_flush_write_buffer(iommu);
2229 iommu_set_root_entry(iommu);
2230 }
2231
2232 check_tylersburg_isoch();
2233
2234 /*
2235 * for each drhd
2236 * enable fault log
2237 * global invalidate context cache
2238 * global invalidate iotlb
2239 * enable translation
2240 */
2241 for_each_iommu(iommu, drhd) {
2242 if (drhd->ignored) {
2243 /*
2244 * we always have to disable PMRs or DMA may fail on
2245 * this device
2246 */
2247 if (force_on)
2248 iommu_disable_protect_mem_regions(iommu);
2249 continue;
2250 }
2251
2252 iommu_flush_write_buffer(iommu);
2253
2254 if (ecap_prs(iommu->ecap)) {
2255 /*
2256 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2257 * could cause a lock ordering problem, so drop the lock here.
2258 */
2259 up_write(&dmar_global_lock);
2260 ret = intel_iommu_enable_prq(iommu);
2261 down_write(&dmar_global_lock);
2262 if (ret)
2263 goto free_iommu;
2264 }
2265
2266 ret = dmar_set_interrupt(iommu);
2267 if (ret)
2268 goto free_iommu;
2269 }
2270
2271 return 0;
2272
2273 free_iommu:
2274 for_each_active_iommu(iommu, drhd) {
2275 disable_dmar_iommu(iommu);
2276 free_dmar_iommu(iommu);
2277 }
2278
2279 return ret;
2280 }
2281
2282 static void __init init_no_remapping_devices(void)
2283 {
2284 struct dmar_drhd_unit *drhd;
2285 struct device *dev;
2286 int i;
2287
2288 for_each_drhd_unit(drhd) {
2289 if (!drhd->include_all) {
2290 for_each_active_dev_scope(drhd->devices,
2291 drhd->devices_cnt, i, dev)
2292 break;
2293 /* ignore DMAR unit if no devices exist */
2294 if (i == drhd->devices_cnt)
2295 drhd->ignored = 1;
2296 }
2297 }
2298
2299 for_each_active_drhd_unit(drhd) {
2300 if (drhd->include_all)
2301 continue;
2302
2303 for_each_active_dev_scope(drhd->devices,
2304 drhd->devices_cnt, i, dev)
2305 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2306 break;
2307 if (i < drhd->devices_cnt)
2308 continue;
2309
2310 /* This IOMMU has *only* gfx devices. Mark it as dedicated to
2311 graphics and, if the graphics IOMMU is disabled, bypass it. */
2312 drhd->gfx_dedicated = 1;
2313 if (disable_igfx_iommu)
2314 drhd->ignored = 1;
2315 }
2316 }
2317
2318 #ifdef CONFIG_SUSPEND
2319 static int init_iommu_hw(void)
2320 {
2321 struct dmar_drhd_unit *drhd;
2322 struct intel_iommu *iommu = NULL;
2323 int ret;
2324
2325 for_each_active_iommu(iommu, drhd) {
2326 if (iommu->qi) {
2327 ret = dmar_reenable_qi(iommu);
2328 if (ret)
2329 return ret;
2330 }
2331 }
2332
2333 for_each_iommu(iommu, drhd) {
2334 if (drhd->ignored) {
2335 /*
2336 * we always have to disable PMRs or DMA may fail on
2337 * this device
2338 */
2339 if (force_on)
2340 iommu_disable_protect_mem_regions(iommu);
2341 continue;
2342 }
2343
2344 iommu_flush_write_buffer(iommu);
2345 iommu_set_root_entry(iommu);
2346 iommu_enable_translation(iommu);
2347 iommu_disable_protect_mem_regions(iommu);
2348 }
2349
2350 return 0;
2351 }
2352
2353 static void iommu_flush_all(void)
2354 {
2355 struct dmar_drhd_unit *drhd;
2356 struct intel_iommu *iommu;
2357
2358 for_each_active_iommu(iommu, drhd) {
2359 iommu->flush.flush_context(iommu, 0, 0, 0,
2360 DMA_CCMD_GLOBAL_INVL);
2361 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2362 DMA_TLB_GLOBAL_FLUSH);
2363 }
2364 }
2365
2366 static int iommu_suspend(void)
2367 {
2368 struct dmar_drhd_unit *drhd;
2369 struct intel_iommu *iommu = NULL;
2370 unsigned long flag;
2371
2372 iommu_flush_all();
2373
2374 for_each_active_iommu(iommu, drhd) {
2375 iommu_disable_translation(iommu);
2376
2377 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2378
2379 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2380 readl(iommu->reg + DMAR_FECTL_REG);
2381 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2382 readl(iommu->reg + DMAR_FEDATA_REG);
2383 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2384 readl(iommu->reg + DMAR_FEADDR_REG);
2385 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2386 readl(iommu->reg + DMAR_FEUADDR_REG);
2387
2388 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2389 }
2390 return 0;
2391 }
2392
2393 static void iommu_resume(void)
2394 {
2395 struct dmar_drhd_unit *drhd;
2396 struct intel_iommu *iommu = NULL;
2397 unsigned long flag;
2398
2399 if (init_iommu_hw()) {
2400 if (force_on)
2401 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2402 else
2403 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2404 return;
2405 }
2406
2407 for_each_active_iommu(iommu, drhd) {
2408
2409 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2410
2411 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2412 iommu->reg + DMAR_FECTL_REG);
2413 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2414 iommu->reg + DMAR_FEDATA_REG);
2415 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2416 iommu->reg + DMAR_FEADDR_REG);
2417 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2418 iommu->reg + DMAR_FEUADDR_REG);
2419
2420 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2421 }
2422 }
2423
2424 static struct syscore_ops iommu_syscore_ops = {
2425 .resume = iommu_resume,
2426 .suspend = iommu_suspend,
2427 };
2428
2429 static void __init init_iommu_pm_ops(void)
2430 {
2431 register_syscore_ops(&iommu_syscore_ops);
2432 }
2433
2434 #else
2435 static inline void init_iommu_pm_ops(void) {}
2436 #endif /* CONFIG_SUSPEND */
2437
2438 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2439 {
2440 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2441 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2442 rmrr->end_address <= rmrr->base_address ||
2443 arch_rmrr_sanity_check(rmrr))
2444 return -EINVAL;
2445
2446 return 0;
2447 }
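
/*
 * Worked example (assuming 4KiB pages): an RMRR with base_address
 * 0x7c000000 and end_address 0x7c7fffff passes the check, since both the
 * base and end_address + 1 are page aligned and the range is non-empty.
 * A base of 0x7c000100 or an end of 0x7c7ff7ff would fail and trigger the
 * FW_BUG warning plus firmware taint in dmar_parse_one_rmrr() below.
 */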
2448
2449 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2450 {
2451 struct acpi_dmar_reserved_memory *rmrr;
2452 struct dmar_rmrr_unit *rmrru;
2453
2454 rmrr = (struct acpi_dmar_reserved_memory *)header;
2455 if (rmrr_sanity_check(rmrr)) {
2456 pr_warn(FW_BUG
2457 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2458 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2459 rmrr->base_address, rmrr->end_address,
2460 dmi_get_system_info(DMI_BIOS_VENDOR),
2461 dmi_get_system_info(DMI_BIOS_VERSION),
2462 dmi_get_system_info(DMI_PRODUCT_VERSION));
2463 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2464 }
2465
2466 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2467 if (!rmrru)
2468 goto out;
2469
2470 rmrru->hdr = header;
2471
2472 rmrru->base_address = rmrr->base_address;
2473 rmrru->end_address = rmrr->end_address;
2474
2475 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2476 ((void *)rmrr) + rmrr->header.length,
2477 &rmrru->devices_cnt);
2478 if (rmrru->devices_cnt && rmrru->devices == NULL)
2479 goto free_rmrru;
2480
2481 list_add(&rmrru->list, &dmar_rmrr_units);
2482
2483 return 0;
2484 free_rmrru:
2485 kfree(rmrru);
2486 out:
2487 return -ENOMEM;
2488 }
2489
2490 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2491 {
2492 struct dmar_atsr_unit *atsru;
2493 struct acpi_dmar_atsr *tmp;
2494
2495 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2496 dmar_rcu_check()) {
2497 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2498 if (atsr->segment != tmp->segment)
2499 continue;
2500 if (atsr->header.length != tmp->header.length)
2501 continue;
2502 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2503 return atsru;
2504 }
2505
2506 return NULL;
2507 }
2508
2509 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2510 {
2511 struct acpi_dmar_atsr *atsr;
2512 struct dmar_atsr_unit *atsru;
2513
2514 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2515 return 0;
2516
2517 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2518 atsru = dmar_find_atsr(atsr);
2519 if (atsru)
2520 return 0;
2521
2522 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2523 if (!atsru)
2524 return -ENOMEM;
2525
2526 /*
2527 * If memory is allocated from slab by ACPI _DSM method, we need to
2528 * copy the memory content because the memory buffer will be freed
2529 * on return.
2530 */
2531 atsru->hdr = (void *)(atsru + 1);
2532 memcpy(atsru->hdr, hdr, hdr->length);
2533 atsru->include_all = atsr->flags & 0x1;
2534 if (!atsru->include_all) {
2535 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2536 (void *)atsr + atsr->header.length,
2537 &atsru->devices_cnt);
2538 if (atsru->devices_cnt && atsru->devices == NULL) {
2539 kfree(atsru);
2540 return -ENOMEM;
2541 }
2542 }
2543
2544 list_add_rcu(&atsru->list, &dmar_atsr_units);
2545
2546 return 0;
2547 }
2548
2549 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2550 {
2551 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2552 kfree(atsru);
2553 }
2554
2555 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2556 {
2557 struct acpi_dmar_atsr *atsr;
2558 struct dmar_atsr_unit *atsru;
2559
2560 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2561 atsru = dmar_find_atsr(atsr);
2562 if (atsru) {
2563 list_del_rcu(&atsru->list);
2564 synchronize_rcu();
2565 intel_iommu_free_atsr(atsru);
2566 }
2567
2568 return 0;
2569 }
2570
2571 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2572 {
2573 int i;
2574 struct device *dev;
2575 struct acpi_dmar_atsr *atsr;
2576 struct dmar_atsr_unit *atsru;
2577
2578 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2579 atsru = dmar_find_atsr(atsr);
2580 if (!atsru)
2581 return 0;
2582
2583 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2584 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2585 i, dev)
2586 return -EBUSY;
2587 }
2588
2589 return 0;
2590 }
2591
2592 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2593 {
2594 struct dmar_satc_unit *satcu;
2595 struct acpi_dmar_satc *tmp;
2596
2597 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2598 dmar_rcu_check()) {
2599 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2600 if (satc->segment != tmp->segment)
2601 continue;
2602 if (satc->header.length != tmp->header.length)
2603 continue;
2604 if (memcmp(satc, tmp, satc->header.length) == 0)
2605 return satcu;
2606 }
2607
2608 return NULL;
2609 }
2610
2611 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2612 {
2613 struct acpi_dmar_satc *satc;
2614 struct dmar_satc_unit *satcu;
2615
2616 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2617 return 0;
2618
2619 satc = container_of(hdr, struct acpi_dmar_satc, header);
2620 satcu = dmar_find_satc(satc);
2621 if (satcu)
2622 return 0;
2623
2624 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2625 if (!satcu)
2626 return -ENOMEM;
2627
2628 satcu->hdr = (void *)(satcu + 1);
2629 memcpy(satcu->hdr, hdr, hdr->length);
2630 satcu->atc_required = satc->flags & 0x1;
2631 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2632 (void *)satc + satc->header.length,
2633 &satcu->devices_cnt);
2634 if (satcu->devices_cnt && !satcu->devices) {
2635 kfree(satcu);
2636 return -ENOMEM;
2637 }
2638 list_add_rcu(&satcu->list, &dmar_satc_units);
2639
2640 return 0;
2641 }
2642
2643 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2644 {
2645 struct intel_iommu *iommu = dmaru->iommu;
2646 int ret;
2647
2648 /*
2649 * Disable translation if already enabled prior to OS handover.
2650 */
2651 if (iommu->gcmd & DMA_GCMD_TE)
2652 iommu_disable_translation(iommu);
2653
2654 ret = iommu_init_domains(iommu);
2655 if (ret == 0)
2656 ret = iommu_alloc_root_entry(iommu);
2657 if (ret)
2658 goto out;
2659
2660 intel_svm_check(iommu);
2661
2662 if (dmaru->ignored) {
2663 /*
2664 * we always have to disable PMRs or DMA may fail on this device
2665 */
2666 if (force_on)
2667 iommu_disable_protect_mem_regions(iommu);
2668 return 0;
2669 }
2670
2671 intel_iommu_init_qi(iommu);
2672 iommu_flush_write_buffer(iommu);
2673
2674 if (ecap_prs(iommu->ecap)) {
2675 ret = intel_iommu_enable_prq(iommu);
2676 if (ret)
2677 goto disable_iommu;
2678 }
2679
2680 ret = dmar_set_interrupt(iommu);
2681 if (ret)
2682 goto disable_iommu;
2683
2684 iommu_set_root_entry(iommu);
2685 iommu_enable_translation(iommu);
2686
2687 iommu_disable_protect_mem_regions(iommu);
2688 return 0;
2689
2690 disable_iommu:
2691 disable_dmar_iommu(iommu);
2692 out:
2693 free_dmar_iommu(iommu);
2694 return ret;
2695 }
2696
2697 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2698 {
2699 int ret = 0;
2700 struct intel_iommu *iommu = dmaru->iommu;
2701
2702 if (!intel_iommu_enabled)
2703 return 0;
2704 if (iommu == NULL)
2705 return -EINVAL;
2706
2707 if (insert) {
2708 ret = intel_iommu_add(dmaru);
2709 } else {
2710 disable_dmar_iommu(iommu);
2711 free_dmar_iommu(iommu);
2712 }
2713
2714 return ret;
2715 }
2716
2717 static void intel_iommu_free_dmars(void)
2718 {
2719 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2720 struct dmar_atsr_unit *atsru, *atsr_n;
2721 struct dmar_satc_unit *satcu, *satc_n;
2722
2723 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2724 list_del(&rmrru->list);
2725 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2726 kfree(rmrru);
2727 }
2728
2729 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2730 list_del(&atsru->list);
2731 intel_iommu_free_atsr(atsru);
2732 }
2733 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2734 list_del(&satcu->list);
2735 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2736 kfree(satcu);
2737 }
2738 }
2739
2740 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2741 {
2742 struct dmar_satc_unit *satcu;
2743 struct acpi_dmar_satc *satc;
2744 struct device *tmp;
2745 int i;
2746
2747 dev = pci_physfn(dev);
2748 rcu_read_lock();
2749
2750 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2751 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2752 if (satc->segment != pci_domain_nr(dev->bus))
2753 continue;
2754 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2755 if (to_pci_dev(tmp) == dev)
2756 goto out;
2757 }
2758 satcu = NULL;
2759 out:
2760 rcu_read_unlock();
2761 return satcu;
2762 }
2763
2764 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2765 {
2766 int i, ret = 1;
2767 struct pci_bus *bus;
2768 struct pci_dev *bridge = NULL;
2769 struct device *tmp;
2770 struct acpi_dmar_atsr *atsr;
2771 struct dmar_atsr_unit *atsru;
2772 struct dmar_satc_unit *satcu;
2773
2774 dev = pci_physfn(dev);
2775 satcu = dmar_find_matched_satc_unit(dev);
2776 if (satcu)
2777 /*
2778 * This device supports ATS as it is in an SATC table.
2779 * When the IOMMU is in legacy mode, ATS is enabled
2780 * automatically by HW for devices that require it,
2781 * hence the OS should not enable ATS for this device
2782 * again, to avoid duplicated TLB invalidations.
2783 */
2784 return !(satcu->atc_required && !sm_supported(iommu));
2785
2786 for (bus = dev->bus; bus; bus = bus->parent) {
2787 bridge = bus->self;
2788 /* If it's an integrated device, allow ATS */
2789 if (!bridge)
2790 return 1;
2791 /* Connected via non-PCIe: no ATS */
2792 if (!pci_is_pcie(bridge) ||
2793 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2794 return 0;
2795 /* If we found the root port, look it up in the ATSR */
2796 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2797 break;
2798 }
2799
2800 rcu_read_lock();
2801 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2802 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2803 if (atsr->segment != pci_domain_nr(dev->bus))
2804 continue;
2805
2806 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2807 if (tmp == &bridge->dev)
2808 goto out;
2809
2810 if (atsru->include_all)
2811 goto out;
2812 }
2813 ret = 0;
2814 out:
2815 rcu_read_unlock();
2816
2817 return ret;
2818 }
2819
2820 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2821 {
2822 int ret;
2823 struct dmar_rmrr_unit *rmrru;
2824 struct dmar_atsr_unit *atsru;
2825 struct dmar_satc_unit *satcu;
2826 struct acpi_dmar_atsr *atsr;
2827 struct acpi_dmar_reserved_memory *rmrr;
2828 struct acpi_dmar_satc *satc;
2829
2830 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2831 return 0;
2832
2833 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2834 rmrr = container_of(rmrru->hdr,
2835 struct acpi_dmar_reserved_memory, header);
2836 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2837 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2838 ((void *)rmrr) + rmrr->header.length,
2839 rmrr->segment, rmrru->devices,
2840 rmrru->devices_cnt);
2841 if (ret < 0)
2842 return ret;
2843 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2844 dmar_remove_dev_scope(info, rmrr->segment,
2845 rmrru->devices, rmrru->devices_cnt);
2846 }
2847 }
2848
2849 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2850 if (atsru->include_all)
2851 continue;
2852
2853 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2854 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2855 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2856 (void *)atsr + atsr->header.length,
2857 atsr->segment, atsru->devices,
2858 atsru->devices_cnt);
2859 if (ret > 0)
2860 break;
2861 else if (ret < 0)
2862 return ret;
2863 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2864 if (dmar_remove_dev_scope(info, atsr->segment,
2865 atsru->devices, atsru->devices_cnt))
2866 break;
2867 }
2868 }
2869 list_for_each_entry(satcu, &dmar_satc_units, list) {
2870 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2871 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2872 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2873 (void *)satc + satc->header.length,
2874 satc->segment, satcu->devices,
2875 satcu->devices_cnt);
2876 if (ret > 0)
2877 break;
2878 else if (ret < 0)
2879 return ret;
2880 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2881 if (dmar_remove_dev_scope(info, satc->segment,
2882 satcu->devices, satcu->devices_cnt))
2883 break;
2884 }
2885 }
2886
2887 return 0;
2888 }
2889
2890 static void intel_disable_iommus(void)
2891 {
2892 struct intel_iommu *iommu = NULL;
2893 struct dmar_drhd_unit *drhd;
2894
2895 for_each_iommu(iommu, drhd)
2896 iommu_disable_translation(iommu);
2897 }
2898
2899 void intel_iommu_shutdown(void)
2900 {
2901 struct dmar_drhd_unit *drhd;
2902 struct intel_iommu *iommu = NULL;
2903
2904 if (no_iommu || dmar_disabled)
2905 return;
2906
2907 /*
2908 * All other CPUs were brought down, hotplug interrupts were disabled,
2909 * no lock and RCU checking needed anymore
2910 */
2911 list_for_each_entry(drhd, &dmar_drhd_units, list) {
2912 iommu = drhd->iommu;
2913
2914 /* Disable PMRs explicitly here. */
2915 iommu_disable_protect_mem_regions(iommu);
2916
2917 /* Make sure the IOMMUs are switched off */
2918 iommu_disable_translation(iommu);
2919 }
2920 }
2921
2922 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2923 {
2924 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2925
2926 return container_of(iommu_dev, struct intel_iommu, iommu);
2927 }
2928
2929 static ssize_t version_show(struct device *dev,
2930 struct device_attribute *attr, char *buf)
2931 {
2932 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2933 u32 ver = readl(iommu->reg + DMAR_VER_REG);
2934 return sysfs_emit(buf, "%d:%d\n",
2935 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2936 }
2937 static DEVICE_ATTR_RO(version);
2938
2939 static ssize_t address_show(struct device *dev,
2940 struct device_attribute *attr, char *buf)
2941 {
2942 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2943 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2944 }
2945 static DEVICE_ATTR_RO(address);
2946
2947 static ssize_t cap_show(struct device *dev,
2948 struct device_attribute *attr, char *buf)
2949 {
2950 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2951 return sysfs_emit(buf, "%llx\n", iommu->cap);
2952 }
2953 static DEVICE_ATTR_RO(cap);
2954
2955 static ssize_t ecap_show(struct device *dev,
2956 struct device_attribute *attr, char *buf)
2957 {
2958 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2959 return sysfs_emit(buf, "%llx\n", iommu->ecap);
2960 }
2961 static DEVICE_ATTR_RO(ecap);
2962
2963 static ssize_t domains_supported_show(struct device *dev,
2964 struct device_attribute *attr, char *buf)
2965 {
2966 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2967 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2968 }
2969 static DEVICE_ATTR_RO(domains_supported);
2970
2971 static ssize_t domains_used_show(struct device *dev,
2972 struct device_attribute *attr, char *buf)
2973 {
2974 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2975 return sysfs_emit(buf, "%d\n",
2976 bitmap_weight(iommu->domain_ids,
2977 cap_ndoms(iommu->cap)));
2978 }
2979 static DEVICE_ATTR_RO(domains_used);
2980
2981 static struct attribute *intel_iommu_attrs[] = {
2982 &dev_attr_version.attr,
2983 &dev_attr_address.attr,
2984 &dev_attr_cap.attr,
2985 &dev_attr_ecap.attr,
2986 &dev_attr_domains_supported.attr,
2987 &dev_attr_domains_used.attr,
2988 NULL,
2989 };
2990
2991 static struct attribute_group intel_iommu_group = {
2992 .name = "intel-iommu",
2993 .attrs = intel_iommu_attrs,
2994 };
2995
2996 const struct attribute_group *intel_iommu_groups[] = {
2997 &intel_iommu_group,
2998 NULL,
2999 };
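
/*
 * The attribute group above is registered through iommu_device_sysfs_add()
 * in intel_iommu_init(), which typically results in per-unit files such as
 * (the "dmar0" name depends on enumeration order):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/version
 *   /sys/class/iommu/dmar0/intel-iommu/address
 *   /sys/class/iommu/dmar0/intel-iommu/cap
 *   /sys/class/iommu/dmar0/intel-iommu/ecap
 *   /sys/class/iommu/dmar0/intel-iommu/domains_supported
 *   /sys/class/iommu/dmar0/intel-iommu/domains_used
 */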
3000
3001 static bool has_external_pci(void)
3002 {
3003 struct pci_dev *pdev = NULL;
3004
3005 for_each_pci_dev(pdev)
3006 if (pdev->external_facing) {
3007 pci_dev_put(pdev);
3008 return true;
3009 }
3010
3011 return false;
3012 }
3013
3014 static int __init platform_optin_force_iommu(void)
3015 {
3016 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3017 return 0;
3018
3019 if (no_iommu || dmar_disabled)
3020 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3021
3022 /*
3023 * If Intel-IOMMU is disabled by default, we will apply identity
3024 * map for all devices except those marked as being untrusted.
3025 */
3026 if (dmar_disabled)
3027 iommu_set_default_passthrough(false);
3028
3029 dmar_disabled = 0;
3030 no_iommu = 0;
3031
3032 return 1;
3033 }
3034
3035 static int __init probe_acpi_namespace_devices(void)
3036 {
3037 struct dmar_drhd_unit *drhd;
3038 /* To avoid a -Wunused-but-set-variable warning. */
3039 struct intel_iommu *iommu __maybe_unused;
3040 struct device *dev;
3041 int i, ret = 0;
3042
3043 for_each_active_iommu(iommu, drhd) {
3044 for_each_active_dev_scope(drhd->devices,
3045 drhd->devices_cnt, i, dev) {
3046 struct acpi_device_physical_node *pn;
3047 struct acpi_device *adev;
3048
3049 if (dev->bus != &acpi_bus_type)
3050 continue;
3051
3052 up_read(&dmar_global_lock);
3053 adev = to_acpi_device(dev);
3054 mutex_lock(&adev->physical_node_lock);
3055 list_for_each_entry(pn,
3056 &adev->physical_node_list, node) {
3057 ret = iommu_probe_device(pn->dev);
3058 if (ret)
3059 break;
3060 }
3061 mutex_unlock(&adev->physical_node_lock);
3062 down_read(&dmar_global_lock);
3063
3064 if (ret)
3065 return ret;
3066 }
3067 }
3068
3069 return 0;
3070 }
3071
3072 static __init int tboot_force_iommu(void)
3073 {
3074 if (!tboot_enabled())
3075 return 0;
3076
3077 if (no_iommu || dmar_disabled)
3078 pr_warn("Forcing Intel-IOMMU to enabled\n");
3079
3080 dmar_disabled = 0;
3081 no_iommu = 0;
3082
3083 return 1;
3084 }
3085
3086 int __init intel_iommu_init(void)
3087 {
3088 int ret = -ENODEV;
3089 struct dmar_drhd_unit *drhd;
3090 struct intel_iommu *iommu;
3091
3092 /*
3093 * Intel IOMMU is required for a TXT/tboot launch or platform
3094 * opt in, so enforce that.
3095 */
3096 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3097 platform_optin_force_iommu();
3098
3099 down_write(&dmar_global_lock);
3100 if (dmar_table_init()) {
3101 if (force_on)
3102 panic("tboot: Failed to initialize DMAR table\n");
3103 goto out_free_dmar;
3104 }
3105
3106 if (dmar_dev_scope_init() < 0) {
3107 if (force_on)
3108 panic("tboot: Failed to initialize DMAR device scope\n");
3109 goto out_free_dmar;
3110 }
3111
3112 up_write(&dmar_global_lock);
3113
3114 /*
3115 * The bus notifier takes the dmar_global_lock, so lockdep will
3116 * complain later when we register it under the lock.
3117 */
3118 dmar_register_bus_notifier();
3119
3120 down_write(&dmar_global_lock);
3121
3122 if (!no_iommu)
3123 intel_iommu_debugfs_init();
3124
3125 if (no_iommu || dmar_disabled) {
3126 /*
3127 * We exit the function here to ensure IOMMU's remapping and
3128 * mempool aren't setup, which means that the IOMMU's PMRs
3129 * won't be disabled via the call to init_dmars(). So disable
3130 * it explicitly here. The PMRs were setup by tboot prior to
3131 * calling SENTER, but the kernel is expected to reset/tear
3132 * down the PMRs.
3133 */
3134 if (intel_iommu_tboot_noforce) {
3135 for_each_iommu(iommu, drhd)
3136 iommu_disable_protect_mem_regions(iommu);
3137 }
3138
3139 /*
3140 * Make sure the IOMMUs are switched off, even when we
3141 * boot into a kexec kernel and the previous kernel left
3142 * them enabled
3143 */
3144 intel_disable_iommus();
3145 goto out_free_dmar;
3146 }
3147
3148 if (list_empty(&dmar_rmrr_units))
3149 pr_info("No RMRR found\n");
3150
3151 if (list_empty(&dmar_atsr_units))
3152 pr_info("No ATSR found\n");
3153
3154 if (list_empty(&dmar_satc_units))
3155 pr_info("No SATC found\n");
3156
3157 init_no_remapping_devices();
3158
3159 ret = init_dmars();
3160 if (ret) {
3161 if (force_on)
3162 panic("tboot: Failed to initialize DMARs\n");
3163 pr_err("Initialization failed\n");
3164 goto out_free_dmar;
3165 }
3166 up_write(&dmar_global_lock);
3167
3168 init_iommu_pm_ops();
3169
3170 down_read(&dmar_global_lock);
3171 for_each_active_iommu(iommu, drhd) {
3172 /*
3173 * The flush queue implementation does not perform
3174 * page-selective invalidations that are required for efficient
3175 * TLB flushes in virtual environments. The benefit of batching
3176 * is likely to be much lower than the overhead of synchronizing
3177 * the virtual and physical IOMMU page-tables.
3178 */
3179 if (cap_caching_mode(iommu->cap) &&
3180 !first_level_by_default(iommu)) {
3181 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3182 iommu_set_dma_strict();
3183 }
3184 iommu_device_sysfs_add(&iommu->iommu, NULL,
3185 intel_iommu_groups,
3186 "%s", iommu->name);
3187 /*
3188 * The iommu device probe is protected by the iommu_probe_device_lock.
3189 * Release the dmar_global_lock before entering the device probe path
3190 * to avoid unnecessary lock order splat.
3191 */
3192 up_read(&dmar_global_lock);
3193 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3194 down_read(&dmar_global_lock);
3195
3196 iommu_pmu_register(iommu);
3197 }
3198
3199 if (probe_acpi_namespace_devices())
3200 pr_warn("ACPI name space devices didn't probe correctly\n");
3201
3202 /* Finally, we enable the DMA remapping hardware. */
3203 for_each_iommu(iommu, drhd) {
3204 if (!drhd->ignored && !translation_pre_enabled(iommu))
3205 iommu_enable_translation(iommu);
3206
3207 iommu_disable_protect_mem_regions(iommu);
3208 }
3209 up_read(&dmar_global_lock);
3210
3211 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3212
3213 intel_iommu_enabled = 1;
3214
3215 return 0;
3216
3217 out_free_dmar:
3218 intel_iommu_free_dmars();
3219 up_write(&dmar_global_lock);
3220 return ret;
3221 }
3222
3223 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3224 {
3225 struct device_domain_info *info = opaque;
3226
3227 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3228 return 0;
3229 }
3230
3231 /*
3232 * NB - intel-iommu lacks any sort of reference counting for the users of
3233 * dependent devices. If multiple endpoints have intersecting dependent
3234 * devices, unbinding the driver from any one of them will possibly leave
3235 * the others unable to operate.
3236 */
3237 static void domain_context_clear(struct device_domain_info *info)
3238 {
3239 if (!dev_is_pci(info->dev)) {
3240 domain_context_clear_one(info, info->bus, info->devfn);
3241 return;
3242 }
3243
3244 pci_for_each_dma_alias(to_pci_dev(info->dev),
3245 &domain_context_clear_one_cb, info);
3246 iommu_disable_pci_ats(info);
3247 }
3248
3249 /*
3250 * Clear the page table pointer in context or pasid table entries so that
3251 * all DMA requests without PASID from the device are blocked. If the page
3252 * table has been set, clean up the data structures.
3253 */
3254 void device_block_translation(struct device *dev)
3255 {
3256 struct device_domain_info *info = dev_iommu_priv_get(dev);
3257 struct intel_iommu *iommu = info->iommu;
3258 unsigned long flags;
3259
3260 if (info->domain)
3261 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3262
3263 if (!dev_is_real_dma_subdevice(dev)) {
3264 if (sm_supported(iommu))
3265 intel_pasid_tear_down_entry(iommu, dev,
3266 IOMMU_NO_PASID, false);
3267 else
3268 domain_context_clear(info);
3269 }
3270
3271 if (!info->domain)
3272 return;
3273
3274 spin_lock_irqsave(&info->domain->lock, flags);
3275 list_del(&info->link);
3276 spin_unlock_irqrestore(&info->domain->lock, flags);
3277
3278 domain_detach_iommu(info->domain, iommu);
3279 info->domain = NULL;
3280 }
3281
3282 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3283 struct device *dev)
3284 {
3285 device_block_translation(dev);
3286 return 0;
3287 }
3288
3289 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3290 struct device *dev, ioasid_t pasid,
3291 struct iommu_domain *old);
3292
3293 static struct iommu_domain blocking_domain = {
3294 .type = IOMMU_DOMAIN_BLOCKED,
3295 .ops = &(const struct iommu_domain_ops) {
3296 .attach_dev = blocking_domain_attach_dev,
3297 .set_dev_pasid = blocking_domain_set_dev_pasid,
3298 }
3299 };
3300
3301 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3302 {
3303 if (!intel_iommu_superpage)
3304 return 0;
3305
3306 if (first_stage)
3307 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3308
3309 return fls(cap_super_page_val(iommu->cap));
3310 }
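
/*
 * Worked example: with second-stage 2MiB and 1GiB superpage support,
 * cap_super_page_val() reads 0x3 and fls(0x3) == 2, i.e. two superpage
 * levels beyond 4KiB. For first-stage tables the result is 2 when
 * cap_fl1gp_support() is set (2MiB and 1GiB) and 1 otherwise (2MiB only).
 * The value feeds domain_super_pgsize_bitmap() via domain->iommu_superpage.
 */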
3311
3312 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3313 {
3314 struct device_domain_info *info = dev_iommu_priv_get(dev);
3315 struct intel_iommu *iommu = info->iommu;
3316 struct dmar_domain *domain;
3317 int addr_width;
3318
3319 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3320 if (!domain)
3321 return ERR_PTR(-ENOMEM);
3322
3323 INIT_LIST_HEAD(&domain->devices);
3324 INIT_LIST_HEAD(&domain->dev_pasids);
3325 INIT_LIST_HEAD(&domain->cache_tags);
3326 spin_lock_init(&domain->lock);
3327 spin_lock_init(&domain->cache_lock);
3328 xa_init(&domain->iommu_array);
3329
3330 domain->nid = dev_to_node(dev);
3331 domain->use_first_level = first_stage;
3332
3333 /* calculate the address width */
3334 addr_width = agaw_to_width(iommu->agaw);
3335 if (addr_width > cap_mgaw(iommu->cap))
3336 addr_width = cap_mgaw(iommu->cap);
3337 domain->gaw = addr_width;
3338 domain->agaw = iommu->agaw;
3339 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3340
3341 /* iommu memory access coherency */
3342 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3343
3344 /* pagesize bitmap */
3345 domain->domain.pgsize_bitmap = SZ_4K;
3346 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3347 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3348
3349 /*
3350 * IOVA aperture: First-level translation restricts the input-address
3351 * to a canonical address (i.e., address bits 63:N have the same value
3352 * as address bit [N-1], where N is 48-bits with 4-level paging and
3353 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3354 */
3355 domain->domain.geometry.force_aperture = true;
3356 domain->domain.geometry.aperture_start = 0;
3357 if (first_stage)
3358 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3359 else
3360 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3361
3362 /* always allocate the top pgd */
3363 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3364 if (!domain->pgd) {
3365 kfree(domain);
3366 return ERR_PTR(-ENOMEM);
3367 }
3368 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3369
3370 return domain;
3371 }
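
/*
 * Worked example of the aperture set up above: with gaw == 48, a
 * second-stage domain ends at __DOMAIN_MAX_ADDR(48) = 2^48 - 1, while a
 * first-stage domain is limited to __DOMAIN_MAX_ADDR(47) = 2^47 - 1
 * because first-stage input addresses must be canonical (bits 63:47 all
 * equal), so bit 47 cannot be used for the IOVA aperture.
 */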
3372
3373 static struct iommu_domain *
3374 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3375 const struct iommu_user_data *user_data)
3376 {
3377 struct device_domain_info *info = dev_iommu_priv_get(dev);
3378 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3379 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3380 struct intel_iommu *iommu = info->iommu;
3381 struct dmar_domain *dmar_domain;
3382 struct iommu_domain *domain;
3383 bool first_stage;
3384
3385 if (flags &
3386 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3387 return ERR_PTR(-EOPNOTSUPP);
3388 if (nested_parent && !nested_supported(iommu))
3389 return ERR_PTR(-EOPNOTSUPP);
3390 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3391 return ERR_PTR(-EOPNOTSUPP);
3392
3393 /*
3394 * Always allocate the guest compatible page table unless
3395 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3396 * is specified.
3397 */
3398 if (nested_parent || dirty_tracking) {
3399 if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3400 return ERR_PTR(-EOPNOTSUPP);
3401 first_stage = false;
3402 } else {
3403 first_stage = first_level_by_default(iommu);
3404 }
3405
3406 dmar_domain = paging_domain_alloc(dev, first_stage);
3407 if (IS_ERR(dmar_domain))
3408 return ERR_CAST(dmar_domain);
3409 domain = &dmar_domain->domain;
3410 domain->type = IOMMU_DOMAIN_UNMANAGED;
3411 domain->owner = &intel_iommu_ops;
3412 domain->ops = intel_iommu_ops.default_domain_ops;
3413
3414 if (nested_parent) {
3415 dmar_domain->nested_parent = true;
3416 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3417 spin_lock_init(&dmar_domain->s1_lock);
3418 }
3419
3420 if (dirty_tracking) {
3421 if (dmar_domain->use_first_level) {
3422 iommu_domain_free(domain);
3423 return ERR_PTR(-EOPNOTSUPP);
3424 }
3425 domain->dirty_ops = &intel_dirty_ops;
3426 }
3427
3428 return domain;
3429 }
3430
3431 static void intel_iommu_domain_free(struct iommu_domain *domain)
3432 {
3433 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3434
3435 WARN_ON(dmar_domain->nested_parent &&
3436 !list_empty(&dmar_domain->s1_domains));
3437 domain_exit(dmar_domain);
3438 }
3439
3440 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3441 {
3442 struct device_domain_info *info = dev_iommu_priv_get(dev);
3443 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3444 struct intel_iommu *iommu = info->iommu;
3445 int addr_width;
3446
3447 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3448 return -EPERM;
3449
3450 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3451 return -EINVAL;
3452
3453 if (domain->dirty_ops && !ssads_supported(iommu))
3454 return -EINVAL;
3455
3456 if (dmar_domain->iommu_coherency !=
3457 iommu_paging_structure_coherency(iommu))
3458 return -EINVAL;
3459
3460 if (dmar_domain->iommu_superpage !=
3461 iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3462 return -EINVAL;
3463
3464 if (dmar_domain->use_first_level &&
3465 (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3466 return -EINVAL;
3467
3468 /* check if this iommu agaw is sufficient for max mapped address */
3469 addr_width = agaw_to_width(iommu->agaw);
3470 if (addr_width > cap_mgaw(iommu->cap))
3471 addr_width = cap_mgaw(iommu->cap);
3472
3473 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3474 return -EINVAL;
3475
3476 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3477 context_copied(iommu, info->bus, info->devfn))
3478 return intel_pasid_setup_sm_context(dev);
3479
3480 return 0;
3481 }
3482
3483 static int intel_iommu_attach_device(struct iommu_domain *domain,
3484 struct device *dev)
3485 {
3486 int ret;
3487
3488 device_block_translation(dev);
3489
3490 ret = paging_domain_compatible(domain, dev);
3491 if (ret)
3492 return ret;
3493
3494 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3495 }
3496
3497 static int intel_iommu_map(struct iommu_domain *domain,
3498 unsigned long iova, phys_addr_t hpa,
3499 size_t size, int iommu_prot, gfp_t gfp)
3500 {
3501 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3502 u64 max_addr;
3503 int prot = 0;
3504
3505 if (iommu_prot & IOMMU_READ)
3506 prot |= DMA_PTE_READ;
3507 if (iommu_prot & IOMMU_WRITE)
3508 prot |= DMA_PTE_WRITE;
3509 if (dmar_domain->set_pte_snp)
3510 prot |= DMA_PTE_SNP;
3511
3512 max_addr = iova + size;
3513 if (dmar_domain->max_addr < max_addr) {
3514 u64 end;
3515
3516 /* check if minimum agaw is sufficient for mapped address */
3517 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3518 if (end < max_addr) {
3519 pr_err("%s: iommu width (%d) is not "
3520 "sufficient for the mapped address (%llx)\n",
3521 __func__, dmar_domain->gaw, max_addr);
3522 return -EFAULT;
3523 }
3524 dmar_domain->max_addr = max_addr;
3525 }
3526 /* Round up size to next multiple of PAGE_SIZE, if it and
3527 the low bits of hpa would take us onto the next page */
3528 size = aligned_nrpages(hpa, size);
3529 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3530 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3531 }
3532
3533 static int intel_iommu_map_pages(struct iommu_domain *domain,
3534 unsigned long iova, phys_addr_t paddr,
3535 size_t pgsize, size_t pgcount,
3536 int prot, gfp_t gfp, size_t *mapped)
3537 {
3538 unsigned long pgshift = __ffs(pgsize);
3539 size_t size = pgcount << pgshift;
3540 int ret;
3541
3542 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3543 return -EINVAL;
3544
3545 if (!IS_ALIGNED(iova | paddr, pgsize))
3546 return -EINVAL;
3547
3548 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3549 if (!ret && mapped)
3550 *mapped = size;
3551
3552 return ret;
3553 }
3554
3555 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3556 unsigned long iova, size_t size,
3557 struct iommu_iotlb_gather *gather)
3558 {
3559 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3560 unsigned long start_pfn, last_pfn;
3561 int level = 0;
3562
3563 /* Cope with horrid API which requires us to unmap more than the
3564 size argument if it happens to be a large-page mapping. */
3565 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3566 &level, GFP_ATOMIC)))
3567 return 0;
3568
3569 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3570 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3571
3572 start_pfn = iova >> VTD_PAGE_SHIFT;
3573 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3574
3575 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3576
3577 if (dmar_domain->max_addr == iova + size)
3578 dmar_domain->max_addr = iova;
3579
3580 /*
3581 * We do not use page-selective IOTLB invalidation in flush queue,
3582 * so there is no need to track page and sync iotlb.
3583 */
3584 if (!iommu_iotlb_gather_queued(gather))
3585 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3586
3587 return size;
3588 }
3589
3590 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3591 unsigned long iova,
3592 size_t pgsize, size_t pgcount,
3593 struct iommu_iotlb_gather *gather)
3594 {
3595 unsigned long pgshift = __ffs(pgsize);
3596 size_t size = pgcount << pgshift;
3597
3598 return intel_iommu_unmap(domain, iova, size, gather);
3599 }
3600
3601 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3602 struct iommu_iotlb_gather *gather)
3603 {
3604 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3605 gather->end, list_empty(&gather->freelist));
3606 iommu_put_pages_list(&gather->freelist);
3607 }
3608
3609 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3610 dma_addr_t iova)
3611 {
3612 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3613 struct dma_pte *pte;
3614 int level = 0;
3615 u64 phys = 0;
3616
3617 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3618 GFP_ATOMIC);
3619 if (pte && dma_pte_present(pte))
3620 phys = dma_pte_addr(pte) +
3621 (iova & (BIT_MASK(level_to_offset_bits(level) +
3622 VTD_PAGE_SHIFT) - 1));
3623
3624 return phys;
3625 }
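
/*
 * Worked example: if the IOVA is covered by a 2MiB superpage,
 * pfn_to_dma_pte() returns the level-2 PTE and level_to_offset_bits(2) is
 * 9, so the low 9 + 12 = 21 bits of the IOVA are added to dma_pte_addr()
 * to recover the exact physical address within the superpage.
 */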
3626
3627 static bool domain_support_force_snooping(struct dmar_domain *domain)
3628 {
3629 struct device_domain_info *info;
3630 bool support = true;
3631
3632 assert_spin_locked(&domain->lock);
3633 list_for_each_entry(info, &domain->devices, link) {
3634 if (!ecap_sc_support(info->iommu->ecap)) {
3635 support = false;
3636 break;
3637 }
3638 }
3639
3640 return support;
3641 }
3642
3643 static void domain_set_force_snooping(struct dmar_domain *domain)
3644 {
3645 struct device_domain_info *info;
3646
3647 assert_spin_locked(&domain->lock);
3648 /*
3649 * Second level page table supports per-PTE snoop control. The
3650 * iommu_map() interface will handle this by setting SNP bit.
3651 */
3652 if (!domain->use_first_level) {
3653 domain->set_pte_snp = true;
3654 return;
3655 }
3656
3657 list_for_each_entry(info, &domain->devices, link)
3658 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3659 IOMMU_NO_PASID);
3660 }
3661
3662 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3663 {
3664 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3665 unsigned long flags;
3666
3667 if (dmar_domain->force_snooping)
3668 return true;
3669
3670 spin_lock_irqsave(&dmar_domain->lock, flags);
3671 if (!domain_support_force_snooping(dmar_domain) ||
3672 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3673 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3674 return false;
3675 }
3676
3677 domain_set_force_snooping(dmar_domain);
3678 dmar_domain->force_snooping = true;
3679 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3680
3681 return true;
3682 }
3683
3684 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3685 {
3686 struct device_domain_info *info = dev_iommu_priv_get(dev);
3687
3688 switch (cap) {
3689 case IOMMU_CAP_CACHE_COHERENCY:
3690 case IOMMU_CAP_DEFERRED_FLUSH:
3691 return true;
3692 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3693 return dmar_platform_optin();
3694 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3695 return ecap_sc_support(info->iommu->ecap);
3696 case IOMMU_CAP_DIRTY_TRACKING:
3697 return ssads_supported(info->iommu);
3698 default:
3699 return false;
3700 }
3701 }
3702
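/*
 * Set up the per-device IOMMU private data: look up the IOMMU that serves
 * the device, record its bus/devfn/segment, probe ATS, PASID and PRI
 * capabilities and, in scalable mode, allocate the PASID table.
 */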
3703 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3704 {
3705 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3706 struct device_domain_info *info;
3707 struct intel_iommu *iommu;
3708 u8 bus, devfn;
3709 int ret;
3710
3711 iommu = device_lookup_iommu(dev, &bus, &devfn);
3712 if (!iommu || !iommu->iommu.ops)
3713 return ERR_PTR(-ENODEV);
3714
3715 info = kzalloc(sizeof(*info), GFP_KERNEL);
3716 if (!info)
3717 return ERR_PTR(-ENOMEM);
3718
3719 if (dev_is_real_dma_subdevice(dev)) {
3720 info->bus = pdev->bus->number;
3721 info->devfn = pdev->devfn;
3722 info->segment = pci_domain_nr(pdev->bus);
3723 } else {
3724 info->bus = bus;
3725 info->devfn = devfn;
3726 info->segment = iommu->segment;
3727 }
3728
3729 info->dev = dev;
3730 info->iommu = iommu;
3731 if (dev_is_pci(dev)) {
3732 if (ecap_dev_iotlb_support(iommu->ecap) &&
3733 pci_ats_supported(pdev) &&
3734 dmar_ats_supported(pdev, iommu)) {
3735 info->ats_supported = 1;
3736 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3737
3738 /*
3739 * For IOMMU that supports device IOTLB throttling
3740 * (DIT), we assign PFSID to the invalidation desc
3741 * of a VF such that IOMMU HW can gauge queue depth
3742 * at PF level. If DIT is not set, PFSID will be
3743 * treated as reserved, which should be set to 0.
3744 */
3745 if (ecap_dit(iommu->ecap))
3746 info->pfsid = pci_dev_id(pci_physfn(pdev));
3747 info->ats_qdep = pci_ats_queue_depth(pdev);
3748 }
3749 if (sm_supported(iommu)) {
3750 if (pasid_supported(iommu)) {
3751 int features = pci_pasid_features(pdev);
3752
3753 if (features >= 0)
3754 info->pasid_supported = features | 1;
3755 }
3756
3757 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3758 pci_pri_supported(pdev))
3759 info->pri_supported = 1;
3760 }
3761 }
3762
3763 dev_iommu_priv_set(dev, info);
3764 if (pdev && pci_ats_supported(pdev)) {
3765 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3766 ret = device_rbtree_insert(iommu, info);
3767 if (ret)
3768 goto free;
3769 }
3770
3771 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3772 ret = intel_pasid_alloc_table(dev);
3773 if (ret) {
3774 dev_err(dev, "PASID table allocation failed\n");
3775 goto clear_rbtree;
3776 }
3777
3778 if (!context_copied(iommu, info->bus, info->devfn)) {
3779 ret = intel_pasid_setup_sm_context(dev);
3780 if (ret)
3781 goto free_table;
3782 }
3783 }
3784
3785 intel_iommu_debugfs_create_dev(info);
3786
3787 /*
3788 * The PCIe spec, in its wisdom, declares that the behaviour of the
3789 * device is undefined if you enable PASID support after ATS support.
3790 * So always enable PASID support on devices which have it, even if
3791 * we can't yet know if we're ever going to use it.
3792 */
3793 if (info->pasid_supported &&
3794 !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3795 info->pasid_enabled = 1;
3796
3797 if (sm_supported(iommu))
3798 iommu_enable_pci_ats(info);
3799 iommu_enable_pci_pri(info);
3800
3801 return &iommu->iommu;
3802 free_table:
3803 intel_pasid_free_table(dev);
3804 clear_rbtree:
3805 device_rbtree_remove(info);
3806 free:
3807 kfree(info);
3808
3809 return ERR_PTR(ret);
3810 }
3811
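/*
 * Undo intel_iommu_probe_device(): disable PRI, ATS and PASID, drop the
 * device from the RID rbtree, tear down its PASID table and free the
 * private data.
 */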
3812 static void intel_iommu_release_device(struct device *dev)
3813 {
3814 struct device_domain_info *info = dev_iommu_priv_get(dev);
3815 struct intel_iommu *iommu = info->iommu;
3816
3817 iommu_disable_pci_pri(info);
3818 iommu_disable_pci_ats(info);
3819
3820 if (info->pasid_enabled) {
3821 pci_disable_pasid(to_pci_dev(dev));
3822 info->pasid_enabled = 0;
3823 }
3824
3825 mutex_lock(&iommu->iopf_lock);
3826 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3827 device_rbtree_remove(info);
3828 mutex_unlock(&iommu->iopf_lock);
3829
3830 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3831 !context_copied(iommu, info->bus, info->devfn))
3832 intel_pasid_teardown_sm_context(dev);
3833
3834 intel_pasid_free_table(dev);
3835 intel_iommu_debugfs_remove_dev(info);
3836 kfree(info);
3837 set_dma_ops(dev, NULL);
3838 }
3839
3840 static void intel_iommu_get_resv_regions(struct device *device,
3841 struct list_head *head)
3842 {
3843 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3844 struct iommu_resv_region *reg;
3845 struct dmar_rmrr_unit *rmrr;
3846 struct device *i_dev;
3847 int i;
3848
3849 rcu_read_lock();
3850 for_each_rmrr_units(rmrr) {
3851 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3852 i, i_dev) {
3853 struct iommu_resv_region *resv;
3854 enum iommu_resv_type type;
3855 size_t length;
3856
3857 if (i_dev != device &&
3858 !is_downstream_to_pci_bridge(device, i_dev))
3859 continue;
3860
3861 length = rmrr->end_address - rmrr->base_address + 1;
3862
3863 type = device_rmrr_is_relaxable(device) ?
3864 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3865
3866 resv = iommu_alloc_resv_region(rmrr->base_address,
3867 length, prot, type,
3868 GFP_ATOMIC);
3869 if (!resv)
3870 break;
3871
3872 list_add_tail(&resv->list, head);
3873 }
3874 }
3875 rcu_read_unlock();
3876
3877 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3878 if (dev_is_pci(device)) {
3879 struct pci_dev *pdev = to_pci_dev(device);
3880
3881 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3882 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3883 IOMMU_RESV_DIRECT_RELAXABLE,
3884 GFP_KERNEL);
3885 if (reg)
3886 list_add_tail(&reg->list, head);
3887 }
3888 }
3889 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3890
3891 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3892 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3893 0, IOMMU_RESV_MSI, GFP_KERNEL);
3894 if (!reg)
3895 return;
3896 list_add_tail(&reg->list, head);
3897 }
3898
3899 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3900 {
3901 if (dev_is_pci(dev))
3902 return pci_device_group(dev);
3903 return generic_device_group(dev);
3904 }
3905
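/*
 * Enable I/O page fault handling for the device. Enablement is reference
 * counted so that multiple users share a single registration with the
 * IOMMU's IOPF queue; PRI must already be enabled.
 */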
3906 int intel_iommu_enable_iopf(struct device *dev)
3907 {
3908 struct device_domain_info *info = dev_iommu_priv_get(dev);
3909 struct intel_iommu *iommu = info->iommu;
3910 int ret;
3911
3912 if (!info->pri_enabled)
3913 return -ENODEV;
3914
3915 if (info->iopf_refcount) {
3916 info->iopf_refcount++;
3917 return 0;
3918 }
3919
3920 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3921 if (ret)
3922 return ret;
3923
3924 info->iopf_refcount = 1;
3925
3926 return 0;
3927 }
3928
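/*
 * Drop one reference on the device's IOPF enablement and remove the
 * device from the IOPF queue when the last user goes away.
 */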
3929 void intel_iommu_disable_iopf(struct device *dev)
3930 {
3931 struct device_domain_info *info = dev_iommu_priv_get(dev);
3932 struct intel_iommu *iommu = info->iommu;
3933
3934 if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
3935 return;
3936
3937 if (--info->iopf_refcount)
3938 return;
3939
3940 iopf_queue_remove_device(iommu->iopf_queue, dev);
3941 }
3942
3943 static int
3944 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
3945 {
3946 switch (feat) {
3947 case IOMMU_DEV_FEAT_IOPF:
3948 return intel_iommu_enable_iopf(dev);
3949
3950 case IOMMU_DEV_FEAT_SVA:
3951 return 0;
3952
3953 default:
3954 return -ENODEV;
3955 }
3956 }
3957
3958 static int
3959 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
3960 {
3961 switch (feat) {
3962 case IOMMU_DEV_FEAT_IOPF:
3963 intel_iommu_disable_iopf(dev);
3964 return 0;
3965
3966 case IOMMU_DEV_FEAT_SVA:
3967 return 0;
3968
3969 default:
3970 return -ENODEV;
3971 }
3972 }
3973
3974 static bool intel_iommu_is_attach_deferred(struct device *dev)
3975 {
3976 struct device_domain_info *info = dev_iommu_priv_get(dev);
3977
3978 return translation_pre_enabled(info->iommu) && !info->domain;
3979 }
3980
3981 /*
3982 * Check that the device does not live on an external facing PCI port that is
3983 * marked as untrusted. Such devices should not be able to apply quirks and
3984 * thus not be able to bypass the IOMMU restrictions.
3985 */
3986 static bool risky_device(struct pci_dev *pdev)
3987 {
3988 if (pdev->untrusted) {
3989 pci_info(pdev,
3990 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
3991 pdev->vendor, pdev->device);
3992 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
3993 return true;
3994 }
3995 return false;
3996 }
3997
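/*
 * Flush the caches covering a range that has just been mapped so that any
 * stale non-present entries cached for the range are invalidated.
 */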
3998 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
3999 unsigned long iova, size_t size)
4000 {
4001 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4002
4003 return 0;
4004 }
4005
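/*
 * Detach a (device, PASID) pair from the domain: unlink the tracking
 * entry, drop its cache tag and IOMMU attachment, and free it. Identity
 * domains carry no per-PASID metadata and are skipped.
 */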
4006 void domain_remove_dev_pasid(struct iommu_domain *domain,
4007 struct device *dev, ioasid_t pasid)
4008 {
4009 struct device_domain_info *info = dev_iommu_priv_get(dev);
4010 struct dev_pasid_info *curr, *dev_pasid = NULL;
4011 struct intel_iommu *iommu = info->iommu;
4012 struct dmar_domain *dmar_domain;
4013 unsigned long flags;
4014
4015 if (!domain)
4016 return;
4017
4018 /* Identity domain has no meta data for pasid. */
4019 if (domain->type == IOMMU_DOMAIN_IDENTITY)
4020 return;
4021
4022 dmar_domain = to_dmar_domain(domain);
4023 spin_lock_irqsave(&dmar_domain->lock, flags);
4024 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4025 if (curr->dev == dev && curr->pasid == pasid) {
4026 list_del(&curr->link_domain);
4027 dev_pasid = curr;
4028 break;
4029 }
4030 }
4031 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4032
4033 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4034 domain_detach_iommu(dmar_domain, iommu);
4035 if (!WARN_ON_ONCE(!dev_pasid)) {
4036 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4037 kfree(dev_pasid);
4038 }
4039 }
4040
4041 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4042 struct device *dev, ioasid_t pasid,
4043 struct iommu_domain *old)
4044 {
4045 struct device_domain_info *info = dev_iommu_priv_get(dev);
4046
4047 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4048 domain_remove_dev_pasid(old, dev, pasid);
4049
4050 return 0;
4051 }
4052
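/*
 * Allocate tracking data for a (device, PASID) attachment, attach the
 * domain to the device's IOMMU, assign a cache tag and link the entry
 * onto the domain's dev_pasids list.
 */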
4053 struct dev_pasid_info *
4054 domain_add_dev_pasid(struct iommu_domain *domain,
4055 struct device *dev, ioasid_t pasid)
4056 {
4057 struct device_domain_info *info = dev_iommu_priv_get(dev);
4058 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4059 struct intel_iommu *iommu = info->iommu;
4060 struct dev_pasid_info *dev_pasid;
4061 unsigned long flags;
4062 int ret;
4063
4064 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4065 if (!dev_pasid)
4066 return ERR_PTR(-ENOMEM);
4067
4068 ret = domain_attach_iommu(dmar_domain, iommu);
4069 if (ret)
4070 goto out_free;
4071
4072 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4073 if (ret)
4074 goto out_detach_iommu;
4075
4076 dev_pasid->dev = dev;
4077 dev_pasid->pasid = pasid;
4078 spin_lock_irqsave(&dmar_domain->lock, flags);
4079 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4080 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4081
4082 return dev_pasid;
4083 out_detach_iommu:
4084 domain_detach_iommu(dmar_domain, iommu);
4085 out_free:
4086 kfree(dev_pasid);
4087 return ERR_PTR(ret);
4088 }
4089
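/*
 * Attach a paging domain to a specific PASID of the device. The PASID
 * entry is set up for the new domain first; the old domain, if any, is
 * only detached afterwards.
 */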
4090 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4091 struct device *dev, ioasid_t pasid,
4092 struct iommu_domain *old)
4093 {
4094 struct device_domain_info *info = dev_iommu_priv_get(dev);
4095 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4096 struct intel_iommu *iommu = info->iommu;
4097 struct dev_pasid_info *dev_pasid;
4098 int ret;
4099
4100 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4101 return -EINVAL;
4102
4103 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4104 return -EOPNOTSUPP;
4105
4106 if (domain->dirty_ops)
4107 return -EINVAL;
4108
4109 if (context_copied(iommu, info->bus, info->devfn))
4110 return -EBUSY;
4111
4112 ret = paging_domain_compatible(domain, dev);
4113 if (ret)
4114 return ret;
4115
4116 dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4117 if (IS_ERR(dev_pasid))
4118 return PTR_ERR(dev_pasid);
4119
4120 if (dmar_domain->use_first_level)
4121 ret = domain_setup_first_level(iommu, dmar_domain,
4122 dev, pasid, old);
4123 else
4124 ret = domain_setup_second_level(iommu, dmar_domain,
4125 dev, pasid, old);
4126 if (ret)
4127 goto out_remove_dev_pasid;
4128
4129 domain_remove_dev_pasid(old, dev, pasid);
4130
4131 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4132
4133 return 0;
4134
4135 out_remove_dev_pasid:
4136 domain_remove_dev_pasid(domain, dev, pasid);
4137 return ret;
4138 }
4139
4140 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4141 {
4142 struct device_domain_info *info = dev_iommu_priv_get(dev);
4143 struct intel_iommu *iommu = info->iommu;
4144 struct iommu_hw_info_vtd *vtd;
4145
4146 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4147 if (!vtd)
4148 return ERR_PTR(-ENOMEM);
4149
4150 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4151 vtd->cap_reg = iommu->cap;
4152 vtd->ecap_reg = iommu->ecap;
4153 *length = sizeof(*vtd);
4154 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4155 return vtd;
4156 }
4157
4158 /*
4159 * Set dirty tracking for the device list of a domain. The caller must
4160 * hold the domain->lock when calling it.
4161 */
4162 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4163 {
4164 struct device_domain_info *info;
4165 int ret = 0;
4166
4167 list_for_each_entry(info, devices, link) {
4168 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4169 IOMMU_NO_PASID, enable);
4170 if (ret)
4171 break;
4172 }
4173
4174 return ret;
4175 }
4176
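/*
 * Propagate a dirty tracking change to all first-stage domains nested on
 * this parent, rolling back the already-updated ones on failure.
 */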
4177 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4178 bool enable)
4179 {
4180 struct dmar_domain *s1_domain;
4181 unsigned long flags;
4182 int ret;
4183
4184 spin_lock(&domain->s1_lock);
4185 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4186 spin_lock_irqsave(&s1_domain->lock, flags);
4187 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4188 spin_unlock_irqrestore(&s1_domain->lock, flags);
4189 if (ret)
4190 goto err_unwind;
4191 }
4192 spin_unlock(&domain->s1_lock);
4193 return 0;
4194
4195 err_unwind:
4196 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4197 spin_lock_irqsave(&s1_domain->lock, flags);
4198 device_set_dirty_tracking(&s1_domain->devices,
4199 domain->dirty_tracking);
4200 spin_unlock_irqrestore(&s1_domain->lock, flags);
4201 }
4202 spin_unlock(&domain->s1_lock);
4203 return ret;
4204 }
4205
4206 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4207 bool enable)
4208 {
4209 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210 int ret;
4211
4212 spin_lock(&dmar_domain->lock);
4213 if (dmar_domain->dirty_tracking == enable)
4214 goto out_unlock;
4215
4216 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4217 if (ret)
4218 goto err_unwind;
4219
4220 if (dmar_domain->nested_parent) {
4221 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4222 if (ret)
4223 goto err_unwind;
4224 }
4225
4226 dmar_domain->dirty_tracking = enable;
4227 out_unlock:
4228 spin_unlock(&dmar_domain->lock);
4229
4230 return 0;
4231
4232 err_unwind:
4233 device_set_dirty_tracking(&dmar_domain->devices,
4234 dmar_domain->dirty_tracking);
4235 spin_unlock(&dmar_domain->lock);
4236 return ret;
4237 }
4238
4239 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4240 unsigned long iova, size_t size,
4241 unsigned long flags,
4242 struct iommu_dirty_bitmap *dirty)
4243 {
4244 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4245 unsigned long end = iova + size - 1;
4246 unsigned long pgsize;
4247
4248 /*
4249 * IOMMUFD core calls into a dirty tracking disabled domain without an
4250 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4251 * have occurred when we stopped dirty tracking. This ensures that we
4252 * never inherit dirtied bits from a previous cycle.
4253 */
4254 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4255 return -EINVAL;
4256
4257 do {
4258 struct dma_pte *pte;
4259 int lvl = 0;
4260
4261 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4262 GFP_ATOMIC);
4263 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4264 if (!pte || !dma_pte_present(pte)) {
4265 iova += pgsize;
4266 continue;
4267 }
4268
4269 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4270 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4271 iova += pgsize;
4272 } while (iova < end);
4273
4274 return 0;
4275 }
4276
4277 static const struct iommu_dirty_ops intel_dirty_ops = {
4278 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4279 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4280 };
4281
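/*
 * Program a legacy-mode context entry for pass-through translation using
 * FLPT_DEFAULT_DID and the largest address width supported by the
 * hardware.
 */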
4282 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4283 {
4284 struct device_domain_info *info = dev_iommu_priv_get(dev);
4285 struct intel_iommu *iommu = info->iommu;
4286 struct context_entry *context;
4287
4288 spin_lock(&iommu->lock);
4289 context = iommu_context_addr(iommu, bus, devfn, 1);
4290 if (!context) {
4291 spin_unlock(&iommu->lock);
4292 return -ENOMEM;
4293 }
4294
4295 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4296 spin_unlock(&iommu->lock);
4297 return 0;
4298 }
4299
4300 copied_context_tear_down(iommu, context, bus, devfn);
4301 context_clear_entry(context);
4302 context_set_domain_id(context, FLPT_DEFAULT_DID);
4303
4304 /*
4305 * In pass through mode, AW must be programmed to indicate the largest
4306 * AGAW value supported by hardware. And ASR is ignored by hardware.
4307 */
4308 context_set_address_width(context, iommu->msagaw);
4309 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4310 context_set_fault_enable(context);
4311 context_set_present(context);
4312 if (!ecap_coherent(iommu->ecap))
4313 clflush_cache_range(context, sizeof(*context));
4314 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4315 spin_unlock(&iommu->lock);
4316
4317 return 0;
4318 }
4319
4320 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4321 {
4322 struct device *dev = data;
4323
4324 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4325 }
4326
4327 static int device_setup_pass_through(struct device *dev)
4328 {
4329 struct device_domain_info *info = dev_iommu_priv_get(dev);
4330
4331 if (!dev_is_pci(dev))
4332 return context_setup_pass_through(dev, info->bus, info->devfn);
4333
4334 return pci_for_each_dma_alias(to_pci_dev(dev),
4335 context_setup_pass_through_cb, dev);
4336 }
4337
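/*
 * Attach the global identity domain: block any existing translation
 * first, then program pass-through either via the PASID table (scalable
 * mode) or via the context entries (legacy mode).
 */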
4338 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4339 {
4340 struct device_domain_info *info = dev_iommu_priv_get(dev);
4341 struct intel_iommu *iommu = info->iommu;
4342 int ret;
4343
4344 device_block_translation(dev);
4345
4346 if (dev_is_real_dma_subdevice(dev))
4347 return 0;
4348
4349 if (sm_supported(iommu))
4350 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4351 else
4352 ret = device_setup_pass_through(dev);
4353
4354 return ret;
4355 }
4356
4357 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4358 struct device *dev, ioasid_t pasid,
4359 struct iommu_domain *old)
4360 {
4361 struct device_domain_info *info = dev_iommu_priv_get(dev);
4362 struct intel_iommu *iommu = info->iommu;
4363 int ret;
4364
4365 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4366 return -EOPNOTSUPP;
4367
4368 ret = domain_setup_passthrough(iommu, dev, pasid, old);
4369 if (ret)
4370 return ret;
4371
4372 domain_remove_dev_pasid(old, dev, pasid);
4373 return 0;
4374 }
4375
4376 static struct iommu_domain identity_domain = {
4377 .type = IOMMU_DOMAIN_IDENTITY,
4378 .ops = &(const struct iommu_domain_ops) {
4379 .attach_dev = identity_domain_attach_dev,
4380 .set_dev_pasid = identity_domain_set_dev_pasid,
4381 },
4382 };
4383
4384 const struct iommu_ops intel_iommu_ops = {
4385 .blocked_domain = &blocking_domain,
4386 .release_domain = &blocking_domain,
4387 .identity_domain = &identity_domain,
4388 .capable = intel_iommu_capable,
4389 .hw_info = intel_iommu_hw_info,
4390 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4391 .domain_alloc_sva = intel_svm_domain_alloc,
4392 .domain_alloc_nested = intel_iommu_domain_alloc_nested,
4393 .probe_device = intel_iommu_probe_device,
4394 .release_device = intel_iommu_release_device,
4395 .get_resv_regions = intel_iommu_get_resv_regions,
4396 .device_group = intel_iommu_device_group,
4397 .dev_enable_feat = intel_iommu_dev_enable_feat,
4398 .dev_disable_feat = intel_iommu_dev_disable_feat,
4399 .is_attach_deferred = intel_iommu_is_attach_deferred,
4400 .def_domain_type = device_def_domain_type,
4401 .pgsize_bitmap = SZ_4K,
4402 .page_response = intel_iommu_page_response,
4403 .default_domain_ops = &(const struct iommu_domain_ops) {
4404 .attach_dev = intel_iommu_attach_device,
4405 .set_dev_pasid = intel_iommu_set_dev_pasid,
4406 .map_pages = intel_iommu_map_pages,
4407 .unmap_pages = intel_iommu_unmap_pages,
4408 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4409 .flush_iotlb_all = intel_flush_iotlb_all,
4410 .iotlb_sync = intel_iommu_tlb_sync,
4411 .iova_to_phys = intel_iommu_iova_to_phys,
4412 .free = intel_iommu_domain_free,
4413 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4414 }
4415 };
4416
4417 static void quirk_iommu_igfx(struct pci_dev *dev)
4418 {
4419 if (risky_device(dev))
4420 return;
4421
4422 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4423 disable_igfx_iommu = 1;
4424 }
4425
4426 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4427 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4428 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4429 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4430 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4431 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4432 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4433 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4434
4435 /* Broadwell igfx malfunctions with dmar */
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4439 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4440 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4441 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4442 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4444 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4445 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4457 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4458 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4459 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4460
4461 static void quirk_iommu_rwbf(struct pci_dev *dev)
4462 {
4463 if (risky_device(dev))
4464 return;
4465
4466 /*
4467 * Mobile 4 Series Chipset neglects to set RWBF capability,
4468 * but needs it. Same seems to hold for the desktop versions.
4469 */
4470 pci_info(dev, "Forcing write-buffer flush capability\n");
4471 rwbf_quirk = 1;
4472 }
4473
4474 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4475 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4476 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4477 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4478 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4479 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4480 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4481
4482 #define GGC 0x52
4483 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4484 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4485 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4486 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4487 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4488 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4489 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4490 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4491
4492 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4493 {
4494 unsigned short ggc;
4495
4496 if (risky_device(dev))
4497 return;
4498
4499 if (pci_read_config_word(dev, GGC, &ggc))
4500 return;
4501
4502 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4503 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4504 disable_igfx_iommu = 1;
4505 } else if (!disable_igfx_iommu) {
4506 /* we have to ensure the gfx device is idle before we flush */
4507 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4508 iommu_set_dma_strict();
4509 }
4510 }
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4515
4516 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4517 {
4518 unsigned short ver;
4519
4520 if (!IS_GFX_DEVICE(dev))
4521 return;
4522
4523 ver = (dev->device >> 8) & 0xff;
4524 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4525 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4526 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4527 return;
4528
4529 if (risky_device(dev))
4530 return;
4531
4532 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4533 iommu_skip_te_disable = 1;
4534 }
4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4536
4537 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4538 ISOCH DMAR unit for the Azalia sound device, but not give it any
4539 TLB entries, which causes it to deadlock. Check for that. We do
4540 this in a function called from init_dmars(), instead of in a PCI
4541 quirk, because we don't want to print the obnoxious "BIOS broken"
4542 message if VT-d is actually disabled.
4543 */
4544 static void __init check_tylersburg_isoch(void)
4545 {
4546 struct pci_dev *pdev;
4547 uint32_t vtisochctrl;
4548
4549 /* If there's no Azalia in the system anyway, forget it. */
4550 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4551 if (!pdev)
4552 return;
4553
4554 if (risky_device(pdev)) {
4555 pci_dev_put(pdev);
4556 return;
4557 }
4558
4559 pci_dev_put(pdev);
4560
4561 /* System Management Registers. Might be hidden, in which case
4562 we can't do the sanity check. But that's OK, because the
4563 known-broken BIOSes _don't_ actually hide it, so far. */
4564 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4565 if (!pdev)
4566 return;
4567
4568 if (risky_device(pdev)) {
4569 pci_dev_put(pdev);
4570 return;
4571 }
4572
4573 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4574 pci_dev_put(pdev);
4575 return;
4576 }
4577
4578 pci_dev_put(pdev);
4579
4580 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4581 if (vtisochctrl & 1)
4582 return;
4583
4584 /* Drop all bits other than the number of TLB entries */
4585 vtisochctrl &= 0x1c;
4586
4587 /* If we have the recommended number of TLB entries (16), fine. */
4588 if (vtisochctrl == 0x10)
4589 return;
4590
4591 /* Zero TLB entries? You get to ride the short bus to school. */
4592 if (!vtisochctrl) {
4593 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4594 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4595 dmi_get_system_info(DMI_BIOS_VENDOR),
4596 dmi_get_system_info(DMI_BIOS_VERSION),
4597 dmi_get_system_info(DMI_PRODUCT_VERSION));
4598 iommu_identity_mapping |= IDENTMAP_AZALIA;
4599 return;
4600 }
4601
4602 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4603 vtisochctrl);
4604 }
4605
4606 /*
4607 * Here we deal with a device TLB defect where the device may inadvertently
4608 * issue an ATS invalidation completion before posted writes that were initiated
4609 * with translated addresses matching the invalidation address range, violating
4610 * the invalidation completion ordering.
4611 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4612 * vulnerable to this defect. In other words, any dTLB invalidation not initiated
4613 * under the control of the trusted/privileged host device driver must use this
4614 * quirk.
4615 * Device TLBs are invalidated under the following six conditions:
4616 * 1. Device driver does DMA API unmap IOVA
4617 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4618 * 3. PASID is torn down after the PASID cache is flushed, e.g. on process
4619 *    exit_mmap() due to a crash
4620 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4621 * VM has to free pages that were unmapped
4622 * 5. Userspace driver unmaps a DMA buffer
4623 * 6. Cache invalidation in vSVA usage (upcoming)
4624 *
4625 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4626 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4627 * invalidate TLB the same way as normal user unmap which will use this quirk.
4628 * The dTLB invalidation after PASID cache flush does not need this quirk.
4629 *
4630 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4631 */
4632 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4633 unsigned long address, unsigned long mask,
4634 u32 pasid, u16 qdep)
4635 {
4636 u16 sid;
4637
4638 if (likely(!info->dtlb_extra_inval))
4639 return;
4640
4641 sid = PCI_DEVID(info->bus, info->devfn);
4642 if (pasid == IOMMU_NO_PASID) {
4643 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4644 qdep, address, mask);
4645 } else {
4646 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4647 pasid, qdep, address, mask);
4648 }
4649 }
4650
4651 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4652
4653 /*
4654 * Function to submit a command to the enhanced command interface. The
4655 * valid enhanced command descriptions are defined in Table 47 of the
4656 * VT-d spec. The VT-d hardware implementation may support some but not
4657 * all commands, which can be determined by checking the Enhanced
4658 * Command Capability Register.
4659 *
4660 * Return values:
4661 * - 0: Command successful without any error;
4662 * - Negative: software error value;
4663 * - Nonzero positive: failure status code defined in Table 48.
4664 */
4665 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4666 {
4667 unsigned long flags;
4668 u64 res;
4669 int ret;
4670
4671 if (!cap_ecmds(iommu->cap))
4672 return -ENODEV;
4673
4674 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4675
4676 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4677 if (res & DMA_ECMD_ECRSP_IP) {
4678 ret = -EBUSY;
4679 goto err;
4680 }
4681
4682 /*
4683 * Unconditionally write the operand B, because
4684 * - There is no side effect if an ecmd doesn't require an
4685 * operand B, but we set the register to some value.
4686 * - It's not invoked in any critical path. The extra MMIO
4687 * write doesn't bring any performance concerns.
4688 */
4689 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4690 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4691
4692 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4693 !(res & DMA_ECMD_ECRSP_IP), res);
4694
4695 if (res & DMA_ECMD_ECRSP_IP) {
4696 ret = -ETIMEDOUT;
4697 goto err;
4698 }
4699
4700 ret = ecmd_get_status_code(res);
4701 err:
4702 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4703
4704 return ret;
4705 }
4706