1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
35 #define ROOT_SIZE VTD_PAGE_SIZE
36 #define CONTEXT_SIZE VTD_PAGE_SIZE
37
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
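/*
 * Worked example (a sketch, assuming VTD_PAGE_SHIFT is 12): with the
 * default DEFAULT_DOMAIN_ADDRESS_WIDTH of 57, __DOMAIN_MAX_PFN(57) is
 * (1ULL << 45) - 1 and __DOMAIN_MAX_ADDR(57) is (1ULL << 57) - 1. The
 * min_t() clamp in DOMAIN_MAX_PFN only matters on 32-bit builds, where
 * it caps the PFN at ULONG_MAX so PFNs still fit in an unsigned long.
 */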
57
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60
61 /*
62 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
63 * (used when kernel is launched w/ TXT)
64 */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70
71 /*
72 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73 * if marked present.
74 */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 if (!(re->lo & 1))
78 return 0;
79
80 return re->lo & VTD_PAGE_MASK;
81 }
82
83 /*
84 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85 * if marked present.
86 */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 if (!(re->hi & 1))
90 return 0;
91
92 return re->hi & VTD_PAGE_MASK;
93 }
94
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 struct device_domain_info *info =
98 rb_entry(node, struct device_domain_info, node);
99 const u16 *rid_lhs = key;
100
101 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 return -1;
103
104 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 return 1;
106
107 return 0;
108 }
109
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 struct device_domain_info *info =
113 rb_entry(lhs, struct device_domain_info, node);
114 u16 key = PCI_DEVID(info->bus, info->devfn);
115
116 return device_rid_cmp_key(&key, rhs);
117 }
118
119 /*
120 * Looks up an IOMMU-probed device using its source ID.
121 *
122 * Returns the pointer to the device if there is a match. Otherwise,
123 * returns NULL.
124 *
125 * Note that this helper doesn't guarantee that the device won't be
126 * released by the iommu subsystem after being returned. The caller
127 * should use its own synchronization mechanism to avoid the device
128 * being released during its use if that is a possibility.
129 */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 struct device_domain_info *info = NULL;
133 struct rb_node *node;
134 unsigned long flags;
135
136 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 if (node)
139 info = rb_entry(node, struct device_domain_info, node);
140 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141
142 return info ? info->dev : NULL;
143 }
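/*
 * Usage sketch (hypothetical values): for a device at 0000:00:1f.3 the
 * source ID is PCI_DEVID(0x00, PCI_DEVFN(0x1f, 3)) == 0x00fb, so
 * device_rbtree_find(iommu, 0x00fb) returns its struct device if that
 * device has been probed on this IOMMU, or NULL otherwise.
 */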
144
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 struct device_domain_info *info)
147 {
148 struct rb_node *curr;
149 unsigned long flags;
150
151 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 if (WARN_ON(curr))
155 return -EEXIST;
156
157 return 0;
158 }
159
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 struct intel_iommu *iommu = info->iommu;
163 unsigned long flags;
164
165 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 rb_erase(&info->node, &iommu->device_rbtree);
167 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169
170 struct dmar_rmrr_unit {
171 struct list_head list; /* list of rmrr units */
172 struct acpi_dmar_header *hdr; /* ACPI header */
173 u64 base_address; /* reserved base address*/
174 u64 end_address; /* reserved end address */
175 struct dmar_dev_scope *devices; /* target devices */
176 int devices_cnt; /* target device count */
177 };
178
179 struct dmar_atsr_unit {
180 struct list_head list; /* list of ATSR units */
181 struct acpi_dmar_header *hdr; /* ACPI header */
182 struct dmar_dev_scope *devices; /* target devices */
183 int devices_cnt; /* target device count */
184 u8 include_all:1; /* include all ports */
185 };
186
187 struct dmar_satc_unit {
188 struct list_head list; /* list of SATC units */
189 struct acpi_dmar_header *hdr; /* ACPI header */
190 struct dmar_dev_scope *devices; /* target devices */
191 struct intel_iommu *iommu; /* the corresponding iommu */
192 int devices_cnt; /* target device count */
193 u8 atc_required:1; /* ATS is required */
194 };
195
196 static LIST_HEAD(dmar_atsr_units);
197 static LIST_HEAD(dmar_rmrr_units);
198 static LIST_HEAD(dmar_satc_units);
199
200 #define for_each_rmrr_units(rmrr) \
201 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202
203 static void intel_iommu_domain_free(struct iommu_domain *domain);
204
205 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
207
208 int intel_iommu_enabled = 0;
209 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
210
211 static int intel_iommu_superpage = 1;
212 static int iommu_identity_mapping;
213 static int iommu_skip_te_disable;
214 static int disable_igfx_iommu;
215
216 #define IDENTMAP_AZALIA 4
217
218 const struct iommu_ops intel_iommu_ops;
219 static const struct iommu_dirty_ops intel_dirty_ops;
220
221 static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225
226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230
231 static void init_translation_status(struct intel_iommu *iommu)
232 {
233 u32 gsts;
234
235 gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 if (gsts & DMA_GSTS_TES)
237 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239
240 static int __init intel_iommu_setup(char *str)
241 {
242 if (!str)
243 return -EINVAL;
244
245 while (*str) {
246 if (!strncmp(str, "on", 2)) {
247 dmar_disabled = 0;
248 pr_info("IOMMU enabled\n");
249 } else if (!strncmp(str, "off", 3)) {
250 dmar_disabled = 1;
251 no_platform_optin = 1;
252 pr_info("IOMMU disabled\n");
253 } else if (!strncmp(str, "igfx_off", 8)) {
254 disable_igfx_iommu = 1;
255 pr_info("Disable GFX device mapping\n");
256 } else if (!strncmp(str, "forcedac", 8)) {
257 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 iommu_dma_forcedac = true;
259 } else if (!strncmp(str, "strict", 6)) {
260 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 iommu_set_dma_strict();
262 } else if (!strncmp(str, "sp_off", 6)) {
263 pr_info("Disable supported super page\n");
264 intel_iommu_superpage = 0;
265 } else if (!strncmp(str, "sm_on", 5)) {
266 pr_info("Enable scalable mode if hardware supports\n");
267 intel_iommu_sm = 1;
268 } else if (!strncmp(str, "sm_off", 6)) {
269 pr_info("Scalable mode is disallowed\n");
270 intel_iommu_sm = 0;
271 } else if (!strncmp(str, "tboot_noforce", 13)) {
272 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 intel_iommu_tboot_noforce = 1;
274 } else {
275 pr_notice("Unknown option - '%s'\n", str);
276 }
277
278 str += strcspn(str, ",");
279 while (*str == ',')
280 str++;
281 }
282
283 return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
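/*
 * Example (illustrative only): the parser above accepts a comma-separated
 * option string on the kernel command line, e.g.
 *
 *     intel_iommu=on,sm_on,tboot_noforce
 *
 * which enables the IOMMU, requests scalable mode if the hardware supports
 * it, and skips forcing VT-d on after a tboot launch.
 */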
286
287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290
291 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293
294 /*
295 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297 * the returned SAGAW.
298 */
299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301 unsigned long fl_sagaw, sl_sagaw;
302
303 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 sl_sagaw = cap_sagaw(iommu->cap);
305
306 /* Second level only. */
307 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 return sl_sagaw;
309
310 /* First level only. */
311 if (!ecap_slts(iommu->ecap))
312 return fl_sagaw;
313
314 return fl_sagaw & sl_sagaw;
315 }
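/*
 * Encoding sketch (per the SAGAW field layout cited above; illustrative):
 * BIT(1) is 3-level (39-bit AGAW), BIT(2) is 4-level (48-bit) and BIT(3)
 * is 5-level (57-bit). A scalable-mode IOMMU with first-level 5-level
 * paging support and cap_sagaw() == BIT(2) would therefore return
 * (BIT(2) | BIT(3)) & BIT(2) == BIT(2), i.e. only 4-level is usable by
 * both translation types.
 */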
316
317 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
318 {
319 unsigned long sagaw;
320 int agaw;
321
322 sagaw = __iommu_calculate_sagaw(iommu);
323 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
324 if (test_bit(agaw, &sagaw))
325 break;
326 }
327
328 return agaw;
329 }
330
331 /*
332 * Calculate max SAGAW for each iommu.
333 */
334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338
339 /*
340 * Calculate the agaw for each iommu.
341 * "SAGAW" may differ across iommus; use a default agaw, and fall back
342 * to a smaller supported agaw for iommus that don't support the default agaw.
343 */
344 int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
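/*
 * A quick sanity check on the width/agaw conversion used above (a sketch,
 * assuming the usual agaw_to_width(agaw) == 30 + 9 * agaw relationship):
 * agaw 2 corresponds to a 48-bit, 4-level table and agaw 3 to a 57-bit,
 * 5-level table, so with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57 the loop
 * starts at agaw 3 and walks down until it finds a bit set in SAGAW.
 */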
348
349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351 return sm_supported(iommu) ?
352 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354
355 static void domain_update_iommu_coherency(struct dmar_domain *domain)
356 {
357 struct iommu_domain_info *info;
358 struct dmar_drhd_unit *drhd;
359 struct intel_iommu *iommu;
360 bool found = false;
361 unsigned long i;
362
363 domain->iommu_coherency = true;
364 xa_for_each(&domain->iommu_array, i, info) {
365 found = true;
366 if (!iommu_paging_structure_coherency(info->iommu)) {
367 domain->iommu_coherency = false;
368 break;
369 }
370 }
371 if (found)
372 return;
373
374 /* No hardware attached; use lowest common denominator */
375 rcu_read_lock();
376 for_each_active_iommu(iommu, drhd) {
377 if (!iommu_paging_structure_coherency(iommu)) {
378 domain->iommu_coherency = false;
379 break;
380 }
381 }
382 rcu_read_unlock();
383 }
384
385 static int domain_update_iommu_superpage(struct dmar_domain *domain,
386 struct intel_iommu *skip)
387 {
388 struct dmar_drhd_unit *drhd;
389 struct intel_iommu *iommu;
390 int mask = 0x3;
391
392 if (!intel_iommu_superpage)
393 return 0;
394
395 /* set iommu_superpage to the smallest common denominator */
396 rcu_read_lock();
397 for_each_active_iommu(iommu, drhd) {
398 if (iommu != skip) {
399 if (domain && domain->use_first_level) {
400 if (!cap_fl1gp_support(iommu->cap))
401 mask = 0x1;
402 } else {
403 mask &= cap_super_page_val(iommu->cap);
404 }
405
406 if (!mask)
407 break;
408 }
409 }
410 rcu_read_unlock();
411
412 return fls(mask);
413 }
414
415 static int domain_update_device_node(struct dmar_domain *domain)
416 {
417 struct device_domain_info *info;
418 int nid = NUMA_NO_NODE;
419 unsigned long flags;
420
421 spin_lock_irqsave(&domain->lock, flags);
422 list_for_each_entry(info, &domain->devices, link) {
423 /*
424 * There could be multiple device NUMA nodes, as devices within
425 * the same domain may sit behind different IOMMUs. There is no
426 * perfect answer in such a situation, so we use a first-come,
427 * first-served policy.
428 */
429 nid = dev_to_node(info->dev);
430 if (nid != NUMA_NO_NODE)
431 break;
432 }
433 spin_unlock_irqrestore(&domain->lock, flags);
434
435 return nid;
436 }
437
438 /* Return the super pagesize bitmap if supported. */
439 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
440 {
441 unsigned long bitmap = 0;
442
443 /*
444 * 1-level super page supports page size of 2MiB, 2-level super page
445 * supports page size of both 2MiB and 1GiB.
446 */
447 if (domain->iommu_superpage == 1)
448 bitmap |= SZ_2M;
449 else if (domain->iommu_superpage == 2)
450 bitmap |= SZ_2M | SZ_1G;
451
452 return bitmap;
453 }
454
455 /* Some capabilities may be different across iommus */
456 void domain_update_iommu_cap(struct dmar_domain *domain)
457 {
458 domain_update_iommu_coherency(domain);
459 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
460
461 /*
462 * If RHSA is missing, we should default to the device numa domain
463 * as fall back.
464 */
465 if (domain->nid == NUMA_NO_NODE)
466 domain->nid = domain_update_device_node(domain);
467
468 /*
469 * First-level translation restricts the input-address to a
470 * canonical address (i.e., address bits 63:N have the same
471 * value as address bit [N-1], where N is 48-bits with 4-level
472 * paging and 57-bits with 5-level paging). Hence, skip bit
473 * [N-1].
474 */
475 if (domain->use_first_level)
476 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
477 else
478 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
479
480 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
481 }
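/*
 * Example of the aperture adjustment above (illustrative): for a
 * first-level domain with gaw == 48, aperture_end becomes
 * __DOMAIN_MAX_ADDR(47) == (1ULL << 47) - 1, i.e. only the lower,
 * canonical half of the 48-bit input-address space is exposed, while a
 * second-level domain keeps the full (1ULL << 48) - 1.
 */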
482
483 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
484 u8 devfn, int alloc)
485 {
486 struct root_entry *root = &iommu->root_entry[bus];
487 struct context_entry *context;
488 u64 *entry;
489
490 /*
491 * Unless the caller requested allocation of a new entry,
492 * returning a copied context entry makes no sense.
493 */
494 if (!alloc && context_copied(iommu, bus, devfn))
495 return NULL;
496
497 entry = &root->lo;
498 if (sm_supported(iommu)) {
499 if (devfn >= 0x80) {
500 devfn -= 0x80;
501 entry = &root->hi;
502 }
503 devfn *= 2;
504 }
505 if (*entry & 1)
506 context = phys_to_virt(*entry & VTD_PAGE_MASK);
507 else {
508 unsigned long phy_addr;
509 if (!alloc)
510 return NULL;
511
512 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
513 if (!context)
514 return NULL;
515
516 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
517 phy_addr = virt_to_phys((void *)context);
518 *entry = phy_addr | 1;
519 __iommu_flush_cache(iommu, entry, sizeof(*entry));
520 }
521 return &context[devfn];
522 }
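/*
 * Layout sketch for the scalable-mode path above: each root entry covers
 * one bus and is split in half, root->lo for devfn 0x00-0x7f and
 * root->hi for devfn 0x80-0xff, and each scalable-mode context entry
 * occupies two legacy-sized slots (hence "devfn *= 2"). For example,
 * devfn 0x82 on a scalable-mode IOMMU is looked up via root->hi at
 * context index 0x04.
 */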
523
524 /**
525 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
526 * sub-hierarchy of a candidate PCI-PCI bridge
527 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
528 * @bridge: the candidate PCI-PCI bridge
529 *
530 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
531 */
532 static bool
533 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
534 {
535 struct pci_dev *pdev, *pbridge;
536
537 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
538 return false;
539
540 pdev = to_pci_dev(dev);
541 pbridge = to_pci_dev(bridge);
542
543 if (pbridge->subordinate &&
544 pbridge->subordinate->number <= pdev->bus->number &&
545 pbridge->subordinate->busn_res.end >= pdev->bus->number)
546 return true;
547
548 return false;
549 }
550
551 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
552 {
553 struct dmar_drhd_unit *drhd;
554 u32 vtbar;
555 int rc;
556
557 /* We know that this device on this chipset has its own IOMMU.
558 * If we find it under a different IOMMU, then the BIOS is lying
559 * to us. Hope that the IOMMU for this device is actually
560 * disabled, and it needs no translation...
561 */
562 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
563 if (rc) {
564 /* "can't" happen */
565 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
566 return false;
567 }
568 vtbar &= 0xffff0000;
569
570 /* we know that this iommu should be at offset 0xa000 from vtbar */
571 drhd = dmar_find_matched_drhd_unit(pdev);
572 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
573 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
574 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
575 return true;
576 }
577
578 return false;
579 }
580
581 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
582 {
583 if (!iommu || iommu->drhd->ignored)
584 return true;
585
586 if (dev_is_pci(dev)) {
587 struct pci_dev *pdev = to_pci_dev(dev);
588
589 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
590 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
591 quirk_ioat_snb_local_iommu(pdev))
592 return true;
593 }
594
595 return false;
596 }
597
598 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
599 {
600 struct dmar_drhd_unit *drhd = NULL;
601 struct pci_dev *pdev = NULL;
602 struct intel_iommu *iommu;
603 struct device *tmp;
604 u16 segment = 0;
605 int i;
606
607 if (!dev)
608 return NULL;
609
610 if (dev_is_pci(dev)) {
611 struct pci_dev *pf_pdev;
612
613 pdev = pci_real_dma_dev(to_pci_dev(dev));
614
615 /* VFs aren't listed in scope tables; we need to look up
616 * the PF instead to find the IOMMU. */
617 pf_pdev = pci_physfn(pdev);
618 dev = &pf_pdev->dev;
619 segment = pci_domain_nr(pdev->bus);
620 } else if (has_acpi_companion(dev))
621 dev = &ACPI_COMPANION(dev)->dev;
622
623 rcu_read_lock();
624 for_each_iommu(iommu, drhd) {
625 if (pdev && segment != drhd->segment)
626 continue;
627
628 for_each_active_dev_scope(drhd->devices,
629 drhd->devices_cnt, i, tmp) {
630 if (tmp == dev) {
631 /* For a VF use its original BDF# not that of the PF
632 * which we used for the IOMMU lookup. Strictly speaking
633 * we could do this for all PCI devices; we only need to
634 * get the BDF# from the scope table for ACPI matches. */
635 if (pdev && pdev->is_virtfn)
636 goto got_pdev;
637
638 if (bus && devfn) {
639 *bus = drhd->devices[i].bus;
640 *devfn = drhd->devices[i].devfn;
641 }
642 goto out;
643 }
644
645 if (is_downstream_to_pci_bridge(dev, tmp))
646 goto got_pdev;
647 }
648
649 if (pdev && drhd->include_all) {
650 got_pdev:
651 if (bus && devfn) {
652 *bus = pdev->bus->number;
653 *devfn = pdev->devfn;
654 }
655 goto out;
656 }
657 }
658 iommu = NULL;
659 out:
660 if (iommu_is_dummy(iommu, dev))
661 iommu = NULL;
662
663 rcu_read_unlock();
664
665 return iommu;
666 }
667
668 static void domain_flush_cache(struct dmar_domain *domain,
669 void *addr, int size)
670 {
671 if (!domain->iommu_coherency)
672 clflush_cache_range(addr, size);
673 }
674
675 static void free_context_table(struct intel_iommu *iommu)
676 {
677 struct context_entry *context;
678 int i;
679
680 if (!iommu->root_entry)
681 return;
682
683 for (i = 0; i < ROOT_ENTRY_NR; i++) {
684 context = iommu_context_addr(iommu, i, 0, 0);
685 if (context)
686 iommu_free_page(context);
687
688 if (!sm_supported(iommu))
689 continue;
690
691 context = iommu_context_addr(iommu, i, 0x80, 0);
692 if (context)
693 iommu_free_page(context);
694 }
695
696 iommu_free_page(iommu->root_entry);
697 iommu->root_entry = NULL;
698 }
699
700 #ifdef CONFIG_DMAR_DEBUG
701 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
702 u8 bus, u8 devfn, struct dma_pte *parent, int level)
703 {
704 struct dma_pte *pte;
705 int offset;
706
707 while (1) {
708 offset = pfn_level_offset(pfn, level);
709 pte = &parent[offset];
710 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
711 pr_info("PTE not present at level %d\n", level);
712 break;
713 }
714
715 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
716
717 if (level == 1)
718 break;
719
720 parent = phys_to_virt(dma_pte_addr(pte));
721 level--;
722 }
723 }
724
725 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
726 unsigned long long addr, u32 pasid)
727 {
728 struct pasid_dir_entry *dir, *pde;
729 struct pasid_entry *entries, *pte;
730 struct context_entry *ctx_entry;
731 struct root_entry *rt_entry;
732 int i, dir_index, index, level;
733 u8 devfn = source_id & 0xff;
734 u8 bus = source_id >> 8;
735 struct dma_pte *pgtable;
736
737 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
738
739 /* root entry dump */
740 rt_entry = &iommu->root_entry[bus];
741 if (!rt_entry) {
742 pr_info("root table entry is not present\n");
743 return;
744 }
745
746 if (sm_supported(iommu))
747 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
748 rt_entry->hi, rt_entry->lo);
749 else
750 pr_info("root entry: 0x%016llx", rt_entry->lo);
751
752 /* context entry dump */
753 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
754 if (!ctx_entry) {
755 pr_info("context table entry is not present\n");
756 return;
757 }
758
759 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
760 ctx_entry->hi, ctx_entry->lo);
761
762 /* legacy mode does not require PASID entries */
763 if (!sm_supported(iommu)) {
764 level = agaw_to_level(ctx_entry->hi & 7);
765 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
766 goto pgtable_walk;
767 }
768
769 /* get the pointer to pasid directory entry */
770 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
771 if (!dir) {
772 pr_info("pasid directory entry is not present\n");
773 return;
774 }
775 /* For request-without-pasid, get the pasid from context entry */
776 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
777 pasid = IOMMU_NO_PASID;
778
779 dir_index = pasid >> PASID_PDE_SHIFT;
780 pde = &dir[dir_index];
781 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
782
783 /* get the pointer to the pasid table entry */
784 entries = get_pasid_table_from_pde(pde);
785 if (!entries) {
786 pr_info("pasid table entry is not present\n");
787 return;
788 }
789 index = pasid & PASID_PTE_MASK;
790 pte = &entries[index];
791 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
792 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
793
794 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
795 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
796 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
797 } else {
798 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
799 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
800 }
801
802 pgtable_walk:
803 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
804 }
805 #endif
806
807 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
808 unsigned long pfn, int *target_level,
809 gfp_t gfp)
810 {
811 struct dma_pte *parent, *pte;
812 int level = agaw_to_level(domain->agaw);
813 int offset;
814
815 if (!domain_pfn_supported(domain, pfn))
816 /* Address beyond IOMMU's addressing capabilities. */
817 return NULL;
818
819 parent = domain->pgd;
820
821 while (1) {
822 void *tmp_page;
823
824 offset = pfn_level_offset(pfn, level);
825 pte = &parent[offset];
826 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
827 break;
828 if (level == *target_level)
829 break;
830
831 if (!dma_pte_present(pte)) {
832 uint64_t pteval, tmp;
833
834 tmp_page = iommu_alloc_page_node(domain->nid, gfp);
835
836 if (!tmp_page)
837 return NULL;
838
839 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
840 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
841 if (domain->use_first_level)
842 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
843
844 tmp = 0ULL;
845 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
846 /* Someone else set it while we were thinking; use theirs. */
847 iommu_free_page(tmp_page);
848 else
849 domain_flush_cache(domain, pte, sizeof(*pte));
850 }
851 if (level == 1)
852 break;
853
854 parent = phys_to_virt(dma_pte_addr(pte));
855 level--;
856 }
857
858 if (!*target_level)
859 *target_level = level;
860
861 return pte;
862 }
863
864 /* return address's pte at specific level */
865 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
866 unsigned long pfn,
867 int level, int *large_page)
868 {
869 struct dma_pte *parent, *pte;
870 int total = agaw_to_level(domain->agaw);
871 int offset;
872
873 parent = domain->pgd;
874 while (level <= total) {
875 offset = pfn_level_offset(pfn, total);
876 pte = &parent[offset];
877 if (level == total)
878 return pte;
879
880 if (!dma_pte_present(pte)) {
881 *large_page = total;
882 break;
883 }
884
885 if (dma_pte_superpage(pte)) {
886 *large_page = total;
887 return pte;
888 }
889
890 parent = phys_to_virt(dma_pte_addr(pte));
891 total--;
892 }
893 return NULL;
894 }
895
896 /* clear last level pte; a tlb flush should follow */
897 static void dma_pte_clear_range(struct dmar_domain *domain,
898 unsigned long start_pfn,
899 unsigned long last_pfn)
900 {
901 unsigned int large_page;
902 struct dma_pte *first_pte, *pte;
903
904 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
905 WARN_ON(start_pfn > last_pfn))
906 return;
907
908 /* we don't need lock here; nobody else touches the iova range */
909 do {
910 large_page = 1;
911 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
912 if (!pte) {
913 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
914 continue;
915 }
916 do {
917 dma_clear_pte(pte);
918 start_pfn += lvl_to_nr_pages(large_page);
919 pte++;
920 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
921
922 domain_flush_cache(domain, first_pte,
923 (void *)pte - (void *)first_pte);
924
925 } while (start_pfn && start_pfn <= last_pfn);
926 }
927
928 static void dma_pte_free_level(struct dmar_domain *domain, int level,
929 int retain_level, struct dma_pte *pte,
930 unsigned long pfn, unsigned long start_pfn,
931 unsigned long last_pfn)
932 {
933 pfn = max(start_pfn, pfn);
934 pte = &pte[pfn_level_offset(pfn, level)];
935
936 do {
937 unsigned long level_pfn;
938 struct dma_pte *level_pte;
939
940 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
941 goto next;
942
943 level_pfn = pfn & level_mask(level);
944 level_pte = phys_to_virt(dma_pte_addr(pte));
945
946 if (level > 2) {
947 dma_pte_free_level(domain, level - 1, retain_level,
948 level_pte, level_pfn, start_pfn,
949 last_pfn);
950 }
951
952 /*
953 * Free the page table if we're below the level we want to
954 * retain and the range covers the entire table.
955 */
956 if (level < retain_level && !(start_pfn > level_pfn ||
957 last_pfn < level_pfn + level_size(level) - 1)) {
958 dma_clear_pte(pte);
959 domain_flush_cache(domain, pte, sizeof(*pte));
960 iommu_free_page(level_pte);
961 }
962 next:
963 pfn += level_size(level);
964 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
965 }
966
967 /*
968 * clear last level (leaf) ptes and free page table pages below the
969 * level we wish to keep intact.
970 */
971 static void dma_pte_free_pagetable(struct dmar_domain *domain,
972 unsigned long start_pfn,
973 unsigned long last_pfn,
974 int retain_level)
975 {
976 dma_pte_clear_range(domain, start_pfn, last_pfn);
977
978 /* We don't need lock here; nobody else touches the iova range */
979 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
980 domain->pgd, 0, start_pfn, last_pfn);
981
982 /* free pgd */
983 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
984 iommu_free_page(domain->pgd);
985 domain->pgd = NULL;
986 }
987 }
988
989 /* When a page at a given level is being unlinked from its parent, we don't
990 need to *modify* it at all. All we need to do is make a list of all the
991 pages which can be freed just as soon as we've flushed the IOTLB and we
992 know the hardware page-walk will no longer touch them.
993 The 'pte' argument is the *parent* PTE, pointing to the page that is to
994 be freed. */
995 static void dma_pte_list_pagetables(struct dmar_domain *domain,
996 int level, struct dma_pte *pte,
997 struct list_head *freelist)
998 {
999 struct page *pg;
1000
1001 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1002 list_add_tail(&pg->lru, freelist);
1003
1004 if (level == 1)
1005 return;
1006
1007 pte = page_address(pg);
1008 do {
1009 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1010 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1011 pte++;
1012 } while (!first_pte_in_page(pte));
1013 }
1014
1015 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1016 struct dma_pte *pte, unsigned long pfn,
1017 unsigned long start_pfn, unsigned long last_pfn,
1018 struct list_head *freelist)
1019 {
1020 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1021
1022 pfn = max(start_pfn, pfn);
1023 pte = &pte[pfn_level_offset(pfn, level)];
1024
1025 do {
1026 unsigned long level_pfn = pfn & level_mask(level);
1027
1028 if (!dma_pte_present(pte))
1029 goto next;
1030
1031 /* If range covers entire pagetable, free it */
1032 if (start_pfn <= level_pfn &&
1033 last_pfn >= level_pfn + level_size(level) - 1) {
1034 /* These subordinate page tables are going away entirely. Don't
1035 bother to clear them; we're just going to *free* them. */
1036 if (level > 1 && !dma_pte_superpage(pte))
1037 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1038
1039 dma_clear_pte(pte);
1040 if (!first_pte)
1041 first_pte = pte;
1042 last_pte = pte;
1043 } else if (level > 1) {
1044 /* Recurse down into a level that isn't *entirely* obsolete */
1045 dma_pte_clear_level(domain, level - 1,
1046 phys_to_virt(dma_pte_addr(pte)),
1047 level_pfn, start_pfn, last_pfn,
1048 freelist);
1049 }
1050 next:
1051 pfn = level_pfn + level_size(level);
1052 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1053
1054 if (first_pte)
1055 domain_flush_cache(domain, first_pte,
1056 (void *)++last_pte - (void *)first_pte);
1057 }
1058
1059 /* We can't just free the pages because the IOMMU may still be walking
1060 the page tables, and may have cached the intermediate levels. The
1061 pages can only be freed after the IOTLB flush has been done. */
1062 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1063 unsigned long last_pfn, struct list_head *freelist)
1064 {
1065 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1066 WARN_ON(start_pfn > last_pfn))
1067 return;
1068
1069 /* we don't need lock here; nobody else touches the iova range */
1070 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071 domain->pgd, 0, start_pfn, last_pfn, freelist);
1072
1073 /* free pgd */
1074 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075 struct page *pgd_page = virt_to_page(domain->pgd);
1076 list_add_tail(&pgd_page->lru, freelist);
1077 domain->pgd = NULL;
1078 }
1079 }
1080
1081 /* iommu handling */
1082 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1083 {
1084 struct root_entry *root;
1085
1086 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1087 if (!root) {
1088 pr_err("Allocating root entry for %s failed\n",
1089 iommu->name);
1090 return -ENOMEM;
1091 }
1092
1093 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1094 iommu->root_entry = root;
1095
1096 return 0;
1097 }
1098
1099 static void iommu_set_root_entry(struct intel_iommu *iommu)
1100 {
1101 u64 addr;
1102 u32 sts;
1103 unsigned long flag;
1104
1105 addr = virt_to_phys(iommu->root_entry);
1106 if (sm_supported(iommu))
1107 addr |= DMA_RTADDR_SMT;
1108
1109 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1110 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1111
1112 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1113
1114 /* Make sure hardware complete it */
1115 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1116 readl, (sts & DMA_GSTS_RTPS), sts);
1117
1118 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1119
1120 /*
1121 * Hardware invalidates all DMA remapping hardware translation
1122 * caches as part of SRTP flow.
1123 */
1124 if (cap_esrtps(iommu->cap))
1125 return;
1126
1127 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1128 if (sm_supported(iommu))
1129 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1130 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1131 }
1132
1133 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1134 {
1135 u32 val;
1136 unsigned long flag;
1137
1138 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1139 return;
1140
1141 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1142 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1143
1144 /* Make sure hardware complete it */
1145 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146 readl, (!(val & DMA_GSTS_WBFS)), val);
1147
1148 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1149 }
1150
1151 /* return value determines if we need a write buffer flush */
1152 static void __iommu_flush_context(struct intel_iommu *iommu,
1153 u16 did, u16 source_id, u8 function_mask,
1154 u64 type)
1155 {
1156 u64 val = 0;
1157 unsigned long flag;
1158
1159 switch (type) {
1160 case DMA_CCMD_GLOBAL_INVL:
1161 val = DMA_CCMD_GLOBAL_INVL;
1162 break;
1163 case DMA_CCMD_DOMAIN_INVL:
1164 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1165 break;
1166 case DMA_CCMD_DEVICE_INVL:
1167 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1168 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1169 break;
1170 default:
1171 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1172 iommu->name, type);
1173 return;
1174 }
1175 val |= DMA_CCMD_ICC;
1176
1177 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1178 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1179
1180 /* Make sure hardware complete it */
1181 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1182 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1183
1184 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1185 }
1186
1187 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1188 unsigned int size_order, u64 type)
1189 {
1190 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191 u64 val = 0, val_iva = 0;
1192 unsigned long flag;
1193
1194 switch (type) {
1195 case DMA_TLB_GLOBAL_FLUSH:
1196 /* global flush doesn't need to set IVA_REG */
1197 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1198 break;
1199 case DMA_TLB_DSI_FLUSH:
1200 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1201 break;
1202 case DMA_TLB_PSI_FLUSH:
1203 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204 /* IH bit is passed in as part of address */
1205 val_iva = size_order | addr;
1206 break;
1207 default:
1208 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1209 iommu->name, type);
1210 return;
1211 }
1212
1213 if (cap_write_drain(iommu->cap))
1214 val |= DMA_TLB_WRITE_DRAIN;
1215
1216 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 /* Note: Only uses first TLB reg currently */
1218 if (val_iva)
1219 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1220 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1221
1222 /* Make sure hardware complete it */
1223 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1224 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1225
1226 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227
1228 /* check IOTLB invalidation granularity */
1229 if (DMA_TLB_IAIG(val) == 0)
1230 pr_err("Flush IOTLB failed\n");
1231 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1232 pr_debug("TLB flush request %Lx, actual %Lx\n",
1233 (unsigned long long)DMA_TLB_IIRG(type),
1234 (unsigned long long)DMA_TLB_IAIG(val));
1235 }
1236
1237 static struct device_domain_info *
1238 domain_lookup_dev_info(struct dmar_domain *domain,
1239 struct intel_iommu *iommu, u8 bus, u8 devfn)
1240 {
1241 struct device_domain_info *info;
1242 unsigned long flags;
1243
1244 spin_lock_irqsave(&domain->lock, flags);
1245 list_for_each_entry(info, &domain->devices, link) {
1246 if (info->iommu == iommu && info->bus == bus &&
1247 info->devfn == devfn) {
1248 spin_unlock_irqrestore(&domain->lock, flags);
1249 return info;
1250 }
1251 }
1252 spin_unlock_irqrestore(&domain->lock, flags);
1253
1254 return NULL;
1255 }
1256
1257 /*
1258 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1259 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1260 * check because it applies only to the built-in QAT devices and it doesn't
1261 * grant additional privileges.
1262 */
1263 #define BUGGY_QAT_DEVID_MASK 0x4940
1264 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1265 {
1266 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1267 return false;
1268
1269 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1270 return false;
1271
1272 return true;
1273 }
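/*
 * Example of the mask check above (illustrative): device IDs 0x4940,
 * 0x4941, 0x4942 and 0x4943 all satisfy (id & 0xfffc) == 0x4940, while
 * 0x4944 does not, matching the ID range called out in the comment.
 */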
1274
1275 static void iommu_enable_pci_caps(struct device_domain_info *info)
1276 {
1277 struct pci_dev *pdev;
1278
1279 if (!dev_is_pci(info->dev))
1280 return;
1281
1282 pdev = to_pci_dev(info->dev);
1283 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1284 !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1285 info->ats_enabled = 1;
1286 }
1287
1288 static void iommu_disable_pci_caps(struct device_domain_info *info)
1289 {
1290 struct pci_dev *pdev;
1291
1292 if (!dev_is_pci(info->dev))
1293 return;
1294
1295 pdev = to_pci_dev(info->dev);
1296
1297 if (info->ats_enabled) {
1298 pci_disable_ats(pdev);
1299 info->ats_enabled = 0;
1300 }
1301 }
1302
1303 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1304 {
1305 cache_tag_flush_all(to_dmar_domain(domain));
1306 }
1307
1308 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1309 {
1310 u32 pmen;
1311 unsigned long flags;
1312
1313 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1314 return;
1315
1316 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1317 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1318 pmen &= ~DMA_PMEN_EPM;
1319 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1320
1321 /* wait for the protected region status bit to clear */
1322 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1323 readl, !(pmen & DMA_PMEN_PRS), pmen);
1324
1325 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1326 }
1327
1328 static void iommu_enable_translation(struct intel_iommu *iommu)
1329 {
1330 u32 sts;
1331 unsigned long flags;
1332
1333 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1334 iommu->gcmd |= DMA_GCMD_TE;
1335 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1336
1337 /* Make sure hardware complete it */
1338 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1339 readl, (sts & DMA_GSTS_TES), sts);
1340
1341 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1342 }
1343
1344 static void iommu_disable_translation(struct intel_iommu *iommu)
1345 {
1346 u32 sts;
1347 unsigned long flag;
1348
1349 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1350 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1351 return;
1352
1353 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354 iommu->gcmd &= ~DMA_GCMD_TE;
1355 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1356
1357 /* Make sure hardware complete it */
1358 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1359 readl, (!(sts & DMA_GSTS_TES)), sts);
1360
1361 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1362 }
1363
1364 static int iommu_init_domains(struct intel_iommu *iommu)
1365 {
1366 u32 ndomains;
1367
1368 ndomains = cap_ndoms(iommu->cap);
1369 pr_debug("%s: Number of Domains supported <%d>\n",
1370 iommu->name, ndomains);
1371
1372 spin_lock_init(&iommu->lock);
1373
1374 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1375 if (!iommu->domain_ids)
1376 return -ENOMEM;
1377
1378 /*
1379 * If Caching mode is set, then invalid translations are tagged
1380 * with domain-id 0, hence we need to pre-allocate it. We also
1381 * use domain-id 0 as a marker for non-allocated domain-id, so
1382 * make sure it is not used for a real domain.
1383 */
1384 set_bit(0, iommu->domain_ids);
1385
1386 /*
1387 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1388 * entry for first-level or pass-through translation modes should
1389 * be programmed with a domain id different from those used for
1390 * second-level or nested translation. We reserve a domain id for
1391 * this purpose. This domain id is also used for identity domain
1392 * in legacy mode.
1393 */
1394 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1395
1396 return 0;
1397 }
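/*
 * A small sketch of the resulting bitmap (assuming FLPT_DEFAULT_DID is a
 * low, fixed reserved ID as described above): domain-id 0 and
 * FLPT_DEFAULT_DID are pre-set here, so the find_first_zero_bit() in
 * domain_attach_iommu() hands out the lowest remaining ID rather than
 * one of the reserved ones.
 */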
1398
1399 static void disable_dmar_iommu(struct intel_iommu *iommu)
1400 {
1401 if (!iommu->domain_ids)
1402 return;
1403
1404 /*
1405 * All iommu domains must have been detached from the devices,
1406 * hence there should be no domain IDs in use.
1407 */
1408 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1409 > NUM_RESERVED_DID))
1410 return;
1411
1412 if (iommu->gcmd & DMA_GCMD_TE)
1413 iommu_disable_translation(iommu);
1414 }
1415
1416 static void free_dmar_iommu(struct intel_iommu *iommu)
1417 {
1418 if (iommu->domain_ids) {
1419 bitmap_free(iommu->domain_ids);
1420 iommu->domain_ids = NULL;
1421 }
1422
1423 if (iommu->copied_tables) {
1424 bitmap_free(iommu->copied_tables);
1425 iommu->copied_tables = NULL;
1426 }
1427
1428 /* free context mapping */
1429 free_context_table(iommu);
1430
1431 #ifdef CONFIG_INTEL_IOMMU_SVM
1432 if (pasid_supported(iommu)) {
1433 if (ecap_prs(iommu->ecap))
1434 intel_svm_finish_prq(iommu);
1435 }
1436 #endif
1437 }
1438
1439 /*
1440 * Check and return whether first level is used by default for
1441 * DMA translation.
1442 */
1443 static bool first_level_by_default(unsigned int type)
1444 {
1445 /* Only SL is available in legacy mode */
1446 if (!scalable_mode_support())
1447 return false;
1448
1449 /* Only one level (either FL or SL) is available, just use it */
1450 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1451 return intel_cap_flts_sanity();
1452
1453 /* Both levels are available, decide it based on domain type */
1454 return type != IOMMU_DOMAIN_UNMANAGED;
1455 }
1456
1457 static struct dmar_domain *alloc_domain(unsigned int type)
1458 {
1459 struct dmar_domain *domain;
1460
1461 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1462 if (!domain)
1463 return NULL;
1464
1465 domain->nid = NUMA_NO_NODE;
1466 if (first_level_by_default(type))
1467 domain->use_first_level = true;
1468 INIT_LIST_HEAD(&domain->devices);
1469 INIT_LIST_HEAD(&domain->dev_pasids);
1470 INIT_LIST_HEAD(&domain->cache_tags);
1471 spin_lock_init(&domain->lock);
1472 spin_lock_init(&domain->cache_lock);
1473 xa_init(&domain->iommu_array);
1474
1475 return domain;
1476 }
1477
1478 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1479 {
1480 struct iommu_domain_info *info, *curr;
1481 unsigned long ndomains;
1482 int num, ret = -ENOSPC;
1483
1484 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1485 return 0;
1486
1487 info = kzalloc(sizeof(*info), GFP_KERNEL);
1488 if (!info)
1489 return -ENOMEM;
1490
1491 spin_lock(&iommu->lock);
1492 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1493 if (curr) {
1494 curr->refcnt++;
1495 spin_unlock(&iommu->lock);
1496 kfree(info);
1497 return 0;
1498 }
1499
1500 ndomains = cap_ndoms(iommu->cap);
1501 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1502 if (num >= ndomains) {
1503 pr_err("%s: No free domain ids\n", iommu->name);
1504 goto err_unlock;
1505 }
1506
1507 set_bit(num, iommu->domain_ids);
1508 info->refcnt = 1;
1509 info->did = num;
1510 info->iommu = iommu;
1511 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1512 NULL, info, GFP_ATOMIC);
1513 if (curr) {
1514 ret = xa_err(curr) ? : -EBUSY;
1515 goto err_clear;
1516 }
1517 domain_update_iommu_cap(domain);
1518
1519 spin_unlock(&iommu->lock);
1520 return 0;
1521
1522 err_clear:
1523 clear_bit(info->did, iommu->domain_ids);
1524 err_unlock:
1525 spin_unlock(&iommu->lock);
1526 kfree(info);
1527 return ret;
1528 }
1529
1530 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1531 {
1532 struct iommu_domain_info *info;
1533
1534 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1535 return;
1536
1537 spin_lock(&iommu->lock);
1538 info = xa_load(&domain->iommu_array, iommu->seq_id);
1539 if (--info->refcnt == 0) {
1540 clear_bit(info->did, iommu->domain_ids);
1541 xa_erase(&domain->iommu_array, iommu->seq_id);
1542 domain->nid = NUMA_NO_NODE;
1543 domain_update_iommu_cap(domain);
1544 kfree(info);
1545 }
1546 spin_unlock(&iommu->lock);
1547 }
1548
1549 static int guestwidth_to_adjustwidth(int gaw)
1550 {
1551 int agaw;
1552 int r = (gaw - 12) % 9;
1553
1554 if (r == 0)
1555 agaw = gaw;
1556 else
1557 agaw = gaw + 9 - r;
1558 if (agaw > 64)
1559 agaw = 64;
1560 return agaw;
1561 }
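/*
 * Worked examples for the rounding above (illustrative): gaw 48 gives
 * r == 0 and agaw 48; gaw 50 gives r == 2 and agaw 50 + 9 - 2 == 57;
 * anything that would round past 64 is clamped to 64.
 */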
1562
1563 static void domain_exit(struct dmar_domain *domain)
1564 {
1565 if (domain->pgd) {
1566 LIST_HEAD(freelist);
1567
1568 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1569 iommu_put_pages_list(&freelist);
1570 }
1571
1572 if (WARN_ON(!list_empty(&domain->devices)))
1573 return;
1574
1575 kfree(domain->qi_batch);
1576 kfree(domain);
1577 }
1578
1579 /*
1580 * For kdump cases, old valid entries may be cached due to the
1581 * in-flight DMA and copied pgtable, but there is no unmapping
1582 * behaviour for them, thus we need an explicit cache flush for
1583 * the newly-mapped device. For kdump, at this point, the device
1584 * is supposed to finish reset at its driver probe stage, so no
1585 * in-flight DMA will exist, and we don't need to worry anymore
1586 * hereafter.
1587 */
1588 static void copied_context_tear_down(struct intel_iommu *iommu,
1589 struct context_entry *context,
1590 u8 bus, u8 devfn)
1591 {
1592 u16 did_old;
1593
1594 if (!context_copied(iommu, bus, devfn))
1595 return;
1596
1597 assert_spin_locked(&iommu->lock);
1598
1599 did_old = context_domain_id(context);
1600 context_clear_entry(context);
1601
1602 if (did_old < cap_ndoms(iommu->cap)) {
1603 iommu->flush.flush_context(iommu, did_old,
1604 (((u16)bus) << 8) | devfn,
1605 DMA_CCMD_MASK_NOBIT,
1606 DMA_CCMD_DEVICE_INVL);
1607 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1608 DMA_TLB_DSI_FLUSH);
1609 }
1610
1611 clear_context_copied(iommu, bus, devfn);
1612 }
1613
1614 /*
1615 * It's a non-present to present mapping. If hardware doesn't cache
1616 * non-present entries we only need to flush the write-buffer. If it
1617 * _does_ cache non-present entries, then it does so in the special
1618 * domain #0, which we have to flush:
1619 */
1620 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1621 u8 bus, u8 devfn)
1622 {
1623 if (cap_caching_mode(iommu->cap)) {
1624 iommu->flush.flush_context(iommu, 0,
1625 (((u16)bus) << 8) | devfn,
1626 DMA_CCMD_MASK_NOBIT,
1627 DMA_CCMD_DEVICE_INVL);
1628 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1629 } else {
1630 iommu_flush_write_buffer(iommu);
1631 }
1632 }
1633
1634 static int domain_context_mapping_one(struct dmar_domain *domain,
1635 struct intel_iommu *iommu,
1636 u8 bus, u8 devfn)
1637 {
1638 struct device_domain_info *info =
1639 domain_lookup_dev_info(domain, iommu, bus, devfn);
1640 u16 did = domain_id_iommu(domain, iommu);
1641 int translation = CONTEXT_TT_MULTI_LEVEL;
1642 struct dma_pte *pgd = domain->pgd;
1643 struct context_entry *context;
1644 int agaw, ret;
1645
1646 pr_debug("Set context mapping for %02x:%02x.%d\n",
1647 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1648
1649 spin_lock(&iommu->lock);
1650 ret = -ENOMEM;
1651 context = iommu_context_addr(iommu, bus, devfn, 1);
1652 if (!context)
1653 goto out_unlock;
1654
1655 ret = 0;
1656 if (context_present(context) && !context_copied(iommu, bus, devfn))
1657 goto out_unlock;
1658
1659 copied_context_tear_down(iommu, context, bus, devfn);
1660 context_clear_entry(context);
1661
1662 context_set_domain_id(context, did);
1663
1664 /*
1665 * Skip top levels of page tables for iommu which has
1666 * less agaw than default. Unnecessary for PT mode.
1667 */
1668 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1669 ret = -ENOMEM;
1670 pgd = phys_to_virt(dma_pte_addr(pgd));
1671 if (!dma_pte_present(pgd))
1672 goto out_unlock;
1673 }
1674
1675 if (info && info->ats_supported)
1676 translation = CONTEXT_TT_DEV_IOTLB;
1677 else
1678 translation = CONTEXT_TT_MULTI_LEVEL;
1679
1680 context_set_address_root(context, virt_to_phys(pgd));
1681 context_set_address_width(context, agaw);
1682 context_set_translation_type(context, translation);
1683 context_set_fault_enable(context);
1684 context_set_present(context);
1685 if (!ecap_coherent(iommu->ecap))
1686 clflush_cache_range(context, sizeof(*context));
1687 context_present_cache_flush(iommu, did, bus, devfn);
1688 ret = 0;
1689
1690 out_unlock:
1691 spin_unlock(&iommu->lock);
1692
1693 return ret;
1694 }
1695
1696 static int domain_context_mapping_cb(struct pci_dev *pdev,
1697 u16 alias, void *opaque)
1698 {
1699 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1700 struct intel_iommu *iommu = info->iommu;
1701 struct dmar_domain *domain = opaque;
1702
1703 return domain_context_mapping_one(domain, iommu,
1704 PCI_BUS_NUM(alias), alias & 0xff);
1705 }
1706
1707 static int
1708 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1709 {
1710 struct device_domain_info *info = dev_iommu_priv_get(dev);
1711 struct intel_iommu *iommu = info->iommu;
1712 u8 bus = info->bus, devfn = info->devfn;
1713
1714 if (!dev_is_pci(dev))
1715 return domain_context_mapping_one(domain, iommu, bus, devfn);
1716
1717 return pci_for_each_dma_alias(to_pci_dev(dev),
1718 domain_context_mapping_cb, domain);
1719 }
1720
1721 /* Return largest possible superpage level for a given mapping */
1722 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1723 unsigned long phy_pfn, unsigned long pages)
1724 {
1725 int support, level = 1;
1726 unsigned long pfnmerge;
1727
1728 support = domain->iommu_superpage;
1729
1730 /* To use a large page, the virtual *and* physical addresses
1731 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1732 of them will mean we have to use smaller pages. So just
1733 merge them and check both at once. */
1734 pfnmerge = iov_pfn | phy_pfn;
1735
1736 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1737 pages >>= VTD_STRIDE_SHIFT;
1738 if (!pages)
1739 break;
1740 pfnmerge >>= VTD_STRIDE_SHIFT;
1741 level++;
1742 support--;
1743 }
1744 return level;
1745 }
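/*
 * Example of the merged-alignment check above (a sketch, assuming
 * VTD_STRIDE_SHIFT is 9, i.e. 512 PTEs per level): mapping 0x200 pages
 * (2MiB) with both iov_pfn and phy_pfn multiples of 0x200 yields level 2,
 * so a single 2MiB superpage can be used; if either PFN has any of its
 * low 9 bits set, the loop exits immediately and level 1 (4KiB pages)
 * is returned instead.
 */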
1746
1747 /*
1748 * Ensure that old small page tables are removed to make room for superpage(s).
1749 * We're going to add new large pages, so make sure we don't remove their parent
1750 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1751 */
1752 static void switch_to_super_page(struct dmar_domain *domain,
1753 unsigned long start_pfn,
1754 unsigned long end_pfn, int level)
1755 {
1756 unsigned long lvl_pages = lvl_to_nr_pages(level);
1757 struct dma_pte *pte = NULL;
1758
1759 while (start_pfn <= end_pfn) {
1760 if (!pte)
1761 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1762 GFP_ATOMIC);
1763
1764 if (dma_pte_present(pte)) {
1765 dma_pte_free_pagetable(domain, start_pfn,
1766 start_pfn + lvl_pages - 1,
1767 level + 1);
1768
1769 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1770 end_pfn << VTD_PAGE_SHIFT, 0);
1771 }
1772
1773 pte++;
1774 start_pfn += lvl_pages;
1775 if (first_pte_in_page(pte))
1776 pte = NULL;
1777 }
1778 }
1779
1780 static int
1781 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1782 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1783 gfp_t gfp)
1784 {
1785 struct dma_pte *first_pte = NULL, *pte = NULL;
1786 unsigned int largepage_lvl = 0;
1787 unsigned long lvl_pages = 0;
1788 phys_addr_t pteval;
1789 u64 attr;
1790
1791 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1792 return -EINVAL;
1793
1794 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1795 return -EINVAL;
1796
1797 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1798 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1799 return -EINVAL;
1800 }
1801
1802 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1803 attr |= DMA_FL_PTE_PRESENT;
1804 if (domain->use_first_level) {
1805 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1806 if (prot & DMA_PTE_WRITE)
1807 attr |= DMA_FL_PTE_DIRTY;
1808 }
1809
1810 domain->has_mappings = true;
1811
1812 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1813
1814 while (nr_pages > 0) {
1815 uint64_t tmp;
1816
1817 if (!pte) {
1818 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1819 phys_pfn, nr_pages);
1820
1821 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1822 gfp);
1823 if (!pte)
1824 return -ENOMEM;
1825 first_pte = pte;
1826
1827 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1828
1829 			/* It is a large page */
1830 if (largepage_lvl > 1) {
1831 unsigned long end_pfn;
1832 unsigned long pages_to_remove;
1833
1834 pteval |= DMA_PTE_LARGE_PAGE;
1835 pages_to_remove = min_t(unsigned long, nr_pages,
1836 nr_pte_to_next_page(pte) * lvl_pages);
1837 end_pfn = iov_pfn + pages_to_remove - 1;
1838 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1839 } else {
1840 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1841 }
1842
1843 }
1844 		/* We don't need a lock here; nobody else
1845 		 * touches this IOVA range.
1846 */
1847 tmp = 0ULL;
1848 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1849 static int dumps = 5;
1850 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1851 iov_pfn, tmp, (unsigned long long)pteval);
1852 if (dumps) {
1853 dumps--;
1854 debug_dma_dump_mappings(NULL);
1855 }
1856 WARN_ON(1);
1857 }
1858
1859 nr_pages -= lvl_pages;
1860 iov_pfn += lvl_pages;
1861 phys_pfn += lvl_pages;
1862 pteval += lvl_pages * VTD_PAGE_SIZE;
1863
1864 /* If the next PTE would be the first in a new page, then we
1865 * need to flush the cache on the entries we've just written.
1866 * And then we'll need to recalculate 'pte', so clear it and
1867 * let it get set again in the if (!pte) block above.
1868 *
1869 * If we're done (!nr_pages) we need to flush the cache too.
1870 *
1871 * Also if we've been setting superpages, we may need to
1872 * recalculate 'pte' and switch back to smaller pages for the
1873 * end of the mapping, if the trailing size is not enough to
1874 * use another superpage (i.e. nr_pages < lvl_pages).
1875 */
1876 pte++;
1877 if (!nr_pages || first_pte_in_page(pte) ||
1878 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1879 domain_flush_cache(domain, first_pte,
1880 (void *)pte - (void *)first_pte);
1881 pte = NULL;
1882 }
1883 }
1884
1885 return 0;
1886 }
1887
1888 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1889 {
1890 struct intel_iommu *iommu = info->iommu;
1891 struct context_entry *context;
1892 u16 did;
1893
1894 spin_lock(&iommu->lock);
1895 context = iommu_context_addr(iommu, bus, devfn, 0);
1896 if (!context) {
1897 spin_unlock(&iommu->lock);
1898 return;
1899 }
1900
1901 did = context_domain_id(context);
1902 context_clear_entry(context);
1903 __iommu_flush_cache(iommu, context, sizeof(*context));
1904 spin_unlock(&iommu->lock);
1905 intel_context_flush_present(info, context, did, true);
1906 }
1907
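/*
 * Set up a first-stage (first-level) PASID entry for @dev based on the
 * domain's page table. The loop below skips extra top levels when the
 * domain agaw exceeds what this IOMMU supports; 5-level paging and
 * forced snooping are communicated through the PASID_FLAG_* bits.
 */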
1908 static int domain_setup_first_level(struct intel_iommu *iommu,
1909 struct dmar_domain *domain,
1910 struct device *dev,
1911 u32 pasid)
1912 {
1913 struct dma_pte *pgd = domain->pgd;
1914 int agaw, level;
1915 int flags = 0;
1916
1917 /*
1918 	 * Skip the top levels of the page tables for an IOMMU whose
1919 	 * agaw is smaller than the default. Unnecessary for PT mode.
1920 */
1921 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1922 pgd = phys_to_virt(dma_pte_addr(pgd));
1923 if (!dma_pte_present(pgd))
1924 return -ENOMEM;
1925 }
1926
1927 level = agaw_to_level(agaw);
1928 if (level != 4 && level != 5)
1929 return -EINVAL;
1930
1931 if (level == 5)
1932 flags |= PASID_FLAG_FL5LP;
1933
1934 if (domain->force_snooping)
1935 flags |= PASID_FLAG_PAGE_SNOOP;
1936
1937 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1938 domain_id_iommu(domain, iommu),
1939 flags);
1940 }
1941
1942 static bool dev_is_real_dma_subdevice(struct device *dev)
1943 {
1944 return dev && dev_is_pci(dev) &&
1945 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1946 }
1947
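/*
 * Attach @dev to @domain: associate the domain with the device's IOMMU
 * (domain_attach_iommu()), link the device into the domain's device
 * list, then program either a legacy context entry or a scalable-mode
 * PASID entry for the RID. Any failure falls back to blocking
 * translation for the device.
 */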
1948 static int dmar_domain_attach_device(struct dmar_domain *domain,
1949 struct device *dev)
1950 {
1951 struct device_domain_info *info = dev_iommu_priv_get(dev);
1952 struct intel_iommu *iommu = info->iommu;
1953 unsigned long flags;
1954 int ret;
1955
1956 ret = domain_attach_iommu(domain, iommu);
1957 if (ret)
1958 return ret;
1959
1960 info->domain = domain;
1961 spin_lock_irqsave(&domain->lock, flags);
1962 list_add(&info->link, &domain->devices);
1963 spin_unlock_irqrestore(&domain->lock, flags);
1964
1965 if (dev_is_real_dma_subdevice(dev))
1966 return 0;
1967
1968 if (!sm_supported(iommu))
1969 ret = domain_context_mapping(domain, dev);
1970 else if (domain->use_first_level)
1971 ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
1972 else
1973 ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
1974
1975 if (ret)
1976 goto out_block_translation;
1977
1978 iommu_enable_pci_caps(info);
1979
1980 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1981 if (ret)
1982 goto out_block_translation;
1983
1984 return 0;
1985
1986 out_block_translation:
1987 device_block_translation(dev);
1988 return ret;
1989 }
1990
1991 /**
1992 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1993  * is relaxable (i.e., is allowed to be not enforced under some conditions)
1994 * @dev: device handle
1995 *
1996 * We assume that PCI USB devices with RMRRs have them largely
1997 * for historical reasons and that the RMRR space is not actively used post
1998 * boot. This exclusion may change if vendors begin to abuse it.
1999 *
2000 * The same exception is made for graphics devices, with the requirement that
2001 * any use of the RMRR regions will be torn down before assigning the device
2002 * to a guest.
2003 *
2004 * Return: true if the RMRR is relaxable, false otherwise
2005 */
2006 static bool device_rmrr_is_relaxable(struct device *dev)
2007 {
2008 struct pci_dev *pdev;
2009
2010 if (!dev_is_pci(dev))
2011 return false;
2012
2013 pdev = to_pci_dev(dev);
2014 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2015 return true;
2016 else
2017 return false;
2018 }
2019
2020 static int device_def_domain_type(struct device *dev)
2021 {
2022 struct device_domain_info *info = dev_iommu_priv_get(dev);
2023 struct intel_iommu *iommu = info->iommu;
2024
2025 /*
2026 * Hardware does not support the passthrough translation mode.
2027 	 * Always use a dynamic mapping domain.
2028 */
2029 if (!ecap_pass_through(iommu->ecap))
2030 return IOMMU_DOMAIN_DMA;
2031
2032 if (dev_is_pci(dev)) {
2033 struct pci_dev *pdev = to_pci_dev(dev);
2034
2035 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2036 return IOMMU_DOMAIN_IDENTITY;
2037 }
2038
2039 return 0;
2040 }
2041
2042 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2043 {
2044 /*
2045 	 * Start from a sane IOMMU hardware state.
2046 	 * If queued invalidation was already initialized by us
2047 	 * (for example, while enabling interrupt remapping), then
2048 	 * things are already rolling from a sane state.
2049 */
2050 if (!iommu->qi) {
2051 /*
2052 * Clear any previous faults.
2053 */
2054 dmar_fault(-1, iommu);
2055 /*
2056 * Disable queued invalidation if supported and already enabled
2057 * before OS handover.
2058 */
2059 dmar_disable_qi(iommu);
2060 }
2061
2062 if (dmar_enable_qi(iommu)) {
2063 /*
2064 * Queued Invalidate not enabled, use Register Based Invalidate
2065 */
2066 iommu->flush.flush_context = __iommu_flush_context;
2067 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2068 pr_info("%s: Using Register based invalidation\n",
2069 iommu->name);
2070 } else {
2071 iommu->flush.flush_context = qi_flush_context;
2072 iommu->flush.flush_iotlb = qi_flush_iotlb;
2073 pr_info("%s: Using Queued invalidation\n", iommu->name);
2074 }
2075 }
2076
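/*
 * kdump helper: copy one bus worth of context entries from the old
 * kernel's tables. @old_re is the saved root entry for @bus; present
 * entries are copied into a freshly allocated page, their domain ids
 * reserved in iommu->domain_ids, and each copied (bus, devfn) is marked
 * via set_context_copied().
 */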
2077 static int copy_context_table(struct intel_iommu *iommu,
2078 struct root_entry *old_re,
2079 struct context_entry **tbl,
2080 int bus, bool ext)
2081 {
2082 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2083 struct context_entry *new_ce = NULL, ce;
2084 struct context_entry *old_ce = NULL;
2085 struct root_entry re;
2086 phys_addr_t old_ce_phys;
2087
2088 tbl_idx = ext ? bus * 2 : bus;
2089 memcpy(&re, old_re, sizeof(re));
2090
2091 for (devfn = 0; devfn < 256; devfn++) {
2092 /* First calculate the correct index */
2093 idx = (ext ? devfn * 2 : devfn) % 256;
2094
2095 if (idx == 0) {
2096 /* First save what we may have and clean up */
2097 if (new_ce) {
2098 tbl[tbl_idx] = new_ce;
2099 __iommu_flush_cache(iommu, new_ce,
2100 VTD_PAGE_SIZE);
2101 pos = 1;
2102 }
2103
2104 if (old_ce)
2105 memunmap(old_ce);
2106
2107 ret = 0;
2108 if (devfn < 0x80)
2109 old_ce_phys = root_entry_lctp(&re);
2110 else
2111 old_ce_phys = root_entry_uctp(&re);
2112
2113 if (!old_ce_phys) {
2114 if (ext && devfn == 0) {
2115 /* No LCTP, try UCTP */
2116 devfn = 0x7f;
2117 continue;
2118 } else {
2119 goto out;
2120 }
2121 }
2122
2123 ret = -ENOMEM;
2124 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2125 MEMREMAP_WB);
2126 if (!old_ce)
2127 goto out;
2128
2129 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2130 if (!new_ce)
2131 goto out_unmap;
2132
2133 ret = 0;
2134 }
2135
2136 /* Now copy the context entry */
2137 memcpy(&ce, old_ce + idx, sizeof(ce));
2138
2139 if (!context_present(&ce))
2140 continue;
2141
2142 did = context_domain_id(&ce);
2143 if (did >= 0 && did < cap_ndoms(iommu->cap))
2144 set_bit(did, iommu->domain_ids);
2145
2146 set_context_copied(iommu, bus, devfn);
2147 new_ce[idx] = ce;
2148 }
2149
2150 tbl[tbl_idx + pos] = new_ce;
2151
2152 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2153
2154 out_unmap:
2155 memunmap(old_ce);
2156
2157 out:
2158 return ret;
2159 }
2160
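/*
 * Inherit the previous kernel's root and context tables (kdump case).
 * If the extended/scalable root format (DMA_RTADDR_SMT) of the old
 * tables does not match what this kernel would use, bail out rather
 * than toggling the format while translation is still live.
 */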
2161 static int copy_translation_tables(struct intel_iommu *iommu)
2162 {
2163 struct context_entry **ctxt_tbls;
2164 struct root_entry *old_rt;
2165 phys_addr_t old_rt_phys;
2166 int ctxt_table_entries;
2167 u64 rtaddr_reg;
2168 int bus, ret;
2169 bool new_ext, ext;
2170
2171 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2172 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2173 new_ext = !!sm_supported(iommu);
2174
2175 /*
2176 * The RTT bit can only be changed when translation is disabled,
2177 * but disabling translation means to open a window for data
2178 * corruption. So bail out and don't copy anything if we would
2179 * have to change the bit.
2180 */
2181 if (new_ext != ext)
2182 return -EINVAL;
2183
2184 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2185 if (!iommu->copied_tables)
2186 return -ENOMEM;
2187
2188 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2189 if (!old_rt_phys)
2190 return -EINVAL;
2191
2192 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2193 if (!old_rt)
2194 return -ENOMEM;
2195
2196 /* This is too big for the stack - allocate it from slab */
2197 ctxt_table_entries = ext ? 512 : 256;
2198 ret = -ENOMEM;
2199 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2200 if (!ctxt_tbls)
2201 goto out_unmap;
2202
2203 for (bus = 0; bus < 256; bus++) {
2204 ret = copy_context_table(iommu, &old_rt[bus],
2205 ctxt_tbls, bus, ext);
2206 if (ret) {
2207 pr_err("%s: Failed to copy context table for bus %d\n",
2208 iommu->name, bus);
2209 continue;
2210 }
2211 }
2212
2213 spin_lock(&iommu->lock);
2214
2215 /* Context tables are copied, now write them to the root_entry table */
2216 for (bus = 0; bus < 256; bus++) {
2217 int idx = ext ? bus * 2 : bus;
2218 u64 val;
2219
2220 if (ctxt_tbls[idx]) {
2221 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2222 iommu->root_entry[bus].lo = val;
2223 }
2224
2225 if (!ext || !ctxt_tbls[idx + 1])
2226 continue;
2227
2228 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2229 iommu->root_entry[bus].hi = val;
2230 }
2231
2232 spin_unlock(&iommu->lock);
2233
2234 kfree(ctxt_tbls);
2235
2236 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2237
2238 ret = 0;
2239
2240 out_unmap:
2241 memunmap(old_rt);
2242
2243 return ret;
2244 }
2245
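/*
 * Boot-time initialisation of all DMAR units: set up queued
 * invalidation, domain ids and root entries (copying the previous
 * kernel's tables when translation is already enabled), program the
 * root entries, then enable the page request queue and the fault
 * interrupt for each unit.
 */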
2246 static int __init init_dmars(void)
2247 {
2248 struct dmar_drhd_unit *drhd;
2249 struct intel_iommu *iommu;
2250 int ret;
2251
2252 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2253 if (ret)
2254 goto free_iommu;
2255
2256 for_each_iommu(iommu, drhd) {
2257 if (drhd->ignored) {
2258 iommu_disable_translation(iommu);
2259 continue;
2260 }
2261
2262 /*
2263 		 * Find the max PASID size of all IOMMUs in the system.
2264 		 * We need to ensure the system PASID table is no bigger
2265 		 * than the smallest supported size.
2266 */
2267 if (pasid_supported(iommu)) {
2268 u32 temp = 2 << ecap_pss(iommu->ecap);
2269
2270 intel_pasid_max_id = min_t(u32, temp,
2271 intel_pasid_max_id);
2272 }
2273
2274 intel_iommu_init_qi(iommu);
2275
2276 ret = iommu_init_domains(iommu);
2277 if (ret)
2278 goto free_iommu;
2279
2280 init_translation_status(iommu);
2281
2282 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2283 iommu_disable_translation(iommu);
2284 clear_translation_pre_enabled(iommu);
2285 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2286 iommu->name);
2287 }
2288
2289 /*
2290 * TBD:
2291 		 * we could share the same root & context tables
2292 		 * among all IOMMUs. Need to split this later.
2293 */
2294 ret = iommu_alloc_root_entry(iommu);
2295 if (ret)
2296 goto free_iommu;
2297
2298 if (translation_pre_enabled(iommu)) {
2299 pr_info("Translation already enabled - trying to copy translation structures\n");
2300
2301 ret = copy_translation_tables(iommu);
2302 if (ret) {
2303 /*
2304 * We found the IOMMU with translation
2305 * enabled - but failed to copy over the
2306 * old root-entry table. Try to proceed
2307 * by disabling translation now and
2308 * allocating a clean root-entry table.
2309 * This might cause DMAR faults, but
2310 * probably the dump will still succeed.
2311 */
2312 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2313 iommu->name);
2314 iommu_disable_translation(iommu);
2315 clear_translation_pre_enabled(iommu);
2316 } else {
2317 pr_info("Copied translation tables from previous kernel for %s\n",
2318 iommu->name);
2319 }
2320 }
2321
2322 intel_svm_check(iommu);
2323 }
2324
2325 /*
2326 * Now that qi is enabled on all iommus, set the root entry and flush
2327 * caches. This is required on some Intel X58 chipsets, otherwise the
2328 * flush_context function will loop forever and the boot hangs.
2329 */
2330 for_each_active_iommu(iommu, drhd) {
2331 iommu_flush_write_buffer(iommu);
2332 iommu_set_root_entry(iommu);
2333 }
2334
2335 check_tylersburg_isoch();
2336
2337 /*
2338 * for each drhd
2339 * enable fault log
2340 * global invalidate context cache
2341 * global invalidate iotlb
2342 * enable translation
2343 */
2344 for_each_iommu(iommu, drhd) {
2345 if (drhd->ignored) {
2346 /*
2347 * we always have to disable PMRs or DMA may fail on
2348 * this device
2349 */
2350 if (force_on)
2351 iommu_disable_protect_mem_regions(iommu);
2352 continue;
2353 }
2354
2355 iommu_flush_write_buffer(iommu);
2356
2357 #ifdef CONFIG_INTEL_IOMMU_SVM
2358 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2359 /*
2360 			 * Calling dmar_alloc_hwirq() with dmar_global_lock
2361 			 * held could cause a lock race, so drop the lock here.
2362 */
2363 up_write(&dmar_global_lock);
2364 ret = intel_svm_enable_prq(iommu);
2365 down_write(&dmar_global_lock);
2366 if (ret)
2367 goto free_iommu;
2368 }
2369 #endif
2370 ret = dmar_set_interrupt(iommu);
2371 if (ret)
2372 goto free_iommu;
2373 }
2374
2375 return 0;
2376
2377 free_iommu:
2378 for_each_active_iommu(iommu, drhd) {
2379 disable_dmar_iommu(iommu);
2380 free_dmar_iommu(iommu);
2381 }
2382
2383 return ret;
2384 }
2385
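/*
 * Mark DMAR units that may be ignored: units whose device scope lists
 * no devices at all, and units that cover only graphics devices when
 * disable_igfx_iommu is set (such units are flagged gfx_dedicated
 * either way).
 */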
2386 static void __init init_no_remapping_devices(void)
2387 {
2388 struct dmar_drhd_unit *drhd;
2389 struct device *dev;
2390 int i;
2391
2392 for_each_drhd_unit(drhd) {
2393 if (!drhd->include_all) {
2394 for_each_active_dev_scope(drhd->devices,
2395 drhd->devices_cnt, i, dev)
2396 break;
2397 /* ignore DMAR unit if no devices exist */
2398 if (i == drhd->devices_cnt)
2399 drhd->ignored = 1;
2400 }
2401 }
2402
2403 for_each_active_drhd_unit(drhd) {
2404 if (drhd->include_all)
2405 continue;
2406
2407 for_each_active_dev_scope(drhd->devices,
2408 drhd->devices_cnt, i, dev)
2409 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2410 break;
2411 if (i < drhd->devices_cnt)
2412 continue;
2413
2414 		/* This IOMMU has *only* gfx devices. Either bypass it or
2415 		   set the gfx_dedicated flag, as appropriate. */
2416 drhd->gfx_dedicated = 1;
2417 if (disable_igfx_iommu)
2418 drhd->ignored = 1;
2419 }
2420 }
2421
2422 #ifdef CONFIG_SUSPEND
2423 static int init_iommu_hw(void)
2424 {
2425 struct dmar_drhd_unit *drhd;
2426 struct intel_iommu *iommu = NULL;
2427 int ret;
2428
2429 for_each_active_iommu(iommu, drhd) {
2430 if (iommu->qi) {
2431 ret = dmar_reenable_qi(iommu);
2432 if (ret)
2433 return ret;
2434 }
2435 }
2436
2437 for_each_iommu(iommu, drhd) {
2438 if (drhd->ignored) {
2439 /*
2440 * we always have to disable PMRs or DMA may fail on
2441 * this device
2442 */
2443 if (force_on)
2444 iommu_disable_protect_mem_regions(iommu);
2445 continue;
2446 }
2447
2448 iommu_flush_write_buffer(iommu);
2449 iommu_set_root_entry(iommu);
2450 iommu_enable_translation(iommu);
2451 iommu_disable_protect_mem_regions(iommu);
2452 }
2453
2454 return 0;
2455 }
2456
2457 static void iommu_flush_all(void)
2458 {
2459 struct dmar_drhd_unit *drhd;
2460 struct intel_iommu *iommu;
2461
2462 for_each_active_iommu(iommu, drhd) {
2463 iommu->flush.flush_context(iommu, 0, 0, 0,
2464 DMA_CCMD_GLOBAL_INVL);
2465 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2466 DMA_TLB_GLOBAL_FLUSH);
2467 }
2468 }
2469
2470 static int iommu_suspend(void)
2471 {
2472 struct dmar_drhd_unit *drhd;
2473 struct intel_iommu *iommu = NULL;
2474 unsigned long flag;
2475
2476 iommu_flush_all();
2477
2478 for_each_active_iommu(iommu, drhd) {
2479 iommu_disable_translation(iommu);
2480
2481 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2482
2483 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2484 readl(iommu->reg + DMAR_FECTL_REG);
2485 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2486 readl(iommu->reg + DMAR_FEDATA_REG);
2487 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2488 readl(iommu->reg + DMAR_FEADDR_REG);
2489 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2490 readl(iommu->reg + DMAR_FEUADDR_REG);
2491
2492 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2493 }
2494 return 0;
2495 }
2496
2497 static void iommu_resume(void)
2498 {
2499 struct dmar_drhd_unit *drhd;
2500 struct intel_iommu *iommu = NULL;
2501 unsigned long flag;
2502
2503 if (init_iommu_hw()) {
2504 if (force_on)
2505 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2506 else
2507 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2508 return;
2509 }
2510
2511 for_each_active_iommu(iommu, drhd) {
2512
2513 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2514
2515 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2516 iommu->reg + DMAR_FECTL_REG);
2517 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2518 iommu->reg + DMAR_FEDATA_REG);
2519 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2520 iommu->reg + DMAR_FEADDR_REG);
2521 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2522 iommu->reg + DMAR_FEUADDR_REG);
2523
2524 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2525 }
2526 }
2527
2528 static struct syscore_ops iommu_syscore_ops = {
2529 .resume = iommu_resume,
2530 .suspend = iommu_suspend,
2531 };
2532
2533 static void __init init_iommu_pm_ops(void)
2534 {
2535 register_syscore_ops(&iommu_syscore_ops);
2536 }
2537
2538 #else
2539 static inline void init_iommu_pm_ops(void) {}
2540 #endif /* CONFIG_SUSPEND */
2541
2542 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2543 {
2544 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2545 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2546 rmrr->end_address <= rmrr->base_address ||
2547 arch_rmrr_sanity_check(rmrr))
2548 return -EINVAL;
2549
2550 return 0;
2551 }
2552
2553 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2554 {
2555 struct acpi_dmar_reserved_memory *rmrr;
2556 struct dmar_rmrr_unit *rmrru;
2557
2558 rmrr = (struct acpi_dmar_reserved_memory *)header;
2559 if (rmrr_sanity_check(rmrr)) {
2560 pr_warn(FW_BUG
2561 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2562 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2563 rmrr->base_address, rmrr->end_address,
2564 dmi_get_system_info(DMI_BIOS_VENDOR),
2565 dmi_get_system_info(DMI_BIOS_VERSION),
2566 dmi_get_system_info(DMI_PRODUCT_VERSION));
2567 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2568 }
2569
2570 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2571 if (!rmrru)
2572 goto out;
2573
2574 rmrru->hdr = header;
2575
2576 rmrru->base_address = rmrr->base_address;
2577 rmrru->end_address = rmrr->end_address;
2578
2579 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2580 ((void *)rmrr) + rmrr->header.length,
2581 &rmrru->devices_cnt);
2582 if (rmrru->devices_cnt && rmrru->devices == NULL)
2583 goto free_rmrru;
2584
2585 list_add(&rmrru->list, &dmar_rmrr_units);
2586
2587 return 0;
2588 free_rmrru:
2589 kfree(rmrru);
2590 out:
2591 return -ENOMEM;
2592 }
2593
2594 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2595 {
2596 struct dmar_atsr_unit *atsru;
2597 struct acpi_dmar_atsr *tmp;
2598
2599 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2600 dmar_rcu_check()) {
2601 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2602 if (atsr->segment != tmp->segment)
2603 continue;
2604 if (atsr->header.length != tmp->header.length)
2605 continue;
2606 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2607 return atsru;
2608 }
2609
2610 return NULL;
2611 }
2612
2613 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2614 {
2615 struct acpi_dmar_atsr *atsr;
2616 struct dmar_atsr_unit *atsru;
2617
2618 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2619 return 0;
2620
2621 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2622 atsru = dmar_find_atsr(atsr);
2623 if (atsru)
2624 return 0;
2625
2626 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2627 if (!atsru)
2628 return -ENOMEM;
2629
2630 /*
2631 	 * If the memory was allocated from the slab by the ACPI _DSM method,
2632 	 * we need to copy its content because the memory buffer will be
2633 	 * freed on return.
2634 */
2635 atsru->hdr = (void *)(atsru + 1);
2636 memcpy(atsru->hdr, hdr, hdr->length);
2637 atsru->include_all = atsr->flags & 0x1;
2638 if (!atsru->include_all) {
2639 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2640 (void *)atsr + atsr->header.length,
2641 &atsru->devices_cnt);
2642 if (atsru->devices_cnt && atsru->devices == NULL) {
2643 kfree(atsru);
2644 return -ENOMEM;
2645 }
2646 }
2647
2648 list_add_rcu(&atsru->list, &dmar_atsr_units);
2649
2650 return 0;
2651 }
2652
2653 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2654 {
2655 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2656 kfree(atsru);
2657 }
2658
2659 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2660 {
2661 struct acpi_dmar_atsr *atsr;
2662 struct dmar_atsr_unit *atsru;
2663
2664 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2665 atsru = dmar_find_atsr(atsr);
2666 if (atsru) {
2667 list_del_rcu(&atsru->list);
2668 synchronize_rcu();
2669 intel_iommu_free_atsr(atsru);
2670 }
2671
2672 return 0;
2673 }
2674
2675 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2676 {
2677 int i;
2678 struct device *dev;
2679 struct acpi_dmar_atsr *atsr;
2680 struct dmar_atsr_unit *atsru;
2681
2682 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2683 atsru = dmar_find_atsr(atsr);
2684 if (!atsru)
2685 return 0;
2686
2687 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2688 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2689 i, dev)
2690 return -EBUSY;
2691 }
2692
2693 return 0;
2694 }
2695
2696 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2697 {
2698 struct dmar_satc_unit *satcu;
2699 struct acpi_dmar_satc *tmp;
2700
2701 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2702 dmar_rcu_check()) {
2703 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2704 if (satc->segment != tmp->segment)
2705 continue;
2706 if (satc->header.length != tmp->header.length)
2707 continue;
2708 if (memcmp(satc, tmp, satc->header.length) == 0)
2709 return satcu;
2710 }
2711
2712 return NULL;
2713 }
2714
2715 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2716 {
2717 struct acpi_dmar_satc *satc;
2718 struct dmar_satc_unit *satcu;
2719
2720 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2721 return 0;
2722
2723 satc = container_of(hdr, struct acpi_dmar_satc, header);
2724 satcu = dmar_find_satc(satc);
2725 if (satcu)
2726 return 0;
2727
2728 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2729 if (!satcu)
2730 return -ENOMEM;
2731
2732 satcu->hdr = (void *)(satcu + 1);
2733 memcpy(satcu->hdr, hdr, hdr->length);
2734 satcu->atc_required = satc->flags & 0x1;
2735 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2736 (void *)satc + satc->header.length,
2737 &satcu->devices_cnt);
2738 if (satcu->devices_cnt && !satcu->devices) {
2739 kfree(satcu);
2740 return -ENOMEM;
2741 }
2742 list_add_rcu(&satcu->list, &dmar_satc_units);
2743
2744 return 0;
2745 }
2746
2747 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2748 {
2749 int sp, ret;
2750 struct intel_iommu *iommu = dmaru->iommu;
2751
2752 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2753 if (ret)
2754 goto out;
2755
2756 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2757 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2758 pr_warn("%s: Doesn't support large page.\n",
2759 iommu->name);
2760 return -ENXIO;
2761 }
2762
2763 /*
2764 * Disable translation if already enabled prior to OS handover.
2765 */
2766 if (iommu->gcmd & DMA_GCMD_TE)
2767 iommu_disable_translation(iommu);
2768
2769 ret = iommu_init_domains(iommu);
2770 if (ret == 0)
2771 ret = iommu_alloc_root_entry(iommu);
2772 if (ret)
2773 goto out;
2774
2775 intel_svm_check(iommu);
2776
2777 if (dmaru->ignored) {
2778 /*
2779 * we always have to disable PMRs or DMA may fail on this device
2780 */
2781 if (force_on)
2782 iommu_disable_protect_mem_regions(iommu);
2783 return 0;
2784 }
2785
2786 intel_iommu_init_qi(iommu);
2787 iommu_flush_write_buffer(iommu);
2788
2789 #ifdef CONFIG_INTEL_IOMMU_SVM
2790 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2791 ret = intel_svm_enable_prq(iommu);
2792 if (ret)
2793 goto disable_iommu;
2794 }
2795 #endif
2796 ret = dmar_set_interrupt(iommu);
2797 if (ret)
2798 goto disable_iommu;
2799
2800 iommu_set_root_entry(iommu);
2801 iommu_enable_translation(iommu);
2802
2803 iommu_disable_protect_mem_regions(iommu);
2804 return 0;
2805
2806 disable_iommu:
2807 disable_dmar_iommu(iommu);
2808 out:
2809 free_dmar_iommu(iommu);
2810 return ret;
2811 }
2812
2813 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2814 {
2815 int ret = 0;
2816 struct intel_iommu *iommu = dmaru->iommu;
2817
2818 if (!intel_iommu_enabled)
2819 return 0;
2820 if (iommu == NULL)
2821 return -EINVAL;
2822
2823 if (insert) {
2824 ret = intel_iommu_add(dmaru);
2825 } else {
2826 disable_dmar_iommu(iommu);
2827 free_dmar_iommu(iommu);
2828 }
2829
2830 return ret;
2831 }
2832
2833 static void intel_iommu_free_dmars(void)
2834 {
2835 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2836 struct dmar_atsr_unit *atsru, *atsr_n;
2837 struct dmar_satc_unit *satcu, *satc_n;
2838
2839 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2840 list_del(&rmrru->list);
2841 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2842 kfree(rmrru);
2843 }
2844
2845 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2846 list_del(&atsru->list);
2847 intel_iommu_free_atsr(atsru);
2848 }
2849 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2850 list_del(&satcu->list);
2851 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2852 kfree(satcu);
2853 }
2854 }
2855
2856 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2857 {
2858 struct dmar_satc_unit *satcu;
2859 struct acpi_dmar_satc *satc;
2860 struct device *tmp;
2861 int i;
2862
2863 dev = pci_physfn(dev);
2864 rcu_read_lock();
2865
2866 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2867 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2868 if (satc->segment != pci_domain_nr(dev->bus))
2869 continue;
2870 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2871 if (to_pci_dev(tmp) == dev)
2872 goto out;
2873 }
2874 satcu = NULL;
2875 out:
2876 rcu_read_unlock();
2877 return satcu;
2878 }
2879
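/*
 * Decide whether ATS may be enabled for @dev. A SATC entry takes
 * precedence (see the comment below); otherwise the device's root port
 * must appear in an ATSR for the same PCI segment, or the ATSR must be
 * include_all. Devices integrated on the root bus are always allowed.
 */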
2880 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2881 {
2882 int i, ret = 1;
2883 struct pci_bus *bus;
2884 struct pci_dev *bridge = NULL;
2885 struct device *tmp;
2886 struct acpi_dmar_atsr *atsr;
2887 struct dmar_atsr_unit *atsru;
2888 struct dmar_satc_unit *satcu;
2889
2890 dev = pci_physfn(dev);
2891 satcu = dmar_find_matched_satc_unit(dev);
2892 if (satcu)
2893 /*
2894 		 * This device supports ATS as it is listed in the SATC table.
2895 		 * When the IOMMU is in legacy mode, ATS is enabled
2896 		 * automatically by hardware for devices that require it,
2897 		 * so the OS should not enable ATS for this device in order
2898 		 * to avoid duplicated TLB invalidations.
2899 */
2900 return !(satcu->atc_required && !sm_supported(iommu));
2901
2902 for (bus = dev->bus; bus; bus = bus->parent) {
2903 bridge = bus->self;
2904 /* If it's an integrated device, allow ATS */
2905 if (!bridge)
2906 return 1;
2907 /* Connected via non-PCIe: no ATS */
2908 if (!pci_is_pcie(bridge) ||
2909 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2910 return 0;
2911 /* If we found the root port, look it up in the ATSR */
2912 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2913 break;
2914 }
2915
2916 rcu_read_lock();
2917 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2918 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2919 if (atsr->segment != pci_domain_nr(dev->bus))
2920 continue;
2921
2922 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2923 if (tmp == &bridge->dev)
2924 goto out;
2925
2926 if (atsru->include_all)
2927 goto out;
2928 }
2929 ret = 0;
2930 out:
2931 rcu_read_unlock();
2932
2933 return ret;
2934 }
2935
2936 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2937 {
2938 int ret;
2939 struct dmar_rmrr_unit *rmrru;
2940 struct dmar_atsr_unit *atsru;
2941 struct dmar_satc_unit *satcu;
2942 struct acpi_dmar_atsr *atsr;
2943 struct acpi_dmar_reserved_memory *rmrr;
2944 struct acpi_dmar_satc *satc;
2945
2946 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2947 return 0;
2948
2949 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2950 rmrr = container_of(rmrru->hdr,
2951 struct acpi_dmar_reserved_memory, header);
2952 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2953 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2954 ((void *)rmrr) + rmrr->header.length,
2955 rmrr->segment, rmrru->devices,
2956 rmrru->devices_cnt);
2957 if (ret < 0)
2958 return ret;
2959 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2960 dmar_remove_dev_scope(info, rmrr->segment,
2961 rmrru->devices, rmrru->devices_cnt);
2962 }
2963 }
2964
2965 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2966 if (atsru->include_all)
2967 continue;
2968
2969 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2970 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2971 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2972 (void *)atsr + atsr->header.length,
2973 atsr->segment, atsru->devices,
2974 atsru->devices_cnt);
2975 if (ret > 0)
2976 break;
2977 else if (ret < 0)
2978 return ret;
2979 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2980 if (dmar_remove_dev_scope(info, atsr->segment,
2981 atsru->devices, atsru->devices_cnt))
2982 break;
2983 }
2984 }
2985 list_for_each_entry(satcu, &dmar_satc_units, list) {
2986 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2987 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2988 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2989 (void *)satc + satc->header.length,
2990 satc->segment, satcu->devices,
2991 satcu->devices_cnt);
2992 if (ret > 0)
2993 break;
2994 else if (ret < 0)
2995 return ret;
2996 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2997 if (dmar_remove_dev_scope(info, satc->segment,
2998 satcu->devices, satcu->devices_cnt))
2999 break;
3000 }
3001 }
3002
3003 return 0;
3004 }
3005
3006 static void intel_disable_iommus(void)
3007 {
3008 struct intel_iommu *iommu = NULL;
3009 struct dmar_drhd_unit *drhd;
3010
3011 for_each_iommu(iommu, drhd)
3012 iommu_disable_translation(iommu);
3013 }
3014
3015 void intel_iommu_shutdown(void)
3016 {
3017 struct dmar_drhd_unit *drhd;
3018 struct intel_iommu *iommu = NULL;
3019
3020 if (no_iommu || dmar_disabled)
3021 return;
3022
3023 down_write(&dmar_global_lock);
3024
3025 /* Disable PMRs explicitly here. */
3026 for_each_iommu(iommu, drhd)
3027 iommu_disable_protect_mem_regions(iommu);
3028
3029 /* Make sure the IOMMUs are switched off */
3030 intel_disable_iommus();
3031
3032 up_write(&dmar_global_lock);
3033 }
3034
3035 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3036 {
3037 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3038
3039 return container_of(iommu_dev, struct intel_iommu, iommu);
3040 }
3041
3042 static ssize_t version_show(struct device *dev,
3043 struct device_attribute *attr, char *buf)
3044 {
3045 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3046 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3047 return sysfs_emit(buf, "%d:%d\n",
3048 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3049 }
3050 static DEVICE_ATTR_RO(version);
3051
3052 static ssize_t address_show(struct device *dev,
3053 struct device_attribute *attr, char *buf)
3054 {
3055 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3056 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3057 }
3058 static DEVICE_ATTR_RO(address);
3059
3060 static ssize_t cap_show(struct device *dev,
3061 struct device_attribute *attr, char *buf)
3062 {
3063 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3064 return sysfs_emit(buf, "%llx\n", iommu->cap);
3065 }
3066 static DEVICE_ATTR_RO(cap);
3067
3068 static ssize_t ecap_show(struct device *dev,
3069 struct device_attribute *attr, char *buf)
3070 {
3071 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3072 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3073 }
3074 static DEVICE_ATTR_RO(ecap);
3075
3076 static ssize_t domains_supported_show(struct device *dev,
3077 struct device_attribute *attr, char *buf)
3078 {
3079 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3080 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3081 }
3082 static DEVICE_ATTR_RO(domains_supported);
3083
3084 static ssize_t domains_used_show(struct device *dev,
3085 struct device_attribute *attr, char *buf)
3086 {
3087 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3088 return sysfs_emit(buf, "%d\n",
3089 bitmap_weight(iommu->domain_ids,
3090 cap_ndoms(iommu->cap)));
3091 }
3092 static DEVICE_ATTR_RO(domains_used);
3093
3094 static struct attribute *intel_iommu_attrs[] = {
3095 &dev_attr_version.attr,
3096 &dev_attr_address.attr,
3097 &dev_attr_cap.attr,
3098 &dev_attr_ecap.attr,
3099 &dev_attr_domains_supported.attr,
3100 &dev_attr_domains_used.attr,
3101 NULL,
3102 };
3103
3104 static struct attribute_group intel_iommu_group = {
3105 .name = "intel-iommu",
3106 .attrs = intel_iommu_attrs,
3107 };
3108
3109 const struct attribute_group *intel_iommu_groups[] = {
3110 &intel_iommu_group,
3111 NULL,
3112 };
3113
3114 static bool has_external_pci(void)
3115 {
3116 struct pci_dev *pdev = NULL;
3117
3118 for_each_pci_dev(pdev)
3119 if (pdev->external_facing) {
3120 pci_dev_put(pdev);
3121 return true;
3122 }
3123
3124 return false;
3125 }
3126
3127 static int __init platform_optin_force_iommu(void)
3128 {
3129 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3130 return 0;
3131
3132 if (no_iommu || dmar_disabled)
3133 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3134
3135 /*
3136 * If Intel-IOMMU is disabled by default, we will apply identity
3137 * map for all devices except those marked as being untrusted.
3138 */
3139 if (dmar_disabled)
3140 iommu_set_default_passthrough(false);
3141
3142 dmar_disabled = 0;
3143 no_iommu = 0;
3144
3145 return 1;
3146 }
3147
3148 static int __init probe_acpi_namespace_devices(void)
3149 {
3150 struct dmar_drhd_unit *drhd;
3151 /* To avoid a -Wunused-but-set-variable warning. */
3152 struct intel_iommu *iommu __maybe_unused;
3153 struct device *dev;
3154 int i, ret = 0;
3155
3156 for_each_active_iommu(iommu, drhd) {
3157 for_each_active_dev_scope(drhd->devices,
3158 drhd->devices_cnt, i, dev) {
3159 struct acpi_device_physical_node *pn;
3160 struct acpi_device *adev;
3161
3162 if (dev->bus != &acpi_bus_type)
3163 continue;
3164
3165 adev = to_acpi_device(dev);
3166 mutex_lock(&adev->physical_node_lock);
3167 list_for_each_entry(pn,
3168 &adev->physical_node_list, node) {
3169 ret = iommu_probe_device(pn->dev);
3170 if (ret)
3171 break;
3172 }
3173 mutex_unlock(&adev->physical_node_lock);
3174
3175 if (ret)
3176 return ret;
3177 }
3178 }
3179
3180 return 0;
3181 }
3182
3183 static __init int tboot_force_iommu(void)
3184 {
3185 if (!tboot_enabled())
3186 return 0;
3187
3188 if (no_iommu || dmar_disabled)
3189 pr_warn("Forcing Intel-IOMMU to enabled\n");
3190
3191 dmar_disabled = 0;
3192 no_iommu = 0;
3193
3194 return 1;
3195 }
3196
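/*
 * Main entry point: parse the DMAR table and device scopes, honour
 * tboot/platform opt-in forcing, initialise every DMAR unit via
 * init_dmars(), register the IOMMUs with sysfs and the IOMMU core, and
 * finally enable translation on units that the previous kernel had not
 * already enabled.
 */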
3197 int __init intel_iommu_init(void)
3198 {
3199 int ret = -ENODEV;
3200 struct dmar_drhd_unit *drhd;
3201 struct intel_iommu *iommu;
3202
3203 /*
3204 * Intel IOMMU is required for a TXT/tboot launch or platform
3205 * opt in, so enforce that.
3206 */
3207 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3208 platform_optin_force_iommu();
3209
3210 down_write(&dmar_global_lock);
3211 if (dmar_table_init()) {
3212 if (force_on)
3213 panic("tboot: Failed to initialize DMAR table\n");
3214 goto out_free_dmar;
3215 }
3216
3217 if (dmar_dev_scope_init() < 0) {
3218 if (force_on)
3219 panic("tboot: Failed to initialize DMAR device scope\n");
3220 goto out_free_dmar;
3221 }
3222
3223 up_write(&dmar_global_lock);
3224
3225 /*
3226 * The bus notifier takes the dmar_global_lock, so lockdep will
3227 * complain later when we register it under the lock.
3228 */
3229 dmar_register_bus_notifier();
3230
3231 down_write(&dmar_global_lock);
3232
3233 if (!no_iommu)
3234 intel_iommu_debugfs_init();
3235
3236 if (no_iommu || dmar_disabled) {
3237 /*
3238 * We exit the function here to ensure IOMMU's remapping and
3239 * mempool aren't setup, which means that the IOMMU's PMRs
3240 * won't be disabled via the call to init_dmars(). So disable
3241 * it explicitly here. The PMRs were setup by tboot prior to
3242 * calling SENTER, but the kernel is expected to reset/tear
3243 * down the PMRs.
3244 */
3245 if (intel_iommu_tboot_noforce) {
3246 for_each_iommu(iommu, drhd)
3247 iommu_disable_protect_mem_regions(iommu);
3248 }
3249
3250 /*
3251 * Make sure the IOMMUs are switched off, even when we
3252 * boot into a kexec kernel and the previous kernel left
3253 * them enabled
3254 */
3255 intel_disable_iommus();
3256 goto out_free_dmar;
3257 }
3258
3259 if (list_empty(&dmar_rmrr_units))
3260 pr_info("No RMRR found\n");
3261
3262 if (list_empty(&dmar_atsr_units))
3263 pr_info("No ATSR found\n");
3264
3265 if (list_empty(&dmar_satc_units))
3266 pr_info("No SATC found\n");
3267
3268 init_no_remapping_devices();
3269
3270 ret = init_dmars();
3271 if (ret) {
3272 if (force_on)
3273 panic("tboot: Failed to initialize DMARs\n");
3274 pr_err("Initialization failed\n");
3275 goto out_free_dmar;
3276 }
3277 up_write(&dmar_global_lock);
3278
3279 init_iommu_pm_ops();
3280
3281 down_read(&dmar_global_lock);
3282 for_each_active_iommu(iommu, drhd) {
3283 /*
3284 * The flush queue implementation does not perform
3285 * page-selective invalidations that are required for efficient
3286 * TLB flushes in virtual environments. The benefit of batching
3287 * is likely to be much lower than the overhead of synchronizing
3288 * the virtual and physical IOMMU page-tables.
3289 */
3290 if (cap_caching_mode(iommu->cap) &&
3291 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3292 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3293 iommu_set_dma_strict();
3294 }
3295 iommu_device_sysfs_add(&iommu->iommu, NULL,
3296 intel_iommu_groups,
3297 "%s", iommu->name);
3298 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3299
3300 iommu_pmu_register(iommu);
3301 }
3302
3303 if (probe_acpi_namespace_devices())
3304 pr_warn("ACPI name space devices didn't probe correctly\n");
3305
3306 /* Finally, we enable the DMA remapping hardware. */
3307 for_each_iommu(iommu, drhd) {
3308 if (!drhd->ignored && !translation_pre_enabled(iommu))
3309 iommu_enable_translation(iommu);
3310
3311 iommu_disable_protect_mem_regions(iommu);
3312 }
3313 up_read(&dmar_global_lock);
3314
3315 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3316
3317 intel_iommu_enabled = 1;
3318
3319 return 0;
3320
3321 out_free_dmar:
3322 intel_iommu_free_dmars();
3323 up_write(&dmar_global_lock);
3324 return ret;
3325 }
3326
3327 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3328 {
3329 struct device_domain_info *info = opaque;
3330
3331 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3332 return 0;
3333 }
3334
3335 /*
3336 * NB - intel-iommu lacks any sort of reference counting for the users of
3337 * dependent devices. If multiple endpoints have intersecting dependent
3338 * devices, unbinding the driver from any one of them will possibly leave
3339 * the others unable to operate.
3340 */
3341 static void domain_context_clear(struct device_domain_info *info)
3342 {
3343 if (!dev_is_pci(info->dev)) {
3344 domain_context_clear_one(info, info->bus, info->devfn);
3345 return;
3346 }
3347
3348 pci_for_each_dma_alias(to_pci_dev(info->dev),
3349 &domain_context_clear_one_cb, info);
3350 }
3351
3352 /*
3353 * Clear the page table pointer in context or pasid table entries so that
3354 * all DMA requests without PASID from the device are blocked. If the page
3355 * table has been set, clean up the data structures.
3356 */
3357 void device_block_translation(struct device *dev)
3358 {
3359 struct device_domain_info *info = dev_iommu_priv_get(dev);
3360 struct intel_iommu *iommu = info->iommu;
3361 unsigned long flags;
3362
3363 iommu_disable_pci_caps(info);
3364 if (!dev_is_real_dma_subdevice(dev)) {
3365 if (sm_supported(iommu))
3366 intel_pasid_tear_down_entry(iommu, dev,
3367 IOMMU_NO_PASID, false);
3368 else
3369 domain_context_clear(info);
3370 }
3371
3372 if (!info->domain)
3373 return;
3374
3375 spin_lock_irqsave(&info->domain->lock, flags);
3376 list_del(&info->link);
3377 spin_unlock_irqrestore(&info->domain->lock, flags);
3378
3379 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3380 domain_detach_iommu(info->domain, iommu);
3381 info->domain = NULL;
3382 }
3383
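/*
 * Minimal initialisation for domains created through
 * intel_iommu_domain_alloc(): derive gaw/agaw from @guest_width and
 * allocate the top-level page directory.
 */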
3384 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3385 {
3386 int adjust_width;
3387
3388 /* calculate AGAW */
3389 domain->gaw = guest_width;
3390 adjust_width = guestwidth_to_adjustwidth(guest_width);
3391 domain->agaw = width_to_agaw(adjust_width);
3392
3393 domain->iommu_coherency = false;
3394 domain->iommu_superpage = 0;
3395 domain->max_addr = 0;
3396
3397 /* always allocate the top pgd */
3398 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3399 if (!domain->pgd)
3400 return -ENOMEM;
3401 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3402 return 0;
3403 }
3404
3405 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3406 struct device *dev)
3407 {
3408 device_block_translation(dev);
3409 return 0;
3410 }
3411
3412 static struct iommu_domain blocking_domain = {
3413 .type = IOMMU_DOMAIN_BLOCKED,
3414 .ops = &(const struct iommu_domain_ops) {
3415 .attach_dev = blocking_domain_attach_dev,
3416 }
3417 };
3418
3419 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3420 {
3421 if (!intel_iommu_superpage)
3422 return 0;
3423
3424 if (first_stage)
3425 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3426
3427 return fls(cap_super_page_val(iommu->cap));
3428 }
3429
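/*
 * Allocate and initialise a paging domain for @dev. The address width
 * is the IOMMU's agaw capped by MGAW, the supported page sizes follow
 * the superpage capability, and for first-stage translation the
 * aperture is halved so that IOVAs remain canonical (see below).
 */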
3430 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3431 {
3432 struct device_domain_info *info = dev_iommu_priv_get(dev);
3433 struct intel_iommu *iommu = info->iommu;
3434 struct dmar_domain *domain;
3435 int addr_width;
3436
3437 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3438 if (!domain)
3439 return ERR_PTR(-ENOMEM);
3440
3441 INIT_LIST_HEAD(&domain->devices);
3442 INIT_LIST_HEAD(&domain->dev_pasids);
3443 INIT_LIST_HEAD(&domain->cache_tags);
3444 spin_lock_init(&domain->lock);
3445 spin_lock_init(&domain->cache_lock);
3446 xa_init(&domain->iommu_array);
3447
3448 domain->nid = dev_to_node(dev);
3449 domain->use_first_level = first_stage;
3450
3451 /* calculate the address width */
3452 addr_width = agaw_to_width(iommu->agaw);
3453 if (addr_width > cap_mgaw(iommu->cap))
3454 addr_width = cap_mgaw(iommu->cap);
3455 domain->gaw = addr_width;
3456 domain->agaw = iommu->agaw;
3457 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3458
3459 /* iommu memory access coherency */
3460 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3461
3462 /* pagesize bitmap */
3463 domain->domain.pgsize_bitmap = SZ_4K;
3464 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3465 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3466
3467 /*
3468 * IOVA aperture: First-level translation restricts the input-address
3469 * to a canonical address (i.e., address bits 63:N have the same value
3470 	 * as address bit [N-1], where N is 48 bits with 4-level paging and
3471 	 * 57 bits with 5-level paging). Hence, skip bit [N-1].
3472 */
3473 domain->domain.geometry.force_aperture = true;
3474 domain->domain.geometry.aperture_start = 0;
3475 if (first_stage)
3476 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3477 else
3478 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3479
3480 /* always allocate the top pgd */
3481 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3482 if (!domain->pgd) {
3483 kfree(domain);
3484 return ERR_PTR(-ENOMEM);
3485 }
3486 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3487
3488 return domain;
3489 }
3490
3491 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3492 {
3493 struct dmar_domain *dmar_domain;
3494 struct iommu_domain *domain;
3495
3496 switch (type) {
3497 case IOMMU_DOMAIN_DMA:
3498 case IOMMU_DOMAIN_UNMANAGED:
3499 dmar_domain = alloc_domain(type);
3500 if (!dmar_domain) {
3501 pr_err("Can't allocate dmar_domain\n");
3502 return NULL;
3503 }
3504 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3505 pr_err("Domain initialization failed\n");
3506 domain_exit(dmar_domain);
3507 return NULL;
3508 }
3509
3510 domain = &dmar_domain->domain;
3511 domain->geometry.aperture_start = 0;
3512 domain->geometry.aperture_end =
3513 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3514 domain->geometry.force_aperture = true;
3515
3516 return domain;
3517 default:
3518 return NULL;
3519 }
3520
3521 return NULL;
3522 }
3523
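/*
 * iommufd user-domain allocation: nesting-parent and dirty-tracking
 * flags are accepted only when the hardware supports them, user domains
 * always use second-stage translation, and allocation of a nested child
 * is delegated to intel_nested_domain_alloc().
 */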
3524 static struct iommu_domain *
3525 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3526 struct iommu_domain *parent,
3527 const struct iommu_user_data *user_data)
3528 {
3529 struct device_domain_info *info = dev_iommu_priv_get(dev);
3530 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3531 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3532 struct intel_iommu *iommu = info->iommu;
3533 struct dmar_domain *dmar_domain;
3534 struct iommu_domain *domain;
3535
3536 /* Must be NESTING domain */
3537 if (parent) {
3538 if (!nested_supported(iommu) || flags)
3539 return ERR_PTR(-EOPNOTSUPP);
3540 return intel_nested_domain_alloc(parent, user_data);
3541 }
3542
3543 if (flags &
3544 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3545 return ERR_PTR(-EOPNOTSUPP);
3546 if (nested_parent && !nested_supported(iommu))
3547 return ERR_PTR(-EOPNOTSUPP);
3548 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3549 return ERR_PTR(-EOPNOTSUPP);
3550
3551 /* Do not use first stage for user domain translation. */
3552 dmar_domain = paging_domain_alloc(dev, false);
3553 if (IS_ERR(dmar_domain))
3554 return ERR_CAST(dmar_domain);
3555 domain = &dmar_domain->domain;
3556 domain->type = IOMMU_DOMAIN_UNMANAGED;
3557 domain->owner = &intel_iommu_ops;
3558 domain->ops = intel_iommu_ops.default_domain_ops;
3559
3560 if (nested_parent) {
3561 dmar_domain->nested_parent = true;
3562 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3563 spin_lock_init(&dmar_domain->s1_lock);
3564 }
3565
3566 if (dirty_tracking) {
3567 if (dmar_domain->use_first_level) {
3568 iommu_domain_free(domain);
3569 return ERR_PTR(-EOPNOTSUPP);
3570 }
3571 domain->dirty_ops = &intel_dirty_ops;
3572 }
3573
3574 return domain;
3575 }
3576
3577 static void intel_iommu_domain_free(struct iommu_domain *domain)
3578 {
3579 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3580
3581 WARN_ON(dmar_domain->nested_parent &&
3582 !list_empty(&dmar_domain->s1_domains));
3583 domain_exit(dmar_domain);
3584 }
3585
3586 int prepare_domain_attach_device(struct iommu_domain *domain,
3587 struct device *dev)
3588 {
3589 struct device_domain_info *info = dev_iommu_priv_get(dev);
3590 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3591 struct intel_iommu *iommu = info->iommu;
3592 int addr_width;
3593
3594 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3595 return -EINVAL;
3596
3597 if (domain->dirty_ops && !ssads_supported(iommu))
3598 return -EINVAL;
3599
3600 /* check if this iommu agaw is sufficient for max mapped address */
3601 addr_width = agaw_to_width(iommu->agaw);
3602 if (addr_width > cap_mgaw(iommu->cap))
3603 addr_width = cap_mgaw(iommu->cap);
3604
3605 if (dmar_domain->max_addr > (1LL << addr_width))
3606 return -EINVAL;
3607 dmar_domain->gaw = addr_width;
3608
3609 /*
3610 * Knock out extra levels of page tables if necessary
3611 */
3612 while (iommu->agaw < dmar_domain->agaw) {
3613 struct dma_pte *pte;
3614
3615 pte = dmar_domain->pgd;
3616 if (dma_pte_present(pte)) {
3617 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3618 iommu_free_page(pte);
3619 }
3620 dmar_domain->agaw--;
3621 }
3622
3623 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3624 context_copied(iommu, info->bus, info->devfn))
3625 return intel_pasid_setup_sm_context(dev);
3626
3627 return 0;
3628 }
3629
3630 static int intel_iommu_attach_device(struct iommu_domain *domain,
3631 struct device *dev)
3632 {
3633 int ret;
3634
3635 device_block_translation(dev);
3636
3637 ret = prepare_domain_attach_device(domain, dev);
3638 if (ret)
3639 return ret;
3640
3641 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3642 }
3643
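/*
 * iommu_ops map path: translate IOMMU_READ/IOMMU_WRITE (plus forced PTE
 * snooping) into DMA PTE bits, update the domain's max_addr bookkeeping,
 * and hand page-frame numbers to __domain_mapping().
 */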
3644 static int intel_iommu_map(struct iommu_domain *domain,
3645 unsigned long iova, phys_addr_t hpa,
3646 size_t size, int iommu_prot, gfp_t gfp)
3647 {
3648 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3649 u64 max_addr;
3650 int prot = 0;
3651
3652 if (iommu_prot & IOMMU_READ)
3653 prot |= DMA_PTE_READ;
3654 if (iommu_prot & IOMMU_WRITE)
3655 prot |= DMA_PTE_WRITE;
3656 if (dmar_domain->set_pte_snp)
3657 prot |= DMA_PTE_SNP;
3658
3659 max_addr = iova + size;
3660 if (dmar_domain->max_addr < max_addr) {
3661 u64 end;
3662
3663 /* check if minimum agaw is sufficient for mapped address */
3664 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3665 if (end < max_addr) {
3666 pr_err("%s: iommu width (%d) is not "
3667 "sufficient for the mapped address (%llx)\n",
3668 __func__, dmar_domain->gaw, max_addr);
3669 return -EFAULT;
3670 }
3671 dmar_domain->max_addr = max_addr;
3672 }
3673 /* Round up size to next multiple of PAGE_SIZE, if it and
3674 the low bits of hpa would take us onto the next page */
3675 size = aligned_nrpages(hpa, size);
3676 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3677 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3678 }
3679
3680 static int intel_iommu_map_pages(struct iommu_domain *domain,
3681 unsigned long iova, phys_addr_t paddr,
3682 size_t pgsize, size_t pgcount,
3683 int prot, gfp_t gfp, size_t *mapped)
3684 {
3685 unsigned long pgshift = __ffs(pgsize);
3686 size_t size = pgcount << pgshift;
3687 int ret;
3688
3689 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3690 return -EINVAL;
3691
3692 if (!IS_ALIGNED(iova | paddr, pgsize))
3693 return -EINVAL;
3694
3695 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3696 if (!ret && mapped)
3697 *mapped = size;
3698
3699 return ret;
3700 }
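
/*
 * Worked example (editorial sketch, not part of the original driver): for a
 * request with pgsize == SZ_2M and pgcount == 4, pgshift == __ffs(SZ_2M) == 21
 * and size == 4 << 21 == 8 MiB. Both iova and paddr must be 2 MiB aligned, and
 * on success *mapped reports the full 8 MiB even though intel_iommu_map()
 * internally works in VTD_PAGE_SIZE units.
 */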
3701
3702 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3703 unsigned long iova, size_t size,
3704 struct iommu_iotlb_gather *gather)
3705 {
3706 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3707 unsigned long start_pfn, last_pfn;
3708 int level = 0;
3709
3710 /* Cope with horrid API which requires us to unmap more than the
3711 size argument if it happens to be a large-page mapping. */
3712 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3713 &level, GFP_ATOMIC)))
3714 return 0;
3715
3716 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3717 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3718
3719 start_pfn = iova >> VTD_PAGE_SHIFT;
3720 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3721
3722 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3723
3724 if (dmar_domain->max_addr == iova + size)
3725 dmar_domain->max_addr = iova;
3726
3727 /*
3728 * We do not use page-selective IOTLB invalidation in the flush
3729 * queue, so there is no need to track pages and sync the IOTLB.
3730 */
3731 if (!iommu_iotlb_gather_queued(gather))
3732 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3733
3734 return size;
3735 }
3736
3737 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3738 unsigned long iova,
3739 size_t pgsize, size_t pgcount,
3740 struct iommu_iotlb_gather *gather)
3741 {
3742 unsigned long pgshift = __ffs(pgsize);
3743 size_t size = pgcount << pgshift;
3744
3745 return intel_iommu_unmap(domain, iova, size, gather);
3746 }
3747
3748 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3749 struct iommu_iotlb_gather *gather)
3750 {
3751 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3752 gather->end, list_empty(&gather->freelist));
3753 iommu_put_pages_list(&gather->freelist);
3754 }
3755
3756 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3757 dma_addr_t iova)
3758 {
3759 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3760 struct dma_pte *pte;
3761 int level = 0;
3762 u64 phys = 0;
3763
3764 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3765 GFP_ATOMIC);
3766 if (pte && dma_pte_present(pte))
3767 phys = dma_pte_addr(pte) +
3768 (iova & (BIT_MASK(level_to_offset_bits(level) +
3769 VTD_PAGE_SHIFT) - 1));
3770
3771 return phys;
3772 }
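
/*
 * Worked example (editorial sketch, not part of the original driver): if the
 * walk above lands on a 2 MiB superpage PTE, pfn_to_dma_pte() returns
 * level == 2 and level_to_offset_bits(2) == 9, so the mask keeps the low
 * 9 + 12 == 21 bits of the IOVA. The result is the superpage base from
 * dma_pte_addr() plus that 21-bit offset.
 */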
3773
3774 static bool domain_support_force_snooping(struct dmar_domain *domain)
3775 {
3776 struct device_domain_info *info;
3777 bool support = true;
3778
3779 assert_spin_locked(&domain->lock);
3780 list_for_each_entry(info, &domain->devices, link) {
3781 if (!ecap_sc_support(info->iommu->ecap)) {
3782 support = false;
3783 break;
3784 }
3785 }
3786
3787 return support;
3788 }
3789
3790 static void domain_set_force_snooping(struct dmar_domain *domain)
3791 {
3792 struct device_domain_info *info;
3793
3794 assert_spin_locked(&domain->lock);
3795 /*
3796 * Second level page table supports per-PTE snoop control. The
3797 * iommu_map() interface will handle this by setting SNP bit.
3798 */
3799 if (!domain->use_first_level) {
3800 domain->set_pte_snp = true;
3801 return;
3802 }
3803
3804 list_for_each_entry(info, &domain->devices, link)
3805 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3806 IOMMU_NO_PASID);
3807 }
3808
3809 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3810 {
3811 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3812 unsigned long flags;
3813
3814 if (dmar_domain->force_snooping)
3815 return true;
3816
3817 spin_lock_irqsave(&dmar_domain->lock, flags);
3818 if (!domain_support_force_snooping(dmar_domain) ||
3819 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3820 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3821 return false;
3822 }
3823
3824 domain_set_force_snooping(dmar_domain);
3825 dmar_domain->force_snooping = true;
3826 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3827
3828 return true;
3829 }
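
/*
 * Usage sketch (editorial, not part of the original driver): a consumer such
 * as VFIO/iommufd that requires coherent DMA regardless of a device's
 * no-snoop attribute is assumed to do roughly
 *
 *	if (!domain->ops->enforce_cache_coherency(domain))
 *		return -EINVAL;
 *
 * before establishing any mapping; once a second-level domain has mappings,
 * has_mappings is set and the request above is refused.
 */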
3830
3831 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3832 {
3833 struct device_domain_info *info = dev_iommu_priv_get(dev);
3834
3835 switch (cap) {
3836 case IOMMU_CAP_CACHE_COHERENCY:
3837 case IOMMU_CAP_DEFERRED_FLUSH:
3838 return true;
3839 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3840 return dmar_platform_optin();
3841 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3842 return ecap_sc_support(info->iommu->ecap);
3843 case IOMMU_CAP_DIRTY_TRACKING:
3844 return ssads_supported(info->iommu);
3845 default:
3846 return false;
3847 }
3848 }
3849
3850 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3851 {
3852 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3853 struct device_domain_info *info;
3854 struct intel_iommu *iommu;
3855 u8 bus, devfn;
3856 int ret;
3857
3858 iommu = device_lookup_iommu(dev, &bus, &devfn);
3859 if (!iommu || !iommu->iommu.ops)
3860 return ERR_PTR(-ENODEV);
3861
3862 info = kzalloc(sizeof(*info), GFP_KERNEL);
3863 if (!info)
3864 return ERR_PTR(-ENOMEM);
3865
3866 if (dev_is_real_dma_subdevice(dev)) {
3867 info->bus = pdev->bus->number;
3868 info->devfn = pdev->devfn;
3869 info->segment = pci_domain_nr(pdev->bus);
3870 } else {
3871 info->bus = bus;
3872 info->devfn = devfn;
3873 info->segment = iommu->segment;
3874 }
3875
3876 info->dev = dev;
3877 info->iommu = iommu;
3878 if (dev_is_pci(dev)) {
3879 if (ecap_dev_iotlb_support(iommu->ecap) &&
3880 pci_ats_supported(pdev) &&
3881 dmar_ats_supported(pdev, iommu)) {
3882 info->ats_supported = 1;
3883 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3884
3885 /*
3886 * For an IOMMU that supports device IOTLB throttling
3887 * (DIT), we assign the PFSID to the invalidation
3888 * descriptor of a VF so that the IOMMU HW can gauge
3889 * queue depth at the PF level. If DIT is not set, the
3890 * PFSID field is treated as reserved and must be 0.
3891 */
3892 if (ecap_dit(iommu->ecap))
3893 info->pfsid = pci_dev_id(pci_physfn(pdev));
3894 info->ats_qdep = pci_ats_queue_depth(pdev);
3895 }
3896 if (sm_supported(iommu)) {
3897 if (pasid_supported(iommu)) {
3898 int features = pci_pasid_features(pdev);
3899
3900 if (features >= 0)
3901 info->pasid_supported = features | 1;
3902 }
3903
3904 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3905 pci_pri_supported(pdev))
3906 info->pri_supported = 1;
3907 }
3908 }
3909
3910 dev_iommu_priv_set(dev, info);
3911 if (pdev && pci_ats_supported(pdev)) {
3912 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3913 ret = device_rbtree_insert(iommu, info);
3914 if (ret)
3915 goto free;
3916 }
3917
3918 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3919 ret = intel_pasid_alloc_table(dev);
3920 if (ret) {
3921 dev_err(dev, "PASID table allocation failed\n");
3922 goto clear_rbtree;
3923 }
3924
3925 if (!context_copied(iommu, info->bus, info->devfn)) {
3926 ret = intel_pasid_setup_sm_context(dev);
3927 if (ret)
3928 goto free_table;
3929 }
3930 }
3931
3932 intel_iommu_debugfs_create_dev(info);
3933
3934 /*
3935 * The PCIe spec, in its wisdom, declares that the behaviour of the
3936 * device is undefined if you enable PASID support after ATS support.
3937 * So always enable PASID support on devices which have it, even if
3938 * we can't yet know if we're ever going to use it.
3939 */
3940 if (info->pasid_supported &&
3941 !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3942 info->pasid_enabled = 1;
3943
3944 return &iommu->iommu;
3945 free_table:
3946 intel_pasid_free_table(dev);
3947 clear_rbtree:
3948 device_rbtree_remove(info);
3949 free:
3950 kfree(info);
3951
3952 return ERR_PTR(ret);
3953 }
3954
3955 static void intel_iommu_release_device(struct device *dev)
3956 {
3957 struct device_domain_info *info = dev_iommu_priv_get(dev);
3958 struct intel_iommu *iommu = info->iommu;
3959
3960 if (info->pasid_enabled) {
3961 pci_disable_pasid(to_pci_dev(dev));
3962 info->pasid_enabled = 0;
3963 }
3964
3965 mutex_lock(&iommu->iopf_lock);
3966 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3967 device_rbtree_remove(info);
3968 mutex_unlock(&iommu->iopf_lock);
3969
3970 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3971 !context_copied(iommu, info->bus, info->devfn))
3972 intel_pasid_teardown_sm_context(dev);
3973
3974 intel_pasid_free_table(dev);
3975 intel_iommu_debugfs_remove_dev(info);
3976 kfree(info);
3977 set_dma_ops(dev, NULL);
3978 }
3979
3980 static void intel_iommu_get_resv_regions(struct device *device,
3981 struct list_head *head)
3982 {
3983 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3984 struct iommu_resv_region *reg;
3985 struct dmar_rmrr_unit *rmrr;
3986 struct device *i_dev;
3987 int i;
3988
3989 rcu_read_lock();
3990 for_each_rmrr_units(rmrr) {
3991 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3992 i, i_dev) {
3993 struct iommu_resv_region *resv;
3994 enum iommu_resv_type type;
3995 size_t length;
3996
3997 if (i_dev != device &&
3998 !is_downstream_to_pci_bridge(device, i_dev))
3999 continue;
4000
4001 length = rmrr->end_address - rmrr->base_address + 1;
4002
4003 type = device_rmrr_is_relaxable(device) ?
4004 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4005
4006 resv = iommu_alloc_resv_region(rmrr->base_address,
4007 length, prot, type,
4008 GFP_ATOMIC);
4009 if (!resv)
4010 break;
4011
4012 list_add_tail(&resv->list, head);
4013 }
4014 }
4015 rcu_read_unlock();
4016
4017 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4018 if (dev_is_pci(device)) {
4019 struct pci_dev *pdev = to_pci_dev(device);
4020
4021 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4022 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4023 IOMMU_RESV_DIRECT_RELAXABLE,
4024 GFP_KERNEL);
4025 if (reg)
4026 list_add_tail(&reg->list, head);
4027 }
4028 }
4029 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4030
4031 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4032 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4033 0, IOMMU_RESV_MSI, GFP_KERNEL);
4034 if (!reg)
4035 return;
4036 list_add_tail(&reg->list, head);
4037 }
4038
4039 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4040 {
4041 if (dev_is_pci(dev))
4042 return pci_device_group(dev);
4043 return generic_device_group(dev);
4044 }
4045
4046 static int intel_iommu_enable_sva(struct device *dev)
4047 {
4048 struct device_domain_info *info = dev_iommu_priv_get(dev);
4049 struct intel_iommu *iommu;
4050
4051 if (!info || dmar_disabled)
4052 return -EINVAL;
4053
4054 iommu = info->iommu;
4055 if (!iommu)
4056 return -EINVAL;
4057
4058 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4059 return -ENODEV;
4060
4061 if (!info->pasid_enabled || !info->ats_enabled)
4062 return -EINVAL;
4063
4064 /*
4065 * Devices with device-specific I/O fault handling should not
4066 * support PCI/PRI. The IOMMU side has no means to check the
4067 * capability of device-specific IOPF. Therefore, the IOMMU can
4068 * only assume that if the device driver enables SVA on a non-PRI
4069 * device, it will handle IOPF in its own way.
4070 */
4071 if (!info->pri_supported)
4072 return 0;
4073
4074 /* Devices supporting PRI should have it enabled. */
4075 if (!info->pri_enabled)
4076 return -EINVAL;
4077
4078 return 0;
4079 }
4080
4081 static int context_flip_pri(struct device_domain_info *info, bool enable)
4082 {
4083 struct intel_iommu *iommu = info->iommu;
4084 u8 bus = info->bus, devfn = info->devfn;
4085 struct context_entry *context;
4086 u16 did;
4087
4088 spin_lock(&iommu->lock);
4089 if (context_copied(iommu, bus, devfn)) {
4090 spin_unlock(&iommu->lock);
4091 return -EINVAL;
4092 }
4093
4094 context = iommu_context_addr(iommu, bus, devfn, false);
4095 if (!context || !context_present(context)) {
4096 spin_unlock(&iommu->lock);
4097 return -ENODEV;
4098 }
4099 did = context_domain_id(context);
4100
4101 if (enable)
4102 context_set_sm_pre(context);
4103 else
4104 context_clear_sm_pre(context);
4105
4106 if (!ecap_coherent(iommu->ecap))
4107 clflush_cache_range(context, sizeof(*context));
4108 intel_context_flush_present(info, context, did, true);
4109 spin_unlock(&iommu->lock);
4110
4111 return 0;
4112 }
4113
4114 static int intel_iommu_enable_iopf(struct device *dev)
4115 {
4116 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4117 struct device_domain_info *info = dev_iommu_priv_get(dev);
4118 struct intel_iommu *iommu;
4119 int ret;
4120
4121 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4122 return -ENODEV;
4123
4124 if (info->pri_enabled)
4125 return -EBUSY;
4126
4127 iommu = info->iommu;
4128 if (!iommu)
4129 return -EINVAL;
4130
4131 /* PASID is required in PRG Response Message. */
4132 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4133 return -EINVAL;
4134
4135 ret = pci_reset_pri(pdev);
4136 if (ret)
4137 return ret;
4138
4139 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4140 if (ret)
4141 return ret;
4142
4143 ret = context_flip_pri(info, true);
4144 if (ret)
4145 goto err_remove_device;
4146
4147 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4148 if (ret)
4149 goto err_clear_pri;
4150
4151 info->pri_enabled = 1;
4152
4153 return 0;
4154 err_clear_pri:
4155 context_flip_pri(info, false);
4156 err_remove_device:
4157 iopf_queue_remove_device(iommu->iopf_queue, dev);
4158
4159 return ret;
4160 }
4161
4162 static int intel_iommu_disable_iopf(struct device *dev)
4163 {
4164 struct device_domain_info *info = dev_iommu_priv_get(dev);
4165 struct intel_iommu *iommu = info->iommu;
4166
4167 if (!info->pri_enabled)
4168 return -EINVAL;
4169
4170 /* Disable new PRI reception: */
4171 context_flip_pri(info, false);
4172
4173 /*
4174 * Remove device from fault queue and acknowledge all outstanding
4175 * PRQs to the device:
4176 */
4177 iopf_queue_remove_device(iommu->iopf_queue, dev);
4178
4179 /*
4180 * The PCIe spec states that by clearing the PRI enable bit, the Page
4181 * Request Interface will not issue new page requests, but may still
4182 * have outstanding page requests that have been transmitted or are
4183 * queued for transmission. This is supposed to be called after the
4184 * device driver has stopped DMA, all PASIDs have been unbound and
4185 * the outstanding PRQs have been drained.
4186 */
4187 pci_disable_pri(to_pci_dev(dev));
4188 info->pri_enabled = 0;
4189
4190 return 0;
4191 }
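
/*
 * Expected ordering (editorial sketch of the comment above, not part of the
 * original driver): a device driver tearing down I/O page fault usage is
 * assumed to proceed roughly as
 *
 *	// stop issuing DMA that can fault
 *	// unbind all PASIDs / detach the faulting domains, draining PRQs
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 *
 * so that no new page requests arrive once PRI is cleared here.
 */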
4192
4193 static int
4194 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4195 {
4196 switch (feat) {
4197 case IOMMU_DEV_FEAT_IOPF:
4198 return intel_iommu_enable_iopf(dev);
4199
4200 case IOMMU_DEV_FEAT_SVA:
4201 return intel_iommu_enable_sva(dev);
4202
4203 default:
4204 return -ENODEV;
4205 }
4206 }
4207
4208 static int
4209 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4210 {
4211 switch (feat) {
4212 case IOMMU_DEV_FEAT_IOPF:
4213 return intel_iommu_disable_iopf(dev);
4214
4215 case IOMMU_DEV_FEAT_SVA:
4216 return 0;
4217
4218 default:
4219 return -ENODEV;
4220 }
4221 }
4222
4223 static bool intel_iommu_is_attach_deferred(struct device *dev)
4224 {
4225 struct device_domain_info *info = dev_iommu_priv_get(dev);
4226
4227 return translation_pre_enabled(info->iommu) && !info->domain;
4228 }
4229
4230 /*
4231 * Check that the device does not live on an external facing PCI port that is
4232 * marked as untrusted. Such devices should not be able to apply quirks and
4233 * thus not be able to bypass the IOMMU restrictions.
4234 */
4235 static bool risky_device(struct pci_dev *pdev)
4236 {
4237 if (pdev->untrusted) {
4238 pci_info(pdev,
4239 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4240 pdev->vendor, pdev->device);
4241 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4242 return true;
4243 }
4244 return false;
4245 }
4246
4247 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4248 unsigned long iova, size_t size)
4249 {
4250 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4251
4252 return 0;
4253 }
4254
4255 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4256 struct iommu_domain *domain)
4257 {
4258 struct device_domain_info *info = dev_iommu_priv_get(dev);
4259 struct dev_pasid_info *curr, *dev_pasid = NULL;
4260 struct intel_iommu *iommu = info->iommu;
4261 struct dmar_domain *dmar_domain;
4262 unsigned long flags;
4263
4264 if (domain->type == IOMMU_DOMAIN_IDENTITY) {
4265 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4266 return;
4267 }
4268
4269 dmar_domain = to_dmar_domain(domain);
4270 spin_lock_irqsave(&dmar_domain->lock, flags);
4271 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4272 if (curr->dev == dev && curr->pasid == pasid) {
4273 list_del(&curr->link_domain);
4274 dev_pasid = curr;
4275 break;
4276 }
4277 }
4278 WARN_ON_ONCE(!dev_pasid);
4279 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4280
4281 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4282 domain_detach_iommu(dmar_domain, iommu);
4283 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4284 kfree(dev_pasid);
4285 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4286 intel_drain_pasid_prq(dev, pasid);
4287 }
4288
4289 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4290 struct device *dev, ioasid_t pasid)
4291 {
4292 struct device_domain_info *info = dev_iommu_priv_get(dev);
4293 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4294 struct intel_iommu *iommu = info->iommu;
4295 struct dev_pasid_info *dev_pasid;
4296 unsigned long flags;
4297 int ret;
4298
4299 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4300 return -EOPNOTSUPP;
4301
4302 if (domain->dirty_ops)
4303 return -EINVAL;
4304
4305 if (context_copied(iommu, info->bus, info->devfn))
4306 return -EBUSY;
4307
4308 ret = prepare_domain_attach_device(domain, dev);
4309 if (ret)
4310 return ret;
4311
4312 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4313 if (!dev_pasid)
4314 return -ENOMEM;
4315
4316 ret = domain_attach_iommu(dmar_domain, iommu);
4317 if (ret)
4318 goto out_free;
4319
4320 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4321 if (ret)
4322 goto out_detach_iommu;
4323
4324 if (dmar_domain->use_first_level)
4325 ret = domain_setup_first_level(iommu, dmar_domain,
4326 dev, pasid);
4327 else
4328 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4329 dev, pasid);
4330 if (ret)
4331 goto out_unassign_tag;
4332
4333 dev_pasid->dev = dev;
4334 dev_pasid->pasid = pasid;
4335 spin_lock_irqsave(&dmar_domain->lock, flags);
4336 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4337 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4338
4339 if (domain->type & __IOMMU_DOMAIN_PAGING)
4340 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4341
4342 return 0;
4343 out_unassign_tag:
4344 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4345 out_detach_iommu:
4346 domain_detach_iommu(dmar_domain, iommu);
4347 out_free:
4348 kfree(dev_pasid);
4349 return ret;
4350 }
4351
4352 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4353 {
4354 struct device_domain_info *info = dev_iommu_priv_get(dev);
4355 struct intel_iommu *iommu = info->iommu;
4356 struct iommu_hw_info_vtd *vtd;
4357
4358 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4359 if (!vtd)
4360 return ERR_PTR(-ENOMEM);
4361
4362 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4363 vtd->cap_reg = iommu->cap;
4364 vtd->ecap_reg = iommu->ecap;
4365 *length = sizeof(*vtd);
4366 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4367 return vtd;
4368 }
4369
4370 /*
4371 * Set dirty tracking for the device list of a domain. The caller must
4372 * hold the domain->lock when calling it.
4373 */
4374 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4375 {
4376 struct device_domain_info *info;
4377 int ret = 0;
4378
4379 list_for_each_entry(info, devices, link) {
4380 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4381 IOMMU_NO_PASID, enable);
4382 if (ret)
4383 break;
4384 }
4385
4386 return ret;
4387 }
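
/*
 * Usage sketch (editorial, not part of the original driver): the helper walks
 * the domain's device list, so callers wrap it in the domain lock, e.g.
 *
 *	spin_lock(&dmar_domain->lock);
 *	ret = device_set_dirty_tracking(&dmar_domain->devices, true);
 *	spin_unlock(&dmar_domain->lock);
 *
 * which is what intel_iommu_set_dirty_tracking() below does before updating
 * dmar_domain->dirty_tracking.
 */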
4388
4389 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4390 bool enable)
4391 {
4392 struct dmar_domain *s1_domain;
4393 unsigned long flags;
4394 int ret;
4395
4396 spin_lock(&domain->s1_lock);
4397 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4398 spin_lock_irqsave(&s1_domain->lock, flags);
4399 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4400 spin_unlock_irqrestore(&s1_domain->lock, flags);
4401 if (ret)
4402 goto err_unwind;
4403 }
4404 spin_unlock(&domain->s1_lock);
4405 return 0;
4406
4407 err_unwind:
4408 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4409 spin_lock_irqsave(&s1_domain->lock, flags);
4410 device_set_dirty_tracking(&s1_domain->devices,
4411 domain->dirty_tracking);
4412 spin_unlock_irqrestore(&s1_domain->lock, flags);
4413 }
4414 spin_unlock(&domain->s1_lock);
4415 return ret;
4416 }
4417
4418 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4419 bool enable)
4420 {
4421 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4422 int ret;
4423
4424 spin_lock(&dmar_domain->lock);
4425 if (dmar_domain->dirty_tracking == enable)
4426 goto out_unlock;
4427
4428 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4429 if (ret)
4430 goto err_unwind;
4431
4432 if (dmar_domain->nested_parent) {
4433 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4434 if (ret)
4435 goto err_unwind;
4436 }
4437
4438 dmar_domain->dirty_tracking = enable;
4439 out_unlock:
4440 spin_unlock(&dmar_domain->lock);
4441
4442 return 0;
4443
4444 err_unwind:
4445 device_set_dirty_tracking(&dmar_domain->devices,
4446 dmar_domain->dirty_tracking);
4447 spin_unlock(&dmar_domain->lock);
4448 return ret;
4449 }
4450
4451 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4452 unsigned long iova, size_t size,
4453 unsigned long flags,
4454 struct iommu_dirty_bitmap *dirty)
4455 {
4456 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4457 unsigned long end = iova + size - 1;
4458 unsigned long pgsize;
4459
4460 /*
4461 * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4462 * an IOVA bitmap set in order to clear dirty bits in all PTEs that
4463 * might have been set before dirty tracking was stopped. This ensures
4464 * that we never inherit dirtied bits from a previous cycle.
4465 */
4466 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4467 return -EINVAL;
4468
4469 do {
4470 struct dma_pte *pte;
4471 int lvl = 0;
4472
4473 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4474 GFP_ATOMIC);
4475 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4476 if (!pte || !dma_pte_present(pte)) {
4477 iova += pgsize;
4478 continue;
4479 }
4480
4481 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4482 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4483 iova += pgsize;
4484 } while (iova < end);
4485
4486 return 0;
4487 }
4488
4489 static const struct iommu_dirty_ops intel_dirty_ops = {
4490 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4491 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4492 };
4493
4494 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4495 {
4496 struct device_domain_info *info = dev_iommu_priv_get(dev);
4497 struct intel_iommu *iommu = info->iommu;
4498 struct context_entry *context;
4499
4500 spin_lock(&iommu->lock);
4501 context = iommu_context_addr(iommu, bus, devfn, 1);
4502 if (!context) {
4503 spin_unlock(&iommu->lock);
4504 return -ENOMEM;
4505 }
4506
4507 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4508 spin_unlock(&iommu->lock);
4509 return 0;
4510 }
4511
4512 copied_context_tear_down(iommu, context, bus, devfn);
4513 context_clear_entry(context);
4514 context_set_domain_id(context, FLPT_DEFAULT_DID);
4515
4516 /*
4517 * In pass-through mode, AW must be programmed to indicate the largest
4518 * AGAW value supported by hardware, and ASR is ignored by hardware.
4519 */
4520 context_set_address_width(context, iommu->msagaw);
4521 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4522 context_set_fault_enable(context);
4523 context_set_present(context);
4524 if (!ecap_coherent(iommu->ecap))
4525 clflush_cache_range(context, sizeof(*context));
4526 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4527 spin_unlock(&iommu->lock);
4528
4529 return 0;
4530 }
4531
4532 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4533 {
4534 struct device *dev = data;
4535
4536 if (dev != &pdev->dev)
4537 return 0;
4538
4539 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4540 }
4541
4542 static int device_setup_pass_through(struct device *dev)
4543 {
4544 struct device_domain_info *info = dev_iommu_priv_get(dev);
4545
4546 if (!dev_is_pci(dev))
4547 return context_setup_pass_through(dev, info->bus, info->devfn);
4548
4549 return pci_for_each_dma_alias(to_pci_dev(dev),
4550 context_setup_pass_through_cb, dev);
4551 }
4552
4553 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4554 {
4555 struct device_domain_info *info = dev_iommu_priv_get(dev);
4556 struct intel_iommu *iommu = info->iommu;
4557 int ret;
4558
4559 device_block_translation(dev);
4560
4561 if (dev_is_real_dma_subdevice(dev))
4562 return 0;
4563
4564 if (sm_supported(iommu)) {
4565 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4566 if (!ret)
4567 iommu_enable_pci_caps(info);
4568 } else {
4569 ret = device_setup_pass_through(dev);
4570 }
4571
4572 return ret;
4573 }
4574
4575 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4576 struct device *dev, ioasid_t pasid)
4577 {
4578 struct device_domain_info *info = dev_iommu_priv_get(dev);
4579 struct intel_iommu *iommu = info->iommu;
4580
4581 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4582 return -EOPNOTSUPP;
4583
4584 return intel_pasid_setup_pass_through(iommu, dev, pasid);
4585 }
4586
4587 static struct iommu_domain identity_domain = {
4588 .type = IOMMU_DOMAIN_IDENTITY,
4589 .ops = &(const struct iommu_domain_ops) {
4590 .attach_dev = identity_domain_attach_dev,
4591 .set_dev_pasid = identity_domain_set_dev_pasid,
4592 },
4593 };
4594
4595 const struct iommu_ops intel_iommu_ops = {
4596 .blocked_domain = &blocking_domain,
4597 .release_domain = &blocking_domain,
4598 .identity_domain = &identity_domain,
4599 .capable = intel_iommu_capable,
4600 .hw_info = intel_iommu_hw_info,
4601 .domain_alloc = intel_iommu_domain_alloc,
4602 .domain_alloc_user = intel_iommu_domain_alloc_user,
4603 .domain_alloc_sva = intel_svm_domain_alloc,
4604 .probe_device = intel_iommu_probe_device,
4605 .release_device = intel_iommu_release_device,
4606 .get_resv_regions = intel_iommu_get_resv_regions,
4607 .device_group = intel_iommu_device_group,
4608 .dev_enable_feat = intel_iommu_dev_enable_feat,
4609 .dev_disable_feat = intel_iommu_dev_disable_feat,
4610 .is_attach_deferred = intel_iommu_is_attach_deferred,
4611 .def_domain_type = device_def_domain_type,
4612 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4613 .pgsize_bitmap = SZ_4K,
4614 #ifdef CONFIG_INTEL_IOMMU_SVM
4615 .page_response = intel_svm_page_response,
4616 #endif
4617 .default_domain_ops = &(const struct iommu_domain_ops) {
4618 .attach_dev = intel_iommu_attach_device,
4619 .set_dev_pasid = intel_iommu_set_dev_pasid,
4620 .map_pages = intel_iommu_map_pages,
4621 .unmap_pages = intel_iommu_unmap_pages,
4622 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4623 .flush_iotlb_all = intel_flush_iotlb_all,
4624 .iotlb_sync = intel_iommu_tlb_sync,
4625 .iova_to_phys = intel_iommu_iova_to_phys,
4626 .free = intel_iommu_domain_free,
4627 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4628 }
4629 };
4630
4631 static void quirk_iommu_igfx(struct pci_dev *dev)
4632 {
4633 if (risky_device(dev))
4634 return;
4635
4636 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4637 disable_igfx_iommu = 1;
4638 }
4639
4640 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4648
4649 /* Broadwell igfx malfunctions with dmar */
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4674
4675 static void quirk_iommu_rwbf(struct pci_dev *dev)
4676 {
4677 if (risky_device(dev))
4678 return;
4679
4680 /*
4681 * Mobile 4 Series Chipset neglects to set RWBF capability,
4682 * but needs it. Same seems to hold for the desktop versions.
4683 */
4684 pci_info(dev, "Forcing write-buffer flush capability\n");
4685 rwbf_quirk = 1;
4686 }
4687
4688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4695
4696 #define GGC 0x52
4697 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4698 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4699 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4700 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4701 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4702 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4703 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4704 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4705
4706 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4707 {
4708 unsigned short ggc;
4709
4710 if (risky_device(dev))
4711 return;
4712
4713 if (pci_read_config_word(dev, GGC, &ggc))
4714 return;
4715
4716 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4717 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4718 disable_igfx_iommu = 1;
4719 } else if (!disable_igfx_iommu) {
4720 /* we have to ensure the gfx device is idle before we flush */
4721 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4722 iommu_set_dma_strict();
4723 }
4724 }
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4729
4730 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4731 {
4732 unsigned short ver;
4733
4734 if (!IS_GFX_DEVICE(dev))
4735 return;
4736
4737 ver = (dev->device >> 8) & 0xff;
4738 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4739 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4740 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4741 return;
4742
4743 if (risky_device(dev))
4744 return;
4745
4746 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4747 iommu_skip_te_disable = 1;
4748 }
4749 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4750
4751 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4752 ISOCH DMAR unit for the Azalia sound device, but not give it any
4753 TLB entries, which causes it to deadlock. Check for that. We do
4754 this in a function called from init_dmars(), instead of in a PCI
4755 quirk, because we don't want to print the obnoxious "BIOS broken"
4756 message if VT-d is actually disabled.
4757 */
4758 static void __init check_tylersburg_isoch(void)
4759 {
4760 struct pci_dev *pdev;
4761 uint32_t vtisochctrl;
4762
4763 /* If there's no Azalia in the system anyway, forget it. */
4764 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4765 if (!pdev)
4766 return;
4767
4768 if (risky_device(pdev)) {
4769 pci_dev_put(pdev);
4770 return;
4771 }
4772
4773 pci_dev_put(pdev);
4774
4775 /* System Management Registers. Might be hidden, in which case
4776 we can't do the sanity check. But that's OK, because the
4777 known-broken BIOSes _don't_ actually hide it, so far. */
4778 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4779 if (!pdev)
4780 return;
4781
4782 if (risky_device(pdev)) {
4783 pci_dev_put(pdev);
4784 return;
4785 }
4786
4787 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4788 pci_dev_put(pdev);
4789 return;
4790 }
4791
4792 pci_dev_put(pdev);
4793
4794 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4795 if (vtisochctrl & 1)
4796 return;
4797
4798 /* Drop all bits other than the number of TLB entries */
4799 vtisochctrl &= 0x1c;
4800
4801 /* If we have the recommended number of TLB entries (16), fine. */
4802 if (vtisochctrl == 0x10)
4803 return;
4804
4805 /* Zero TLB entries? You get to ride the short bus to school. */
4806 if (!vtisochctrl) {
4807 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4808 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4809 dmi_get_system_info(DMI_BIOS_VENDOR),
4810 dmi_get_system_info(DMI_BIOS_VERSION),
4811 dmi_get_system_info(DMI_PRODUCT_VERSION));
4812 iommu_identity_mapping |= IDENTMAP_AZALIA;
4813 return;
4814 }
4815
4816 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4817 vtisochctrl);
4818 }
4819
4820 /*
4821 * Here we deal with a device TLB defect where the device may inadvertently
4822 * issue an ATS invalidation completion before posted writes that were
4823 * initiated with a translated address and used translations matching the
4824 * invalidation address range, violating the invalidation completion ordering.
4825 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4826 * is vulnerable to this defect. In other words, any dTLB invalidation that is
4827 * not initiated under the control of the trusted/privileged host device
4828 * driver must use this quirk.
4829 * Device TLBs are invalidated under the following six conditions:
4830 * 1. Device driver does DMA API unmap IOVA
4831 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4832 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4833 * exit_mmap() due to crash
4834 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4835 * VM has to free pages that were unmapped
4836 * 5. Userspace driver unmaps a DMA buffer
4837 * 6. Cache invalidation in vSVA usage (upcoming)
4838 *
4839 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4840 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4841 * invalidate TLB the same way as normal user unmap which will use this quirk.
4842 * The dTLB invalidation after PASID cache flush does not need this quirk.
4843 *
4844 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4845 */
4846 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4847 unsigned long address, unsigned long mask,
4848 u32 pasid, u16 qdep)
4849 {
4850 u16 sid;
4851
4852 if (likely(!info->dtlb_extra_inval))
4853 return;
4854
4855 sid = PCI_DEVID(info->bus, info->devfn);
4856 if (pasid == IOMMU_NO_PASID) {
4857 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4858 qdep, address, mask);
4859 } else {
4860 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4861 pasid, qdep, address, mask);
4862 }
4863 }
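
/*
 * Example (editorial sketch, not part of the original driver): a caller that
 * has just issued a regular device IOTLB invalidation for an affected device
 * would follow it with
 *
 *	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
 *
 * which is a no-op unless info->dtlb_extra_inval was set at probe time via
 * dev_needs_extra_dtlb_flush().
 */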
4864
4865 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4866
4867 /*
4868 * Function to submit a command to the enhanced command interface. The
4869 * valid enhanced command descriptions are defined in Table 47 of the
4870 * VT-d spec. The VT-d hardware implementation may support some but not
4871 * all commands, which can be determined by checking the Enhanced
4872 * Command Capability Register.
4873 *
4874 * Return values:
4875 * - 0: Command successful without any error;
4876 * - Negative: software error value;
4877 * - Nonzero positive: failure status code defined in Table 48.
4878 */
4879 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4880 {
4881 unsigned long flags;
4882 u64 res;
4883 int ret;
4884
4885 if (!cap_ecmds(iommu->cap))
4886 return -ENODEV;
4887
4888 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4889
4890 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4891 if (res & DMA_ECMD_ECRSP_IP) {
4892 ret = -EBUSY;
4893 goto err;
4894 }
4895
4896 /*
4897 * Unconditionally write the operand B, because
4898 * - There is no side effect if an ecmd doesn't require an
4899 * operand B, but we set the register to some value.
4900 * - It's not invoked in any critical path. The extra MMIO
4901 * write doesn't bring any performance concerns.
4902 */
4903 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4904 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4905
4906 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4907 !(res & DMA_ECMD_ECRSP_IP), res);
4908
4909 if (res & DMA_ECMD_ECRSP_IP) {
4910 ret = -ETIMEDOUT;
4911 goto err;
4912 }
4913
4914 ret = ecmd_get_status_code(res);
4915 err:
4916 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4917
4918 return ret;
4919 }
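
/*
 * Usage sketch (editorial, not part of the original driver; the opcode below
 * is a placeholder): per the return convention documented above, callers are
 * expected to separate software errors from hardware status codes, roughly
 *
 *	ret = ecmd_submit_sync(iommu, SOME_DMA_ECMD_OPCODE, oa, 0);
 *	if (ret < 0)
 *		return ret;	// -ENODEV, -EBUSY or -ETIMEDOUT
 *	if (ret)
 *		return -EIO;	// status code from Table 48
 */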
4920