1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33
34 #define ROOT_SIZE VTD_PAGE_SIZE
35 #define CONTEXT_SIZE VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START (0xfee00000)
43 #define IOAPIC_RANGE_END (0xfeefffff)
44 #define IOVA_START_ADDR (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
56
57 static void __init check_tylersburg_isoch(void);
58 static int rwbf_quirk;
59
60 /*
61 * Set to 1 to panic the kernel if VT-d can't be successfully enabled
62 * (used when the kernel is launched with TXT).
63 */
64 static int force_on = 0;
65 static int intel_iommu_tboot_noforce;
66 static int no_platform_optin;
67
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69
70 /*
71 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72 * if marked present.
73 */
74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 if (!(re->lo & 1))
77 return 0;
78
79 return re->lo & VTD_PAGE_MASK;
80 }
81
82 /*
83 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84 * if marked present.
85 */
86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 if (!(re->hi & 1))
89 return 0;
90
91 return re->hi & VTD_PAGE_MASK;
92 }
93
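/*
 * RB-tree comparison helpers for the per-IOMMU device tree. Nodes are
 * keyed by PCI requester ID as composed by PCI_DEVID(bus, devfn).
 */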
94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 struct device_domain_info *info =
97 rb_entry(node, struct device_domain_info, node);
98 const u16 *rid_lhs = key;
99
100 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 return -1;
102
103 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 return 1;
105
106 return 0;
107 }
108
109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 struct device_domain_info *info =
112 rb_entry(lhs, struct device_domain_info, node);
113 u16 key = PCI_DEVID(info->bus, info->devfn);
114
115 return device_rid_cmp_key(&key, rhs);
116 }
117
118 /*
119 * Looks up an IOMMU-probed device using its source ID.
120 *
121 * Returns the pointer to the device if there is a match. Otherwise,
122 * returns NULL.
123 *
124 * Note that this helper doesn't guarantee that the device won't be
125 * released by the iommu subsystem after being returned. The caller
126 * should use its own synchronization mechanism to avoid the device
127 * being released during its use if that is possibly the case.
128 */
129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 struct device_domain_info *info = NULL;
132 struct rb_node *node;
133 unsigned long flags;
134
135 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 if (node)
138 info = rb_entry(node, struct device_domain_info, node);
139 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140
141 return info ? info->dev : NULL;
142 }
143
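/*
 * Insert a device into the per-IOMMU RB-tree. Returns -EEXIST (with a
 * warning) if an entry with the same requester ID already exists.
 */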
144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 struct device_domain_info *info)
146 {
147 struct rb_node *curr;
148 unsigned long flags;
149
150 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 if (WARN_ON(curr))
154 return -EEXIST;
155
156 return 0;
157 }
158
159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 struct intel_iommu *iommu = info->iommu;
162 unsigned long flags;
163
164 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 rb_erase(&info->node, &iommu->device_rbtree);
166 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168
169 struct dmar_rmrr_unit {
170 struct list_head list; /* list of rmrr units */
171 struct acpi_dmar_header *hdr; /* ACPI header */
172 u64 base_address; /* reserved base address*/
173 u64 end_address; /* reserved end address */
174 struct dmar_dev_scope *devices; /* target devices */
175 int devices_cnt; /* target device count */
176 };
177
178 struct dmar_atsr_unit {
179 struct list_head list; /* list of ATSR units */
180 struct acpi_dmar_header *hdr; /* ACPI header */
181 struct dmar_dev_scope *devices; /* target devices */
182 int devices_cnt; /* target device count */
183 u8 include_all:1; /* include all ports */
184 };
185
186 struct dmar_satc_unit {
187 struct list_head list; /* list of SATC units */
188 struct acpi_dmar_header *hdr; /* ACPI header */
189 struct dmar_dev_scope *devices; /* target devices */
190 struct intel_iommu *iommu; /* the corresponding iommu */
191 int devices_cnt; /* target device count */
192 u8 atc_required:1; /* ATS is required */
193 };
194
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198
199 #define for_each_rmrr_units(rmrr) \
200 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
213 static int disable_igfx_iommu;
214
215 #define IDENTMAP_AZALIA 4
216
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219
220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224
225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229
230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 u32 gsts;
233
234 gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 if (gsts & DMA_GSTS_TES)
236 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238
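/* Parse the intel_iommu= options from the kernel command line. */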
239 static int __init intel_iommu_setup(char *str)
240 {
241 if (!str)
242 return -EINVAL;
243
244 while (*str) {
245 if (!strncmp(str, "on", 2)) {
246 dmar_disabled = 0;
247 pr_info("IOMMU enabled\n");
248 } else if (!strncmp(str, "off", 3)) {
249 dmar_disabled = 1;
250 no_platform_optin = 1;
251 pr_info("IOMMU disabled\n");
252 } else if (!strncmp(str, "igfx_off", 8)) {
253 disable_igfx_iommu = 1;
254 pr_info("Disable GFX device mapping\n");
255 } else if (!strncmp(str, "forcedac", 8)) {
256 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 iommu_dma_forcedac = true;
258 } else if (!strncmp(str, "strict", 6)) {
259 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 iommu_set_dma_strict();
261 } else if (!strncmp(str, "sp_off", 6)) {
262 pr_info("Disable supported super page\n");
263 intel_iommu_superpage = 0;
264 } else if (!strncmp(str, "sm_on", 5)) {
265 pr_info("Enable scalable mode if hardware supports\n");
266 intel_iommu_sm = 1;
267 } else if (!strncmp(str, "sm_off", 6)) {
268 pr_info("Scalable mode is disallowed\n");
269 intel_iommu_sm = 0;
270 } else if (!strncmp(str, "tboot_noforce", 13)) {
271 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 intel_iommu_tboot_noforce = 1;
273 } else {
274 pr_notice("Unknown option - '%s'\n", str);
275 }
276
277 str += strcspn(str, ",");
278 while (*str == ',')
279 str++;
280 }
281
282 return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
285
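/* Check whether an IOVA pfn is covered by the domain's address width. */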
286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289
290 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
292
293 /*
294 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296 * the returned SAGAW.
297 */
298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 unsigned long fl_sagaw, sl_sagaw;
301
302 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 sl_sagaw = cap_sagaw(iommu->cap);
304
305 /* Second level only. */
306 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 return sl_sagaw;
308
309 /* First level only. */
310 if (!ecap_slts(iommu->ecap))
311 return fl_sagaw;
312
313 return fl_sagaw & sl_sagaw;
314 }
315
316 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
317 {
318 unsigned long sagaw;
319 int agaw;
320
321 sagaw = __iommu_calculate_sagaw(iommu);
322 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
323 if (test_bit(agaw, &sagaw))
324 break;
325 }
326
327 return agaw;
328 }
329
330 /*
331 * Calculate max SAGAW for each iommu.
332 */
333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337
338 /*
339 * Calculate the AGAW for each IOMMU.
340 * "SAGAW" may differ across IOMMUs, so use a default AGAW and fall
341 * back to a smaller supported AGAW for IOMMUs that don't support it.
342 */
343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
347
348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 return sm_supported(iommu) ?
351 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353
354 /* Return the super pagesize bitmap if supported. */
355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 unsigned long bitmap = 0;
358
359 /*
360 * 1-level super page supports page size of 2MiB, 2-level super page
361 * supports page size of both 2MiB and 1GiB.
362 */
363 if (domain->iommu_superpage == 1)
364 bitmap |= SZ_2M;
365 else if (domain->iommu_superpage == 2)
366 bitmap |= SZ_2M | SZ_1G;
367
368 return bitmap;
369 }
370
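/*
 * Return the context entry for bus/devfn, allocating the context table
 * page if @alloc is set and it is not yet present. In scalable mode the
 * lower and upper halves of the root entry each cover 128 devices.
 */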
371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 u8 devfn, int alloc)
373 {
374 struct root_entry *root = &iommu->root_entry[bus];
375 struct context_entry *context;
376 u64 *entry;
377
378 /*
379 * Unless the caller requested to allocate a new entry,
380 * returning a copied context entry makes no sense.
381 */
382 if (!alloc && context_copied(iommu, bus, devfn))
383 return NULL;
384
385 entry = &root->lo;
386 if (sm_supported(iommu)) {
387 if (devfn >= 0x80) {
388 devfn -= 0x80;
389 entry = &root->hi;
390 }
391 devfn *= 2;
392 }
393 if (*entry & 1)
394 context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 else {
396 unsigned long phy_addr;
397 if (!alloc)
398 return NULL;
399
400 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
401 if (!context)
402 return NULL;
403
404 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
405 phy_addr = virt_to_phys((void *)context);
406 *entry = phy_addr | 1;
407 __iommu_flush_cache(iommu, entry, sizeof(*entry));
408 }
409 return &context[devfn];
410 }
411
412 /**
413 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
414 * sub-hierarchy of a candidate PCI-PCI bridge
415 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
416 * @bridge: the candidate PCI-PCI bridge
417 *
418 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
419 */
420 static bool
421 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
422 {
423 struct pci_dev *pdev, *pbridge;
424
425 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
426 return false;
427
428 pdev = to_pci_dev(dev);
429 pbridge = to_pci_dev(bridge);
430
431 if (pbridge->subordinate &&
432 pbridge->subordinate->number <= pdev->bus->number &&
433 pbridge->subordinate->busn_res.end >= pdev->bus->number)
434 return true;
435
436 return false;
437 }
438
439 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
440 {
441 struct dmar_drhd_unit *drhd;
442 u32 vtbar;
443 int rc;
444
445 /* We know that this device on this chipset has its own IOMMU.
446 * If we find it under a different IOMMU, then the BIOS is lying
447 * to us. Hope that the IOMMU for this device is actually
448 * disabled, and it needs no translation...
449 */
450 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
451 if (rc) {
452 /* "can't" happen */
453 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
454 return false;
455 }
456 vtbar &= 0xffff0000;
457
458 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
459 drhd = dmar_find_matched_drhd_unit(pdev);
460 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
461 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
462 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
463 return true;
464 }
465
466 return false;
467 }
468
469 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
470 {
471 if (!iommu || iommu->drhd->ignored)
472 return true;
473
474 if (dev_is_pci(dev)) {
475 struct pci_dev *pdev = to_pci_dev(dev);
476
477 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
478 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
479 quirk_ioat_snb_local_iommu(pdev))
480 return true;
481 }
482
483 return false;
484 }
485
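/*
 * Find the IOMMU unit whose scope covers @dev and optionally return the
 * bus/devfn to be used for programming its context entry. VFs are looked
 * up via their PF, but keep their own BDF.
 */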
486 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
487 {
488 struct dmar_drhd_unit *drhd = NULL;
489 struct pci_dev *pdev = NULL;
490 struct intel_iommu *iommu;
491 struct device *tmp;
492 u16 segment = 0;
493 int i;
494
495 if (!dev)
496 return NULL;
497
498 if (dev_is_pci(dev)) {
499 struct pci_dev *pf_pdev;
500
501 pdev = pci_real_dma_dev(to_pci_dev(dev));
502
503 /* VFs aren't listed in scope tables; we need to look up
504 * the PF instead to find the IOMMU. */
505 pf_pdev = pci_physfn(pdev);
506 dev = &pf_pdev->dev;
507 segment = pci_domain_nr(pdev->bus);
508 } else if (has_acpi_companion(dev))
509 dev = &ACPI_COMPANION(dev)->dev;
510
511 rcu_read_lock();
512 for_each_iommu(iommu, drhd) {
513 if (pdev && segment != drhd->segment)
514 continue;
515
516 for_each_active_dev_scope(drhd->devices,
517 drhd->devices_cnt, i, tmp) {
518 if (tmp == dev) {
519 /* For a VF use its original BDF# not that of the PF
520 * which we used for the IOMMU lookup. Strictly speaking
521 * we could do this for all PCI devices; we only need to
522 * get the BDF# from the scope table for ACPI matches. */
523 if (pdev && pdev->is_virtfn)
524 goto got_pdev;
525
526 if (bus && devfn) {
527 *bus = drhd->devices[i].bus;
528 *devfn = drhd->devices[i].devfn;
529 }
530 goto out;
531 }
532
533 if (is_downstream_to_pci_bridge(dev, tmp))
534 goto got_pdev;
535 }
536
537 if (pdev && drhd->include_all) {
538 got_pdev:
539 if (bus && devfn) {
540 *bus = pdev->bus->number;
541 *devfn = pdev->devfn;
542 }
543 goto out;
544 }
545 }
546 iommu = NULL;
547 out:
548 if (iommu_is_dummy(iommu, dev))
549 iommu = NULL;
550
551 rcu_read_unlock();
552
553 return iommu;
554 }
555
556 static void domain_flush_cache(struct dmar_domain *domain,
557 void *addr, int size)
558 {
559 if (!domain->iommu_coherency)
560 clflush_cache_range(addr, size);
561 }
562
563 static void free_context_table(struct intel_iommu *iommu)
564 {
565 struct context_entry *context;
566 int i;
567
568 if (!iommu->root_entry)
569 return;
570
571 for (i = 0; i < ROOT_ENTRY_NR; i++) {
572 context = iommu_context_addr(iommu, i, 0, 0);
573 if (context)
574 iommu_free_page(context);
575
576 if (!sm_supported(iommu))
577 continue;
578
579 context = iommu_context_addr(iommu, i, 0x80, 0);
580 if (context)
581 iommu_free_page(context);
582 }
583
584 iommu_free_page(iommu->root_entry);
585 iommu->root_entry = NULL;
586 }
587
588 #ifdef CONFIG_DMAR_DEBUG
589 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
590 u8 bus, u8 devfn, struct dma_pte *parent, int level)
591 {
592 struct dma_pte *pte;
593 int offset;
594
595 while (1) {
596 offset = pfn_level_offset(pfn, level);
597 pte = &parent[offset];
598
599 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
600
601 if (!dma_pte_present(pte)) {
602 pr_info("page table not present at level %d\n", level - 1);
603 break;
604 }
605
606 if (level == 1 || dma_pte_superpage(pte))
607 break;
608
609 parent = phys_to_virt(dma_pte_addr(pte));
610 level--;
611 }
612 }
613
614 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
615 unsigned long long addr, u32 pasid)
616 {
617 struct pasid_dir_entry *dir, *pde;
618 struct pasid_entry *entries, *pte;
619 struct context_entry *ctx_entry;
620 struct root_entry *rt_entry;
621 int i, dir_index, index, level;
622 u8 devfn = source_id & 0xff;
623 u8 bus = source_id >> 8;
624 struct dma_pte *pgtable;
625
626 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
627
628 /* root entry dump */
629 if (!iommu->root_entry) {
630 pr_info("root table is not present\n");
631 return;
632 }
633 rt_entry = &iommu->root_entry[bus];
634
635 if (sm_supported(iommu))
636 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
637 rt_entry->hi, rt_entry->lo);
638 else
639 pr_info("root entry: 0x%016llx", rt_entry->lo);
640
641 /* context entry dump */
642 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
643 if (!ctx_entry) {
644 pr_info("context table is not present\n");
645 return;
646 }
647
648 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
649 ctx_entry->hi, ctx_entry->lo);
650
651 /* legacy mode does not require PASID entries */
652 if (!sm_supported(iommu)) {
653 if (!context_present(ctx_entry)) {
654 pr_info("legacy mode page table is not present\n");
655 return;
656 }
657 level = agaw_to_level(ctx_entry->hi & 7);
658 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
659 goto pgtable_walk;
660 }
661
662 if (!context_present(ctx_entry)) {
663 pr_info("pasid directory table is not present\n");
664 return;
665 }
666
667 /* get the pointer to pasid directory entry */
668 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
669
670 /* For request-without-pasid, get the pasid from context entry */
671 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
672 pasid = IOMMU_NO_PASID;
673
674 dir_index = pasid >> PASID_PDE_SHIFT;
675 pde = &dir[dir_index];
676 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
677
678 /* get the pointer to the pasid table entry */
679 entries = get_pasid_table_from_pde(pde);
680 if (!entries) {
681 pr_info("pasid table is not present\n");
682 return;
683 }
684 index = pasid & PASID_PTE_MASK;
685 pte = &entries[index];
686 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
687 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
688
689 if (!pasid_pte_is_present(pte)) {
690 pr_info("scalable mode page table is not present\n");
691 return;
692 }
693
694 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
695 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
696 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
697 } else {
698 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
699 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
700 }
701
702 pgtable_walk:
703 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
704 }
705 #endif
706
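/*
 * Walk the page table for @pfn, allocating missing non-leaf entries down
 * to *target_level (or stopping at an existing superpage or non-present
 * entry when *target_level is 0). The level actually reached is returned
 * through *target_level.
 */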
707 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
708 unsigned long pfn, int *target_level,
709 gfp_t gfp)
710 {
711 struct dma_pte *parent, *pte;
712 int level = agaw_to_level(domain->agaw);
713 int offset;
714
715 if (!domain_pfn_supported(domain, pfn))
716 /* Address beyond IOMMU's addressing capabilities. */
717 return NULL;
718
719 parent = domain->pgd;
720
721 while (1) {
722 void *tmp_page;
723
724 offset = pfn_level_offset(pfn, level);
725 pte = &parent[offset];
726 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
727 break;
728 if (level == *target_level)
729 break;
730
731 if (!dma_pte_present(pte)) {
732 uint64_t pteval, tmp;
733
734 tmp_page = iommu_alloc_page_node(domain->nid, gfp);
735
736 if (!tmp_page)
737 return NULL;
738
739 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
740 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
741 if (domain->use_first_level)
742 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
743
744 tmp = 0ULL;
745 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
746 /* Someone else set it while we were thinking; use theirs. */
747 iommu_free_page(tmp_page);
748 else
749 domain_flush_cache(domain, pte, sizeof(*pte));
750 }
751 if (level == 1)
752 break;
753
754 parent = phys_to_virt(dma_pte_addr(pte));
755 level--;
756 }
757
758 if (!*target_level)
759 *target_level = level;
760
761 return pte;
762 }
763
764 /* return address's pte at specific level */
765 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
766 unsigned long pfn,
767 int level, int *large_page)
768 {
769 struct dma_pte *parent, *pte;
770 int total = agaw_to_level(domain->agaw);
771 int offset;
772
773 parent = domain->pgd;
774 while (level <= total) {
775 offset = pfn_level_offset(pfn, total);
776 pte = &parent[offset];
777 if (level == total)
778 return pte;
779
780 if (!dma_pte_present(pte)) {
781 *large_page = total;
782 break;
783 }
784
785 if (dma_pte_superpage(pte)) {
786 *large_page = total;
787 return pte;
788 }
789
790 parent = phys_to_virt(dma_pte_addr(pte));
791 total--;
792 }
793 return NULL;
794 }
795
796 /* clear last level pte, a tlb flush should be followed */
797 static void dma_pte_clear_range(struct dmar_domain *domain,
798 unsigned long start_pfn,
799 unsigned long last_pfn)
800 {
801 unsigned int large_page;
802 struct dma_pte *first_pte, *pte;
803
804 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
805 WARN_ON(start_pfn > last_pfn))
806 return;
807
808 /* we don't need lock here; nobody else touches the iova range */
809 do {
810 large_page = 1;
811 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
812 if (!pte) {
813 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
814 continue;
815 }
816 do {
817 dma_clear_pte(pte);
818 start_pfn += lvl_to_nr_pages(large_page);
819 pte++;
820 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
821
822 domain_flush_cache(domain, first_pte,
823 (void *)pte - (void *)first_pte);
824
825 } while (start_pfn && start_pfn <= last_pfn);
826 }
827
828 static void dma_pte_free_level(struct dmar_domain *domain, int level,
829 int retain_level, struct dma_pte *pte,
830 unsigned long pfn, unsigned long start_pfn,
831 unsigned long last_pfn)
832 {
833 pfn = max(start_pfn, pfn);
834 pte = &pte[pfn_level_offset(pfn, level)];
835
836 do {
837 unsigned long level_pfn;
838 struct dma_pte *level_pte;
839
840 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
841 goto next;
842
843 level_pfn = pfn & level_mask(level);
844 level_pte = phys_to_virt(dma_pte_addr(pte));
845
846 if (level > 2) {
847 dma_pte_free_level(domain, level - 1, retain_level,
848 level_pte, level_pfn, start_pfn,
849 last_pfn);
850 }
851
852 /*
853 * Free the page table if we're below the level we want to
854 * retain and the range covers the entire table.
855 */
856 if (level < retain_level && !(start_pfn > level_pfn ||
857 last_pfn < level_pfn + level_size(level) - 1)) {
858 dma_clear_pte(pte);
859 domain_flush_cache(domain, pte, sizeof(*pte));
860 iommu_free_page(level_pte);
861 }
862 next:
863 pfn += level_size(level);
864 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
865 }
866
867 /*
868 * clear last level (leaf) ptes and free page table pages below the
869 * level we wish to keep intact.
870 */
871 static void dma_pte_free_pagetable(struct dmar_domain *domain,
872 unsigned long start_pfn,
873 unsigned long last_pfn,
874 int retain_level)
875 {
876 dma_pte_clear_range(domain, start_pfn, last_pfn);
877
878 /* We don't need lock here; nobody else touches the iova range */
879 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
880 domain->pgd, 0, start_pfn, last_pfn);
881
882 /* free pgd */
883 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
884 iommu_free_page(domain->pgd);
885 domain->pgd = NULL;
886 }
887 }
888
889 /* When a page at a given level is being unlinked from its parent, we don't
890 need to *modify* it at all. All we need to do is make a list of all the
891 pages which can be freed just as soon as we've flushed the IOTLB and we
892 know the hardware page-walk will no longer touch them.
893 The 'pte' argument is the *parent* PTE, pointing to the page that is to
894 be freed. */
895 static void dma_pte_list_pagetables(struct dmar_domain *domain,
896 int level, struct dma_pte *pte,
897 struct list_head *freelist)
898 {
899 struct page *pg;
900
901 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
902 list_add_tail(&pg->lru, freelist);
903
904 if (level == 1)
905 return;
906
907 pte = page_address(pg);
908 do {
909 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
910 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
911 pte++;
912 } while (!first_pte_in_page(pte));
913 }
914
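/*
 * Clear the PTEs covering [start_pfn, last_pfn] and collect page-table
 * pages that become entirely unused on @freelist, to be freed after the
 * IOTLB flush.
 */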
915 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
916 struct dma_pte *pte, unsigned long pfn,
917 unsigned long start_pfn, unsigned long last_pfn,
918 struct list_head *freelist)
919 {
920 struct dma_pte *first_pte = NULL, *last_pte = NULL;
921
922 pfn = max(start_pfn, pfn);
923 pte = &pte[pfn_level_offset(pfn, level)];
924
925 do {
926 unsigned long level_pfn = pfn & level_mask(level);
927
928 if (!dma_pte_present(pte))
929 goto next;
930
931 /* If range covers entire pagetable, free it */
932 if (start_pfn <= level_pfn &&
933 last_pfn >= level_pfn + level_size(level) - 1) {
934 /* These subordinate page tables are going away entirely. Don't
935 bother to clear them; we're just going to *free* them. */
936 if (level > 1 && !dma_pte_superpage(pte))
937 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
938
939 dma_clear_pte(pte);
940 if (!first_pte)
941 first_pte = pte;
942 last_pte = pte;
943 } else if (level > 1) {
944 /* Recurse down into a level that isn't *entirely* obsolete */
945 dma_pte_clear_level(domain, level - 1,
946 phys_to_virt(dma_pte_addr(pte)),
947 level_pfn, start_pfn, last_pfn,
948 freelist);
949 }
950 next:
951 pfn = level_pfn + level_size(level);
952 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
953
954 if (first_pte)
955 domain_flush_cache(domain, first_pte,
956 (void *)++last_pte - (void *)first_pte);
957 }
958
959 /* We can't just free the pages because the IOMMU may still be walking
960 the page tables, and may have cached the intermediate levels. The
961 pages can only be freed after the IOTLB flush has been done. */
962 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
963 unsigned long last_pfn, struct list_head *freelist)
964 {
965 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
966 WARN_ON(start_pfn > last_pfn))
967 return;
968
969 /* we don't need lock here; nobody else touches the iova range */
970 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
971 domain->pgd, 0, start_pfn, last_pfn, freelist);
972
973 /* free pgd */
974 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
975 struct page *pgd_page = virt_to_page(domain->pgd);
976 list_add_tail(&pgd_page->lru, freelist);
977 domain->pgd = NULL;
978 }
979 }
980
981 /* iommu handling */
982 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
983 {
984 struct root_entry *root;
985
986 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
987 if (!root) {
988 pr_err("Allocating root entry for %s failed\n",
989 iommu->name);
990 return -ENOMEM;
991 }
992
993 __iommu_flush_cache(iommu, root, ROOT_SIZE);
994 iommu->root_entry = root;
995
996 return 0;
997 }
998
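/*
 * Program the root table address and issue the Set Root Table Pointer
 * command. Unless the hardware invalidates caches as part of SRTP
 * (ESRTPS), explicitly flush the context, PASID and IOTLB caches.
 */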
999 static void iommu_set_root_entry(struct intel_iommu *iommu)
1000 {
1001 u64 addr;
1002 u32 sts;
1003 unsigned long flag;
1004
1005 addr = virt_to_phys(iommu->root_entry);
1006 if (sm_supported(iommu))
1007 addr |= DMA_RTADDR_SMT;
1008
1009 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1010 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1011
1012 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1013
1014 /* Make sure hardware complete it */
1015 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1016 readl, (sts & DMA_GSTS_RTPS), sts);
1017
1018 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1019
1020 /*
1021 * Hardware invalidates all DMA remapping hardware translation
1022 * caches as part of SRTP flow.
1023 */
1024 if (cap_esrtps(iommu->cap))
1025 return;
1026
1027 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1028 if (sm_supported(iommu))
1029 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1030 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1031 }
1032
1033 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1034 {
1035 u32 val;
1036 unsigned long flag;
1037
1038 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1039 return;
1040
1041 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1042 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1043
1044 /* Make sure hardware complete it */
1045 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1046 readl, (!(val & DMA_GSTS_WBFS)), val);
1047
1048 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1049 }
1050
1051 /* return value determines if we need a write buffer flush */
1052 static void __iommu_flush_context(struct intel_iommu *iommu,
1053 u16 did, u16 source_id, u8 function_mask,
1054 u64 type)
1055 {
1056 u64 val = 0;
1057 unsigned long flag;
1058
1059 switch (type) {
1060 case DMA_CCMD_GLOBAL_INVL:
1061 val = DMA_CCMD_GLOBAL_INVL;
1062 break;
1063 case DMA_CCMD_DOMAIN_INVL:
1064 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1065 break;
1066 case DMA_CCMD_DEVICE_INVL:
1067 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1068 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1069 break;
1070 default:
1071 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1072 iommu->name, type);
1073 return;
1074 }
1075 val |= DMA_CCMD_ICC;
1076
1077 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1078 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1079
1080 /* Make sure hardware complete it */
1081 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1082 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1083
1084 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1085 }
1086
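/*
 * Issue a register-based IOTLB invalidation (global, domain-selective or
 * page-selective) and wait for the hardware to complete it.
 */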
1087 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1088 unsigned int size_order, u64 type)
1089 {
1090 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1091 u64 val = 0, val_iva = 0;
1092 unsigned long flag;
1093
1094 switch (type) {
1095 case DMA_TLB_GLOBAL_FLUSH:
1096 /* global flush doesn't need to set IVA_REG */
1097 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1098 break;
1099 case DMA_TLB_DSI_FLUSH:
1100 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1101 break;
1102 case DMA_TLB_PSI_FLUSH:
1103 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1104 /* IH bit is passed in as part of address */
1105 val_iva = size_order | addr;
1106 break;
1107 default:
1108 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1109 iommu->name, type);
1110 return;
1111 }
1112
1113 if (cap_write_drain(iommu->cap))
1114 val |= DMA_TLB_WRITE_DRAIN;
1115
1116 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1117 /* Note: Only uses first TLB reg currently */
1118 if (val_iva)
1119 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1120 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1121
1122 /* Make sure hardware complete it */
1123 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1124 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1125
1126 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1127
1128 /* check IOTLB invalidation granularity */
1129 if (DMA_TLB_IAIG(val) == 0)
1130 pr_err("Flush IOTLB failed\n");
1131 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1132 pr_debug("TLB flush request %Lx, actual %Lx\n",
1133 (unsigned long long)DMA_TLB_IIRG(type),
1134 (unsigned long long)DMA_TLB_IAIG(val));
1135 }
1136
1137 static struct device_domain_info *
1138 domain_lookup_dev_info(struct dmar_domain *domain,
1139 struct intel_iommu *iommu, u8 bus, u8 devfn)
1140 {
1141 struct device_domain_info *info;
1142 unsigned long flags;
1143
1144 spin_lock_irqsave(&domain->lock, flags);
1145 list_for_each_entry(info, &domain->devices, link) {
1146 if (info->iommu == iommu && info->bus == bus &&
1147 info->devfn == devfn) {
1148 spin_unlock_irqrestore(&domain->lock, flags);
1149 return info;
1150 }
1151 }
1152 spin_unlock_irqrestore(&domain->lock, flags);
1153
1154 return NULL;
1155 }
1156
1157 /*
1158 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1159 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1160 * check because it applies only to the built-in QAT devices and it doesn't
1161 * grant additional privileges.
1162 */
1163 #define BUGGY_QAT_DEVID_MASK 0x4940
1164 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1165 {
1166 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1167 return false;
1168
1169 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1170 return false;
1171
1172 return true;
1173 }
1174
1175 static void iommu_enable_pci_caps(struct device_domain_info *info)
1176 {
1177 struct pci_dev *pdev;
1178
1179 if (!dev_is_pci(info->dev))
1180 return;
1181
1182 pdev = to_pci_dev(info->dev);
1183 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1184 !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1185 info->ats_enabled = 1;
1186 }
1187
1188 static void iommu_disable_pci_caps(struct device_domain_info *info)
1189 {
1190 struct pci_dev *pdev;
1191
1192 if (!dev_is_pci(info->dev))
1193 return;
1194
1195 pdev = to_pci_dev(info->dev);
1196
1197 if (info->ats_enabled) {
1198 pci_disable_ats(pdev);
1199 info->ats_enabled = 0;
1200 }
1201 }
1202
1203 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1204 {
1205 cache_tag_flush_all(to_dmar_domain(domain));
1206 }
1207
1208 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1209 {
1210 u32 pmen;
1211 unsigned long flags;
1212
1213 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1214 return;
1215
1216 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1217 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1218 pmen &= ~DMA_PMEN_EPM;
1219 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1220
1221 /* wait for the protected region status bit to clear */
1222 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1223 readl, !(pmen & DMA_PMEN_PRS), pmen);
1224
1225 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1226 }
1227
1228 static void iommu_enable_translation(struct intel_iommu *iommu)
1229 {
1230 u32 sts;
1231 unsigned long flags;
1232
1233 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1234 iommu->gcmd |= DMA_GCMD_TE;
1235 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1236
1237 /* Make sure hardware complete it */
1238 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1239 readl, (sts & DMA_GSTS_TES), sts);
1240
1241 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1242 }
1243
1244 static void iommu_disable_translation(struct intel_iommu *iommu)
1245 {
1246 u32 sts;
1247 unsigned long flag;
1248
1249 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1250 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1251 return;
1252
1253 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254 iommu->gcmd &= ~DMA_GCMD_TE;
1255 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1256
1257 /* Make sure hardware complete it */
1258 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 readl, (!(sts & DMA_GSTS_TES)), sts);
1260
1261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1262 }
1263
1264 static int iommu_init_domains(struct intel_iommu *iommu)
1265 {
1266 u32 ndomains;
1267
1268 ndomains = cap_ndoms(iommu->cap);
1269 pr_debug("%s: Number of Domains supported <%d>\n",
1270 iommu->name, ndomains);
1271
1272 spin_lock_init(&iommu->lock);
1273
1274 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1275 if (!iommu->domain_ids)
1276 return -ENOMEM;
1277
1278 /*
1279 * If Caching mode is set, then invalid translations are tagged
1280 * with domain-id 0, hence we need to pre-allocate it. We also
1281 * use domain-id 0 as a marker for non-allocated domain-id, so
1282 * make sure it is not used for a real domain.
1283 */
1284 set_bit(0, iommu->domain_ids);
1285
1286 /*
1287 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1288 * entry for first-level or pass-through translation modes should
1289 * be programmed with a domain id different from those used for
1290 * second-level or nested translation. We reserve a domain id for
1291 * this purpose. This domain id is also used for identity domain
1292 * in legacy mode.
1293 */
1294 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1295
1296 return 0;
1297 }
1298
1299 static void disable_dmar_iommu(struct intel_iommu *iommu)
1300 {
1301 if (!iommu->domain_ids)
1302 return;
1303
1304 /*
1305 * All iommu domains must have been detached from the devices,
1306 * hence there should be no domain IDs in use.
1307 */
1308 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1309 > NUM_RESERVED_DID))
1310 return;
1311
1312 if (iommu->gcmd & DMA_GCMD_TE)
1313 iommu_disable_translation(iommu);
1314 }
1315
1316 static void free_dmar_iommu(struct intel_iommu *iommu)
1317 {
1318 if (iommu->domain_ids) {
1319 bitmap_free(iommu->domain_ids);
1320 iommu->domain_ids = NULL;
1321 }
1322
1323 if (iommu->copied_tables) {
1324 bitmap_free(iommu->copied_tables);
1325 iommu->copied_tables = NULL;
1326 }
1327
1328 /* free context mapping */
1329 free_context_table(iommu);
1330
1331 if (ecap_prs(iommu->ecap))
1332 intel_iommu_finish_prq(iommu);
1333 }
1334
1335 /*
1336 * Check and return whether first level is used by default for
1337 * DMA translation.
1338 */
1339 static bool first_level_by_default(struct intel_iommu *iommu)
1340 {
1341 /* Only SL is available in legacy mode */
1342 if (!sm_supported(iommu))
1343 return false;
1344
1345 /* Only one level (either FL or SL) is available, just use it */
1346 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1347 return ecap_flts(iommu->ecap);
1348
1349 return true;
1350 }
1351
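/*
 * Attach @domain to @iommu: allocate a domain ID on first use and track
 * the attachment with a refcounted entry in domain->iommu_array.
 */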
1352 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1353 {
1354 struct iommu_domain_info *info, *curr;
1355 unsigned long ndomains;
1356 int num, ret = -ENOSPC;
1357
1358 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1359 return 0;
1360
1361 info = kzalloc(sizeof(*info), GFP_KERNEL);
1362 if (!info)
1363 return -ENOMEM;
1364
1365 spin_lock(&iommu->lock);
1366 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1367 if (curr) {
1368 curr->refcnt++;
1369 spin_unlock(&iommu->lock);
1370 kfree(info);
1371 return 0;
1372 }
1373
1374 ndomains = cap_ndoms(iommu->cap);
1375 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1376 if (num >= ndomains) {
1377 pr_err("%s: No free domain ids\n", iommu->name);
1378 goto err_unlock;
1379 }
1380
1381 set_bit(num, iommu->domain_ids);
1382 info->refcnt = 1;
1383 info->did = num;
1384 info->iommu = iommu;
1385 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1386 NULL, info, GFP_ATOMIC);
1387 if (curr) {
1388 ret = xa_err(curr) ? : -EBUSY;
1389 goto err_clear;
1390 }
1391
1392 spin_unlock(&iommu->lock);
1393 return 0;
1394
1395 err_clear:
1396 clear_bit(info->did, iommu->domain_ids);
1397 err_unlock:
1398 spin_unlock(&iommu->lock);
1399 kfree(info);
1400 return ret;
1401 }
1402
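/*
 * Drop one attachment reference; when the last reference goes away,
 * release the domain ID back to the IOMMU.
 */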
1403 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1404 {
1405 struct iommu_domain_info *info;
1406
1407 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1408 return;
1409
1410 spin_lock(&iommu->lock);
1411 info = xa_load(&domain->iommu_array, iommu->seq_id);
1412 if (--info->refcnt == 0) {
1413 clear_bit(info->did, iommu->domain_ids);
1414 xa_erase(&domain->iommu_array, iommu->seq_id);
1415 domain->nid = NUMA_NO_NODE;
1416 kfree(info);
1417 }
1418 spin_unlock(&iommu->lock);
1419 }
1420
1421 static void domain_exit(struct dmar_domain *domain)
1422 {
1423 if (domain->pgd) {
1424 LIST_HEAD(freelist);
1425
1426 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1427 iommu_put_pages_list(&freelist);
1428 }
1429
1430 if (WARN_ON(!list_empty(&domain->devices)))
1431 return;
1432
1433 kfree(domain->qi_batch);
1434 kfree(domain);
1435 }
1436
1437 /*
1438 * For kdump cases, old valid entries may be cached due to the
1439 * in-flight DMA and copied pgtable, but there is no unmapping
1440 * behaviour for them, thus we need an explicit cache flush for
1441 * the newly-mapped device. For kdump, at this point, the device
1442 * is supposed to finish reset at its driver probe stage, so no
1443 * in-flight DMA will exist, and we don't need to worry about it
1444 * hereafter.
1445 */
1446 static void copied_context_tear_down(struct intel_iommu *iommu,
1447 struct context_entry *context,
1448 u8 bus, u8 devfn)
1449 {
1450 u16 did_old;
1451
1452 if (!context_copied(iommu, bus, devfn))
1453 return;
1454
1455 assert_spin_locked(&iommu->lock);
1456
1457 did_old = context_domain_id(context);
1458 context_clear_entry(context);
1459
1460 if (did_old < cap_ndoms(iommu->cap)) {
1461 iommu->flush.flush_context(iommu, did_old,
1462 PCI_DEVID(bus, devfn),
1463 DMA_CCMD_MASK_NOBIT,
1464 DMA_CCMD_DEVICE_INVL);
1465 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1466 DMA_TLB_DSI_FLUSH);
1467 }
1468
1469 clear_context_copied(iommu, bus, devfn);
1470 }
1471
1472 /*
1473 * It's a non-present to present mapping. If hardware doesn't cache
1474 * non-present entries we only need to flush the write-buffer. If it
1475 * _does_ cache non-present entries, then it does so in the special
1476 * domain #0, which we have to flush:
1477 */
1478 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1479 u8 bus, u8 devfn)
1480 {
1481 if (cap_caching_mode(iommu->cap)) {
1482 iommu->flush.flush_context(iommu, 0,
1483 PCI_DEVID(bus, devfn),
1484 DMA_CCMD_MASK_NOBIT,
1485 DMA_CCMD_DEVICE_INVL);
1486 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1487 } else {
1488 iommu_flush_write_buffer(iommu);
1489 }
1490 }
1491
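/*
 * Install a legacy-mode context entry for @bus/@devfn pointing to the
 * domain's page table, then flush the caches required to make the new
 * entry visible.
 */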
1492 static int domain_context_mapping_one(struct dmar_domain *domain,
1493 struct intel_iommu *iommu,
1494 u8 bus, u8 devfn)
1495 {
1496 struct device_domain_info *info =
1497 domain_lookup_dev_info(domain, iommu, bus, devfn);
1498 u16 did = domain_id_iommu(domain, iommu);
1499 int translation = CONTEXT_TT_MULTI_LEVEL;
1500 struct dma_pte *pgd = domain->pgd;
1501 struct context_entry *context;
1502 int ret;
1503
1504 pr_debug("Set context mapping for %02x:%02x.%d\n",
1505 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1506
1507 spin_lock(&iommu->lock);
1508 ret = -ENOMEM;
1509 context = iommu_context_addr(iommu, bus, devfn, 1);
1510 if (!context)
1511 goto out_unlock;
1512
1513 ret = 0;
1514 if (context_present(context) && !context_copied(iommu, bus, devfn))
1515 goto out_unlock;
1516
1517 copied_context_tear_down(iommu, context, bus, devfn);
1518 context_clear_entry(context);
1519 context_set_domain_id(context, did);
1520
1521 if (info && info->ats_supported)
1522 translation = CONTEXT_TT_DEV_IOTLB;
1523 else
1524 translation = CONTEXT_TT_MULTI_LEVEL;
1525
1526 context_set_address_root(context, virt_to_phys(pgd));
1527 context_set_address_width(context, domain->agaw);
1528 context_set_translation_type(context, translation);
1529 context_set_fault_enable(context);
1530 context_set_present(context);
1531 if (!ecap_coherent(iommu->ecap))
1532 clflush_cache_range(context, sizeof(*context));
1533 context_present_cache_flush(iommu, did, bus, devfn);
1534 ret = 0;
1535
1536 out_unlock:
1537 spin_unlock(&iommu->lock);
1538
1539 return ret;
1540 }
1541
1542 static int domain_context_mapping_cb(struct pci_dev *pdev,
1543 u16 alias, void *opaque)
1544 {
1545 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1546 struct intel_iommu *iommu = info->iommu;
1547 struct dmar_domain *domain = opaque;
1548
1549 return domain_context_mapping_one(domain, iommu,
1550 PCI_BUS_NUM(alias), alias & 0xff);
1551 }
1552
1553 static int
1554 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1555 {
1556 struct device_domain_info *info = dev_iommu_priv_get(dev);
1557 struct intel_iommu *iommu = info->iommu;
1558 u8 bus = info->bus, devfn = info->devfn;
1559
1560 if (!dev_is_pci(dev))
1561 return domain_context_mapping_one(domain, iommu, bus, devfn);
1562
1563 return pci_for_each_dma_alias(to_pci_dev(dev),
1564 domain_context_mapping_cb, domain);
1565 }
1566
1567 /* Return largest possible superpage level for a given mapping */
1568 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1569 unsigned long phy_pfn, unsigned long pages)
1570 {
1571 int support, level = 1;
1572 unsigned long pfnmerge;
1573
1574 support = domain->iommu_superpage;
1575
1576 /* To use a large page, the virtual *and* physical addresses
1577 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1578 of them will mean we have to use smaller pages. So just
1579 merge them and check both at once. */
1580 pfnmerge = iov_pfn | phy_pfn;
1581
1582 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1583 pages >>= VTD_STRIDE_SHIFT;
1584 if (!pages)
1585 break;
1586 pfnmerge >>= VTD_STRIDE_SHIFT;
1587 level++;
1588 support--;
1589 }
1590 return level;
1591 }
1592
1593 /*
1594 * Ensure that old small page tables are removed to make room for superpage(s).
1595 * We're going to add new large pages, so make sure we don't remove their parent
1596 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1597 */
1598 static void switch_to_super_page(struct dmar_domain *domain,
1599 unsigned long start_pfn,
1600 unsigned long end_pfn, int level)
1601 {
1602 unsigned long lvl_pages = lvl_to_nr_pages(level);
1603 struct dma_pte *pte = NULL;
1604
1605 while (start_pfn <= end_pfn) {
1606 if (!pte)
1607 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1608 GFP_ATOMIC);
1609
1610 if (dma_pte_present(pte)) {
1611 dma_pte_free_pagetable(domain, start_pfn,
1612 start_pfn + lvl_pages - 1,
1613 level + 1);
1614
1615 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1616 end_pfn << VTD_PAGE_SHIFT, 0);
1617 }
1618
1619 pte++;
1620 start_pfn += lvl_pages;
1621 if (first_pte_in_page(pte))
1622 pte = NULL;
1623 }
1624 }
1625
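/*
 * Map @nr_pages pages starting at @iov_pfn to the physical range starting
 * at @phys_pfn, using superpages when alignment and the remaining size
 * allow. The caller guarantees exclusive access to the IOVA range.
 */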
1626 static int
1627 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1628 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1629 gfp_t gfp)
1630 {
1631 struct dma_pte *first_pte = NULL, *pte = NULL;
1632 unsigned int largepage_lvl = 0;
1633 unsigned long lvl_pages = 0;
1634 phys_addr_t pteval;
1635 u64 attr;
1636
1637 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1638 return -EINVAL;
1639
1640 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1641 return -EINVAL;
1642
1643 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1644 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1645 return -EINVAL;
1646 }
1647
1648 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1649 attr |= DMA_FL_PTE_PRESENT;
1650 if (domain->use_first_level) {
1651 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1652 if (prot & DMA_PTE_WRITE)
1653 attr |= DMA_FL_PTE_DIRTY;
1654 }
1655
1656 domain->has_mappings = true;
1657
1658 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1659
1660 while (nr_pages > 0) {
1661 uint64_t tmp;
1662
1663 if (!pte) {
1664 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1665 phys_pfn, nr_pages);
1666
1667 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1668 gfp);
1669 if (!pte)
1670 return -ENOMEM;
1671 first_pte = pte;
1672
1673 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1674
1675 /* It is a large page */
1676 if (largepage_lvl > 1) {
1677 unsigned long end_pfn;
1678 unsigned long pages_to_remove;
1679
1680 pteval |= DMA_PTE_LARGE_PAGE;
1681 pages_to_remove = min_t(unsigned long, nr_pages,
1682 nr_pte_to_next_page(pte) * lvl_pages);
1683 end_pfn = iov_pfn + pages_to_remove - 1;
1684 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1685 } else {
1686 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1687 }
1688
1689 }
1690 /* We don't need lock here, nobody else
1691 * touches the iova range
1692 */
1693 tmp = 0ULL;
1694 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1695 static int dumps = 5;
1696 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1697 iov_pfn, tmp, (unsigned long long)pteval);
1698 if (dumps) {
1699 dumps--;
1700 debug_dma_dump_mappings(NULL);
1701 }
1702 WARN_ON(1);
1703 }
1704
1705 nr_pages -= lvl_pages;
1706 iov_pfn += lvl_pages;
1707 phys_pfn += lvl_pages;
1708 pteval += lvl_pages * VTD_PAGE_SIZE;
1709
1710 /* If the next PTE would be the first in a new page, then we
1711 * need to flush the cache on the entries we've just written.
1712 * And then we'll need to recalculate 'pte', so clear it and
1713 * let it get set again in the if (!pte) block above.
1714 *
1715 * If we're done (!nr_pages) we need to flush the cache too.
1716 *
1717 * Also if we've been setting superpages, we may need to
1718 * recalculate 'pte' and switch back to smaller pages for the
1719 * end of the mapping, if the trailing size is not enough to
1720 * use another superpage (i.e. nr_pages < lvl_pages).
1721 */
1722 pte++;
1723 if (!nr_pages || first_pte_in_page(pte) ||
1724 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1725 domain_flush_cache(domain, first_pte,
1726 (void *)pte - (void *)first_pte);
1727 pte = NULL;
1728 }
1729 }
1730
1731 return 0;
1732 }
1733
1734 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1735 {
1736 struct intel_iommu *iommu = info->iommu;
1737 struct context_entry *context;
1738 u16 did;
1739
1740 spin_lock(&iommu->lock);
1741 context = iommu_context_addr(iommu, bus, devfn, 0);
1742 if (!context) {
1743 spin_unlock(&iommu->lock);
1744 return;
1745 }
1746
1747 did = context_domain_id(context);
1748 context_clear_entry(context);
1749 __iommu_flush_cache(iommu, context, sizeof(*context));
1750 spin_unlock(&iommu->lock);
1751 intel_context_flush_present(info, context, did, true);
1752 }
1753
1754 int __domain_setup_first_level(struct intel_iommu *iommu,
1755 struct device *dev, ioasid_t pasid,
1756 u16 did, pgd_t *pgd, int flags,
1757 struct iommu_domain *old)
1758 {
1759 if (!old)
1760 return intel_pasid_setup_first_level(iommu, dev, pgd,
1761 pasid, did, flags);
1762 return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1763 iommu_domain_did(old, iommu),
1764 flags);
1765 }
1766
1767 static int domain_setup_second_level(struct intel_iommu *iommu,
1768 struct dmar_domain *domain,
1769 struct device *dev, ioasid_t pasid,
1770 struct iommu_domain *old)
1771 {
1772 if (!old)
1773 return intel_pasid_setup_second_level(iommu, domain,
1774 dev, pasid);
1775 return intel_pasid_replace_second_level(iommu, domain, dev,
1776 iommu_domain_did(old, iommu),
1777 pasid);
1778 }
1779
1780 static int domain_setup_passthrough(struct intel_iommu *iommu,
1781 struct device *dev, ioasid_t pasid,
1782 struct iommu_domain *old)
1783 {
1784 if (!old)
1785 return intel_pasid_setup_pass_through(iommu, dev, pasid);
1786 return intel_pasid_replace_pass_through(iommu, dev,
1787 iommu_domain_did(old, iommu),
1788 pasid);
1789 }
1790
1791 static int domain_setup_first_level(struct intel_iommu *iommu,
1792 struct dmar_domain *domain,
1793 struct device *dev,
1794 u32 pasid, struct iommu_domain *old)
1795 {
1796 struct dma_pte *pgd = domain->pgd;
1797 int level, flags = 0;
1798
1799 level = agaw_to_level(domain->agaw);
1800 if (level != 4 && level != 5)
1801 return -EINVAL;
1802
1803 if (level == 5)
1804 flags |= PASID_FLAG_FL5LP;
1805
1806 if (domain->force_snooping)
1807 flags |= PASID_FLAG_PAGE_SNOOP;
1808
1809 return __domain_setup_first_level(iommu, dev, pasid,
1810 domain_id_iommu(domain, iommu),
1811 (pgd_t *)pgd, flags, old);
1812 }
1813
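/*
 * Attach @dev to @domain using the RID (IOMMU_NO_PASID). The domain is
 * bound to the device's IOMMU and the device is linked into the domain's
 * device list; then either the legacy context entry or, in scalable mode,
 * the first-/second-stage PASID entry is programmed. On failure after the
 * domain has been attached, translation is blocked again.
 */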
1814 static int dmar_domain_attach_device(struct dmar_domain *domain,
1815 struct device *dev)
1816 {
1817 struct device_domain_info *info = dev_iommu_priv_get(dev);
1818 struct intel_iommu *iommu = info->iommu;
1819 unsigned long flags;
1820 int ret;
1821
1822 ret = domain_attach_iommu(domain, iommu);
1823 if (ret)
1824 return ret;
1825
1826 info->domain = domain;
1827 spin_lock_irqsave(&domain->lock, flags);
1828 list_add(&info->link, &domain->devices);
1829 spin_unlock_irqrestore(&domain->lock, flags);
1830
1831 if (dev_is_real_dma_subdevice(dev))
1832 return 0;
1833
1834 if (!sm_supported(iommu))
1835 ret = domain_context_mapping(domain, dev);
1836 else if (domain->use_first_level)
1837 ret = domain_setup_first_level(iommu, domain, dev,
1838 IOMMU_NO_PASID, NULL);
1839 else
1840 ret = domain_setup_second_level(iommu, domain, dev,
1841 IOMMU_NO_PASID, NULL);
1842
1843 if (ret)
1844 goto out_block_translation;
1845
1846 iommu_enable_pci_caps(info);
1847
1848 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1849 if (ret)
1850 goto out_block_translation;
1851
1852 return 0;
1853
1854 out_block_translation:
1855 device_block_translation(dev);
1856 return ret;
1857 }
1858
1859 /**
1860 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1861 * is relaxable (i.e. allowed to go unenforced under some conditions)
1862 * @dev: device handle
1863 *
1864 * We assume that PCI USB devices with RMRRs have them largely
1865 * for historical reasons and that the RMRR space is not actively used post
1866 * boot. This exclusion may change if vendors begin to abuse it.
1867 *
1868 * The same exception is made for graphics devices, with the requirement that
1869 * any use of the RMRR regions will be torn down before assigning the device
1870 * to a guest.
1871 *
1872 * Return: true if the RMRR is relaxable, false otherwise
1873 */
1874 static bool device_rmrr_is_relaxable(struct device *dev)
1875 {
1876 struct pci_dev *pdev;
1877
1878 if (!dev_is_pci(dev))
1879 return false;
1880
1881 pdev = to_pci_dev(dev);
1882 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1883 return true;
1884 else
1885 return false;
1886 }
1887
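/*
 * Return the default domain type required for @dev, or 0 to let the core
 * pick: dynamic (DMA) mapping when pass-through is unsupported, identity
 * mapping for the Azalia sound device quirk.
 */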
1888 static int device_def_domain_type(struct device *dev)
1889 {
1890 struct device_domain_info *info = dev_iommu_priv_get(dev);
1891 struct intel_iommu *iommu = info->iommu;
1892
1893 /*
1894 * Hardware does not support the passthrough translation mode.
1895 * Always use a dynamic mapping domain.
1896 */
1897 if (!ecap_pass_through(iommu->ecap))
1898 return IOMMU_DOMAIN_DMA;
1899
1900 if (dev_is_pci(dev)) {
1901 struct pci_dev *pdev = to_pci_dev(dev);
1902
1903 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1904 return IOMMU_DOMAIN_IDENTITY;
1905 }
1906
1907 return 0;
1908 }
1909
1910 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1911 {
1912 /*
1913 * Start from a sane IOMMU hardware state.
1914 * If queued invalidation was already initialized by us
1915 * (for example, while enabling interrupt-remapping), then
1916 * things are already rolling from a sane state.
1917 */
1918 if (!iommu->qi) {
1919 /*
1920 * Clear any previous faults.
1921 */
1922 dmar_fault(-1, iommu);
1923 /*
1924 * Disable queued invalidation if supported and already enabled
1925 * before OS handover.
1926 */
1927 dmar_disable_qi(iommu);
1928 }
1929
1930 if (dmar_enable_qi(iommu)) {
1931 /*
1932 * Queued Invalidate not enabled, use Register Based Invalidate
1933 */
1934 iommu->flush.flush_context = __iommu_flush_context;
1935 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1936 pr_info("%s: Using Register based invalidation\n",
1937 iommu->name);
1938 } else {
1939 iommu->flush.flush_context = qi_flush_context;
1940 iommu->flush.flush_iotlb = qi_flush_iotlb;
1941 pr_info("%s: Using Queued invalidation\n", iommu->name);
1942 }
1943 }
1944
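/*
 * Copy the context table(s) referenced by @old_re, a root entry inherited
 * from the previous kernel, for bus @bus into freshly allocated pages.
 * Domain IDs found in present entries are reserved, and each copied devfn
 * is marked so later attach paths know its context was inherited.
 */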
1945 static int copy_context_table(struct intel_iommu *iommu,
1946 struct root_entry *old_re,
1947 struct context_entry **tbl,
1948 int bus, bool ext)
1949 {
1950 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1951 struct context_entry *new_ce = NULL, ce;
1952 struct context_entry *old_ce = NULL;
1953 struct root_entry re;
1954 phys_addr_t old_ce_phys;
1955
1956 tbl_idx = ext ? bus * 2 : bus;
1957 memcpy(&re, old_re, sizeof(re));
1958
1959 for (devfn = 0; devfn < 256; devfn++) {
1960 /* First calculate the correct index */
1961 idx = (ext ? devfn * 2 : devfn) % 256;
1962
1963 if (idx == 0) {
1964 /* First save what we may have and clean up */
1965 if (new_ce) {
1966 tbl[tbl_idx] = new_ce;
1967 __iommu_flush_cache(iommu, new_ce,
1968 VTD_PAGE_SIZE);
1969 pos = 1;
1970 }
1971
1972 if (old_ce)
1973 memunmap(old_ce);
1974
1975 ret = 0;
1976 if (devfn < 0x80)
1977 old_ce_phys = root_entry_lctp(&re);
1978 else
1979 old_ce_phys = root_entry_uctp(&re);
1980
1981 if (!old_ce_phys) {
1982 if (ext && devfn == 0) {
1983 /* No LCTP, try UCTP */
1984 devfn = 0x7f;
1985 continue;
1986 } else {
1987 goto out;
1988 }
1989 }
1990
1991 ret = -ENOMEM;
1992 old_ce = memremap(old_ce_phys, PAGE_SIZE,
1993 MEMREMAP_WB);
1994 if (!old_ce)
1995 goto out;
1996
1997 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
1998 if (!new_ce)
1999 goto out_unmap;
2000
2001 ret = 0;
2002 }
2003
2004 /* Now copy the context entry */
2005 memcpy(&ce, old_ce + idx, sizeof(ce));
2006
2007 if (!context_present(&ce))
2008 continue;
2009
2010 did = context_domain_id(&ce);
2011 if (did >= 0 && did < cap_ndoms(iommu->cap))
2012 set_bit(did, iommu->domain_ids);
2013
2014 set_context_copied(iommu, bus, devfn);
2015 new_ce[idx] = ce;
2016 }
2017
2018 tbl[tbl_idx + pos] = new_ce;
2019
2020 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2021
2022 out_unmap:
2023 memunmap(old_ce);
2024
2025 out:
2026 return ret;
2027 }
2028
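/*
 * Copy the root and context tables left programmed by the previous kernel
 * (typically when booting a kdump kernel with translation pre-enabled) so
 * that in-flight DMA keeps working. Bails out if the old and new tables
 * disagree on legacy vs. scalable mode, since flipping that bit would
 * require disabling translation.
 */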
2029 static int copy_translation_tables(struct intel_iommu *iommu)
2030 {
2031 struct context_entry **ctxt_tbls;
2032 struct root_entry *old_rt;
2033 phys_addr_t old_rt_phys;
2034 int ctxt_table_entries;
2035 u64 rtaddr_reg;
2036 int bus, ret;
2037 bool new_ext, ext;
2038
2039 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2040 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2041 new_ext = !!sm_supported(iommu);
2042
2043 /*
2044 * The RTT bit can only be changed when translation is disabled,
2045 * but disabling translation would open a window for data
2046 * corruption. So bail out and don't copy anything if we would
2047 * have to change the bit.
2048 */
2049 if (new_ext != ext)
2050 return -EINVAL;
2051
2052 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2053 if (!iommu->copied_tables)
2054 return -ENOMEM;
2055
2056 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2057 if (!old_rt_phys)
2058 return -EINVAL;
2059
2060 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2061 if (!old_rt)
2062 return -ENOMEM;
2063
2064 /* This is too big for the stack - allocate it from slab */
2065 ctxt_table_entries = ext ? 512 : 256;
2066 ret = -ENOMEM;
2067 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2068 if (!ctxt_tbls)
2069 goto out_unmap;
2070
2071 for (bus = 0; bus < 256; bus++) {
2072 ret = copy_context_table(iommu, &old_rt[bus],
2073 ctxt_tbls, bus, ext);
2074 if (ret) {
2075 pr_err("%s: Failed to copy context table for bus %d\n",
2076 iommu->name, bus);
2077 continue;
2078 }
2079 }
2080
2081 spin_lock(&iommu->lock);
2082
2083 /* Context tables are copied, now write them to the root_entry table */
2084 for (bus = 0; bus < 256; bus++) {
2085 int idx = ext ? bus * 2 : bus;
2086 u64 val;
2087
2088 if (ctxt_tbls[idx]) {
2089 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2090 iommu->root_entry[bus].lo = val;
2091 }
2092
2093 if (!ext || !ctxt_tbls[idx + 1])
2094 continue;
2095
2096 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2097 iommu->root_entry[bus].hi = val;
2098 }
2099
2100 spin_unlock(&iommu->lock);
2101
2102 kfree(ctxt_tbls);
2103
2104 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2105
2106 ret = 0;
2107
2108 out_unmap:
2109 memunmap(old_rt);
2110
2111 return ret;
2112 }
2113
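/*
 * Boot-time initialization of all DMAR units: set up queued invalidation,
 * domain IDs and root entries, optionally copy translation tables from the
 * previous kernel, then install the root entries and enable the page
 * request queue and fault interrupt on each active IOMMU.
 */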
2114 static int __init init_dmars(void)
2115 {
2116 struct dmar_drhd_unit *drhd;
2117 struct intel_iommu *iommu;
2118 int ret;
2119
2120 for_each_iommu(iommu, drhd) {
2121 if (drhd->ignored) {
2122 iommu_disable_translation(iommu);
2123 continue;
2124 }
2125
2126 /*
2127 * Find the max pasid size of all IOMMUs in the system.
2128 * We need to ensure the system pasid table is no bigger
2129 * than the smallest supported.
2130 */
2131 if (pasid_supported(iommu)) {
2132 u32 temp = 2 << ecap_pss(iommu->ecap);
2133
2134 intel_pasid_max_id = min_t(u32, temp,
2135 intel_pasid_max_id);
2136 }
2137
2138 intel_iommu_init_qi(iommu);
2139
2140 ret = iommu_init_domains(iommu);
2141 if (ret)
2142 goto free_iommu;
2143
2144 init_translation_status(iommu);
2145
2146 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2147 iommu_disable_translation(iommu);
2148 clear_translation_pre_enabled(iommu);
2149 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2150 iommu->name);
2151 }
2152
2153 /*
2154 * TBD:
2155 * we could share the same root & context tables
2156 * among all IOMMUs. Needs to be split out later.
2157 */
2158 ret = iommu_alloc_root_entry(iommu);
2159 if (ret)
2160 goto free_iommu;
2161
2162 if (translation_pre_enabled(iommu)) {
2163 pr_info("Translation already enabled - trying to copy translation structures\n");
2164
2165 ret = copy_translation_tables(iommu);
2166 if (ret) {
2167 /*
2168 * We found the IOMMU with translation
2169 * enabled - but failed to copy over the
2170 * old root-entry table. Try to proceed
2171 * by disabling translation now and
2172 * allocating a clean root-entry table.
2173 * This might cause DMAR faults, but
2174 * probably the dump will still succeed.
2175 */
2176 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2177 iommu->name);
2178 iommu_disable_translation(iommu);
2179 clear_translation_pre_enabled(iommu);
2180 } else {
2181 pr_info("Copied translation tables from previous kernel for %s\n",
2182 iommu->name);
2183 }
2184 }
2185
2186 intel_svm_check(iommu);
2187 }
2188
2189 /*
2190 * Now that qi is enabled on all iommus, set the root entry and flush
2191 * caches. This is required on some Intel X58 chipsets, otherwise the
2192 * flush_context function will loop forever and the boot hangs.
2193 */
2194 for_each_active_iommu(iommu, drhd) {
2195 iommu_flush_write_buffer(iommu);
2196 iommu_set_root_entry(iommu);
2197 }
2198
2199 check_tylersburg_isoch();
2200
2201 /*
2202 * for each drhd
2203 * enable fault log
2204 * global invalidate context cache
2205 * global invalidate iotlb
2206 * enable translation
2207 */
2208 for_each_iommu(iommu, drhd) {
2209 if (drhd->ignored) {
2210 /*
2211 * we always have to disable PMRs or DMA may fail on
2212 * this device
2213 */
2214 if (force_on)
2215 iommu_disable_protect_mem_regions(iommu);
2216 continue;
2217 }
2218
2219 iommu_flush_write_buffer(iommu);
2220
2221 if (ecap_prs(iommu->ecap)) {
2222 /*
2223 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2224 * could cause a lock race, so drop the lock around the call.
2225 */
2226 up_write(&dmar_global_lock);
2227 ret = intel_iommu_enable_prq(iommu);
2228 down_write(&dmar_global_lock);
2229 if (ret)
2230 goto free_iommu;
2231 }
2232
2233 ret = dmar_set_interrupt(iommu);
2234 if (ret)
2235 goto free_iommu;
2236 }
2237
2238 return 0;
2239
2240 free_iommu:
2241 for_each_active_iommu(iommu, drhd) {
2242 disable_dmar_iommu(iommu);
2243 free_dmar_iommu(iommu);
2244 }
2245
2246 return ret;
2247 }
2248
2249 static void __init init_no_remapping_devices(void)
2250 {
2251 struct dmar_drhd_unit *drhd;
2252 struct device *dev;
2253 int i;
2254
2255 for_each_drhd_unit(drhd) {
2256 if (!drhd->include_all) {
2257 for_each_active_dev_scope(drhd->devices,
2258 drhd->devices_cnt, i, dev)
2259 break;
2260 /* ignore DMAR unit if no devices exist */
2261 if (i == drhd->devices_cnt)
2262 drhd->ignored = 1;
2263 }
2264 }
2265
2266 for_each_active_drhd_unit(drhd) {
2267 if (drhd->include_all)
2268 continue;
2269
2270 for_each_active_dev_scope(drhd->devices,
2271 drhd->devices_cnt, i, dev)
2272 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2273 break;
2274 if (i < drhd->devices_cnt)
2275 continue;
2276
2277 /* This IOMMU has *only* gfx devices. Either bypass it or
2278 set the gfx_dedicated flag, as appropriate */
2279 drhd->gfx_dedicated = 1;
2280 if (disable_igfx_iommu)
2281 drhd->ignored = 1;
2282 }
2283 }
2284
2285 #ifdef CONFIG_SUSPEND
2286 static int init_iommu_hw(void)
2287 {
2288 struct dmar_drhd_unit *drhd;
2289 struct intel_iommu *iommu = NULL;
2290 int ret;
2291
2292 for_each_active_iommu(iommu, drhd) {
2293 if (iommu->qi) {
2294 ret = dmar_reenable_qi(iommu);
2295 if (ret)
2296 return ret;
2297 }
2298 }
2299
2300 for_each_iommu(iommu, drhd) {
2301 if (drhd->ignored) {
2302 /*
2303 * we always have to disable PMRs or DMA may fail on
2304 * this device
2305 */
2306 if (force_on)
2307 iommu_disable_protect_mem_regions(iommu);
2308 continue;
2309 }
2310
2311 iommu_flush_write_buffer(iommu);
2312 iommu_set_root_entry(iommu);
2313 iommu_enable_translation(iommu);
2314 iommu_disable_protect_mem_regions(iommu);
2315 }
2316
2317 return 0;
2318 }
2319
2320 static void iommu_flush_all(void)
2321 {
2322 struct dmar_drhd_unit *drhd;
2323 struct intel_iommu *iommu;
2324
2325 for_each_active_iommu(iommu, drhd) {
2326 iommu->flush.flush_context(iommu, 0, 0, 0,
2327 DMA_CCMD_GLOBAL_INVL);
2328 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2329 DMA_TLB_GLOBAL_FLUSH);
2330 }
2331 }
2332
2333 static int iommu_suspend(void)
2334 {
2335 struct dmar_drhd_unit *drhd;
2336 struct intel_iommu *iommu = NULL;
2337 unsigned long flag;
2338
2339 iommu_flush_all();
2340
2341 for_each_active_iommu(iommu, drhd) {
2342 iommu_disable_translation(iommu);
2343
2344 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2345
2346 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2347 readl(iommu->reg + DMAR_FECTL_REG);
2348 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2349 readl(iommu->reg + DMAR_FEDATA_REG);
2350 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2351 readl(iommu->reg + DMAR_FEADDR_REG);
2352 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2353 readl(iommu->reg + DMAR_FEUADDR_REG);
2354
2355 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2356 }
2357 return 0;
2358 }
2359
2360 static void iommu_resume(void)
2361 {
2362 struct dmar_drhd_unit *drhd;
2363 struct intel_iommu *iommu = NULL;
2364 unsigned long flag;
2365
2366 if (init_iommu_hw()) {
2367 if (force_on)
2368 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2369 else
2370 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2371 return;
2372 }
2373
2374 for_each_active_iommu(iommu, drhd) {
2375
2376 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2377
2378 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2379 iommu->reg + DMAR_FECTL_REG);
2380 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2381 iommu->reg + DMAR_FEDATA_REG);
2382 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2383 iommu->reg + DMAR_FEADDR_REG);
2384 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2385 iommu->reg + DMAR_FEUADDR_REG);
2386
2387 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2388 }
2389 }
2390
2391 static struct syscore_ops iommu_syscore_ops = {
2392 .resume = iommu_resume,
2393 .suspend = iommu_suspend,
2394 };
2395
2396 static void __init init_iommu_pm_ops(void)
2397 {
2398 register_syscore_ops(&iommu_syscore_ops);
2399 }
2400
2401 #else
2402 static inline void init_iommu_pm_ops(void) {}
2403 #endif /* CONFIG_SUSPEND */
2404
2405 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2406 {
2407 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2408 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2409 rmrr->end_address <= rmrr->base_address ||
2410 arch_rmrr_sanity_check(rmrr))
2411 return -EINVAL;
2412
2413 return 0;
2414 }
2415
2416 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2417 {
2418 struct acpi_dmar_reserved_memory *rmrr;
2419 struct dmar_rmrr_unit *rmrru;
2420
2421 rmrr = (struct acpi_dmar_reserved_memory *)header;
2422 if (rmrr_sanity_check(rmrr)) {
2423 pr_warn(FW_BUG
2424 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2425 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2426 rmrr->base_address, rmrr->end_address,
2427 dmi_get_system_info(DMI_BIOS_VENDOR),
2428 dmi_get_system_info(DMI_BIOS_VERSION),
2429 dmi_get_system_info(DMI_PRODUCT_VERSION));
2430 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2431 }
2432
2433 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2434 if (!rmrru)
2435 goto out;
2436
2437 rmrru->hdr = header;
2438
2439 rmrru->base_address = rmrr->base_address;
2440 rmrru->end_address = rmrr->end_address;
2441
2442 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2443 ((void *)rmrr) + rmrr->header.length,
2444 &rmrru->devices_cnt);
2445 if (rmrru->devices_cnt && rmrru->devices == NULL)
2446 goto free_rmrru;
2447
2448 list_add(&rmrru->list, &dmar_rmrr_units);
2449
2450 return 0;
2451 free_rmrru:
2452 kfree(rmrru);
2453 out:
2454 return -ENOMEM;
2455 }
2456
2457 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2458 {
2459 struct dmar_atsr_unit *atsru;
2460 struct acpi_dmar_atsr *tmp;
2461
2462 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2463 dmar_rcu_check()) {
2464 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2465 if (atsr->segment != tmp->segment)
2466 continue;
2467 if (atsr->header.length != tmp->header.length)
2468 continue;
2469 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2470 return atsru;
2471 }
2472
2473 return NULL;
2474 }
2475
2476 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2477 {
2478 struct acpi_dmar_atsr *atsr;
2479 struct dmar_atsr_unit *atsru;
2480
2481 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2482 return 0;
2483
2484 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2485 atsru = dmar_find_atsr(atsr);
2486 if (atsru)
2487 return 0;
2488
2489 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2490 if (!atsru)
2491 return -ENOMEM;
2492
2493 /*
2494 * If memory is allocated from slab by ACPI _DSM method, we need to
2495 * copy the memory content because the memory buffer will be freed
2496 * on return.
2497 */
2498 atsru->hdr = (void *)(atsru + 1);
2499 memcpy(atsru->hdr, hdr, hdr->length);
2500 atsru->include_all = atsr->flags & 0x1;
2501 if (!atsru->include_all) {
2502 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2503 (void *)atsr + atsr->header.length,
2504 &atsru->devices_cnt);
2505 if (atsru->devices_cnt && atsru->devices == NULL) {
2506 kfree(atsru);
2507 return -ENOMEM;
2508 }
2509 }
2510
2511 list_add_rcu(&atsru->list, &dmar_atsr_units);
2512
2513 return 0;
2514 }
2515
2516 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2517 {
2518 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2519 kfree(atsru);
2520 }
2521
2522 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2523 {
2524 struct acpi_dmar_atsr *atsr;
2525 struct dmar_atsr_unit *atsru;
2526
2527 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2528 atsru = dmar_find_atsr(atsr);
2529 if (atsru) {
2530 list_del_rcu(&atsru->list);
2531 synchronize_rcu();
2532 intel_iommu_free_atsr(atsru);
2533 }
2534
2535 return 0;
2536 }
2537
2538 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2539 {
2540 int i;
2541 struct device *dev;
2542 struct acpi_dmar_atsr *atsr;
2543 struct dmar_atsr_unit *atsru;
2544
2545 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2546 atsru = dmar_find_atsr(atsr);
2547 if (!atsru)
2548 return 0;
2549
2550 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2551 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2552 i, dev)
2553 return -EBUSY;
2554 }
2555
2556 return 0;
2557 }
2558
2559 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2560 {
2561 struct dmar_satc_unit *satcu;
2562 struct acpi_dmar_satc *tmp;
2563
2564 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2565 dmar_rcu_check()) {
2566 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2567 if (satc->segment != tmp->segment)
2568 continue;
2569 if (satc->header.length != tmp->header.length)
2570 continue;
2571 if (memcmp(satc, tmp, satc->header.length) == 0)
2572 return satcu;
2573 }
2574
2575 return NULL;
2576 }
2577
2578 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2579 {
2580 struct acpi_dmar_satc *satc;
2581 struct dmar_satc_unit *satcu;
2582
2583 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2584 return 0;
2585
2586 satc = container_of(hdr, struct acpi_dmar_satc, header);
2587 satcu = dmar_find_satc(satc);
2588 if (satcu)
2589 return 0;
2590
2591 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2592 if (!satcu)
2593 return -ENOMEM;
2594
2595 satcu->hdr = (void *)(satcu + 1);
2596 memcpy(satcu->hdr, hdr, hdr->length);
2597 satcu->atc_required = satc->flags & 0x1;
2598 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2599 (void *)satc + satc->header.length,
2600 &satcu->devices_cnt);
2601 if (satcu->devices_cnt && !satcu->devices) {
2602 kfree(satcu);
2603 return -ENOMEM;
2604 }
2605 list_add_rcu(&satcu->list, &dmar_satc_units);
2606
2607 return 0;
2608 }
2609
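/*
 * Bring up a hot-added DMAR unit: allocate its domain IDs and root entry
 * and, unless the unit is ignored, enable queued invalidation, the page
 * request queue (if supported), the fault interrupt and finally
 * translation.
 */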
2610 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2611 {
2612 struct intel_iommu *iommu = dmaru->iommu;
2613 int ret;
2614
2615 /*
2616 * Disable translation if already enabled prior to OS handover.
2617 */
2618 if (iommu->gcmd & DMA_GCMD_TE)
2619 iommu_disable_translation(iommu);
2620
2621 ret = iommu_init_domains(iommu);
2622 if (ret == 0)
2623 ret = iommu_alloc_root_entry(iommu);
2624 if (ret)
2625 goto out;
2626
2627 intel_svm_check(iommu);
2628
2629 if (dmaru->ignored) {
2630 /*
2631 * we always have to disable PMRs or DMA may fail on this device
2632 */
2633 if (force_on)
2634 iommu_disable_protect_mem_regions(iommu);
2635 return 0;
2636 }
2637
2638 intel_iommu_init_qi(iommu);
2639 iommu_flush_write_buffer(iommu);
2640
2641 if (ecap_prs(iommu->ecap)) {
2642 ret = intel_iommu_enable_prq(iommu);
2643 if (ret)
2644 goto disable_iommu;
2645 }
2646
2647 ret = dmar_set_interrupt(iommu);
2648 if (ret)
2649 goto disable_iommu;
2650
2651 iommu_set_root_entry(iommu);
2652 iommu_enable_translation(iommu);
2653
2654 iommu_disable_protect_mem_regions(iommu);
2655 return 0;
2656
2657 disable_iommu:
2658 disable_dmar_iommu(iommu);
2659 out:
2660 free_dmar_iommu(iommu);
2661 return ret;
2662 }
2663
2664 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2665 {
2666 int ret = 0;
2667 struct intel_iommu *iommu = dmaru->iommu;
2668
2669 if (!intel_iommu_enabled)
2670 return 0;
2671 if (iommu == NULL)
2672 return -EINVAL;
2673
2674 if (insert) {
2675 ret = intel_iommu_add(dmaru);
2676 } else {
2677 disable_dmar_iommu(iommu);
2678 free_dmar_iommu(iommu);
2679 }
2680
2681 return ret;
2682 }
2683
2684 static void intel_iommu_free_dmars(void)
2685 {
2686 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2687 struct dmar_atsr_unit *atsru, *atsr_n;
2688 struct dmar_satc_unit *satcu, *satc_n;
2689
2690 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2691 list_del(&rmrru->list);
2692 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2693 kfree(rmrru);
2694 }
2695
2696 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2697 list_del(&atsru->list);
2698 intel_iommu_free_atsr(atsru);
2699 }
2700 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2701 list_del(&satcu->list);
2702 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2703 kfree(satcu);
2704 }
2705 }
2706
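/*
 * Return the SATC unit whose device scope lists @dev (or its physical
 * function, for VFs), or NULL if the device appears in no SATC.
 */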
2707 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2708 {
2709 struct dmar_satc_unit *satcu;
2710 struct acpi_dmar_satc *satc;
2711 struct device *tmp;
2712 int i;
2713
2714 dev = pci_physfn(dev);
2715 rcu_read_lock();
2716
2717 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2718 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2719 if (satc->segment != pci_domain_nr(dev->bus))
2720 continue;
2721 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2722 if (to_pci_dev(tmp) == dev)
2723 goto out;
2724 }
2725 satcu = NULL;
2726 out:
2727 rcu_read_unlock();
2728 return satcu;
2729 }
2730
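/*
 * Decide whether ATS may be enabled for @dev: trust the SATC table when
 * the device is listed there; otherwise walk up to the root port and
 * match it against the ATSR units of the device's segment.
 */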
2731 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2732 {
2733 int i, ret = 1;
2734 struct pci_bus *bus;
2735 struct pci_dev *bridge = NULL;
2736 struct device *tmp;
2737 struct acpi_dmar_atsr *atsr;
2738 struct dmar_atsr_unit *atsru;
2739 struct dmar_satc_unit *satcu;
2740
2741 dev = pci_physfn(dev);
2742 satcu = dmar_find_matched_satc_unit(dev);
2743 if (satcu)
2744 /*
2745 * This device supports ATS as it is in SATC table.
2746 * When IOMMU is in legacy mode, enabling ATS is done
2747 * automatically by HW for the device that requires
2748 * ATS, hence the OS should not enable ATS on this device
2749 * to avoid duplicated TLB invalidation.
2750 */
2751 return !(satcu->atc_required && !sm_supported(iommu));
2752
2753 for (bus = dev->bus; bus; bus = bus->parent) {
2754 bridge = bus->self;
2755 /* If it's an integrated device, allow ATS */
2756 if (!bridge)
2757 return 1;
2758 /* Connected via non-PCIe: no ATS */
2759 if (!pci_is_pcie(bridge) ||
2760 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2761 return 0;
2762 /* If we found the root port, look it up in the ATSR */
2763 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2764 break;
2765 }
2766
2767 rcu_read_lock();
2768 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2769 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2770 if (atsr->segment != pci_domain_nr(dev->bus))
2771 continue;
2772
2773 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2774 if (tmp == &bridge->dev)
2775 goto out;
2776
2777 if (atsru->include_all)
2778 goto out;
2779 }
2780 ret = 0;
2781 out:
2782 rcu_read_unlock();
2783
2784 return ret;
2785 }
2786
2787 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2788 {
2789 int ret;
2790 struct dmar_rmrr_unit *rmrru;
2791 struct dmar_atsr_unit *atsru;
2792 struct dmar_satc_unit *satcu;
2793 struct acpi_dmar_atsr *atsr;
2794 struct acpi_dmar_reserved_memory *rmrr;
2795 struct acpi_dmar_satc *satc;
2796
2797 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2798 return 0;
2799
2800 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2801 rmrr = container_of(rmrru->hdr,
2802 struct acpi_dmar_reserved_memory, header);
2803 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2804 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2805 ((void *)rmrr) + rmrr->header.length,
2806 rmrr->segment, rmrru->devices,
2807 rmrru->devices_cnt);
2808 if (ret < 0)
2809 return ret;
2810 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2811 dmar_remove_dev_scope(info, rmrr->segment,
2812 rmrru->devices, rmrru->devices_cnt);
2813 }
2814 }
2815
2816 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2817 if (atsru->include_all)
2818 continue;
2819
2820 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2821 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2822 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2823 (void *)atsr + atsr->header.length,
2824 atsr->segment, atsru->devices,
2825 atsru->devices_cnt);
2826 if (ret > 0)
2827 break;
2828 else if (ret < 0)
2829 return ret;
2830 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2831 if (dmar_remove_dev_scope(info, atsr->segment,
2832 atsru->devices, atsru->devices_cnt))
2833 break;
2834 }
2835 }
2836 list_for_each_entry(satcu, &dmar_satc_units, list) {
2837 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2838 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2839 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2840 (void *)satc + satc->header.length,
2841 satc->segment, satcu->devices,
2842 satcu->devices_cnt);
2843 if (ret > 0)
2844 break;
2845 else if (ret < 0)
2846 return ret;
2847 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2848 if (dmar_remove_dev_scope(info, satc->segment,
2849 satcu->devices, satcu->devices_cnt))
2850 break;
2851 }
2852 }
2853
2854 return 0;
2855 }
2856
2857 static void intel_disable_iommus(void)
2858 {
2859 struct intel_iommu *iommu = NULL;
2860 struct dmar_drhd_unit *drhd;
2861
2862 for_each_iommu(iommu, drhd)
2863 iommu_disable_translation(iommu);
2864 }
2865
2866 void intel_iommu_shutdown(void)
2867 {
2868 struct dmar_drhd_unit *drhd;
2869 struct intel_iommu *iommu = NULL;
2870
2871 if (no_iommu || dmar_disabled)
2872 return;
2873
2874 down_write(&dmar_global_lock);
2875
2876 /* Disable PMRs explicitly here. */
2877 for_each_iommu(iommu, drhd)
2878 iommu_disable_protect_mem_regions(iommu);
2879
2880 /* Make sure the IOMMUs are switched off */
2881 intel_disable_iommus();
2882
2883 up_write(&dmar_global_lock);
2884 }
2885
2886 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2887 {
2888 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2889
2890 return container_of(iommu_dev, struct intel_iommu, iommu);
2891 }
2892
2893 static ssize_t version_show(struct device *dev,
2894 struct device_attribute *attr, char *buf)
2895 {
2896 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2897 u32 ver = readl(iommu->reg + DMAR_VER_REG);
2898 return sysfs_emit(buf, "%d:%d\n",
2899 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2900 }
2901 static DEVICE_ATTR_RO(version);
2902
2903 static ssize_t address_show(struct device *dev,
2904 struct device_attribute *attr, char *buf)
2905 {
2906 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2907 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2908 }
2909 static DEVICE_ATTR_RO(address);
2910
2911 static ssize_t cap_show(struct device *dev,
2912 struct device_attribute *attr, char *buf)
2913 {
2914 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2915 return sysfs_emit(buf, "%llx\n", iommu->cap);
2916 }
2917 static DEVICE_ATTR_RO(cap);
2918
2919 static ssize_t ecap_show(struct device *dev,
2920 struct device_attribute *attr, char *buf)
2921 {
2922 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2923 return sysfs_emit(buf, "%llx\n", iommu->ecap);
2924 }
2925 static DEVICE_ATTR_RO(ecap);
2926
2927 static ssize_t domains_supported_show(struct device *dev,
2928 struct device_attribute *attr, char *buf)
2929 {
2930 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2931 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2932 }
2933 static DEVICE_ATTR_RO(domains_supported);
2934
2935 static ssize_t domains_used_show(struct device *dev,
2936 struct device_attribute *attr, char *buf)
2937 {
2938 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2939 return sysfs_emit(buf, "%d\n",
2940 bitmap_weight(iommu->domain_ids,
2941 cap_ndoms(iommu->cap)));
2942 }
2943 static DEVICE_ATTR_RO(domains_used);
2944
2945 static struct attribute *intel_iommu_attrs[] = {
2946 &dev_attr_version.attr,
2947 &dev_attr_address.attr,
2948 &dev_attr_cap.attr,
2949 &dev_attr_ecap.attr,
2950 &dev_attr_domains_supported.attr,
2951 &dev_attr_domains_used.attr,
2952 NULL,
2953 };
2954
2955 static struct attribute_group intel_iommu_group = {
2956 .name = "intel-iommu",
2957 .attrs = intel_iommu_attrs,
2958 };
2959
2960 const struct attribute_group *intel_iommu_groups[] = {
2961 &intel_iommu_group,
2962 NULL,
2963 };
2964
2965 static bool has_external_pci(void)
2966 {
2967 struct pci_dev *pdev = NULL;
2968
2969 for_each_pci_dev(pdev)
2970 if (pdev->external_facing) {
2971 pci_dev_put(pdev);
2972 return true;
2973 }
2974
2975 return false;
2976 }
2977
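/*
 * If the platform opted in to DMA protection via the DMAR table and an
 * external-facing PCI device is present, force the IOMMU on even when it
 * was disabled on the command line, defaulting to identity maps for
 * devices not marked untrusted.
 */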
2978 static int __init platform_optin_force_iommu(void)
2979 {
2980 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2981 return 0;
2982
2983 if (no_iommu || dmar_disabled)
2984 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2985
2986 /*
2987 * If Intel-IOMMU is disabled by default, we will apply identity
2988 * map for all devices except those marked as being untrusted.
2989 */
2990 if (dmar_disabled)
2991 iommu_set_default_passthrough(false);
2992
2993 dmar_disabled = 0;
2994 no_iommu = 0;
2995
2996 return 1;
2997 }
2998
2999 static int __init probe_acpi_namespace_devices(void)
3000 {
3001 struct dmar_drhd_unit *drhd;
3002 /* To avoid a -Wunused-but-set-variable warning. */
3003 struct intel_iommu *iommu __maybe_unused;
3004 struct device *dev;
3005 int i, ret = 0;
3006
3007 for_each_active_iommu(iommu, drhd) {
3008 for_each_active_dev_scope(drhd->devices,
3009 drhd->devices_cnt, i, dev) {
3010 struct acpi_device_physical_node *pn;
3011 struct acpi_device *adev;
3012
3013 if (dev->bus != &acpi_bus_type)
3014 continue;
3015
3016 adev = to_acpi_device(dev);
3017 mutex_lock(&adev->physical_node_lock);
3018 list_for_each_entry(pn,
3019 &adev->physical_node_list, node) {
3020 ret = iommu_probe_device(pn->dev);
3021 if (ret)
3022 break;
3023 }
3024 mutex_unlock(&adev->physical_node_lock);
3025
3026 if (ret)
3027 return ret;
3028 }
3029 }
3030
3031 return 0;
3032 }
3033
3034 static __init int tboot_force_iommu(void)
3035 {
3036 if (!tboot_enabled())
3037 return 0;
3038
3039 if (no_iommu || dmar_disabled)
3040 pr_warn("Forcing Intel-IOMMU to enabled\n");
3041
3042 dmar_disabled = 0;
3043 no_iommu = 0;
3044
3045 return 1;
3046 }
3047
3048 int __init intel_iommu_init(void)
3049 {
3050 int ret = -ENODEV;
3051 struct dmar_drhd_unit *drhd;
3052 struct intel_iommu *iommu;
3053
3054 /*
3055 * Intel IOMMU is required for a TXT/tboot launch or platform
3056 * opt in, so enforce that.
3057 */
3058 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3059 platform_optin_force_iommu();
3060
3061 down_write(&dmar_global_lock);
3062 if (dmar_table_init()) {
3063 if (force_on)
3064 panic("tboot: Failed to initialize DMAR table\n");
3065 goto out_free_dmar;
3066 }
3067
3068 if (dmar_dev_scope_init() < 0) {
3069 if (force_on)
3070 panic("tboot: Failed to initialize DMAR device scope\n");
3071 goto out_free_dmar;
3072 }
3073
3074 up_write(&dmar_global_lock);
3075
3076 /*
3077 * The bus notifier takes the dmar_global_lock, so lockdep will
3078 * complain later when we register it under the lock.
3079 */
3080 dmar_register_bus_notifier();
3081
3082 down_write(&dmar_global_lock);
3083
3084 if (!no_iommu)
3085 intel_iommu_debugfs_init();
3086
3087 if (no_iommu || dmar_disabled) {
3088 /*
3089 * We exit the function here to ensure IOMMU's remapping and
3090 * mempool aren't set up, which means that the IOMMU's PMRs
3091 * won't be disabled via the call to init_dmars(). So disable
3092 * it explicitly here. The PMRs were setup by tboot prior to
3093 * calling SENTER, but the kernel is expected to reset/tear
3094 * down the PMRs.
3095 */
3096 if (intel_iommu_tboot_noforce) {
3097 for_each_iommu(iommu, drhd)
3098 iommu_disable_protect_mem_regions(iommu);
3099 }
3100
3101 /*
3102 * Make sure the IOMMUs are switched off, even when we
3103 * boot into a kexec kernel and the previous kernel left
3104 * them enabled
3105 */
3106 intel_disable_iommus();
3107 goto out_free_dmar;
3108 }
3109
3110 if (list_empty(&dmar_rmrr_units))
3111 pr_info("No RMRR found\n");
3112
3113 if (list_empty(&dmar_atsr_units))
3114 pr_info("No ATSR found\n");
3115
3116 if (list_empty(&dmar_satc_units))
3117 pr_info("No SATC found\n");
3118
3119 init_no_remapping_devices();
3120
3121 ret = init_dmars();
3122 if (ret) {
3123 if (force_on)
3124 panic("tboot: Failed to initialize DMARs\n");
3125 pr_err("Initialization failed\n");
3126 goto out_free_dmar;
3127 }
3128 up_write(&dmar_global_lock);
3129
3130 init_iommu_pm_ops();
3131
3132 down_read(&dmar_global_lock);
3133 for_each_active_iommu(iommu, drhd) {
3134 /*
3135 * The flush queue implementation does not perform
3136 * page-selective invalidations that are required for efficient
3137 * TLB flushes in virtual environments. The benefit of batching
3138 * is likely to be much lower than the overhead of synchronizing
3139 * the virtual and physical IOMMU page-tables.
3140 */
3141 if (cap_caching_mode(iommu->cap) &&
3142 !first_level_by_default(iommu)) {
3143 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3144 iommu_set_dma_strict();
3145 }
3146 iommu_device_sysfs_add(&iommu->iommu, NULL,
3147 intel_iommu_groups,
3148 "%s", iommu->name);
3149 /*
3150 * The iommu device probe is protected by the iommu_probe_device_lock.
3151 * Release the dmar_global_lock before entering the device probe path
3152 * to avoid unnecessary lock order splat.
3153 */
3154 up_read(&dmar_global_lock);
3155 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3156 down_read(&dmar_global_lock);
3157
3158 iommu_pmu_register(iommu);
3159 }
3160
3161 if (probe_acpi_namespace_devices())
3162 pr_warn("ACPI name space devices didn't probe correctly\n");
3163
3164 /* Finally, we enable the DMA remapping hardware. */
3165 for_each_iommu(iommu, drhd) {
3166 if (!drhd->ignored && !translation_pre_enabled(iommu))
3167 iommu_enable_translation(iommu);
3168
3169 iommu_disable_protect_mem_regions(iommu);
3170 }
3171 up_read(&dmar_global_lock);
3172
3173 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3174
3175 intel_iommu_enabled = 1;
3176
3177 return 0;
3178
3179 out_free_dmar:
3180 intel_iommu_free_dmars();
3181 up_write(&dmar_global_lock);
3182 return ret;
3183 }
3184
3185 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3186 {
3187 struct device_domain_info *info = opaque;
3188
3189 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3190 return 0;
3191 }
3192
3193 /*
3194 * NB - intel-iommu lacks any sort of reference counting for the users of
3195 * dependent devices. If multiple endpoints have intersecting dependent
3196 * devices, unbinding the driver from any one of them will possibly leave
3197 * the others unable to operate.
3198 */
3199 static void domain_context_clear(struct device_domain_info *info)
3200 {
3201 if (!dev_is_pci(info->dev)) {
3202 domain_context_clear_one(info, info->bus, info->devfn);
3203 return;
3204 }
3205
3206 pci_for_each_dma_alias(to_pci_dev(info->dev),
3207 &domain_context_clear_one_cb, info);
3208 }
3209
3210 /*
3211 * Clear the page table pointer in context or pasid table entries so that
3212 * all DMA requests without PASID from the device are blocked. If the page
3213 * table has been set, clean up the data structures.
3214 */
3215 void device_block_translation(struct device *dev)
3216 {
3217 struct device_domain_info *info = dev_iommu_priv_get(dev);
3218 struct intel_iommu *iommu = info->iommu;
3219 unsigned long flags;
3220
3221 if (info->domain)
3222 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3223
3224 iommu_disable_pci_caps(info);
3225 if (!dev_is_real_dma_subdevice(dev)) {
3226 if (sm_supported(iommu))
3227 intel_pasid_tear_down_entry(iommu, dev,
3228 IOMMU_NO_PASID, false);
3229 else
3230 domain_context_clear(info);
3231 }
3232
3233 if (!info->domain)
3234 return;
3235
3236 spin_lock_irqsave(&info->domain->lock, flags);
3237 list_del(&info->link);
3238 spin_unlock_irqrestore(&info->domain->lock, flags);
3239
3240 domain_detach_iommu(info->domain, iommu);
3241 info->domain = NULL;
3242 }
3243
3244 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3245 struct device *dev)
3246 {
3247 device_block_translation(dev);
3248 return 0;
3249 }
3250
3251 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3252 struct device *dev, ioasid_t pasid,
3253 struct iommu_domain *old);
3254
3255 static struct iommu_domain blocking_domain = {
3256 .type = IOMMU_DOMAIN_BLOCKED,
3257 .ops = &(const struct iommu_domain_ops) {
3258 .attach_dev = blocking_domain_attach_dev,
3259 .set_dev_pasid = blocking_domain_set_dev_pasid,
3260 }
3261 };
3262
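/*
 * Number of supported super-page levels: 0 when superpages are disabled,
 * up to 2 (2MiB and 1GiB) for first-stage tables, and whatever the
 * capability register reports for second-stage tables.
 */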
3263 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3264 {
3265 if (!intel_iommu_superpage)
3266 return 0;
3267
3268 if (first_stage)
3269 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3270
3271 return fls(cap_super_page_val(iommu->cap));
3272 }
3273
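/*
 * Allocate and initialize a paging domain for @dev: address width, page
 * size bitmap, IOVA aperture (halved for first-stage translation because
 * input addresses must be canonical) and the top-level page directory.
 */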
3274 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3275 {
3276 struct device_domain_info *info = dev_iommu_priv_get(dev);
3277 struct intel_iommu *iommu = info->iommu;
3278 struct dmar_domain *domain;
3279 int addr_width;
3280
3281 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3282 if (!domain)
3283 return ERR_PTR(-ENOMEM);
3284
3285 INIT_LIST_HEAD(&domain->devices);
3286 INIT_LIST_HEAD(&domain->dev_pasids);
3287 INIT_LIST_HEAD(&domain->cache_tags);
3288 spin_lock_init(&domain->lock);
3289 spin_lock_init(&domain->cache_lock);
3290 xa_init(&domain->iommu_array);
3291
3292 domain->nid = dev_to_node(dev);
3293 domain->use_first_level = first_stage;
3294
3295 /* calculate the address width */
3296 addr_width = agaw_to_width(iommu->agaw);
3297 if (addr_width > cap_mgaw(iommu->cap))
3298 addr_width = cap_mgaw(iommu->cap);
3299 domain->gaw = addr_width;
3300 domain->agaw = iommu->agaw;
3301 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3302
3303 /* iommu memory access coherency */
3304 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3305
3306 /* pagesize bitmap */
3307 domain->domain.pgsize_bitmap = SZ_4K;
3308 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3309 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3310
3311 /*
3312 * IOVA aperture: First-level translation restricts the input-address
3313 * to a canonical address (i.e., address bits 63:N have the same value
3314 * as address bit [N-1], where N is 48-bits with 4-level paging and
3315 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3316 */
3317 domain->domain.geometry.force_aperture = true;
3318 domain->domain.geometry.aperture_start = 0;
3319 if (first_stage)
3320 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3321 else
3322 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3323
3324 /* always allocate the top pgd */
3325 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3326 if (!domain->pgd) {
3327 kfree(domain);
3328 return ERR_PTR(-ENOMEM);
3329 }
3330 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3331
3332 return domain;
3333 }
3334
3335 static struct iommu_domain *
3336 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3337 const struct iommu_user_data *user_data)
3338 {
3339 struct device_domain_info *info = dev_iommu_priv_get(dev);
3340 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3341 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3342 struct intel_iommu *iommu = info->iommu;
3343 struct dmar_domain *dmar_domain;
3344 struct iommu_domain *domain;
3345 bool first_stage;
3346
3347 if (flags &
3348 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3349 return ERR_PTR(-EOPNOTSUPP);
3350 if (nested_parent && !nested_supported(iommu))
3351 return ERR_PTR(-EOPNOTSUPP);
3352 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3353 return ERR_PTR(-EOPNOTSUPP);
3354
3355 /*
3356 * Always allocate the guest compatible page table unless
3357 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3358 * is specified.
3359 */
3360 if (nested_parent || dirty_tracking) {
3361 if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3362 return ERR_PTR(-EOPNOTSUPP);
3363 first_stage = false;
3364 } else {
3365 first_stage = first_level_by_default(iommu);
3366 }
3367
3368 dmar_domain = paging_domain_alloc(dev, first_stage);
3369 if (IS_ERR(dmar_domain))
3370 return ERR_CAST(dmar_domain);
3371 domain = &dmar_domain->domain;
3372 domain->type = IOMMU_DOMAIN_UNMANAGED;
3373 domain->owner = &intel_iommu_ops;
3374 domain->ops = intel_iommu_ops.default_domain_ops;
3375
3376 if (nested_parent) {
3377 dmar_domain->nested_parent = true;
3378 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3379 spin_lock_init(&dmar_domain->s1_lock);
3380 }
3381
3382 if (dirty_tracking) {
3383 if (dmar_domain->use_first_level) {
3384 iommu_domain_free(domain);
3385 return ERR_PTR(-EOPNOTSUPP);
3386 }
3387 domain->dirty_ops = &intel_dirty_ops;
3388 }
3389
3390 return domain;
3391 }
3392
3393 static void intel_iommu_domain_free(struct iommu_domain *domain)
3394 {
3395 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3396
3397 WARN_ON(dmar_domain->nested_parent &&
3398 !list_empty(&dmar_domain->s1_domains));
3399 domain_exit(dmar_domain);
3400 }
3401
3402 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3403 {
3404 struct device_domain_info *info = dev_iommu_priv_get(dev);
3405 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3406 struct intel_iommu *iommu = info->iommu;
3407 int addr_width;
3408
3409 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3410 return -EPERM;
3411
3412 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3413 return -EINVAL;
3414
3415 if (domain->dirty_ops && !ssads_supported(iommu))
3416 return -EINVAL;
3417
3418 if (dmar_domain->iommu_coherency !=
3419 iommu_paging_structure_coherency(iommu))
3420 return -EINVAL;
3421
3422 if (dmar_domain->iommu_superpage !=
3423 iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3424 return -EINVAL;
3425
3426 if (dmar_domain->use_first_level &&
3427 (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3428 return -EINVAL;
3429
3430 /* check if this iommu agaw is sufficient for max mapped address */
3431 addr_width = agaw_to_width(iommu->agaw);
3432 if (addr_width > cap_mgaw(iommu->cap))
3433 addr_width = cap_mgaw(iommu->cap);
3434
3435 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3436 return -EINVAL;
3437
3438 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3439 context_copied(iommu, info->bus, info->devfn))
3440 return intel_pasid_setup_sm_context(dev);
3441
3442 return 0;
3443 }
3444
3445 static int intel_iommu_attach_device(struct iommu_domain *domain,
3446 struct device *dev)
3447 {
3448 int ret;
3449
3450 device_block_translation(dev);
3451
3452 ret = paging_domain_compatible(domain, dev);
3453 if (ret)
3454 return ret;
3455
3456 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3457 }
3458
3459 static int intel_iommu_map(struct iommu_domain *domain,
3460 unsigned long iova, phys_addr_t hpa,
3461 size_t size, int iommu_prot, gfp_t gfp)
3462 {
3463 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3464 u64 max_addr;
3465 int prot = 0;
3466
3467 if (iommu_prot & IOMMU_READ)
3468 prot |= DMA_PTE_READ;
3469 if (iommu_prot & IOMMU_WRITE)
3470 prot |= DMA_PTE_WRITE;
3471 if (dmar_domain->set_pte_snp)
3472 prot |= DMA_PTE_SNP;
3473
3474 max_addr = iova + size;
3475 if (dmar_domain->max_addr < max_addr) {
3476 u64 end;
3477
3478 /* check if minimum agaw is sufficient for mapped address */
3479 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3480 if (end < max_addr) {
3481 pr_err("%s: iommu width (%d) is not "
3482 "sufficient for the mapped address (%llx)\n",
3483 __func__, dmar_domain->gaw, max_addr);
3484 return -EFAULT;
3485 }
3486 dmar_domain->max_addr = max_addr;
3487 }
3488 /* Round up size to next multiple of PAGE_SIZE, if it and
3489 the low bits of hpa would take us onto the next page */
3490 size = aligned_nrpages(hpa, size);
3491 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3492 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3493 }
3494
3495 static int intel_iommu_map_pages(struct iommu_domain *domain,
3496 unsigned long iova, phys_addr_t paddr,
3497 size_t pgsize, size_t pgcount,
3498 int prot, gfp_t gfp, size_t *mapped)
3499 {
3500 unsigned long pgshift = __ffs(pgsize);
3501 size_t size = pgcount << pgshift;
3502 int ret;
3503
3504 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3505 return -EINVAL;
3506
3507 if (!IS_ALIGNED(iova | paddr, pgsize))
3508 return -EINVAL;
3509
3510 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3511 if (!ret && mapped)
3512 *mapped = size;
3513
3514 return ret;
3515 }
3516
3517 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3518 unsigned long iova, size_t size,
3519 struct iommu_iotlb_gather *gather)
3520 {
3521 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3522 unsigned long start_pfn, last_pfn;
3523 int level = 0;
3524
3525 /* Cope with horrid API which requires us to unmap more than the
3526 size argument if it happens to be a large-page mapping. */
3527 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3528 &level, GFP_ATOMIC)))
3529 return 0;
3530
3531 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3532 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3533
3534 start_pfn = iova >> VTD_PAGE_SHIFT;
3535 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3536
3537 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3538
3539 if (dmar_domain->max_addr == iova + size)
3540 dmar_domain->max_addr = iova;
3541
3542 /*
3543 * We do not use page-selective IOTLB invalidation in flush queue,
3544 * so there is no need to track page and sync iotlb.
3545 */
3546 if (!iommu_iotlb_gather_queued(gather))
3547 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3548
3549 return size;
3550 }
3551
3552 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3553 unsigned long iova,
3554 size_t pgsize, size_t pgcount,
3555 struct iommu_iotlb_gather *gather)
3556 {
3557 unsigned long pgshift = __ffs(pgsize);
3558 size_t size = pgcount << pgshift;
3559
3560 return intel_iommu_unmap(domain, iova, size, gather);
3561 }
3562
3563 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3564 struct iommu_iotlb_gather *gather)
3565 {
3566 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3567 gather->end, list_empty(&gather->freelist));
3568 iommu_put_pages_list(&gather->freelist);
3569 }
3570
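/*
 * Walk the domain's page table to translate @iova, keeping the offset
 * within 2MiB/1GiB mappings based on the level of the leaf entry.
 * Returns 0 if no mapping exists.
 */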
3571 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3572 dma_addr_t iova)
3573 {
3574 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3575 struct dma_pte *pte;
3576 int level = 0;
3577 u64 phys = 0;
3578
3579 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3580 GFP_ATOMIC);
3581 if (pte && dma_pte_present(pte))
3582 phys = dma_pte_addr(pte) +
3583 (iova & (BIT_MASK(level_to_offset_bits(level) +
3584 VTD_PAGE_SHIFT) - 1));
3585
3586 return phys;
3587 }
3588
3589 static bool domain_support_force_snooping(struct dmar_domain *domain)
3590 {
3591 struct device_domain_info *info;
3592 bool support = true;
3593
3594 assert_spin_locked(&domain->lock);
3595 list_for_each_entry(info, &domain->devices, link) {
3596 if (!ecap_sc_support(info->iommu->ecap)) {
3597 support = false;
3598 break;
3599 }
3600 }
3601
3602 return support;
3603 }
3604
3605 static void domain_set_force_snooping(struct dmar_domain *domain)
3606 {
3607 struct device_domain_info *info;
3608
3609 assert_spin_locked(&domain->lock);
3610 /*
3611 * Second level page table supports per-PTE snoop control. The
3612 * iommu_map() interface will handle this by setting SNP bit.
3613 */
3614 if (!domain->use_first_level) {
3615 domain->set_pte_snp = true;
3616 return;
3617 }
3618
3619 list_for_each_entry(info, &domain->devices, link)
3620 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3621 IOMMU_NO_PASID);
3622 }
3623
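/*
 * Enforce DMA snooping for the whole domain. This only succeeds if every
 * attached IOMMU supports snoop control and, for second-stage domains, no
 * mappings exist yet, since the SNP bit is applied as PTEs are installed.
 */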
3624 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3625 {
3626 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3627 unsigned long flags;
3628
3629 if (dmar_domain->force_snooping)
3630 return true;
3631
3632 spin_lock_irqsave(&dmar_domain->lock, flags);
3633 if (!domain_support_force_snooping(dmar_domain) ||
3634 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3635 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3636 return false;
3637 }
3638
3639 domain_set_force_snooping(dmar_domain);
3640 dmar_domain->force_snooping = true;
3641 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3642
3643 return true;
3644 }
3645
3646 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3647 {
3648 struct device_domain_info *info = dev_iommu_priv_get(dev);
3649
3650 switch (cap) {
3651 case IOMMU_CAP_CACHE_COHERENCY:
3652 case IOMMU_CAP_DEFERRED_FLUSH:
3653 return true;
3654 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3655 return dmar_platform_optin();
3656 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3657 return ecap_sc_support(info->iommu->ecap);
3658 case IOMMU_CAP_DIRTY_TRACKING:
3659 return ssads_supported(info->iommu);
3660 default:
3661 return false;
3662 }
3663 }
3664
3665 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3666 {
3667 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3668 struct device_domain_info *info;
3669 struct intel_iommu *iommu;
3670 u8 bus, devfn;
3671 int ret;
3672
3673 iommu = device_lookup_iommu(dev, &bus, &devfn);
3674 if (!iommu || !iommu->iommu.ops)
3675 return ERR_PTR(-ENODEV);
3676
3677 info = kzalloc(sizeof(*info), GFP_KERNEL);
3678 if (!info)
3679 return ERR_PTR(-ENOMEM);
3680
3681 if (dev_is_real_dma_subdevice(dev)) {
3682 info->bus = pdev->bus->number;
3683 info->devfn = pdev->devfn;
3684 info->segment = pci_domain_nr(pdev->bus);
3685 } else {
3686 info->bus = bus;
3687 info->devfn = devfn;
3688 info->segment = iommu->segment;
3689 }
3690
3691 info->dev = dev;
3692 info->iommu = iommu;
3693 if (dev_is_pci(dev)) {
3694 if (ecap_dev_iotlb_support(iommu->ecap) &&
3695 pci_ats_supported(pdev) &&
3696 dmar_ats_supported(pdev, iommu)) {
3697 info->ats_supported = 1;
3698 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3699
3700 /*
3701 * For an IOMMU that supports Device-TLB Invalidation
3702 * Throttling (DIT), assign the PF's source-id (PFSID) to
3703 * a VF's invalidation descriptors so that the hardware can
3704 * gauge queue depth at the PF level. If DIT is not
3705 * supported, the PFSID field is reserved and must be 0.
3706 */
3707 if (ecap_dit(iommu->ecap))
3708 info->pfsid = pci_dev_id(pci_physfn(pdev));
3709 info->ats_qdep = pci_ats_queue_depth(pdev);
3710 }
3711 if (sm_supported(iommu)) {
3712 if (pasid_supported(iommu)) {
3713 int features = pci_pasid_features(pdev);
3714
3715 if (features >= 0)
3716 info->pasid_supported = features | 1;
3717 }
3718
3719 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3720 pci_pri_supported(pdev))
3721 info->pri_supported = 1;
3722 }
3723 }
3724
3725 dev_iommu_priv_set(dev, info);
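	/*
	 * Program the ATS Smallest Translation Unit to the VT-d page size
	 * and track the device in the per-IOMMU RID rbtree so it can be
	 * looked up later, e.g. when handling page requests.
	 */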
3726 if (pdev && pci_ats_supported(pdev)) {
3727 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3728 ret = device_rbtree_insert(iommu, info);
3729 if (ret)
3730 goto free;
3731 }
3732
3733 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3734 ret = intel_pasid_alloc_table(dev);
3735 if (ret) {
3736 dev_err(dev, "PASID table allocation failed\n");
3737 goto clear_rbtree;
3738 }
3739
3740 if (!context_copied(iommu, info->bus, info->devfn)) {
3741 ret = intel_pasid_setup_sm_context(dev);
3742 if (ret)
3743 goto free_table;
3744 }
3745 }
3746
3747 intel_iommu_debugfs_create_dev(info);
3748
3749 /*
3750 * The PCIe spec, in its wisdom, declares that the behaviour of the
3751 * device is undefined if you enable PASID support after ATS support.
3752 * So always enable PASID support on devices which have it, even if
3753 * we can't yet know if we're ever going to use it.
3754 */
3755 if (info->pasid_supported &&
3756 !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3757 info->pasid_enabled = 1;
3758
3759 return &iommu->iommu;
3760 free_table:
3761 intel_pasid_free_table(dev);
3762 clear_rbtree:
3763 device_rbtree_remove(info);
3764 free:
3765 kfree(info);
3766
3767 return ERR_PTR(ret);
3768 }
3769
3770 static void intel_iommu_release_device(struct device *dev)
3771 {
3772 struct device_domain_info *info = dev_iommu_priv_get(dev);
3773 struct intel_iommu *iommu = info->iommu;
3774
3775 if (info->pasid_enabled) {
3776 pci_disable_pasid(to_pci_dev(dev));
3777 info->pasid_enabled = 0;
3778 }
3779
3780 mutex_lock(&iommu->iopf_lock);
3781 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3782 device_rbtree_remove(info);
3783 mutex_unlock(&iommu->iopf_lock);
3784
3785 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3786 !context_copied(iommu, info->bus, info->devfn))
3787 intel_pasid_teardown_sm_context(dev);
3788
3789 intel_pasid_free_table(dev);
3790 intel_iommu_debugfs_remove_dev(info);
3791 kfree(info);
3792 set_dma_ops(dev, NULL);
3793 }
3794
3795 static void intel_iommu_get_resv_regions(struct device *device,
3796 struct list_head *head)
3797 {
3798 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3799 struct iommu_resv_region *reg;
3800 struct dmar_rmrr_unit *rmrr;
3801 struct device *i_dev;
3802 int i;
3803
3804 rcu_read_lock();
3805 for_each_rmrr_units(rmrr) {
3806 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3807 i, i_dev) {
3808 struct iommu_resv_region *resv;
3809 enum iommu_resv_type type;
3810 size_t length;
3811
3812 if (i_dev != device &&
3813 !is_downstream_to_pci_bridge(device, i_dev))
3814 continue;
3815
3816 length = rmrr->end_address - rmrr->base_address + 1;
3817
3818 type = device_rmrr_is_relaxable(device) ?
3819 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3820
3821 resv = iommu_alloc_resv_region(rmrr->base_address,
3822 length, prot, type,
3823 GFP_ATOMIC);
3824 if (!resv)
3825 break;
3826
3827 list_add_tail(&resv->list, head);
3828 }
3829 }
3830 rcu_read_unlock();
3831
3832 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3833 if (dev_is_pci(device)) {
3834 struct pci_dev *pdev = to_pci_dev(device);
3835
3836 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3837 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3838 IOMMU_RESV_DIRECT_RELAXABLE,
3839 GFP_KERNEL);
3840 if (reg)
3841 list_add_tail(&reg->list, head);
3842 }
3843 }
3844 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3845
3846 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3847 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3848 0, IOMMU_RESV_MSI, GFP_KERNEL);
3849 if (!reg)
3850 return;
3851 list_add_tail(&reg->list, head);
3852 }
3853
3854 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3855 {
3856 if (dev_is_pci(dev))
3857 return pci_device_group(dev);
3858 return generic_device_group(dev);
3859 }
3860
3861 static int intel_iommu_enable_sva(struct device *dev)
3862 {
3863 struct device_domain_info *info = dev_iommu_priv_get(dev);
3864 struct intel_iommu *iommu;
3865
3866 if (!info || dmar_disabled)
3867 return -EINVAL;
3868
3869 iommu = info->iommu;
3870 if (!iommu)
3871 return -EINVAL;
3872
3873 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
3874 return -ENODEV;
3875
3876 if (!info->pasid_enabled || !info->ats_enabled)
3877 return -EINVAL;
3878
3879 /*
3880 * Devices with device-specific I/O fault handling are not expected
3881 * to support PCI PRI, and the IOMMU has no way to probe such a
3882 * device-specific IOPF capability. Therefore the IOMMU can only
3883 * assume that if the device driver enables SVA on a non-PRI
3884 * device, the driver will handle IOPF in its own way.
3885 */
3886 if (!info->pri_supported)
3887 return 0;
3888
3889 /* Devices supporting PRI should have it enabled. */
3890 if (!info->pri_enabled)
3891 return -EINVAL;
3892
3893 return 0;
3894 }
3895
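/*
 * Toggle the Page Request Enable (PRE) bit in the device's scalable-mode
 * context entry and flush the context cache so the change takes effect.
 */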
3896 static int context_flip_pri(struct device_domain_info *info, bool enable)
3897 {
3898 struct intel_iommu *iommu = info->iommu;
3899 u8 bus = info->bus, devfn = info->devfn;
3900 struct context_entry *context;
3901 u16 did;
3902
3903 spin_lock(&iommu->lock);
3904 if (context_copied(iommu, bus, devfn)) {
3905 spin_unlock(&iommu->lock);
3906 return -EINVAL;
3907 }
3908
3909 context = iommu_context_addr(iommu, bus, devfn, false);
3910 if (!context || !context_present(context)) {
3911 spin_unlock(&iommu->lock);
3912 return -ENODEV;
3913 }
3914 did = context_domain_id(context);
3915
3916 if (enable)
3917 context_set_sm_pre(context);
3918 else
3919 context_clear_sm_pre(context);
3920
3921 if (!ecap_coherent(iommu->ecap))
3922 clflush_cache_range(context, sizeof(*context));
3923 intel_context_flush_present(info, context, did, true);
3924 spin_unlock(&iommu->lock);
3925
3926 return 0;
3927 }
3928
3929 static int intel_iommu_enable_iopf(struct device *dev)
3930 {
3931 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3932 struct device_domain_info *info = dev_iommu_priv_get(dev);
3933 struct intel_iommu *iommu;
3934 int ret;
3935
3936 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3937 return -ENODEV;
3938
3939 if (info->pri_enabled)
3940 return -EBUSY;
3941
3942 iommu = info->iommu;
3943 if (!iommu)
3944 return -EINVAL;
3945
3946 /* PASID is required in PRG Response Message. */
3947 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
3948 return -EINVAL;
3949
3950 ret = pci_reset_pri(pdev);
3951 if (ret)
3952 return ret;
3953
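	/*
	 * Order matters here: the device is added to the page fault queue
	 * and the context entry's PRE bit is set before PRI is enabled on
	 * the device, so no page request can arrive without a handler.
	 */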
3954 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3955 if (ret)
3956 return ret;
3957
3958 ret = context_flip_pri(info, true);
3959 if (ret)
3960 goto err_remove_device;
3961
3962 ret = pci_enable_pri(pdev, PRQ_DEPTH);
3963 if (ret)
3964 goto err_clear_pri;
3965
3966 info->pri_enabled = 1;
3967
3968 return 0;
3969 err_clear_pri:
3970 context_flip_pri(info, false);
3971 err_remove_device:
3972 iopf_queue_remove_device(iommu->iopf_queue, dev);
3973
3974 return ret;
3975 }
3976
3977 static int intel_iommu_disable_iopf(struct device *dev)
3978 {
3979 struct device_domain_info *info = dev_iommu_priv_get(dev);
3980 struct intel_iommu *iommu = info->iommu;
3981
3982 if (!info->pri_enabled)
3983 return -EINVAL;
3984
3985 /* Disable new PRI reception: */
3986 context_flip_pri(info, false);
3987
3988 /*
3989 * Remove device from fault queue and acknowledge all outstanding
3990 * PRQs to the device:
3991 */
3992 iopf_queue_remove_device(iommu->iopf_queue, dev);
3993
3994 /*
3995 * The PCIe spec states that clearing the PRI enable bit stops the
3996 * Page Request Interface from issuing new page requests, but the
3997 * device may still have outstanding requests that have been
3998 * transmitted or are queued for transmission. This is therefore
3999 * expected to be called only after the device driver has stopped
4000 * DMA, all PASIDs have been unbound and the outstanding PRQs drained.
4001 */
4002 pci_disable_pri(to_pci_dev(dev));
4003 info->pri_enabled = 0;
4004
4005 return 0;
4006 }
4007
4008 static int
4009 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4010 {
4011 switch (feat) {
4012 case IOMMU_DEV_FEAT_IOPF:
4013 return intel_iommu_enable_iopf(dev);
4014
4015 case IOMMU_DEV_FEAT_SVA:
4016 return intel_iommu_enable_sva(dev);
4017
4018 default:
4019 return -ENODEV;
4020 }
4021 }
4022
4023 static int
4024 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4025 {
4026 switch (feat) {
4027 case IOMMU_DEV_FEAT_IOPF:
4028 return intel_iommu_disable_iopf(dev);
4029
4030 case IOMMU_DEV_FEAT_SVA:
4031 return 0;
4032
4033 default:
4034 return -ENODEV;
4035 }
4036 }
4037
4038 static bool intel_iommu_is_attach_deferred(struct device *dev)
4039 {
4040 struct device_domain_info *info = dev_iommu_priv_get(dev);
4041
4042 return translation_pre_enabled(info->iommu) && !info->domain;
4043 }
4044
4045 /*
4046 * Check that the device does not live on an external facing PCI port that is
4047 * marked as untrusted. Such devices should not be able to apply quirks and
4048 * thus not be able to bypass the IOMMU restrictions.
4049 */
4050 static bool risky_device(struct pci_dev *pdev)
4051 {
4052 if (pdev->untrusted) {
4053 pci_info(pdev,
4054 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4055 pdev->vendor, pdev->device);
4056 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4057 return true;
4058 }
4059 return false;
4060 }
4061
4062 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4063 unsigned long iova, size_t size)
4064 {
4065 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4066
4067 return 0;
4068 }
4069
4070 void domain_remove_dev_pasid(struct iommu_domain *domain,
4071 struct device *dev, ioasid_t pasid)
4072 {
4073 struct device_domain_info *info = dev_iommu_priv_get(dev);
4074 struct dev_pasid_info *curr, *dev_pasid = NULL;
4075 struct intel_iommu *iommu = info->iommu;
4076 struct dmar_domain *dmar_domain;
4077 unsigned long flags;
4078
4079 if (!domain)
4080 return;
4081
4082 /* The identity domain has no metadata for a PASID. */
4083 if (domain->type == IOMMU_DOMAIN_IDENTITY)
4084 return;
4085
4086 dmar_domain = to_dmar_domain(domain);
4087 spin_lock_irqsave(&dmar_domain->lock, flags);
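	/* Find and unlink the tracking entry for this (device, PASID) pair. */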
4088 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4089 if (curr->dev == dev && curr->pasid == pasid) {
4090 list_del(&curr->link_domain);
4091 dev_pasid = curr;
4092 break;
4093 }
4094 }
4095 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4096
4097 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4098 domain_detach_iommu(dmar_domain, iommu);
4099 if (!WARN_ON_ONCE(!dev_pasid)) {
4100 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4101 kfree(dev_pasid);
4102 }
4103 }
4104
4105 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4106 struct device *dev, ioasid_t pasid,
4107 struct iommu_domain *old)
4108 {
4109 struct device_domain_info *info = dev_iommu_priv_get(dev);
4110
4111 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4112 domain_remove_dev_pasid(old, dev, pasid);
4113
4114 return 0;
4115 }
4116
4117 struct dev_pasid_info *
4118 domain_add_dev_pasid(struct iommu_domain *domain,
4119 struct device *dev, ioasid_t pasid)
4120 {
4121 struct device_domain_info *info = dev_iommu_priv_get(dev);
4122 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4123 struct intel_iommu *iommu = info->iommu;
4124 struct dev_pasid_info *dev_pasid;
4125 unsigned long flags;
4126 int ret;
4127
4128 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4129 if (!dev_pasid)
4130 return ERR_PTR(-ENOMEM);
4131
4132 ret = domain_attach_iommu(dmar_domain, iommu);
4133 if (ret)
4134 goto out_free;
4135
4136 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4137 if (ret)
4138 goto out_detach_iommu;
4139
4140 dev_pasid->dev = dev;
4141 dev_pasid->pasid = pasid;
4142 spin_lock_irqsave(&dmar_domain->lock, flags);
4143 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4144 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4145
4146 return dev_pasid;
4147 out_detach_iommu:
4148 domain_detach_iommu(dmar_domain, iommu);
4149 out_free:
4150 kfree(dev_pasid);
4151 return ERR_PTR(ret);
4152 }
4153
4154 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4155 struct device *dev, ioasid_t pasid,
4156 struct iommu_domain *old)
4157 {
4158 struct device_domain_info *info = dev_iommu_priv_get(dev);
4159 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4160 struct intel_iommu *iommu = info->iommu;
4161 struct dev_pasid_info *dev_pasid;
4162 int ret;
4163
4164 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4165 return -EINVAL;
4166
4167 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4168 return -EOPNOTSUPP;
4169
4170 if (domain->dirty_ops)
4171 return -EINVAL;
4172
4173 if (context_copied(iommu, info->bus, info->devfn))
4174 return -EBUSY;
4175
4176 ret = paging_domain_compatible(domain, dev);
4177 if (ret)
4178 return ret;
4179
4180 dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4181 if (IS_ERR(dev_pasid))
4182 return PTR_ERR(dev_pasid);
4183
4184 if (dmar_domain->use_first_level)
4185 ret = domain_setup_first_level(iommu, dmar_domain,
4186 dev, pasid, old);
4187 else
4188 ret = domain_setup_second_level(iommu, dmar_domain,
4189 dev, pasid, old);
4190 if (ret)
4191 goto out_remove_dev_pasid;
4192
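	/*
	 * The PASID table entry now points at the new domain; drop the
	 * bookkeeping for the old one.
	 */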
4193 domain_remove_dev_pasid(old, dev, pasid);
4194
4195 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4196
4197 return 0;
4198
4199 out_remove_dev_pasid:
4200 domain_remove_dev_pasid(domain, dev, pasid);
4201 return ret;
4202 }
4203
4204 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4205 {
4206 struct device_domain_info *info = dev_iommu_priv_get(dev);
4207 struct intel_iommu *iommu = info->iommu;
4208 struct iommu_hw_info_vtd *vtd;
4209
4210 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4211 if (!vtd)
4212 return ERR_PTR(-ENOMEM);
4213
4214 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4215 vtd->cap_reg = iommu->cap;
4216 vtd->ecap_reg = iommu->ecap;
4217 *length = sizeof(*vtd);
4218 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4219 return vtd;
4220 }
4221
4222 /*
4223 * Set dirty tracking for the device list of a domain. The caller must
4224 * hold the domain->lock when calling it.
4225 */
4226 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4227 {
4228 struct device_domain_info *info;
4229 int ret = 0;
4230
4231 list_for_each_entry(info, devices, link) {
4232 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4233 IOMMU_NO_PASID, enable);
4234 if (ret)
4235 break;
4236 }
4237
4238 return ret;
4239 }
4240
4241 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4242 bool enable)
4243 {
4244 struct dmar_domain *s1_domain;
4245 unsigned long flags;
4246 int ret;
4247
4248 spin_lock(&domain->s1_lock);
4249 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4250 spin_lock_irqsave(&s1_domain->lock, flags);
4251 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4252 spin_unlock_irqrestore(&s1_domain->lock, flags);
4253 if (ret)
4254 goto err_unwind;
4255 }
4256 spin_unlock(&domain->s1_lock);
4257 return 0;
4258
4259 err_unwind:
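	/* Restore the previous dirty tracking state on all stage-1 domains. */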
4260 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4261 spin_lock_irqsave(&s1_domain->lock, flags);
4262 device_set_dirty_tracking(&s1_domain->devices,
4263 domain->dirty_tracking);
4264 spin_unlock_irqrestore(&s1_domain->lock, flags);
4265 }
4266 spin_unlock(&domain->s1_lock);
4267 return ret;
4268 }
4269
4270 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4271 bool enable)
4272 {
4273 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4274 int ret;
4275
4276 spin_lock(&dmar_domain->lock);
4277 if (dmar_domain->dirty_tracking == enable)
4278 goto out_unlock;
4279
4280 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4281 if (ret)
4282 goto err_unwind;
4283
4284 if (dmar_domain->nested_parent) {
4285 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4286 if (ret)
4287 goto err_unwind;
4288 }
4289
4290 dmar_domain->dirty_tracking = enable;
4291 out_unlock:
4292 spin_unlock(&dmar_domain->lock);
4293
4294 return 0;
4295
4296 err_unwind:
4297 device_set_dirty_tracking(&dmar_domain->devices,
4298 dmar_domain->dirty_tracking);
4299 spin_unlock(&dmar_domain->lock);
4300 return ret;
4301 }
4302
4303 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4304 unsigned long iova, size_t size,
4305 unsigned long flags,
4306 struct iommu_dirty_bitmap *dirty)
4307 {
4308 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4309 unsigned long end = iova + size - 1;
4310 unsigned long pgsize;
4311
4312 /*
4313 * IOMMUFD core calls into a dirty tracking disabled domain without an
4314 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4315 * have occurred when we stopped dirty tracking. This ensures that we
4316 * never inherit dirtied bits from a previous cycle.
4317 */
4318 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4319 return -EINVAL;
4320
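	/*
	 * Walk the range one PTE at a time, stepping by the page size of
	 * the level at which each PTE is found so that large pages are
	 * visited exactly once.
	 */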
4321 do {
4322 struct dma_pte *pte;
4323 int lvl = 0;
4324
4325 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4326 GFP_ATOMIC);
4327 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4328 if (!pte || !dma_pte_present(pte)) {
4329 iova += pgsize;
4330 continue;
4331 }
4332
4333 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4334 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4335 iova += pgsize;
4336 } while (iova < end);
4337
4338 return 0;
4339 }
4340
4341 static const struct iommu_dirty_ops intel_dirty_ops = {
4342 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4343 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4344 };
4345
4346 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4347 {
4348 struct device_domain_info *info = dev_iommu_priv_get(dev);
4349 struct intel_iommu *iommu = info->iommu;
4350 struct context_entry *context;
4351
4352 spin_lock(&iommu->lock);
4353 context = iommu_context_addr(iommu, bus, devfn, 1);
4354 if (!context) {
4355 spin_unlock(&iommu->lock);
4356 return -ENOMEM;
4357 }
4358
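	/* Nothing to do if a valid, non-copied context entry is already present. */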
4359 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4360 spin_unlock(&iommu->lock);
4361 return 0;
4362 }
4363
4364 copied_context_tear_down(iommu, context, bus, devfn);
4365 context_clear_entry(context);
4366 context_set_domain_id(context, FLPT_DEFAULT_DID);
4367
4368 /*
4369 * In pass-through mode, AW must be programmed with the largest AGAW
4370 * value supported by hardware; ASR is ignored by hardware.
4371 */
4372 context_set_address_width(context, iommu->msagaw);
4373 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4374 context_set_fault_enable(context);
4375 context_set_present(context);
4376 if (!ecap_coherent(iommu->ecap))
4377 clflush_cache_range(context, sizeof(*context));
4378 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4379 spin_unlock(&iommu->lock);
4380
4381 return 0;
4382 }
4383
4384 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4385 {
4386 struct device *dev = data;
4387
4388 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4389 }
4390
4391 static int device_setup_pass_through(struct device *dev)
4392 {
4393 struct device_domain_info *info = dev_iommu_priv_get(dev);
4394
4395 if (!dev_is_pci(dev))
4396 return context_setup_pass_through(dev, info->bus, info->devfn);
4397
4398 return pci_for_each_dma_alias(to_pci_dev(dev),
4399 context_setup_pass_through_cb, dev);
4400 }
4401
4402 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4403 {
4404 struct device_domain_info *info = dev_iommu_priv_get(dev);
4405 struct intel_iommu *iommu = info->iommu;
4406 int ret;
4407
4408 device_block_translation(dev);
4409
4410 if (dev_is_real_dma_subdevice(dev))
4411 return 0;
4412
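	/*
	 * In scalable mode, pass-through is programmed in the PASID table
	 * entry; in legacy mode it is programmed into the context entries
	 * of the device and all of its DMA aliases.
	 */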
4413 if (sm_supported(iommu)) {
4414 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4415 if (!ret)
4416 iommu_enable_pci_caps(info);
4417 } else {
4418 ret = device_setup_pass_through(dev);
4419 }
4420
4421 return ret;
4422 }
4423
4424 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4425 struct device *dev, ioasid_t pasid,
4426 struct iommu_domain *old)
4427 {
4428 struct device_domain_info *info = dev_iommu_priv_get(dev);
4429 struct intel_iommu *iommu = info->iommu;
4430 int ret;
4431
4432 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4433 return -EOPNOTSUPP;
4434
4435 ret = domain_setup_passthrough(iommu, dev, pasid, old);
4436 if (ret)
4437 return ret;
4438
4439 domain_remove_dev_pasid(old, dev, pasid);
4440 return 0;
4441 }
4442
4443 static struct iommu_domain identity_domain = {
4444 .type = IOMMU_DOMAIN_IDENTITY,
4445 .ops = &(const struct iommu_domain_ops) {
4446 .attach_dev = identity_domain_attach_dev,
4447 .set_dev_pasid = identity_domain_set_dev_pasid,
4448 },
4449 };
4450
4451 const struct iommu_ops intel_iommu_ops = {
4452 .blocked_domain = &blocking_domain,
4453 .release_domain = &blocking_domain,
4454 .identity_domain = &identity_domain,
4455 .capable = intel_iommu_capable,
4456 .hw_info = intel_iommu_hw_info,
4457 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4458 .domain_alloc_sva = intel_svm_domain_alloc,
4459 .domain_alloc_nested = intel_iommu_domain_alloc_nested,
4460 .probe_device = intel_iommu_probe_device,
4461 .release_device = intel_iommu_release_device,
4462 .get_resv_regions = intel_iommu_get_resv_regions,
4463 .device_group = intel_iommu_device_group,
4464 .dev_enable_feat = intel_iommu_dev_enable_feat,
4465 .dev_disable_feat = intel_iommu_dev_disable_feat,
4466 .is_attach_deferred = intel_iommu_is_attach_deferred,
4467 .def_domain_type = device_def_domain_type,
4468 .pgsize_bitmap = SZ_4K,
4469 .page_response = intel_iommu_page_response,
4470 .default_domain_ops = &(const struct iommu_domain_ops) {
4471 .attach_dev = intel_iommu_attach_device,
4472 .set_dev_pasid = intel_iommu_set_dev_pasid,
4473 .map_pages = intel_iommu_map_pages,
4474 .unmap_pages = intel_iommu_unmap_pages,
4475 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4476 .flush_iotlb_all = intel_flush_iotlb_all,
4477 .iotlb_sync = intel_iommu_tlb_sync,
4478 .iova_to_phys = intel_iommu_iova_to_phys,
4479 .free = intel_iommu_domain_free,
4480 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4481 }
4482 };
4483
4484 static void quirk_iommu_igfx(struct pci_dev *dev)
4485 {
4486 if (risky_device(dev))
4487 return;
4488
4489 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4490 disable_igfx_iommu = 1;
4491 }
4492
4493 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4494 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4495 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4496 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4497 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4498 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4499 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4500 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4501
4502 /* Broadwell igfx malfunctions with dmar */
4503 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4504 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4505 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4506 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4507 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4508 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4509 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4510 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4518 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4519 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4520 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4524 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4526 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4527
4528 static void quirk_iommu_rwbf(struct pci_dev *dev)
4529 {
4530 if (risky_device(dev))
4531 return;
4532
4533 /*
4534 * Mobile 4 Series Chipset neglects to set RWBF capability,
4535 * but needs it. Same seems to hold for the desktop versions.
4536 */
4537 pci_info(dev, "Forcing write-buffer flush capability\n");
4538 rwbf_quirk = 1;
4539 }
4540
4541 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4542 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4543 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4544 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4545 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4546 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4547 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4548
4549 #define GGC 0x52
4550 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
4551 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4552 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
4553 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
4554 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4555 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4556 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4557 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4558
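/*
 * GGC is the GMCH graphics control register: bits 11:8 describe how much
 * stolen memory the BIOS reserved for the GTT and whether a shadow GTT
 * was also allocated for VT-d use (the *_VT_* encodings above).
 */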
4559 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4560 {
4561 unsigned short ggc;
4562
4563 if (risky_device(dev))
4564 return;
4565
4566 if (pci_read_config_word(dev, GGC, &ggc))
4567 return;
4568
4569 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4570 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4571 disable_igfx_iommu = 1;
4572 } else if (!disable_igfx_iommu) {
4573 /* we have to ensure the gfx device is idle before we flush */
4574 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4575 iommu_set_dma_strict();
4576 }
4577 }
4578 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4579 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4580 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4581 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4582
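/*
 * On the integrated graphics of these generations (matched by the high
 * byte of the PCI device ID), clearing the translation-enable bit while
 * the device may still have undrained DMA can hang the system, so skip
 * disabling translation for the IOMMU dedicated to them.
 */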
4583 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4584 {
4585 unsigned short ver;
4586
4587 if (!IS_GFX_DEVICE(dev))
4588 return;
4589
4590 ver = (dev->device >> 8) & 0xff;
4591 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4592 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4593 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4594 return;
4595
4596 if (risky_device(dev))
4597 return;
4598
4599 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4600 iommu_skip_te_disable = 1;
4601 }
4602 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4603
4604 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4605 ISOCH DMAR unit for the Azalia sound device, but not give it any
4606 TLB entries, which causes it to deadlock. Check for that. We do
4607 this in a function called from init_dmars(), instead of in a PCI
4608 quirk, because we don't want to print the obnoxious "BIOS broken"
4609 message if VT-d is actually disabled.
4610 */
4611 static void __init check_tylersburg_isoch(void)
4612 {
4613 struct pci_dev *pdev;
4614 uint32_t vtisochctrl;
4615
4616 /* If there's no Azalia in the system anyway, forget it. */
4617 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4618 if (!pdev)
4619 return;
4620
4621 if (risky_device(pdev)) {
4622 pci_dev_put(pdev);
4623 return;
4624 }
4625
4626 pci_dev_put(pdev);
4627
4628 /* System Management Registers. Might be hidden, in which case
4629 we can't do the sanity check. But that's OK, because the
4630 known-broken BIOSes _don't_ actually hide it, so far. */
4631 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4632 if (!pdev)
4633 return;
4634
4635 if (risky_device(pdev)) {
4636 pci_dev_put(pdev);
4637 return;
4638 }
4639
4640 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4641 pci_dev_put(pdev);
4642 return;
4643 }
4644
4645 pci_dev_put(pdev);
4646
4647 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4648 if (vtisochctrl & 1)
4649 return;
4650
4651 /* Drop all bits other than the number of TLB entries */
4652 vtisochctrl &= 0x1c;
4653
4654 /* If we have the recommended number of TLB entries (16), fine. */
4655 if (vtisochctrl == 0x10)
4656 return;
4657
4658 /* Zero TLB entries? You get to ride the short bus to school. */
4659 if (!vtisochctrl) {
4660 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4661 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4662 dmi_get_system_info(DMI_BIOS_VENDOR),
4663 dmi_get_system_info(DMI_BIOS_VERSION),
4664 dmi_get_system_info(DMI_PRODUCT_VERSION));
4665 iommu_identity_mapping |= IDENTMAP_AZALIA;
4666 return;
4667 }
4668
4669 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4670 vtisochctrl);
4671 }
4672
4673 /*
4674 * Here we deal with a device TLB defect: the device may inadvertently issue the
4675 * ATS invalidation completion before posted writes that were initiated with a
4676 * translated address, using translations matching the invalidation address
4677 * range, have been pushed, violating the invalidation completion ordering.
4678 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4679 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4680 * under the control of the trusted/privileged host device driver must use this
4681 * quirk.
4682 * Device TLBs are invalidated under the following six conditions:
4683 * 1. Device driver does DMA API unmap IOVA
4684 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4685 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4686 * exit_mmap() due to crash
4687 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4688 * VM has to free pages that were unmapped
4689 * 5. Userspace driver unmaps a DMA buffer
4690 * 6. Cache invalidation in vSVA usage (upcoming)
4691 *
4692 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4693 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4694 * invalidate TLB the same way as normal user unmap which will use this quirk.
4695 * The dTLB invalidation after PASID cache flush does not need this quirk.
4696 *
4697 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4698 */
4699 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4700 unsigned long address, unsigned long mask,
4701 u32 pasid, u16 qdep)
4702 {
4703 u16 sid;
4704
4705 if (likely(!info->dtlb_extra_inval))
4706 return;
4707
4708 sid = PCI_DEVID(info->bus, info->devfn);
4709 if (pasid == IOMMU_NO_PASID) {
4710 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4711 qdep, address, mask);
4712 } else {
4713 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4714 pasid, qdep, address, mask);
4715 }
4716 }
4717
4718 #define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4719
4720 /*
4721 * Function to submit a command to the enhanced command interface. The
4722 * valid enhanced command descriptions are defined in Table 47 of the
4723 * VT-d spec. The VT-d hardware implementation may support some but not
4724 * all commands, which can be determined by checking the Enhanced
4725 * Command Capability Register.
4726 *
4727 * Return values:
4728 * - 0: Command successful without any error;
4729 * - Negative: software error value;
4730 * - Nonzero positive: failure status code defined in Table 48.
4731 */
4732 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4733 {
4734 unsigned long flags;
4735 u64 res;
4736 int ret;
4737
4738 if (!cap_ecmds(iommu->cap))
4739 return -ENODEV;
4740
4741 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4742
4743 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4744 if (res & DMA_ECMD_ECRSP_IP) {
4745 ret = -EBUSY;
4746 goto err;
4747 }
4748
4749 /*
4750 * Unconditionally write operand B, because:
4751 * - There is no side effect if an ecmd doesn't require operand B
4752 * but the register is written anyway.
4753 * - This path is not performance critical, so the extra MMIO
4754 * write is harmless.
4755 */
4756 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4757 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4758
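	/* Hardware clears the ECRSP In Progress (IP) bit once the command completes. */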
4759 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4760 !(res & DMA_ECMD_ECRSP_IP), res);
4761
4762 if (res & DMA_ECMD_ECRSP_IP) {
4763 ret = -ETIMEDOUT;
4764 goto err;
4765 }
4766
4767 ret = ecmd_get_status_code(res);
4768 err:
4769 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4770
4771 return ret;
4772 }
4773