xref: /linux/drivers/iommu/intel/iommu.c (revision 024bfd2e9d80d7131f1178eb2235030b96f7ef0e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
/* The root table and a context table are each sized as one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers. The argument is parenthesized at
 * every use so that any pointer-valued expression (for example "p + 1")
 * can be passed without operator-precedence surprises.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Largest page frame number / byte address expressible in @gaw address bits. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57 
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60 
61 /*
62  * set to 1 to panic kernel if can't successfully enable VT-d
63  * (used when kernel is launched w/ TXT)
64  */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68 
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if its possibly the case.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
170 struct dmar_rmrr_unit {
171 	struct list_head list;		/* list of rmrr units	*/
172 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
173 	u64	base_address;		/* reserved base address*/
174 	u64	end_address;		/* reserved end address */
175 	struct dmar_dev_scope *devices;	/* target devices */
176 	int	devices_cnt;		/* target device count */
177 };
178 
179 struct dmar_atsr_unit {
180 	struct list_head list;		/* list of ATSR units */
181 	struct acpi_dmar_header *hdr;	/* ACPI header */
182 	struct dmar_dev_scope *devices;	/* target devices */
183 	int devices_cnt;		/* target device count */
184 	u8 include_all:1;		/* include all ports */
185 };
186 
187 struct dmar_satc_unit {
188 	struct list_head list;		/* list of SATC units */
189 	struct acpi_dmar_header *hdr;	/* ACPI header */
190 	struct dmar_dev_scope *devices;	/* target devices */
191 	struct intel_iommu *iommu;	/* the corresponding iommu */
192 	int devices_cnt;		/* target device count */
193 	u8 atc_required:1;		/* ATS is required */
194 };
195 
/* File-local lists holding the RMRR, ATSR and SATC units. */
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every entry linked on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202 
203 static void intel_iommu_domain_free(struct iommu_domain *domain);
204 
205 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
207 
208 int intel_iommu_enabled = 0;
209 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
210 
211 static int intel_iommu_superpage = 1;
212 static int iommu_identity_mapping;
213 static int iommu_skip_te_disable;
214 static int disable_igfx_iommu;
215 
216 #define IDENTMAP_AZALIA		4
217 
218 const struct iommu_ops intel_iommu_ops;
219 static const struct iommu_dirty_ops intel_dirty_ops;
220 
221 static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225 
226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230 
231 static void init_translation_status(struct intel_iommu *iommu)
232 {
233 	u32 gsts;
234 
235 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 	if (gsts & DMA_GSTS_TES)
237 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
240 static int __init intel_iommu_setup(char *str)
241 {
242 	if (!str)
243 		return -EINVAL;
244 
245 	while (*str) {
246 		if (!strncmp(str, "on", 2)) {
247 			dmar_disabled = 0;
248 			pr_info("IOMMU enabled\n");
249 		} else if (!strncmp(str, "off", 3)) {
250 			dmar_disabled = 1;
251 			no_platform_optin = 1;
252 			pr_info("IOMMU disabled\n");
253 		} else if (!strncmp(str, "igfx_off", 8)) {
254 			disable_igfx_iommu = 1;
255 			pr_info("Disable GFX device mapping\n");
256 		} else if (!strncmp(str, "forcedac", 8)) {
257 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 			iommu_dma_forcedac = true;
259 		} else if (!strncmp(str, "strict", 6)) {
260 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 			iommu_set_dma_strict();
262 		} else if (!strncmp(str, "sp_off", 6)) {
263 			pr_info("Disable supported super page\n");
264 			intel_iommu_superpage = 0;
265 		} else if (!strncmp(str, "sm_on", 5)) {
266 			pr_info("Enable scalable mode if hardware supports\n");
267 			intel_iommu_sm = 1;
268 		} else if (!strncmp(str, "sm_off", 6)) {
269 			pr_info("Scalable mode is disallowed\n");
270 			intel_iommu_sm = 0;
271 		} else if (!strncmp(str, "tboot_noforce", 13)) {
272 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 			intel_iommu_tboot_noforce = 1;
274 		} else {
275 			pr_notice("Unknown option - '%s'\n", str);
276 		}
277 
278 		str += strcspn(str, ",");
279 		while (*str == ',')
280 			str++;
281 	}
282 
283 	return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
286 
287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290 
291 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293 
294 /*
295  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297  * the returned SAGAW.
298  */
299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301 	unsigned long fl_sagaw, sl_sagaw;
302 
303 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 	sl_sagaw = cap_sagaw(iommu->cap);
305 
306 	/* Second level only. */
307 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 		return sl_sagaw;
309 
310 	/* First level only. */
311 	if (!ecap_slts(iommu->ecap))
312 		return fl_sagaw;
313 
314 	return fl_sagaw & sl_sagaw;
315 }
316 
/*
 * Pick the largest agaw not exceeding the one for @max_gaw whose bit is
 * set in the IOMMU's SAGAW. Returns -1 if no such agaw exists.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
330 
331 /*
332  * Calculate max SAGAW for each iommu.
333  */
334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338 
339 /*
340  * calculate agaw for each iommu.
341  * "SAGAW" may be different across iommus, use a default agaw, and
342  * get a supported less agaw for iommus that don't support the default agaw.
343  */
344 int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
348 
349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351 	return sm_supported(iommu) ?
352 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354 
355 /* Return the super pagesize bitmap if supported. */
356 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
357 {
358 	unsigned long bitmap = 0;
359 
360 	/*
361 	 * 1-level super page supports page size of 2MiB, 2-level super page
362 	 * supports page size of both 2MiB and 1GiB.
363 	 */
364 	if (domain->iommu_superpage == 1)
365 		bitmap |= SZ_2M;
366 	else if (domain->iommu_superpage == 2)
367 		bitmap |= SZ_2M | SZ_1G;
368 
369 	return bitmap;
370 }
371 
372 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
373 					 u8 devfn, int alloc)
374 {
375 	struct root_entry *root = &iommu->root_entry[bus];
376 	struct context_entry *context;
377 	u64 *entry;
378 
379 	/*
380 	 * Except that the caller requested to allocate a new entry,
381 	 * returning a copied context entry makes no sense.
382 	 */
383 	if (!alloc && context_copied(iommu, bus, devfn))
384 		return NULL;
385 
386 	entry = &root->lo;
387 	if (sm_supported(iommu)) {
388 		if (devfn >= 0x80) {
389 			devfn -= 0x80;
390 			entry = &root->hi;
391 		}
392 		devfn *= 2;
393 	}
394 	if (*entry & 1)
395 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
396 	else {
397 		unsigned long phy_addr;
398 		if (!alloc)
399 			return NULL;
400 
401 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
402 		if (!context)
403 			return NULL;
404 
405 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
406 		phy_addr = virt_to_phys((void *)context);
407 		*entry = phy_addr | 1;
408 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
409 	}
410 	return &context[devfn];
411 }
412 
413 /**
414  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
415  *				 sub-hierarchy of a candidate PCI-PCI bridge
416  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
417  * @bridge: the candidate PCI-PCI bridge
418  *
419  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
420  */
421 static bool
422 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
423 {
424 	struct pci_dev *pdev, *pbridge;
425 
426 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
427 		return false;
428 
429 	pdev = to_pci_dev(dev);
430 	pbridge = to_pci_dev(bridge);
431 
432 	if (pbridge->subordinate &&
433 	    pbridge->subordinate->number <= pdev->bus->number &&
434 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
435 		return true;
436 
437 	return false;
438 }
439 
440 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
441 {
442 	struct dmar_drhd_unit *drhd;
443 	u32 vtbar;
444 	int rc;
445 
446 	/* We know that this device on this chipset has its own IOMMU.
447 	 * If we find it under a different IOMMU, then the BIOS is lying
448 	 * to us. Hope that the IOMMU for this device is actually
449 	 * disabled, and it needs no translation...
450 	 */
451 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
452 	if (rc) {
453 		/* "can't" happen */
454 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
455 		return false;
456 	}
457 	vtbar &= 0xffff0000;
458 
459 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
460 	drhd = dmar_find_matched_drhd_unit(pdev);
461 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
462 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
463 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
464 		return true;
465 	}
466 
467 	return false;
468 }
469 
470 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
471 {
472 	if (!iommu || iommu->drhd->ignored)
473 		return true;
474 
475 	if (dev_is_pci(dev)) {
476 		struct pci_dev *pdev = to_pci_dev(dev);
477 
478 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
479 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
480 		    quirk_ioat_snb_local_iommu(pdev))
481 			return true;
482 	}
483 
484 	return false;
485 }
486 
487 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
488 {
489 	struct dmar_drhd_unit *drhd = NULL;
490 	struct pci_dev *pdev = NULL;
491 	struct intel_iommu *iommu;
492 	struct device *tmp;
493 	u16 segment = 0;
494 	int i;
495 
496 	if (!dev)
497 		return NULL;
498 
499 	if (dev_is_pci(dev)) {
500 		struct pci_dev *pf_pdev;
501 
502 		pdev = pci_real_dma_dev(to_pci_dev(dev));
503 
504 		/* VFs aren't listed in scope tables; we need to look up
505 		 * the PF instead to find the IOMMU. */
506 		pf_pdev = pci_physfn(pdev);
507 		dev = &pf_pdev->dev;
508 		segment = pci_domain_nr(pdev->bus);
509 	} else if (has_acpi_companion(dev))
510 		dev = &ACPI_COMPANION(dev)->dev;
511 
512 	rcu_read_lock();
513 	for_each_iommu(iommu, drhd) {
514 		if (pdev && segment != drhd->segment)
515 			continue;
516 
517 		for_each_active_dev_scope(drhd->devices,
518 					  drhd->devices_cnt, i, tmp) {
519 			if (tmp == dev) {
520 				/* For a VF use its original BDF# not that of the PF
521 				 * which we used for the IOMMU lookup. Strictly speaking
522 				 * we could do this for all PCI devices; we only need to
523 				 * get the BDF# from the scope table for ACPI matches. */
524 				if (pdev && pdev->is_virtfn)
525 					goto got_pdev;
526 
527 				if (bus && devfn) {
528 					*bus = drhd->devices[i].bus;
529 					*devfn = drhd->devices[i].devfn;
530 				}
531 				goto out;
532 			}
533 
534 			if (is_downstream_to_pci_bridge(dev, tmp))
535 				goto got_pdev;
536 		}
537 
538 		if (pdev && drhd->include_all) {
539 got_pdev:
540 			if (bus && devfn) {
541 				*bus = pdev->bus->number;
542 				*devfn = pdev->devfn;
543 			}
544 			goto out;
545 		}
546 	}
547 	iommu = NULL;
548 out:
549 	if (iommu_is_dummy(iommu, dev))
550 		iommu = NULL;
551 
552 	rcu_read_unlock();
553 
554 	return iommu;
555 }
556 
557 static void domain_flush_cache(struct dmar_domain *domain,
558 			       void *addr, int size)
559 {
560 	if (!domain->iommu_coherency)
561 		clflush_cache_range(addr, size);
562 }
563 
564 static void free_context_table(struct intel_iommu *iommu)
565 {
566 	struct context_entry *context;
567 	int i;
568 
569 	if (!iommu->root_entry)
570 		return;
571 
572 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
573 		context = iommu_context_addr(iommu, i, 0, 0);
574 		if (context)
575 			iommu_free_page(context);
576 
577 		if (!sm_supported(iommu))
578 			continue;
579 
580 		context = iommu_context_addr(iommu, i, 0x80, 0);
581 		if (context)
582 			iommu_free_page(context);
583 	}
584 
585 	iommu_free_page(iommu->root_entry);
586 	iommu->root_entry = NULL;
587 }
588 
589 #ifdef CONFIG_DMAR_DEBUG
590 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
591 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
592 {
593 	struct dma_pte *pte;
594 	int offset;
595 
596 	while (1) {
597 		offset = pfn_level_offset(pfn, level);
598 		pte = &parent[offset];
599 
600 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
601 
602 		if (!dma_pte_present(pte)) {
603 			pr_info("page table not present at level %d\n", level - 1);
604 			break;
605 		}
606 
607 		if (level == 1 || dma_pte_superpage(pte))
608 			break;
609 
610 		parent = phys_to_virt(dma_pte_addr(pte));
611 		level--;
612 	}
613 }
614 
615 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
616 			  unsigned long long addr, u32 pasid)
617 {
618 	struct pasid_dir_entry *dir, *pde;
619 	struct pasid_entry *entries, *pte;
620 	struct context_entry *ctx_entry;
621 	struct root_entry *rt_entry;
622 	int i, dir_index, index, level;
623 	u8 devfn = source_id & 0xff;
624 	u8 bus = source_id >> 8;
625 	struct dma_pte *pgtable;
626 
627 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
628 
629 	/* root entry dump */
630 	if (!iommu->root_entry) {
631 		pr_info("root table is not present\n");
632 		return;
633 	}
634 	rt_entry = &iommu->root_entry[bus];
635 
636 	if (sm_supported(iommu))
637 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
638 			rt_entry->hi, rt_entry->lo);
639 	else
640 		pr_info("root entry: 0x%016llx", rt_entry->lo);
641 
642 	/* context entry dump */
643 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
644 	if (!ctx_entry) {
645 		pr_info("context table is not present\n");
646 		return;
647 	}
648 
649 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
650 		ctx_entry->hi, ctx_entry->lo);
651 
652 	/* legacy mode does not require PASID entries */
653 	if (!sm_supported(iommu)) {
654 		if (!context_present(ctx_entry)) {
655 			pr_info("legacy mode page table is not present\n");
656 			return;
657 		}
658 		level = agaw_to_level(ctx_entry->hi & 7);
659 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
660 		goto pgtable_walk;
661 	}
662 
663 	if (!context_present(ctx_entry)) {
664 		pr_info("pasid directory table is not present\n");
665 		return;
666 	}
667 
668 	/* get the pointer to pasid directory entry */
669 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
670 
671 	/* For request-without-pasid, get the pasid from context entry */
672 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
673 		pasid = IOMMU_NO_PASID;
674 
675 	dir_index = pasid >> PASID_PDE_SHIFT;
676 	pde = &dir[dir_index];
677 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
678 
679 	/* get the pointer to the pasid table entry */
680 	entries = get_pasid_table_from_pde(pde);
681 	if (!entries) {
682 		pr_info("pasid table is not present\n");
683 		return;
684 	}
685 	index = pasid & PASID_PTE_MASK;
686 	pte = &entries[index];
687 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
688 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
689 
690 	if (!pasid_pte_is_present(pte)) {
691 		pr_info("scalable mode page table is not present\n");
692 		return;
693 	}
694 
695 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
696 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
697 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
698 	} else {
699 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
700 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
701 	}
702 
703 pgtable_walk:
704 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
705 }
706 #endif
707 
708 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
709 				      unsigned long pfn, int *target_level,
710 				      gfp_t gfp)
711 {
712 	struct dma_pte *parent, *pte;
713 	int level = agaw_to_level(domain->agaw);
714 	int offset;
715 
716 	if (!domain_pfn_supported(domain, pfn))
717 		/* Address beyond IOMMU's addressing capabilities. */
718 		return NULL;
719 
720 	parent = domain->pgd;
721 
722 	while (1) {
723 		void *tmp_page;
724 
725 		offset = pfn_level_offset(pfn, level);
726 		pte = &parent[offset];
727 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
728 			break;
729 		if (level == *target_level)
730 			break;
731 
732 		if (!dma_pte_present(pte)) {
733 			uint64_t pteval, tmp;
734 
735 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
736 
737 			if (!tmp_page)
738 				return NULL;
739 
740 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
741 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
742 			if (domain->use_first_level)
743 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
744 
745 			tmp = 0ULL;
746 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
747 				/* Someone else set it while we were thinking; use theirs. */
748 				iommu_free_page(tmp_page);
749 			else
750 				domain_flush_cache(domain, pte, sizeof(*pte));
751 		}
752 		if (level == 1)
753 			break;
754 
755 		parent = phys_to_virt(dma_pte_addr(pte));
756 		level--;
757 	}
758 
759 	if (!*target_level)
760 		*target_level = level;
761 
762 	return pte;
763 }
764 
765 /* return address's pte at specific level */
766 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
767 					 unsigned long pfn,
768 					 int level, int *large_page)
769 {
770 	struct dma_pte *parent, *pte;
771 	int total = agaw_to_level(domain->agaw);
772 	int offset;
773 
774 	parent = domain->pgd;
775 	while (level <= total) {
776 		offset = pfn_level_offset(pfn, total);
777 		pte = &parent[offset];
778 		if (level == total)
779 			return pte;
780 
781 		if (!dma_pte_present(pte)) {
782 			*large_page = total;
783 			break;
784 		}
785 
786 		if (dma_pte_superpage(pte)) {
787 			*large_page = total;
788 			return pte;
789 		}
790 
791 		parent = phys_to_virt(dma_pte_addr(pte));
792 		total--;
793 	}
794 	return NULL;
795 }
796 
797 /* clear last level pte, a tlb flush should be followed */
798 static void dma_pte_clear_range(struct dmar_domain *domain,
799 				unsigned long start_pfn,
800 				unsigned long last_pfn)
801 {
802 	unsigned int large_page;
803 	struct dma_pte *first_pte, *pte;
804 
805 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
806 	    WARN_ON(start_pfn > last_pfn))
807 		return;
808 
809 	/* we don't need lock here; nobody else touches the iova range */
810 	do {
811 		large_page = 1;
812 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
813 		if (!pte) {
814 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
815 			continue;
816 		}
817 		do {
818 			dma_clear_pte(pte);
819 			start_pfn += lvl_to_nr_pages(large_page);
820 			pte++;
821 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
822 
823 		domain_flush_cache(domain, first_pte,
824 				   (void *)pte - (void *)first_pte);
825 
826 	} while (start_pfn && start_pfn <= last_pfn);
827 }
828 
829 static void dma_pte_free_level(struct dmar_domain *domain, int level,
830 			       int retain_level, struct dma_pte *pte,
831 			       unsigned long pfn, unsigned long start_pfn,
832 			       unsigned long last_pfn)
833 {
834 	pfn = max(start_pfn, pfn);
835 	pte = &pte[pfn_level_offset(pfn, level)];
836 
837 	do {
838 		unsigned long level_pfn;
839 		struct dma_pte *level_pte;
840 
841 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
842 			goto next;
843 
844 		level_pfn = pfn & level_mask(level);
845 		level_pte = phys_to_virt(dma_pte_addr(pte));
846 
847 		if (level > 2) {
848 			dma_pte_free_level(domain, level - 1, retain_level,
849 					   level_pte, level_pfn, start_pfn,
850 					   last_pfn);
851 		}
852 
853 		/*
854 		 * Free the page table if we're below the level we want to
855 		 * retain and the range covers the entire table.
856 		 */
857 		if (level < retain_level && !(start_pfn > level_pfn ||
858 		      last_pfn < level_pfn + level_size(level) - 1)) {
859 			dma_clear_pte(pte);
860 			domain_flush_cache(domain, pte, sizeof(*pte));
861 			iommu_free_page(level_pte);
862 		}
863 next:
864 		pfn += level_size(level);
865 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
866 }
867 
868 /*
869  * clear last level (leaf) ptes and free page table pages below the
870  * level we wish to keep intact.
871  */
872 static void dma_pte_free_pagetable(struct dmar_domain *domain,
873 				   unsigned long start_pfn,
874 				   unsigned long last_pfn,
875 				   int retain_level)
876 {
877 	dma_pte_clear_range(domain, start_pfn, last_pfn);
878 
879 	/* We don't need lock here; nobody else touches the iova range */
880 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
881 			   domain->pgd, 0, start_pfn, last_pfn);
882 
883 	/* free pgd */
884 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
885 		iommu_free_page(domain->pgd);
886 		domain->pgd = NULL;
887 	}
888 }
889 
890 /* When a page at a given level is being unlinked from its parent, we don't
891    need to *modify* it at all. All we need to do is make a list of all the
892    pages which can be freed just as soon as we've flushed the IOTLB and we
893    know the hardware page-walk will no longer touch them.
894    The 'pte' argument is the *parent* PTE, pointing to the page that is to
895    be freed. */
896 static void dma_pte_list_pagetables(struct dmar_domain *domain,
897 				    int level, struct dma_pte *pte,
898 				    struct list_head *freelist)
899 {
900 	struct page *pg;
901 
902 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
903 	list_add_tail(&pg->lru, freelist);
904 
905 	if (level == 1)
906 		return;
907 
908 	pte = page_address(pg);
909 	do {
910 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 		pte++;
913 	} while (!first_pte_in_page(pte));
914 }
915 
916 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
917 				struct dma_pte *pte, unsigned long pfn,
918 				unsigned long start_pfn, unsigned long last_pfn,
919 				struct list_head *freelist)
920 {
921 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
922 
923 	pfn = max(start_pfn, pfn);
924 	pte = &pte[pfn_level_offset(pfn, level)];
925 
926 	do {
927 		unsigned long level_pfn = pfn & level_mask(level);
928 
929 		if (!dma_pte_present(pte))
930 			goto next;
931 
932 		/* If range covers entire pagetable, free it */
933 		if (start_pfn <= level_pfn &&
934 		    last_pfn >= level_pfn + level_size(level) - 1) {
935 			/* These suborbinate page tables are going away entirely. Don't
936 			   bother to clear them; we're just going to *free* them. */
937 			if (level > 1 && !dma_pte_superpage(pte))
938 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
939 
940 			dma_clear_pte(pte);
941 			if (!first_pte)
942 				first_pte = pte;
943 			last_pte = pte;
944 		} else if (level > 1) {
945 			/* Recurse down into a level that isn't *entirely* obsolete */
946 			dma_pte_clear_level(domain, level - 1,
947 					    phys_to_virt(dma_pte_addr(pte)),
948 					    level_pfn, start_pfn, last_pfn,
949 					    freelist);
950 		}
951 next:
952 		pfn = level_pfn + level_size(level);
953 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954 
955 	if (first_pte)
956 		domain_flush_cache(domain, first_pte,
957 				   (void *)++last_pte - (void *)first_pte);
958 }
959 
960 /* We can't just free the pages because the IOMMU may still be walking
961    the page tables, and may have cached the intermediate levels. The
962    pages can only be freed after the IOTLB flush has been done. */
963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 			 unsigned long last_pfn, struct list_head *freelist)
965 {
966 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
967 	    WARN_ON(start_pfn > last_pfn))
968 		return;
969 
970 	/* we don't need lock here; nobody else touches the iova range */
971 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
972 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
973 
974 	/* free pgd */
975 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
976 		struct page *pgd_page = virt_to_page(domain->pgd);
977 		list_add_tail(&pgd_page->lru, freelist);
978 		domain->pgd = NULL;
979 	}
980 }
981 
982 /* iommu handling */
983 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984 {
985 	struct root_entry *root;
986 
987 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
988 	if (!root) {
989 		pr_err("Allocating root entry for %s failed\n",
990 			iommu->name);
991 		return -ENOMEM;
992 	}
993 
994 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
995 	iommu->root_entry = root;
996 
997 	return 0;
998 }
999 
1000 static void iommu_set_root_entry(struct intel_iommu *iommu)
1001 {
1002 	u64 addr;
1003 	u32 sts;
1004 	unsigned long flag;
1005 
1006 	addr = virt_to_phys(iommu->root_entry);
1007 	if (sm_supported(iommu))
1008 		addr |= DMA_RTADDR_SMT;
1009 
1010 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012 
1013 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014 
1015 	/* Make sure hardware complete it */
1016 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 		      readl, (sts & DMA_GSTS_RTPS), sts);
1018 
1019 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020 
1021 	/*
1022 	 * Hardware invalidates all DMA remapping hardware translation
1023 	 * caches as part of SRTP flow.
1024 	 */
1025 	if (cap_esrtps(iommu->cap))
1026 		return;
1027 
1028 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 	if (sm_supported(iommu))
1030 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032 }
1033 
1034 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035 {
1036 	u32 val;
1037 	unsigned long flag;
1038 
1039 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 		return;
1041 
1042 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044 
1045 	/* Make sure hardware complete it */
1046 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1048 
1049 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 }
1051 
1052 /* return value determine if we need a write buffer flush */
1053 static void __iommu_flush_context(struct intel_iommu *iommu,
1054 				  u16 did, u16 source_id, u8 function_mask,
1055 				  u64 type)
1056 {
1057 	u64 val = 0;
1058 	unsigned long flag;
1059 
1060 	switch (type) {
1061 	case DMA_CCMD_GLOBAL_INVL:
1062 		val = DMA_CCMD_GLOBAL_INVL;
1063 		break;
1064 	case DMA_CCMD_DOMAIN_INVL:
1065 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 		break;
1067 	case DMA_CCMD_DEVICE_INVL:
1068 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 		break;
1071 	default:
1072 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 			iommu->name, type);
1074 		return;
1075 	}
1076 	val |= DMA_CCMD_ICC;
1077 
1078 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080 
1081 	/* Make sure hardware complete it */
1082 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084 
1085 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086 }
1087 
1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 			 unsigned int size_order, u64 type)
1090 {
1091 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 	u64 val = 0, val_iva = 0;
1093 	unsigned long flag;
1094 
1095 	switch (type) {
1096 	case DMA_TLB_GLOBAL_FLUSH:
1097 		/* global flush doesn't need set IVA_REG */
1098 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 		break;
1100 	case DMA_TLB_DSI_FLUSH:
1101 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 		break;
1103 	case DMA_TLB_PSI_FLUSH:
1104 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 		/* IH bit is passed in as part of address */
1106 		val_iva = size_order | addr;
1107 		break;
1108 	default:
1109 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 			iommu->name, type);
1111 		return;
1112 	}
1113 
1114 	if (cap_write_drain(iommu->cap))
1115 		val |= DMA_TLB_WRITE_DRAIN;
1116 
1117 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 	/* Note: Only uses first TLB reg currently */
1119 	if (val_iva)
1120 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122 
1123 	/* Make sure hardware complete it */
1124 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126 
1127 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128 
1129 	/* check IOTLB invalidation granularity */
1130 	if (DMA_TLB_IAIG(val) == 0)
1131 		pr_err("Flush IOTLB failed\n");
1132 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 			(unsigned long long)DMA_TLB_IIRG(type),
1135 			(unsigned long long)DMA_TLB_IAIG(val));
1136 }
1137 
1138 static struct device_domain_info *
1139 domain_lookup_dev_info(struct dmar_domain *domain,
1140 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1141 {
1142 	struct device_domain_info *info;
1143 	unsigned long flags;
1144 
1145 	spin_lock_irqsave(&domain->lock, flags);
1146 	list_for_each_entry(info, &domain->devices, link) {
1147 		if (info->iommu == iommu && info->bus == bus &&
1148 		    info->devfn == devfn) {
1149 			spin_unlock_irqrestore(&domain->lock, flags);
1150 			return info;
1151 		}
1152 	}
1153 	spin_unlock_irqrestore(&domain->lock, flags);
1154 
1155 	return NULL;
1156 }
1157 
1158 /*
1159  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161  * check because it applies only to the built-in QAT devices and it doesn't
1162  * grant additional privileges.
1163  */
1164 #define BUGGY_QAT_DEVID_MASK 0x4940
1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166 {
1167 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 		return false;
1169 
1170 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 		return false;
1172 
1173 	return true;
1174 }
1175 
1176 static void iommu_enable_pci_caps(struct device_domain_info *info)
1177 {
1178 	struct pci_dev *pdev;
1179 
1180 	if (!dev_is_pci(info->dev))
1181 		return;
1182 
1183 	pdev = to_pci_dev(info->dev);
1184 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1185 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1186 		info->ats_enabled = 1;
1187 }
1188 
1189 static void iommu_disable_pci_caps(struct device_domain_info *info)
1190 {
1191 	struct pci_dev *pdev;
1192 
1193 	if (!dev_is_pci(info->dev))
1194 		return;
1195 
1196 	pdev = to_pci_dev(info->dev);
1197 
1198 	if (info->ats_enabled) {
1199 		pci_disable_ats(pdev);
1200 		info->ats_enabled = 0;
1201 	}
1202 }
1203 
/* Flush everything tracked by the cache tags of the given domain. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	cache_tag_flush_all(dmar_domain);
}
1208 
1209 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1210 {
1211 	u32 pmen;
1212 	unsigned long flags;
1213 
1214 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1215 		return;
1216 
1217 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1219 	pmen &= ~DMA_PMEN_EPM;
1220 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1221 
1222 	/* wait for the protected region status bit to clear */
1223 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1224 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1225 
1226 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227 }
1228 
1229 static void iommu_enable_translation(struct intel_iommu *iommu)
1230 {
1231 	u32 sts;
1232 	unsigned long flags;
1233 
1234 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1235 	iommu->gcmd |= DMA_GCMD_TE;
1236 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237 
1238 	/* Make sure hardware complete it */
1239 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240 		      readl, (sts & DMA_GSTS_TES), sts);
1241 
1242 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1243 }
1244 
1245 static void iommu_disable_translation(struct intel_iommu *iommu)
1246 {
1247 	u32 sts;
1248 	unsigned long flag;
1249 
1250 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1251 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1252 		return;
1253 
1254 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1255 	iommu->gcmd &= ~DMA_GCMD_TE;
1256 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1257 
1258 	/* Make sure hardware complete it */
1259 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1260 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1261 
1262 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263 }
1264 
1265 static int iommu_init_domains(struct intel_iommu *iommu)
1266 {
1267 	u32 ndomains;
1268 
1269 	ndomains = cap_ndoms(iommu->cap);
1270 	pr_debug("%s: Number of Domains supported <%d>\n",
1271 		 iommu->name, ndomains);
1272 
1273 	spin_lock_init(&iommu->lock);
1274 
1275 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1276 	if (!iommu->domain_ids)
1277 		return -ENOMEM;
1278 
1279 	/*
1280 	 * If Caching mode is set, then invalid translations are tagged
1281 	 * with domain-id 0, hence we need to pre-allocate it. We also
1282 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1283 	 * make sure it is not used for a real domain.
1284 	 */
1285 	set_bit(0, iommu->domain_ids);
1286 
1287 	/*
1288 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1289 	 * entry for first-level or pass-through translation modes should
1290 	 * be programmed with a domain id different from those used for
1291 	 * second-level or nested translation. We reserve a domain id for
1292 	 * this purpose. This domain id is also used for identity domain
1293 	 * in legacy mode.
1294 	 */
1295 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1296 
1297 	return 0;
1298 }
1299 
1300 static void disable_dmar_iommu(struct intel_iommu *iommu)
1301 {
1302 	if (!iommu->domain_ids)
1303 		return;
1304 
1305 	/*
1306 	 * All iommu domains must have been detached from the devices,
1307 	 * hence there should be no domain IDs in use.
1308 	 */
1309 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1310 		    > NUM_RESERVED_DID))
1311 		return;
1312 
1313 	if (iommu->gcmd & DMA_GCMD_TE)
1314 		iommu_disable_translation(iommu);
1315 }
1316 
1317 static void free_dmar_iommu(struct intel_iommu *iommu)
1318 {
1319 	if (iommu->domain_ids) {
1320 		bitmap_free(iommu->domain_ids);
1321 		iommu->domain_ids = NULL;
1322 	}
1323 
1324 	if (iommu->copied_tables) {
1325 		bitmap_free(iommu->copied_tables);
1326 		iommu->copied_tables = NULL;
1327 	}
1328 
1329 	/* free context mapping */
1330 	free_context_table(iommu);
1331 
1332 	if (ecap_prs(iommu->ecap))
1333 		intel_iommu_finish_prq(iommu);
1334 }
1335 
1336 /*
1337  * Check and return whether first level is used by default for
1338  * DMA translation.
1339  */
1340 static bool first_level_by_default(struct intel_iommu *iommu)
1341 {
1342 	/* Only SL is available in legacy mode */
1343 	if (!sm_supported(iommu))
1344 		return false;
1345 
1346 	/* Only level (either FL or SL) is available, just use it */
1347 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1348 		return ecap_flts(iommu->ecap);
1349 
1350 	return true;
1351 }
1352 
1353 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1354 {
1355 	struct iommu_domain_info *info, *curr;
1356 	unsigned long ndomains;
1357 	int num, ret = -ENOSPC;
1358 
1359 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1360 		return 0;
1361 
1362 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1363 	if (!info)
1364 		return -ENOMEM;
1365 
1366 	spin_lock(&iommu->lock);
1367 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1368 	if (curr) {
1369 		curr->refcnt++;
1370 		spin_unlock(&iommu->lock);
1371 		kfree(info);
1372 		return 0;
1373 	}
1374 
1375 	ndomains = cap_ndoms(iommu->cap);
1376 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1377 	if (num >= ndomains) {
1378 		pr_err("%s: No free domain ids\n", iommu->name);
1379 		goto err_unlock;
1380 	}
1381 
1382 	set_bit(num, iommu->domain_ids);
1383 	info->refcnt	= 1;
1384 	info->did	= num;
1385 	info->iommu	= iommu;
1386 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1387 			  NULL, info, GFP_ATOMIC);
1388 	if (curr) {
1389 		ret = xa_err(curr) ? : -EBUSY;
1390 		goto err_clear;
1391 	}
1392 
1393 	spin_unlock(&iommu->lock);
1394 	return 0;
1395 
1396 err_clear:
1397 	clear_bit(info->did, iommu->domain_ids);
1398 err_unlock:
1399 	spin_unlock(&iommu->lock);
1400 	kfree(info);
1401 	return ret;
1402 }
1403 
1404 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1405 {
1406 	struct iommu_domain_info *info;
1407 
1408 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1409 		return;
1410 
1411 	spin_lock(&iommu->lock);
1412 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1413 	if (--info->refcnt == 0) {
1414 		clear_bit(info->did, iommu->domain_ids);
1415 		xa_erase(&domain->iommu_array, iommu->seq_id);
1416 		domain->nid = NUMA_NO_NODE;
1417 		kfree(info);
1418 	}
1419 	spin_unlock(&iommu->lock);
1420 }
1421 
1422 static void domain_exit(struct dmar_domain *domain)
1423 {
1424 	if (domain->pgd) {
1425 		LIST_HEAD(freelist);
1426 
1427 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1428 		iommu_put_pages_list(&freelist);
1429 	}
1430 
1431 	if (WARN_ON(!list_empty(&domain->devices)))
1432 		return;
1433 
1434 	kfree(domain->qi_batch);
1435 	kfree(domain);
1436 }
1437 
1438 /*
1439  * For kdump cases, old valid entries may be cached due to the
1440  * in-flight DMA and copied pgtable, but there is no unmapping
1441  * behaviour for them, thus we need an explicit cache flush for
1442  * the newly-mapped device. For kdump, at this point, the device
1443  * is supposed to finish reset at its driver probe stage, so no
1444  * in-flight DMA will exist, and we don't need to worry anymore
1445  * hereafter.
1446  */
1447 static void copied_context_tear_down(struct intel_iommu *iommu,
1448 				     struct context_entry *context,
1449 				     u8 bus, u8 devfn)
1450 {
1451 	u16 did_old;
1452 
1453 	if (!context_copied(iommu, bus, devfn))
1454 		return;
1455 
1456 	assert_spin_locked(&iommu->lock);
1457 
1458 	did_old = context_domain_id(context);
1459 	context_clear_entry(context);
1460 
1461 	if (did_old < cap_ndoms(iommu->cap)) {
1462 		iommu->flush.flush_context(iommu, did_old,
1463 					   PCI_DEVID(bus, devfn),
1464 					   DMA_CCMD_MASK_NOBIT,
1465 					   DMA_CCMD_DEVICE_INVL);
1466 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1467 					 DMA_TLB_DSI_FLUSH);
1468 	}
1469 
1470 	clear_context_copied(iommu, bus, devfn);
1471 }
1472 
1473 /*
1474  * It's a non-present to present mapping. If hardware doesn't cache
1475  * non-present entry we only need to flush the write-buffer. If the
1476  * _does_ cache non-present entries, then it does so in the special
1477  * domain #0, which we have to flush:
1478  */
1479 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1480 					u8 bus, u8 devfn)
1481 {
1482 	if (cap_caching_mode(iommu->cap)) {
1483 		iommu->flush.flush_context(iommu, 0,
1484 					   PCI_DEVID(bus, devfn),
1485 					   DMA_CCMD_MASK_NOBIT,
1486 					   DMA_CCMD_DEVICE_INVL);
1487 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1488 	} else {
1489 		iommu_flush_write_buffer(iommu);
1490 	}
1491 }
1492 
1493 static int domain_context_mapping_one(struct dmar_domain *domain,
1494 				      struct intel_iommu *iommu,
1495 				      u8 bus, u8 devfn)
1496 {
1497 	struct device_domain_info *info =
1498 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1499 	u16 did = domain_id_iommu(domain, iommu);
1500 	int translation = CONTEXT_TT_MULTI_LEVEL;
1501 	struct dma_pte *pgd = domain->pgd;
1502 	struct context_entry *context;
1503 	int ret;
1504 
1505 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1506 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1507 
1508 	spin_lock(&iommu->lock);
1509 	ret = -ENOMEM;
1510 	context = iommu_context_addr(iommu, bus, devfn, 1);
1511 	if (!context)
1512 		goto out_unlock;
1513 
1514 	ret = 0;
1515 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1516 		goto out_unlock;
1517 
1518 	copied_context_tear_down(iommu, context, bus, devfn);
1519 	context_clear_entry(context);
1520 	context_set_domain_id(context, did);
1521 
1522 	if (info && info->ats_supported)
1523 		translation = CONTEXT_TT_DEV_IOTLB;
1524 	else
1525 		translation = CONTEXT_TT_MULTI_LEVEL;
1526 
1527 	context_set_address_root(context, virt_to_phys(pgd));
1528 	context_set_address_width(context, domain->agaw);
1529 	context_set_translation_type(context, translation);
1530 	context_set_fault_enable(context);
1531 	context_set_present(context);
1532 	if (!ecap_coherent(iommu->ecap))
1533 		clflush_cache_range(context, sizeof(*context));
1534 	context_present_cache_flush(iommu, did, bus, devfn);
1535 	ret = 0;
1536 
1537 out_unlock:
1538 	spin_unlock(&iommu->lock);
1539 
1540 	return ret;
1541 }
1542 
1543 static int domain_context_mapping_cb(struct pci_dev *pdev,
1544 				     u16 alias, void *opaque)
1545 {
1546 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1547 	struct intel_iommu *iommu = info->iommu;
1548 	struct dmar_domain *domain = opaque;
1549 
1550 	return domain_context_mapping_one(domain, iommu,
1551 					  PCI_BUS_NUM(alias), alias & 0xff);
1552 }
1553 
1554 static int
1555 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1556 {
1557 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1558 	struct intel_iommu *iommu = info->iommu;
1559 	u8 bus = info->bus, devfn = info->devfn;
1560 
1561 	if (!dev_is_pci(dev))
1562 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1563 
1564 	return pci_for_each_dma_alias(to_pci_dev(dev),
1565 				      domain_context_mapping_cb, domain);
1566 }
1567 
1568 /* Return largest possible superpage level for a given mapping */
1569 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1570 				   unsigned long phy_pfn, unsigned long pages)
1571 {
1572 	int support, level = 1;
1573 	unsigned long pfnmerge;
1574 
1575 	support = domain->iommu_superpage;
1576 
1577 	/* To use a large page, the virtual *and* physical addresses
1578 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1579 	   of them will mean we have to use smaller pages. So just
1580 	   merge them and check both at once. */
1581 	pfnmerge = iov_pfn | phy_pfn;
1582 
1583 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1584 		pages >>= VTD_STRIDE_SHIFT;
1585 		if (!pages)
1586 			break;
1587 		pfnmerge >>= VTD_STRIDE_SHIFT;
1588 		level++;
1589 		support--;
1590 	}
1591 	return level;
1592 }
1593 
1594 /*
1595  * Ensure that old small page tables are removed to make room for superpage(s).
1596  * We're going to add new large pages, so make sure we don't remove their parent
1597  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1598  */
1599 static void switch_to_super_page(struct dmar_domain *domain,
1600 				 unsigned long start_pfn,
1601 				 unsigned long end_pfn, int level)
1602 {
1603 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1604 	struct dma_pte *pte = NULL;
1605 
1606 	while (start_pfn <= end_pfn) {
1607 		if (!pte)
1608 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1609 					     GFP_ATOMIC);
1610 
1611 		if (dma_pte_present(pte)) {
1612 			dma_pte_free_pagetable(domain, start_pfn,
1613 					       start_pfn + lvl_pages - 1,
1614 					       level + 1);
1615 
1616 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1617 					      end_pfn << VTD_PAGE_SHIFT, 0);
1618 		}
1619 
1620 		pte++;
1621 		start_pfn += lvl_pages;
1622 		if (first_pte_in_page(pte))
1623 			pte = NULL;
1624 	}
1625 }
1626 
1627 static int
1628 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1629 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1630 		 gfp_t gfp)
1631 {
1632 	struct dma_pte *first_pte = NULL, *pte = NULL;
1633 	unsigned int largepage_lvl = 0;
1634 	unsigned long lvl_pages = 0;
1635 	phys_addr_t pteval;
1636 	u64 attr;
1637 
1638 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1639 		return -EINVAL;
1640 
1641 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1642 		return -EINVAL;
1643 
1644 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1645 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1646 		return -EINVAL;
1647 	}
1648 
1649 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1650 	attr |= DMA_FL_PTE_PRESENT;
1651 	if (domain->use_first_level) {
1652 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1653 		if (prot & DMA_PTE_WRITE)
1654 			attr |= DMA_FL_PTE_DIRTY;
1655 	}
1656 
1657 	domain->has_mappings = true;
1658 
1659 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1660 
1661 	while (nr_pages > 0) {
1662 		uint64_t tmp;
1663 
1664 		if (!pte) {
1665 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1666 					phys_pfn, nr_pages);
1667 
1668 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1669 					     gfp);
1670 			if (!pte)
1671 				return -ENOMEM;
1672 			first_pte = pte;
1673 
1674 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1675 
1676 			/* It is large page*/
1677 			if (largepage_lvl > 1) {
1678 				unsigned long end_pfn;
1679 				unsigned long pages_to_remove;
1680 
1681 				pteval |= DMA_PTE_LARGE_PAGE;
1682 				pages_to_remove = min_t(unsigned long, nr_pages,
1683 							nr_pte_to_next_page(pte) * lvl_pages);
1684 				end_pfn = iov_pfn + pages_to_remove - 1;
1685 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1686 			} else {
1687 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1688 			}
1689 
1690 		}
1691 		/* We don't need lock here, nobody else
1692 		 * touches the iova range
1693 		 */
1694 		tmp = 0ULL;
1695 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1696 			static int dumps = 5;
1697 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1698 				iov_pfn, tmp, (unsigned long long)pteval);
1699 			if (dumps) {
1700 				dumps--;
1701 				debug_dma_dump_mappings(NULL);
1702 			}
1703 			WARN_ON(1);
1704 		}
1705 
1706 		nr_pages -= lvl_pages;
1707 		iov_pfn += lvl_pages;
1708 		phys_pfn += lvl_pages;
1709 		pteval += lvl_pages * VTD_PAGE_SIZE;
1710 
1711 		/* If the next PTE would be the first in a new page, then we
1712 		 * need to flush the cache on the entries we've just written.
1713 		 * And then we'll need to recalculate 'pte', so clear it and
1714 		 * let it get set again in the if (!pte) block above.
1715 		 *
1716 		 * If we're done (!nr_pages) we need to flush the cache too.
1717 		 *
1718 		 * Also if we've been setting superpages, we may need to
1719 		 * recalculate 'pte' and switch back to smaller pages for the
1720 		 * end of the mapping, if the trailing size is not enough to
1721 		 * use another superpage (i.e. nr_pages < lvl_pages).
1722 		 */
1723 		pte++;
1724 		if (!nr_pages || first_pte_in_page(pte) ||
1725 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1726 			domain_flush_cache(domain, first_pte,
1727 					   (void *)pte - (void *)first_pte);
1728 			pte = NULL;
1729 		}
1730 	}
1731 
1732 	return 0;
1733 }
1734 
1735 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1736 {
1737 	struct intel_iommu *iommu = info->iommu;
1738 	struct context_entry *context;
1739 	u16 did;
1740 
1741 	spin_lock(&iommu->lock);
1742 	context = iommu_context_addr(iommu, bus, devfn, 0);
1743 	if (!context) {
1744 		spin_unlock(&iommu->lock);
1745 		return;
1746 	}
1747 
1748 	did = context_domain_id(context);
1749 	context_clear_entry(context);
1750 	__iommu_flush_cache(iommu, context, sizeof(*context));
1751 	spin_unlock(&iommu->lock);
1752 	intel_context_flush_present(info, context, did, true);
1753 }
1754 
1755 int __domain_setup_first_level(struct intel_iommu *iommu,
1756 			       struct device *dev, ioasid_t pasid,
1757 			       u16 did, pgd_t *pgd, int flags,
1758 			       struct iommu_domain *old)
1759 {
1760 	if (!old)
1761 		return intel_pasid_setup_first_level(iommu, dev, pgd,
1762 						     pasid, did, flags);
1763 	return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1764 					       iommu_domain_did(old, iommu),
1765 					       flags);
1766 }
1767 
1768 static int domain_setup_second_level(struct intel_iommu *iommu,
1769 				     struct dmar_domain *domain,
1770 				     struct device *dev, ioasid_t pasid,
1771 				     struct iommu_domain *old)
1772 {
1773 	if (!old)
1774 		return intel_pasid_setup_second_level(iommu, domain,
1775 						      dev, pasid);
1776 	return intel_pasid_replace_second_level(iommu, domain, dev,
1777 						iommu_domain_did(old, iommu),
1778 						pasid);
1779 }
1780 
1781 static int domain_setup_passthrough(struct intel_iommu *iommu,
1782 				    struct device *dev, ioasid_t pasid,
1783 				    struct iommu_domain *old)
1784 {
1785 	if (!old)
1786 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1787 	return intel_pasid_replace_pass_through(iommu, dev,
1788 						iommu_domain_did(old, iommu),
1789 						pasid);
1790 }
1791 
1792 static int domain_setup_first_level(struct intel_iommu *iommu,
1793 				    struct dmar_domain *domain,
1794 				    struct device *dev,
1795 				    u32 pasid, struct iommu_domain *old)
1796 {
1797 	struct dma_pte *pgd = domain->pgd;
1798 	int level, flags = 0;
1799 
1800 	level = agaw_to_level(domain->agaw);
1801 	if (level != 4 && level != 5)
1802 		return -EINVAL;
1803 
1804 	if (level == 5)
1805 		flags |= PASID_FLAG_FL5LP;
1806 
1807 	if (domain->force_snooping)
1808 		flags |= PASID_FLAG_PAGE_SNOOP;
1809 
1810 	return __domain_setup_first_level(iommu, dev, pasid,
1811 					  domain_id_iommu(domain, iommu),
1812 					  (pgd_t *)pgd, flags, old);
1813 }
1814 
1815 static int dmar_domain_attach_device(struct dmar_domain *domain,
1816 				     struct device *dev)
1817 {
1818 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1819 	struct intel_iommu *iommu = info->iommu;
1820 	unsigned long flags;
1821 	int ret;
1822 
1823 	ret = domain_attach_iommu(domain, iommu);
1824 	if (ret)
1825 		return ret;
1826 
1827 	info->domain = domain;
1828 	spin_lock_irqsave(&domain->lock, flags);
1829 	list_add(&info->link, &domain->devices);
1830 	spin_unlock_irqrestore(&domain->lock, flags);
1831 
1832 	if (dev_is_real_dma_subdevice(dev))
1833 		return 0;
1834 
1835 	if (!sm_supported(iommu))
1836 		ret = domain_context_mapping(domain, dev);
1837 	else if (domain->use_first_level)
1838 		ret = domain_setup_first_level(iommu, domain, dev,
1839 					       IOMMU_NO_PASID, NULL);
1840 	else
1841 		ret = domain_setup_second_level(iommu, domain, dev,
1842 						IOMMU_NO_PASID, NULL);
1843 
1844 	if (ret)
1845 		goto out_block_translation;
1846 
1847 	iommu_enable_pci_caps(info);
1848 
1849 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1850 	if (ret)
1851 		goto out_block_translation;
1852 
1853 	return 0;
1854 
1855 out_block_translation:
1856 	device_block_translation(dev);
1857 	return ret;
1858 }
1859 
1860 /**
1861  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1862  * is relaxable (ie. is allowed to be not enforced under some conditions)
1863  * @dev: device handle
1864  *
1865  * We assume that PCI USB devices with RMRRs have them largely
1866  * for historical reasons and that the RMRR space is not actively used post
1867  * boot.  This exclusion may change if vendors begin to abuse it.
1868  *
1869  * The same exception is made for graphics devices, with the requirement that
1870  * any use of the RMRR regions will be torn down before assigning the device
1871  * to a guest.
1872  *
1873  * Return: true if the RMRR is relaxable, false otherwise
1874  */
1875 static bool device_rmrr_is_relaxable(struct device *dev)
1876 {
1877 	struct pci_dev *pdev;
1878 
1879 	if (!dev_is_pci(dev))
1880 		return false;
1881 
1882 	pdev = to_pci_dev(dev);
1883 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1884 		return true;
1885 	else
1886 		return false;
1887 }
1888 
1889 static int device_def_domain_type(struct device *dev)
1890 {
1891 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1892 	struct intel_iommu *iommu = info->iommu;
1893 
1894 	/*
1895 	 * Hardware does not support the passthrough translation mode.
1896 	 * Always use a dynamaic mapping domain.
1897 	 */
1898 	if (!ecap_pass_through(iommu->ecap))
1899 		return IOMMU_DOMAIN_DMA;
1900 
1901 	if (dev_is_pci(dev)) {
1902 		struct pci_dev *pdev = to_pci_dev(dev);
1903 
1904 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1905 			return IOMMU_DOMAIN_IDENTITY;
1906 	}
1907 
1908 	return 0;
1909 }
1910 
1911 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1912 {
1913 	/*
1914 	 * Start from the sane iommu hardware state.
1915 	 * If the queued invalidation is already initialized by us
1916 	 * (for example, while enabling interrupt-remapping) then
1917 	 * we got the things already rolling from a sane state.
1918 	 */
1919 	if (!iommu->qi) {
1920 		/*
1921 		 * Clear any previous faults.
1922 		 */
1923 		dmar_fault(-1, iommu);
1924 		/*
1925 		 * Disable queued invalidation if supported and already enabled
1926 		 * before OS handover.
1927 		 */
1928 		dmar_disable_qi(iommu);
1929 	}
1930 
1931 	if (dmar_enable_qi(iommu)) {
1932 		/*
1933 		 * Queued Invalidate not enabled, use Register Based Invalidate
1934 		 */
1935 		iommu->flush.flush_context = __iommu_flush_context;
1936 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1937 		pr_info("%s: Using Register based invalidation\n",
1938 			iommu->name);
1939 	} else {
1940 		iommu->flush.flush_context = qi_flush_context;
1941 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1942 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1943 	}
1944 }
1945 
1946 static int copy_context_table(struct intel_iommu *iommu,
1947 			      struct root_entry *old_re,
1948 			      struct context_entry **tbl,
1949 			      int bus, bool ext)
1950 {
1951 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1952 	struct context_entry *new_ce = NULL, ce;
1953 	struct context_entry *old_ce = NULL;
1954 	struct root_entry re;
1955 	phys_addr_t old_ce_phys;
1956 
1957 	tbl_idx = ext ? bus * 2 : bus;
1958 	memcpy(&re, old_re, sizeof(re));
1959 
1960 	for (devfn = 0; devfn < 256; devfn++) {
1961 		/* First calculate the correct index */
1962 		idx = (ext ? devfn * 2 : devfn) % 256;
1963 
1964 		if (idx == 0) {
1965 			/* First save what we may have and clean up */
1966 			if (new_ce) {
1967 				tbl[tbl_idx] = new_ce;
1968 				__iommu_flush_cache(iommu, new_ce,
1969 						    VTD_PAGE_SIZE);
1970 				pos = 1;
1971 			}
1972 
1973 			if (old_ce)
1974 				memunmap(old_ce);
1975 
1976 			ret = 0;
1977 			if (devfn < 0x80)
1978 				old_ce_phys = root_entry_lctp(&re);
1979 			else
1980 				old_ce_phys = root_entry_uctp(&re);
1981 
1982 			if (!old_ce_phys) {
1983 				if (ext && devfn == 0) {
1984 					/* No LCTP, try UCTP */
1985 					devfn = 0x7f;
1986 					continue;
1987 				} else {
1988 					goto out;
1989 				}
1990 			}
1991 
1992 			ret = -ENOMEM;
1993 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1994 					MEMREMAP_WB);
1995 			if (!old_ce)
1996 				goto out;
1997 
1998 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
1999 			if (!new_ce)
2000 				goto out_unmap;
2001 
2002 			ret = 0;
2003 		}
2004 
2005 		/* Now copy the context entry */
2006 		memcpy(&ce, old_ce + idx, sizeof(ce));
2007 
2008 		if (!context_present(&ce))
2009 			continue;
2010 
2011 		did = context_domain_id(&ce);
2012 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2013 			set_bit(did, iommu->domain_ids);
2014 
2015 		set_context_copied(iommu, bus, devfn);
2016 		new_ce[idx] = ce;
2017 	}
2018 
2019 	tbl[tbl_idx + pos] = new_ce;
2020 
2021 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2022 
2023 out_unmap:
2024 	memunmap(old_ce);
2025 
2026 out:
2027 	return ret;
2028 }
2029 
/*
 * copy_translation_tables - take over the context tables the hardware uses
 * @iommu: unit whose DMAR_RTADDR_REG still names an existing root table
 *
 * Maps the root table named by DMAR_RTADDR_REG, copies the context table(s)
 * of each of the 256 buses with copy_context_table() and then installs the
 * copies into iommu->root_entry while holding iommu->lock.
 *
 * Return: 0 on success; -EINVAL if the scalable-mode bit read from the
 * register disagrees with sm_supported() or the register holds no table
 * address; -ENOMEM if an allocation or the mapping fails.  Failing to copy
 * the table of a single bus is only logged and does not fail the call.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
	new_ext    = !!sm_supported(iommu);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	/*
	 * 2^16 bits; presumably one per (bus, devfn) pair, as it is filled
	 * through set_context_copied(iommu, bus, devfn) - confirm.
	 *
	 * NOTE(review): the error returns below leave this bitmap allocated;
	 * presumably the IOMMU teardown path releases it - confirm.
	 */
	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
	if (!iommu->copied_tables)
		return -ENOMEM;

	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	/* With ext set there are two slots per bus (root entry lo and hi). */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	/* Best effort: a bus that fails to copy is reported and skipped. */
	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock(&iommu->lock);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		/* Bit 0 is set alongside the physical address of the copy. */
		if (ctxt_tbls[idx]) {
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		/* The hi half is only written when ext is set. */
		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock(&iommu->lock);

	/* Only the pointer array is freed; the copied tables stay in use. */
	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
2114 
/*
 * init_dmars - boot-time setup of every DMAR unit
 *
 * Runs the static capability audit, then for each non-ignored unit sets up
 * queued invalidation, domain bookkeeping and a root-entry table, copying
 * the existing translation tables when translation was left enabled and
 * this is a kdump kernel.  Afterwards the root entries are programmed and
 * page-request queue and fault interrupt are set up per unit.
 *
 * The caller is expected to hold dmar_global_lock for writing: the lock is
 * dropped and re-taken around intel_iommu_enable_prq() below.
 *
 * Return: 0 on success, otherwise the error code of the failing step after
 * every active unit has been disabled and freed.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		/* Ignored units only get their translation switched off. */
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Outside of kdump a pre-enabled translation is not kept. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

	check_tylersburg_isoch();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		if (ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_iommu_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Unwind: every active unit is disabled and its resources freed. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}
2253 
/*
 * init_no_remapping_devices - decide which DMAR units can be ignored
 *
 * First pass: a unit without include_all whose device scope contains no
 * active device is marked ignored.  Second pass: a still-active unit
 * without include_all whose active scope devices are all PCI display
 * devices is flagged gfx_dedicated, and additionally ignored when
 * disable_igfx_iommu is set.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			/* Stop at the first active device, if there is one. */
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Stop at the first active device that is not PCI graphics. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		/* The loop ended early, so a non-gfx device is in scope. */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (disable_igfx_iommu)
			drhd->ignored = 1;
	}
}
2289 
2290 #ifdef CONFIG_SUSPEND
2291 static int init_iommu_hw(void)
2292 {
2293 	struct dmar_drhd_unit *drhd;
2294 	struct intel_iommu *iommu = NULL;
2295 	int ret;
2296 
2297 	for_each_active_iommu(iommu, drhd) {
2298 		if (iommu->qi) {
2299 			ret = dmar_reenable_qi(iommu);
2300 			if (ret)
2301 				return ret;
2302 		}
2303 	}
2304 
2305 	for_each_iommu(iommu, drhd) {
2306 		if (drhd->ignored) {
2307 			/*
2308 			 * we always have to disable PMRs or DMA may fail on
2309 			 * this device
2310 			 */
2311 			if (force_on)
2312 				iommu_disable_protect_mem_regions(iommu);
2313 			continue;
2314 		}
2315 
2316 		iommu_flush_write_buffer(iommu);
2317 		iommu_set_root_entry(iommu);
2318 		iommu_enable_translation(iommu);
2319 		iommu_disable_protect_mem_regions(iommu);
2320 	}
2321 
2322 	return 0;
2323 }
2324 
2325 static void iommu_flush_all(void)
2326 {
2327 	struct dmar_drhd_unit *drhd;
2328 	struct intel_iommu *iommu;
2329 
2330 	for_each_active_iommu(iommu, drhd) {
2331 		iommu->flush.flush_context(iommu, 0, 0, 0,
2332 					   DMA_CCMD_GLOBAL_INVL);
2333 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2334 					 DMA_TLB_GLOBAL_FLUSH);
2335 	}
2336 }
2337 
2338 static int iommu_suspend(void)
2339 {
2340 	struct dmar_drhd_unit *drhd;
2341 	struct intel_iommu *iommu = NULL;
2342 	unsigned long flag;
2343 
2344 	iommu_flush_all();
2345 
2346 	for_each_active_iommu(iommu, drhd) {
2347 		iommu_disable_translation(iommu);
2348 
2349 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2350 
2351 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2352 			readl(iommu->reg + DMAR_FECTL_REG);
2353 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2354 			readl(iommu->reg + DMAR_FEDATA_REG);
2355 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2356 			readl(iommu->reg + DMAR_FEADDR_REG);
2357 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2358 			readl(iommu->reg + DMAR_FEUADDR_REG);
2359 
2360 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2361 	}
2362 	return 0;
2363 }
2364 
2365 static void iommu_resume(void)
2366 {
2367 	struct dmar_drhd_unit *drhd;
2368 	struct intel_iommu *iommu = NULL;
2369 	unsigned long flag;
2370 
2371 	if (init_iommu_hw()) {
2372 		if (force_on)
2373 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2374 		else
2375 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2376 		return;
2377 	}
2378 
2379 	for_each_active_iommu(iommu, drhd) {
2380 
2381 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2382 
2383 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2384 			iommu->reg + DMAR_FECTL_REG);
2385 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2386 			iommu->reg + DMAR_FEDATA_REG);
2387 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2388 			iommu->reg + DMAR_FEADDR_REG);
2389 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2390 			iommu->reg + DMAR_FEUADDR_REG);
2391 
2392 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2393 	}
2394 }
2395 
2396 static struct syscore_ops iommu_syscore_ops = {
2397 	.resume		= iommu_resume,
2398 	.suspend	= iommu_suspend,
2399 };
2400 
/* Register the IOMMU suspend/resume callbacks with the syscore layer. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2405 
2406 #else
/* Stub used when the suspend support above is compiled out. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2409 
2410 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2411 {
2412 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2413 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2414 	    rmrr->end_address <= rmrr->base_address ||
2415 	    arch_rmrr_sanity_check(rmrr))
2416 		return -EINVAL;
2417 
2418 	return 0;
2419 }
2420 
2421 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2422 {
2423 	struct acpi_dmar_reserved_memory *rmrr;
2424 	struct dmar_rmrr_unit *rmrru;
2425 
2426 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2427 	if (rmrr_sanity_check(rmrr)) {
2428 		pr_warn(FW_BUG
2429 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2430 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2431 			   rmrr->base_address, rmrr->end_address,
2432 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2433 			   dmi_get_system_info(DMI_BIOS_VERSION),
2434 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2435 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2436 	}
2437 
2438 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2439 	if (!rmrru)
2440 		goto out;
2441 
2442 	rmrru->hdr = header;
2443 
2444 	rmrru->base_address = rmrr->base_address;
2445 	rmrru->end_address = rmrr->end_address;
2446 
2447 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2448 				((void *)rmrr) + rmrr->header.length,
2449 				&rmrru->devices_cnt);
2450 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2451 		goto free_rmrru;
2452 
2453 	list_add(&rmrru->list, &dmar_rmrr_units);
2454 
2455 	return 0;
2456 free_rmrru:
2457 	kfree(rmrru);
2458 out:
2459 	return -ENOMEM;
2460 }
2461 
2462 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2463 {
2464 	struct dmar_atsr_unit *atsru;
2465 	struct acpi_dmar_atsr *tmp;
2466 
2467 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2468 				dmar_rcu_check()) {
2469 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2470 		if (atsr->segment != tmp->segment)
2471 			continue;
2472 		if (atsr->header.length != tmp->header.length)
2473 			continue;
2474 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2475 			return atsru;
2476 	}
2477 
2478 	return NULL;
2479 }
2480 
2481 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2482 {
2483 	struct acpi_dmar_atsr *atsr;
2484 	struct dmar_atsr_unit *atsru;
2485 
2486 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2487 		return 0;
2488 
2489 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2490 	atsru = dmar_find_atsr(atsr);
2491 	if (atsru)
2492 		return 0;
2493 
2494 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2495 	if (!atsru)
2496 		return -ENOMEM;
2497 
2498 	/*
2499 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2500 	 * copy the memory content because the memory buffer will be freed
2501 	 * on return.
2502 	 */
2503 	atsru->hdr = (void *)(atsru + 1);
2504 	memcpy(atsru->hdr, hdr, hdr->length);
2505 	atsru->include_all = atsr->flags & 0x1;
2506 	if (!atsru->include_all) {
2507 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2508 				(void *)atsr + atsr->header.length,
2509 				&atsru->devices_cnt);
2510 		if (atsru->devices_cnt && atsru->devices == NULL) {
2511 			kfree(atsru);
2512 			return -ENOMEM;
2513 		}
2514 	}
2515 
2516 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2517 
2518 	return 0;
2519 }
2520 
/* Release the device scope of an ATSR unit and then the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
2526 
2527 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2528 {
2529 	struct acpi_dmar_atsr *atsr;
2530 	struct dmar_atsr_unit *atsru;
2531 
2532 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2533 	atsru = dmar_find_atsr(atsr);
2534 	if (atsru) {
2535 		list_del_rcu(&atsru->list);
2536 		synchronize_rcu();
2537 		intel_iommu_free_atsr(atsru);
2538 	}
2539 
2540 	return 0;
2541 }
2542 
2543 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2544 {
2545 	int i;
2546 	struct device *dev;
2547 	struct acpi_dmar_atsr *atsr;
2548 	struct dmar_atsr_unit *atsru;
2549 
2550 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2551 	atsru = dmar_find_atsr(atsr);
2552 	if (!atsru)
2553 		return 0;
2554 
2555 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2556 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2557 					  i, dev)
2558 			return -EBUSY;
2559 	}
2560 
2561 	return 0;
2562 }
2563 
2564 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2565 {
2566 	struct dmar_satc_unit *satcu;
2567 	struct acpi_dmar_satc *tmp;
2568 
2569 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2570 				dmar_rcu_check()) {
2571 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2572 		if (satc->segment != tmp->segment)
2573 			continue;
2574 		if (satc->header.length != tmp->header.length)
2575 			continue;
2576 		if (memcmp(satc, tmp, satc->header.length) == 0)
2577 			return satcu;
2578 	}
2579 
2580 	return NULL;
2581 }
2582 
2583 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2584 {
2585 	struct acpi_dmar_satc *satc;
2586 	struct dmar_satc_unit *satcu;
2587 
2588 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2589 		return 0;
2590 
2591 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2592 	satcu = dmar_find_satc(satc);
2593 	if (satcu)
2594 		return 0;
2595 
2596 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2597 	if (!satcu)
2598 		return -ENOMEM;
2599 
2600 	satcu->hdr = (void *)(satcu + 1);
2601 	memcpy(satcu->hdr, hdr, hdr->length);
2602 	satcu->atc_required = satc->flags & 0x1;
2603 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2604 					      (void *)satc + satc->header.length,
2605 					      &satcu->devices_cnt);
2606 	if (satcu->devices_cnt && !satcu->devices) {
2607 		kfree(satcu);
2608 		return -ENOMEM;
2609 	}
2610 	list_add_rcu(&satcu->list, &dmar_satc_units);
2611 
2612 	return 0;
2613 }
2614 
/*
 * intel_iommu_add - bring up a hot-added DMAR unit
 * @dmaru: the DRHD unit whose IOMMU is to be initialised
 *
 * Audits the unit's capabilities, allocates its domain bookkeeping and
 * root-entry table and, unless the unit is ignored, sets up queued
 * invalidation, the page-request queue (when ecap_prs() reports support)
 * and the fault interrupt before enabling translation.
 *
 * Return: 0 on success, otherwise the error of the failing step; on error
 * the unit's resources are released with free_dmar_iommu().
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	struct intel_iommu *iommu = dmaru->iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	/* The root entry is only allocated if domain init succeeded. */
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

	if (ecap_prs(iommu->ecap)) {
		ret = intel_iommu_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}

	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

	/* Later failures also disable the unit before freeing it. */
disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
2672 
2673 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2674 {
2675 	int ret = 0;
2676 	struct intel_iommu *iommu = dmaru->iommu;
2677 
2678 	if (!intel_iommu_enabled)
2679 		return 0;
2680 	if (iommu == NULL)
2681 		return -EINVAL;
2682 
2683 	if (insert) {
2684 		ret = intel_iommu_add(dmaru);
2685 	} else {
2686 		disable_dmar_iommu(iommu);
2687 		free_dmar_iommu(iommu);
2688 	}
2689 
2690 	return ret;
2691 }
2692 
2693 static void intel_iommu_free_dmars(void)
2694 {
2695 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2696 	struct dmar_atsr_unit *atsru, *atsr_n;
2697 	struct dmar_satc_unit *satcu, *satc_n;
2698 
2699 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2700 		list_del(&rmrru->list);
2701 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2702 		kfree(rmrru);
2703 	}
2704 
2705 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2706 		list_del(&atsru->list);
2707 		intel_iommu_free_atsr(atsru);
2708 	}
2709 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2710 		list_del(&satcu->list);
2711 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2712 		kfree(satcu);
2713 	}
2714 }
2715 
2716 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2717 {
2718 	struct dmar_satc_unit *satcu;
2719 	struct acpi_dmar_satc *satc;
2720 	struct device *tmp;
2721 	int i;
2722 
2723 	dev = pci_physfn(dev);
2724 	rcu_read_lock();
2725 
2726 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2727 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2728 		if (satc->segment != pci_domain_nr(dev->bus))
2729 			continue;
2730 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2731 			if (to_pci_dev(tmp) == dev)
2732 				goto out;
2733 	}
2734 	satcu = NULL;
2735 out:
2736 	rcu_read_unlock();
2737 	return satcu;
2738 }
2739 
/*
 * dmar_ats_supported - decide whether the OS may enable ATS for a device
 * @dev:   PCI device (a virtual function is resolved to its physical one)
 * @iommu: IOMMU the device sits behind
 *
 * A device listed in a SATC unit is decided by that unit alone.  Otherwise
 * the bus hierarchy is walked upwards: a device with no upstream bridge is
 * allowed, a path through a non-PCIe bridge or a PCIe-to-PCI bridge is
 * refused, and once a root port is reached it must be covered by an ATSR
 * unit of the device's segment (listed explicitly or via include_all).
 *
 * Return: 1 if ATS may be enabled, 0 otherwise.
 */
static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;

	dev = pci_physfn(dev);
	satcu = dmar_find_matched_satc_unit(dev);
	if (satcu)
		/*
		 * This device supports ATS as it is in SATC table.
		 * When IOMMU is in legacy mode, enabling ATS is done
		 * automatically by HW for the device that requires
		 * ATS, hence OS should not enable this device ATS
		 * to avoid duplicated TLB invalidation.
		 */
		return !(satcu->atc_required && !sm_supported(iommu));

	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	/* ret is still 1 here; it only drops to 0 if no ATSR unit matches. */
	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}
2795 
/*
 * dmar_iommu_notify_scope_dev - update device scopes on a PCI bus event
 * @info: notification describing the device and the event
 *
 * For BUS_NOTIFY_ADD_DEVICE the device is offered to the device scope of
 * every RMRR, ATSR and SATC unit; for BUS_NOTIFY_REMOVED_DEVICE it is
 * dropped from them.  ATSR units with include_all have no explicit scope
 * and are skipped.  Nothing is done once the system is running with the
 * Intel IOMMU disabled.
 *
 * Return: 0 on success, or the negative value returned by
 * dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;
	struct acpi_dmar_satc *satc;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	/* RMRR units: every unit is visited, there is no early exit on a hit. */
	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	/*
	 * ATSR units: the walk stops at the first unit for which the helper
	 * returns a positive/true value - presumably meaning the device was
	 * found in that unit's scope (confirm against the helpers).
	 */
	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}
	/* SATC units: same stop-at-first-hit handling as for ATSR units. */
	list_for_each_entry(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
					(void *)satc + satc->header.length,
					satc->segment, satcu->devices,
					satcu->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, satc->segment,
					satcu->devices, satcu->devices_cnt))
				break;
		}
	}

	return 0;
}
2865 
2866 static void intel_disable_iommus(void)
2867 {
2868 	struct intel_iommu *iommu = NULL;
2869 	struct dmar_drhd_unit *drhd;
2870 
2871 	for_each_iommu(iommu, drhd)
2872 		iommu_disable_translation(iommu);
2873 }
2874 
2875 void intel_iommu_shutdown(void)
2876 {
2877 	struct dmar_drhd_unit *drhd;
2878 	struct intel_iommu *iommu = NULL;
2879 
2880 	if (no_iommu || dmar_disabled)
2881 		return;
2882 
2883 	down_write(&dmar_global_lock);
2884 
2885 	/* Disable PMRs explicitly here. */
2886 	for_each_iommu(iommu, drhd)
2887 		iommu_disable_protect_mem_regions(iommu);
2888 
2889 	/* Make sure the IOMMUs are switched off */
2890 	intel_disable_iommus();
2891 
2892 	up_write(&dmar_global_lock);
2893 }
2894 
2895 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2896 {
2897 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2898 
2899 	return container_of(iommu_dev, struct intel_iommu, iommu);
2900 }
2901 
2902 static ssize_t version_show(struct device *dev,
2903 			    struct device_attribute *attr, char *buf)
2904 {
2905 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2906 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2907 	return sysfs_emit(buf, "%d:%d\n",
2908 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2909 }
2910 static DEVICE_ATTR_RO(version);
2911 
2912 static ssize_t address_show(struct device *dev,
2913 			    struct device_attribute *attr, char *buf)
2914 {
2915 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2916 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2917 }
2918 static DEVICE_ATTR_RO(address);
2919 
2920 static ssize_t cap_show(struct device *dev,
2921 			struct device_attribute *attr, char *buf)
2922 {
2923 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2924 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2925 }
2926 static DEVICE_ATTR_RO(cap);
2927 
2928 static ssize_t ecap_show(struct device *dev,
2929 			 struct device_attribute *attr, char *buf)
2930 {
2931 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2932 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2933 }
2934 static DEVICE_ATTR_RO(ecap);
2935 
2936 static ssize_t domains_supported_show(struct device *dev,
2937 				      struct device_attribute *attr, char *buf)
2938 {
2939 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2940 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2941 }
2942 static DEVICE_ATTR_RO(domains_supported);
2943 
2944 static ssize_t domains_used_show(struct device *dev,
2945 				 struct device_attribute *attr, char *buf)
2946 {
2947 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2948 	return sysfs_emit(buf, "%d\n",
2949 			  bitmap_weight(iommu->domain_ids,
2950 					cap_ndoms(iommu->cap)));
2951 }
2952 static DEVICE_ATTR_RO(domains_used);
2953 
/* NULL-terminated list of the sysfs attributes defined above. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
2963 
/* The attributes are grouped under the name "intel-iommu". */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
2968 
/* NULL-terminated group list handed to iommu_device_sysfs_add(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
2973 
2974 static bool has_external_pci(void)
2975 {
2976 	struct pci_dev *pdev = NULL;
2977 
2978 	for_each_pci_dev(pdev)
2979 		if (pdev->external_facing) {
2980 			pci_dev_put(pdev);
2981 			return true;
2982 		}
2983 
2984 	return false;
2985 }
2986 
2987 static int __init platform_optin_force_iommu(void)
2988 {
2989 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2990 		return 0;
2991 
2992 	if (no_iommu || dmar_disabled)
2993 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2994 
2995 	/*
2996 	 * If Intel-IOMMU is disabled by default, we will apply identity
2997 	 * map for all devices except those marked as being untrusted.
2998 	 */
2999 	if (dmar_disabled)
3000 		iommu_set_default_passthrough(false);
3001 
3002 	dmar_disabled = 0;
3003 	no_iommu = 0;
3004 
3005 	return 1;
3006 }
3007 
3008 static int __init probe_acpi_namespace_devices(void)
3009 {
3010 	struct dmar_drhd_unit *drhd;
3011 	/* To avoid a -Wunused-but-set-variable warning. */
3012 	struct intel_iommu *iommu __maybe_unused;
3013 	struct device *dev;
3014 	int i, ret = 0;
3015 
3016 	for_each_active_iommu(iommu, drhd) {
3017 		for_each_active_dev_scope(drhd->devices,
3018 					  drhd->devices_cnt, i, dev) {
3019 			struct acpi_device_physical_node *pn;
3020 			struct acpi_device *adev;
3021 
3022 			if (dev->bus != &acpi_bus_type)
3023 				continue;
3024 
3025 			adev = to_acpi_device(dev);
3026 			mutex_lock(&adev->physical_node_lock);
3027 			list_for_each_entry(pn,
3028 					    &adev->physical_node_list, node) {
3029 				ret = iommu_probe_device(pn->dev);
3030 				if (ret)
3031 					break;
3032 			}
3033 			mutex_unlock(&adev->physical_node_lock);
3034 
3035 			if (ret)
3036 				return ret;
3037 		}
3038 	}
3039 
3040 	return 0;
3041 }
3042 
3043 static __init int tboot_force_iommu(void)
3044 {
3045 	if (!tboot_enabled())
3046 		return 0;
3047 
3048 	if (no_iommu || dmar_disabled)
3049 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3050 
3051 	dmar_disabled = 0;
3052 	no_iommu = 0;
3053 
3054 	return 1;
3055 }
3056 
/*
 * intel_iommu_init - top-level boot-time initialisation of the Intel IOMMU
 *
 * Parses the DMAR table and device scopes, registers the bus notifier and,
 * unless the IOMMU is disabled, initialises every DMAR unit, registers the
 * units with sysfs and the IOMMU core and finally enables translation.
 *
 * When force_on is set (tboot launch or platform opt-in) a failure of the
 * table parsing, scope initialisation or init_dmars() panics instead of
 * returning.
 *
 * Return: 0 on success; -ENODEV if the tables cannot be parsed or the
 * IOMMU is disabled; otherwise the error from init_dmars().
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		/* ret still holds -ENODEV on this path. */
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* From here on the global lock is only taken for reading. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(iommu)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		/* NOTE(review): the return values of both calls are ignored. */
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}

	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		/* Units whose translation was pre-enabled are left as-is. */
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	/* Every jump to this label holds dmar_global_lock for writing. */
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3186 
3187 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3188 {
3189 	struct device_domain_info *info = opaque;
3190 
3191 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3192 	return 0;
3193 }
3194 
3195 /*
3196  * NB - intel-iommu lacks any sort of reference counting for the users of
3197  * dependent devices.  If multiple endpoints have intersecting dependent
3198  * devices, unbinding the driver from any one of them will possibly leave
3199  * the others unable to operate.
3200  */
3201 static void domain_context_clear(struct device_domain_info *info)
3202 {
3203 	if (!dev_is_pci(info->dev)) {
3204 		domain_context_clear_one(info, info->bus, info->devfn);
3205 		return;
3206 	}
3207 
3208 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3209 			       &domain_context_clear_one_cb, info);
3210 }
3211 
3212 /*
3213  * Clear the page table pointer in context or pasid table entries so that
3214  * all DMA requests without PASID from the device are blocked. If the page
3215  * table has been set, clean up the data structures.
3216  */
3217 void device_block_translation(struct device *dev)
3218 {
3219 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3220 	struct intel_iommu *iommu = info->iommu;
3221 	unsigned long flags;
3222 
3223 	iommu_disable_pci_caps(info);
3224 	if (!dev_is_real_dma_subdevice(dev)) {
3225 		if (sm_supported(iommu))
3226 			intel_pasid_tear_down_entry(iommu, dev,
3227 						    IOMMU_NO_PASID, false);
3228 		else
3229 			domain_context_clear(info);
3230 	}
3231 
3232 	if (!info->domain)
3233 		return;
3234 
3235 	spin_lock_irqsave(&info->domain->lock, flags);
3236 	list_del(&info->link);
3237 	spin_unlock_irqrestore(&info->domain->lock, flags);
3238 
3239 	cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3240 	domain_detach_iommu(info->domain, iommu);
3241 	info->domain = NULL;
3242 }
3243 
/*
 * attach_dev handler of the blocking domain: attaching simply means
 * blocking translation for @dev.  @domain is not used.  Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	const int success = 0;

	device_block_translation(dev);

	return success;
}
3250 
3251 static struct iommu_domain blocking_domain = {
3252 	.type = IOMMU_DOMAIN_BLOCKED,
3253 	.ops = &(const struct iommu_domain_ops) {
3254 		.attach_dev	= blocking_domain_attach_dev,
3255 	}
3256 };
3257 
3258 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3259 {
3260 	if (!intel_iommu_superpage)
3261 		return 0;
3262 
3263 	if (first_stage)
3264 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3265 
3266 	return fls(cap_super_page_val(iommu->cap));
3267 }
3268 
3269 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3270 {
3271 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3272 	struct intel_iommu *iommu = info->iommu;
3273 	struct dmar_domain *domain;
3274 	int addr_width;
3275 
3276 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3277 	if (!domain)
3278 		return ERR_PTR(-ENOMEM);
3279 
3280 	INIT_LIST_HEAD(&domain->devices);
3281 	INIT_LIST_HEAD(&domain->dev_pasids);
3282 	INIT_LIST_HEAD(&domain->cache_tags);
3283 	spin_lock_init(&domain->lock);
3284 	spin_lock_init(&domain->cache_lock);
3285 	xa_init(&domain->iommu_array);
3286 
3287 	domain->nid = dev_to_node(dev);
3288 	domain->use_first_level = first_stage;
3289 
3290 	/* calculate the address width */
3291 	addr_width = agaw_to_width(iommu->agaw);
3292 	if (addr_width > cap_mgaw(iommu->cap))
3293 		addr_width = cap_mgaw(iommu->cap);
3294 	domain->gaw = addr_width;
3295 	domain->agaw = iommu->agaw;
3296 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3297 
3298 	/* iommu memory access coherency */
3299 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3300 
3301 	/* pagesize bitmap */
3302 	domain->domain.pgsize_bitmap = SZ_4K;
3303 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3304 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3305 
3306 	/*
3307 	 * IOVA aperture: First-level translation restricts the input-address
3308 	 * to a canonical address (i.e., address bits 63:N have the same value
3309 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3310 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3311 	 */
3312 	domain->domain.geometry.force_aperture = true;
3313 	domain->domain.geometry.aperture_start = 0;
3314 	if (first_stage)
3315 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3316 	else
3317 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3318 
3319 	/* always allocate the top pgd */
3320 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3321 	if (!domain->pgd) {
3322 		kfree(domain);
3323 		return ERR_PTR(-ENOMEM);
3324 	}
3325 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3326 
3327 	return domain;
3328 }
3329 
3330 static struct iommu_domain *
3331 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3332 				      const struct iommu_user_data *user_data)
3333 {
3334 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3335 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3336 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3337 	struct intel_iommu *iommu = info->iommu;
3338 	struct dmar_domain *dmar_domain;
3339 	struct iommu_domain *domain;
3340 	bool first_stage;
3341 
3342 	if (flags &
3343 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3344 	       | IOMMU_HWPT_FAULT_ID_VALID)))
3345 		return ERR_PTR(-EOPNOTSUPP);
3346 	if (nested_parent && !nested_supported(iommu))
3347 		return ERR_PTR(-EOPNOTSUPP);
3348 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3349 		return ERR_PTR(-EOPNOTSUPP);
3350 
3351 	/*
3352 	 * Always allocate the guest compatible page table unless
3353 	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3354 	 * is specified.
3355 	 */
3356 	if (nested_parent || dirty_tracking) {
3357 		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3358 			return ERR_PTR(-EOPNOTSUPP);
3359 		first_stage = false;
3360 	} else {
3361 		first_stage = first_level_by_default(iommu);
3362 	}
3363 
3364 	dmar_domain = paging_domain_alloc(dev, first_stage);
3365 	if (IS_ERR(dmar_domain))
3366 		return ERR_CAST(dmar_domain);
3367 	domain = &dmar_domain->domain;
3368 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3369 	domain->owner = &intel_iommu_ops;
3370 	domain->ops = intel_iommu_ops.default_domain_ops;
3371 
3372 	if (nested_parent) {
3373 		dmar_domain->nested_parent = true;
3374 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3375 		spin_lock_init(&dmar_domain->s1_lock);
3376 	}
3377 
3378 	if (dirty_tracking) {
3379 		if (dmar_domain->use_first_level) {
3380 			iommu_domain_free(domain);
3381 			return ERR_PTR(-EOPNOTSUPP);
3382 		}
3383 		domain->dirty_ops = &intel_dirty_ops;
3384 	}
3385 
3386 	return domain;
3387 }
3388 
3389 static void intel_iommu_domain_free(struct iommu_domain *domain)
3390 {
3391 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3392 
3393 	WARN_ON(dmar_domain->nested_parent &&
3394 		!list_empty(&dmar_domain->s1_domains));
3395 	domain_exit(dmar_domain);
3396 }
3397 
3398 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3399 {
3400 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3401 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3402 	struct intel_iommu *iommu = info->iommu;
3403 	int addr_width;
3404 
3405 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3406 		return -EPERM;
3407 
3408 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3409 		return -EINVAL;
3410 
3411 	if (domain->dirty_ops && !ssads_supported(iommu))
3412 		return -EINVAL;
3413 
3414 	if (dmar_domain->iommu_coherency !=
3415 			iommu_paging_structure_coherency(iommu))
3416 		return -EINVAL;
3417 
3418 	if (dmar_domain->iommu_superpage !=
3419 			iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3420 		return -EINVAL;
3421 
3422 	if (dmar_domain->use_first_level &&
3423 	    (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3424 		return -EINVAL;
3425 
3426 	/* check if this iommu agaw is sufficient for max mapped address */
3427 	addr_width = agaw_to_width(iommu->agaw);
3428 	if (addr_width > cap_mgaw(iommu->cap))
3429 		addr_width = cap_mgaw(iommu->cap);
3430 
3431 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3432 		return -EINVAL;
3433 
3434 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3435 	    context_copied(iommu, info->bus, info->devfn))
3436 		return intel_pasid_setup_sm_context(dev);
3437 
3438 	return 0;
3439 }
3440 
/*
 * Attach @dev to the paging @domain.  Translation for the device is
 * blocked first; the domain is then checked for compatibility and, if it
 * is compatible, attached.  Returns 0 or the error of the failing step.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	device_block_translation(dev);

	ret = paging_domain_compatible(domain, dev);
	if (!ret)
		ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);

	return ret;
}
3454 
3455 static int intel_iommu_map(struct iommu_domain *domain,
3456 			   unsigned long iova, phys_addr_t hpa,
3457 			   size_t size, int iommu_prot, gfp_t gfp)
3458 {
3459 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3460 	u64 max_addr;
3461 	int prot = 0;
3462 
3463 	if (iommu_prot & IOMMU_READ)
3464 		prot |= DMA_PTE_READ;
3465 	if (iommu_prot & IOMMU_WRITE)
3466 		prot |= DMA_PTE_WRITE;
3467 	if (dmar_domain->set_pte_snp)
3468 		prot |= DMA_PTE_SNP;
3469 
3470 	max_addr = iova + size;
3471 	if (dmar_domain->max_addr < max_addr) {
3472 		u64 end;
3473 
3474 		/* check if minimum agaw is sufficient for mapped address */
3475 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3476 		if (end < max_addr) {
3477 			pr_err("%s: iommu width (%d) is not "
3478 			       "sufficient for the mapped address (%llx)\n",
3479 			       __func__, dmar_domain->gaw, max_addr);
3480 			return -EFAULT;
3481 		}
3482 		dmar_domain->max_addr = max_addr;
3483 	}
3484 	/* Round up size to next multiple of PAGE_SIZE, if it and
3485 	   the low bits of hpa would take us onto the next page */
3486 	size = aligned_nrpages(hpa, size);
3487 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3488 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3489 }
3490 
3491 static int intel_iommu_map_pages(struct iommu_domain *domain,
3492 				 unsigned long iova, phys_addr_t paddr,
3493 				 size_t pgsize, size_t pgcount,
3494 				 int prot, gfp_t gfp, size_t *mapped)
3495 {
3496 	unsigned long pgshift = __ffs(pgsize);
3497 	size_t size = pgcount << pgshift;
3498 	int ret;
3499 
3500 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3501 		return -EINVAL;
3502 
3503 	if (!IS_ALIGNED(iova | paddr, pgsize))
3504 		return -EINVAL;
3505 
3506 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3507 	if (!ret && mapped)
3508 		*mapped = size;
3509 
3510 	return ret;
3511 }
3512 
3513 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3514 				unsigned long iova, size_t size,
3515 				struct iommu_iotlb_gather *gather)
3516 {
3517 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3518 	unsigned long start_pfn, last_pfn;
3519 	int level = 0;
3520 
3521 	/* Cope with horrid API which requires us to unmap more than the
3522 	   size argument if it happens to be a large-page mapping. */
3523 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3524 				     &level, GFP_ATOMIC)))
3525 		return 0;
3526 
3527 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3528 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3529 
3530 	start_pfn = iova >> VTD_PAGE_SHIFT;
3531 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3532 
3533 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3534 
3535 	if (dmar_domain->max_addr == iova + size)
3536 		dmar_domain->max_addr = iova;
3537 
3538 	/*
3539 	 * We do not use page-selective IOTLB invalidation in flush queue,
3540 	 * so there is no need to track page and sync iotlb.
3541 	 */
3542 	if (!iommu_iotlb_gather_queued(gather))
3543 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3544 
3545 	return size;
3546 }
3547 
3548 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3549 				      unsigned long iova,
3550 				      size_t pgsize, size_t pgcount,
3551 				      struct iommu_iotlb_gather *gather)
3552 {
3553 	unsigned long pgshift = __ffs(pgsize);
3554 	size_t size = pgcount << pgshift;
3555 
3556 	return intel_iommu_unmap(domain, iova, size, gather);
3557 }
3558 
3559 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3560 				 struct iommu_iotlb_gather *gather)
3561 {
3562 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3563 			      gather->end, list_empty(&gather->freelist));
3564 	iommu_put_pages_list(&gather->freelist);
3565 }
3566 
3567 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3568 					    dma_addr_t iova)
3569 {
3570 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3571 	struct dma_pte *pte;
3572 	int level = 0;
3573 	u64 phys = 0;
3574 
3575 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3576 			     GFP_ATOMIC);
3577 	if (pte && dma_pte_present(pte))
3578 		phys = dma_pte_addr(pte) +
3579 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3580 						VTD_PAGE_SHIFT) - 1));
3581 
3582 	return phys;
3583 }
3584 
3585 static bool domain_support_force_snooping(struct dmar_domain *domain)
3586 {
3587 	struct device_domain_info *info;
3588 	bool support = true;
3589 
3590 	assert_spin_locked(&domain->lock);
3591 	list_for_each_entry(info, &domain->devices, link) {
3592 		if (!ecap_sc_support(info->iommu->ecap)) {
3593 			support = false;
3594 			break;
3595 		}
3596 	}
3597 
3598 	return support;
3599 }
3600 
3601 static void domain_set_force_snooping(struct dmar_domain *domain)
3602 {
3603 	struct device_domain_info *info;
3604 
3605 	assert_spin_locked(&domain->lock);
3606 	/*
3607 	 * Second level page table supports per-PTE snoop control. The
3608 	 * iommu_map() interface will handle this by setting SNP bit.
3609 	 */
3610 	if (!domain->use_first_level) {
3611 		domain->set_pte_snp = true;
3612 		return;
3613 	}
3614 
3615 	list_for_each_entry(info, &domain->devices, link)
3616 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3617 						     IOMMU_NO_PASID);
3618 }
3619 
3620 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3621 {
3622 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3623 	unsigned long flags;
3624 
3625 	if (dmar_domain->force_snooping)
3626 		return true;
3627 
3628 	spin_lock_irqsave(&dmar_domain->lock, flags);
3629 	if (!domain_support_force_snooping(dmar_domain) ||
3630 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3631 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3632 		return false;
3633 	}
3634 
3635 	domain_set_force_snooping(dmar_domain);
3636 	dmar_domain->force_snooping = true;
3637 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3638 
3639 	return true;
3640 }
3641 
3642 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3643 {
3644 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3645 
3646 	switch (cap) {
3647 	case IOMMU_CAP_CACHE_COHERENCY:
3648 	case IOMMU_CAP_DEFERRED_FLUSH:
3649 		return true;
3650 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3651 		return dmar_platform_optin();
3652 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3653 		return ecap_sc_support(info->iommu->ecap);
3654 	case IOMMU_CAP_DIRTY_TRACKING:
3655 		return ssads_supported(info->iommu);
3656 	default:
3657 		return false;
3658 	}
3659 }
3660 
3661 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3662 {
3663 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3664 	struct device_domain_info *info;
3665 	struct intel_iommu *iommu;
3666 	u8 bus, devfn;
3667 	int ret;
3668 
3669 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3670 	if (!iommu || !iommu->iommu.ops)
3671 		return ERR_PTR(-ENODEV);
3672 
3673 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3674 	if (!info)
3675 		return ERR_PTR(-ENOMEM);
3676 
3677 	if (dev_is_real_dma_subdevice(dev)) {
3678 		info->bus = pdev->bus->number;
3679 		info->devfn = pdev->devfn;
3680 		info->segment = pci_domain_nr(pdev->bus);
3681 	} else {
3682 		info->bus = bus;
3683 		info->devfn = devfn;
3684 		info->segment = iommu->segment;
3685 	}
3686 
3687 	info->dev = dev;
3688 	info->iommu = iommu;
3689 	if (dev_is_pci(dev)) {
3690 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3691 		    pci_ats_supported(pdev) &&
3692 		    dmar_ats_supported(pdev, iommu)) {
3693 			info->ats_supported = 1;
3694 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3695 
3696 			/*
3697 			 * For IOMMU that supports device IOTLB throttling
3698 			 * (DIT), we assign PFSID to the invalidation desc
3699 			 * of a VF such that IOMMU HW can gauge queue depth
3700 			 * at PF level. If DIT is not set, PFSID will be
3701 			 * treated as reserved, which should be set to 0.
3702 			 */
3703 			if (ecap_dit(iommu->ecap))
3704 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3705 			info->ats_qdep = pci_ats_queue_depth(pdev);
3706 		}
3707 		if (sm_supported(iommu)) {
3708 			if (pasid_supported(iommu)) {
3709 				int features = pci_pasid_features(pdev);
3710 
3711 				if (features >= 0)
3712 					info->pasid_supported = features | 1;
3713 			}
3714 
3715 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3716 			    pci_pri_supported(pdev))
3717 				info->pri_supported = 1;
3718 		}
3719 	}
3720 
3721 	dev_iommu_priv_set(dev, info);
3722 	if (pdev && pci_ats_supported(pdev)) {
3723 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3724 		ret = device_rbtree_insert(iommu, info);
3725 		if (ret)
3726 			goto free;
3727 	}
3728 
3729 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3730 		ret = intel_pasid_alloc_table(dev);
3731 		if (ret) {
3732 			dev_err(dev, "PASID table allocation failed\n");
3733 			goto clear_rbtree;
3734 		}
3735 
3736 		if (!context_copied(iommu, info->bus, info->devfn)) {
3737 			ret = intel_pasid_setup_sm_context(dev);
3738 			if (ret)
3739 				goto free_table;
3740 		}
3741 	}
3742 
3743 	intel_iommu_debugfs_create_dev(info);
3744 
3745 	/*
3746 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3747 	 * device is undefined if you enable PASID support after ATS support.
3748 	 * So always enable PASID support on devices which have it, even if
3749 	 * we can't yet know if we're ever going to use it.
3750 	 */
3751 	if (info->pasid_supported &&
3752 	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3753 		info->pasid_enabled = 1;
3754 
3755 	return &iommu->iommu;
3756 free_table:
3757 	intel_pasid_free_table(dev);
3758 clear_rbtree:
3759 	device_rbtree_remove(info);
3760 free:
3761 	kfree(info);
3762 
3763 	return ERR_PTR(ret);
3764 }
3765 
3766 static void intel_iommu_release_device(struct device *dev)
3767 {
3768 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3769 	struct intel_iommu *iommu = info->iommu;
3770 
3771 	if (info->pasid_enabled) {
3772 		pci_disable_pasid(to_pci_dev(dev));
3773 		info->pasid_enabled = 0;
3774 	}
3775 
3776 	mutex_lock(&iommu->iopf_lock);
3777 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3778 		device_rbtree_remove(info);
3779 	mutex_unlock(&iommu->iopf_lock);
3780 
3781 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3782 	    !context_copied(iommu, info->bus, info->devfn))
3783 		intel_pasid_teardown_sm_context(dev);
3784 
3785 	intel_pasid_free_table(dev);
3786 	intel_iommu_debugfs_remove_dev(info);
3787 	kfree(info);
3788 	set_dma_ops(dev, NULL);
3789 }
3790 
3791 static void intel_iommu_get_resv_regions(struct device *device,
3792 					 struct list_head *head)
3793 {
3794 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3795 	struct iommu_resv_region *reg;
3796 	struct dmar_rmrr_unit *rmrr;
3797 	struct device *i_dev;
3798 	int i;
3799 
3800 	rcu_read_lock();
3801 	for_each_rmrr_units(rmrr) {
3802 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3803 					  i, i_dev) {
3804 			struct iommu_resv_region *resv;
3805 			enum iommu_resv_type type;
3806 			size_t length;
3807 
3808 			if (i_dev != device &&
3809 			    !is_downstream_to_pci_bridge(device, i_dev))
3810 				continue;
3811 
3812 			length = rmrr->end_address - rmrr->base_address + 1;
3813 
3814 			type = device_rmrr_is_relaxable(device) ?
3815 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3816 
3817 			resv = iommu_alloc_resv_region(rmrr->base_address,
3818 						       length, prot, type,
3819 						       GFP_ATOMIC);
3820 			if (!resv)
3821 				break;
3822 
3823 			list_add_tail(&resv->list, head);
3824 		}
3825 	}
3826 	rcu_read_unlock();
3827 
3828 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3829 	if (dev_is_pci(device)) {
3830 		struct pci_dev *pdev = to_pci_dev(device);
3831 
3832 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3833 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3834 					IOMMU_RESV_DIRECT_RELAXABLE,
3835 					GFP_KERNEL);
3836 			if (reg)
3837 				list_add_tail(&reg->list, head);
3838 		}
3839 	}
3840 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3841 
3842 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3843 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3844 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3845 	if (!reg)
3846 		return;
3847 	list_add_tail(&reg->list, head);
3848 }
3849 
/*
 * Pick the IOMMU group for @dev: PCI devices use pci_device_group(), all
 * other devices use generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
3856 
3857 static int intel_iommu_enable_sva(struct device *dev)
3858 {
3859 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3860 	struct intel_iommu *iommu;
3861 
3862 	if (!info || dmar_disabled)
3863 		return -EINVAL;
3864 
3865 	iommu = info->iommu;
3866 	if (!iommu)
3867 		return -EINVAL;
3868 
3869 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
3870 		return -ENODEV;
3871 
3872 	if (!info->pasid_enabled || !info->ats_enabled)
3873 		return -EINVAL;
3874 
3875 	/*
3876 	 * Devices having device-specific I/O fault handling should not
3877 	 * support PCI/PRI. The IOMMU side has no means to check the
3878 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
3879 	 * default that if the device driver enables SVA on a non-PRI
3880 	 * device, it will handle IOPF in its own way.
3881 	 */
3882 	if (!info->pri_supported)
3883 		return 0;
3884 
3885 	/* Devices supporting PRI should have it enabled. */
3886 	if (!info->pri_enabled)
3887 		return -EINVAL;
3888 
3889 	return 0;
3890 }
3891 
3892 static int context_flip_pri(struct device_domain_info *info, bool enable)
3893 {
3894 	struct intel_iommu *iommu = info->iommu;
3895 	u8 bus = info->bus, devfn = info->devfn;
3896 	struct context_entry *context;
3897 	u16 did;
3898 
3899 	spin_lock(&iommu->lock);
3900 	if (context_copied(iommu, bus, devfn)) {
3901 		spin_unlock(&iommu->lock);
3902 		return -EINVAL;
3903 	}
3904 
3905 	context = iommu_context_addr(iommu, bus, devfn, false);
3906 	if (!context || !context_present(context)) {
3907 		spin_unlock(&iommu->lock);
3908 		return -ENODEV;
3909 	}
3910 	did = context_domain_id(context);
3911 
3912 	if (enable)
3913 		context_set_sm_pre(context);
3914 	else
3915 		context_clear_sm_pre(context);
3916 
3917 	if (!ecap_coherent(iommu->ecap))
3918 		clflush_cache_range(context, sizeof(*context));
3919 	intel_context_flush_present(info, context, did, true);
3920 	spin_unlock(&iommu->lock);
3921 
3922 	return 0;
3923 }
3924 
3925 static int intel_iommu_enable_iopf(struct device *dev)
3926 {
3927 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3928 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3929 	struct intel_iommu *iommu;
3930 	int ret;
3931 
3932 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3933 		return -ENODEV;
3934 
3935 	if (info->pri_enabled)
3936 		return -EBUSY;
3937 
3938 	iommu = info->iommu;
3939 	if (!iommu)
3940 		return -EINVAL;
3941 
3942 	/* PASID is required in PRG Response Message. */
3943 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
3944 		return -EINVAL;
3945 
3946 	ret = pci_reset_pri(pdev);
3947 	if (ret)
3948 		return ret;
3949 
3950 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3951 	if (ret)
3952 		return ret;
3953 
3954 	ret = context_flip_pri(info, true);
3955 	if (ret)
3956 		goto err_remove_device;
3957 
3958 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
3959 	if (ret)
3960 		goto err_clear_pri;
3961 
3962 	info->pri_enabled = 1;
3963 
3964 	return 0;
3965 err_clear_pri:
3966 	context_flip_pri(info, false);
3967 err_remove_device:
3968 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3969 
3970 	return ret;
3971 }
3972 
3973 static int intel_iommu_disable_iopf(struct device *dev)
3974 {
3975 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3976 	struct intel_iommu *iommu = info->iommu;
3977 
3978 	if (!info->pri_enabled)
3979 		return -EINVAL;
3980 
3981 	/* Disable new PRI reception: */
3982 	context_flip_pri(info, false);
3983 
3984 	/*
3985 	 * Remove device from fault queue and acknowledge all outstanding
3986 	 * PRQs to the device:
3987 	 */
3988 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3989 
3990 	/*
3991 	 * PCIe spec states that by clearing PRI enable bit, the Page
3992 	 * Request Interface will not issue new page requests, but has
3993 	 * outstanding page requests that have been transmitted or are
3994 	 * queued for transmission. This is supposed to be called after
3995 	 * the device driver has stopped DMA, all PASIDs have been
3996 	 * unbound and the outstanding PRQs have been drained.
3997 	 */
3998 	pci_disable_pri(to_pci_dev(dev));
3999 	info->pri_enabled = 0;
4000 
4001 	return 0;
4002 }
4003 
4004 static int
4005 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4006 {
4007 	switch (feat) {
4008 	case IOMMU_DEV_FEAT_IOPF:
4009 		return intel_iommu_enable_iopf(dev);
4010 
4011 	case IOMMU_DEV_FEAT_SVA:
4012 		return intel_iommu_enable_sva(dev);
4013 
4014 	default:
4015 		return -ENODEV;
4016 	}
4017 }
4018 
4019 static int
4020 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4021 {
4022 	switch (feat) {
4023 	case IOMMU_DEV_FEAT_IOPF:
4024 		return intel_iommu_disable_iopf(dev);
4025 
4026 	case IOMMU_DEV_FEAT_SVA:
4027 		return 0;
4028 
4029 	default:
4030 		return -ENODEV;
4031 	}
4032 }
4033 
4034 static bool intel_iommu_is_attach_deferred(struct device *dev)
4035 {
4036 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4037 
4038 	return translation_pre_enabled(info->iommu) && !info->domain;
4039 }
4040 
4041 /*
4042  * Check that the device does not live on an external facing PCI port that is
4043  * marked as untrusted. Such devices should not be able to apply quirks and
4044  * thus not be able to bypass the IOMMU restrictions.
4045  */
4046 static bool risky_device(struct pci_dev *pdev)
4047 {
4048 	if (pdev->untrusted) {
4049 		pci_info(pdev,
4050 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4051 			 pdev->vendor, pdev->device);
4052 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4053 		return true;
4054 	}
4055 	return false;
4056 }
4057 
4058 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4059 				      unsigned long iova, size_t size)
4060 {
4061 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4062 
4063 	return 0;
4064 }
4065 
4066 void domain_remove_dev_pasid(struct iommu_domain *domain,
4067 			     struct device *dev, ioasid_t pasid)
4068 {
4069 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4070 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4071 	struct intel_iommu *iommu = info->iommu;
4072 	struct dmar_domain *dmar_domain;
4073 	unsigned long flags;
4074 
4075 	if (!domain)
4076 		return;
4077 
4078 	/* Identity domain has no meta data for pasid. */
4079 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
4080 		return;
4081 
4082 	dmar_domain = to_dmar_domain(domain);
4083 	spin_lock_irqsave(&dmar_domain->lock, flags);
4084 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4085 		if (curr->dev == dev && curr->pasid == pasid) {
4086 			list_del(&curr->link_domain);
4087 			dev_pasid = curr;
4088 			break;
4089 		}
4090 	}
4091 	WARN_ON_ONCE(!dev_pasid);
4092 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4093 
4094 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4095 	domain_detach_iommu(dmar_domain, iommu);
4096 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4097 	kfree(dev_pasid);
4098 }
4099 
4100 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4101 					 struct iommu_domain *domain)
4102 {
4103 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4104 
4105 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4106 	domain_remove_dev_pasid(domain, dev, pasid);
4107 }
4108 
4109 struct dev_pasid_info *
4110 domain_add_dev_pasid(struct iommu_domain *domain,
4111 		     struct device *dev, ioasid_t pasid)
4112 {
4113 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4114 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4115 	struct intel_iommu *iommu = info->iommu;
4116 	struct dev_pasid_info *dev_pasid;
4117 	unsigned long flags;
4118 	int ret;
4119 
4120 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4121 	if (!dev_pasid)
4122 		return ERR_PTR(-ENOMEM);
4123 
4124 	ret = domain_attach_iommu(dmar_domain, iommu);
4125 	if (ret)
4126 		goto out_free;
4127 
4128 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4129 	if (ret)
4130 		goto out_detach_iommu;
4131 
4132 	dev_pasid->dev = dev;
4133 	dev_pasid->pasid = pasid;
4134 	spin_lock_irqsave(&dmar_domain->lock, flags);
4135 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4136 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4137 
4138 	return dev_pasid;
4139 out_detach_iommu:
4140 	domain_detach_iommu(dmar_domain, iommu);
4141 out_free:
4142 	kfree(dev_pasid);
4143 	return ERR_PTR(ret);
4144 }
4145 
4146 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4147 				     struct device *dev, ioasid_t pasid,
4148 				     struct iommu_domain *old)
4149 {
4150 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4151 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4152 	struct intel_iommu *iommu = info->iommu;
4153 	struct dev_pasid_info *dev_pasid;
4154 	int ret;
4155 
4156 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4157 		return -EINVAL;
4158 
4159 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4160 		return -EOPNOTSUPP;
4161 
4162 	if (domain->dirty_ops)
4163 		return -EINVAL;
4164 
4165 	if (context_copied(iommu, info->bus, info->devfn))
4166 		return -EBUSY;
4167 
4168 	ret = paging_domain_compatible(domain, dev);
4169 	if (ret)
4170 		return ret;
4171 
4172 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4173 	if (IS_ERR(dev_pasid))
4174 		return PTR_ERR(dev_pasid);
4175 
4176 	if (dmar_domain->use_first_level)
4177 		ret = domain_setup_first_level(iommu, dmar_domain,
4178 					       dev, pasid, old);
4179 	else
4180 		ret = domain_setup_second_level(iommu, dmar_domain,
4181 						dev, pasid, old);
4182 	if (ret)
4183 		goto out_remove_dev_pasid;
4184 
4185 	domain_remove_dev_pasid(old, dev, pasid);
4186 
4187 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4188 
4189 	return 0;
4190 
4191 out_remove_dev_pasid:
4192 	domain_remove_dev_pasid(domain, dev, pasid);
4193 	return ret;
4194 }
4195 
4196 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4197 {
4198 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4199 	struct intel_iommu *iommu = info->iommu;
4200 	struct iommu_hw_info_vtd *vtd;
4201 
4202 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4203 	if (!vtd)
4204 		return ERR_PTR(-ENOMEM);
4205 
4206 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4207 	vtd->cap_reg = iommu->cap;
4208 	vtd->ecap_reg = iommu->ecap;
4209 	*length = sizeof(*vtd);
4210 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4211 	return vtd;
4212 }
4213 
4214 /*
4215  * Set dirty tracking for the device list of a domain. The caller must
4216  * hold the domain->lock when calling it.
4217  */
4218 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4219 {
4220 	struct device_domain_info *info;
4221 	int ret = 0;
4222 
4223 	list_for_each_entry(info, devices, link) {
4224 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4225 						       IOMMU_NO_PASID, enable);
4226 		if (ret)
4227 			break;
4228 	}
4229 
4230 	return ret;
4231 }
4232 
4233 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4234 					    bool enable)
4235 {
4236 	struct dmar_domain *s1_domain;
4237 	unsigned long flags;
4238 	int ret;
4239 
4240 	spin_lock(&domain->s1_lock);
4241 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4242 		spin_lock_irqsave(&s1_domain->lock, flags);
4243 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4244 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4245 		if (ret)
4246 			goto err_unwind;
4247 	}
4248 	spin_unlock(&domain->s1_lock);
4249 	return 0;
4250 
4251 err_unwind:
4252 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4253 		spin_lock_irqsave(&s1_domain->lock, flags);
4254 		device_set_dirty_tracking(&s1_domain->devices,
4255 					  domain->dirty_tracking);
4256 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4257 	}
4258 	spin_unlock(&domain->s1_lock);
4259 	return ret;
4260 }
4261 
4262 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4263 					  bool enable)
4264 {
4265 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4266 	int ret;
4267 
4268 	spin_lock(&dmar_domain->lock);
4269 	if (dmar_domain->dirty_tracking == enable)
4270 		goto out_unlock;
4271 
4272 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4273 	if (ret)
4274 		goto err_unwind;
4275 
4276 	if (dmar_domain->nested_parent) {
4277 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4278 		if (ret)
4279 			goto err_unwind;
4280 	}
4281 
4282 	dmar_domain->dirty_tracking = enable;
4283 out_unlock:
4284 	spin_unlock(&dmar_domain->lock);
4285 
4286 	return 0;
4287 
4288 err_unwind:
4289 	device_set_dirty_tracking(&dmar_domain->devices,
4290 				  dmar_domain->dirty_tracking);
4291 	spin_unlock(&dmar_domain->lock);
4292 	return ret;
4293 }
4294 
4295 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4296 					    unsigned long iova, size_t size,
4297 					    unsigned long flags,
4298 					    struct iommu_dirty_bitmap *dirty)
4299 {
4300 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4301 	unsigned long end = iova + size - 1;
4302 	unsigned long pgsize;
4303 
4304 	/*
4305 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4306 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4307 	 * have occurred when we stopped dirty tracking. This ensures that we
4308 	 * never inherit dirtied bits from a previous cycle.
4309 	 */
4310 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4311 		return -EINVAL;
4312 
4313 	do {
4314 		struct dma_pte *pte;
4315 		int lvl = 0;
4316 
4317 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4318 				     GFP_ATOMIC);
4319 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4320 		if (!pte || !dma_pte_present(pte)) {
4321 			iova += pgsize;
4322 			continue;
4323 		}
4324 
4325 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4326 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4327 		iova += pgsize;
4328 	} while (iova < end);
4329 
4330 	return 0;
4331 }
4332 
4333 static const struct iommu_dirty_ops intel_dirty_ops = {
4334 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4335 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4336 };
4337 
4338 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4339 {
4340 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4341 	struct intel_iommu *iommu = info->iommu;
4342 	struct context_entry *context;
4343 
4344 	spin_lock(&iommu->lock);
4345 	context = iommu_context_addr(iommu, bus, devfn, 1);
4346 	if (!context) {
4347 		spin_unlock(&iommu->lock);
4348 		return -ENOMEM;
4349 	}
4350 
4351 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4352 		spin_unlock(&iommu->lock);
4353 		return 0;
4354 	}
4355 
4356 	copied_context_tear_down(iommu, context, bus, devfn);
4357 	context_clear_entry(context);
4358 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4359 
4360 	/*
4361 	 * In pass through mode, AW must be programmed to indicate the largest
4362 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4363 	 */
4364 	context_set_address_width(context, iommu->msagaw);
4365 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4366 	context_set_fault_enable(context);
4367 	context_set_present(context);
4368 	if (!ecap_coherent(iommu->ecap))
4369 		clflush_cache_range(context, sizeof(*context));
4370 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4371 	spin_unlock(&iommu->lock);
4372 
4373 	return 0;
4374 }
4375 
4376 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4377 {
4378 	struct device *dev = data;
4379 
4380 	if (dev != &pdev->dev)
4381 		return 0;
4382 
4383 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4384 }
4385 
4386 static int device_setup_pass_through(struct device *dev)
4387 {
4388 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4389 
4390 	if (!dev_is_pci(dev))
4391 		return context_setup_pass_through(dev, info->bus, info->devfn);
4392 
4393 	return pci_for_each_dma_alias(to_pci_dev(dev),
4394 				      context_setup_pass_through_cb, dev);
4395 }
4396 
4397 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4398 {
4399 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4400 	struct intel_iommu *iommu = info->iommu;
4401 	int ret;
4402 
4403 	device_block_translation(dev);
4404 
4405 	if (dev_is_real_dma_subdevice(dev))
4406 		return 0;
4407 
4408 	if (sm_supported(iommu)) {
4409 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4410 		if (!ret)
4411 			iommu_enable_pci_caps(info);
4412 	} else {
4413 		ret = device_setup_pass_through(dev);
4414 	}
4415 
4416 	return ret;
4417 }
4418 
4419 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4420 					 struct device *dev, ioasid_t pasid,
4421 					 struct iommu_domain *old)
4422 {
4423 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4424 	struct intel_iommu *iommu = info->iommu;
4425 	int ret;
4426 
4427 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4428 		return -EOPNOTSUPP;
4429 
4430 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4431 	if (ret)
4432 		return ret;
4433 
4434 	domain_remove_dev_pasid(old, dev, pasid);
4435 	return 0;
4436 }
4437 
4438 static struct iommu_domain identity_domain = {
4439 	.type = IOMMU_DOMAIN_IDENTITY,
4440 	.ops = &(const struct iommu_domain_ops) {
4441 		.attach_dev	= identity_domain_attach_dev,
4442 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4443 	},
4444 };
4445 
4446 static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev)
4447 {
4448 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4449 	struct intel_iommu *iommu = info->iommu;
4450 	struct dmar_domain *dmar_domain;
4451 	bool first_stage;
4452 
4453 	first_stage = first_level_by_default(iommu);
4454 	dmar_domain = paging_domain_alloc(dev, first_stage);
4455 	if (IS_ERR(dmar_domain))
4456 		return ERR_CAST(dmar_domain);
4457 
4458 	return &dmar_domain->domain;
4459 }
4460 
4461 const struct iommu_ops intel_iommu_ops = {
4462 	.blocked_domain		= &blocking_domain,
4463 	.release_domain		= &blocking_domain,
4464 	.identity_domain	= &identity_domain,
4465 	.capable		= intel_iommu_capable,
4466 	.hw_info		= intel_iommu_hw_info,
4467 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4468 	.domain_alloc_sva	= intel_svm_domain_alloc,
4469 	.domain_alloc_paging	= intel_iommu_domain_alloc_paging,
4470 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4471 	.probe_device		= intel_iommu_probe_device,
4472 	.release_device		= intel_iommu_release_device,
4473 	.get_resv_regions	= intel_iommu_get_resv_regions,
4474 	.device_group		= intel_iommu_device_group,
4475 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4476 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4477 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4478 	.def_domain_type	= device_def_domain_type,
4479 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4480 	.pgsize_bitmap		= SZ_4K,
4481 	.page_response		= intel_iommu_page_response,
4482 	.default_domain_ops = &(const struct iommu_domain_ops) {
4483 		.attach_dev		= intel_iommu_attach_device,
4484 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4485 		.map_pages		= intel_iommu_map_pages,
4486 		.unmap_pages		= intel_iommu_unmap_pages,
4487 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4488 		.flush_iotlb_all        = intel_flush_iotlb_all,
4489 		.iotlb_sync		= intel_iommu_tlb_sync,
4490 		.iova_to_phys		= intel_iommu_iova_to_phys,
4491 		.free			= intel_iommu_domain_free,
4492 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4493 	}
4494 };
4495 
4496 static void quirk_iommu_igfx(struct pci_dev *dev)
4497 {
4498 	if (risky_device(dev))
4499 		return;
4500 
4501 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4502 	disable_igfx_iommu = 1;
4503 }
4504 
4505 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4506 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4507 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4508 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4509 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4510 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4513 
4514 /* Broadwell igfx malfunctions with dmar */
4515 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4516 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4518 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4519 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4520 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4524 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4526 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4527 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4528 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4529 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4530 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4531 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4532 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4533 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4534 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4536 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4537 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4538 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4539 
4540 static void quirk_iommu_rwbf(struct pci_dev *dev)
4541 {
4542 	if (risky_device(dev))
4543 		return;
4544 
4545 	/*
4546 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4547 	 * but needs it. Same seems to hold for the desktop versions.
4548 	 */
4549 	pci_info(dev, "Forcing write-buffer flush capability\n");
4550 	rwbf_quirk = 1;
4551 }
4552 
4553 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4554 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4555 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4556 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4557 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4558 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4559 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4560 
4561 #define GGC 0x52
4562 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4563 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4564 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4565 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4566 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4567 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4568 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4569 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4570 
4571 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4572 {
4573 	unsigned short ggc;
4574 
4575 	if (risky_device(dev))
4576 		return;
4577 
4578 	if (pci_read_config_word(dev, GGC, &ggc))
4579 		return;
4580 
4581 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4582 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4583 		disable_igfx_iommu = 1;
4584 	} else if (!disable_igfx_iommu) {
4585 		/* we have to ensure the gfx device is idle before we flush */
4586 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4587 		iommu_set_dma_strict();
4588 	}
4589 }
4590 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4591 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4592 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4593 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4594 
4595 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4596 {
4597 	unsigned short ver;
4598 
4599 	if (!IS_GFX_DEVICE(dev))
4600 		return;
4601 
4602 	ver = (dev->device >> 8) & 0xff;
4603 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4604 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4605 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4606 		return;
4607 
4608 	if (risky_device(dev))
4609 		return;
4610 
4611 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4612 	iommu_skip_te_disable = 1;
4613 }
4614 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4615 
4616 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4617    ISOCH DMAR unit for the Azalia sound device, but not give it any
4618    TLB entries, which causes it to deadlock. Check for that.  We do
4619    this in a function called from init_dmars(), instead of in a PCI
4620    quirk, because we don't want to print the obnoxious "BIOS broken"
4621    message if VT-d is actually disabled.
4622 */
4623 static void __init check_tylersburg_isoch(void)
4624 {
4625 	struct pci_dev *pdev;
4626 	uint32_t vtisochctrl;
4627 
4628 	/* If there's no Azalia in the system anyway, forget it. */
4629 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4630 	if (!pdev)
4631 		return;
4632 
4633 	if (risky_device(pdev)) {
4634 		pci_dev_put(pdev);
4635 		return;
4636 	}
4637 
4638 	pci_dev_put(pdev);
4639 
4640 	/* System Management Registers. Might be hidden, in which case
4641 	   we can't do the sanity check. But that's OK, because the
4642 	   known-broken BIOSes _don't_ actually hide it, so far. */
4643 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4644 	if (!pdev)
4645 		return;
4646 
4647 	if (risky_device(pdev)) {
4648 		pci_dev_put(pdev);
4649 		return;
4650 	}
4651 
4652 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4653 		pci_dev_put(pdev);
4654 		return;
4655 	}
4656 
4657 	pci_dev_put(pdev);
4658 
4659 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4660 	if (vtisochctrl & 1)
4661 		return;
4662 
4663 	/* Drop all bits other than the number of TLB entries */
4664 	vtisochctrl &= 0x1c;
4665 
4666 	/* If we have the recommended number of TLB entries (16), fine. */
4667 	if (vtisochctrl == 0x10)
4668 		return;
4669 
4670 	/* Zero TLB entries? You get to ride the short bus to school. */
4671 	if (!vtisochctrl) {
4672 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4673 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4674 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4675 		     dmi_get_system_info(DMI_BIOS_VERSION),
4676 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4677 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4678 		return;
4679 	}
4680 
4681 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4682 	       vtisochctrl);
4683 }
4684 
4685 /*
4686  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4687  * invalidation completion before posted writes initiated with translated address
4688  * that utilized translations matching the invalidation address range, violating
4689  * the invalidation completion ordering.
4690  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4691  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4692  * under the control of the trusted/privileged host device driver must use this
4693  * quirk.
4694  * Device TLBs are invalidated under the following six conditions:
4695  * 1. Device driver does DMA API unmap IOVA
4696  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4697  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4698  *    exit_mmap() due to crash
4699  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4700  *    VM has to free pages that were unmapped
4701  * 5. Userspace driver unmaps a DMA buffer
4702  * 6. Cache invalidation in vSVA usage (upcoming)
4703  *
4704  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4705  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4706  * invalidate TLB the same way as normal user unmap which will use this quirk.
4707  * The dTLB invalidation after PASID cache flush does not need this quirk.
4708  *
4709  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4710  */
4711 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4712 			       unsigned long address, unsigned long mask,
4713 			       u32 pasid, u16 qdep)
4714 {
4715 	u16 sid;
4716 
4717 	if (likely(!info->dtlb_extra_inval))
4718 		return;
4719 
4720 	sid = PCI_DEVID(info->bus, info->devfn);
4721 	if (pasid == IOMMU_NO_PASID) {
4722 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4723 				   qdep, address, mask);
4724 	} else {
4725 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4726 					 pasid, qdep, address, mask);
4727 	}
4728 }
4729 
4730 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4731 
4732 /*
4733  * Function to submit a command to the enhanced command interface. The
4734  * valid enhanced command descriptions are defined in Table 47 of the
4735  * VT-d spec. The VT-d hardware implementation may support some but not
4736  * all commands, which can be determined by checking the Enhanced
4737  * Command Capability Register.
4738  *
4739  * Return values:
4740  *  - 0: Command successful without any error;
4741  *  - Negative: software error value;
4742  *  - Nonzero positive: failure status code defined in Table 48.
4743  */
4744 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4745 {
4746 	unsigned long flags;
4747 	u64 res;
4748 	int ret;
4749 
4750 	if (!cap_ecmds(iommu->cap))
4751 		return -ENODEV;
4752 
4753 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4754 
4755 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4756 	if (res & DMA_ECMD_ECRSP_IP) {
4757 		ret = -EBUSY;
4758 		goto err;
4759 	}
4760 
4761 	/*
4762 	 * Unconditionally write the operand B, because
4763 	 * - There is no side effect if an ecmd doesn't require an
4764 	 *   operand B, but we set the register to some value.
4765 	 * - It's not invoked in any critical path. The extra MMIO
4766 	 *   write doesn't bring any performance concerns.
4767 	 */
4768 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4769 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4770 
4771 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4772 		      !(res & DMA_ECMD_ECRSP_IP), res);
4773 
4774 	if (res & DMA_ECMD_ECRSP_IP) {
4775 		ret = -ETIMEDOUT;
4776 		goto err;
4777 	}
4778 
4779 	ret = ecmd_get_status_code(res);
4780 err:
4781 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4782 
4783 	return ret;
4784 }
4785