xref: /linux/drivers/iommu/intel/iommu.c (revision 62597edf6340191511bdf9a7f64fa315ddc58805)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/*
 * Message prefix for this file: every format string passed through
 * pr_fmt() or dev_fmt() is prepended with "DMAR: ".
 */
#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
/* The root table and each context table are sized as one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
/*
 * PCI class / ID predicates. Each takes a pointer expression to a
 * struct pci_dev. The argument is parenthesized in every expansion so
 * that compound expressions (e.g. "p + 1") bind correctly to "->".
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
/* Bounds of the IOAPIC address range (inclusive). */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
/* NOTE(review): presumably the lowest IOVA handed out - confirm at users. */
#define IOVA_START_ADDR		(0x1000)

/* Address width requested by iommu_calculate_agaw() as the default. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
/* Highest page frame number / byte address representable in @gaw bits. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * DOMAIN_MAX_PFN is clamped so that it fits in an unsigned long, and
 * DOMAIN_MAX_ADDR is derived to match. That way 'unsigned long' can be
 * used for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57 
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60 
61 /*
62  * set to 1 to panic kernel if can't successfully enable VT-d
63  * (used when kernel is launched w/ TXT)
64  */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68 
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if its possibly the case.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
170 struct dmar_rmrr_unit {
171 	struct list_head list;		/* list of rmrr units	*/
172 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
173 	u64	base_address;		/* reserved base address*/
174 	u64	end_address;		/* reserved end address */
175 	struct dmar_dev_scope *devices;	/* target devices */
176 	int	devices_cnt;		/* target device count */
177 };
178 
179 struct dmar_atsr_unit {
180 	struct list_head list;		/* list of ATSR units */
181 	struct acpi_dmar_header *hdr;	/* ACPI header */
182 	struct dmar_dev_scope *devices;	/* target devices */
183 	int devices_cnt;		/* target device count */
184 	u8 include_all:1;		/* include all ports */
185 };
186 
187 struct dmar_satc_unit {
188 	struct list_head list;		/* list of SATC units */
189 	struct acpi_dmar_header *hdr;	/* ACPI header */
190 	struct dmar_dev_scope *devices;	/* target devices */
191 	struct intel_iommu *iommu;	/* the corresponding iommu */
192 	int devices_cnt;		/* target device count */
193 	u8 atc_required:1;		/* ATS is required */
194 };
195 
/* File-local lists of the ATSR, RMRR and SATC units defined above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every entry of dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202 
203 static void intel_iommu_domain_free(struct iommu_domain *domain);
204 
205 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
207 
208 int intel_iommu_enabled = 0;
209 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
210 
211 static int intel_iommu_superpage = 1;
212 static int iommu_identity_mapping;
213 static int iommu_skip_te_disable;
214 static int disable_igfx_iommu;
215 
216 #define IDENTMAP_AZALIA		4
217 
218 const struct iommu_ops intel_iommu_ops;
219 static const struct iommu_dirty_ops intel_dirty_ops;
220 
221 static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225 
226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230 
231 static void init_translation_status(struct intel_iommu *iommu)
232 {
233 	u32 gsts;
234 
235 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 	if (gsts & DMA_GSTS_TES)
237 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
240 static int __init intel_iommu_setup(char *str)
241 {
242 	if (!str)
243 		return -EINVAL;
244 
245 	while (*str) {
246 		if (!strncmp(str, "on", 2)) {
247 			dmar_disabled = 0;
248 			pr_info("IOMMU enabled\n");
249 		} else if (!strncmp(str, "off", 3)) {
250 			dmar_disabled = 1;
251 			no_platform_optin = 1;
252 			pr_info("IOMMU disabled\n");
253 		} else if (!strncmp(str, "igfx_off", 8)) {
254 			disable_igfx_iommu = 1;
255 			pr_info("Disable GFX device mapping\n");
256 		} else if (!strncmp(str, "forcedac", 8)) {
257 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 			iommu_dma_forcedac = true;
259 		} else if (!strncmp(str, "strict", 6)) {
260 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 			iommu_set_dma_strict();
262 		} else if (!strncmp(str, "sp_off", 6)) {
263 			pr_info("Disable supported super page\n");
264 			intel_iommu_superpage = 0;
265 		} else if (!strncmp(str, "sm_on", 5)) {
266 			pr_info("Enable scalable mode if hardware supports\n");
267 			intel_iommu_sm = 1;
268 		} else if (!strncmp(str, "sm_off", 6)) {
269 			pr_info("Scalable mode is disallowed\n");
270 			intel_iommu_sm = 0;
271 		} else if (!strncmp(str, "tboot_noforce", 13)) {
272 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 			intel_iommu_tboot_noforce = 1;
274 		} else {
275 			pr_notice("Unknown option - '%s'\n", str);
276 		}
277 
278 		str += strcspn(str, ",");
279 		while (*str == ',')
280 			str++;
281 	}
282 
283 	return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
286 
287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290 
291 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293 
294 /*
295  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297  * the returned SAGAW.
298  */
299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301 	unsigned long fl_sagaw, sl_sagaw;
302 
303 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 	sl_sagaw = cap_sagaw(iommu->cap);
305 
306 	/* Second level only. */
307 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 		return sl_sagaw;
309 
310 	/* First level only. */
311 	if (!ecap_slts(iommu->ecap))
312 		return fl_sagaw;
313 
314 	return fl_sagaw & sl_sagaw;
315 }
316 
/*
 * Starting at the agaw matching @max_gaw, walk downwards and return the
 * first agaw whose bit is set in the IOMMU's SAGAW mask. Returns a
 * negative value if no such agaw exists.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
330 
331 /*
332  * Calculate max SAGAW for each iommu.
333  */
334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338 
339 /*
340  * calculate agaw for each iommu.
341  * "SAGAW" may be different across iommus, use a default agaw, and
342  * get a supported less agaw for iommus that don't support the default agaw.
343  */
344 int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
348 
349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351 	return sm_supported(iommu) ?
352 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354 
355 static void domain_update_iommu_coherency(struct dmar_domain *domain)
356 {
357 	struct iommu_domain_info *info;
358 	struct dmar_drhd_unit *drhd;
359 	struct intel_iommu *iommu;
360 	bool found = false;
361 	unsigned long i;
362 
363 	domain->iommu_coherency = true;
364 	xa_for_each(&domain->iommu_array, i, info) {
365 		found = true;
366 		if (!iommu_paging_structure_coherency(info->iommu)) {
367 			domain->iommu_coherency = false;
368 			break;
369 		}
370 	}
371 	if (found)
372 		return;
373 
374 	/* No hardware attached; use lowest common denominator */
375 	rcu_read_lock();
376 	for_each_active_iommu(iommu, drhd) {
377 		if (!iommu_paging_structure_coherency(iommu)) {
378 			domain->iommu_coherency = false;
379 			break;
380 		}
381 	}
382 	rcu_read_unlock();
383 }
384 
385 static int domain_update_iommu_superpage(struct dmar_domain *domain,
386 					 struct intel_iommu *skip)
387 {
388 	struct dmar_drhd_unit *drhd;
389 	struct intel_iommu *iommu;
390 	int mask = 0x3;
391 
392 	if (!intel_iommu_superpage)
393 		return 0;
394 
395 	/* set iommu_superpage to the smallest common denominator */
396 	rcu_read_lock();
397 	for_each_active_iommu(iommu, drhd) {
398 		if (iommu != skip) {
399 			if (domain && domain->use_first_level) {
400 				if (!cap_fl1gp_support(iommu->cap))
401 					mask = 0x1;
402 			} else {
403 				mask &= cap_super_page_val(iommu->cap);
404 			}
405 
406 			if (!mask)
407 				break;
408 		}
409 	}
410 	rcu_read_unlock();
411 
412 	return fls(mask);
413 }
414 
415 static int domain_update_device_node(struct dmar_domain *domain)
416 {
417 	struct device_domain_info *info;
418 	int nid = NUMA_NO_NODE;
419 	unsigned long flags;
420 
421 	spin_lock_irqsave(&domain->lock, flags);
422 	list_for_each_entry(info, &domain->devices, link) {
423 		/*
424 		 * There could possibly be multiple device numa nodes as devices
425 		 * within the same domain may sit behind different IOMMUs. There
426 		 * isn't perfect answer in such situation, so we select first
427 		 * come first served policy.
428 		 */
429 		nid = dev_to_node(info->dev);
430 		if (nid != NUMA_NO_NODE)
431 			break;
432 	}
433 	spin_unlock_irqrestore(&domain->lock, flags);
434 
435 	return nid;
436 }
437 
438 /* Return the super pagesize bitmap if supported. */
439 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
440 {
441 	unsigned long bitmap = 0;
442 
443 	/*
444 	 * 1-level super page supports page size of 2MiB, 2-level super page
445 	 * supports page size of both 2MiB and 1GiB.
446 	 */
447 	if (domain->iommu_superpage == 1)
448 		bitmap |= SZ_2M;
449 	else if (domain->iommu_superpage == 2)
450 		bitmap |= SZ_2M | SZ_1G;
451 
452 	return bitmap;
453 }
454 
455 /* Some capabilities may be different across iommus */
456 void domain_update_iommu_cap(struct dmar_domain *domain)
457 {
458 	domain_update_iommu_coherency(domain);
459 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
460 
461 	/*
462 	 * If RHSA is missing, we should default to the device numa domain
463 	 * as fall back.
464 	 */
465 	if (domain->nid == NUMA_NO_NODE)
466 		domain->nid = domain_update_device_node(domain);
467 
468 	/*
469 	 * First-level translation restricts the input-address to a
470 	 * canonical address (i.e., address bits 63:N have the same
471 	 * value as address bit [N-1], where N is 48-bits with 4-level
472 	 * paging and 57-bits with 5-level paging). Hence, skip bit
473 	 * [N-1].
474 	 */
475 	if (domain->use_first_level)
476 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
477 	else
478 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
479 
480 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
481 }
482 
483 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
484 					 u8 devfn, int alloc)
485 {
486 	struct root_entry *root = &iommu->root_entry[bus];
487 	struct context_entry *context;
488 	u64 *entry;
489 
490 	/*
491 	 * Except that the caller requested to allocate a new entry,
492 	 * returning a copied context entry makes no sense.
493 	 */
494 	if (!alloc && context_copied(iommu, bus, devfn))
495 		return NULL;
496 
497 	entry = &root->lo;
498 	if (sm_supported(iommu)) {
499 		if (devfn >= 0x80) {
500 			devfn -= 0x80;
501 			entry = &root->hi;
502 		}
503 		devfn *= 2;
504 	}
505 	if (*entry & 1)
506 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
507 	else {
508 		unsigned long phy_addr;
509 		if (!alloc)
510 			return NULL;
511 
512 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
513 		if (!context)
514 			return NULL;
515 
516 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
517 		phy_addr = virt_to_phys((void *)context);
518 		*entry = phy_addr | 1;
519 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
520 	}
521 	return &context[devfn];
522 }
523 
524 /**
525  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
526  *				 sub-hierarchy of a candidate PCI-PCI bridge
527  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
528  * @bridge: the candidate PCI-PCI bridge
529  *
530  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
531  */
532 static bool
533 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
534 {
535 	struct pci_dev *pdev, *pbridge;
536 
537 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
538 		return false;
539 
540 	pdev = to_pci_dev(dev);
541 	pbridge = to_pci_dev(bridge);
542 
543 	if (pbridge->subordinate &&
544 	    pbridge->subordinate->number <= pdev->bus->number &&
545 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
546 		return true;
547 
548 	return false;
549 }
550 
551 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
552 {
553 	struct dmar_drhd_unit *drhd;
554 	u32 vtbar;
555 	int rc;
556 
557 	/* We know that this device on this chipset has its own IOMMU.
558 	 * If we find it under a different IOMMU, then the BIOS is lying
559 	 * to us. Hope that the IOMMU for this device is actually
560 	 * disabled, and it needs no translation...
561 	 */
562 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
563 	if (rc) {
564 		/* "can't" happen */
565 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
566 		return false;
567 	}
568 	vtbar &= 0xffff0000;
569 
570 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
571 	drhd = dmar_find_matched_drhd_unit(pdev);
572 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
573 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
574 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
575 		return true;
576 	}
577 
578 	return false;
579 }
580 
581 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
582 {
583 	if (!iommu || iommu->drhd->ignored)
584 		return true;
585 
586 	if (dev_is_pci(dev)) {
587 		struct pci_dev *pdev = to_pci_dev(dev);
588 
589 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
590 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
591 		    quirk_ioat_snb_local_iommu(pdev))
592 			return true;
593 	}
594 
595 	return false;
596 }
597 
598 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
599 {
600 	struct dmar_drhd_unit *drhd = NULL;
601 	struct pci_dev *pdev = NULL;
602 	struct intel_iommu *iommu;
603 	struct device *tmp;
604 	u16 segment = 0;
605 	int i;
606 
607 	if (!dev)
608 		return NULL;
609 
610 	if (dev_is_pci(dev)) {
611 		struct pci_dev *pf_pdev;
612 
613 		pdev = pci_real_dma_dev(to_pci_dev(dev));
614 
615 		/* VFs aren't listed in scope tables; we need to look up
616 		 * the PF instead to find the IOMMU. */
617 		pf_pdev = pci_physfn(pdev);
618 		dev = &pf_pdev->dev;
619 		segment = pci_domain_nr(pdev->bus);
620 	} else if (has_acpi_companion(dev))
621 		dev = &ACPI_COMPANION(dev)->dev;
622 
623 	rcu_read_lock();
624 	for_each_iommu(iommu, drhd) {
625 		if (pdev && segment != drhd->segment)
626 			continue;
627 
628 		for_each_active_dev_scope(drhd->devices,
629 					  drhd->devices_cnt, i, tmp) {
630 			if (tmp == dev) {
631 				/* For a VF use its original BDF# not that of the PF
632 				 * which we used for the IOMMU lookup. Strictly speaking
633 				 * we could do this for all PCI devices; we only need to
634 				 * get the BDF# from the scope table for ACPI matches. */
635 				if (pdev && pdev->is_virtfn)
636 					goto got_pdev;
637 
638 				if (bus && devfn) {
639 					*bus = drhd->devices[i].bus;
640 					*devfn = drhd->devices[i].devfn;
641 				}
642 				goto out;
643 			}
644 
645 			if (is_downstream_to_pci_bridge(dev, tmp))
646 				goto got_pdev;
647 		}
648 
649 		if (pdev && drhd->include_all) {
650 got_pdev:
651 			if (bus && devfn) {
652 				*bus = pdev->bus->number;
653 				*devfn = pdev->devfn;
654 			}
655 			goto out;
656 		}
657 	}
658 	iommu = NULL;
659 out:
660 	if (iommu_is_dummy(iommu, dev))
661 		iommu = NULL;
662 
663 	rcu_read_unlock();
664 
665 	return iommu;
666 }
667 
668 static void domain_flush_cache(struct dmar_domain *domain,
669 			       void *addr, int size)
670 {
671 	if (!domain->iommu_coherency)
672 		clflush_cache_range(addr, size);
673 }
674 
675 static void free_context_table(struct intel_iommu *iommu)
676 {
677 	struct context_entry *context;
678 	int i;
679 
680 	if (!iommu->root_entry)
681 		return;
682 
683 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
684 		context = iommu_context_addr(iommu, i, 0, 0);
685 		if (context)
686 			iommu_free_page(context);
687 
688 		if (!sm_supported(iommu))
689 			continue;
690 
691 		context = iommu_context_addr(iommu, i, 0x80, 0);
692 		if (context)
693 			iommu_free_page(context);
694 	}
695 
696 	iommu_free_page(iommu->root_entry);
697 	iommu->root_entry = NULL;
698 }
699 
700 #ifdef CONFIG_DMAR_DEBUG
701 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
702 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
703 {
704 	struct dma_pte *pte;
705 	int offset;
706 
707 	while (1) {
708 		offset = pfn_level_offset(pfn, level);
709 		pte = &parent[offset];
710 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
711 			pr_info("PTE not present at level %d\n", level);
712 			break;
713 		}
714 
715 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
716 
717 		if (level == 1)
718 			break;
719 
720 		parent = phys_to_virt(dma_pte_addr(pte));
721 		level--;
722 	}
723 }
724 
725 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
726 			  unsigned long long addr, u32 pasid)
727 {
728 	struct pasid_dir_entry *dir, *pde;
729 	struct pasid_entry *entries, *pte;
730 	struct context_entry *ctx_entry;
731 	struct root_entry *rt_entry;
732 	int i, dir_index, index, level;
733 	u8 devfn = source_id & 0xff;
734 	u8 bus = source_id >> 8;
735 	struct dma_pte *pgtable;
736 
737 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
738 
739 	/* root entry dump */
740 	rt_entry = &iommu->root_entry[bus];
741 	if (!rt_entry) {
742 		pr_info("root table entry is not present\n");
743 		return;
744 	}
745 
746 	if (sm_supported(iommu))
747 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
748 			rt_entry->hi, rt_entry->lo);
749 	else
750 		pr_info("root entry: 0x%016llx", rt_entry->lo);
751 
752 	/* context entry dump */
753 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
754 	if (!ctx_entry) {
755 		pr_info("context table entry is not present\n");
756 		return;
757 	}
758 
759 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
760 		ctx_entry->hi, ctx_entry->lo);
761 
762 	/* legacy mode does not require PASID entries */
763 	if (!sm_supported(iommu)) {
764 		level = agaw_to_level(ctx_entry->hi & 7);
765 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
766 		goto pgtable_walk;
767 	}
768 
769 	/* get the pointer to pasid directory entry */
770 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
771 	if (!dir) {
772 		pr_info("pasid directory entry is not present\n");
773 		return;
774 	}
775 	/* For request-without-pasid, get the pasid from context entry */
776 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
777 		pasid = IOMMU_NO_PASID;
778 
779 	dir_index = pasid >> PASID_PDE_SHIFT;
780 	pde = &dir[dir_index];
781 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
782 
783 	/* get the pointer to the pasid table entry */
784 	entries = get_pasid_table_from_pde(pde);
785 	if (!entries) {
786 		pr_info("pasid table entry is not present\n");
787 		return;
788 	}
789 	index = pasid & PASID_PTE_MASK;
790 	pte = &entries[index];
791 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
792 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
793 
794 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
795 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
796 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
797 	} else {
798 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
799 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
800 	}
801 
802 pgtable_walk:
803 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
804 }
805 #endif
806 
807 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
808 				      unsigned long pfn, int *target_level,
809 				      gfp_t gfp)
810 {
811 	struct dma_pte *parent, *pte;
812 	int level = agaw_to_level(domain->agaw);
813 	int offset;
814 
815 	if (!domain_pfn_supported(domain, pfn))
816 		/* Address beyond IOMMU's addressing capabilities. */
817 		return NULL;
818 
819 	parent = domain->pgd;
820 
821 	while (1) {
822 		void *tmp_page;
823 
824 		offset = pfn_level_offset(pfn, level);
825 		pte = &parent[offset];
826 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
827 			break;
828 		if (level == *target_level)
829 			break;
830 
831 		if (!dma_pte_present(pte)) {
832 			uint64_t pteval, tmp;
833 
834 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
835 
836 			if (!tmp_page)
837 				return NULL;
838 
839 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
840 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
841 			if (domain->use_first_level)
842 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
843 
844 			tmp = 0ULL;
845 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
846 				/* Someone else set it while we were thinking; use theirs. */
847 				iommu_free_page(tmp_page);
848 			else
849 				domain_flush_cache(domain, pte, sizeof(*pte));
850 		}
851 		if (level == 1)
852 			break;
853 
854 		parent = phys_to_virt(dma_pte_addr(pte));
855 		level--;
856 	}
857 
858 	if (!*target_level)
859 		*target_level = level;
860 
861 	return pte;
862 }
863 
864 /* return address's pte at specific level */
865 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
866 					 unsigned long pfn,
867 					 int level, int *large_page)
868 {
869 	struct dma_pte *parent, *pte;
870 	int total = agaw_to_level(domain->agaw);
871 	int offset;
872 
873 	parent = domain->pgd;
874 	while (level <= total) {
875 		offset = pfn_level_offset(pfn, total);
876 		pte = &parent[offset];
877 		if (level == total)
878 			return pte;
879 
880 		if (!dma_pte_present(pte)) {
881 			*large_page = total;
882 			break;
883 		}
884 
885 		if (dma_pte_superpage(pte)) {
886 			*large_page = total;
887 			return pte;
888 		}
889 
890 		parent = phys_to_virt(dma_pte_addr(pte));
891 		total--;
892 	}
893 	return NULL;
894 }
895 
896 /* clear last level pte, a tlb flush should be followed */
897 static void dma_pte_clear_range(struct dmar_domain *domain,
898 				unsigned long start_pfn,
899 				unsigned long last_pfn)
900 {
901 	unsigned int large_page;
902 	struct dma_pte *first_pte, *pte;
903 
904 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
905 	    WARN_ON(start_pfn > last_pfn))
906 		return;
907 
908 	/* we don't need lock here; nobody else touches the iova range */
909 	do {
910 		large_page = 1;
911 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
912 		if (!pte) {
913 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
914 			continue;
915 		}
916 		do {
917 			dma_clear_pte(pte);
918 			start_pfn += lvl_to_nr_pages(large_page);
919 			pte++;
920 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
921 
922 		domain_flush_cache(domain, first_pte,
923 				   (void *)pte - (void *)first_pte);
924 
925 	} while (start_pfn && start_pfn <= last_pfn);
926 }
927 
928 static void dma_pte_free_level(struct dmar_domain *domain, int level,
929 			       int retain_level, struct dma_pte *pte,
930 			       unsigned long pfn, unsigned long start_pfn,
931 			       unsigned long last_pfn)
932 {
933 	pfn = max(start_pfn, pfn);
934 	pte = &pte[pfn_level_offset(pfn, level)];
935 
936 	do {
937 		unsigned long level_pfn;
938 		struct dma_pte *level_pte;
939 
940 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
941 			goto next;
942 
943 		level_pfn = pfn & level_mask(level);
944 		level_pte = phys_to_virt(dma_pte_addr(pte));
945 
946 		if (level > 2) {
947 			dma_pte_free_level(domain, level - 1, retain_level,
948 					   level_pte, level_pfn, start_pfn,
949 					   last_pfn);
950 		}
951 
952 		/*
953 		 * Free the page table if we're below the level we want to
954 		 * retain and the range covers the entire table.
955 		 */
956 		if (level < retain_level && !(start_pfn > level_pfn ||
957 		      last_pfn < level_pfn + level_size(level) - 1)) {
958 			dma_clear_pte(pte);
959 			domain_flush_cache(domain, pte, sizeof(*pte));
960 			iommu_free_page(level_pte);
961 		}
962 next:
963 		pfn += level_size(level);
964 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
965 }
966 
967 /*
968  * clear last level (leaf) ptes and free page table pages below the
969  * level we wish to keep intact.
970  */
971 static void dma_pte_free_pagetable(struct dmar_domain *domain,
972 				   unsigned long start_pfn,
973 				   unsigned long last_pfn,
974 				   int retain_level)
975 {
976 	dma_pte_clear_range(domain, start_pfn, last_pfn);
977 
978 	/* We don't need lock here; nobody else touches the iova range */
979 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
980 			   domain->pgd, 0, start_pfn, last_pfn);
981 
982 	/* free pgd */
983 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
984 		iommu_free_page(domain->pgd);
985 		domain->pgd = NULL;
986 	}
987 }
988 
989 /* When a page at a given level is being unlinked from its parent, we don't
990    need to *modify* it at all. All we need to do is make a list of all the
991    pages which can be freed just as soon as we've flushed the IOTLB and we
992    know the hardware page-walk will no longer touch them.
993    The 'pte' argument is the *parent* PTE, pointing to the page that is to
994    be freed. */
995 static void dma_pte_list_pagetables(struct dmar_domain *domain,
996 				    int level, struct dma_pte *pte,
997 				    struct list_head *freelist)
998 {
999 	struct page *pg;
1000 
1001 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1002 	list_add_tail(&pg->lru, freelist);
1003 
1004 	if (level == 1)
1005 		return;
1006 
1007 	pte = page_address(pg);
1008 	do {
1009 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1010 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1011 		pte++;
1012 	} while (!first_pte_in_page(pte));
1013 }
1014 
/*
 * Clear the entries of one page-table page that fall inside the pfn range
 * [start_pfn, last_pfn], recursing into lower levels where an entry is only
 * partially covered.
 *
 * @domain:    domain owning the page table
 * @level:     level of the table that @pte belongs to
 * @pte:       first entry of the table page to process
 * @pfn:       first pfn mapped by that table page
 * @start_pfn: first pfn of the range being cleared
 * @last_pfn:  last pfn (inclusive) of the range being cleared
 * @freelist:  list that receives lower-level table pages which became
 *             unreachable; nothing is freed here
 *
 * Entries cleared at this level are written back with a single
 * domain_flush_cache() call covering the first through last cleared entry.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	/* Begin at whichever is later: the table's base or the range start. */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		/* First pfn covered by the current entry at this level. */
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			/* Remember the span of cleared entries for the flush below. */
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		/* Advance to the first pfn covered by the next entry. */
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* Flush [first_pte, last_pte] inclusive, hence the pre-increment. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1058 
1059 /* We can't just free the pages because the IOMMU may still be walking
1060    the page tables, and may have cached the intermediate levels. The
1061    pages can only be freed after the IOTLB flush has been done. */
1062 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1063 			 unsigned long last_pfn, struct list_head *freelist)
1064 {
1065 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1066 	    WARN_ON(start_pfn > last_pfn))
1067 		return;
1068 
1069 	/* we don't need lock here; nobody else touches the iova range */
1070 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1072 
1073 	/* free pgd */
1074 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075 		struct page *pgd_page = virt_to_page(domain->pgd);
1076 		list_add_tail(&pgd_page->lru, freelist);
1077 		domain->pgd = NULL;
1078 	}
1079 }
1080 
1081 /* iommu handling */
1082 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1083 {
1084 	struct root_entry *root;
1085 
1086 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1087 	if (!root) {
1088 		pr_err("Allocating root entry for %s failed\n",
1089 			iommu->name);
1090 		return -ENOMEM;
1091 	}
1092 
1093 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1094 	iommu->root_entry = root;
1095 
1096 	return 0;
1097 }
1098 
1099 static void iommu_set_root_entry(struct intel_iommu *iommu)
1100 {
1101 	u64 addr;
1102 	u32 sts;
1103 	unsigned long flag;
1104 
1105 	addr = virt_to_phys(iommu->root_entry);
1106 	if (sm_supported(iommu))
1107 		addr |= DMA_RTADDR_SMT;
1108 
1109 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1110 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1111 
1112 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1113 
1114 	/* Make sure hardware complete it */
1115 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1116 		      readl, (sts & DMA_GSTS_RTPS), sts);
1117 
1118 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1119 
1120 	/*
1121 	 * Hardware invalidates all DMA remapping hardware translation
1122 	 * caches as part of SRTP flow.
1123 	 */
1124 	if (cap_esrtps(iommu->cap))
1125 		return;
1126 
1127 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1128 	if (sm_supported(iommu))
1129 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1130 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1131 }
1132 
1133 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1134 {
1135 	u32 val;
1136 	unsigned long flag;
1137 
1138 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1139 		return;
1140 
1141 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1142 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1143 
1144 	/* Make sure hardware complete it */
1145 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1147 
1148 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1149 }
1150 
1151 /* return value determine if we need a write buffer flush */
1152 static void __iommu_flush_context(struct intel_iommu *iommu,
1153 				  u16 did, u16 source_id, u8 function_mask,
1154 				  u64 type)
1155 {
1156 	u64 val = 0;
1157 	unsigned long flag;
1158 
1159 	switch (type) {
1160 	case DMA_CCMD_GLOBAL_INVL:
1161 		val = DMA_CCMD_GLOBAL_INVL;
1162 		break;
1163 	case DMA_CCMD_DOMAIN_INVL:
1164 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1165 		break;
1166 	case DMA_CCMD_DEVICE_INVL:
1167 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1168 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1169 		break;
1170 	default:
1171 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1172 			iommu->name, type);
1173 		return;
1174 	}
1175 	val |= DMA_CCMD_ICC;
1176 
1177 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1178 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1179 
1180 	/* Make sure hardware complete it */
1181 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1182 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1183 
1184 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1185 }
1186 
1187 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1188 			 unsigned int size_order, u64 type)
1189 {
1190 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191 	u64 val = 0, val_iva = 0;
1192 	unsigned long flag;
1193 
1194 	switch (type) {
1195 	case DMA_TLB_GLOBAL_FLUSH:
1196 		/* global flush doesn't need set IVA_REG */
1197 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1198 		break;
1199 	case DMA_TLB_DSI_FLUSH:
1200 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1201 		break;
1202 	case DMA_TLB_PSI_FLUSH:
1203 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204 		/* IH bit is passed in as part of address */
1205 		val_iva = size_order | addr;
1206 		break;
1207 	default:
1208 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1209 			iommu->name, type);
1210 		return;
1211 	}
1212 
1213 	if (cap_write_drain(iommu->cap))
1214 		val |= DMA_TLB_WRITE_DRAIN;
1215 
1216 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 	/* Note: Only uses first TLB reg currently */
1218 	if (val_iva)
1219 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1220 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1221 
1222 	/* Make sure hardware complete it */
1223 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1224 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1225 
1226 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227 
1228 	/* check IOTLB invalidation granularity */
1229 	if (DMA_TLB_IAIG(val) == 0)
1230 		pr_err("Flush IOTLB failed\n");
1231 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1232 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1233 			(unsigned long long)DMA_TLB_IIRG(type),
1234 			(unsigned long long)DMA_TLB_IAIG(val));
1235 }
1236 
1237 static struct device_domain_info *
1238 domain_lookup_dev_info(struct dmar_domain *domain,
1239 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1240 {
1241 	struct device_domain_info *info;
1242 	unsigned long flags;
1243 
1244 	spin_lock_irqsave(&domain->lock, flags);
1245 	list_for_each_entry(info, &domain->devices, link) {
1246 		if (info->iommu == iommu && info->bus == bus &&
1247 		    info->devfn == devfn) {
1248 			spin_unlock_irqrestore(&domain->lock, flags);
1249 			return info;
1250 		}
1251 	}
1252 	spin_unlock_irqrestore(&domain->lock, flags);
1253 
1254 	return NULL;
1255 }
1256 
1257 /*
1258  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1259  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1260  * check because it applies only to the built-in QAT devices and it doesn't
1261  * grant additional privileges.
1262  */
1263 #define BUGGY_QAT_DEVID_MASK 0x4940
1264 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1265 {
1266 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1267 		return false;
1268 
1269 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1270 		return false;
1271 
1272 	return true;
1273 }
1274 
1275 static void iommu_enable_pci_caps(struct device_domain_info *info)
1276 {
1277 	struct pci_dev *pdev;
1278 
1279 	if (!dev_is_pci(info->dev))
1280 		return;
1281 
1282 	pdev = to_pci_dev(info->dev);
1283 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1284 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1285 		info->ats_enabled = 1;
1286 }
1287 
1288 static void iommu_disable_pci_caps(struct device_domain_info *info)
1289 {
1290 	struct pci_dev *pdev;
1291 
1292 	if (!dev_is_pci(info->dev))
1293 		return;
1294 
1295 	pdev = to_pci_dev(info->dev);
1296 
1297 	if (info->ats_enabled) {
1298 		pci_disable_ats(pdev);
1299 		info->ats_enabled = 0;
1300 	}
1301 }
1302 
/* Flush all caches tagged for @domain via its cache tags. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	cache_tag_flush_all(dmar_domain);
}
1307 
1308 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1309 {
1310 	u32 pmen;
1311 	unsigned long flags;
1312 
1313 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1314 		return;
1315 
1316 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1317 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1318 	pmen &= ~DMA_PMEN_EPM;
1319 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1320 
1321 	/* wait for the protected region status bit to clear */
1322 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1323 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1324 
1325 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1326 }
1327 
1328 static void iommu_enable_translation(struct intel_iommu *iommu)
1329 {
1330 	u32 sts;
1331 	unsigned long flags;
1332 
1333 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1334 	iommu->gcmd |= DMA_GCMD_TE;
1335 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1336 
1337 	/* Make sure hardware complete it */
1338 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1339 		      readl, (sts & DMA_GSTS_TES), sts);
1340 
1341 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1342 }
1343 
1344 static void iommu_disable_translation(struct intel_iommu *iommu)
1345 {
1346 	u32 sts;
1347 	unsigned long flag;
1348 
1349 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1350 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1351 		return;
1352 
1353 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354 	iommu->gcmd &= ~DMA_GCMD_TE;
1355 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1356 
1357 	/* Make sure hardware complete it */
1358 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1359 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1360 
1361 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1362 }
1363 
1364 static int iommu_init_domains(struct intel_iommu *iommu)
1365 {
1366 	u32 ndomains;
1367 
1368 	ndomains = cap_ndoms(iommu->cap);
1369 	pr_debug("%s: Number of Domains supported <%d>\n",
1370 		 iommu->name, ndomains);
1371 
1372 	spin_lock_init(&iommu->lock);
1373 
1374 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1375 	if (!iommu->domain_ids)
1376 		return -ENOMEM;
1377 
1378 	/*
1379 	 * If Caching mode is set, then invalid translations are tagged
1380 	 * with domain-id 0, hence we need to pre-allocate it. We also
1381 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1382 	 * make sure it is not used for a real domain.
1383 	 */
1384 	set_bit(0, iommu->domain_ids);
1385 
1386 	/*
1387 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1388 	 * entry for first-level or pass-through translation modes should
1389 	 * be programmed with a domain id different from those used for
1390 	 * second-level or nested translation. We reserve a domain id for
1391 	 * this purpose. This domain id is also used for identity domain
1392 	 * in legacy mode.
1393 	 */
1394 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1395 
1396 	return 0;
1397 }
1398 
1399 static void disable_dmar_iommu(struct intel_iommu *iommu)
1400 {
1401 	if (!iommu->domain_ids)
1402 		return;
1403 
1404 	/*
1405 	 * All iommu domains must have been detached from the devices,
1406 	 * hence there should be no domain IDs in use.
1407 	 */
1408 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1409 		    > NUM_RESERVED_DID))
1410 		return;
1411 
1412 	if (iommu->gcmd & DMA_GCMD_TE)
1413 		iommu_disable_translation(iommu);
1414 }
1415 
1416 static void free_dmar_iommu(struct intel_iommu *iommu)
1417 {
1418 	if (iommu->domain_ids) {
1419 		bitmap_free(iommu->domain_ids);
1420 		iommu->domain_ids = NULL;
1421 	}
1422 
1423 	if (iommu->copied_tables) {
1424 		bitmap_free(iommu->copied_tables);
1425 		iommu->copied_tables = NULL;
1426 	}
1427 
1428 	/* free context mapping */
1429 	free_context_table(iommu);
1430 
1431 #ifdef CONFIG_INTEL_IOMMU_SVM
1432 	if (pasid_supported(iommu)) {
1433 		if (ecap_prs(iommu->ecap))
1434 			intel_svm_finish_prq(iommu);
1435 	}
1436 #endif
1437 }
1438 
1439 /*
1440  * Check and return whether first level is used by default for
1441  * DMA translation.
1442  */
1443 static bool first_level_by_default(unsigned int type)
1444 {
1445 	/* Only SL is available in legacy mode */
1446 	if (!scalable_mode_support())
1447 		return false;
1448 
1449 	/* Only level (either FL or SL) is available, just use it */
1450 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1451 		return intel_cap_flts_sanity();
1452 
1453 	/* Both levels are available, decide it based on domain type */
1454 	return type != IOMMU_DOMAIN_UNMANAGED;
1455 }
1456 
1457 static struct dmar_domain *alloc_domain(unsigned int type)
1458 {
1459 	struct dmar_domain *domain;
1460 
1461 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1462 	if (!domain)
1463 		return NULL;
1464 
1465 	domain->nid = NUMA_NO_NODE;
1466 	if (first_level_by_default(type))
1467 		domain->use_first_level = true;
1468 	INIT_LIST_HEAD(&domain->devices);
1469 	INIT_LIST_HEAD(&domain->dev_pasids);
1470 	INIT_LIST_HEAD(&domain->cache_tags);
1471 	spin_lock_init(&domain->lock);
1472 	spin_lock_init(&domain->cache_lock);
1473 	xa_init(&domain->iommu_array);
1474 
1475 	return domain;
1476 }
1477 
1478 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1479 {
1480 	struct iommu_domain_info *info, *curr;
1481 	unsigned long ndomains;
1482 	int num, ret = -ENOSPC;
1483 
1484 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1485 		return 0;
1486 
1487 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1488 	if (!info)
1489 		return -ENOMEM;
1490 
1491 	spin_lock(&iommu->lock);
1492 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1493 	if (curr) {
1494 		curr->refcnt++;
1495 		spin_unlock(&iommu->lock);
1496 		kfree(info);
1497 		return 0;
1498 	}
1499 
1500 	ndomains = cap_ndoms(iommu->cap);
1501 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1502 	if (num >= ndomains) {
1503 		pr_err("%s: No free domain ids\n", iommu->name);
1504 		goto err_unlock;
1505 	}
1506 
1507 	set_bit(num, iommu->domain_ids);
1508 	info->refcnt	= 1;
1509 	info->did	= num;
1510 	info->iommu	= iommu;
1511 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1512 			  NULL, info, GFP_ATOMIC);
1513 	if (curr) {
1514 		ret = xa_err(curr) ? : -EBUSY;
1515 		goto err_clear;
1516 	}
1517 	domain_update_iommu_cap(domain);
1518 
1519 	spin_unlock(&iommu->lock);
1520 	return 0;
1521 
1522 err_clear:
1523 	clear_bit(info->did, iommu->domain_ids);
1524 err_unlock:
1525 	spin_unlock(&iommu->lock);
1526 	kfree(info);
1527 	return ret;
1528 }
1529 
1530 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1531 {
1532 	struct iommu_domain_info *info;
1533 
1534 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1535 		return;
1536 
1537 	spin_lock(&iommu->lock);
1538 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1539 	if (--info->refcnt == 0) {
1540 		clear_bit(info->did, iommu->domain_ids);
1541 		xa_erase(&domain->iommu_array, iommu->seq_id);
1542 		domain->nid = NUMA_NO_NODE;
1543 		domain_update_iommu_cap(domain);
1544 		kfree(info);
1545 	}
1546 	spin_unlock(&iommu->lock);
1547 }
1548 
/*
 * Round a guest address width up to the next width of the form 12 + 9 * n
 * bits and cap the result at 64.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1562 
1563 static void domain_exit(struct dmar_domain *domain)
1564 {
1565 	if (domain->pgd) {
1566 		LIST_HEAD(freelist);
1567 
1568 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1569 		iommu_put_pages_list(&freelist);
1570 	}
1571 
1572 	if (WARN_ON(!list_empty(&domain->devices)))
1573 		return;
1574 
1575 	kfree(domain->qi_batch);
1576 	kfree(domain);
1577 }
1578 
1579 /*
1580  * For kdump cases, old valid entries may be cached due to the
1581  * in-flight DMA and copied pgtable, but there is no unmapping
1582  * behaviour for them, thus we need an explicit cache flush for
1583  * the newly-mapped device. For kdump, at this point, the device
1584  * is supposed to finish reset at its driver probe stage, so no
1585  * in-flight DMA will exist, and we don't need to worry anymore
1586  * hereafter.
1587  */
1588 static void copied_context_tear_down(struct intel_iommu *iommu,
1589 				     struct context_entry *context,
1590 				     u8 bus, u8 devfn)
1591 {
1592 	u16 did_old;
1593 
1594 	if (!context_copied(iommu, bus, devfn))
1595 		return;
1596 
1597 	assert_spin_locked(&iommu->lock);
1598 
1599 	did_old = context_domain_id(context);
1600 	context_clear_entry(context);
1601 
1602 	if (did_old < cap_ndoms(iommu->cap)) {
1603 		iommu->flush.flush_context(iommu, did_old,
1604 					   (((u16)bus) << 8) | devfn,
1605 					   DMA_CCMD_MASK_NOBIT,
1606 					   DMA_CCMD_DEVICE_INVL);
1607 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1608 					 DMA_TLB_DSI_FLUSH);
1609 	}
1610 
1611 	clear_context_copied(iommu, bus, devfn);
1612 }
1613 
1614 /*
1615  * It's a non-present to present mapping. If hardware doesn't cache
1616  * non-present entry we only need to flush the write-buffer. If the
1617  * _does_ cache non-present entries, then it does so in the special
1618  * domain #0, which we have to flush:
1619  */
1620 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1621 					u8 bus, u8 devfn)
1622 {
1623 	if (cap_caching_mode(iommu->cap)) {
1624 		iommu->flush.flush_context(iommu, 0,
1625 					   (((u16)bus) << 8) | devfn,
1626 					   DMA_CCMD_MASK_NOBIT,
1627 					   DMA_CCMD_DEVICE_INVL);
1628 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1629 	} else {
1630 		iommu_flush_write_buffer(iommu);
1631 	}
1632 }
1633 
/*
 * Program the context entry for @bus/@devfn on @iommu so that it points at
 * @domain's page table.
 *
 * An entry that is already present and was not copied (see
 * context_copied()) is left as is and 0 is returned. A copied entry is
 * torn down first and then reprogrammed.
 *
 * Return: 0 on success or when nothing had to be done, -ENOMEM if the
 * context entry could not be obtained or if a page-table level that has to
 * be skipped is not present.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      u8 bus, u8 devfn)
{
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct dma_pte *pgd = domain->pgd;
	struct context_entry *context;
	int agaw, ret;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	/* NOTE(review): the final argument (1) presumably requests allocation
	 * of a missing context table - confirm against iommu_context_addr(). */
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	/* Already set up by us: nothing to do. */
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	copied_context_tear_down(iommu, context, bus, devfn);
	context_clear_entry(context);

	context_set_domain_id(context, did);

	/*
	 * Skip top levels of page tables for iommu which has
	 * less agaw than default. Unnecessary for PT mode.
	 */
	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
		ret = -ENOMEM;
		pgd = phys_to_virt(dma_pte_addr(pgd));
		if (!dma_pte_present(pgd))
			goto out_unlock;
	}

	/* Use the device-IOTLB translation type only for ATS-capable devices. */
	if (info && info->ats_supported)
		translation = CONTEXT_TT_DEV_IOTLB;
	else
		translation = CONTEXT_TT_MULTI_LEVEL;

	/* 'agaw' is the value the loop above stopped at. */
	context_set_address_root(context, virt_to_phys(pgd));
	context_set_address_width(context, agaw);
	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	/* Without coherent page walks the entry must be flushed from the CPU cache. */
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));
	context_present_cache_flush(iommu, did, bus, devfn);
	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}
1695 
1696 static int domain_context_mapping_cb(struct pci_dev *pdev,
1697 				     u16 alias, void *opaque)
1698 {
1699 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1700 	struct intel_iommu *iommu = info->iommu;
1701 	struct dmar_domain *domain = opaque;
1702 
1703 	return domain_context_mapping_one(domain, iommu,
1704 					  PCI_BUS_NUM(alias), alias & 0xff);
1705 }
1706 
1707 static int
1708 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1709 {
1710 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1711 	struct intel_iommu *iommu = info->iommu;
1712 	u8 bus = info->bus, devfn = info->devfn;
1713 
1714 	if (!dev_is_pci(dev))
1715 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1716 
1717 	return pci_for_each_dma_alias(to_pci_dev(dev),
1718 				      domain_context_mapping_cb, domain);
1719 }
1720 
1721 /* Return largest possible superpage level for a given mapping */
1722 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1723 				   unsigned long phy_pfn, unsigned long pages)
1724 {
1725 	int support, level = 1;
1726 	unsigned long pfnmerge;
1727 
1728 	support = domain->iommu_superpage;
1729 
1730 	/* To use a large page, the virtual *and* physical addresses
1731 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1732 	   of them will mean we have to use smaller pages. So just
1733 	   merge them and check both at once. */
1734 	pfnmerge = iov_pfn | phy_pfn;
1735 
1736 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1737 		pages >>= VTD_STRIDE_SHIFT;
1738 		if (!pages)
1739 			break;
1740 		pfnmerge >>= VTD_STRIDE_SHIFT;
1741 		level++;
1742 		support--;
1743 	}
1744 	return level;
1745 }
1746 
/*
 * Ensure that old small page tables are removed to make room for superpage(s).
 * We're going to add new large pages, so make sure we don't remove their parent
 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
 *
 * @domain:    domain whose page table is modified
 * @start_pfn: first pfn of the range that will receive superpages
 * @end_pfn:   last pfn (inclusive) of that range
 * @level:     superpage level about to be installed
 *
 * The range is walked in steps of one superpage of @level.
 */
static void switch_to_super_page(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long end_pfn, int level)
{
	unsigned long lvl_pages = lvl_to_nr_pages(level);
	struct dma_pte *pte = NULL;

	while (start_pfn <= end_pfn) {
		/*
		 * (Re)locate the entry for start_pfn at the start and after
		 * stepping past the end of a table page.
		 * NOTE(review): the result is dereferenced below without a
		 * NULL check - confirm the caller guarantees the lookup
		 * cannot fail.
		 */
		if (!pte)
			pte = pfn_to_dma_pte(domain, start_pfn, &level,
					     GFP_ATOMIC);

		if (dma_pte_present(pte)) {
			/* Drop the tables below this entry, keeping levels above. */
			dma_pte_free_pagetable(domain, start_pfn,
					       start_pfn + lvl_pages - 1,
					       level + 1);

			/* Flushes from the current position through end_pfn. */
			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
					      end_pfn << VTD_PAGE_SHIFT, 0);
		}

		pte++;
		start_pfn += lvl_pages;
		/* Crossed into a new table page: force a fresh lookup. */
		if (first_pte_in_page(pte))
			pte = NULL;
	}
}
1779 
/*
 * Map @nr_pages pages starting at @iov_pfn to physical pages starting at
 * @phys_pfn in @domain's page table, using superpages where
 * hardware_largepage_caps() allows it.
 *
 * @domain:   domain to map into
 * @iov_pfn:  first IOVA pfn
 * @phys_pfn: first physical pfn
 * @nr_pages: number of pages to map
 * @prot:     DMA_PTE_READ / DMA_PTE_WRITE / DMA_PTE_SNP bits; at least one
 *            of read or write is required
 * @gfp:      allocation flags passed on when page-table pages are needed
 *
 * Finding an entry that is already set is logged and triggers a WARN, but
 * the loop carries on and no error is returned for it.
 *
 * Return: 0 on success, -EINVAL if the last pfn is not supported by the
 * domain, if @prot has neither read nor write, or if a read-only mapping is
 * requested on a nested parent domain; -ENOMEM if a page-table entry could
 * not be obtained.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	/* Attribute bits shared by every entry written below. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	domain->has_mappings = true;

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		/* No current entry: pick a page size and look the entry up. */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* It is large page*/
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				/* Limit the cleanup to the rest of this table page. */
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = 0ULL;
		/* Install the entry only if the slot currently holds zero. */
		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
			/* Cap the number of full mapping dumps. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		/* Advance by one entry's worth of pages at the current level. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
1887 
1888 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1889 {
1890 	struct intel_iommu *iommu = info->iommu;
1891 	struct context_entry *context;
1892 	u16 did;
1893 
1894 	spin_lock(&iommu->lock);
1895 	context = iommu_context_addr(iommu, bus, devfn, 0);
1896 	if (!context) {
1897 		spin_unlock(&iommu->lock);
1898 		return;
1899 	}
1900 
1901 	did = context_domain_id(context);
1902 	context_clear_entry(context);
1903 	__iommu_flush_cache(iommu, context, sizeof(*context));
1904 	spin_unlock(&iommu->lock);
1905 	intel_context_flush_present(info, context, did, true);
1906 }
1907 
1908 static int domain_setup_first_level(struct intel_iommu *iommu,
1909 				    struct dmar_domain *domain,
1910 				    struct device *dev,
1911 				    u32 pasid)
1912 {
1913 	struct dma_pte *pgd = domain->pgd;
1914 	int agaw, level;
1915 	int flags = 0;
1916 
1917 	/*
1918 	 * Skip top levels of page tables for iommu which has
1919 	 * less agaw than default. Unnecessary for PT mode.
1920 	 */
1921 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1922 		pgd = phys_to_virt(dma_pte_addr(pgd));
1923 		if (!dma_pte_present(pgd))
1924 			return -ENOMEM;
1925 	}
1926 
1927 	level = agaw_to_level(agaw);
1928 	if (level != 4 && level != 5)
1929 		return -EINVAL;
1930 
1931 	if (level == 5)
1932 		flags |= PASID_FLAG_FL5LP;
1933 
1934 	if (domain->force_snooping)
1935 		flags |= PASID_FLAG_PAGE_SNOOP;
1936 
1937 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1938 					     domain_id_iommu(domain, iommu),
1939 					     flags);
1940 }
1941 
1942 static bool dev_is_real_dma_subdevice(struct device *dev)
1943 {
1944 	return dev && dev_is_pci(dev) &&
1945 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1946 }
1947 
1948 static int dmar_domain_attach_device(struct dmar_domain *domain,
1949 				     struct device *dev)
1950 {
1951 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1952 	struct intel_iommu *iommu = info->iommu;
1953 	unsigned long flags;
1954 	int ret;
1955 
1956 	ret = domain_attach_iommu(domain, iommu);
1957 	if (ret)
1958 		return ret;
1959 
1960 	info->domain = domain;
1961 	spin_lock_irqsave(&domain->lock, flags);
1962 	list_add(&info->link, &domain->devices);
1963 	spin_unlock_irqrestore(&domain->lock, flags);
1964 
1965 	if (dev_is_real_dma_subdevice(dev))
1966 		return 0;
1967 
1968 	if (!sm_supported(iommu))
1969 		ret = domain_context_mapping(domain, dev);
1970 	else if (domain->use_first_level)
1971 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
1972 	else
1973 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
1974 
1975 	if (ret)
1976 		goto out_block_translation;
1977 
1978 	iommu_enable_pci_caps(info);
1979 
1980 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1981 	if (ret)
1982 		goto out_block_translation;
1983 
1984 	return 0;
1985 
1986 out_block_translation:
1987 	device_block_translation(dev);
1988 	return ret;
1989 }
1990 
1991 /**
1992  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1993  * is relaxable (ie. is allowed to be not enforced under some conditions)
1994  * @dev: device handle
1995  *
1996  * We assume that PCI USB devices with RMRRs have them largely
1997  * for historical reasons and that the RMRR space is not actively used post
1998  * boot.  This exclusion may change if vendors begin to abuse it.
1999  *
2000  * The same exception is made for graphics devices, with the requirement that
2001  * any use of the RMRR regions will be torn down before assigning the device
2002  * to a guest.
2003  *
2004  * Return: true if the RMRR is relaxable, false otherwise
2005  */
2006 static bool device_rmrr_is_relaxable(struct device *dev)
2007 {
2008 	struct pci_dev *pdev;
2009 
2010 	if (!dev_is_pci(dev))
2011 		return false;
2012 
2013 	pdev = to_pci_dev(dev);
2014 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2015 		return true;
2016 	else
2017 		return false;
2018 }
2019 
2020 static int device_def_domain_type(struct device *dev)
2021 {
2022 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2023 	struct intel_iommu *iommu = info->iommu;
2024 
2025 	/*
2026 	 * Hardware does not support the passthrough translation mode.
2027 	 * Always use a dynamaic mapping domain.
2028 	 */
2029 	if (!ecap_pass_through(iommu->ecap))
2030 		return IOMMU_DOMAIN_DMA;
2031 
2032 	if (dev_is_pci(dev)) {
2033 		struct pci_dev *pdev = to_pci_dev(dev);
2034 
2035 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2036 			return IOMMU_DOMAIN_IDENTITY;
2037 	}
2038 
2039 	return 0;
2040 }
2041 
2042 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2043 {
2044 	/*
2045 	 * Start from the sane iommu hardware state.
2046 	 * If the queued invalidation is already initialized by us
2047 	 * (for example, while enabling interrupt-remapping) then
2048 	 * we got the things already rolling from a sane state.
2049 	 */
2050 	if (!iommu->qi) {
2051 		/*
2052 		 * Clear any previous faults.
2053 		 */
2054 		dmar_fault(-1, iommu);
2055 		/*
2056 		 * Disable queued invalidation if supported and already enabled
2057 		 * before OS handover.
2058 		 */
2059 		dmar_disable_qi(iommu);
2060 	}
2061 
2062 	if (dmar_enable_qi(iommu)) {
2063 		/*
2064 		 * Queued Invalidate not enabled, use Register Based Invalidate
2065 		 */
2066 		iommu->flush.flush_context = __iommu_flush_context;
2067 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2068 		pr_info("%s: Using Register based invalidation\n",
2069 			iommu->name);
2070 	} else {
2071 		iommu->flush.flush_context = qi_flush_context;
2072 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2073 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2074 	}
2075 }
2076 
/*
 * Copy the context table(s) referenced by one old root entry into freshly
 * allocated pages.
 *
 * @iommu:  IOMMU the tables belong to
 * @old_re: root entry of @bus in the old (memremap'ed) root table
 * @tbl:    array receiving the new context table pointers; one slot per
 *          bus, or two consecutive slots per bus when @ext is true
 * @bus:    bus number the root entry describes
 * @ext:    true when the old tables use two context tables per bus
 *          (lower and upper context-table pointer)
 *
 * Every present entry that is copied is recorded via set_context_copied()
 * and, if its domain id is within cap_ndoms(), the id is marked as used in
 * iommu->domain_ids.
 *
 * Return: 0 on success or when the old root entry references no table,
 * -ENOMEM when mapping the old table or allocating the new page fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	/* First slot for this bus in @tbl; work on a private copy of the entry. */
	tbl_idx = ext ? bus * 2 : bus;
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/*
		 * First calculate the correct index. With @ext the index
		 * advances by two per devfn and wraps, so it is 0 again at
		 * devfn 0x80, where the second table starts.
		 */
		idx = (ext ? devfn * 2 : devfn) % 256;

		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				/* The table finished after the loop goes to the next slot. */
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			/* Lower pointer for the first half of devfns, upper for the rest. */
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/*
					 * No LCTP, try UCTP: the loop increment
					 * moves devfn from 0x7f to 0x80.
					 */
					devfn = 0x7f;
					continue;
				} else {
					/* Nothing (more) to copy; ret is 0 here. */
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		/* Entries that are not present are left untouched in the new page. */
		if (!context_present(&ce))
			continue;

		/* Reserve the domain id used by the old entry when it is in range. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	/* Store the last table that was filled (slot tbl_idx or tbl_idx + 1). */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2160 
2161 static int copy_translation_tables(struct intel_iommu *iommu)
2162 {
2163 	struct context_entry **ctxt_tbls;
2164 	struct root_entry *old_rt;
2165 	phys_addr_t old_rt_phys;
2166 	int ctxt_table_entries;
2167 	u64 rtaddr_reg;
2168 	int bus, ret;
2169 	bool new_ext, ext;
2170 
2171 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2172 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2173 	new_ext    = !!sm_supported(iommu);
2174 
2175 	/*
2176 	 * The RTT bit can only be changed when translation is disabled,
2177 	 * but disabling translation means to open a window for data
2178 	 * corruption. So bail out and don't copy anything if we would
2179 	 * have to change the bit.
2180 	 */
2181 	if (new_ext != ext)
2182 		return -EINVAL;
2183 
2184 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2185 	if (!iommu->copied_tables)
2186 		return -ENOMEM;
2187 
2188 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2189 	if (!old_rt_phys)
2190 		return -EINVAL;
2191 
2192 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2193 	if (!old_rt)
2194 		return -ENOMEM;
2195 
2196 	/* This is too big for the stack - allocate it from slab */
2197 	ctxt_table_entries = ext ? 512 : 256;
2198 	ret = -ENOMEM;
2199 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2200 	if (!ctxt_tbls)
2201 		goto out_unmap;
2202 
2203 	for (bus = 0; bus < 256; bus++) {
2204 		ret = copy_context_table(iommu, &old_rt[bus],
2205 					 ctxt_tbls, bus, ext);
2206 		if (ret) {
2207 			pr_err("%s: Failed to copy context table for bus %d\n",
2208 				iommu->name, bus);
2209 			continue;
2210 		}
2211 	}
2212 
2213 	spin_lock(&iommu->lock);
2214 
2215 	/* Context tables are copied, now write them to the root_entry table */
2216 	for (bus = 0; bus < 256; bus++) {
2217 		int idx = ext ? bus * 2 : bus;
2218 		u64 val;
2219 
2220 		if (ctxt_tbls[idx]) {
2221 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2222 			iommu->root_entry[bus].lo = val;
2223 		}
2224 
2225 		if (!ext || !ctxt_tbls[idx + 1])
2226 			continue;
2227 
2228 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2229 		iommu->root_entry[bus].hi = val;
2230 	}
2231 
2232 	spin_unlock(&iommu->lock);
2233 
2234 	kfree(ctxt_tbls);
2235 
2236 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2237 
2238 	ret = 0;
2239 
2240 out_unmap:
2241 	memunmap(old_rt);
2242 
2243 	return ret;
2244 }
2245 
/*
 * Boot-time initialisation of all DMAR units: audit capabilities, set up
 * invalidation, domain ids and root tables (copying tables left by a
 * previous kernel when translation is already on in a kdump kernel),
 * program the root entries and finally set up page request queues and
 * fault interrupts.
 *
 * NOTE(review): dmar_global_lock is dropped and re-taken for writing
 * around intel_svm_enable_prq(), so the caller presumably holds it for
 * writing on entry - confirm against the caller.
 *
 * Return: 0 on success. On failure every active IOMMU is disabled and
 * freed and the error code of the failing step is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		/* Ignored units only get their translation switched off. */
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Pre-enabled translation is only kept in a kdump kernel. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

	check_tylersburg_isoch();

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Undo everything for all active units, not only the failing one. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}
2385 
/*
 * Mark DRHD units that need no remapping as ignored: units whose device
 * scope holds no active device, and - when disable_igfx_iommu is set -
 * units that serve graphics devices only. Units of the second kind are
 * also flagged as gfx_dedicated.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			/*
			 * Stop at the first active device; presumably i only
			 * reaches devices_cnt when the scope has none.
			 */
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Stop at the first device that is not a PCI graphics device. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		/* The loop ended early, so a non-graphics device is in scope. */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (disable_igfx_iommu)
			drhd->ignored = 1;
	}
}
2421 
2422 #ifdef CONFIG_SUSPEND
2423 static int init_iommu_hw(void)
2424 {
2425 	struct dmar_drhd_unit *drhd;
2426 	struct intel_iommu *iommu = NULL;
2427 	int ret;
2428 
2429 	for_each_active_iommu(iommu, drhd) {
2430 		if (iommu->qi) {
2431 			ret = dmar_reenable_qi(iommu);
2432 			if (ret)
2433 				return ret;
2434 		}
2435 	}
2436 
2437 	for_each_iommu(iommu, drhd) {
2438 		if (drhd->ignored) {
2439 			/*
2440 			 * we always have to disable PMRs or DMA may fail on
2441 			 * this device
2442 			 */
2443 			if (force_on)
2444 				iommu_disable_protect_mem_regions(iommu);
2445 			continue;
2446 		}
2447 
2448 		iommu_flush_write_buffer(iommu);
2449 		iommu_set_root_entry(iommu);
2450 		iommu_enable_translation(iommu);
2451 		iommu_disable_protect_mem_regions(iommu);
2452 	}
2453 
2454 	return 0;
2455 }
2456 
2457 static void iommu_flush_all(void)
2458 {
2459 	struct dmar_drhd_unit *drhd;
2460 	struct intel_iommu *iommu;
2461 
2462 	for_each_active_iommu(iommu, drhd) {
2463 		iommu->flush.flush_context(iommu, 0, 0, 0,
2464 					   DMA_CCMD_GLOBAL_INVL);
2465 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2466 					 DMA_TLB_GLOBAL_FLUSH);
2467 	}
2468 }
2469 
2470 static int iommu_suspend(void)
2471 {
2472 	struct dmar_drhd_unit *drhd;
2473 	struct intel_iommu *iommu = NULL;
2474 	unsigned long flag;
2475 
2476 	iommu_flush_all();
2477 
2478 	for_each_active_iommu(iommu, drhd) {
2479 		iommu_disable_translation(iommu);
2480 
2481 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2482 
2483 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2484 			readl(iommu->reg + DMAR_FECTL_REG);
2485 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2486 			readl(iommu->reg + DMAR_FEDATA_REG);
2487 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2488 			readl(iommu->reg + DMAR_FEADDR_REG);
2489 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2490 			readl(iommu->reg + DMAR_FEUADDR_REG);
2491 
2492 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2493 	}
2494 	return 0;
2495 }
2496 
2497 static void iommu_resume(void)
2498 {
2499 	struct dmar_drhd_unit *drhd;
2500 	struct intel_iommu *iommu = NULL;
2501 	unsigned long flag;
2502 
2503 	if (init_iommu_hw()) {
2504 		if (force_on)
2505 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2506 		else
2507 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2508 		return;
2509 	}
2510 
2511 	for_each_active_iommu(iommu, drhd) {
2512 
2513 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2514 
2515 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2516 			iommu->reg + DMAR_FECTL_REG);
2517 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2518 			iommu->reg + DMAR_FEDATA_REG);
2519 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2520 			iommu->reg + DMAR_FEADDR_REG);
2521 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2522 			iommu->reg + DMAR_FEUADDR_REG);
2523 
2524 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2525 	}
2526 }
2527 
/* Suspend/resume hooks, registered by init_iommu_pm_ops(). */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
2532 
/* Register the IOMMU suspend/resume callbacks as syscore ops. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2537 
#else
/* No-op stub used when CONFIG_SUSPEND is not set. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2541 
2542 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2543 {
2544 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2545 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2546 	    rmrr->end_address <= rmrr->base_address ||
2547 	    arch_rmrr_sanity_check(rmrr))
2548 		return -EINVAL;
2549 
2550 	return 0;
2551 }
2552 
2553 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2554 {
2555 	struct acpi_dmar_reserved_memory *rmrr;
2556 	struct dmar_rmrr_unit *rmrru;
2557 
2558 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2559 	if (rmrr_sanity_check(rmrr)) {
2560 		pr_warn(FW_BUG
2561 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2562 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2563 			   rmrr->base_address, rmrr->end_address,
2564 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2565 			   dmi_get_system_info(DMI_BIOS_VERSION),
2566 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2567 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2568 	}
2569 
2570 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2571 	if (!rmrru)
2572 		goto out;
2573 
2574 	rmrru->hdr = header;
2575 
2576 	rmrru->base_address = rmrr->base_address;
2577 	rmrru->end_address = rmrr->end_address;
2578 
2579 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2580 				((void *)rmrr) + rmrr->header.length,
2581 				&rmrru->devices_cnt);
2582 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2583 		goto free_rmrru;
2584 
2585 	list_add(&rmrru->list, &dmar_rmrr_units);
2586 
2587 	return 0;
2588 free_rmrru:
2589 	kfree(rmrru);
2590 out:
2591 	return -ENOMEM;
2592 }
2593 
2594 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2595 {
2596 	struct dmar_atsr_unit *atsru;
2597 	struct acpi_dmar_atsr *tmp;
2598 
2599 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2600 				dmar_rcu_check()) {
2601 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2602 		if (atsr->segment != tmp->segment)
2603 			continue;
2604 		if (atsr->header.length != tmp->header.length)
2605 			continue;
2606 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2607 			return atsru;
2608 	}
2609 
2610 	return NULL;
2611 }
2612 
2613 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2614 {
2615 	struct acpi_dmar_atsr *atsr;
2616 	struct dmar_atsr_unit *atsru;
2617 
2618 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2619 		return 0;
2620 
2621 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2622 	atsru = dmar_find_atsr(atsr);
2623 	if (atsru)
2624 		return 0;
2625 
2626 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2627 	if (!atsru)
2628 		return -ENOMEM;
2629 
2630 	/*
2631 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2632 	 * copy the memory content because the memory buffer will be freed
2633 	 * on return.
2634 	 */
2635 	atsru->hdr = (void *)(atsru + 1);
2636 	memcpy(atsru->hdr, hdr, hdr->length);
2637 	atsru->include_all = atsr->flags & 0x1;
2638 	if (!atsru->include_all) {
2639 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2640 				(void *)atsr + atsr->header.length,
2641 				&atsru->devices_cnt);
2642 		if (atsru->devices_cnt && atsru->devices == NULL) {
2643 			kfree(atsru);
2644 			return -ENOMEM;
2645 		}
2646 	}
2647 
2648 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2649 
2650 	return 0;
2651 }
2652 
/* Release the device scope array of @atsru and then the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
2658 
2659 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2660 {
2661 	struct acpi_dmar_atsr *atsr;
2662 	struct dmar_atsr_unit *atsru;
2663 
2664 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2665 	atsru = dmar_find_atsr(atsr);
2666 	if (atsru) {
2667 		list_del_rcu(&atsru->list);
2668 		synchronize_rcu();
2669 		intel_iommu_free_atsr(atsru);
2670 	}
2671 
2672 	return 0;
2673 }
2674 
2675 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2676 {
2677 	int i;
2678 	struct device *dev;
2679 	struct acpi_dmar_atsr *atsr;
2680 	struct dmar_atsr_unit *atsru;
2681 
2682 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2683 	atsru = dmar_find_atsr(atsr);
2684 	if (!atsru)
2685 		return 0;
2686 
2687 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2688 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2689 					  i, dev)
2690 			return -EBUSY;
2691 	}
2692 
2693 	return 0;
2694 }
2695 
2696 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2697 {
2698 	struct dmar_satc_unit *satcu;
2699 	struct acpi_dmar_satc *tmp;
2700 
2701 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2702 				dmar_rcu_check()) {
2703 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2704 		if (satc->segment != tmp->segment)
2705 			continue;
2706 		if (satc->header.length != tmp->header.length)
2707 			continue;
2708 		if (memcmp(satc, tmp, satc->header.length) == 0)
2709 			return satcu;
2710 	}
2711 
2712 	return NULL;
2713 }
2714 
2715 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2716 {
2717 	struct acpi_dmar_satc *satc;
2718 	struct dmar_satc_unit *satcu;
2719 
2720 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2721 		return 0;
2722 
2723 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2724 	satcu = dmar_find_satc(satc);
2725 	if (satcu)
2726 		return 0;
2727 
2728 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2729 	if (!satcu)
2730 		return -ENOMEM;
2731 
2732 	satcu->hdr = (void *)(satcu + 1);
2733 	memcpy(satcu->hdr, hdr, hdr->length);
2734 	satcu->atc_required = satc->flags & 0x1;
2735 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2736 					      (void *)satc + satc->header.length,
2737 					      &satcu->devices_cnt);
2738 	if (satcu->devices_cnt && !satcu->devices) {
2739 		kfree(satcu);
2740 		return -ENOMEM;
2741 	}
2742 	list_add_rcu(&satcu->list, &dmar_satc_units);
2743 
2744 	return 0;
2745 }
2746 
2747 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2748 {
2749 	int sp, ret;
2750 	struct intel_iommu *iommu = dmaru->iommu;
2751 
2752 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2753 	if (ret)
2754 		goto out;
2755 
2756 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2757 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2758 		pr_warn("%s: Doesn't support large page.\n",
2759 			iommu->name);
2760 		return -ENXIO;
2761 	}
2762 
2763 	/*
2764 	 * Disable translation if already enabled prior to OS handover.
2765 	 */
2766 	if (iommu->gcmd & DMA_GCMD_TE)
2767 		iommu_disable_translation(iommu);
2768 
2769 	ret = iommu_init_domains(iommu);
2770 	if (ret == 0)
2771 		ret = iommu_alloc_root_entry(iommu);
2772 	if (ret)
2773 		goto out;
2774 
2775 	intel_svm_check(iommu);
2776 
2777 	if (dmaru->ignored) {
2778 		/*
2779 		 * we always have to disable PMRs or DMA may fail on this device
2780 		 */
2781 		if (force_on)
2782 			iommu_disable_protect_mem_regions(iommu);
2783 		return 0;
2784 	}
2785 
2786 	intel_iommu_init_qi(iommu);
2787 	iommu_flush_write_buffer(iommu);
2788 
2789 #ifdef CONFIG_INTEL_IOMMU_SVM
2790 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2791 		ret = intel_svm_enable_prq(iommu);
2792 		if (ret)
2793 			goto disable_iommu;
2794 	}
2795 #endif
2796 	ret = dmar_set_interrupt(iommu);
2797 	if (ret)
2798 		goto disable_iommu;
2799 
2800 	iommu_set_root_entry(iommu);
2801 	iommu_enable_translation(iommu);
2802 
2803 	iommu_disable_protect_mem_regions(iommu);
2804 	return 0;
2805 
2806 disable_iommu:
2807 	disable_dmar_iommu(iommu);
2808 out:
2809 	free_dmar_iommu(iommu);
2810 	return ret;
2811 }
2812 
2813 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2814 {
2815 	int ret = 0;
2816 	struct intel_iommu *iommu = dmaru->iommu;
2817 
2818 	if (!intel_iommu_enabled)
2819 		return 0;
2820 	if (iommu == NULL)
2821 		return -EINVAL;
2822 
2823 	if (insert) {
2824 		ret = intel_iommu_add(dmaru);
2825 	} else {
2826 		disable_dmar_iommu(iommu);
2827 		free_dmar_iommu(iommu);
2828 	}
2829 
2830 	return ret;
2831 }
2832 
2833 static void intel_iommu_free_dmars(void)
2834 {
2835 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2836 	struct dmar_atsr_unit *atsru, *atsr_n;
2837 	struct dmar_satc_unit *satcu, *satc_n;
2838 
2839 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2840 		list_del(&rmrru->list);
2841 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2842 		kfree(rmrru);
2843 	}
2844 
2845 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2846 		list_del(&atsru->list);
2847 		intel_iommu_free_atsr(atsru);
2848 	}
2849 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2850 		list_del(&satcu->list);
2851 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2852 		kfree(satcu);
2853 	}
2854 }
2855 
2856 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2857 {
2858 	struct dmar_satc_unit *satcu;
2859 	struct acpi_dmar_satc *satc;
2860 	struct device *tmp;
2861 	int i;
2862 
2863 	dev = pci_physfn(dev);
2864 	rcu_read_lock();
2865 
2866 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2867 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2868 		if (satc->segment != pci_domain_nr(dev->bus))
2869 			continue;
2870 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2871 			if (to_pci_dev(tmp) == dev)
2872 				goto out;
2873 	}
2874 	satcu = NULL;
2875 out:
2876 	rcu_read_unlock();
2877 	return satcu;
2878 }
2879 
2880 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2881 {
2882 	int i, ret = 1;
2883 	struct pci_bus *bus;
2884 	struct pci_dev *bridge = NULL;
2885 	struct device *tmp;
2886 	struct acpi_dmar_atsr *atsr;
2887 	struct dmar_atsr_unit *atsru;
2888 	struct dmar_satc_unit *satcu;
2889 
2890 	dev = pci_physfn(dev);
2891 	satcu = dmar_find_matched_satc_unit(dev);
2892 	if (satcu)
2893 		/*
2894 		 * This device supports ATS as it is in SATC table.
2895 		 * When IOMMU is in legacy mode, enabling ATS is done
2896 		 * automatically by HW for the device that requires
2897 		 * ATS, hence OS should not enable this device ATS
2898 		 * to avoid duplicated TLB invalidation.
2899 		 */
2900 		return !(satcu->atc_required && !sm_supported(iommu));
2901 
2902 	for (bus = dev->bus; bus; bus = bus->parent) {
2903 		bridge = bus->self;
2904 		/* If it's an integrated device, allow ATS */
2905 		if (!bridge)
2906 			return 1;
2907 		/* Connected via non-PCIe: no ATS */
2908 		if (!pci_is_pcie(bridge) ||
2909 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2910 			return 0;
2911 		/* If we found the root port, look it up in the ATSR */
2912 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2913 			break;
2914 	}
2915 
2916 	rcu_read_lock();
2917 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2918 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2919 		if (atsr->segment != pci_domain_nr(dev->bus))
2920 			continue;
2921 
2922 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2923 			if (tmp == &bridge->dev)
2924 				goto out;
2925 
2926 		if (atsru->include_all)
2927 			goto out;
2928 	}
2929 	ret = 0;
2930 out:
2931 	rcu_read_unlock();
2932 
2933 	return ret;
2934 }
2935 
/*
 * Update the device scope arrays of the RMRR, ATSR and SATC units in
 * response to a PCI bus notification: BUS_NOTIFY_ADD_DEVICE inserts the
 * device via dmar_insert_dev_scope(), BUS_NOTIFY_REMOVED_DEVICE removes it
 * via dmar_remove_dev_scope(). Other events are ignored.
 *
 * Return: 0 on success or when nothing had to be done, otherwise the
 * negative value returned by dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;
	struct acpi_dmar_satc *satc;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	/* RMRR units: every unit is visited, a positive result does not stop the walk. */
	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	/* ATSR units: the walk stops at the first unit that reports a non-zero result. */
	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		/* Units with the include-all flag are not updated here. */
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}
	/* SATC units: same early-stop behaviour as the ATSR walk. */
	list_for_each_entry(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
					(void *)satc + satc->header.length,
					satc->segment, satcu->devices,
					satcu->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, satc->segment,
					satcu->devices, satcu->devices_cnt))
				break;
		}
	}

	return 0;
}
3005 
3006 static void intel_disable_iommus(void)
3007 {
3008 	struct intel_iommu *iommu = NULL;
3009 	struct dmar_drhd_unit *drhd;
3010 
3011 	for_each_iommu(iommu, drhd)
3012 		iommu_disable_translation(iommu);
3013 }
3014 
3015 void intel_iommu_shutdown(void)
3016 {
3017 	struct dmar_drhd_unit *drhd;
3018 	struct intel_iommu *iommu = NULL;
3019 
3020 	if (no_iommu || dmar_disabled)
3021 		return;
3022 
3023 	down_write(&dmar_global_lock);
3024 
3025 	/* Disable PMRs explicitly here. */
3026 	for_each_iommu(iommu, drhd)
3027 		iommu_disable_protect_mem_regions(iommu);
3028 
3029 	/* Make sure the IOMMUs are switched off */
3030 	intel_disable_iommus();
3031 
3032 	up_write(&dmar_global_lock);
3033 }
3034 
3035 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3036 {
3037 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3038 
3039 	return container_of(iommu_dev, struct intel_iommu, iommu);
3040 }
3041 
3042 static ssize_t version_show(struct device *dev,
3043 			    struct device_attribute *attr, char *buf)
3044 {
3045 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3046 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3047 	return sysfs_emit(buf, "%d:%d\n",
3048 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3049 }
3050 static DEVICE_ATTR_RO(version);
3051 
3052 static ssize_t address_show(struct device *dev,
3053 			    struct device_attribute *attr, char *buf)
3054 {
3055 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3056 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3057 }
3058 static DEVICE_ATTR_RO(address);
3059 
3060 static ssize_t cap_show(struct device *dev,
3061 			struct device_attribute *attr, char *buf)
3062 {
3063 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3064 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3065 }
3066 static DEVICE_ATTR_RO(cap);
3067 
3068 static ssize_t ecap_show(struct device *dev,
3069 			 struct device_attribute *attr, char *buf)
3070 {
3071 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3072 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3073 }
3074 static DEVICE_ATTR_RO(ecap);
3075 
3076 static ssize_t domains_supported_show(struct device *dev,
3077 				      struct device_attribute *attr, char *buf)
3078 {
3079 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3080 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3081 }
3082 static DEVICE_ATTR_RO(domains_supported);
3083 
3084 static ssize_t domains_used_show(struct device *dev,
3085 				 struct device_attribute *attr, char *buf)
3086 {
3087 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3088 	return sysfs_emit(buf, "%d\n",
3089 			  bitmap_weight(iommu->domain_ids,
3090 					cap_ndoms(iommu->cap)));
3091 }
3092 static DEVICE_ATTR_RO(domains_used);
3093 
/* NULL-terminated list of the read-only sysfs attributes defined above. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3103 
/* Attribute group named "intel-iommu" holding intel_iommu_attrs. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3108 
/* NULL-terminated list of attribute groups; non-static, so usable from other files. */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3113 
3114 static bool has_external_pci(void)
3115 {
3116 	struct pci_dev *pdev = NULL;
3117 
3118 	for_each_pci_dev(pdev)
3119 		if (pdev->external_facing) {
3120 			pci_dev_put(pdev);
3121 			return true;
3122 		}
3123 
3124 	return false;
3125 }
3126 
3127 static int __init platform_optin_force_iommu(void)
3128 {
3129 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3130 		return 0;
3131 
3132 	if (no_iommu || dmar_disabled)
3133 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3134 
3135 	/*
3136 	 * If Intel-IOMMU is disabled by default, we will apply identity
3137 	 * map for all devices except those marked as being untrusted.
3138 	 */
3139 	if (dmar_disabled)
3140 		iommu_set_default_passthrough(false);
3141 
3142 	dmar_disabled = 0;
3143 	no_iommu = 0;
3144 
3145 	return 1;
3146 }
3147 
/*
 * Call iommu_probe_device() for every physical node of each ACPI bus
 * device found in the device scope of an active IOMMU.
 *
 * Return: 0 on success, otherwise the first non-zero value returned by
 * iommu_probe_device().
 */
static int __init probe_acpi_namespace_devices(void)
{
	struct dmar_drhd_unit *drhd;
	/* To avoid a -Wunused-but-set-variable warning. */
	struct intel_iommu *iommu __maybe_unused;
	struct device *dev;
	int i, ret = 0;

	for_each_active_iommu(iommu, drhd) {
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			/* Only devices on the ACPI bus are handled here. */
			if (dev->bus != &acpi_bus_type)
				continue;

			adev = to_acpi_device(dev);
			/* The physical node list is walked under its lock. */
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn,
					    &adev->physical_node_list, node) {
				ret = iommu_probe_device(pn->dev);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);

			/* Report the failure only after the lock is released. */
			if (ret)
				return ret;
		}
	}

	return 0;
}
3182 
3183 static __init int tboot_force_iommu(void)
3184 {
3185 	if (!tboot_enabled())
3186 		return 0;
3187 
3188 	if (no_iommu || dmar_disabled)
3189 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3190 
3191 	dmar_disabled = 0;
3192 	no_iommu = 0;
3193 
3194 	return 1;
3195 }
3196 
/*
 * Boot-time initialization of the Intel IOMMU driver.
 *
 * Parses the DMAR table and device scopes, initializes the remapping units
 * via init_dmars(), registers every active IOMMU with sysfs, the IOMMU core
 * and the perfmon layer, probes ACPI namespace devices, and finally enables
 * translation.
 *
 * Returns 0 on success. On failure the DMAR state is released through
 * intel_iommu_free_dmars() and a non-zero value is returned: -ENODEV for the
 * early exits, otherwise whatever init_dmars() returned. If force_on is set,
 * failures of table parsing, scope init, or init_dmars() panic instead of
 * returning.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		/* ret is still -ENODEV here: the driver is reported as absent. */
		goto out_free_dmar;
	}

	/* Purely informational: note which optional DMAR structures are absent. */
	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	/* The write lock is dropped here; the rest runs under the read lock. */
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		/* NOTE(review): the results of both registration calls are ignored. */
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}

	/* A probe failure is reported but does not abort initialization. */
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

	/* Every jump to this label is made with dmar_global_lock held for write. */
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3326 
3327 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3328 {
3329 	struct device_domain_info *info = opaque;
3330 
3331 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3332 	return 0;
3333 }
3334 
3335 /*
3336  * NB - intel-iommu lacks any sort of reference counting for the users of
3337  * dependent devices.  If multiple endpoints have intersecting dependent
3338  * devices, unbinding the driver from any one of them will possibly leave
3339  * the others unable to operate.
3340  */
3341 static void domain_context_clear(struct device_domain_info *info)
3342 {
3343 	if (!dev_is_pci(info->dev))
3344 		domain_context_clear_one(info, info->bus, info->devfn);
3345 
3346 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3347 			       &domain_context_clear_one_cb, info);
3348 }
3349 
3350 /*
3351  * Clear the page table pointer in context or pasid table entries so that
3352  * all DMA requests without PASID from the device are blocked. If the page
3353  * table has been set, clean up the data structures.
3354  */
3355 void device_block_translation(struct device *dev)
3356 {
3357 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3358 	struct intel_iommu *iommu = info->iommu;
3359 	unsigned long flags;
3360 
3361 	iommu_disable_pci_caps(info);
3362 	if (!dev_is_real_dma_subdevice(dev)) {
3363 		if (sm_supported(iommu))
3364 			intel_pasid_tear_down_entry(iommu, dev,
3365 						    IOMMU_NO_PASID, false);
3366 		else
3367 			domain_context_clear(info);
3368 	}
3369 
3370 	if (!info->domain)
3371 		return;
3372 
3373 	spin_lock_irqsave(&info->domain->lock, flags);
3374 	list_del(&info->link);
3375 	spin_unlock_irqrestore(&info->domain->lock, flags);
3376 
3377 	cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3378 	domain_detach_iommu(info->domain, iommu);
3379 	info->domain = NULL;
3380 }
3381 
3382 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3383 {
3384 	int adjust_width;
3385 
3386 	/* calculate AGAW */
3387 	domain->gaw = guest_width;
3388 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3389 	domain->agaw = width_to_agaw(adjust_width);
3390 
3391 	domain->iommu_coherency = false;
3392 	domain->iommu_superpage = 0;
3393 	domain->max_addr = 0;
3394 
3395 	/* always allocate the top pgd */
3396 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3397 	if (!domain->pgd)
3398 		return -ENOMEM;
3399 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3400 	return 0;
3401 }
3402 
/*
 * attach_dev handler of the blocking domain: attaching simply blocks
 * translation for @dev. @domain is unused. Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
3409 
3410 static struct iommu_domain blocking_domain = {
3411 	.type = IOMMU_DOMAIN_BLOCKED,
3412 	.ops = &(const struct iommu_domain_ops) {
3413 		.attach_dev	= blocking_domain_attach_dev,
3414 	}
3415 };
3416 
3417 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3418 {
3419 	if (!intel_iommu_superpage)
3420 		return 0;
3421 
3422 	if (first_stage)
3423 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3424 
3425 	return fls(cap_super_page_val(iommu->cap));
3426 }
3427 
3428 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3429 {
3430 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3431 	struct intel_iommu *iommu = info->iommu;
3432 	struct dmar_domain *domain;
3433 	int addr_width;
3434 
3435 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3436 	if (!domain)
3437 		return ERR_PTR(-ENOMEM);
3438 
3439 	INIT_LIST_HEAD(&domain->devices);
3440 	INIT_LIST_HEAD(&domain->dev_pasids);
3441 	INIT_LIST_HEAD(&domain->cache_tags);
3442 	spin_lock_init(&domain->lock);
3443 	spin_lock_init(&domain->cache_lock);
3444 	xa_init(&domain->iommu_array);
3445 
3446 	domain->nid = dev_to_node(dev);
3447 	domain->use_first_level = first_stage;
3448 
3449 	/* calculate the address width */
3450 	addr_width = agaw_to_width(iommu->agaw);
3451 	if (addr_width > cap_mgaw(iommu->cap))
3452 		addr_width = cap_mgaw(iommu->cap);
3453 	domain->gaw = addr_width;
3454 	domain->agaw = iommu->agaw;
3455 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3456 
3457 	/* iommu memory access coherency */
3458 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3459 
3460 	/* pagesize bitmap */
3461 	domain->domain.pgsize_bitmap = SZ_4K;
3462 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3463 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3464 
3465 	/*
3466 	 * IOVA aperture: First-level translation restricts the input-address
3467 	 * to a canonical address (i.e., address bits 63:N have the same value
3468 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3469 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3470 	 */
3471 	domain->domain.geometry.force_aperture = true;
3472 	domain->domain.geometry.aperture_start = 0;
3473 	if (first_stage)
3474 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3475 	else
3476 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3477 
3478 	/* always allocate the top pgd */
3479 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3480 	if (!domain->pgd) {
3481 		kfree(domain);
3482 		return ERR_PTR(-ENOMEM);
3483 	}
3484 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3485 
3486 	return domain;
3487 }
3488 
3489 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3490 {
3491 	struct dmar_domain *dmar_domain;
3492 	struct iommu_domain *domain;
3493 
3494 	switch (type) {
3495 	case IOMMU_DOMAIN_DMA:
3496 	case IOMMU_DOMAIN_UNMANAGED:
3497 		dmar_domain = alloc_domain(type);
3498 		if (!dmar_domain) {
3499 			pr_err("Can't allocate dmar_domain\n");
3500 			return NULL;
3501 		}
3502 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3503 			pr_err("Domain initialization failed\n");
3504 			domain_exit(dmar_domain);
3505 			return NULL;
3506 		}
3507 
3508 		domain = &dmar_domain->domain;
3509 		domain->geometry.aperture_start = 0;
3510 		domain->geometry.aperture_end   =
3511 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3512 		domain->geometry.force_aperture = true;
3513 
3514 		return domain;
3515 	default:
3516 		return NULL;
3517 	}
3518 
3519 	return NULL;
3520 }
3521 
3522 static struct iommu_domain *
3523 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3524 			      struct iommu_domain *parent,
3525 			      const struct iommu_user_data *user_data)
3526 {
3527 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3528 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3529 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3530 	struct intel_iommu *iommu = info->iommu;
3531 	struct dmar_domain *dmar_domain;
3532 	struct iommu_domain *domain;
3533 
3534 	/* Must be NESTING domain */
3535 	if (parent) {
3536 		if (!nested_supported(iommu) || flags)
3537 			return ERR_PTR(-EOPNOTSUPP);
3538 		return intel_nested_domain_alloc(parent, user_data);
3539 	}
3540 
3541 	if (flags &
3542 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3543 		return ERR_PTR(-EOPNOTSUPP);
3544 	if (nested_parent && !nested_supported(iommu))
3545 		return ERR_PTR(-EOPNOTSUPP);
3546 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3547 		return ERR_PTR(-EOPNOTSUPP);
3548 
3549 	/* Do not use first stage for user domain translation. */
3550 	dmar_domain = paging_domain_alloc(dev, false);
3551 	if (IS_ERR(dmar_domain))
3552 		return ERR_CAST(dmar_domain);
3553 	domain = &dmar_domain->domain;
3554 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3555 	domain->owner = &intel_iommu_ops;
3556 	domain->ops = intel_iommu_ops.default_domain_ops;
3557 
3558 	if (nested_parent) {
3559 		dmar_domain->nested_parent = true;
3560 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3561 		spin_lock_init(&dmar_domain->s1_lock);
3562 	}
3563 
3564 	if (dirty_tracking) {
3565 		if (dmar_domain->use_first_level) {
3566 			iommu_domain_free(domain);
3567 			return ERR_PTR(-EOPNOTSUPP);
3568 		}
3569 		domain->dirty_ops = &intel_dirty_ops;
3570 	}
3571 
3572 	return domain;
3573 }
3574 
3575 static void intel_iommu_domain_free(struct iommu_domain *domain)
3576 {
3577 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3578 
3579 	WARN_ON(dmar_domain->nested_parent &&
3580 		!list_empty(&dmar_domain->s1_domains));
3581 	domain_exit(dmar_domain);
3582 }
3583 
3584 int prepare_domain_attach_device(struct iommu_domain *domain,
3585 				 struct device *dev)
3586 {
3587 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3588 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3589 	struct intel_iommu *iommu = info->iommu;
3590 	int addr_width;
3591 
3592 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3593 		return -EINVAL;
3594 
3595 	if (domain->dirty_ops && !ssads_supported(iommu))
3596 		return -EINVAL;
3597 
3598 	/* check if this iommu agaw is sufficient for max mapped address */
3599 	addr_width = agaw_to_width(iommu->agaw);
3600 	if (addr_width > cap_mgaw(iommu->cap))
3601 		addr_width = cap_mgaw(iommu->cap);
3602 
3603 	if (dmar_domain->max_addr > (1LL << addr_width))
3604 		return -EINVAL;
3605 	dmar_domain->gaw = addr_width;
3606 
3607 	/*
3608 	 * Knock out extra levels of page tables if necessary
3609 	 */
3610 	while (iommu->agaw < dmar_domain->agaw) {
3611 		struct dma_pte *pte;
3612 
3613 		pte = dmar_domain->pgd;
3614 		if (dma_pte_present(pte)) {
3615 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3616 			iommu_free_page(pte);
3617 		}
3618 		dmar_domain->agaw--;
3619 	}
3620 
3621 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3622 	    context_copied(iommu, info->bus, info->devfn))
3623 		return intel_pasid_setup_sm_context(dev);
3624 
3625 	return 0;
3626 }
3627 
/*
 * Attach @dev to @domain: block its current translation, validate/adapt the
 * domain for the device, then perform the attach.
 *
 * Returns 0 on success or the error from the preparation or attach step.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int err;

	device_block_translation(dev);

	err = prepare_domain_attach_device(domain, dev);
	if (!err)
		err = dmar_domain_attach_device(to_dmar_domain(domain), dev);

	return err;
}
3641 
3642 static int intel_iommu_map(struct iommu_domain *domain,
3643 			   unsigned long iova, phys_addr_t hpa,
3644 			   size_t size, int iommu_prot, gfp_t gfp)
3645 {
3646 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3647 	u64 max_addr;
3648 	int prot = 0;
3649 
3650 	if (iommu_prot & IOMMU_READ)
3651 		prot |= DMA_PTE_READ;
3652 	if (iommu_prot & IOMMU_WRITE)
3653 		prot |= DMA_PTE_WRITE;
3654 	if (dmar_domain->set_pte_snp)
3655 		prot |= DMA_PTE_SNP;
3656 
3657 	max_addr = iova + size;
3658 	if (dmar_domain->max_addr < max_addr) {
3659 		u64 end;
3660 
3661 		/* check if minimum agaw is sufficient for mapped address */
3662 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3663 		if (end < max_addr) {
3664 			pr_err("%s: iommu width (%d) is not "
3665 			       "sufficient for the mapped address (%llx)\n",
3666 			       __func__, dmar_domain->gaw, max_addr);
3667 			return -EFAULT;
3668 		}
3669 		dmar_domain->max_addr = max_addr;
3670 	}
3671 	/* Round up size to next multiple of PAGE_SIZE, if it and
3672 	   the low bits of hpa would take us onto the next page */
3673 	size = aligned_nrpages(hpa, size);
3674 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3675 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3676 }
3677 
3678 static int intel_iommu_map_pages(struct iommu_domain *domain,
3679 				 unsigned long iova, phys_addr_t paddr,
3680 				 size_t pgsize, size_t pgcount,
3681 				 int prot, gfp_t gfp, size_t *mapped)
3682 {
3683 	unsigned long pgshift = __ffs(pgsize);
3684 	size_t size = pgcount << pgshift;
3685 	int ret;
3686 
3687 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3688 		return -EINVAL;
3689 
3690 	if (!IS_ALIGNED(iova | paddr, pgsize))
3691 		return -EINVAL;
3692 
3693 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3694 	if (!ret && mapped)
3695 		*mapped = size;
3696 
3697 	return ret;
3698 }
3699 
3700 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3701 				unsigned long iova, size_t size,
3702 				struct iommu_iotlb_gather *gather)
3703 {
3704 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3705 	unsigned long start_pfn, last_pfn;
3706 	int level = 0;
3707 
3708 	/* Cope with horrid API which requires us to unmap more than the
3709 	   size argument if it happens to be a large-page mapping. */
3710 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3711 				     &level, GFP_ATOMIC)))
3712 		return 0;
3713 
3714 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3715 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3716 
3717 	start_pfn = iova >> VTD_PAGE_SHIFT;
3718 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3719 
3720 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3721 
3722 	if (dmar_domain->max_addr == iova + size)
3723 		dmar_domain->max_addr = iova;
3724 
3725 	/*
3726 	 * We do not use page-selective IOTLB invalidation in flush queue,
3727 	 * so there is no need to track page and sync iotlb.
3728 	 */
3729 	if (!iommu_iotlb_gather_queued(gather))
3730 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3731 
3732 	return size;
3733 }
3734 
3735 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3736 				      unsigned long iova,
3737 				      size_t pgsize, size_t pgcount,
3738 				      struct iommu_iotlb_gather *gather)
3739 {
3740 	unsigned long pgshift = __ffs(pgsize);
3741 	size_t size = pgcount << pgshift;
3742 
3743 	return intel_iommu_unmap(domain, iova, size, gather);
3744 }
3745 
3746 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3747 				 struct iommu_iotlb_gather *gather)
3748 {
3749 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3750 			      gather->end, list_empty(&gather->freelist));
3751 	iommu_put_pages_list(&gather->freelist);
3752 }
3753 
3754 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3755 					    dma_addr_t iova)
3756 {
3757 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3758 	struct dma_pte *pte;
3759 	int level = 0;
3760 	u64 phys = 0;
3761 
3762 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3763 			     GFP_ATOMIC);
3764 	if (pte && dma_pte_present(pte))
3765 		phys = dma_pte_addr(pte) +
3766 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3767 						VTD_PAGE_SHIFT) - 1));
3768 
3769 	return phys;
3770 }
3771 
3772 static bool domain_support_force_snooping(struct dmar_domain *domain)
3773 {
3774 	struct device_domain_info *info;
3775 	bool support = true;
3776 
3777 	assert_spin_locked(&domain->lock);
3778 	list_for_each_entry(info, &domain->devices, link) {
3779 		if (!ecap_sc_support(info->iommu->ecap)) {
3780 			support = false;
3781 			break;
3782 		}
3783 	}
3784 
3785 	return support;
3786 }
3787 
3788 static void domain_set_force_snooping(struct dmar_domain *domain)
3789 {
3790 	struct device_domain_info *info;
3791 
3792 	assert_spin_locked(&domain->lock);
3793 	/*
3794 	 * Second level page table supports per-PTE snoop control. The
3795 	 * iommu_map() interface will handle this by setting SNP bit.
3796 	 */
3797 	if (!domain->use_first_level) {
3798 		domain->set_pte_snp = true;
3799 		return;
3800 	}
3801 
3802 	list_for_each_entry(info, &domain->devices, link)
3803 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3804 						     IOMMU_NO_PASID);
3805 }
3806 
3807 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3808 {
3809 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3810 	unsigned long flags;
3811 
3812 	if (dmar_domain->force_snooping)
3813 		return true;
3814 
3815 	spin_lock_irqsave(&dmar_domain->lock, flags);
3816 	if (!domain_support_force_snooping(dmar_domain) ||
3817 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3818 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3819 		return false;
3820 	}
3821 
3822 	domain_set_force_snooping(dmar_domain);
3823 	dmar_domain->force_snooping = true;
3824 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3825 
3826 	return true;
3827 }
3828 
3829 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3830 {
3831 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3832 
3833 	switch (cap) {
3834 	case IOMMU_CAP_CACHE_COHERENCY:
3835 	case IOMMU_CAP_DEFERRED_FLUSH:
3836 		return true;
3837 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3838 		return dmar_platform_optin();
3839 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3840 		return ecap_sc_support(info->iommu->ecap);
3841 	case IOMMU_CAP_DIRTY_TRACKING:
3842 		return ssads_supported(info->iommu);
3843 	default:
3844 		return false;
3845 	}
3846 }
3847 
3848 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3849 {
3850 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3851 	struct device_domain_info *info;
3852 	struct intel_iommu *iommu;
3853 	u8 bus, devfn;
3854 	int ret;
3855 
3856 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3857 	if (!iommu || !iommu->iommu.ops)
3858 		return ERR_PTR(-ENODEV);
3859 
3860 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3861 	if (!info)
3862 		return ERR_PTR(-ENOMEM);
3863 
3864 	if (dev_is_real_dma_subdevice(dev)) {
3865 		info->bus = pdev->bus->number;
3866 		info->devfn = pdev->devfn;
3867 		info->segment = pci_domain_nr(pdev->bus);
3868 	} else {
3869 		info->bus = bus;
3870 		info->devfn = devfn;
3871 		info->segment = iommu->segment;
3872 	}
3873 
3874 	info->dev = dev;
3875 	info->iommu = iommu;
3876 	if (dev_is_pci(dev)) {
3877 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3878 		    pci_ats_supported(pdev) &&
3879 		    dmar_ats_supported(pdev, iommu)) {
3880 			info->ats_supported = 1;
3881 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3882 
3883 			/*
3884 			 * For IOMMU that supports device IOTLB throttling
3885 			 * (DIT), we assign PFSID to the invalidation desc
3886 			 * of a VF such that IOMMU HW can gauge queue depth
3887 			 * at PF level. If DIT is not set, PFSID will be
3888 			 * treated as reserved, which should be set to 0.
3889 			 */
3890 			if (ecap_dit(iommu->ecap))
3891 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3892 			info->ats_qdep = pci_ats_queue_depth(pdev);
3893 		}
3894 		if (sm_supported(iommu)) {
3895 			if (pasid_supported(iommu)) {
3896 				int features = pci_pasid_features(pdev);
3897 
3898 				if (features >= 0)
3899 					info->pasid_supported = features | 1;
3900 			}
3901 
3902 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3903 			    pci_pri_supported(pdev))
3904 				info->pri_supported = 1;
3905 		}
3906 	}
3907 
3908 	dev_iommu_priv_set(dev, info);
3909 	if (pdev && pci_ats_supported(pdev)) {
3910 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3911 		ret = device_rbtree_insert(iommu, info);
3912 		if (ret)
3913 			goto free;
3914 	}
3915 
3916 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3917 		ret = intel_pasid_alloc_table(dev);
3918 		if (ret) {
3919 			dev_err(dev, "PASID table allocation failed\n");
3920 			goto clear_rbtree;
3921 		}
3922 
3923 		if (!context_copied(iommu, info->bus, info->devfn)) {
3924 			ret = intel_pasid_setup_sm_context(dev);
3925 			if (ret)
3926 				goto free_table;
3927 		}
3928 	}
3929 
3930 	intel_iommu_debugfs_create_dev(info);
3931 
3932 	/*
3933 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3934 	 * device is undefined if you enable PASID support after ATS support.
3935 	 * So always enable PASID support on devices which have it, even if
3936 	 * we can't yet know if we're ever going to use it.
3937 	 */
3938 	if (info->pasid_supported &&
3939 	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3940 		info->pasid_enabled = 1;
3941 
3942 	return &iommu->iommu;
3943 free_table:
3944 	intel_pasid_free_table(dev);
3945 clear_rbtree:
3946 	device_rbtree_remove(info);
3947 free:
3948 	kfree(info);
3949 
3950 	return ERR_PTR(ret);
3951 }
3952 
3953 static void intel_iommu_release_device(struct device *dev)
3954 {
3955 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3956 	struct intel_iommu *iommu = info->iommu;
3957 
3958 	if (info->pasid_enabled) {
3959 		pci_disable_pasid(to_pci_dev(dev));
3960 		info->pasid_enabled = 0;
3961 	}
3962 
3963 	mutex_lock(&iommu->iopf_lock);
3964 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3965 		device_rbtree_remove(info);
3966 	mutex_unlock(&iommu->iopf_lock);
3967 
3968 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3969 	    !context_copied(iommu, info->bus, info->devfn))
3970 		intel_pasid_teardown_sm_context(dev);
3971 
3972 	intel_pasid_free_table(dev);
3973 	intel_iommu_debugfs_remove_dev(info);
3974 	kfree(info);
3975 	set_dma_ops(dev, NULL);
3976 }
3977 
3978 static void intel_iommu_get_resv_regions(struct device *device,
3979 					 struct list_head *head)
3980 {
3981 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3982 	struct iommu_resv_region *reg;
3983 	struct dmar_rmrr_unit *rmrr;
3984 	struct device *i_dev;
3985 	int i;
3986 
3987 	rcu_read_lock();
3988 	for_each_rmrr_units(rmrr) {
3989 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3990 					  i, i_dev) {
3991 			struct iommu_resv_region *resv;
3992 			enum iommu_resv_type type;
3993 			size_t length;
3994 
3995 			if (i_dev != device &&
3996 			    !is_downstream_to_pci_bridge(device, i_dev))
3997 				continue;
3998 
3999 			length = rmrr->end_address - rmrr->base_address + 1;
4000 
4001 			type = device_rmrr_is_relaxable(device) ?
4002 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4003 
4004 			resv = iommu_alloc_resv_region(rmrr->base_address,
4005 						       length, prot, type,
4006 						       GFP_ATOMIC);
4007 			if (!resv)
4008 				break;
4009 
4010 			list_add_tail(&resv->list, head);
4011 		}
4012 	}
4013 	rcu_read_unlock();
4014 
4015 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4016 	if (dev_is_pci(device)) {
4017 		struct pci_dev *pdev = to_pci_dev(device);
4018 
4019 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4020 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4021 					IOMMU_RESV_DIRECT_RELAXABLE,
4022 					GFP_KERNEL);
4023 			if (reg)
4024 				list_add_tail(&reg->list, head);
4025 		}
4026 	}
4027 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4028 
4029 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4030 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4031 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4032 	if (!reg)
4033 		return;
4034 	list_add_tail(&reg->list, head);
4035 }
4036 
/*
 * Pick the IOMMU group for @dev: PCI devices go through pci_device_group(),
 * everything else through generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4043 
4044 static int intel_iommu_enable_sva(struct device *dev)
4045 {
4046 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4047 	struct intel_iommu *iommu;
4048 
4049 	if (!info || dmar_disabled)
4050 		return -EINVAL;
4051 
4052 	iommu = info->iommu;
4053 	if (!iommu)
4054 		return -EINVAL;
4055 
4056 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4057 		return -ENODEV;
4058 
4059 	if (!info->pasid_enabled || !info->ats_enabled)
4060 		return -EINVAL;
4061 
4062 	/*
4063 	 * Devices having device-specific I/O fault handling should not
4064 	 * support PCI/PRI. The IOMMU side has no means to check the
4065 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4066 	 * default that if the device driver enables SVA on a non-PRI
4067 	 * device, it will handle IOPF in its own way.
4068 	 */
4069 	if (!info->pri_supported)
4070 		return 0;
4071 
4072 	/* Devices supporting PRI should have it enabled. */
4073 	if (!info->pri_enabled)
4074 		return -EINVAL;
4075 
4076 	return 0;
4077 }
4078 
4079 static int context_flip_pri(struct device_domain_info *info, bool enable)
4080 {
4081 	struct intel_iommu *iommu = info->iommu;
4082 	u8 bus = info->bus, devfn = info->devfn;
4083 	struct context_entry *context;
4084 	u16 did;
4085 
4086 	spin_lock(&iommu->lock);
4087 	if (context_copied(iommu, bus, devfn)) {
4088 		spin_unlock(&iommu->lock);
4089 		return -EINVAL;
4090 	}
4091 
4092 	context = iommu_context_addr(iommu, bus, devfn, false);
4093 	if (!context || !context_present(context)) {
4094 		spin_unlock(&iommu->lock);
4095 		return -ENODEV;
4096 	}
4097 	did = context_domain_id(context);
4098 
4099 	if (enable)
4100 		context_set_sm_pre(context);
4101 	else
4102 		context_clear_sm_pre(context);
4103 
4104 	if (!ecap_coherent(iommu->ecap))
4105 		clflush_cache_range(context, sizeof(*context));
4106 	intel_context_flush_present(info, context, did, true);
4107 	spin_unlock(&iommu->lock);
4108 
4109 	return 0;
4110 }
4111 
4112 static int intel_iommu_enable_iopf(struct device *dev)
4113 {
4114 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4115 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4116 	struct intel_iommu *iommu;
4117 	int ret;
4118 
4119 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4120 		return -ENODEV;
4121 
4122 	if (info->pri_enabled)
4123 		return -EBUSY;
4124 
4125 	iommu = info->iommu;
4126 	if (!iommu)
4127 		return -EINVAL;
4128 
4129 	/* PASID is required in PRG Response Message. */
4130 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4131 		return -EINVAL;
4132 
4133 	ret = pci_reset_pri(pdev);
4134 	if (ret)
4135 		return ret;
4136 
4137 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4138 	if (ret)
4139 		return ret;
4140 
4141 	ret = context_flip_pri(info, true);
4142 	if (ret)
4143 		goto err_remove_device;
4144 
4145 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4146 	if (ret)
4147 		goto err_clear_pri;
4148 
4149 	info->pri_enabled = 1;
4150 
4151 	return 0;
4152 err_clear_pri:
4153 	context_flip_pri(info, false);
4154 err_remove_device:
4155 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4156 
4157 	return ret;
4158 }
4159 
4160 static int intel_iommu_disable_iopf(struct device *dev)
4161 {
4162 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4163 	struct intel_iommu *iommu = info->iommu;
4164 
4165 	if (!info->pri_enabled)
4166 		return -EINVAL;
4167 
4168 	/* Disable new PRI reception: */
4169 	context_flip_pri(info, false);
4170 
4171 	/*
4172 	 * Remove device from fault queue and acknowledge all outstanding
4173 	 * PRQs to the device:
4174 	 */
4175 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4176 
4177 	/*
4178 	 * PCIe spec states that by clearing PRI enable bit, the Page
4179 	 * Request Interface will not issue new page requests, but has
4180 	 * outstanding page requests that have been transmitted or are
4181 	 * queued for transmission. This is supposed to be called after
4182 	 * the device driver has stopped DMA, all PASIDs have been
4183 	 * unbound and the outstanding PRQs have been drained.
4184 	 */
4185 	pci_disable_pri(to_pci_dev(dev));
4186 	info->pri_enabled = 0;
4187 
4188 	return 0;
4189 }
4190 
4191 static int
4192 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4193 {
4194 	switch (feat) {
4195 	case IOMMU_DEV_FEAT_IOPF:
4196 		return intel_iommu_enable_iopf(dev);
4197 
4198 	case IOMMU_DEV_FEAT_SVA:
4199 		return intel_iommu_enable_sva(dev);
4200 
4201 	default:
4202 		return -ENODEV;
4203 	}
4204 }
4205 
4206 static int
4207 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4208 {
4209 	switch (feat) {
4210 	case IOMMU_DEV_FEAT_IOPF:
4211 		return intel_iommu_disable_iopf(dev);
4212 
4213 	case IOMMU_DEV_FEAT_SVA:
4214 		return 0;
4215 
4216 	default:
4217 		return -ENODEV;
4218 	}
4219 }
4220 
4221 static bool intel_iommu_is_attach_deferred(struct device *dev)
4222 {
4223 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4224 
4225 	return translation_pre_enabled(info->iommu) && !info->domain;
4226 }
4227 
4228 /*
4229  * Check that the device does not live on an external facing PCI port that is
4230  * marked as untrusted. Such devices should not be able to apply quirks and
4231  * thus not be able to bypass the IOMMU restrictions.
4232  */
4233 static bool risky_device(struct pci_dev *pdev)
4234 {
4235 	if (pdev->untrusted) {
4236 		pci_info(pdev,
4237 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4238 			 pdev->vendor, pdev->device);
4239 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4240 		return true;
4241 	}
4242 	return false;
4243 }
4244 
4245 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4246 				      unsigned long iova, size_t size)
4247 {
4248 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4249 
4250 	return 0;
4251 }
4252 
4253 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4254 					 struct iommu_domain *domain)
4255 {
4256 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4257 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4258 	struct intel_iommu *iommu = info->iommu;
4259 	struct dmar_domain *dmar_domain;
4260 	unsigned long flags;
4261 
4262 	if (domain->type == IOMMU_DOMAIN_IDENTITY) {
4263 		intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4264 		return;
4265 	}
4266 
4267 	dmar_domain = to_dmar_domain(domain);
4268 	spin_lock_irqsave(&dmar_domain->lock, flags);
4269 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4270 		if (curr->dev == dev && curr->pasid == pasid) {
4271 			list_del(&curr->link_domain);
4272 			dev_pasid = curr;
4273 			break;
4274 		}
4275 	}
4276 	WARN_ON_ONCE(!dev_pasid);
4277 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4278 
4279 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4280 	domain_detach_iommu(dmar_domain, iommu);
4281 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4282 	kfree(dev_pasid);
4283 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4284 	intel_drain_pasid_prq(dev, pasid);
4285 }
4286 
4287 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4288 				     struct device *dev, ioasid_t pasid)
4289 {
4290 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4291 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4292 	struct intel_iommu *iommu = info->iommu;
4293 	struct dev_pasid_info *dev_pasid;
4294 	unsigned long flags;
4295 	int ret;
4296 
4297 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4298 		return -EOPNOTSUPP;
4299 
4300 	if (domain->dirty_ops)
4301 		return -EINVAL;
4302 
4303 	if (context_copied(iommu, info->bus, info->devfn))
4304 		return -EBUSY;
4305 
4306 	ret = prepare_domain_attach_device(domain, dev);
4307 	if (ret)
4308 		return ret;
4309 
4310 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4311 	if (!dev_pasid)
4312 		return -ENOMEM;
4313 
4314 	ret = domain_attach_iommu(dmar_domain, iommu);
4315 	if (ret)
4316 		goto out_free;
4317 
4318 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4319 	if (ret)
4320 		goto out_detach_iommu;
4321 
4322 	if (dmar_domain->use_first_level)
4323 		ret = domain_setup_first_level(iommu, dmar_domain,
4324 					       dev, pasid);
4325 	else
4326 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4327 						     dev, pasid);
4328 	if (ret)
4329 		goto out_unassign_tag;
4330 
4331 	dev_pasid->dev = dev;
4332 	dev_pasid->pasid = pasid;
4333 	spin_lock_irqsave(&dmar_domain->lock, flags);
4334 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4335 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4336 
4337 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4338 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4339 
4340 	return 0;
4341 out_unassign_tag:
4342 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4343 out_detach_iommu:
4344 	domain_detach_iommu(dmar_domain, iommu);
4345 out_free:
4346 	kfree(dev_pasid);
4347 	return ret;
4348 }
4349 
4350 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4351 {
4352 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4353 	struct intel_iommu *iommu = info->iommu;
4354 	struct iommu_hw_info_vtd *vtd;
4355 
4356 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4357 	if (!vtd)
4358 		return ERR_PTR(-ENOMEM);
4359 
4360 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4361 	vtd->cap_reg = iommu->cap;
4362 	vtd->ecap_reg = iommu->ecap;
4363 	*length = sizeof(*vtd);
4364 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4365 	return vtd;
4366 }
4367 
4368 /*
4369  * Set dirty tracking for the device list of a domain. The caller must
4370  * hold the domain->lock when calling it.
4371  */
4372 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4373 {
4374 	struct device_domain_info *info;
4375 	int ret = 0;
4376 
4377 	list_for_each_entry(info, devices, link) {
4378 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4379 						       IOMMU_NO_PASID, enable);
4380 		if (ret)
4381 			break;
4382 	}
4383 
4384 	return ret;
4385 }
4386 
4387 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4388 					    bool enable)
4389 {
4390 	struct dmar_domain *s1_domain;
4391 	unsigned long flags;
4392 	int ret;
4393 
4394 	spin_lock(&domain->s1_lock);
4395 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4396 		spin_lock_irqsave(&s1_domain->lock, flags);
4397 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4398 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4399 		if (ret)
4400 			goto err_unwind;
4401 	}
4402 	spin_unlock(&domain->s1_lock);
4403 	return 0;
4404 
4405 err_unwind:
4406 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4407 		spin_lock_irqsave(&s1_domain->lock, flags);
4408 		device_set_dirty_tracking(&s1_domain->devices,
4409 					  domain->dirty_tracking);
4410 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4411 	}
4412 	spin_unlock(&domain->s1_lock);
4413 	return ret;
4414 }
4415 
4416 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4417 					  bool enable)
4418 {
4419 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4420 	int ret;
4421 
4422 	spin_lock(&dmar_domain->lock);
4423 	if (dmar_domain->dirty_tracking == enable)
4424 		goto out_unlock;
4425 
4426 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4427 	if (ret)
4428 		goto err_unwind;
4429 
4430 	if (dmar_domain->nested_parent) {
4431 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4432 		if (ret)
4433 			goto err_unwind;
4434 	}
4435 
4436 	dmar_domain->dirty_tracking = enable;
4437 out_unlock:
4438 	spin_unlock(&dmar_domain->lock);
4439 
4440 	return 0;
4441 
4442 err_unwind:
4443 	device_set_dirty_tracking(&dmar_domain->devices,
4444 				  dmar_domain->dirty_tracking);
4445 	spin_unlock(&dmar_domain->lock);
4446 	return ret;
4447 }
4448 
4449 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4450 					    unsigned long iova, size_t size,
4451 					    unsigned long flags,
4452 					    struct iommu_dirty_bitmap *dirty)
4453 {
4454 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4455 	unsigned long end = iova + size - 1;
4456 	unsigned long pgsize;
4457 
4458 	/*
4459 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4460 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4461 	 * have occurred when we stopped dirty tracking. This ensures that we
4462 	 * never inherit dirtied bits from a previous cycle.
4463 	 */
4464 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4465 		return -EINVAL;
4466 
4467 	do {
4468 		struct dma_pte *pte;
4469 		int lvl = 0;
4470 
4471 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4472 				     GFP_ATOMIC);
4473 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4474 		if (!pte || !dma_pte_present(pte)) {
4475 			iova += pgsize;
4476 			continue;
4477 		}
4478 
4479 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4480 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4481 		iova += pgsize;
4482 	} while (iova < end);
4483 
4484 	return 0;
4485 }
4486 
/*
 * Dirty-tracking callbacks of this driver: toggling dirty tracking on a
 * domain and reading-and-clearing the dirty bits of an IOVA range.
 */
static const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
4491 
4492 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4493 {
4494 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4495 	struct intel_iommu *iommu = info->iommu;
4496 	struct context_entry *context;
4497 
4498 	spin_lock(&iommu->lock);
4499 	context = iommu_context_addr(iommu, bus, devfn, 1);
4500 	if (!context) {
4501 		spin_unlock(&iommu->lock);
4502 		return -ENOMEM;
4503 	}
4504 
4505 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4506 		spin_unlock(&iommu->lock);
4507 		return 0;
4508 	}
4509 
4510 	copied_context_tear_down(iommu, context, bus, devfn);
4511 	context_clear_entry(context);
4512 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4513 
4514 	/*
4515 	 * In pass through mode, AW must be programmed to indicate the largest
4516 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4517 	 */
4518 	context_set_address_width(context, iommu->msagaw);
4519 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4520 	context_set_fault_enable(context);
4521 	context_set_present(context);
4522 	if (!ecap_coherent(iommu->ecap))
4523 		clflush_cache_range(context, sizeof(*context));
4524 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4525 	spin_unlock(&iommu->lock);
4526 
4527 	return 0;
4528 }
4529 
4530 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4531 {
4532 	struct device *dev = data;
4533 
4534 	if (dev != &pdev->dev)
4535 		return 0;
4536 
4537 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4538 }
4539 
4540 static int device_setup_pass_through(struct device *dev)
4541 {
4542 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4543 
4544 	if (!dev_is_pci(dev))
4545 		return context_setup_pass_through(dev, info->bus, info->devfn);
4546 
4547 	return pci_for_each_dma_alias(to_pci_dev(dev),
4548 				      context_setup_pass_through_cb, dev);
4549 }
4550 
4551 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4552 {
4553 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4554 	struct intel_iommu *iommu = info->iommu;
4555 	int ret;
4556 
4557 	device_block_translation(dev);
4558 
4559 	if (dev_is_real_dma_subdevice(dev))
4560 		return 0;
4561 
4562 	if (sm_supported(iommu)) {
4563 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4564 		if (!ret)
4565 			iommu_enable_pci_caps(info);
4566 	} else {
4567 		ret = device_setup_pass_through(dev);
4568 	}
4569 
4570 	return ret;
4571 }
4572 
4573 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4574 					 struct device *dev, ioasid_t pasid)
4575 {
4576 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4577 	struct intel_iommu *iommu = info->iommu;
4578 
4579 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4580 		return -EOPNOTSUPP;
4581 
4582 	return intel_pasid_setup_pass_through(iommu, dev, pasid);
4583 }
4584 
/*
 * Statically defined identity (pass-through) domain. Its ops only cover
 * attaching a device and attaching a single PASID of a device.
 */
static struct iommu_domain identity_domain = {
	.type = IOMMU_DOMAIN_IDENTITY,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev	= identity_domain_attach_dev,
		.set_dev_pasid	= identity_domain_set_dev_pasid,
	},
};
4592 
/*
 * IOMMU core callbacks of the Intel VT-d driver.
 *
 * The blocking domain serves both as the blocked and as the release domain.
 * Only a 4K page size is advertised in pgsize_bitmap. The page_response
 * callback is only present when CONFIG_INTEL_IOMMU_SVM is enabled.
 * default_domain_ops holds the per-domain operations used by domains that
 * do not supply their own ops.
 */
const struct iommu_ops intel_iommu_ops = {
	.blocked_domain		= &blocking_domain,
	.release_domain		= &blocking_domain,
	.identity_domain	= &identity_domain,
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_alloc_user	= intel_iommu_domain_alloc_user,
	.domain_alloc_sva	= intel_svm_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response		= intel_svm_page_response,
#endif
	/* Mapping, unmapping, flushing and attach operations of a domain. */
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4628 
4629 static void quirk_iommu_igfx(struct pci_dev *dev)
4630 {
4631 	if (risky_device(dev))
4632 		return;
4633 
4634 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4635 	disable_igfx_iommu = 1;
4636 }
4637 
/*
 * G4x/GM45 integrated gfx dmar support is totally busted: register
 * quirk_iommu_igfx as a PCI header fixup for these Intel device IDs.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/*
 * Broadwell igfx malfunctions with dmar: the same fixup is registered for
 * these Intel device IDs.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4672 
4673 static void quirk_iommu_rwbf(struct pci_dev *dev)
4674 {
4675 	if (risky_device(dev))
4676 		return;
4677 
4678 	/*
4679 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4680 	 * but needs it. Same seems to hold for the desktop versions.
4681 	 */
4682 	pci_info(dev, "Forcing write-buffer flush capability\n");
4683 	rwbf_quirk = 1;
4684 }
4685 
/* Register quirk_iommu_rwbf as a PCI header fixup for these Intel device IDs. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4693 
/*
 * GGC is the PCI config-space offset of the 16-bit word read by
 * quirk_calpella_no_shadow_gtt(). Bits 11:8 of that word form the memory
 * size field (GGC_MEMORY_SIZE_MASK); bit 11 (GGC_MEMORY_VT_ENABLED) is the
 * bit the quirk tests, and it is set in every *_VT size encoding below.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4703 
4704 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4705 {
4706 	unsigned short ggc;
4707 
4708 	if (risky_device(dev))
4709 		return;
4710 
4711 	if (pci_read_config_word(dev, GGC, &ggc))
4712 		return;
4713 
4714 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4715 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4716 		disable_igfx_iommu = 1;
4717 	} else if (!disable_igfx_iommu) {
4718 		/* we have to ensure the gfx device is idle before we flush */
4719 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4720 		iommu_set_dma_strict();
4721 	}
4722 }
4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4727 
4728 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4729 {
4730 	unsigned short ver;
4731 
4732 	if (!IS_GFX_DEVICE(dev))
4733 		return;
4734 
4735 	ver = (dev->device >> 8) & 0xff;
4736 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4737 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4738 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4739 		return;
4740 
4741 	if (risky_device(dev))
4742 		return;
4743 
4744 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4745 	iommu_skip_te_disable = 1;
4746 }
4747 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4748 
4749 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4750    ISOCH DMAR unit for the Azalia sound device, but not give it any
4751    TLB entries, which causes it to deadlock. Check for that.  We do
4752    this in a function called from init_dmars(), instead of in a PCI
4753    quirk, because we don't want to print the obnoxious "BIOS broken"
4754    message if VT-d is actually disabled.
4755 */
4756 static void __init check_tylersburg_isoch(void)
4757 {
4758 	struct pci_dev *pdev;
4759 	uint32_t vtisochctrl;
4760 
4761 	/* If there's no Azalia in the system anyway, forget it. */
4762 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4763 	if (!pdev)
4764 		return;
4765 
4766 	if (risky_device(pdev)) {
4767 		pci_dev_put(pdev);
4768 		return;
4769 	}
4770 
4771 	pci_dev_put(pdev);
4772 
4773 	/* System Management Registers. Might be hidden, in which case
4774 	   we can't do the sanity check. But that's OK, because the
4775 	   known-broken BIOSes _don't_ actually hide it, so far. */
4776 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4777 	if (!pdev)
4778 		return;
4779 
4780 	if (risky_device(pdev)) {
4781 		pci_dev_put(pdev);
4782 		return;
4783 	}
4784 
4785 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4786 		pci_dev_put(pdev);
4787 		return;
4788 	}
4789 
4790 	pci_dev_put(pdev);
4791 
4792 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4793 	if (vtisochctrl & 1)
4794 		return;
4795 
4796 	/* Drop all bits other than the number of TLB entries */
4797 	vtisochctrl &= 0x1c;
4798 
4799 	/* If we have the recommended number of TLB entries (16), fine. */
4800 	if (vtisochctrl == 0x10)
4801 		return;
4802 
4803 	/* Zero TLB entries? You get to ride the short bus to school. */
4804 	if (!vtisochctrl) {
4805 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4806 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4807 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4808 		     dmi_get_system_info(DMI_BIOS_VERSION),
4809 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4810 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4811 		return;
4812 	}
4813 
4814 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4815 	       vtisochctrl);
4816 }
4817 
4818 /*
4819  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4820  * invalidation completion before posted writes initiated with translated address
4821  * that utilized translations matching the invalidation address range, violating
4822  * the invalidation completion ordering.
4823  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4824  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4825  * under the control of the trusted/privileged host device driver must use this
4826  * quirk.
4827  * Device TLBs are invalidated under the following six conditions:
4828  * 1. Device driver does DMA API unmap IOVA
4829  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4830  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4831  *    exit_mmap() due to crash
4832  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4833  *    VM has to free pages that were unmapped
4834  * 5. Userspace driver unmaps a DMA buffer
4835  * 6. Cache invalidation in vSVA usage (upcoming)
4836  *
4837  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4838  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4839  * invalidate TLB the same way as normal user unmap which will use this quirk.
4840  * The dTLB invalidation after PASID cache flush does not need this quirk.
4841  *
4842  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4843  */
4844 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4845 			       unsigned long address, unsigned long mask,
4846 			       u32 pasid, u16 qdep)
4847 {
4848 	u16 sid;
4849 
4850 	if (likely(!info->dtlb_extra_inval))
4851 		return;
4852 
4853 	sid = PCI_DEVID(info->bus, info->devfn);
4854 	if (pasid == IOMMU_NO_PASID) {
4855 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4856 				   qdep, address, mask);
4857 	} else {
4858 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4859 					 pasid, qdep, address, mask);
4860 	}
4861 }
4862 
/* Extract bits 7:1 of an enhanced command response value. */
#define ecmd_get_status_code(res)	(((res) >> 1) & 0x7f)
4864 
4865 /*
4866  * Function to submit a command to the enhanced command interface. The
4867  * valid enhanced command descriptions are defined in Table 47 of the
4868  * VT-d spec. The VT-d hardware implementation may support some but not
4869  * all commands, which can be determined by checking the Enhanced
4870  * Command Capability Register.
4871  *
4872  * Return values:
4873  *  - 0: Command successful without any error;
4874  *  - Negative: software error value;
4875  *  - Nonzero positive: failure status code defined in Table 48.
4876  */
4877 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4878 {
4879 	unsigned long flags;
4880 	u64 res;
4881 	int ret;
4882 
4883 	if (!cap_ecmds(iommu->cap))
4884 		return -ENODEV;
4885 
4886 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4887 
4888 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4889 	if (res & DMA_ECMD_ECRSP_IP) {
4890 		ret = -EBUSY;
4891 		goto err;
4892 	}
4893 
4894 	/*
4895 	 * Unconditionally write the operand B, because
4896 	 * - There is no side effect if an ecmd doesn't require an
4897 	 *   operand B, but we set the register to some value.
4898 	 * - It's not invoked in any critical path. The extra MMIO
4899 	 *   write doesn't bring any performance concerns.
4900 	 */
4901 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4902 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4903 
4904 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4905 		      !(res & DMA_ECMD_ECRSP_IP), res);
4906 
4907 	if (res & DMA_ECMD_ECRSP_IP) {
4908 		ret = -ETIMEDOUT;
4909 		goto err;
4910 	}
4911 
4912 	ret = ecmd_get_status_code(res);
4913 err:
4914 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4915 
4916 	return ret;
4917 }
4918