xref: /linux/drivers/iommu/intel/iommu.c (revision 55d0969c451159cff86949b38c39171cab962069)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/* Prefix every pr_*() message emitted from this file with "DMAR: ". */
#define pr_fmt(fmt)     "DMAR: " fmt
/* dev_*() messages reuse the pr_fmt() prefix. */
#define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
/*
 * Sizes, in bytes, used when flushing a freshly allocated root table
 * (ROOT_SIZE) or context table (CONTEXT_SIZE): one VT-d page each.
 */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
/*
 * PCI device classification helpers. Each takes a pointer expression to a
 * struct pci_dev. The argument is always parenthesized so that compound
 * expressions (pointer arithmetic, casts, ternaries) expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
/* NOTE(review): presumably the interrupt/IOAPIC MMIO window - confirm against the users of these macros. */
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
/* NOTE(review): looks like the lowest usable IOVA - users are outside this view, confirm. */
#define IOVA_START_ADDR		(0x1000)

/* Address width (in bits) that iommu_calculate_agaw() starts its search from. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Highest page frame number / highest byte address expressible with @gaw address bits. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57 
static void __init check_tylersburg_isoch(void);
/* NOTE(review): the code that sets and reads rwbf_quirk is outside this view. */
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set to 1 by the "tboot_noforce" option of the intel_iommu= parameter. */
static int intel_iommu_tboot_noforce;
/* Set to 1 by the "off" option of the intel_iommu= parameter. */
static int no_platform_optin;

/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if its possibly the case.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
/*
 * One reserved memory region (RMRR) record. Records are linked through
 * @list on dmar_rmrr_units and walked with for_each_rmrr_units().
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
178 
/*
 * One ATSR record and its device scope.
 * NOTE(review): presumably linked on dmar_atsr_units - the list handling
 * is outside this view.
 */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
186 
/*
 * One SATC record, its device scope and the IOMMU it refers to.
 * NOTE(review): presumably linked on dmar_satc_units - the list handling
 * is outside this view.
 */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
195 
/* File-local list heads for the ATSR, RMRR and SATC records. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate over every struct dmar_rmrr_unit on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202 
static void intel_iommu_domain_free(struct iommu_domain *domain);

/*
 * Nonzero when the IOMMU is disabled. The default follows
 * CONFIG_INTEL_IOMMU_DEFAULT_ON; intel_iommu=on/off overrides it.
 */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
/*
 * Nonzero when scalable mode may be used. The default follows
 * CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON; intel_iommu=sm_on/sm_off
 * overrides it.
 */
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Cleared by intel_iommu=sp_off; when 0 no superpage level is reported. */
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;
/* Set to 1 by intel_iommu=igfx_off. */
static int disable_igfx_iommu;

/* NOTE(review): presumably a flag bit for iommu_identity_mapping - confirm. */
#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;
220 
221 static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225 
226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230 
231 static void init_translation_status(struct intel_iommu *iommu)
232 {
233 	u32 gsts;
234 
235 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 	if (gsts & DMA_GSTS_TES)
237 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
240 static int __init intel_iommu_setup(char *str)
241 {
242 	if (!str)
243 		return -EINVAL;
244 
245 	while (*str) {
246 		if (!strncmp(str, "on", 2)) {
247 			dmar_disabled = 0;
248 			pr_info("IOMMU enabled\n");
249 		} else if (!strncmp(str, "off", 3)) {
250 			dmar_disabled = 1;
251 			no_platform_optin = 1;
252 			pr_info("IOMMU disabled\n");
253 		} else if (!strncmp(str, "igfx_off", 8)) {
254 			disable_igfx_iommu = 1;
255 			pr_info("Disable GFX device mapping\n");
256 		} else if (!strncmp(str, "forcedac", 8)) {
257 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 			iommu_dma_forcedac = true;
259 		} else if (!strncmp(str, "strict", 6)) {
260 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 			iommu_set_dma_strict();
262 		} else if (!strncmp(str, "sp_off", 6)) {
263 			pr_info("Disable supported super page\n");
264 			intel_iommu_superpage = 0;
265 		} else if (!strncmp(str, "sm_on", 5)) {
266 			pr_info("Enable scalable mode if hardware supports\n");
267 			intel_iommu_sm = 1;
268 		} else if (!strncmp(str, "sm_off", 6)) {
269 			pr_info("Scalable mode is disallowed\n");
270 			intel_iommu_sm = 0;
271 		} else if (!strncmp(str, "tboot_noforce", 13)) {
272 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 			intel_iommu_tboot_noforce = 1;
274 		} else {
275 			pr_notice("Unknown option - '%s'\n", str);
276 		}
277 
278 		str += strcspn(str, ",");
279 		while (*str == ',')
280 			str++;
281 	}
282 
283 	return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
286 
287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290 
291 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293 
294 /*
295  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297  * the returned SAGAW.
298  */
299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301 	unsigned long fl_sagaw, sl_sagaw;
302 
303 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 	sl_sagaw = cap_sagaw(iommu->cap);
305 
306 	/* Second level only. */
307 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 		return sl_sagaw;
309 
310 	/* First level only. */
311 	if (!ecap_slts(iommu->ecap))
312 		return fl_sagaw;
313 
314 	return fl_sagaw & sl_sagaw;
315 }
316 
/*
 * Starting from the AGAW matching @max_gaw, search downwards for the
 * first AGAW whose bit is set in the IOMMU's SAGAW bitmap.
 * Returns that AGAW, or -1 when none is supported.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long supported = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &supported))
		agaw--;

	return agaw;
}
330 
331 /*
332  * Calculate max SAGAW for each iommu.
333  */
334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338 
339 /*
340  * calculate agaw for each iommu.
341  * "SAGAW" may be different across iommus, use a default agaw, and
342  * get a supported less agaw for iommus that don't support the default agaw.
343  */
344 int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
348 
349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351 	return sm_supported(iommu) ?
352 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354 
355 static void domain_update_iommu_coherency(struct dmar_domain *domain)
356 {
357 	struct iommu_domain_info *info;
358 	struct dmar_drhd_unit *drhd;
359 	struct intel_iommu *iommu;
360 	bool found = false;
361 	unsigned long i;
362 
363 	domain->iommu_coherency = true;
364 	xa_for_each(&domain->iommu_array, i, info) {
365 		found = true;
366 		if (!iommu_paging_structure_coherency(info->iommu)) {
367 			domain->iommu_coherency = false;
368 			break;
369 		}
370 	}
371 	if (found)
372 		return;
373 
374 	/* No hardware attached; use lowest common denominator */
375 	rcu_read_lock();
376 	for_each_active_iommu(iommu, drhd) {
377 		if (!iommu_paging_structure_coherency(iommu)) {
378 			domain->iommu_coherency = false;
379 			break;
380 		}
381 	}
382 	rcu_read_unlock();
383 }
384 
385 static int domain_update_iommu_superpage(struct dmar_domain *domain,
386 					 struct intel_iommu *skip)
387 {
388 	struct dmar_drhd_unit *drhd;
389 	struct intel_iommu *iommu;
390 	int mask = 0x3;
391 
392 	if (!intel_iommu_superpage)
393 		return 0;
394 
395 	/* set iommu_superpage to the smallest common denominator */
396 	rcu_read_lock();
397 	for_each_active_iommu(iommu, drhd) {
398 		if (iommu != skip) {
399 			if (domain && domain->use_first_level) {
400 				if (!cap_fl1gp_support(iommu->cap))
401 					mask = 0x1;
402 			} else {
403 				mask &= cap_super_page_val(iommu->cap);
404 			}
405 
406 			if (!mask)
407 				break;
408 		}
409 	}
410 	rcu_read_unlock();
411 
412 	return fls(mask);
413 }
414 
415 static int domain_update_device_node(struct dmar_domain *domain)
416 {
417 	struct device_domain_info *info;
418 	int nid = NUMA_NO_NODE;
419 	unsigned long flags;
420 
421 	spin_lock_irqsave(&domain->lock, flags);
422 	list_for_each_entry(info, &domain->devices, link) {
423 		/*
424 		 * There could possibly be multiple device numa nodes as devices
425 		 * within the same domain may sit behind different IOMMUs. There
426 		 * isn't perfect answer in such situation, so we select first
427 		 * come first served policy.
428 		 */
429 		nid = dev_to_node(info->dev);
430 		if (nid != NUMA_NO_NODE)
431 			break;
432 	}
433 	spin_unlock_irqrestore(&domain->lock, flags);
434 
435 	return nid;
436 }
437 
438 /* Return the super pagesize bitmap if supported. */
439 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
440 {
441 	unsigned long bitmap = 0;
442 
443 	/*
444 	 * 1-level super page supports page size of 2MiB, 2-level super page
445 	 * supports page size of both 2MiB and 1GiB.
446 	 */
447 	if (domain->iommu_superpage == 1)
448 		bitmap |= SZ_2M;
449 	else if (domain->iommu_superpage == 2)
450 		bitmap |= SZ_2M | SZ_1G;
451 
452 	return bitmap;
453 }
454 
455 /* Some capabilities may be different across iommus */
456 void domain_update_iommu_cap(struct dmar_domain *domain)
457 {
458 	domain_update_iommu_coherency(domain);
459 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
460 
461 	/*
462 	 * If RHSA is missing, we should default to the device numa domain
463 	 * as fall back.
464 	 */
465 	if (domain->nid == NUMA_NO_NODE)
466 		domain->nid = domain_update_device_node(domain);
467 
468 	/*
469 	 * First-level translation restricts the input-address to a
470 	 * canonical address (i.e., address bits 63:N have the same
471 	 * value as address bit [N-1], where N is 48-bits with 4-level
472 	 * paging and 57-bits with 5-level paging). Hence, skip bit
473 	 * [N-1].
474 	 */
475 	if (domain->use_first_level)
476 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
477 	else
478 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
479 
480 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
481 }
482 
483 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
484 					 u8 devfn, int alloc)
485 {
486 	struct root_entry *root = &iommu->root_entry[bus];
487 	struct context_entry *context;
488 	u64 *entry;
489 
490 	/*
491 	 * Except that the caller requested to allocate a new entry,
492 	 * returning a copied context entry makes no sense.
493 	 */
494 	if (!alloc && context_copied(iommu, bus, devfn))
495 		return NULL;
496 
497 	entry = &root->lo;
498 	if (sm_supported(iommu)) {
499 		if (devfn >= 0x80) {
500 			devfn -= 0x80;
501 			entry = &root->hi;
502 		}
503 		devfn *= 2;
504 	}
505 	if (*entry & 1)
506 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
507 	else {
508 		unsigned long phy_addr;
509 		if (!alloc)
510 			return NULL;
511 
512 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
513 		if (!context)
514 			return NULL;
515 
516 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
517 		phy_addr = virt_to_phys((void *)context);
518 		*entry = phy_addr | 1;
519 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
520 	}
521 	return &context[devfn];
522 }
523 
524 /**
525  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
526  *				 sub-hierarchy of a candidate PCI-PCI bridge
527  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
528  * @bridge: the candidate PCI-PCI bridge
529  *
530  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
531  */
532 static bool
533 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
534 {
535 	struct pci_dev *pdev, *pbridge;
536 
537 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
538 		return false;
539 
540 	pdev = to_pci_dev(dev);
541 	pbridge = to_pci_dev(bridge);
542 
543 	if (pbridge->subordinate &&
544 	    pbridge->subordinate->number <= pdev->bus->number &&
545 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
546 		return true;
547 
548 	return false;
549 }
550 
551 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
552 {
553 	struct dmar_drhd_unit *drhd;
554 	u32 vtbar;
555 	int rc;
556 
557 	/* We know that this device on this chipset has its own IOMMU.
558 	 * If we find it under a different IOMMU, then the BIOS is lying
559 	 * to us. Hope that the IOMMU for this device is actually
560 	 * disabled, and it needs no translation...
561 	 */
562 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
563 	if (rc) {
564 		/* "can't" happen */
565 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
566 		return false;
567 	}
568 	vtbar &= 0xffff0000;
569 
570 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
571 	drhd = dmar_find_matched_drhd_unit(pdev);
572 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
573 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
574 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
575 		return true;
576 	}
577 
578 	return false;
579 }
580 
581 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
582 {
583 	if (!iommu || iommu->drhd->ignored)
584 		return true;
585 
586 	if (dev_is_pci(dev)) {
587 		struct pci_dev *pdev = to_pci_dev(dev);
588 
589 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
590 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
591 		    quirk_ioat_snb_local_iommu(pdev))
592 			return true;
593 	}
594 
595 	return false;
596 }
597 
598 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
599 {
600 	struct dmar_drhd_unit *drhd = NULL;
601 	struct pci_dev *pdev = NULL;
602 	struct intel_iommu *iommu;
603 	struct device *tmp;
604 	u16 segment = 0;
605 	int i;
606 
607 	if (!dev)
608 		return NULL;
609 
610 	if (dev_is_pci(dev)) {
611 		struct pci_dev *pf_pdev;
612 
613 		pdev = pci_real_dma_dev(to_pci_dev(dev));
614 
615 		/* VFs aren't listed in scope tables; we need to look up
616 		 * the PF instead to find the IOMMU. */
617 		pf_pdev = pci_physfn(pdev);
618 		dev = &pf_pdev->dev;
619 		segment = pci_domain_nr(pdev->bus);
620 	} else if (has_acpi_companion(dev))
621 		dev = &ACPI_COMPANION(dev)->dev;
622 
623 	rcu_read_lock();
624 	for_each_iommu(iommu, drhd) {
625 		if (pdev && segment != drhd->segment)
626 			continue;
627 
628 		for_each_active_dev_scope(drhd->devices,
629 					  drhd->devices_cnt, i, tmp) {
630 			if (tmp == dev) {
631 				/* For a VF use its original BDF# not that of the PF
632 				 * which we used for the IOMMU lookup. Strictly speaking
633 				 * we could do this for all PCI devices; we only need to
634 				 * get the BDF# from the scope table for ACPI matches. */
635 				if (pdev && pdev->is_virtfn)
636 					goto got_pdev;
637 
638 				if (bus && devfn) {
639 					*bus = drhd->devices[i].bus;
640 					*devfn = drhd->devices[i].devfn;
641 				}
642 				goto out;
643 			}
644 
645 			if (is_downstream_to_pci_bridge(dev, tmp))
646 				goto got_pdev;
647 		}
648 
649 		if (pdev && drhd->include_all) {
650 got_pdev:
651 			if (bus && devfn) {
652 				*bus = pdev->bus->number;
653 				*devfn = pdev->devfn;
654 			}
655 			goto out;
656 		}
657 	}
658 	iommu = NULL;
659 out:
660 	if (iommu_is_dummy(iommu, dev))
661 		iommu = NULL;
662 
663 	rcu_read_unlock();
664 
665 	return iommu;
666 }
667 
668 static void domain_flush_cache(struct dmar_domain *domain,
669 			       void *addr, int size)
670 {
671 	if (!domain->iommu_coherency)
672 		clflush_cache_range(addr, size);
673 }
674 
675 static void free_context_table(struct intel_iommu *iommu)
676 {
677 	struct context_entry *context;
678 	int i;
679 
680 	if (!iommu->root_entry)
681 		return;
682 
683 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
684 		context = iommu_context_addr(iommu, i, 0, 0);
685 		if (context)
686 			iommu_free_page(context);
687 
688 		if (!sm_supported(iommu))
689 			continue;
690 
691 		context = iommu_context_addr(iommu, i, 0x80, 0);
692 		if (context)
693 			iommu_free_page(context);
694 	}
695 
696 	iommu_free_page(iommu->root_entry);
697 	iommu->root_entry = NULL;
698 }
699 
700 #ifdef CONFIG_DMAR_DEBUG
701 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
702 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
703 {
704 	struct dma_pte *pte;
705 	int offset;
706 
707 	while (1) {
708 		offset = pfn_level_offset(pfn, level);
709 		pte = &parent[offset];
710 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
711 			pr_info("PTE not present at level %d\n", level);
712 			break;
713 		}
714 
715 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
716 
717 		if (level == 1)
718 			break;
719 
720 		parent = phys_to_virt(dma_pte_addr(pte));
721 		level--;
722 	}
723 }
724 
725 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
726 			  unsigned long long addr, u32 pasid)
727 {
728 	struct pasid_dir_entry *dir, *pde;
729 	struct pasid_entry *entries, *pte;
730 	struct context_entry *ctx_entry;
731 	struct root_entry *rt_entry;
732 	int i, dir_index, index, level;
733 	u8 devfn = source_id & 0xff;
734 	u8 bus = source_id >> 8;
735 	struct dma_pte *pgtable;
736 
737 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
738 
739 	/* root entry dump */
740 	rt_entry = &iommu->root_entry[bus];
741 	if (!rt_entry) {
742 		pr_info("root table entry is not present\n");
743 		return;
744 	}
745 
746 	if (sm_supported(iommu))
747 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
748 			rt_entry->hi, rt_entry->lo);
749 	else
750 		pr_info("root entry: 0x%016llx", rt_entry->lo);
751 
752 	/* context entry dump */
753 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
754 	if (!ctx_entry) {
755 		pr_info("context table entry is not present\n");
756 		return;
757 	}
758 
759 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
760 		ctx_entry->hi, ctx_entry->lo);
761 
762 	/* legacy mode does not require PASID entries */
763 	if (!sm_supported(iommu)) {
764 		level = agaw_to_level(ctx_entry->hi & 7);
765 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
766 		goto pgtable_walk;
767 	}
768 
769 	/* get the pointer to pasid directory entry */
770 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
771 	if (!dir) {
772 		pr_info("pasid directory entry is not present\n");
773 		return;
774 	}
775 	/* For request-without-pasid, get the pasid from context entry */
776 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
777 		pasid = IOMMU_NO_PASID;
778 
779 	dir_index = pasid >> PASID_PDE_SHIFT;
780 	pde = &dir[dir_index];
781 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
782 
783 	/* get the pointer to the pasid table entry */
784 	entries = get_pasid_table_from_pde(pde);
785 	if (!entries) {
786 		pr_info("pasid table entry is not present\n");
787 		return;
788 	}
789 	index = pasid & PASID_PTE_MASK;
790 	pte = &entries[index];
791 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
792 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
793 
794 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
795 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
796 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
797 	} else {
798 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
799 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
800 	}
801 
802 pgtable_walk:
803 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
804 }
805 #endif
806 
807 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
808 				      unsigned long pfn, int *target_level,
809 				      gfp_t gfp)
810 {
811 	struct dma_pte *parent, *pte;
812 	int level = agaw_to_level(domain->agaw);
813 	int offset;
814 
815 	if (!domain_pfn_supported(domain, pfn))
816 		/* Address beyond IOMMU's addressing capabilities. */
817 		return NULL;
818 
819 	parent = domain->pgd;
820 
821 	while (1) {
822 		void *tmp_page;
823 
824 		offset = pfn_level_offset(pfn, level);
825 		pte = &parent[offset];
826 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
827 			break;
828 		if (level == *target_level)
829 			break;
830 
831 		if (!dma_pte_present(pte)) {
832 			uint64_t pteval, tmp;
833 
834 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
835 
836 			if (!tmp_page)
837 				return NULL;
838 
839 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
840 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
841 			if (domain->use_first_level)
842 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
843 
844 			tmp = 0ULL;
845 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
846 				/* Someone else set it while we were thinking; use theirs. */
847 				iommu_free_page(tmp_page);
848 			else
849 				domain_flush_cache(domain, pte, sizeof(*pte));
850 		}
851 		if (level == 1)
852 			break;
853 
854 		parent = phys_to_virt(dma_pte_addr(pte));
855 		level--;
856 	}
857 
858 	if (!*target_level)
859 		*target_level = level;
860 
861 	return pte;
862 }
863 
864 /* return address's pte at specific level */
865 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
866 					 unsigned long pfn,
867 					 int level, int *large_page)
868 {
869 	struct dma_pte *parent, *pte;
870 	int total = agaw_to_level(domain->agaw);
871 	int offset;
872 
873 	parent = domain->pgd;
874 	while (level <= total) {
875 		offset = pfn_level_offset(pfn, total);
876 		pte = &parent[offset];
877 		if (level == total)
878 			return pte;
879 
880 		if (!dma_pte_present(pte)) {
881 			*large_page = total;
882 			break;
883 		}
884 
885 		if (dma_pte_superpage(pte)) {
886 			*large_page = total;
887 			return pte;
888 		}
889 
890 		parent = phys_to_virt(dma_pte_addr(pte));
891 		total--;
892 	}
893 	return NULL;
894 }
895 
896 /* clear last level pte, a tlb flush should be followed */
897 static void dma_pte_clear_range(struct dmar_domain *domain,
898 				unsigned long start_pfn,
899 				unsigned long last_pfn)
900 {
901 	unsigned int large_page;
902 	struct dma_pte *first_pte, *pte;
903 
904 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
905 	    WARN_ON(start_pfn > last_pfn))
906 		return;
907 
908 	/* we don't need lock here; nobody else touches the iova range */
909 	do {
910 		large_page = 1;
911 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
912 		if (!pte) {
913 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
914 			continue;
915 		}
916 		do {
917 			dma_clear_pte(pte);
918 			start_pfn += lvl_to_nr_pages(large_page);
919 			pte++;
920 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
921 
922 		domain_flush_cache(domain, first_pte,
923 				   (void *)pte - (void *)first_pte);
924 
925 	} while (start_pfn && start_pfn <= last_pfn);
926 }
927 
928 static void dma_pte_free_level(struct dmar_domain *domain, int level,
929 			       int retain_level, struct dma_pte *pte,
930 			       unsigned long pfn, unsigned long start_pfn,
931 			       unsigned long last_pfn)
932 {
933 	pfn = max(start_pfn, pfn);
934 	pte = &pte[pfn_level_offset(pfn, level)];
935 
936 	do {
937 		unsigned long level_pfn;
938 		struct dma_pte *level_pte;
939 
940 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
941 			goto next;
942 
943 		level_pfn = pfn & level_mask(level);
944 		level_pte = phys_to_virt(dma_pte_addr(pte));
945 
946 		if (level > 2) {
947 			dma_pte_free_level(domain, level - 1, retain_level,
948 					   level_pte, level_pfn, start_pfn,
949 					   last_pfn);
950 		}
951 
952 		/*
953 		 * Free the page table if we're below the level we want to
954 		 * retain and the range covers the entire table.
955 		 */
956 		if (level < retain_level && !(start_pfn > level_pfn ||
957 		      last_pfn < level_pfn + level_size(level) - 1)) {
958 			dma_clear_pte(pte);
959 			domain_flush_cache(domain, pte, sizeof(*pte));
960 			iommu_free_page(level_pte);
961 		}
962 next:
963 		pfn += level_size(level);
964 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
965 }
966 
967 /*
968  * clear last level (leaf) ptes and free page table pages below the
969  * level we wish to keep intact.
970  */
971 static void dma_pte_free_pagetable(struct dmar_domain *domain,
972 				   unsigned long start_pfn,
973 				   unsigned long last_pfn,
974 				   int retain_level)
975 {
976 	dma_pte_clear_range(domain, start_pfn, last_pfn);
977 
978 	/* We don't need lock here; nobody else touches the iova range */
979 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
980 			   domain->pgd, 0, start_pfn, last_pfn);
981 
982 	/* free pgd */
983 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
984 		iommu_free_page(domain->pgd);
985 		domain->pgd = NULL;
986 	}
987 }
988 
989 /* When a page at a given level is being unlinked from its parent, we don't
990    need to *modify* it at all. All we need to do is make a list of all the
991    pages which can be freed just as soon as we've flushed the IOTLB and we
992    know the hardware page-walk will no longer touch them.
993    The 'pte' argument is the *parent* PTE, pointing to the page that is to
994    be freed. */
995 static void dma_pte_list_pagetables(struct dmar_domain *domain,
996 				    int level, struct dma_pte *pte,
997 				    struct list_head *freelist)
998 {
999 	struct page *pg;
1000 
1001 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1002 	list_add_tail(&pg->lru, freelist);
1003 
1004 	if (level == 1)
1005 		return;
1006 
1007 	pte = page_address(pg);
1008 	do {
1009 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1010 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1011 		pte++;
1012 	} while (!first_pte_in_page(pte));
1013 }
1014 
1015 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1016 				struct dma_pte *pte, unsigned long pfn,
1017 				unsigned long start_pfn, unsigned long last_pfn,
1018 				struct list_head *freelist)
1019 {
1020 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1021 
1022 	pfn = max(start_pfn, pfn);
1023 	pte = &pte[pfn_level_offset(pfn, level)];
1024 
1025 	do {
1026 		unsigned long level_pfn = pfn & level_mask(level);
1027 
1028 		if (!dma_pte_present(pte))
1029 			goto next;
1030 
1031 		/* If range covers entire pagetable, free it */
1032 		if (start_pfn <= level_pfn &&
1033 		    last_pfn >= level_pfn + level_size(level) - 1) {
1034 			/* These suborbinate page tables are going away entirely. Don't
1035 			   bother to clear them; we're just going to *free* them. */
1036 			if (level > 1 && !dma_pte_superpage(pte))
1037 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1038 
1039 			dma_clear_pte(pte);
1040 			if (!first_pte)
1041 				first_pte = pte;
1042 			last_pte = pte;
1043 		} else if (level > 1) {
1044 			/* Recurse down into a level that isn't *entirely* obsolete */
1045 			dma_pte_clear_level(domain, level - 1,
1046 					    phys_to_virt(dma_pte_addr(pte)),
1047 					    level_pfn, start_pfn, last_pfn,
1048 					    freelist);
1049 		}
1050 next:
1051 		pfn = level_pfn + level_size(level);
1052 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1053 
1054 	if (first_pte)
1055 		domain_flush_cache(domain, first_pte,
1056 				   (void *)++last_pte - (void *)first_pte);
1057 }
1058 
1059 /* We can't just free the pages because the IOMMU may still be walking
1060    the page tables, and may have cached the intermediate levels. The
1061    pages can only be freed after the IOTLB flush has been done. */
1062 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1063 			 unsigned long last_pfn, struct list_head *freelist)
1064 {
1065 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1066 	    WARN_ON(start_pfn > last_pfn))
1067 		return;
1068 
1069 	/* we don't need lock here; nobody else touches the iova range */
1070 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1071 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1072 
1073 	/* free pgd */
1074 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1075 		struct page *pgd_page = virt_to_page(domain->pgd);
1076 		list_add_tail(&pgd_page->lru, freelist);
1077 		domain->pgd = NULL;
1078 	}
1079 }
1080 
1081 /* iommu handling */
1082 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1083 {
1084 	struct root_entry *root;
1085 
1086 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1087 	if (!root) {
1088 		pr_err("Allocating root entry for %s failed\n",
1089 			iommu->name);
1090 		return -ENOMEM;
1091 	}
1092 
1093 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1094 	iommu->root_entry = root;
1095 
1096 	return 0;
1097 }
1098 
1099 static void iommu_set_root_entry(struct intel_iommu *iommu)
1100 {
1101 	u64 addr;
1102 	u32 sts;
1103 	unsigned long flag;
1104 
1105 	addr = virt_to_phys(iommu->root_entry);
1106 	if (sm_supported(iommu))
1107 		addr |= DMA_RTADDR_SMT;
1108 
1109 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1110 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1111 
1112 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1113 
1114 	/* Make sure hardware complete it */
1115 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1116 		      readl, (sts & DMA_GSTS_RTPS), sts);
1117 
1118 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1119 
1120 	/*
1121 	 * Hardware invalidates all DMA remapping hardware translation
1122 	 * caches as part of SRTP flow.
1123 	 */
1124 	if (cap_esrtps(iommu->cap))
1125 		return;
1126 
1127 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1128 	if (sm_supported(iommu))
1129 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1130 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1131 }
1132 
1133 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1134 {
1135 	u32 val;
1136 	unsigned long flag;
1137 
1138 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1139 		return;
1140 
1141 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1142 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1143 
1144 	/* Make sure hardware complete it */
1145 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1146 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1147 
1148 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1149 }
1150 
1151 /* return value determine if we need a write buffer flush */
1152 static void __iommu_flush_context(struct intel_iommu *iommu,
1153 				  u16 did, u16 source_id, u8 function_mask,
1154 				  u64 type)
1155 {
1156 	u64 val = 0;
1157 	unsigned long flag;
1158 
1159 	switch (type) {
1160 	case DMA_CCMD_GLOBAL_INVL:
1161 		val = DMA_CCMD_GLOBAL_INVL;
1162 		break;
1163 	case DMA_CCMD_DOMAIN_INVL:
1164 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1165 		break;
1166 	case DMA_CCMD_DEVICE_INVL:
1167 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1168 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1169 		break;
1170 	default:
1171 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1172 			iommu->name, type);
1173 		return;
1174 	}
1175 	val |= DMA_CCMD_ICC;
1176 
1177 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1178 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1179 
1180 	/* Make sure hardware complete it */
1181 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1182 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1183 
1184 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1185 }
1186 
1187 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1188 			 unsigned int size_order, u64 type)
1189 {
1190 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1191 	u64 val = 0, val_iva = 0;
1192 	unsigned long flag;
1193 
1194 	switch (type) {
1195 	case DMA_TLB_GLOBAL_FLUSH:
1196 		/* global flush doesn't need set IVA_REG */
1197 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1198 		break;
1199 	case DMA_TLB_DSI_FLUSH:
1200 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1201 		break;
1202 	case DMA_TLB_PSI_FLUSH:
1203 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1204 		/* IH bit is passed in as part of address */
1205 		val_iva = size_order | addr;
1206 		break;
1207 	default:
1208 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1209 			iommu->name, type);
1210 		return;
1211 	}
1212 
1213 	if (cap_write_drain(iommu->cap))
1214 		val |= DMA_TLB_WRITE_DRAIN;
1215 
1216 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217 	/* Note: Only uses first TLB reg currently */
1218 	if (val_iva)
1219 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1220 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1221 
1222 	/* Make sure hardware complete it */
1223 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1224 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1225 
1226 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227 
1228 	/* check IOTLB invalidation granularity */
1229 	if (DMA_TLB_IAIG(val) == 0)
1230 		pr_err("Flush IOTLB failed\n");
1231 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1232 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1233 			(unsigned long long)DMA_TLB_IIRG(type),
1234 			(unsigned long long)DMA_TLB_IAIG(val));
1235 }
1236 
1237 static struct device_domain_info *
1238 domain_lookup_dev_info(struct dmar_domain *domain,
1239 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1240 {
1241 	struct device_domain_info *info;
1242 	unsigned long flags;
1243 
1244 	spin_lock_irqsave(&domain->lock, flags);
1245 	list_for_each_entry(info, &domain->devices, link) {
1246 		if (info->iommu == iommu && info->bus == bus &&
1247 		    info->devfn == devfn) {
1248 			spin_unlock_irqrestore(&domain->lock, flags);
1249 			return info;
1250 		}
1251 	}
1252 	spin_unlock_irqrestore(&domain->lock, flags);
1253 
1254 	return NULL;
1255 }
1256 
1257 /*
1258  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1259  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1260  * check because it applies only to the built-in QAT devices and it doesn't
1261  * grant additional privileges.
1262  */
1263 #define BUGGY_QAT_DEVID_MASK 0x4940
1264 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1265 {
1266 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1267 		return false;
1268 
1269 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1270 		return false;
1271 
1272 	return true;
1273 }
1274 
1275 static void iommu_enable_pci_caps(struct device_domain_info *info)
1276 {
1277 	struct pci_dev *pdev;
1278 
1279 	if (!dev_is_pci(info->dev))
1280 		return;
1281 
1282 	pdev = to_pci_dev(info->dev);
1283 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1284 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1285 		info->ats_enabled = 1;
1286 }
1287 
1288 static void iommu_disable_pci_caps(struct device_domain_info *info)
1289 {
1290 	struct pci_dev *pdev;
1291 
1292 	if (!dev_is_pci(info->dev))
1293 		return;
1294 
1295 	pdev = to_pci_dev(info->dev);
1296 
1297 	if (info->ats_enabled) {
1298 		pci_disable_ats(pdev);
1299 		info->ats_enabled = 0;
1300 	}
1301 }
1302 
/* Flush every cache tag of the dmar_domain that embeds @domain. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	cache_tag_flush_all(dmar_domain);
}
1307 
1308 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1309 {
1310 	u32 pmen;
1311 	unsigned long flags;
1312 
1313 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1314 		return;
1315 
1316 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1317 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1318 	pmen &= ~DMA_PMEN_EPM;
1319 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1320 
1321 	/* wait for the protected region status bit to clear */
1322 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1323 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1324 
1325 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1326 }
1327 
1328 static void iommu_enable_translation(struct intel_iommu *iommu)
1329 {
1330 	u32 sts;
1331 	unsigned long flags;
1332 
1333 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1334 	iommu->gcmd |= DMA_GCMD_TE;
1335 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1336 
1337 	/* Make sure hardware complete it */
1338 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1339 		      readl, (sts & DMA_GSTS_TES), sts);
1340 
1341 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1342 }
1343 
1344 static void iommu_disable_translation(struct intel_iommu *iommu)
1345 {
1346 	u32 sts;
1347 	unsigned long flag;
1348 
1349 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1350 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1351 		return;
1352 
1353 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354 	iommu->gcmd &= ~DMA_GCMD_TE;
1355 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1356 
1357 	/* Make sure hardware complete it */
1358 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1359 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1360 
1361 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1362 }
1363 
1364 static int iommu_init_domains(struct intel_iommu *iommu)
1365 {
1366 	u32 ndomains;
1367 
1368 	ndomains = cap_ndoms(iommu->cap);
1369 	pr_debug("%s: Number of Domains supported <%d>\n",
1370 		 iommu->name, ndomains);
1371 
1372 	spin_lock_init(&iommu->lock);
1373 
1374 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1375 	if (!iommu->domain_ids)
1376 		return -ENOMEM;
1377 
1378 	/*
1379 	 * If Caching mode is set, then invalid translations are tagged
1380 	 * with domain-id 0, hence we need to pre-allocate it. We also
1381 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1382 	 * make sure it is not used for a real domain.
1383 	 */
1384 	set_bit(0, iommu->domain_ids);
1385 
1386 	/*
1387 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1388 	 * entry for first-level or pass-through translation modes should
1389 	 * be programmed with a domain id different from those used for
1390 	 * second-level or nested translation. We reserve a domain id for
1391 	 * this purpose. This domain id is also used for identity domain
1392 	 * in legacy mode.
1393 	 */
1394 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1395 
1396 	return 0;
1397 }
1398 
1399 static void disable_dmar_iommu(struct intel_iommu *iommu)
1400 {
1401 	if (!iommu->domain_ids)
1402 		return;
1403 
1404 	/*
1405 	 * All iommu domains must have been detached from the devices,
1406 	 * hence there should be no domain IDs in use.
1407 	 */
1408 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1409 		    > NUM_RESERVED_DID))
1410 		return;
1411 
1412 	if (iommu->gcmd & DMA_GCMD_TE)
1413 		iommu_disable_translation(iommu);
1414 }
1415 
1416 static void free_dmar_iommu(struct intel_iommu *iommu)
1417 {
1418 	if (iommu->domain_ids) {
1419 		bitmap_free(iommu->domain_ids);
1420 		iommu->domain_ids = NULL;
1421 	}
1422 
1423 	if (iommu->copied_tables) {
1424 		bitmap_free(iommu->copied_tables);
1425 		iommu->copied_tables = NULL;
1426 	}
1427 
1428 	/* free context mapping */
1429 	free_context_table(iommu);
1430 
1431 #ifdef CONFIG_INTEL_IOMMU_SVM
1432 	if (pasid_supported(iommu)) {
1433 		if (ecap_prs(iommu->ecap))
1434 			intel_svm_finish_prq(iommu);
1435 	}
1436 #endif
1437 }
1438 
1439 /*
1440  * Check and return whether first level is used by default for
1441  * DMA translation.
1442  */
1443 static bool first_level_by_default(unsigned int type)
1444 {
1445 	/* Only SL is available in legacy mode */
1446 	if (!scalable_mode_support())
1447 		return false;
1448 
1449 	/* Only level (either FL or SL) is available, just use it */
1450 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1451 		return intel_cap_flts_sanity();
1452 
1453 	/* Both levels are available, decide it based on domain type */
1454 	return type != IOMMU_DOMAIN_UNMANAGED;
1455 }
1456 
1457 static struct dmar_domain *alloc_domain(unsigned int type)
1458 {
1459 	struct dmar_domain *domain;
1460 
1461 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1462 	if (!domain)
1463 		return NULL;
1464 
1465 	domain->nid = NUMA_NO_NODE;
1466 	if (first_level_by_default(type))
1467 		domain->use_first_level = true;
1468 	INIT_LIST_HEAD(&domain->devices);
1469 	INIT_LIST_HEAD(&domain->dev_pasids);
1470 	INIT_LIST_HEAD(&domain->cache_tags);
1471 	spin_lock_init(&domain->lock);
1472 	spin_lock_init(&domain->cache_lock);
1473 	xa_init(&domain->iommu_array);
1474 
1475 	return domain;
1476 }
1477 
1478 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1479 {
1480 	struct iommu_domain_info *info, *curr;
1481 	unsigned long ndomains;
1482 	int num, ret = -ENOSPC;
1483 
1484 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1485 		return 0;
1486 
1487 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1488 	if (!info)
1489 		return -ENOMEM;
1490 
1491 	spin_lock(&iommu->lock);
1492 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1493 	if (curr) {
1494 		curr->refcnt++;
1495 		spin_unlock(&iommu->lock);
1496 		kfree(info);
1497 		return 0;
1498 	}
1499 
1500 	ndomains = cap_ndoms(iommu->cap);
1501 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1502 	if (num >= ndomains) {
1503 		pr_err("%s: No free domain ids\n", iommu->name);
1504 		goto err_unlock;
1505 	}
1506 
1507 	set_bit(num, iommu->domain_ids);
1508 	info->refcnt	= 1;
1509 	info->did	= num;
1510 	info->iommu	= iommu;
1511 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1512 			  NULL, info, GFP_ATOMIC);
1513 	if (curr) {
1514 		ret = xa_err(curr) ? : -EBUSY;
1515 		goto err_clear;
1516 	}
1517 	domain_update_iommu_cap(domain);
1518 
1519 	spin_unlock(&iommu->lock);
1520 	return 0;
1521 
1522 err_clear:
1523 	clear_bit(info->did, iommu->domain_ids);
1524 err_unlock:
1525 	spin_unlock(&iommu->lock);
1526 	kfree(info);
1527 	return ret;
1528 }
1529 
1530 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1531 {
1532 	struct iommu_domain_info *info;
1533 
1534 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1535 		return;
1536 
1537 	spin_lock(&iommu->lock);
1538 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1539 	if (--info->refcnt == 0) {
1540 		clear_bit(info->did, iommu->domain_ids);
1541 		xa_erase(&domain->iommu_array, iommu->seq_id);
1542 		domain->nid = NUMA_NO_NODE;
1543 		domain_update_iommu_cap(domain);
1544 		kfree(info);
1545 	}
1546 	spin_unlock(&iommu->lock);
1547 }
1548 
/*
 * Round @gaw up to the next width of the form 12 + 9 * n (for
 * gaw >= 12), capped at 64.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1562 
1563 static void domain_exit(struct dmar_domain *domain)
1564 {
1565 	if (domain->pgd) {
1566 		LIST_HEAD(freelist);
1567 
1568 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1569 		iommu_put_pages_list(&freelist);
1570 	}
1571 
1572 	if (WARN_ON(!list_empty(&domain->devices)))
1573 		return;
1574 
1575 	kfree(domain->qi_batch);
1576 	kfree(domain);
1577 }
1578 
1579 /*
1580  * For kdump cases, old valid entries may be cached due to the
1581  * in-flight DMA and copied pgtable, but there is no unmapping
1582  * behaviour for them, thus we need an explicit cache flush for
1583  * the newly-mapped device. For kdump, at this point, the device
1584  * is supposed to finish reset at its driver probe stage, so no
1585  * in-flight DMA will exist, and we don't need to worry anymore
1586  * hereafter.
1587  */
1588 static void copied_context_tear_down(struct intel_iommu *iommu,
1589 				     struct context_entry *context,
1590 				     u8 bus, u8 devfn)
1591 {
1592 	u16 did_old;
1593 
1594 	if (!context_copied(iommu, bus, devfn))
1595 		return;
1596 
1597 	assert_spin_locked(&iommu->lock);
1598 
1599 	did_old = context_domain_id(context);
1600 	context_clear_entry(context);
1601 
1602 	if (did_old < cap_ndoms(iommu->cap)) {
1603 		iommu->flush.flush_context(iommu, did_old,
1604 					   (((u16)bus) << 8) | devfn,
1605 					   DMA_CCMD_MASK_NOBIT,
1606 					   DMA_CCMD_DEVICE_INVL);
1607 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1608 					 DMA_TLB_DSI_FLUSH);
1609 	}
1610 
1611 	clear_context_copied(iommu, bus, devfn);
1612 }
1613 
1614 /*
1615  * It's a non-present to present mapping. If hardware doesn't cache
1616  * non-present entry we only need to flush the write-buffer. If the
1617  * _does_ cache non-present entries, then it does so in the special
1618  * domain #0, which we have to flush:
1619  */
1620 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1621 					u8 bus, u8 devfn)
1622 {
1623 	if (cap_caching_mode(iommu->cap)) {
1624 		iommu->flush.flush_context(iommu, 0,
1625 					   (((u16)bus) << 8) | devfn,
1626 					   DMA_CCMD_MASK_NOBIT,
1627 					   DMA_CCMD_DEVICE_INVL);
1628 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1629 	} else {
1630 		iommu_flush_write_buffer(iommu);
1631 	}
1632 }
1633 
1634 static int domain_context_mapping_one(struct dmar_domain *domain,
1635 				      struct intel_iommu *iommu,
1636 				      u8 bus, u8 devfn)
1637 {
1638 	struct device_domain_info *info =
1639 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1640 	u16 did = domain_id_iommu(domain, iommu);
1641 	int translation = CONTEXT_TT_MULTI_LEVEL;
1642 	struct dma_pte *pgd = domain->pgd;
1643 	struct context_entry *context;
1644 	int agaw, ret;
1645 
1646 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1647 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1648 
1649 	spin_lock(&iommu->lock);
1650 	ret = -ENOMEM;
1651 	context = iommu_context_addr(iommu, bus, devfn, 1);
1652 	if (!context)
1653 		goto out_unlock;
1654 
1655 	ret = 0;
1656 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1657 		goto out_unlock;
1658 
1659 	copied_context_tear_down(iommu, context, bus, devfn);
1660 	context_clear_entry(context);
1661 
1662 	context_set_domain_id(context, did);
1663 
1664 	/*
1665 	 * Skip top levels of page tables for iommu which has
1666 	 * less agaw than default. Unnecessary for PT mode.
1667 	 */
1668 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1669 		ret = -ENOMEM;
1670 		pgd = phys_to_virt(dma_pte_addr(pgd));
1671 		if (!dma_pte_present(pgd))
1672 			goto out_unlock;
1673 	}
1674 
1675 	if (info && info->ats_supported)
1676 		translation = CONTEXT_TT_DEV_IOTLB;
1677 	else
1678 		translation = CONTEXT_TT_MULTI_LEVEL;
1679 
1680 	context_set_address_root(context, virt_to_phys(pgd));
1681 	context_set_address_width(context, agaw);
1682 	context_set_translation_type(context, translation);
1683 	context_set_fault_enable(context);
1684 	context_set_present(context);
1685 	if (!ecap_coherent(iommu->ecap))
1686 		clflush_cache_range(context, sizeof(*context));
1687 	context_present_cache_flush(iommu, did, bus, devfn);
1688 	ret = 0;
1689 
1690 out_unlock:
1691 	spin_unlock(&iommu->lock);
1692 
1693 	return ret;
1694 }
1695 
1696 static int domain_context_mapping_cb(struct pci_dev *pdev,
1697 				     u16 alias, void *opaque)
1698 {
1699 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1700 	struct intel_iommu *iommu = info->iommu;
1701 	struct dmar_domain *domain = opaque;
1702 
1703 	return domain_context_mapping_one(domain, iommu,
1704 					  PCI_BUS_NUM(alias), alias & 0xff);
1705 }
1706 
1707 static int
1708 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1709 {
1710 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1711 	struct intel_iommu *iommu = info->iommu;
1712 	u8 bus = info->bus, devfn = info->devfn;
1713 
1714 	if (!dev_is_pci(dev))
1715 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1716 
1717 	return pci_for_each_dma_alias(to_pci_dev(dev),
1718 				      domain_context_mapping_cb, domain);
1719 }
1720 
1721 /* Return largest possible superpage level for a given mapping */
1722 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1723 				   unsigned long phy_pfn, unsigned long pages)
1724 {
1725 	int support, level = 1;
1726 	unsigned long pfnmerge;
1727 
1728 	support = domain->iommu_superpage;
1729 
1730 	/* To use a large page, the virtual *and* physical addresses
1731 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1732 	   of them will mean we have to use smaller pages. So just
1733 	   merge them and check both at once. */
1734 	pfnmerge = iov_pfn | phy_pfn;
1735 
1736 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1737 		pages >>= VTD_STRIDE_SHIFT;
1738 		if (!pages)
1739 			break;
1740 		pfnmerge >>= VTD_STRIDE_SHIFT;
1741 		level++;
1742 		support--;
1743 	}
1744 	return level;
1745 }
1746 
1747 /*
1748  * Ensure that old small page tables are removed to make room for superpage(s).
1749  * We're going to add new large pages, so make sure we don't remove their parent
1750  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1751  */
1752 static void switch_to_super_page(struct dmar_domain *domain,
1753 				 unsigned long start_pfn,
1754 				 unsigned long end_pfn, int level)
1755 {
1756 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1757 	struct dma_pte *pte = NULL;
1758 
1759 	while (start_pfn <= end_pfn) {
1760 		if (!pte)
1761 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1762 					     GFP_ATOMIC);
1763 
1764 		if (dma_pte_present(pte)) {
1765 			dma_pte_free_pagetable(domain, start_pfn,
1766 					       start_pfn + lvl_pages - 1,
1767 					       level + 1);
1768 
1769 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1770 					      end_pfn << VTD_PAGE_SHIFT, 0);
1771 		}
1772 
1773 		pte++;
1774 		start_pfn += lvl_pages;
1775 		if (first_pte_in_page(pte))
1776 			pte = NULL;
1777 	}
1778 }
1779 
1780 static int
1781 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1782 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1783 		 gfp_t gfp)
1784 {
1785 	struct dma_pte *first_pte = NULL, *pte = NULL;
1786 	unsigned int largepage_lvl = 0;
1787 	unsigned long lvl_pages = 0;
1788 	phys_addr_t pteval;
1789 	u64 attr;
1790 
1791 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1792 		return -EINVAL;
1793 
1794 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1795 		return -EINVAL;
1796 
1797 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1798 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1799 		return -EINVAL;
1800 	}
1801 
1802 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1803 	attr |= DMA_FL_PTE_PRESENT;
1804 	if (domain->use_first_level) {
1805 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1806 		if (prot & DMA_PTE_WRITE)
1807 			attr |= DMA_FL_PTE_DIRTY;
1808 	}
1809 
1810 	domain->has_mappings = true;
1811 
1812 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1813 
1814 	while (nr_pages > 0) {
1815 		uint64_t tmp;
1816 
1817 		if (!pte) {
1818 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1819 					phys_pfn, nr_pages);
1820 
1821 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1822 					     gfp);
1823 			if (!pte)
1824 				return -ENOMEM;
1825 			first_pte = pte;
1826 
1827 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1828 
1829 			/* It is large page*/
1830 			if (largepage_lvl > 1) {
1831 				unsigned long end_pfn;
1832 				unsigned long pages_to_remove;
1833 
1834 				pteval |= DMA_PTE_LARGE_PAGE;
1835 				pages_to_remove = min_t(unsigned long, nr_pages,
1836 							nr_pte_to_next_page(pte) * lvl_pages);
1837 				end_pfn = iov_pfn + pages_to_remove - 1;
1838 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1839 			} else {
1840 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1841 			}
1842 
1843 		}
1844 		/* We don't need lock here, nobody else
1845 		 * touches the iova range
1846 		 */
1847 		tmp = 0ULL;
1848 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1849 			static int dumps = 5;
1850 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1851 				iov_pfn, tmp, (unsigned long long)pteval);
1852 			if (dumps) {
1853 				dumps--;
1854 				debug_dma_dump_mappings(NULL);
1855 			}
1856 			WARN_ON(1);
1857 		}
1858 
1859 		nr_pages -= lvl_pages;
1860 		iov_pfn += lvl_pages;
1861 		phys_pfn += lvl_pages;
1862 		pteval += lvl_pages * VTD_PAGE_SIZE;
1863 
1864 		/* If the next PTE would be the first in a new page, then we
1865 		 * need to flush the cache on the entries we've just written.
1866 		 * And then we'll need to recalculate 'pte', so clear it and
1867 		 * let it get set again in the if (!pte) block above.
1868 		 *
1869 		 * If we're done (!nr_pages) we need to flush the cache too.
1870 		 *
1871 		 * Also if we've been setting superpages, we may need to
1872 		 * recalculate 'pte' and switch back to smaller pages for the
1873 		 * end of the mapping, if the trailing size is not enough to
1874 		 * use another superpage (i.e. nr_pages < lvl_pages).
1875 		 */
1876 		pte++;
1877 		if (!nr_pages || first_pte_in_page(pte) ||
1878 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1879 			domain_flush_cache(domain, first_pte,
1880 					   (void *)pte - (void *)first_pte);
1881 			pte = NULL;
1882 		}
1883 	}
1884 
1885 	return 0;
1886 }
1887 
1888 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1889 {
1890 	struct intel_iommu *iommu = info->iommu;
1891 	struct context_entry *context;
1892 	u16 did;
1893 
1894 	spin_lock(&iommu->lock);
1895 	context = iommu_context_addr(iommu, bus, devfn, 0);
1896 	if (!context) {
1897 		spin_unlock(&iommu->lock);
1898 		return;
1899 	}
1900 
1901 	did = context_domain_id(context);
1902 	context_clear_entry(context);
1903 	__iommu_flush_cache(iommu, context, sizeof(*context));
1904 	spin_unlock(&iommu->lock);
1905 	intel_context_flush_present(info, context, did, true);
1906 }
1907 
1908 static int domain_setup_first_level(struct intel_iommu *iommu,
1909 				    struct dmar_domain *domain,
1910 				    struct device *dev,
1911 				    u32 pasid)
1912 {
1913 	struct dma_pte *pgd = domain->pgd;
1914 	int agaw, level;
1915 	int flags = 0;
1916 
1917 	/*
1918 	 * Skip top levels of page tables for iommu which has
1919 	 * less agaw than default. Unnecessary for PT mode.
1920 	 */
1921 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1922 		pgd = phys_to_virt(dma_pte_addr(pgd));
1923 		if (!dma_pte_present(pgd))
1924 			return -ENOMEM;
1925 	}
1926 
1927 	level = agaw_to_level(agaw);
1928 	if (level != 4 && level != 5)
1929 		return -EINVAL;
1930 
1931 	if (level == 5)
1932 		flags |= PASID_FLAG_FL5LP;
1933 
1934 	if (domain->force_snooping)
1935 		flags |= PASID_FLAG_PAGE_SNOOP;
1936 
1937 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1938 					     domain_id_iommu(domain, iommu),
1939 					     flags);
1940 }
1941 
1942 static bool dev_is_real_dma_subdevice(struct device *dev)
1943 {
1944 	return dev && dev_is_pci(dev) &&
1945 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1946 }
1947 
1948 static int dmar_domain_attach_device(struct dmar_domain *domain,
1949 				     struct device *dev)
1950 {
1951 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1952 	struct intel_iommu *iommu = info->iommu;
1953 	unsigned long flags;
1954 	int ret;
1955 
1956 	ret = domain_attach_iommu(domain, iommu);
1957 	if (ret)
1958 		return ret;
1959 
1960 	info->domain = domain;
1961 	spin_lock_irqsave(&domain->lock, flags);
1962 	list_add(&info->link, &domain->devices);
1963 	spin_unlock_irqrestore(&domain->lock, flags);
1964 
1965 	if (dev_is_real_dma_subdevice(dev))
1966 		return 0;
1967 
1968 	if (!sm_supported(iommu))
1969 		ret = domain_context_mapping(domain, dev);
1970 	else if (domain->use_first_level)
1971 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
1972 	else
1973 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
1974 
1975 	if (ret)
1976 		goto out_block_translation;
1977 
1978 	iommu_enable_pci_caps(info);
1979 
1980 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1981 	if (ret)
1982 		goto out_block_translation;
1983 
1984 	return 0;
1985 
1986 out_block_translation:
1987 	device_block_translation(dev);
1988 	return ret;
1989 }
1990 
1991 /**
1992  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1993  * is relaxable (ie. is allowed to be not enforced under some conditions)
1994  * @dev: device handle
1995  *
1996  * We assume that PCI USB devices with RMRRs have them largely
1997  * for historical reasons and that the RMRR space is not actively used post
1998  * boot.  This exclusion may change if vendors begin to abuse it.
1999  *
2000  * The same exception is made for graphics devices, with the requirement that
2001  * any use of the RMRR regions will be torn down before assigning the device
2002  * to a guest.
2003  *
2004  * Return: true if the RMRR is relaxable, false otherwise
2005  */
2006 static bool device_rmrr_is_relaxable(struct device *dev)
2007 {
2008 	struct pci_dev *pdev;
2009 
2010 	if (!dev_is_pci(dev))
2011 		return false;
2012 
2013 	pdev = to_pci_dev(dev);
2014 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2015 		return true;
2016 	else
2017 		return false;
2018 }
2019 
2020 static int device_def_domain_type(struct device *dev)
2021 {
2022 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2023 	struct intel_iommu *iommu = info->iommu;
2024 
2025 	/*
2026 	 * Hardware does not support the passthrough translation mode.
2027 	 * Always use a dynamaic mapping domain.
2028 	 */
2029 	if (!ecap_pass_through(iommu->ecap))
2030 		return IOMMU_DOMAIN_DMA;
2031 
2032 	if (dev_is_pci(dev)) {
2033 		struct pci_dev *pdev = to_pci_dev(dev);
2034 
2035 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2036 			return IOMMU_DOMAIN_IDENTITY;
2037 	}
2038 
2039 	return 0;
2040 }
2041 
2042 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2043 {
2044 	/*
2045 	 * Start from the sane iommu hardware state.
2046 	 * If the queued invalidation is already initialized by us
2047 	 * (for example, while enabling interrupt-remapping) then
2048 	 * we got the things already rolling from a sane state.
2049 	 */
2050 	if (!iommu->qi) {
2051 		/*
2052 		 * Clear any previous faults.
2053 		 */
2054 		dmar_fault(-1, iommu);
2055 		/*
2056 		 * Disable queued invalidation if supported and already enabled
2057 		 * before OS handover.
2058 		 */
2059 		dmar_disable_qi(iommu);
2060 	}
2061 
2062 	if (dmar_enable_qi(iommu)) {
2063 		/*
2064 		 * Queued Invalidate not enabled, use Register Based Invalidate
2065 		 */
2066 		iommu->flush.flush_context = __iommu_flush_context;
2067 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2068 		pr_info("%s: Using Register based invalidation\n",
2069 			iommu->name);
2070 	} else {
2071 		iommu->flush.flush_context = qi_flush_context;
2072 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2073 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2074 	}
2075 }
2076 
2077 static int copy_context_table(struct intel_iommu *iommu,
2078 			      struct root_entry *old_re,
2079 			      struct context_entry **tbl,
2080 			      int bus, bool ext)
2081 {
2082 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2083 	struct context_entry *new_ce = NULL, ce;
2084 	struct context_entry *old_ce = NULL;
2085 	struct root_entry re;
2086 	phys_addr_t old_ce_phys;
2087 
2088 	tbl_idx = ext ? bus * 2 : bus;
2089 	memcpy(&re, old_re, sizeof(re));
2090 
2091 	for (devfn = 0; devfn < 256; devfn++) {
2092 		/* First calculate the correct index */
2093 		idx = (ext ? devfn * 2 : devfn) % 256;
2094 
2095 		if (idx == 0) {
2096 			/* First save what we may have and clean up */
2097 			if (new_ce) {
2098 				tbl[tbl_idx] = new_ce;
2099 				__iommu_flush_cache(iommu, new_ce,
2100 						    VTD_PAGE_SIZE);
2101 				pos = 1;
2102 			}
2103 
2104 			if (old_ce)
2105 				memunmap(old_ce);
2106 
2107 			ret = 0;
2108 			if (devfn < 0x80)
2109 				old_ce_phys = root_entry_lctp(&re);
2110 			else
2111 				old_ce_phys = root_entry_uctp(&re);
2112 
2113 			if (!old_ce_phys) {
2114 				if (ext && devfn == 0) {
2115 					/* No LCTP, try UCTP */
2116 					devfn = 0x7f;
2117 					continue;
2118 				} else {
2119 					goto out;
2120 				}
2121 			}
2122 
2123 			ret = -ENOMEM;
2124 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2125 					MEMREMAP_WB);
2126 			if (!old_ce)
2127 				goto out;
2128 
2129 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2130 			if (!new_ce)
2131 				goto out_unmap;
2132 
2133 			ret = 0;
2134 		}
2135 
2136 		/* Now copy the context entry */
2137 		memcpy(&ce, old_ce + idx, sizeof(ce));
2138 
2139 		if (!context_present(&ce))
2140 			continue;
2141 
2142 		did = context_domain_id(&ce);
2143 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2144 			set_bit(did, iommu->domain_ids);
2145 
2146 		set_context_copied(iommu, bus, devfn);
2147 		new_ce[idx] = ce;
2148 	}
2149 
2150 	tbl[tbl_idx + pos] = new_ce;
2151 
2152 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2153 
2154 out_unmap:
2155 	memunmap(old_ce);
2156 
2157 out:
2158 	return ret;
2159 }
2160 
2161 static int copy_translation_tables(struct intel_iommu *iommu)
2162 {
2163 	struct context_entry **ctxt_tbls;
2164 	struct root_entry *old_rt;
2165 	phys_addr_t old_rt_phys;
2166 	int ctxt_table_entries;
2167 	u64 rtaddr_reg;
2168 	int bus, ret;
2169 	bool new_ext, ext;
2170 
2171 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2172 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2173 	new_ext    = !!sm_supported(iommu);
2174 
2175 	/*
2176 	 * The RTT bit can only be changed when translation is disabled,
2177 	 * but disabling translation means to open a window for data
2178 	 * corruption. So bail out and don't copy anything if we would
2179 	 * have to change the bit.
2180 	 */
2181 	if (new_ext != ext)
2182 		return -EINVAL;
2183 
2184 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2185 	if (!iommu->copied_tables)
2186 		return -ENOMEM;
2187 
2188 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2189 	if (!old_rt_phys)
2190 		return -EINVAL;
2191 
2192 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2193 	if (!old_rt)
2194 		return -ENOMEM;
2195 
2196 	/* This is too big for the stack - allocate it from slab */
2197 	ctxt_table_entries = ext ? 512 : 256;
2198 	ret = -ENOMEM;
2199 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2200 	if (!ctxt_tbls)
2201 		goto out_unmap;
2202 
2203 	for (bus = 0; bus < 256; bus++) {
2204 		ret = copy_context_table(iommu, &old_rt[bus],
2205 					 ctxt_tbls, bus, ext);
2206 		if (ret) {
2207 			pr_err("%s: Failed to copy context table for bus %d\n",
2208 				iommu->name, bus);
2209 			continue;
2210 		}
2211 	}
2212 
2213 	spin_lock(&iommu->lock);
2214 
2215 	/* Context tables are copied, now write them to the root_entry table */
2216 	for (bus = 0; bus < 256; bus++) {
2217 		int idx = ext ? bus * 2 : bus;
2218 		u64 val;
2219 
2220 		if (ctxt_tbls[idx]) {
2221 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2222 			iommu->root_entry[bus].lo = val;
2223 		}
2224 
2225 		if (!ext || !ctxt_tbls[idx + 1])
2226 			continue;
2227 
2228 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2229 		iommu->root_entry[bus].hi = val;
2230 	}
2231 
2232 	spin_unlock(&iommu->lock);
2233 
2234 	kfree(ctxt_tbls);
2235 
2236 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2237 
2238 	ret = 0;
2239 
2240 out_unmap:
2241 	memunmap(old_rt);
2242 
2243 	return ret;
2244 }
2245 
2246 static int __init init_dmars(void)
2247 {
2248 	struct dmar_drhd_unit *drhd;
2249 	struct intel_iommu *iommu;
2250 	int ret;
2251 
2252 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2253 	if (ret)
2254 		goto free_iommu;
2255 
2256 	for_each_iommu(iommu, drhd) {
2257 		if (drhd->ignored) {
2258 			iommu_disable_translation(iommu);
2259 			continue;
2260 		}
2261 
2262 		/*
2263 		 * Find the max pasid size of all IOMMU's in the system.
2264 		 * We need to ensure the system pasid table is no bigger
2265 		 * than the smallest supported.
2266 		 */
2267 		if (pasid_supported(iommu)) {
2268 			u32 temp = 2 << ecap_pss(iommu->ecap);
2269 
2270 			intel_pasid_max_id = min_t(u32, temp,
2271 						   intel_pasid_max_id);
2272 		}
2273 
2274 		intel_iommu_init_qi(iommu);
2275 
2276 		ret = iommu_init_domains(iommu);
2277 		if (ret)
2278 			goto free_iommu;
2279 
2280 		init_translation_status(iommu);
2281 
2282 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2283 			iommu_disable_translation(iommu);
2284 			clear_translation_pre_enabled(iommu);
2285 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2286 				iommu->name);
2287 		}
2288 
2289 		/*
2290 		 * TBD:
2291 		 * we could share the same root & context tables
2292 		 * among all IOMMU's. Need to Split it later.
2293 		 */
2294 		ret = iommu_alloc_root_entry(iommu);
2295 		if (ret)
2296 			goto free_iommu;
2297 
2298 		if (translation_pre_enabled(iommu)) {
2299 			pr_info("Translation already enabled - trying to copy translation structures\n");
2300 
2301 			ret = copy_translation_tables(iommu);
2302 			if (ret) {
2303 				/*
2304 				 * We found the IOMMU with translation
2305 				 * enabled - but failed to copy over the
2306 				 * old root-entry table. Try to proceed
2307 				 * by disabling translation now and
2308 				 * allocating a clean root-entry table.
2309 				 * This might cause DMAR faults, but
2310 				 * probably the dump will still succeed.
2311 				 */
2312 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2313 				       iommu->name);
2314 				iommu_disable_translation(iommu);
2315 				clear_translation_pre_enabled(iommu);
2316 			} else {
2317 				pr_info("Copied translation tables from previous kernel for %s\n",
2318 					iommu->name);
2319 			}
2320 		}
2321 
2322 		intel_svm_check(iommu);
2323 	}
2324 
2325 	/*
2326 	 * Now that qi is enabled on all iommus, set the root entry and flush
2327 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2328 	 * flush_context function will loop forever and the boot hangs.
2329 	 */
2330 	for_each_active_iommu(iommu, drhd) {
2331 		iommu_flush_write_buffer(iommu);
2332 		iommu_set_root_entry(iommu);
2333 	}
2334 
2335 	check_tylersburg_isoch();
2336 
2337 	/*
2338 	 * for each drhd
2339 	 *   enable fault log
2340 	 *   global invalidate context cache
2341 	 *   global invalidate iotlb
2342 	 *   enable translation
2343 	 */
2344 	for_each_iommu(iommu, drhd) {
2345 		if (drhd->ignored) {
2346 			/*
2347 			 * we always have to disable PMRs or DMA may fail on
2348 			 * this device
2349 			 */
2350 			if (force_on)
2351 				iommu_disable_protect_mem_regions(iommu);
2352 			continue;
2353 		}
2354 
2355 		iommu_flush_write_buffer(iommu);
2356 
2357 #ifdef CONFIG_INTEL_IOMMU_SVM
2358 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2359 			/*
2360 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
2361 			 * could cause possible lock race condition.
2362 			 */
2363 			up_write(&dmar_global_lock);
2364 			ret = intel_svm_enable_prq(iommu);
2365 			down_write(&dmar_global_lock);
2366 			if (ret)
2367 				goto free_iommu;
2368 		}
2369 #endif
2370 		ret = dmar_set_interrupt(iommu);
2371 		if (ret)
2372 			goto free_iommu;
2373 	}
2374 
2375 	return 0;
2376 
2377 free_iommu:
2378 	for_each_active_iommu(iommu, drhd) {
2379 		disable_dmar_iommu(iommu);
2380 		free_dmar_iommu(iommu);
2381 	}
2382 
2383 	return ret;
2384 }
2385 
2386 static void __init init_no_remapping_devices(void)
2387 {
2388 	struct dmar_drhd_unit *drhd;
2389 	struct device *dev;
2390 	int i;
2391 
2392 	for_each_drhd_unit(drhd) {
2393 		if (!drhd->include_all) {
2394 			for_each_active_dev_scope(drhd->devices,
2395 						  drhd->devices_cnt, i, dev)
2396 				break;
2397 			/* ignore DMAR unit if no devices exist */
2398 			if (i == drhd->devices_cnt)
2399 				drhd->ignored = 1;
2400 		}
2401 	}
2402 
2403 	for_each_active_drhd_unit(drhd) {
2404 		if (drhd->include_all)
2405 			continue;
2406 
2407 		for_each_active_dev_scope(drhd->devices,
2408 					  drhd->devices_cnt, i, dev)
2409 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2410 				break;
2411 		if (i < drhd->devices_cnt)
2412 			continue;
2413 
2414 		/* This IOMMU has *only* gfx devices. Either bypass it or
2415 		   set the gfx_mapped flag, as appropriate */
2416 		drhd->gfx_dedicated = 1;
2417 		if (disable_igfx_iommu)
2418 			drhd->ignored = 1;
2419 	}
2420 }
2421 
2422 #ifdef CONFIG_SUSPEND
2423 static int init_iommu_hw(void)
2424 {
2425 	struct dmar_drhd_unit *drhd;
2426 	struct intel_iommu *iommu = NULL;
2427 	int ret;
2428 
2429 	for_each_active_iommu(iommu, drhd) {
2430 		if (iommu->qi) {
2431 			ret = dmar_reenable_qi(iommu);
2432 			if (ret)
2433 				return ret;
2434 		}
2435 	}
2436 
2437 	for_each_iommu(iommu, drhd) {
2438 		if (drhd->ignored) {
2439 			/*
2440 			 * we always have to disable PMRs or DMA may fail on
2441 			 * this device
2442 			 */
2443 			if (force_on)
2444 				iommu_disable_protect_mem_regions(iommu);
2445 			continue;
2446 		}
2447 
2448 		iommu_flush_write_buffer(iommu);
2449 		iommu_set_root_entry(iommu);
2450 		iommu_enable_translation(iommu);
2451 		iommu_disable_protect_mem_regions(iommu);
2452 	}
2453 
2454 	return 0;
2455 }
2456 
2457 static void iommu_flush_all(void)
2458 {
2459 	struct dmar_drhd_unit *drhd;
2460 	struct intel_iommu *iommu;
2461 
2462 	for_each_active_iommu(iommu, drhd) {
2463 		iommu->flush.flush_context(iommu, 0, 0, 0,
2464 					   DMA_CCMD_GLOBAL_INVL);
2465 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2466 					 DMA_TLB_GLOBAL_FLUSH);
2467 	}
2468 }
2469 
2470 static int iommu_suspend(void)
2471 {
2472 	struct dmar_drhd_unit *drhd;
2473 	struct intel_iommu *iommu = NULL;
2474 	unsigned long flag;
2475 
2476 	iommu_flush_all();
2477 
2478 	for_each_active_iommu(iommu, drhd) {
2479 		iommu_disable_translation(iommu);
2480 
2481 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2482 
2483 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2484 			readl(iommu->reg + DMAR_FECTL_REG);
2485 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2486 			readl(iommu->reg + DMAR_FEDATA_REG);
2487 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2488 			readl(iommu->reg + DMAR_FEADDR_REG);
2489 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2490 			readl(iommu->reg + DMAR_FEUADDR_REG);
2491 
2492 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2493 	}
2494 	return 0;
2495 }
2496 
2497 static void iommu_resume(void)
2498 {
2499 	struct dmar_drhd_unit *drhd;
2500 	struct intel_iommu *iommu = NULL;
2501 	unsigned long flag;
2502 
2503 	if (init_iommu_hw()) {
2504 		if (force_on)
2505 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2506 		else
2507 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2508 		return;
2509 	}
2510 
2511 	for_each_active_iommu(iommu, drhd) {
2512 
2513 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2514 
2515 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2516 			iommu->reg + DMAR_FECTL_REG);
2517 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2518 			iommu->reg + DMAR_FEDATA_REG);
2519 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2520 			iommu->reg + DMAR_FEADDR_REG);
2521 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2522 			iommu->reg + DMAR_FEUADDR_REG);
2523 
2524 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2525 	}
2526 }
2527 
2528 static struct syscore_ops iommu_syscore_ops = {
2529 	.resume		= iommu_resume,
2530 	.suspend	= iommu_suspend,
2531 };
2532 
/* Register the IOMMU suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2537 
2538 #else
/* No suspend support configured: nothing to register. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2541 
2542 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2543 {
2544 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2545 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2546 	    rmrr->end_address <= rmrr->base_address ||
2547 	    arch_rmrr_sanity_check(rmrr))
2548 		return -EINVAL;
2549 
2550 	return 0;
2551 }
2552 
2553 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2554 {
2555 	struct acpi_dmar_reserved_memory *rmrr;
2556 	struct dmar_rmrr_unit *rmrru;
2557 
2558 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2559 	if (rmrr_sanity_check(rmrr)) {
2560 		pr_warn(FW_BUG
2561 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2562 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2563 			   rmrr->base_address, rmrr->end_address,
2564 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2565 			   dmi_get_system_info(DMI_BIOS_VERSION),
2566 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2567 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2568 	}
2569 
2570 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2571 	if (!rmrru)
2572 		goto out;
2573 
2574 	rmrru->hdr = header;
2575 
2576 	rmrru->base_address = rmrr->base_address;
2577 	rmrru->end_address = rmrr->end_address;
2578 
2579 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2580 				((void *)rmrr) + rmrr->header.length,
2581 				&rmrru->devices_cnt);
2582 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2583 		goto free_rmrru;
2584 
2585 	list_add(&rmrru->list, &dmar_rmrr_units);
2586 
2587 	return 0;
2588 free_rmrru:
2589 	kfree(rmrru);
2590 out:
2591 	return -ENOMEM;
2592 }
2593 
2594 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2595 {
2596 	struct dmar_atsr_unit *atsru;
2597 	struct acpi_dmar_atsr *tmp;
2598 
2599 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2600 				dmar_rcu_check()) {
2601 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2602 		if (atsr->segment != tmp->segment)
2603 			continue;
2604 		if (atsr->header.length != tmp->header.length)
2605 			continue;
2606 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2607 			return atsru;
2608 	}
2609 
2610 	return NULL;
2611 }
2612 
2613 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2614 {
2615 	struct acpi_dmar_atsr *atsr;
2616 	struct dmar_atsr_unit *atsru;
2617 
2618 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2619 		return 0;
2620 
2621 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2622 	atsru = dmar_find_atsr(atsr);
2623 	if (atsru)
2624 		return 0;
2625 
2626 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2627 	if (!atsru)
2628 		return -ENOMEM;
2629 
2630 	/*
2631 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2632 	 * copy the memory content because the memory buffer will be freed
2633 	 * on return.
2634 	 */
2635 	atsru->hdr = (void *)(atsru + 1);
2636 	memcpy(atsru->hdr, hdr, hdr->length);
2637 	atsru->include_all = atsr->flags & 0x1;
2638 	if (!atsru->include_all) {
2639 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2640 				(void *)atsr + atsr->header.length,
2641 				&atsru->devices_cnt);
2642 		if (atsru->devices_cnt && atsru->devices == NULL) {
2643 			kfree(atsru);
2644 			return -ENOMEM;
2645 		}
2646 	}
2647 
2648 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2649 
2650 	return 0;
2651 }
2652 
/* Release the device scope of @atsru, then free the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
2658 
2659 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2660 {
2661 	struct acpi_dmar_atsr *atsr;
2662 	struct dmar_atsr_unit *atsru;
2663 
2664 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2665 	atsru = dmar_find_atsr(atsr);
2666 	if (atsru) {
2667 		list_del_rcu(&atsru->list);
2668 		synchronize_rcu();
2669 		intel_iommu_free_atsr(atsru);
2670 	}
2671 
2672 	return 0;
2673 }
2674 
2675 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2676 {
2677 	int i;
2678 	struct device *dev;
2679 	struct acpi_dmar_atsr *atsr;
2680 	struct dmar_atsr_unit *atsru;
2681 
2682 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2683 	atsru = dmar_find_atsr(atsr);
2684 	if (!atsru)
2685 		return 0;
2686 
2687 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2688 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2689 					  i, dev)
2690 			return -EBUSY;
2691 	}
2692 
2693 	return 0;
2694 }
2695 
2696 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2697 {
2698 	struct dmar_satc_unit *satcu;
2699 	struct acpi_dmar_satc *tmp;
2700 
2701 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2702 				dmar_rcu_check()) {
2703 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2704 		if (satc->segment != tmp->segment)
2705 			continue;
2706 		if (satc->header.length != tmp->header.length)
2707 			continue;
2708 		if (memcmp(satc, tmp, satc->header.length) == 0)
2709 			return satcu;
2710 	}
2711 
2712 	return NULL;
2713 }
2714 
2715 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2716 {
2717 	struct acpi_dmar_satc *satc;
2718 	struct dmar_satc_unit *satcu;
2719 
2720 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2721 		return 0;
2722 
2723 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2724 	satcu = dmar_find_satc(satc);
2725 	if (satcu)
2726 		return 0;
2727 
2728 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2729 	if (!satcu)
2730 		return -ENOMEM;
2731 
2732 	satcu->hdr = (void *)(satcu + 1);
2733 	memcpy(satcu->hdr, hdr, hdr->length);
2734 	satcu->atc_required = satc->flags & 0x1;
2735 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2736 					      (void *)satc + satc->header.length,
2737 					      &satcu->devices_cnt);
2738 	if (satcu->devices_cnt && !satcu->devices) {
2739 		kfree(satcu);
2740 		return -ENOMEM;
2741 	}
2742 	list_add_rcu(&satcu->list, &dmar_satc_units);
2743 
2744 	return 0;
2745 }
2746 
2747 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2748 {
2749 	int sp, ret;
2750 	struct intel_iommu *iommu = dmaru->iommu;
2751 
2752 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2753 	if (ret)
2754 		goto out;
2755 
2756 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2757 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2758 		pr_warn("%s: Doesn't support large page.\n",
2759 			iommu->name);
2760 		return -ENXIO;
2761 	}
2762 
2763 	/*
2764 	 * Disable translation if already enabled prior to OS handover.
2765 	 */
2766 	if (iommu->gcmd & DMA_GCMD_TE)
2767 		iommu_disable_translation(iommu);
2768 
2769 	ret = iommu_init_domains(iommu);
2770 	if (ret == 0)
2771 		ret = iommu_alloc_root_entry(iommu);
2772 	if (ret)
2773 		goto out;
2774 
2775 	intel_svm_check(iommu);
2776 
2777 	if (dmaru->ignored) {
2778 		/*
2779 		 * we always have to disable PMRs or DMA may fail on this device
2780 		 */
2781 		if (force_on)
2782 			iommu_disable_protect_mem_regions(iommu);
2783 		return 0;
2784 	}
2785 
2786 	intel_iommu_init_qi(iommu);
2787 	iommu_flush_write_buffer(iommu);
2788 
2789 #ifdef CONFIG_INTEL_IOMMU_SVM
2790 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2791 		ret = intel_svm_enable_prq(iommu);
2792 		if (ret)
2793 			goto disable_iommu;
2794 	}
2795 #endif
2796 	ret = dmar_set_interrupt(iommu);
2797 	if (ret)
2798 		goto disable_iommu;
2799 
2800 	iommu_set_root_entry(iommu);
2801 	iommu_enable_translation(iommu);
2802 
2803 	iommu_disable_protect_mem_regions(iommu);
2804 	return 0;
2805 
2806 disable_iommu:
2807 	disable_dmar_iommu(iommu);
2808 out:
2809 	free_dmar_iommu(iommu);
2810 	return ret;
2811 }
2812 
2813 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2814 {
2815 	int ret = 0;
2816 	struct intel_iommu *iommu = dmaru->iommu;
2817 
2818 	if (!intel_iommu_enabled)
2819 		return 0;
2820 	if (iommu == NULL)
2821 		return -EINVAL;
2822 
2823 	if (insert) {
2824 		ret = intel_iommu_add(dmaru);
2825 	} else {
2826 		disable_dmar_iommu(iommu);
2827 		free_dmar_iommu(iommu);
2828 	}
2829 
2830 	return ret;
2831 }
2832 
2833 static void intel_iommu_free_dmars(void)
2834 {
2835 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2836 	struct dmar_atsr_unit *atsru, *atsr_n;
2837 	struct dmar_satc_unit *satcu, *satc_n;
2838 
2839 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2840 		list_del(&rmrru->list);
2841 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2842 		kfree(rmrru);
2843 	}
2844 
2845 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2846 		list_del(&atsru->list);
2847 		intel_iommu_free_atsr(atsru);
2848 	}
2849 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2850 		list_del(&satcu->list);
2851 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2852 		kfree(satcu);
2853 	}
2854 }
2855 
2856 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2857 {
2858 	struct dmar_satc_unit *satcu;
2859 	struct acpi_dmar_satc *satc;
2860 	struct device *tmp;
2861 	int i;
2862 
2863 	dev = pci_physfn(dev);
2864 	rcu_read_lock();
2865 
2866 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2867 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2868 		if (satc->segment != pci_domain_nr(dev->bus))
2869 			continue;
2870 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2871 			if (to_pci_dev(tmp) == dev)
2872 				goto out;
2873 	}
2874 	satcu = NULL;
2875 out:
2876 	rcu_read_unlock();
2877 	return satcu;
2878 }
2879 
2880 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2881 {
2882 	int i, ret = 1;
2883 	struct pci_bus *bus;
2884 	struct pci_dev *bridge = NULL;
2885 	struct device *tmp;
2886 	struct acpi_dmar_atsr *atsr;
2887 	struct dmar_atsr_unit *atsru;
2888 	struct dmar_satc_unit *satcu;
2889 
2890 	dev = pci_physfn(dev);
2891 	satcu = dmar_find_matched_satc_unit(dev);
2892 	if (satcu)
2893 		/*
2894 		 * This device supports ATS as it is in SATC table.
2895 		 * When IOMMU is in legacy mode, enabling ATS is done
2896 		 * automatically by HW for the device that requires
2897 		 * ATS, hence OS should not enable this device ATS
2898 		 * to avoid duplicated TLB invalidation.
2899 		 */
2900 		return !(satcu->atc_required && !sm_supported(iommu));
2901 
2902 	for (bus = dev->bus; bus; bus = bus->parent) {
2903 		bridge = bus->self;
2904 		/* If it's an integrated device, allow ATS */
2905 		if (!bridge)
2906 			return 1;
2907 		/* Connected via non-PCIe: no ATS */
2908 		if (!pci_is_pcie(bridge) ||
2909 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2910 			return 0;
2911 		/* If we found the root port, look it up in the ATSR */
2912 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2913 			break;
2914 	}
2915 
2916 	rcu_read_lock();
2917 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2918 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2919 		if (atsr->segment != pci_domain_nr(dev->bus))
2920 			continue;
2921 
2922 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2923 			if (tmp == &bridge->dev)
2924 				goto out;
2925 
2926 		if (atsru->include_all)
2927 			goto out;
2928 	}
2929 	ret = 0;
2930 out:
2931 	rcu_read_unlock();
2932 
2933 	return ret;
2934 }
2935 
2936 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2937 {
2938 	int ret;
2939 	struct dmar_rmrr_unit *rmrru;
2940 	struct dmar_atsr_unit *atsru;
2941 	struct dmar_satc_unit *satcu;
2942 	struct acpi_dmar_atsr *atsr;
2943 	struct acpi_dmar_reserved_memory *rmrr;
2944 	struct acpi_dmar_satc *satc;
2945 
2946 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2947 		return 0;
2948 
2949 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2950 		rmrr = container_of(rmrru->hdr,
2951 				    struct acpi_dmar_reserved_memory, header);
2952 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2953 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2954 				((void *)rmrr) + rmrr->header.length,
2955 				rmrr->segment, rmrru->devices,
2956 				rmrru->devices_cnt);
2957 			if (ret < 0)
2958 				return ret;
2959 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2960 			dmar_remove_dev_scope(info, rmrr->segment,
2961 				rmrru->devices, rmrru->devices_cnt);
2962 		}
2963 	}
2964 
2965 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2966 		if (atsru->include_all)
2967 			continue;
2968 
2969 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2970 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2971 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2972 					(void *)atsr + atsr->header.length,
2973 					atsr->segment, atsru->devices,
2974 					atsru->devices_cnt);
2975 			if (ret > 0)
2976 				break;
2977 			else if (ret < 0)
2978 				return ret;
2979 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2980 			if (dmar_remove_dev_scope(info, atsr->segment,
2981 					atsru->devices, atsru->devices_cnt))
2982 				break;
2983 		}
2984 	}
2985 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2986 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2987 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2988 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2989 					(void *)satc + satc->header.length,
2990 					satc->segment, satcu->devices,
2991 					satcu->devices_cnt);
2992 			if (ret > 0)
2993 				break;
2994 			else if (ret < 0)
2995 				return ret;
2996 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2997 			if (dmar_remove_dev_scope(info, satc->segment,
2998 					satcu->devices, satcu->devices_cnt))
2999 				break;
3000 		}
3001 	}
3002 
3003 	return 0;
3004 }
3005 
3006 static void intel_disable_iommus(void)
3007 {
3008 	struct intel_iommu *iommu = NULL;
3009 	struct dmar_drhd_unit *drhd;
3010 
3011 	for_each_iommu(iommu, drhd)
3012 		iommu_disable_translation(iommu);
3013 }
3014 
3015 void intel_iommu_shutdown(void)
3016 {
3017 	struct dmar_drhd_unit *drhd;
3018 	struct intel_iommu *iommu = NULL;
3019 
3020 	if (no_iommu || dmar_disabled)
3021 		return;
3022 
3023 	down_write(&dmar_global_lock);
3024 
3025 	/* Disable PMRs explicitly here. */
3026 	for_each_iommu(iommu, drhd)
3027 		iommu_disable_protect_mem_regions(iommu);
3028 
3029 	/* Make sure the IOMMUs are switched off */
3030 	intel_disable_iommus();
3031 
3032 	up_write(&dmar_global_lock);
3033 }
3034 
3035 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3036 {
3037 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3038 
3039 	return container_of(iommu_dev, struct intel_iommu, iommu);
3040 }
3041 
3042 static ssize_t version_show(struct device *dev,
3043 			    struct device_attribute *attr, char *buf)
3044 {
3045 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3046 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3047 	return sysfs_emit(buf, "%d:%d\n",
3048 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3049 }
3050 static DEVICE_ATTR_RO(version);
3051 
3052 static ssize_t address_show(struct device *dev,
3053 			    struct device_attribute *attr, char *buf)
3054 {
3055 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3056 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3057 }
3058 static DEVICE_ATTR_RO(address);
3059 
3060 static ssize_t cap_show(struct device *dev,
3061 			struct device_attribute *attr, char *buf)
3062 {
3063 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3064 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3065 }
3066 static DEVICE_ATTR_RO(cap);
3067 
3068 static ssize_t ecap_show(struct device *dev,
3069 			 struct device_attribute *attr, char *buf)
3070 {
3071 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3072 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3073 }
3074 static DEVICE_ATTR_RO(ecap);
3075 
3076 static ssize_t domains_supported_show(struct device *dev,
3077 				      struct device_attribute *attr, char *buf)
3078 {
3079 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3080 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3081 }
3082 static DEVICE_ATTR_RO(domains_supported);
3083 
3084 static ssize_t domains_used_show(struct device *dev,
3085 				 struct device_attribute *attr, char *buf)
3086 {
3087 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3088 	return sysfs_emit(buf, "%d\n",
3089 			  bitmap_weight(iommu->domain_ids,
3090 					cap_ndoms(iommu->cap)));
3091 }
3092 static DEVICE_ATTR_RO(domains_used);
3093 
/* NULL-terminated list of the read-only sysfs attributes defined above. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3103 
/* Attribute group named "intel-iommu" that bundles intel_iommu_attrs. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3108 
/* NULL-terminated, non-static list of attribute groups; holds intel_iommu_group. */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3113 
3114 static bool has_external_pci(void)
3115 {
3116 	struct pci_dev *pdev = NULL;
3117 
3118 	for_each_pci_dev(pdev)
3119 		if (pdev->external_facing) {
3120 			pci_dev_put(pdev);
3121 			return true;
3122 		}
3123 
3124 	return false;
3125 }
3126 
3127 static int __init platform_optin_force_iommu(void)
3128 {
3129 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3130 		return 0;
3131 
3132 	if (no_iommu || dmar_disabled)
3133 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3134 
3135 	/*
3136 	 * If Intel-IOMMU is disabled by default, we will apply identity
3137 	 * map for all devices except those marked as being untrusted.
3138 	 */
3139 	if (dmar_disabled)
3140 		iommu_set_default_passthrough(false);
3141 
3142 	dmar_disabled = 0;
3143 	no_iommu = 0;
3144 
3145 	return 1;
3146 }
3147 
3148 static int __init probe_acpi_namespace_devices(void)
3149 {
3150 	struct dmar_drhd_unit *drhd;
3151 	/* To avoid a -Wunused-but-set-variable warning. */
3152 	struct intel_iommu *iommu __maybe_unused;
3153 	struct device *dev;
3154 	int i, ret = 0;
3155 
3156 	for_each_active_iommu(iommu, drhd) {
3157 		for_each_active_dev_scope(drhd->devices,
3158 					  drhd->devices_cnt, i, dev) {
3159 			struct acpi_device_physical_node *pn;
3160 			struct acpi_device *adev;
3161 
3162 			if (dev->bus != &acpi_bus_type)
3163 				continue;
3164 
3165 			adev = to_acpi_device(dev);
3166 			mutex_lock(&adev->physical_node_lock);
3167 			list_for_each_entry(pn,
3168 					    &adev->physical_node_list, node) {
3169 				ret = iommu_probe_device(pn->dev);
3170 				if (ret)
3171 					break;
3172 			}
3173 			mutex_unlock(&adev->physical_node_lock);
3174 
3175 			if (ret)
3176 				return ret;
3177 		}
3178 	}
3179 
3180 	return 0;
3181 }
3182 
3183 static __init int tboot_force_iommu(void)
3184 {
3185 	if (!tboot_enabled())
3186 		return 0;
3187 
3188 	if (no_iommu || dmar_disabled)
3189 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3190 
3191 	dmar_disabled = 0;
3192 	no_iommu = 0;
3193 
3194 	return 1;
3195 }
3196 
/*
 * Boot-time entry point that brings up the Intel IOMMU (DMA remapping).
 *
 * Sequence, as implemented below:
 *   1. decide whether the IOMMU must be forced on (tboot / platform opt-in);
 *   2. initialise the DMAR table and device scopes under dmar_global_lock;
 *   3. register the DMAR bus notifier with the lock dropped;
 *   4. if the IOMMU is disabled, switch the hardware off and bail out;
 *   5. otherwise run init_dmars(), register every active IOMMU with sysfs,
 *      the IOMMU core and the PMU code, probe ACPI namespace devices and
 *      finally enable translation.
 *
 * Returns 0 on success. Every failure/disabled path goes through
 * out_free_dmar, which calls intel_iommu_free_dmars(), releases
 * dmar_global_lock and returns ret (-ENODEV unless init_dmars() replaced it).
 * When force_on is set, initialisation failures panic instead.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	/* Re-take the write lock; out_free_dmar expects it to be held. */
	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	/* Purely informational: report which optional DMAR units are absent. */
	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* From here on the DMAR lists are only read. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		/* NOTE(review): the return values of both calls below are ignored. */
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}

	/* A failure here is only reported; initialisation carries on. */
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	/* Reached with dmar_global_lock held for writing. */
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3326 
3327 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3328 {
3329 	struct device_domain_info *info = opaque;
3330 
3331 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3332 	return 0;
3333 }
3334 
3335 /*
3336  * NB - intel-iommu lacks any sort of reference counting for the users of
3337  * dependent devices.  If multiple endpoints have intersecting dependent
3338  * devices, unbinding the driver from any one of them will possibly leave
3339  * the others unable to operate.
3340  */
3341 static void domain_context_clear(struct device_domain_info *info)
3342 {
3343 	if (!dev_is_pci(info->dev)) {
3344 		domain_context_clear_one(info, info->bus, info->devfn);
3345 		return;
3346 	}
3347 
3348 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3349 			       &domain_context_clear_one_cb, info);
3350 }
3351 
3352 /*
3353  * Clear the page table pointer in context or pasid table entries so that
3354  * all DMA requests without PASID from the device are blocked. If the page
3355  * table has been set, clean up the data structures.
3356  */
3357 void device_block_translation(struct device *dev)
3358 {
3359 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3360 	struct intel_iommu *iommu = info->iommu;
3361 	unsigned long flags;
3362 
3363 	iommu_disable_pci_caps(info);
3364 	if (!dev_is_real_dma_subdevice(dev)) {
3365 		if (sm_supported(iommu))
3366 			intel_pasid_tear_down_entry(iommu, dev,
3367 						    IOMMU_NO_PASID, false);
3368 		else
3369 			domain_context_clear(info);
3370 	}
3371 
3372 	if (!info->domain)
3373 		return;
3374 
3375 	spin_lock_irqsave(&info->domain->lock, flags);
3376 	list_del(&info->link);
3377 	spin_unlock_irqrestore(&info->domain->lock, flags);
3378 
3379 	cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3380 	domain_detach_iommu(info->domain, iommu);
3381 	info->domain = NULL;
3382 }
3383 
3384 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3385 {
3386 	int adjust_width;
3387 
3388 	/* calculate AGAW */
3389 	domain->gaw = guest_width;
3390 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3391 	domain->agaw = width_to_agaw(adjust_width);
3392 
3393 	domain->iommu_coherency = false;
3394 	domain->iommu_superpage = 0;
3395 	domain->max_addr = 0;
3396 
3397 	/* always allocate the top pgd */
3398 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3399 	if (!domain->pgd)
3400 		return -ENOMEM;
3401 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3402 	return 0;
3403 }
3404 
/*
 * attach_dev handler of the blocking domain: block translation for @dev.
 * @domain is not used. Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	const int success = 0;

	device_block_translation(dev);

	return success;
}
3411 
3412 static struct iommu_domain blocking_domain = {
3413 	.type = IOMMU_DOMAIN_BLOCKED,
3414 	.ops = &(const struct iommu_domain_ops) {
3415 		.attach_dev	= blocking_domain_attach_dev,
3416 	}
3417 };
3418 
3419 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3420 {
3421 	if (!intel_iommu_superpage)
3422 		return 0;
3423 
3424 	if (first_stage)
3425 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3426 
3427 	return fls(cap_super_page_val(iommu->cap));
3428 }
3429 
3430 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3431 {
3432 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3433 	struct intel_iommu *iommu = info->iommu;
3434 	struct dmar_domain *domain;
3435 	int addr_width;
3436 
3437 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3438 	if (!domain)
3439 		return ERR_PTR(-ENOMEM);
3440 
3441 	INIT_LIST_HEAD(&domain->devices);
3442 	INIT_LIST_HEAD(&domain->dev_pasids);
3443 	INIT_LIST_HEAD(&domain->cache_tags);
3444 	spin_lock_init(&domain->lock);
3445 	spin_lock_init(&domain->cache_lock);
3446 	xa_init(&domain->iommu_array);
3447 
3448 	domain->nid = dev_to_node(dev);
3449 	domain->use_first_level = first_stage;
3450 
3451 	/* calculate the address width */
3452 	addr_width = agaw_to_width(iommu->agaw);
3453 	if (addr_width > cap_mgaw(iommu->cap))
3454 		addr_width = cap_mgaw(iommu->cap);
3455 	domain->gaw = addr_width;
3456 	domain->agaw = iommu->agaw;
3457 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3458 
3459 	/* iommu memory access coherency */
3460 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3461 
3462 	/* pagesize bitmap */
3463 	domain->domain.pgsize_bitmap = SZ_4K;
3464 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3465 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3466 
3467 	/*
3468 	 * IOVA aperture: First-level translation restricts the input-address
3469 	 * to a canonical address (i.e., address bits 63:N have the same value
3470 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3471 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3472 	 */
3473 	domain->domain.geometry.force_aperture = true;
3474 	domain->domain.geometry.aperture_start = 0;
3475 	if (first_stage)
3476 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3477 	else
3478 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3479 
3480 	/* always allocate the top pgd */
3481 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3482 	if (!domain->pgd) {
3483 		kfree(domain);
3484 		return ERR_PTR(-ENOMEM);
3485 	}
3486 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3487 
3488 	return domain;
3489 }
3490 
3491 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3492 {
3493 	struct dmar_domain *dmar_domain;
3494 	struct iommu_domain *domain;
3495 
3496 	switch (type) {
3497 	case IOMMU_DOMAIN_DMA:
3498 	case IOMMU_DOMAIN_UNMANAGED:
3499 		dmar_domain = alloc_domain(type);
3500 		if (!dmar_domain) {
3501 			pr_err("Can't allocate dmar_domain\n");
3502 			return NULL;
3503 		}
3504 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3505 			pr_err("Domain initialization failed\n");
3506 			domain_exit(dmar_domain);
3507 			return NULL;
3508 		}
3509 
3510 		domain = &dmar_domain->domain;
3511 		domain->geometry.aperture_start = 0;
3512 		domain->geometry.aperture_end   =
3513 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3514 		domain->geometry.force_aperture = true;
3515 
3516 		return domain;
3517 	default:
3518 		return NULL;
3519 	}
3520 
3521 	return NULL;
3522 }
3523 
3524 static struct iommu_domain *
3525 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3526 			      struct iommu_domain *parent,
3527 			      const struct iommu_user_data *user_data)
3528 {
3529 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3530 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3531 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3532 	struct intel_iommu *iommu = info->iommu;
3533 	struct dmar_domain *dmar_domain;
3534 	struct iommu_domain *domain;
3535 
3536 	/* Must be NESTING domain */
3537 	if (parent) {
3538 		if (!nested_supported(iommu) || flags)
3539 			return ERR_PTR(-EOPNOTSUPP);
3540 		return intel_nested_domain_alloc(parent, user_data);
3541 	}
3542 
3543 	if (flags &
3544 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3545 		return ERR_PTR(-EOPNOTSUPP);
3546 	if (nested_parent && !nested_supported(iommu))
3547 		return ERR_PTR(-EOPNOTSUPP);
3548 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3549 		return ERR_PTR(-EOPNOTSUPP);
3550 
3551 	/* Do not use first stage for user domain translation. */
3552 	dmar_domain = paging_domain_alloc(dev, false);
3553 	if (IS_ERR(dmar_domain))
3554 		return ERR_CAST(dmar_domain);
3555 	domain = &dmar_domain->domain;
3556 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3557 	domain->owner = &intel_iommu_ops;
3558 	domain->ops = intel_iommu_ops.default_domain_ops;
3559 
3560 	if (nested_parent) {
3561 		dmar_domain->nested_parent = true;
3562 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3563 		spin_lock_init(&dmar_domain->s1_lock);
3564 	}
3565 
3566 	if (dirty_tracking) {
3567 		if (dmar_domain->use_first_level) {
3568 			iommu_domain_free(domain);
3569 			return ERR_PTR(-EOPNOTSUPP);
3570 		}
3571 		domain->dirty_ops = &intel_dirty_ops;
3572 	}
3573 
3574 	return domain;
3575 }
3576 
3577 static void intel_iommu_domain_free(struct iommu_domain *domain)
3578 {
3579 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3580 
3581 	WARN_ON(dmar_domain->nested_parent &&
3582 		!list_empty(&dmar_domain->s1_domains));
3583 	domain_exit(dmar_domain);
3584 }
3585 
3586 int prepare_domain_attach_device(struct iommu_domain *domain,
3587 				 struct device *dev)
3588 {
3589 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3590 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3591 	struct intel_iommu *iommu = info->iommu;
3592 	int addr_width;
3593 
3594 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3595 		return -EINVAL;
3596 
3597 	if (domain->dirty_ops && !ssads_supported(iommu))
3598 		return -EINVAL;
3599 
3600 	/* check if this iommu agaw is sufficient for max mapped address */
3601 	addr_width = agaw_to_width(iommu->agaw);
3602 	if (addr_width > cap_mgaw(iommu->cap))
3603 		addr_width = cap_mgaw(iommu->cap);
3604 
3605 	if (dmar_domain->max_addr > (1LL << addr_width))
3606 		return -EINVAL;
3607 	dmar_domain->gaw = addr_width;
3608 
3609 	/*
3610 	 * Knock out extra levels of page tables if necessary
3611 	 */
3612 	while (iommu->agaw < dmar_domain->agaw) {
3613 		struct dma_pte *pte;
3614 
3615 		pte = dmar_domain->pgd;
3616 		if (dma_pte_present(pte)) {
3617 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3618 			iommu_free_page(pte);
3619 		}
3620 		dmar_domain->agaw--;
3621 	}
3622 
3623 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3624 	    context_copied(iommu, info->bus, info->devfn))
3625 		return intel_pasid_setup_sm_context(dev);
3626 
3627 	return 0;
3628 }
3629 
/*
 * Attach @dev to @domain: block the device's current translation first,
 * then prepare the domain for the device and perform the attach.
 *
 * Returns 0 on success or the error from the prepare/attach step.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int err;

	device_block_translation(dev);

	err = prepare_domain_attach_device(domain, dev);
	if (!err)
		err = dmar_domain_attach_device(to_dmar_domain(domain), dev);

	return err;
}
3643 
3644 static int intel_iommu_map(struct iommu_domain *domain,
3645 			   unsigned long iova, phys_addr_t hpa,
3646 			   size_t size, int iommu_prot, gfp_t gfp)
3647 {
3648 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3649 	u64 max_addr;
3650 	int prot = 0;
3651 
3652 	if (iommu_prot & IOMMU_READ)
3653 		prot |= DMA_PTE_READ;
3654 	if (iommu_prot & IOMMU_WRITE)
3655 		prot |= DMA_PTE_WRITE;
3656 	if (dmar_domain->set_pte_snp)
3657 		prot |= DMA_PTE_SNP;
3658 
3659 	max_addr = iova + size;
3660 	if (dmar_domain->max_addr < max_addr) {
3661 		u64 end;
3662 
3663 		/* check if minimum agaw is sufficient for mapped address */
3664 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3665 		if (end < max_addr) {
3666 			pr_err("%s: iommu width (%d) is not "
3667 			       "sufficient for the mapped address (%llx)\n",
3668 			       __func__, dmar_domain->gaw, max_addr);
3669 			return -EFAULT;
3670 		}
3671 		dmar_domain->max_addr = max_addr;
3672 	}
3673 	/* Round up size to next multiple of PAGE_SIZE, if it and
3674 	   the low bits of hpa would take us onto the next page */
3675 	size = aligned_nrpages(hpa, size);
3676 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3677 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3678 }
3679 
3680 static int intel_iommu_map_pages(struct iommu_domain *domain,
3681 				 unsigned long iova, phys_addr_t paddr,
3682 				 size_t pgsize, size_t pgcount,
3683 				 int prot, gfp_t gfp, size_t *mapped)
3684 {
3685 	unsigned long pgshift = __ffs(pgsize);
3686 	size_t size = pgcount << pgshift;
3687 	int ret;
3688 
3689 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3690 		return -EINVAL;
3691 
3692 	if (!IS_ALIGNED(iova | paddr, pgsize))
3693 		return -EINVAL;
3694 
3695 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3696 	if (!ret && mapped)
3697 		*mapped = size;
3698 
3699 	return ret;
3700 }
3701 
3702 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3703 				unsigned long iova, size_t size,
3704 				struct iommu_iotlb_gather *gather)
3705 {
3706 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3707 	unsigned long start_pfn, last_pfn;
3708 	int level = 0;
3709 
3710 	/* Cope with horrid API which requires us to unmap more than the
3711 	   size argument if it happens to be a large-page mapping. */
3712 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3713 				     &level, GFP_ATOMIC)))
3714 		return 0;
3715 
3716 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3717 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3718 
3719 	start_pfn = iova >> VTD_PAGE_SHIFT;
3720 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3721 
3722 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3723 
3724 	if (dmar_domain->max_addr == iova + size)
3725 		dmar_domain->max_addr = iova;
3726 
3727 	/*
3728 	 * We do not use page-selective IOTLB invalidation in flush queue,
3729 	 * so there is no need to track page and sync iotlb.
3730 	 */
3731 	if (!iommu_iotlb_gather_queued(gather))
3732 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3733 
3734 	return size;
3735 }
3736 
3737 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3738 				      unsigned long iova,
3739 				      size_t pgsize, size_t pgcount,
3740 				      struct iommu_iotlb_gather *gather)
3741 {
3742 	unsigned long pgshift = __ffs(pgsize);
3743 	size_t size = pgcount << pgshift;
3744 
3745 	return intel_iommu_unmap(domain, iova, size, gather);
3746 }
3747 
3748 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3749 				 struct iommu_iotlb_gather *gather)
3750 {
3751 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3752 			      gather->end, list_empty(&gather->freelist));
3753 	iommu_put_pages_list(&gather->freelist);
3754 }
3755 
3756 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3757 					    dma_addr_t iova)
3758 {
3759 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3760 	struct dma_pte *pte;
3761 	int level = 0;
3762 	u64 phys = 0;
3763 
3764 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3765 			     GFP_ATOMIC);
3766 	if (pte && dma_pte_present(pte))
3767 		phys = dma_pte_addr(pte) +
3768 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3769 						VTD_PAGE_SHIFT) - 1));
3770 
3771 	return phys;
3772 }
3773 
3774 static bool domain_support_force_snooping(struct dmar_domain *domain)
3775 {
3776 	struct device_domain_info *info;
3777 	bool support = true;
3778 
3779 	assert_spin_locked(&domain->lock);
3780 	list_for_each_entry(info, &domain->devices, link) {
3781 		if (!ecap_sc_support(info->iommu->ecap)) {
3782 			support = false;
3783 			break;
3784 		}
3785 	}
3786 
3787 	return support;
3788 }
3789 
3790 static void domain_set_force_snooping(struct dmar_domain *domain)
3791 {
3792 	struct device_domain_info *info;
3793 
3794 	assert_spin_locked(&domain->lock);
3795 	/*
3796 	 * Second level page table supports per-PTE snoop control. The
3797 	 * iommu_map() interface will handle this by setting SNP bit.
3798 	 */
3799 	if (!domain->use_first_level) {
3800 		domain->set_pte_snp = true;
3801 		return;
3802 	}
3803 
3804 	list_for_each_entry(info, &domain->devices, link)
3805 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3806 						     IOMMU_NO_PASID);
3807 }
3808 
3809 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3810 {
3811 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3812 	unsigned long flags;
3813 
3814 	if (dmar_domain->force_snooping)
3815 		return true;
3816 
3817 	spin_lock_irqsave(&dmar_domain->lock, flags);
3818 	if (!domain_support_force_snooping(dmar_domain) ||
3819 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3820 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3821 		return false;
3822 	}
3823 
3824 	domain_set_force_snooping(dmar_domain);
3825 	dmar_domain->force_snooping = true;
3826 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3827 
3828 	return true;
3829 }
3830 
3831 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3832 {
3833 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3834 
3835 	switch (cap) {
3836 	case IOMMU_CAP_CACHE_COHERENCY:
3837 	case IOMMU_CAP_DEFERRED_FLUSH:
3838 		return true;
3839 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3840 		return dmar_platform_optin();
3841 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3842 		return ecap_sc_support(info->iommu->ecap);
3843 	case IOMMU_CAP_DIRTY_TRACKING:
3844 		return ssads_supported(info->iommu);
3845 	default:
3846 		return false;
3847 	}
3848 }
3849 
3850 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3851 {
3852 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3853 	struct device_domain_info *info;
3854 	struct intel_iommu *iommu;
3855 	u8 bus, devfn;
3856 	int ret;
3857 
3858 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3859 	if (!iommu || !iommu->iommu.ops)
3860 		return ERR_PTR(-ENODEV);
3861 
3862 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3863 	if (!info)
3864 		return ERR_PTR(-ENOMEM);
3865 
3866 	if (dev_is_real_dma_subdevice(dev)) {
3867 		info->bus = pdev->bus->number;
3868 		info->devfn = pdev->devfn;
3869 		info->segment = pci_domain_nr(pdev->bus);
3870 	} else {
3871 		info->bus = bus;
3872 		info->devfn = devfn;
3873 		info->segment = iommu->segment;
3874 	}
3875 
3876 	info->dev = dev;
3877 	info->iommu = iommu;
3878 	if (dev_is_pci(dev)) {
3879 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3880 		    pci_ats_supported(pdev) &&
3881 		    dmar_ats_supported(pdev, iommu)) {
3882 			info->ats_supported = 1;
3883 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3884 
3885 			/*
3886 			 * For IOMMU that supports device IOTLB throttling
3887 			 * (DIT), we assign PFSID to the invalidation desc
3888 			 * of a VF such that IOMMU HW can gauge queue depth
3889 			 * at PF level. If DIT is not set, PFSID will be
3890 			 * treated as reserved, which should be set to 0.
3891 			 */
3892 			if (ecap_dit(iommu->ecap))
3893 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3894 			info->ats_qdep = pci_ats_queue_depth(pdev);
3895 		}
3896 		if (sm_supported(iommu)) {
3897 			if (pasid_supported(iommu)) {
3898 				int features = pci_pasid_features(pdev);
3899 
3900 				if (features >= 0)
3901 					info->pasid_supported = features | 1;
3902 			}
3903 
3904 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3905 			    pci_pri_supported(pdev))
3906 				info->pri_supported = 1;
3907 		}
3908 	}
3909 
3910 	dev_iommu_priv_set(dev, info);
3911 	if (pdev && pci_ats_supported(pdev)) {
3912 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3913 		ret = device_rbtree_insert(iommu, info);
3914 		if (ret)
3915 			goto free;
3916 	}
3917 
3918 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3919 		ret = intel_pasid_alloc_table(dev);
3920 		if (ret) {
3921 			dev_err(dev, "PASID table allocation failed\n");
3922 			goto clear_rbtree;
3923 		}
3924 
3925 		if (!context_copied(iommu, info->bus, info->devfn)) {
3926 			ret = intel_pasid_setup_sm_context(dev);
3927 			if (ret)
3928 				goto free_table;
3929 		}
3930 	}
3931 
3932 	intel_iommu_debugfs_create_dev(info);
3933 
3934 	/*
3935 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3936 	 * device is undefined if you enable PASID support after ATS support.
3937 	 * So always enable PASID support on devices which have it, even if
3938 	 * we can't yet know if we're ever going to use it.
3939 	 */
3940 	if (info->pasid_supported &&
3941 	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3942 		info->pasid_enabled = 1;
3943 
3944 	return &iommu->iommu;
3945 free_table:
3946 	intel_pasid_free_table(dev);
3947 clear_rbtree:
3948 	device_rbtree_remove(info);
3949 free:
3950 	kfree(info);
3951 
3952 	return ERR_PTR(ret);
3953 }
3954 
3955 static void intel_iommu_release_device(struct device *dev)
3956 {
3957 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3958 	struct intel_iommu *iommu = info->iommu;
3959 
3960 	if (info->pasid_enabled) {
3961 		pci_disable_pasid(to_pci_dev(dev));
3962 		info->pasid_enabled = 0;
3963 	}
3964 
3965 	mutex_lock(&iommu->iopf_lock);
3966 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3967 		device_rbtree_remove(info);
3968 	mutex_unlock(&iommu->iopf_lock);
3969 
3970 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3971 	    !context_copied(iommu, info->bus, info->devfn))
3972 		intel_pasid_teardown_sm_context(dev);
3973 
3974 	intel_pasid_free_table(dev);
3975 	intel_iommu_debugfs_remove_dev(info);
3976 	kfree(info);
3977 	set_dma_ops(dev, NULL);
3978 }
3979 
3980 static void intel_iommu_get_resv_regions(struct device *device,
3981 					 struct list_head *head)
3982 {
3983 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3984 	struct iommu_resv_region *reg;
3985 	struct dmar_rmrr_unit *rmrr;
3986 	struct device *i_dev;
3987 	int i;
3988 
3989 	rcu_read_lock();
3990 	for_each_rmrr_units(rmrr) {
3991 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3992 					  i, i_dev) {
3993 			struct iommu_resv_region *resv;
3994 			enum iommu_resv_type type;
3995 			size_t length;
3996 
3997 			if (i_dev != device &&
3998 			    !is_downstream_to_pci_bridge(device, i_dev))
3999 				continue;
4000 
4001 			length = rmrr->end_address - rmrr->base_address + 1;
4002 
4003 			type = device_rmrr_is_relaxable(device) ?
4004 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4005 
4006 			resv = iommu_alloc_resv_region(rmrr->base_address,
4007 						       length, prot, type,
4008 						       GFP_ATOMIC);
4009 			if (!resv)
4010 				break;
4011 
4012 			list_add_tail(&resv->list, head);
4013 		}
4014 	}
4015 	rcu_read_unlock();
4016 
4017 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4018 	if (dev_is_pci(device)) {
4019 		struct pci_dev *pdev = to_pci_dev(device);
4020 
4021 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4022 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4023 					IOMMU_RESV_DIRECT_RELAXABLE,
4024 					GFP_KERNEL);
4025 			if (reg)
4026 				list_add_tail(&reg->list, head);
4027 		}
4028 	}
4029 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4030 
4031 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4032 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4033 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4034 	if (!reg)
4035 		return;
4036 	list_add_tail(&reg->list, head);
4037 }
4038 
/*
 * Pick the IOMMU group for @dev: PCI devices go through pci_device_group(),
 * everything else through generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4045 
4046 static int intel_iommu_enable_sva(struct device *dev)
4047 {
4048 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4049 	struct intel_iommu *iommu;
4050 
4051 	if (!info || dmar_disabled)
4052 		return -EINVAL;
4053 
4054 	iommu = info->iommu;
4055 	if (!iommu)
4056 		return -EINVAL;
4057 
4058 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4059 		return -ENODEV;
4060 
4061 	if (!info->pasid_enabled || !info->ats_enabled)
4062 		return -EINVAL;
4063 
4064 	/*
4065 	 * Devices having device-specific I/O fault handling should not
4066 	 * support PCI/PRI. The IOMMU side has no means to check the
4067 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4068 	 * default that if the device driver enables SVA on a non-PRI
4069 	 * device, it will handle IOPF in its own way.
4070 	 */
4071 	if (!info->pri_supported)
4072 		return 0;
4073 
4074 	/* Devices supporting PRI should have it enabled. */
4075 	if (!info->pri_enabled)
4076 		return -EINVAL;
4077 
4078 	return 0;
4079 }
4080 
4081 static int context_flip_pri(struct device_domain_info *info, bool enable)
4082 {
4083 	struct intel_iommu *iommu = info->iommu;
4084 	u8 bus = info->bus, devfn = info->devfn;
4085 	struct context_entry *context;
4086 	u16 did;
4087 
4088 	spin_lock(&iommu->lock);
4089 	if (context_copied(iommu, bus, devfn)) {
4090 		spin_unlock(&iommu->lock);
4091 		return -EINVAL;
4092 	}
4093 
4094 	context = iommu_context_addr(iommu, bus, devfn, false);
4095 	if (!context || !context_present(context)) {
4096 		spin_unlock(&iommu->lock);
4097 		return -ENODEV;
4098 	}
4099 	did = context_domain_id(context);
4100 
4101 	if (enable)
4102 		context_set_sm_pre(context);
4103 	else
4104 		context_clear_sm_pre(context);
4105 
4106 	if (!ecap_coherent(iommu->ecap))
4107 		clflush_cache_range(context, sizeof(*context));
4108 	intel_context_flush_present(info, context, did, true);
4109 	spin_unlock(&iommu->lock);
4110 
4111 	return 0;
4112 }
4113 
4114 static int intel_iommu_enable_iopf(struct device *dev)
4115 {
4116 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4117 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4118 	struct intel_iommu *iommu;
4119 	int ret;
4120 
4121 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4122 		return -ENODEV;
4123 
4124 	if (info->pri_enabled)
4125 		return -EBUSY;
4126 
4127 	iommu = info->iommu;
4128 	if (!iommu)
4129 		return -EINVAL;
4130 
4131 	/* PASID is required in PRG Response Message. */
4132 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4133 		return -EINVAL;
4134 
4135 	ret = pci_reset_pri(pdev);
4136 	if (ret)
4137 		return ret;
4138 
4139 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4140 	if (ret)
4141 		return ret;
4142 
4143 	ret = context_flip_pri(info, true);
4144 	if (ret)
4145 		goto err_remove_device;
4146 
4147 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4148 	if (ret)
4149 		goto err_clear_pri;
4150 
4151 	info->pri_enabled = 1;
4152 
4153 	return 0;
4154 err_clear_pri:
4155 	context_flip_pri(info, false);
4156 err_remove_device:
4157 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4158 
4159 	return ret;
4160 }
4161 
4162 static int intel_iommu_disable_iopf(struct device *dev)
4163 {
4164 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4165 	struct intel_iommu *iommu = info->iommu;
4166 
4167 	if (!info->pri_enabled)
4168 		return -EINVAL;
4169 
4170 	/* Disable new PRI reception: */
4171 	context_flip_pri(info, false);
4172 
4173 	/*
4174 	 * Remove device from fault queue and acknowledge all outstanding
4175 	 * PRQs to the device:
4176 	 */
4177 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4178 
4179 	/*
4180 	 * PCIe spec states that by clearing PRI enable bit, the Page
4181 	 * Request Interface will not issue new page requests, but has
4182 	 * outstanding page requests that have been transmitted or are
4183 	 * queued for transmission. This is supposed to be called after
4184 	 * the device driver has stopped DMA, all PASIDs have been
4185 	 * unbound and the outstanding PRQs have been drained.
4186 	 */
4187 	pci_disable_pri(to_pci_dev(dev));
4188 	info->pri_enabled = 0;
4189 
4190 	return 0;
4191 }
4192 
4193 static int
4194 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4195 {
4196 	switch (feat) {
4197 	case IOMMU_DEV_FEAT_IOPF:
4198 		return intel_iommu_enable_iopf(dev);
4199 
4200 	case IOMMU_DEV_FEAT_SVA:
4201 		return intel_iommu_enable_sva(dev);
4202 
4203 	default:
4204 		return -ENODEV;
4205 	}
4206 }
4207 
4208 static int
4209 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4210 {
4211 	switch (feat) {
4212 	case IOMMU_DEV_FEAT_IOPF:
4213 		return intel_iommu_disable_iopf(dev);
4214 
4215 	case IOMMU_DEV_FEAT_SVA:
4216 		return 0;
4217 
4218 	default:
4219 		return -ENODEV;
4220 	}
4221 }
4222 
4223 static bool intel_iommu_is_attach_deferred(struct device *dev)
4224 {
4225 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4226 
4227 	return translation_pre_enabled(info->iommu) && !info->domain;
4228 }
4229 
4230 /*
4231  * Check that the device does not live on an external facing PCI port that is
4232  * marked as untrusted. Such devices should not be able to apply quirks and
4233  * thus not be able to bypass the IOMMU restrictions.
4234  */
4235 static bool risky_device(struct pci_dev *pdev)
4236 {
4237 	if (pdev->untrusted) {
4238 		pci_info(pdev,
4239 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4240 			 pdev->vendor, pdev->device);
4241 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4242 		return true;
4243 	}
4244 	return false;
4245 }
4246 
4247 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4248 				      unsigned long iova, size_t size)
4249 {
4250 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4251 
4252 	return 0;
4253 }
4254 
4255 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4256 					 struct iommu_domain *domain)
4257 {
4258 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4259 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4260 	struct intel_iommu *iommu = info->iommu;
4261 	struct dmar_domain *dmar_domain;
4262 	unsigned long flags;
4263 
4264 	if (domain->type == IOMMU_DOMAIN_IDENTITY) {
4265 		intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4266 		return;
4267 	}
4268 
4269 	dmar_domain = to_dmar_domain(domain);
4270 	spin_lock_irqsave(&dmar_domain->lock, flags);
4271 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4272 		if (curr->dev == dev && curr->pasid == pasid) {
4273 			list_del(&curr->link_domain);
4274 			dev_pasid = curr;
4275 			break;
4276 		}
4277 	}
4278 	WARN_ON_ONCE(!dev_pasid);
4279 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4280 
4281 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4282 	domain_detach_iommu(dmar_domain, iommu);
4283 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4284 	kfree(dev_pasid);
4285 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4286 	intel_drain_pasid_prq(dev, pasid);
4287 }
4288 
4289 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4290 				     struct device *dev, ioasid_t pasid)
4291 {
4292 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4293 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4294 	struct intel_iommu *iommu = info->iommu;
4295 	struct dev_pasid_info *dev_pasid;
4296 	unsigned long flags;
4297 	int ret;
4298 
4299 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4300 		return -EOPNOTSUPP;
4301 
4302 	if (domain->dirty_ops)
4303 		return -EINVAL;
4304 
4305 	if (context_copied(iommu, info->bus, info->devfn))
4306 		return -EBUSY;
4307 
4308 	ret = prepare_domain_attach_device(domain, dev);
4309 	if (ret)
4310 		return ret;
4311 
4312 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4313 	if (!dev_pasid)
4314 		return -ENOMEM;
4315 
4316 	ret = domain_attach_iommu(dmar_domain, iommu);
4317 	if (ret)
4318 		goto out_free;
4319 
4320 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4321 	if (ret)
4322 		goto out_detach_iommu;
4323 
4324 	if (dmar_domain->use_first_level)
4325 		ret = domain_setup_first_level(iommu, dmar_domain,
4326 					       dev, pasid);
4327 	else
4328 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4329 						     dev, pasid);
4330 	if (ret)
4331 		goto out_unassign_tag;
4332 
4333 	dev_pasid->dev = dev;
4334 	dev_pasid->pasid = pasid;
4335 	spin_lock_irqsave(&dmar_domain->lock, flags);
4336 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4337 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4338 
4339 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4340 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4341 
4342 	return 0;
4343 out_unassign_tag:
4344 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4345 out_detach_iommu:
4346 	domain_detach_iommu(dmar_domain, iommu);
4347 out_free:
4348 	kfree(dev_pasid);
4349 	return ret;
4350 }
4351 
4352 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4353 {
4354 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4355 	struct intel_iommu *iommu = info->iommu;
4356 	struct iommu_hw_info_vtd *vtd;
4357 
4358 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4359 	if (!vtd)
4360 		return ERR_PTR(-ENOMEM);
4361 
4362 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4363 	vtd->cap_reg = iommu->cap;
4364 	vtd->ecap_reg = iommu->ecap;
4365 	*length = sizeof(*vtd);
4366 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4367 	return vtd;
4368 }
4369 
4370 /*
4371  * Set dirty tracking for the device list of a domain. The caller must
4372  * hold the domain->lock when calling it.
4373  */
4374 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4375 {
4376 	struct device_domain_info *info;
4377 	int ret = 0;
4378 
4379 	list_for_each_entry(info, devices, link) {
4380 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4381 						       IOMMU_NO_PASID, enable);
4382 		if (ret)
4383 			break;
4384 	}
4385 
4386 	return ret;
4387 }
4388 
4389 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4390 					    bool enable)
4391 {
4392 	struct dmar_domain *s1_domain;
4393 	unsigned long flags;
4394 	int ret;
4395 
4396 	spin_lock(&domain->s1_lock);
4397 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4398 		spin_lock_irqsave(&s1_domain->lock, flags);
4399 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4400 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4401 		if (ret)
4402 			goto err_unwind;
4403 	}
4404 	spin_unlock(&domain->s1_lock);
4405 	return 0;
4406 
4407 err_unwind:
4408 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4409 		spin_lock_irqsave(&s1_domain->lock, flags);
4410 		device_set_dirty_tracking(&s1_domain->devices,
4411 					  domain->dirty_tracking);
4412 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4413 	}
4414 	spin_unlock(&domain->s1_lock);
4415 	return ret;
4416 }
4417 
4418 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4419 					  bool enable)
4420 {
4421 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4422 	int ret;
4423 
4424 	spin_lock(&dmar_domain->lock);
4425 	if (dmar_domain->dirty_tracking == enable)
4426 		goto out_unlock;
4427 
4428 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4429 	if (ret)
4430 		goto err_unwind;
4431 
4432 	if (dmar_domain->nested_parent) {
4433 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4434 		if (ret)
4435 			goto err_unwind;
4436 	}
4437 
4438 	dmar_domain->dirty_tracking = enable;
4439 out_unlock:
4440 	spin_unlock(&dmar_domain->lock);
4441 
4442 	return 0;
4443 
4444 err_unwind:
4445 	device_set_dirty_tracking(&dmar_domain->devices,
4446 				  dmar_domain->dirty_tracking);
4447 	spin_unlock(&dmar_domain->lock);
4448 	return ret;
4449 }
4450 
4451 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4452 					    unsigned long iova, size_t size,
4453 					    unsigned long flags,
4454 					    struct iommu_dirty_bitmap *dirty)
4455 {
4456 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4457 	unsigned long end = iova + size - 1;
4458 	unsigned long pgsize;
4459 
4460 	/*
4461 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4462 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4463 	 * have occurred when we stopped dirty tracking. This ensures that we
4464 	 * never inherit dirtied bits from a previous cycle.
4465 	 */
4466 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4467 		return -EINVAL;
4468 
4469 	do {
4470 		struct dma_pte *pte;
4471 		int lvl = 0;
4472 
4473 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4474 				     GFP_ATOMIC);
4475 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4476 		if (!pte || !dma_pte_present(pte)) {
4477 			iova += pgsize;
4478 			continue;
4479 		}
4480 
4481 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4482 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4483 		iova += pgsize;
4484 	} while (iova < end);
4485 
4486 	return 0;
4487 }
4488 
4489 static const struct iommu_dirty_ops intel_dirty_ops = {
4490 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4491 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4492 };
4493 
4494 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4495 {
4496 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4497 	struct intel_iommu *iommu = info->iommu;
4498 	struct context_entry *context;
4499 
4500 	spin_lock(&iommu->lock);
4501 	context = iommu_context_addr(iommu, bus, devfn, 1);
4502 	if (!context) {
4503 		spin_unlock(&iommu->lock);
4504 		return -ENOMEM;
4505 	}
4506 
4507 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4508 		spin_unlock(&iommu->lock);
4509 		return 0;
4510 	}
4511 
4512 	copied_context_tear_down(iommu, context, bus, devfn);
4513 	context_clear_entry(context);
4514 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4515 
4516 	/*
4517 	 * In pass through mode, AW must be programmed to indicate the largest
4518 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4519 	 */
4520 	context_set_address_width(context, iommu->msagaw);
4521 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4522 	context_set_fault_enable(context);
4523 	context_set_present(context);
4524 	if (!ecap_coherent(iommu->ecap))
4525 		clflush_cache_range(context, sizeof(*context));
4526 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4527 	spin_unlock(&iommu->lock);
4528 
4529 	return 0;
4530 }
4531 
4532 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4533 {
4534 	struct device *dev = data;
4535 
4536 	if (dev != &pdev->dev)
4537 		return 0;
4538 
4539 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4540 }
4541 
4542 static int device_setup_pass_through(struct device *dev)
4543 {
4544 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4545 
4546 	if (!dev_is_pci(dev))
4547 		return context_setup_pass_through(dev, info->bus, info->devfn);
4548 
4549 	return pci_for_each_dma_alias(to_pci_dev(dev),
4550 				      context_setup_pass_through_cb, dev);
4551 }
4552 
4553 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4554 {
4555 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4556 	struct intel_iommu *iommu = info->iommu;
4557 	int ret;
4558 
4559 	device_block_translation(dev);
4560 
4561 	if (dev_is_real_dma_subdevice(dev))
4562 		return 0;
4563 
4564 	if (sm_supported(iommu)) {
4565 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4566 		if (!ret)
4567 			iommu_enable_pci_caps(info);
4568 	} else {
4569 		ret = device_setup_pass_through(dev);
4570 	}
4571 
4572 	return ret;
4573 }
4574 
4575 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4576 					 struct device *dev, ioasid_t pasid)
4577 {
4578 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4579 	struct intel_iommu *iommu = info->iommu;
4580 
4581 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4582 		return -EOPNOTSUPP;
4583 
4584 	return intel_pasid_setup_pass_through(iommu, dev, pasid);
4585 }
4586 
4587 static struct iommu_domain identity_domain = {
4588 	.type = IOMMU_DOMAIN_IDENTITY,
4589 	.ops = &(const struct iommu_domain_ops) {
4590 		.attach_dev	= identity_domain_attach_dev,
4591 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4592 	},
4593 };
4594 
4595 const struct iommu_ops intel_iommu_ops = {
4596 	.blocked_domain		= &blocking_domain,
4597 	.release_domain		= &blocking_domain,
4598 	.identity_domain	= &identity_domain,
4599 	.capable		= intel_iommu_capable,
4600 	.hw_info		= intel_iommu_hw_info,
4601 	.domain_alloc		= intel_iommu_domain_alloc,
4602 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4603 	.domain_alloc_sva	= intel_svm_domain_alloc,
4604 	.probe_device		= intel_iommu_probe_device,
4605 	.release_device		= intel_iommu_release_device,
4606 	.get_resv_regions	= intel_iommu_get_resv_regions,
4607 	.device_group		= intel_iommu_device_group,
4608 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4609 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4610 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4611 	.def_domain_type	= device_def_domain_type,
4612 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4613 	.pgsize_bitmap		= SZ_4K,
4614 #ifdef CONFIG_INTEL_IOMMU_SVM
4615 	.page_response		= intel_svm_page_response,
4616 #endif
4617 	.default_domain_ops = &(const struct iommu_domain_ops) {
4618 		.attach_dev		= intel_iommu_attach_device,
4619 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4620 		.map_pages		= intel_iommu_map_pages,
4621 		.unmap_pages		= intel_iommu_unmap_pages,
4622 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4623 		.flush_iotlb_all        = intel_flush_iotlb_all,
4624 		.iotlb_sync		= intel_iommu_tlb_sync,
4625 		.iova_to_phys		= intel_iommu_iova_to_phys,
4626 		.free			= intel_iommu_domain_free,
4627 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4628 	}
4629 };
4630 
4631 static void quirk_iommu_igfx(struct pci_dev *dev)
4632 {
4633 	if (risky_device(dev))
4634 		return;
4635 
4636 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4637 	disable_igfx_iommu = 1;
4638 }
4639 
4640 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4648 
4649 /* Broadwell igfx malfunctions with dmar */
4650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4674 
4675 static void quirk_iommu_rwbf(struct pci_dev *dev)
4676 {
4677 	if (risky_device(dev))
4678 		return;
4679 
4680 	/*
4681 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4682 	 * but needs it. Same seems to hold for the desktop versions.
4683 	 */
4684 	pci_info(dev, "Forcing write-buffer flush capability\n");
4685 	rwbf_quirk = 1;
4686 }
4687 
4688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4695 
4696 #define GGC 0x52
4697 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4698 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4699 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4700 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4701 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4702 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4703 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4704 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4705 
4706 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4707 {
4708 	unsigned short ggc;
4709 
4710 	if (risky_device(dev))
4711 		return;
4712 
4713 	if (pci_read_config_word(dev, GGC, &ggc))
4714 		return;
4715 
4716 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4717 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4718 		disable_igfx_iommu = 1;
4719 	} else if (!disable_igfx_iommu) {
4720 		/* we have to ensure the gfx device is idle before we flush */
4721 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4722 		iommu_set_dma_strict();
4723 	}
4724 }
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4729 
4730 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4731 {
4732 	unsigned short ver;
4733 
4734 	if (!IS_GFX_DEVICE(dev))
4735 		return;
4736 
4737 	ver = (dev->device >> 8) & 0xff;
4738 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4739 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4740 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4741 		return;
4742 
4743 	if (risky_device(dev))
4744 		return;
4745 
4746 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4747 	iommu_skip_te_disable = 1;
4748 }
4749 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4750 
4751 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4752    ISOCH DMAR unit for the Azalia sound device, but not give it any
4753    TLB entries, which causes it to deadlock. Check for that.  We do
4754    this in a function called from init_dmars(), instead of in a PCI
4755    quirk, because we don't want to print the obnoxious "BIOS broken"
4756    message if VT-d is actually disabled.
4757 */
4758 static void __init check_tylersburg_isoch(void)
4759 {
4760 	struct pci_dev *pdev;
4761 	uint32_t vtisochctrl;
4762 
4763 	/* If there's no Azalia in the system anyway, forget it. */
4764 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4765 	if (!pdev)
4766 		return;
4767 
4768 	if (risky_device(pdev)) {
4769 		pci_dev_put(pdev);
4770 		return;
4771 	}
4772 
4773 	pci_dev_put(pdev);
4774 
4775 	/* System Management Registers. Might be hidden, in which case
4776 	   we can't do the sanity check. But that's OK, because the
4777 	   known-broken BIOSes _don't_ actually hide it, so far. */
4778 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4779 	if (!pdev)
4780 		return;
4781 
4782 	if (risky_device(pdev)) {
4783 		pci_dev_put(pdev);
4784 		return;
4785 	}
4786 
4787 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4788 		pci_dev_put(pdev);
4789 		return;
4790 	}
4791 
4792 	pci_dev_put(pdev);
4793 
4794 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4795 	if (vtisochctrl & 1)
4796 		return;
4797 
4798 	/* Drop all bits other than the number of TLB entries */
4799 	vtisochctrl &= 0x1c;
4800 
4801 	/* If we have the recommended number of TLB entries (16), fine. */
4802 	if (vtisochctrl == 0x10)
4803 		return;
4804 
4805 	/* Zero TLB entries? You get to ride the short bus to school. */
4806 	if (!vtisochctrl) {
4807 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4808 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4809 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4810 		     dmi_get_system_info(DMI_BIOS_VERSION),
4811 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4812 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4813 		return;
4814 	}
4815 
4816 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4817 	       vtisochctrl);
4818 }
4819 
4820 /*
4821  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4822  * invalidation completion before posted writes initiated with translated address
4823  * that utilized translations matching the invalidation address range, violating
4824  * the invalidation completion ordering.
4825  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4826  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4827  * under the control of the trusted/privileged host device driver must use this
4828  * quirk.
4829  * Device TLBs are invalidated under the following six conditions:
4830  * 1. Device driver does DMA API unmap IOVA
4831  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4832  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4833  *    exit_mmap() due to crash
4834  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4835  *    VM has to free pages that were unmapped
4836  * 5. Userspace driver unmaps a DMA buffer
4837  * 6. Cache invalidation in vSVA usage (upcoming)
4838  *
4839  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4840  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4841  * invalidate TLB the same way as normal user unmap which will use this quirk.
4842  * The dTLB invalidation after PASID cache flush does not need this quirk.
4843  *
4844  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4845  */
4846 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4847 			       unsigned long address, unsigned long mask,
4848 			       u32 pasid, u16 qdep)
4849 {
4850 	u16 sid;
4851 
4852 	if (likely(!info->dtlb_extra_inval))
4853 		return;
4854 
4855 	sid = PCI_DEVID(info->bus, info->devfn);
4856 	if (pasid == IOMMU_NO_PASID) {
4857 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4858 				   qdep, address, mask);
4859 	} else {
4860 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4861 					 pasid, qdep, address, mask);
4862 	}
4863 }
4864 
4865 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4866 
4867 /*
4868  * Function to submit a command to the enhanced command interface. The
4869  * valid enhanced command descriptions are defined in Table 47 of the
4870  * VT-d spec. The VT-d hardware implementation may support some but not
4871  * all commands, which can be determined by checking the Enhanced
4872  * Command Capability Register.
4873  *
4874  * Return values:
4875  *  - 0: Command successful without any error;
4876  *  - Negative: software error value;
4877  *  - Nonzero positive: failure status code defined in Table 48.
4878  */
4879 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4880 {
4881 	unsigned long flags;
4882 	u64 res;
4883 	int ret;
4884 
4885 	if (!cap_ecmds(iommu->cap))
4886 		return -ENODEV;
4887 
4888 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4889 
4890 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4891 	if (res & DMA_ECMD_ECRSP_IP) {
4892 		ret = -EBUSY;
4893 		goto err;
4894 	}
4895 
4896 	/*
4897 	 * Unconditionally write the operand B, because
4898 	 * - There is no side effect if an ecmd doesn't require an
4899 	 *   operand B, but we set the register to some value.
4900 	 * - It's not invoked in any critical path. The extra MMIO
4901 	 *   write doesn't bring any performance concerns.
4902 	 */
4903 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4904 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4905 
4906 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4907 		      !(res & DMA_ECMD_ECRSP_IP), res);
4908 
4909 	if (res & DMA_ECMD_ECRSP_IP) {
4910 		ret = -ETIMEDOUT;
4911 		goto err;
4912 	}
4913 
4914 	ret = ecmd_get_status_code(res);
4915 err:
4916 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4917 
4918 	return ret;
4919 }
4920