xref: /linux/drivers/iommu/intel/iommu.c (revision 90d32e92011eaae8e70a9169b4e7acf4ca8f9d3a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
/*
 * PCI device classification helpers. Each takes a pointer expression to a
 * struct pci_dev. The argument is parenthesized everywhere so that compound
 * expressions (e.g. "p + 1") expand with the intended precedence.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

/* Address width that iommu_calculate_agaw() starts its search from. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/*
 * Unclamped limits for a guest address width of @gaw bits: the highest
 * VT-d page frame number and the highest byte address, respectively.
 */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57 
/* Forward declaration; the definition is not in this part of the file. */
static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set to 1 by the "intel_iommu=tboot_noforce" boot option. */
static int intel_iommu_tboot_noforce;
/* Set to 1 by the "intel_iommu=off" boot option. */
static int no_platform_optin;

/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if its possibly the case.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
178 
/*
 * One RMRR unit: a reserved address range [base_address, end_address]
 * together with the device scope it applies to. Instances are linked on
 * dmar_rmrr_units.
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
187 
/*
 * One ATSR unit and the device scope it covers. Instances are linked on
 * dmar_atsr_units.
 */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
195 
/*
 * One SATC unit, the device scope it covers and the IOMMU it is associated
 * with. Instances are linked on dmar_satc_units.
 */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
204 
/* Global lists of the ATSR, RMRR and SATC units defined above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every entry on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
211 
static void intel_iommu_domain_free(struct iommu_domain *domain);

/*
 * Initial values come from Kconfig. The "intel_iommu=" option parser can
 * override them: "on"/"off" for dmar_disabled, "sm_on"/"sm_off" for
 * intel_iommu_sm.
 */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/*
 * Cleared by "intel_iommu=sp_off"; while 0,
 * domain_update_iommu_superpage() reports no superpage support.
 */
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;
/* Set to 1 by "intel_iommu=igfx_off". */
static int disable_igfx_iommu;

#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;
229 
230 static bool translation_pre_enabled(struct intel_iommu *iommu)
231 {
232 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
233 }
234 
235 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
236 {
237 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
240 static void init_translation_status(struct intel_iommu *iommu)
241 {
242 	u32 gsts;
243 
244 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
245 	if (gsts & DMA_GSTS_TES)
246 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
247 }
248 
249 static int __init intel_iommu_setup(char *str)
250 {
251 	if (!str)
252 		return -EINVAL;
253 
254 	while (*str) {
255 		if (!strncmp(str, "on", 2)) {
256 			dmar_disabled = 0;
257 			pr_info("IOMMU enabled\n");
258 		} else if (!strncmp(str, "off", 3)) {
259 			dmar_disabled = 1;
260 			no_platform_optin = 1;
261 			pr_info("IOMMU disabled\n");
262 		} else if (!strncmp(str, "igfx_off", 8)) {
263 			disable_igfx_iommu = 1;
264 			pr_info("Disable GFX device mapping\n");
265 		} else if (!strncmp(str, "forcedac", 8)) {
266 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
267 			iommu_dma_forcedac = true;
268 		} else if (!strncmp(str, "strict", 6)) {
269 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
270 			iommu_set_dma_strict();
271 		} else if (!strncmp(str, "sp_off", 6)) {
272 			pr_info("Disable supported super page\n");
273 			intel_iommu_superpage = 0;
274 		} else if (!strncmp(str, "sm_on", 5)) {
275 			pr_info("Enable scalable mode if hardware supports\n");
276 			intel_iommu_sm = 1;
277 		} else if (!strncmp(str, "sm_off", 6)) {
278 			pr_info("Scalable mode is disallowed\n");
279 			intel_iommu_sm = 0;
280 		} else if (!strncmp(str, "tboot_noforce", 13)) {
281 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
282 			intel_iommu_tboot_noforce = 1;
283 		} else {
284 			pr_notice("Unknown option - '%s'\n", str);
285 		}
286 
287 		str += strcspn(str, ",");
288 		while (*str == ',')
289 			str++;
290 	}
291 
292 	return 1;
293 }
294 __setup("intel_iommu=", intel_iommu_setup);
295 
296 static int domain_type_is_si(struct dmar_domain *domain)
297 {
298 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
299 }
300 
301 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
302 {
303 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
304 
305 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
306 }
307 
308 /*
309  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
310  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
311  * the returned SAGAW.
312  */
313 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
314 {
315 	unsigned long fl_sagaw, sl_sagaw;
316 
317 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
318 	sl_sagaw = cap_sagaw(iommu->cap);
319 
320 	/* Second level only. */
321 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
322 		return sl_sagaw;
323 
324 	/* First level only. */
325 	if (!ecap_slts(iommu->ecap))
326 		return fl_sagaw;
327 
328 	return fl_sagaw & sl_sagaw;
329 }
330 
/*
 * Starting from the AGAW matching @max_gaw, step downward until one that
 * @iommu supports is found. Returns that AGAW, or -1 if none is supported.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long supported = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &supported))
		agaw--;

	return agaw;
}
344 
345 /*
346  * Calculate max SAGAW for each iommu.
347  */
348 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
349 {
350 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
351 }
352 
353 /*
354  * calculate agaw for each iommu.
355  * "SAGAW" may be different across iommus, use a default agaw, and
356  * get a supported less agaw for iommus that don't support the default agaw.
357  */
358 int iommu_calculate_agaw(struct intel_iommu *iommu)
359 {
360 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
361 }
362 
363 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
364 {
365 	return sm_supported(iommu) ?
366 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
367 }
368 
369 static void domain_update_iommu_coherency(struct dmar_domain *domain)
370 {
371 	struct iommu_domain_info *info;
372 	struct dmar_drhd_unit *drhd;
373 	struct intel_iommu *iommu;
374 	bool found = false;
375 	unsigned long i;
376 
377 	domain->iommu_coherency = true;
378 	xa_for_each(&domain->iommu_array, i, info) {
379 		found = true;
380 		if (!iommu_paging_structure_coherency(info->iommu)) {
381 			domain->iommu_coherency = false;
382 			break;
383 		}
384 	}
385 	if (found)
386 		return;
387 
388 	/* No hardware attached; use lowest common denominator */
389 	rcu_read_lock();
390 	for_each_active_iommu(iommu, drhd) {
391 		if (!iommu_paging_structure_coherency(iommu)) {
392 			domain->iommu_coherency = false;
393 			break;
394 		}
395 	}
396 	rcu_read_unlock();
397 }
398 
399 static int domain_update_iommu_superpage(struct dmar_domain *domain,
400 					 struct intel_iommu *skip)
401 {
402 	struct dmar_drhd_unit *drhd;
403 	struct intel_iommu *iommu;
404 	int mask = 0x3;
405 
406 	if (!intel_iommu_superpage)
407 		return 0;
408 
409 	/* set iommu_superpage to the smallest common denominator */
410 	rcu_read_lock();
411 	for_each_active_iommu(iommu, drhd) {
412 		if (iommu != skip) {
413 			if (domain && domain->use_first_level) {
414 				if (!cap_fl1gp_support(iommu->cap))
415 					mask = 0x1;
416 			} else {
417 				mask &= cap_super_page_val(iommu->cap);
418 			}
419 
420 			if (!mask)
421 				break;
422 		}
423 	}
424 	rcu_read_unlock();
425 
426 	return fls(mask);
427 }
428 
429 static int domain_update_device_node(struct dmar_domain *domain)
430 {
431 	struct device_domain_info *info;
432 	int nid = NUMA_NO_NODE;
433 	unsigned long flags;
434 
435 	spin_lock_irqsave(&domain->lock, flags);
436 	list_for_each_entry(info, &domain->devices, link) {
437 		/*
438 		 * There could possibly be multiple device numa nodes as devices
439 		 * within the same domain may sit behind different IOMMUs. There
440 		 * isn't perfect answer in such situation, so we select first
441 		 * come first served policy.
442 		 */
443 		nid = dev_to_node(info->dev);
444 		if (nid != NUMA_NO_NODE)
445 			break;
446 	}
447 	spin_unlock_irqrestore(&domain->lock, flags);
448 
449 	return nid;
450 }
451 
452 /* Return the super pagesize bitmap if supported. */
453 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
454 {
455 	unsigned long bitmap = 0;
456 
457 	/*
458 	 * 1-level super page supports page size of 2MiB, 2-level super page
459 	 * supports page size of both 2MiB and 1GiB.
460 	 */
461 	if (domain->iommu_superpage == 1)
462 		bitmap |= SZ_2M;
463 	else if (domain->iommu_superpage == 2)
464 		bitmap |= SZ_2M | SZ_1G;
465 
466 	return bitmap;
467 }
468 
469 /* Some capabilities may be different across iommus */
470 void domain_update_iommu_cap(struct dmar_domain *domain)
471 {
472 	domain_update_iommu_coherency(domain);
473 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
474 
475 	/*
476 	 * If RHSA is missing, we should default to the device numa domain
477 	 * as fall back.
478 	 */
479 	if (domain->nid == NUMA_NO_NODE)
480 		domain->nid = domain_update_device_node(domain);
481 
482 	/*
483 	 * First-level translation restricts the input-address to a
484 	 * canonical address (i.e., address bits 63:N have the same
485 	 * value as address bit [N-1], where N is 48-bits with 4-level
486 	 * paging and 57-bits with 5-level paging). Hence, skip bit
487 	 * [N-1].
488 	 */
489 	if (domain->use_first_level)
490 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
491 	else
492 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
493 
494 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
495 	domain_update_iotlb(domain);
496 }
497 
498 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
499 					 u8 devfn, int alloc)
500 {
501 	struct root_entry *root = &iommu->root_entry[bus];
502 	struct context_entry *context;
503 	u64 *entry;
504 
505 	/*
506 	 * Except that the caller requested to allocate a new entry,
507 	 * returning a copied context entry makes no sense.
508 	 */
509 	if (!alloc && context_copied(iommu, bus, devfn))
510 		return NULL;
511 
512 	entry = &root->lo;
513 	if (sm_supported(iommu)) {
514 		if (devfn >= 0x80) {
515 			devfn -= 0x80;
516 			entry = &root->hi;
517 		}
518 		devfn *= 2;
519 	}
520 	if (*entry & 1)
521 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
522 	else {
523 		unsigned long phy_addr;
524 		if (!alloc)
525 			return NULL;
526 
527 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
528 		if (!context)
529 			return NULL;
530 
531 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
532 		phy_addr = virt_to_phys((void *)context);
533 		*entry = phy_addr | 1;
534 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
535 	}
536 	return &context[devfn];
537 }
538 
539 /**
540  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
541  *				 sub-hierarchy of a candidate PCI-PCI bridge
542  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
543  * @bridge: the candidate PCI-PCI bridge
544  *
545  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
546  */
547 static bool
548 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
549 {
550 	struct pci_dev *pdev, *pbridge;
551 
552 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
553 		return false;
554 
555 	pdev = to_pci_dev(dev);
556 	pbridge = to_pci_dev(bridge);
557 
558 	if (pbridge->subordinate &&
559 	    pbridge->subordinate->number <= pdev->bus->number &&
560 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
561 		return true;
562 
563 	return false;
564 }
565 
566 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
567 {
568 	struct dmar_drhd_unit *drhd;
569 	u32 vtbar;
570 	int rc;
571 
572 	/* We know that this device on this chipset has its own IOMMU.
573 	 * If we find it under a different IOMMU, then the BIOS is lying
574 	 * to us. Hope that the IOMMU for this device is actually
575 	 * disabled, and it needs no translation...
576 	 */
577 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
578 	if (rc) {
579 		/* "can't" happen */
580 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
581 		return false;
582 	}
583 	vtbar &= 0xffff0000;
584 
585 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
586 	drhd = dmar_find_matched_drhd_unit(pdev);
587 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
588 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
589 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
590 		return true;
591 	}
592 
593 	return false;
594 }
595 
596 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
597 {
598 	if (!iommu || iommu->drhd->ignored)
599 		return true;
600 
601 	if (dev_is_pci(dev)) {
602 		struct pci_dev *pdev = to_pci_dev(dev);
603 
604 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
605 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
606 		    quirk_ioat_snb_local_iommu(pdev))
607 			return true;
608 	}
609 
610 	return false;
611 }
612 
/*
 * Find the IOMMU responsible for @dev by walking the device scope of every
 * DRHD unit.
 *
 * On success, and only when both @bus and @devfn are non-NULL, they are set
 * to the bus/devfn to use for the device: taken from the matching scope
 * table entry for a direct non-VF match, or from the PCI device itself
 * otherwise.
 *
 * Returns NULL when @dev is NULL, when no unit covers it, or when
 * iommu_is_dummy() rejects the unit that was found.
 */
static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		/* Scope matching below is done against the ACPI companion. */
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* A PCI device can only belong to a unit of its own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			/* The scope entry is a bridge somewhere above the device. */
			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		/*
		 * Catch-all unit for PCI devices. The got_pdev label sits inside
		 * this block so the matches above share the "report the PCI
		 * device's own bus/devfn" path.
		 */
		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* Loop ran to completion: nothing matched. */
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
682 
683 static void domain_flush_cache(struct dmar_domain *domain,
684 			       void *addr, int size)
685 {
686 	if (!domain->iommu_coherency)
687 		clflush_cache_range(addr, size);
688 }
689 
690 static void free_context_table(struct intel_iommu *iommu)
691 {
692 	struct context_entry *context;
693 	int i;
694 
695 	if (!iommu->root_entry)
696 		return;
697 
698 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
699 		context = iommu_context_addr(iommu, i, 0, 0);
700 		if (context)
701 			iommu_free_page(context);
702 
703 		if (!sm_supported(iommu))
704 			continue;
705 
706 		context = iommu_context_addr(iommu, i, 0x80, 0);
707 		if (context)
708 			iommu_free_page(context);
709 	}
710 
711 	iommu_free_page(iommu->root_entry);
712 	iommu->root_entry = NULL;
713 }
714 
715 #ifdef CONFIG_DMAR_DEBUG
716 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
717 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
718 {
719 	struct dma_pte *pte;
720 	int offset;
721 
722 	while (1) {
723 		offset = pfn_level_offset(pfn, level);
724 		pte = &parent[offset];
725 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
726 			pr_info("PTE not present at level %d\n", level);
727 			break;
728 		}
729 
730 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
731 
732 		if (level == 1)
733 			break;
734 
735 		parent = phys_to_virt(dma_pte_addr(pte));
736 		level--;
737 	}
738 }
739 
740 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
741 			  unsigned long long addr, u32 pasid)
742 {
743 	struct pasid_dir_entry *dir, *pde;
744 	struct pasid_entry *entries, *pte;
745 	struct context_entry *ctx_entry;
746 	struct root_entry *rt_entry;
747 	int i, dir_index, index, level;
748 	u8 devfn = source_id & 0xff;
749 	u8 bus = source_id >> 8;
750 	struct dma_pte *pgtable;
751 
752 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
753 
754 	/* root entry dump */
755 	rt_entry = &iommu->root_entry[bus];
756 	if (!rt_entry) {
757 		pr_info("root table entry is not present\n");
758 		return;
759 	}
760 
761 	if (sm_supported(iommu))
762 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
763 			rt_entry->hi, rt_entry->lo);
764 	else
765 		pr_info("root entry: 0x%016llx", rt_entry->lo);
766 
767 	/* context entry dump */
768 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
769 	if (!ctx_entry) {
770 		pr_info("context table entry is not present\n");
771 		return;
772 	}
773 
774 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
775 		ctx_entry->hi, ctx_entry->lo);
776 
777 	/* legacy mode does not require PASID entries */
778 	if (!sm_supported(iommu)) {
779 		level = agaw_to_level(ctx_entry->hi & 7);
780 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
781 		goto pgtable_walk;
782 	}
783 
784 	/* get the pointer to pasid directory entry */
785 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
786 	if (!dir) {
787 		pr_info("pasid directory entry is not present\n");
788 		return;
789 	}
790 	/* For request-without-pasid, get the pasid from context entry */
791 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
792 		pasid = IOMMU_NO_PASID;
793 
794 	dir_index = pasid >> PASID_PDE_SHIFT;
795 	pde = &dir[dir_index];
796 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
797 
798 	/* get the pointer to the pasid table entry */
799 	entries = get_pasid_table_from_pde(pde);
800 	if (!entries) {
801 		pr_info("pasid table entry is not present\n");
802 		return;
803 	}
804 	index = pasid & PASID_PTE_MASK;
805 	pte = &entries[index];
806 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
807 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
808 
809 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
810 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
811 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
812 	} else {
813 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
814 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
815 	}
816 
817 pgtable_walk:
818 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
819 }
820 #endif
821 
/*
 * Walk the domain's page table from the top level down and return the PTE
 * for @pfn.
 *
 * @target_level is both input and output:
 *  - If *target_level is 0, the walk stops at the first superpage or
 *    non-present PTE (or at level 1) without allocating anything, and the
 *    level reached is written back to *target_level.
 *  - Otherwise the walk descends to exactly that level, allocating any
 *    missing intermediate tables with @gfp on the way.
 *
 * Returns NULL if @pfn is outside what the domain can address or if a
 * table allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup-only mode: stop where the existing mapping ends. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval, tmp;

			tmp_page = iommu_alloc_page_node(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			/* Flush the new table before a PTE can point at it. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			/* Install only if the slot is still empty (value 0). */
			tmp = 0ULL;
			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				iommu_free_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		/* Descend into whichever table the PTE now points to. */
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	/* Report the level where a lookup-only walk ended. */
	if (!*target_level)
		*target_level = level;

	return pte;
}
878 
879 /* return address's pte at specific level */
880 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
881 					 unsigned long pfn,
882 					 int level, int *large_page)
883 {
884 	struct dma_pte *parent, *pte;
885 	int total = agaw_to_level(domain->agaw);
886 	int offset;
887 
888 	parent = domain->pgd;
889 	while (level <= total) {
890 		offset = pfn_level_offset(pfn, total);
891 		pte = &parent[offset];
892 		if (level == total)
893 			return pte;
894 
895 		if (!dma_pte_present(pte)) {
896 			*large_page = total;
897 			break;
898 		}
899 
900 		if (dma_pte_superpage(pte)) {
901 			*large_page = total;
902 			return pte;
903 		}
904 
905 		parent = phys_to_virt(dma_pte_addr(pte));
906 		total--;
907 	}
908 	return NULL;
909 }
910 
911 /* clear last level pte, a tlb flush should be followed */
912 static void dma_pte_clear_range(struct dmar_domain *domain,
913 				unsigned long start_pfn,
914 				unsigned long last_pfn)
915 {
916 	unsigned int large_page;
917 	struct dma_pte *first_pte, *pte;
918 
919 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
920 	    WARN_ON(start_pfn > last_pfn))
921 		return;
922 
923 	/* we don't need lock here; nobody else touches the iova range */
924 	do {
925 		large_page = 1;
926 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
927 		if (!pte) {
928 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
929 			continue;
930 		}
931 		do {
932 			dma_clear_pte(pte);
933 			start_pfn += lvl_to_nr_pages(large_page);
934 			pte++;
935 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
936 
937 		domain_flush_cache(domain, first_pte,
938 				   (void *)pte - (void *)first_pte);
939 
940 	} while (start_pfn && start_pfn <= last_pfn);
941 }
942 
943 static void dma_pte_free_level(struct dmar_domain *domain, int level,
944 			       int retain_level, struct dma_pte *pte,
945 			       unsigned long pfn, unsigned long start_pfn,
946 			       unsigned long last_pfn)
947 {
948 	pfn = max(start_pfn, pfn);
949 	pte = &pte[pfn_level_offset(pfn, level)];
950 
951 	do {
952 		unsigned long level_pfn;
953 		struct dma_pte *level_pte;
954 
955 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
956 			goto next;
957 
958 		level_pfn = pfn & level_mask(level);
959 		level_pte = phys_to_virt(dma_pte_addr(pte));
960 
961 		if (level > 2) {
962 			dma_pte_free_level(domain, level - 1, retain_level,
963 					   level_pte, level_pfn, start_pfn,
964 					   last_pfn);
965 		}
966 
967 		/*
968 		 * Free the page table if we're below the level we want to
969 		 * retain and the range covers the entire table.
970 		 */
971 		if (level < retain_level && !(start_pfn > level_pfn ||
972 		      last_pfn < level_pfn + level_size(level) - 1)) {
973 			dma_clear_pte(pte);
974 			domain_flush_cache(domain, pte, sizeof(*pte));
975 			iommu_free_page(level_pte);
976 		}
977 next:
978 		pfn += level_size(level);
979 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
980 }
981 
982 /*
983  * clear last level (leaf) ptes and free page table pages below the
984  * level we wish to keep intact.
985  */
986 static void dma_pte_free_pagetable(struct dmar_domain *domain,
987 				   unsigned long start_pfn,
988 				   unsigned long last_pfn,
989 				   int retain_level)
990 {
991 	dma_pte_clear_range(domain, start_pfn, last_pfn);
992 
993 	/* We don't need lock here; nobody else touches the iova range */
994 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
995 			   domain->pgd, 0, start_pfn, last_pfn);
996 
997 	/* free pgd */
998 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
999 		iommu_free_page(domain->pgd);
1000 		domain->pgd = NULL;
1001 	}
1002 }
1003 
1004 /* When a page at a given level is being unlinked from its parent, we don't
1005    need to *modify* it at all. All we need to do is make a list of all the
1006    pages which can be freed just as soon as we've flushed the IOTLB and we
1007    know the hardware page-walk will no longer touch them.
1008    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1009    be freed. */
1010 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1011 				    int level, struct dma_pte *pte,
1012 				    struct list_head *freelist)
1013 {
1014 	struct page *pg;
1015 
1016 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1017 	list_add_tail(&pg->lru, freelist);
1018 
1019 	if (level == 1)
1020 		return;
1021 
1022 	pte = page_address(pg);
1023 	do {
1024 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1025 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1026 		pte++;
1027 	} while (!first_pte_in_page(pte));
1028 }
1029 
/*
 * Clear the mappings of [@start_pfn, @last_pfn] within the table @pte at
 * @level, whose first covered page frame is @pfn.
 *
 * Entries fully covered by the range are cleared, and any table pages they
 * point to are queued on @freelist rather than freed here. Partially
 * covered entries are handled by recursing one level down. All entries
 * cleared at this level are flushed with a single cache flush at the end.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	/* Bounds of the run of PTEs cleared at this level, for the flush. */
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/* Flush from the first cleared PTE through the last one, inclusive. */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1073 
1074 /* We can't just free the pages because the IOMMU may still be walking
1075    the page tables, and may have cached the intermediate levels. The
1076    pages can only be freed after the IOTLB flush has been done. */
1077 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1078 			 unsigned long last_pfn, struct list_head *freelist)
1079 {
1080 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1081 	    WARN_ON(start_pfn > last_pfn))
1082 		return;
1083 
1084 	/* we don't need lock here; nobody else touches the iova range */
1085 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1086 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1087 
1088 	/* free pgd */
1089 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 		struct page *pgd_page = virt_to_page(domain->pgd);
1091 		list_add_tail(&pgd_page->lru, freelist);
1092 		domain->pgd = NULL;
1093 	}
1094 }
1095 
1096 /* iommu handling */
1097 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1098 {
1099 	struct root_entry *root;
1100 
1101 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1102 	if (!root) {
1103 		pr_err("Allocating root entry for %s failed\n",
1104 			iommu->name);
1105 		return -ENOMEM;
1106 	}
1107 
1108 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1109 	iommu->root_entry = root;
1110 
1111 	return 0;
1112 }
1113 
1114 static void iommu_set_root_entry(struct intel_iommu *iommu)
1115 {
1116 	u64 addr;
1117 	u32 sts;
1118 	unsigned long flag;
1119 
1120 	addr = virt_to_phys(iommu->root_entry);
1121 	if (sm_supported(iommu))
1122 		addr |= DMA_RTADDR_SMT;
1123 
1124 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1125 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1126 
1127 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1128 
1129 	/* Make sure hardware complete it */
1130 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1131 		      readl, (sts & DMA_GSTS_RTPS), sts);
1132 
1133 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1134 
1135 	/*
1136 	 * Hardware invalidates all DMA remapping hardware translation
1137 	 * caches as part of SRTP flow.
1138 	 */
1139 	if (cap_esrtps(iommu->cap))
1140 		return;
1141 
1142 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1143 	if (sm_supported(iommu))
1144 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1145 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1146 }
1147 
1148 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1149 {
1150 	u32 val;
1151 	unsigned long flag;
1152 
1153 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1154 		return;
1155 
1156 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1157 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1158 
1159 	/* Make sure hardware complete it */
1160 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1161 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1162 
1163 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1164 }
1165 
1166 /* return value determine if we need a write buffer flush */
1167 static void __iommu_flush_context(struct intel_iommu *iommu,
1168 				  u16 did, u16 source_id, u8 function_mask,
1169 				  u64 type)
1170 {
1171 	u64 val = 0;
1172 	unsigned long flag;
1173 
1174 	switch (type) {
1175 	case DMA_CCMD_GLOBAL_INVL:
1176 		val = DMA_CCMD_GLOBAL_INVL;
1177 		break;
1178 	case DMA_CCMD_DOMAIN_INVL:
1179 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1180 		break;
1181 	case DMA_CCMD_DEVICE_INVL:
1182 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1183 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1184 		break;
1185 	default:
1186 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1187 			iommu->name, type);
1188 		return;
1189 	}
1190 	val |= DMA_CCMD_ICC;
1191 
1192 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1193 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1194 
1195 	/* Make sure hardware complete it */
1196 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1197 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1198 
1199 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1200 }
1201 
1202 /* return value determine if we need a write buffer flush */
1203 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1204 				u64 addr, unsigned int size_order, u64 type)
1205 {
1206 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1207 	u64 val = 0, val_iva = 0;
1208 	unsigned long flag;
1209 
1210 	switch (type) {
1211 	case DMA_TLB_GLOBAL_FLUSH:
1212 		/* global flush doesn't need set IVA_REG */
1213 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1214 		break;
1215 	case DMA_TLB_DSI_FLUSH:
1216 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1217 		break;
1218 	case DMA_TLB_PSI_FLUSH:
1219 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1220 		/* IH bit is passed in as part of address */
1221 		val_iva = size_order | addr;
1222 		break;
1223 	default:
1224 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1225 			iommu->name, type);
1226 		return;
1227 	}
1228 
1229 	if (cap_write_drain(iommu->cap))
1230 		val |= DMA_TLB_WRITE_DRAIN;
1231 
1232 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233 	/* Note: Only uses first TLB reg currently */
1234 	if (val_iva)
1235 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1236 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1237 
1238 	/* Make sure hardware complete it */
1239 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1240 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1241 
1242 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243 
1244 	/* check IOTLB invalidation granularity */
1245 	if (DMA_TLB_IAIG(val) == 0)
1246 		pr_err("Flush IOTLB failed\n");
1247 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1248 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1249 			(unsigned long long)DMA_TLB_IIRG(type),
1250 			(unsigned long long)DMA_TLB_IAIG(val));
1251 }
1252 
1253 static struct device_domain_info *
1254 domain_lookup_dev_info(struct dmar_domain *domain,
1255 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1256 {
1257 	struct device_domain_info *info;
1258 	unsigned long flags;
1259 
1260 	spin_lock_irqsave(&domain->lock, flags);
1261 	list_for_each_entry(info, &domain->devices, link) {
1262 		if (info->iommu == iommu && info->bus == bus &&
1263 		    info->devfn == devfn) {
1264 			spin_unlock_irqrestore(&domain->lock, flags);
1265 			return info;
1266 		}
1267 	}
1268 	spin_unlock_irqrestore(&domain->lock, flags);
1269 
1270 	return NULL;
1271 }
1272 
1273 void domain_update_iotlb(struct dmar_domain *domain)
1274 {
1275 	struct dev_pasid_info *dev_pasid;
1276 	struct device_domain_info *info;
1277 	bool has_iotlb_device = false;
1278 	unsigned long flags;
1279 
1280 	spin_lock_irqsave(&domain->lock, flags);
1281 	list_for_each_entry(info, &domain->devices, link) {
1282 		if (info->ats_enabled) {
1283 			has_iotlb_device = true;
1284 			break;
1285 		}
1286 	}
1287 
1288 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1289 		info = dev_iommu_priv_get(dev_pasid->dev);
1290 		if (info->ats_enabled) {
1291 			has_iotlb_device = true;
1292 			break;
1293 		}
1294 	}
1295 	domain->has_iotlb_device = has_iotlb_device;
1296 	spin_unlock_irqrestore(&domain->lock, flags);
1297 }
1298 
1299 /*
1300  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1301  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1302  * check because it applies only to the built-in QAT devices and it doesn't
1303  * grant additional privileges.
1304  */
1305 #define BUGGY_QAT_DEVID_MASK 0x4940
1306 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1307 {
1308 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1309 		return false;
1310 
1311 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1312 		return false;
1313 
1314 	return true;
1315 }
1316 
1317 static void iommu_enable_pci_caps(struct device_domain_info *info)
1318 {
1319 	struct pci_dev *pdev;
1320 
1321 	if (!dev_is_pci(info->dev))
1322 		return;
1323 
1324 	pdev = to_pci_dev(info->dev);
1325 
1326 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1327 	   the device if you enable PASID support after ATS support is
1328 	   undefined. So always enable PASID support on devices which
1329 	   have it, even if we can't yet know if we're ever going to
1330 	   use it. */
1331 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1332 		info->pasid_enabled = 1;
1333 
1334 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1335 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1336 		info->ats_enabled = 1;
1337 		domain_update_iotlb(info->domain);
1338 	}
1339 }
1340 
1341 static void iommu_disable_pci_caps(struct device_domain_info *info)
1342 {
1343 	struct pci_dev *pdev;
1344 
1345 	if (!dev_is_pci(info->dev))
1346 		return;
1347 
1348 	pdev = to_pci_dev(info->dev);
1349 
1350 	if (info->ats_enabled) {
1351 		pci_disable_ats(pdev);
1352 		info->ats_enabled = 0;
1353 		domain_update_iotlb(info->domain);
1354 	}
1355 
1356 	if (info->pasid_enabled) {
1357 		pci_disable_pasid(pdev);
1358 		info->pasid_enabled = 0;
1359 	}
1360 }
1361 
1362 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1363 				    u64 addr, unsigned int mask)
1364 {
1365 	u16 sid, qdep;
1366 
1367 	if (!info || !info->ats_enabled)
1368 		return;
1369 
1370 	sid = info->bus << 8 | info->devfn;
1371 	qdep = info->ats_qdep;
1372 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1373 			   qdep, addr, mask);
1374 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1375 }
1376 
/* Flush all cache tags of the dmar_domain backing the generic @domain. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	cache_tag_flush_all(dmar_domain);
}
1381 
1382 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1383 {
1384 	u32 pmen;
1385 	unsigned long flags;
1386 
1387 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1388 		return;
1389 
1390 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1391 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1392 	pmen &= ~DMA_PMEN_EPM;
1393 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1394 
1395 	/* wait for the protected region status bit to clear */
1396 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1397 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1398 
1399 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1400 }
1401 
1402 static void iommu_enable_translation(struct intel_iommu *iommu)
1403 {
1404 	u32 sts;
1405 	unsigned long flags;
1406 
1407 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1408 	iommu->gcmd |= DMA_GCMD_TE;
1409 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1410 
1411 	/* Make sure hardware complete it */
1412 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1413 		      readl, (sts & DMA_GSTS_TES), sts);
1414 
1415 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1416 }
1417 
1418 static void iommu_disable_translation(struct intel_iommu *iommu)
1419 {
1420 	u32 sts;
1421 	unsigned long flag;
1422 
1423 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1424 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1425 		return;
1426 
1427 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1428 	iommu->gcmd &= ~DMA_GCMD_TE;
1429 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1430 
1431 	/* Make sure hardware complete it */
1432 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1433 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1434 
1435 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1436 }
1437 
1438 static int iommu_init_domains(struct intel_iommu *iommu)
1439 {
1440 	u32 ndomains;
1441 
1442 	ndomains = cap_ndoms(iommu->cap);
1443 	pr_debug("%s: Number of Domains supported <%d>\n",
1444 		 iommu->name, ndomains);
1445 
1446 	spin_lock_init(&iommu->lock);
1447 
1448 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1449 	if (!iommu->domain_ids)
1450 		return -ENOMEM;
1451 
1452 	/*
1453 	 * If Caching mode is set, then invalid translations are tagged
1454 	 * with domain-id 0, hence we need to pre-allocate it. We also
1455 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1456 	 * make sure it is not used for a real domain.
1457 	 */
1458 	set_bit(0, iommu->domain_ids);
1459 
1460 	/*
1461 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1462 	 * entry for first-level or pass-through translation modes should
1463 	 * be programmed with a domain id different from those used for
1464 	 * second-level or nested translation. We reserve a domain id for
1465 	 * this purpose.
1466 	 */
1467 	if (sm_supported(iommu))
1468 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1469 
1470 	return 0;
1471 }
1472 
1473 static void disable_dmar_iommu(struct intel_iommu *iommu)
1474 {
1475 	if (!iommu->domain_ids)
1476 		return;
1477 
1478 	/*
1479 	 * All iommu domains must have been detached from the devices,
1480 	 * hence there should be no domain IDs in use.
1481 	 */
1482 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1483 		    > NUM_RESERVED_DID))
1484 		return;
1485 
1486 	if (iommu->gcmd & DMA_GCMD_TE)
1487 		iommu_disable_translation(iommu);
1488 }
1489 
1490 static void free_dmar_iommu(struct intel_iommu *iommu)
1491 {
1492 	if (iommu->domain_ids) {
1493 		bitmap_free(iommu->domain_ids);
1494 		iommu->domain_ids = NULL;
1495 	}
1496 
1497 	if (iommu->copied_tables) {
1498 		bitmap_free(iommu->copied_tables);
1499 		iommu->copied_tables = NULL;
1500 	}
1501 
1502 	/* free context mapping */
1503 	free_context_table(iommu);
1504 
1505 #ifdef CONFIG_INTEL_IOMMU_SVM
1506 	if (pasid_supported(iommu)) {
1507 		if (ecap_prs(iommu->ecap))
1508 			intel_svm_finish_prq(iommu);
1509 	}
1510 #endif
1511 }
1512 
1513 /*
1514  * Check and return whether first level is used by default for
1515  * DMA translation.
1516  */
1517 static bool first_level_by_default(unsigned int type)
1518 {
1519 	/* Only SL is available in legacy mode */
1520 	if (!scalable_mode_support())
1521 		return false;
1522 
1523 	/* Only level (either FL or SL) is available, just use it */
1524 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1525 		return intel_cap_flts_sanity();
1526 
1527 	/* Both levels are available, decide it based on domain type */
1528 	return type != IOMMU_DOMAIN_UNMANAGED;
1529 }
1530 
1531 static struct dmar_domain *alloc_domain(unsigned int type)
1532 {
1533 	struct dmar_domain *domain;
1534 
1535 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1536 	if (!domain)
1537 		return NULL;
1538 
1539 	domain->nid = NUMA_NO_NODE;
1540 	if (first_level_by_default(type))
1541 		domain->use_first_level = true;
1542 	domain->has_iotlb_device = false;
1543 	INIT_LIST_HEAD(&domain->devices);
1544 	INIT_LIST_HEAD(&domain->dev_pasids);
1545 	INIT_LIST_HEAD(&domain->cache_tags);
1546 	spin_lock_init(&domain->lock);
1547 	spin_lock_init(&domain->cache_lock);
1548 	xa_init(&domain->iommu_array);
1549 
1550 	return domain;
1551 }
1552 
1553 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1554 {
1555 	struct iommu_domain_info *info, *curr;
1556 	unsigned long ndomains;
1557 	int num, ret = -ENOSPC;
1558 
1559 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1560 		return 0;
1561 
1562 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1563 	if (!info)
1564 		return -ENOMEM;
1565 
1566 	spin_lock(&iommu->lock);
1567 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1568 	if (curr) {
1569 		curr->refcnt++;
1570 		spin_unlock(&iommu->lock);
1571 		kfree(info);
1572 		return 0;
1573 	}
1574 
1575 	ndomains = cap_ndoms(iommu->cap);
1576 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1577 	if (num >= ndomains) {
1578 		pr_err("%s: No free domain ids\n", iommu->name);
1579 		goto err_unlock;
1580 	}
1581 
1582 	set_bit(num, iommu->domain_ids);
1583 	info->refcnt	= 1;
1584 	info->did	= num;
1585 	info->iommu	= iommu;
1586 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1587 			  NULL, info, GFP_ATOMIC);
1588 	if (curr) {
1589 		ret = xa_err(curr) ? : -EBUSY;
1590 		goto err_clear;
1591 	}
1592 	domain_update_iommu_cap(domain);
1593 
1594 	spin_unlock(&iommu->lock);
1595 	return 0;
1596 
1597 err_clear:
1598 	clear_bit(info->did, iommu->domain_ids);
1599 err_unlock:
1600 	spin_unlock(&iommu->lock);
1601 	kfree(info);
1602 	return ret;
1603 }
1604 
1605 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1606 {
1607 	struct iommu_domain_info *info;
1608 
1609 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1610 		return;
1611 
1612 	spin_lock(&iommu->lock);
1613 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1614 	if (--info->refcnt == 0) {
1615 		clear_bit(info->did, iommu->domain_ids);
1616 		xa_erase(&domain->iommu_array, iommu->seq_id);
1617 		domain->nid = NUMA_NO_NODE;
1618 		domain_update_iommu_cap(domain);
1619 		kfree(info);
1620 	}
1621 	spin_unlock(&iommu->lock);
1622 }
1623 
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * n, capped at 64.
 *
 * @gaw: guest address width in bits
 *
 * Returns the adjusted width.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int excess = (gaw - 12) % 9;
	int agaw = excess ? gaw + 9 - excess : gaw;

	return agaw > 64 ? 64 : agaw;
}
1637 
1638 static void domain_exit(struct dmar_domain *domain)
1639 {
1640 	if (domain->pgd) {
1641 		LIST_HEAD(freelist);
1642 
1643 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1644 		iommu_put_pages_list(&freelist);
1645 	}
1646 
1647 	if (WARN_ON(!list_empty(&domain->devices)))
1648 		return;
1649 
1650 	kfree(domain);
1651 }
1652 
1653 static int domain_context_mapping_one(struct dmar_domain *domain,
1654 				      struct intel_iommu *iommu,
1655 				      u8 bus, u8 devfn)
1656 {
1657 	struct device_domain_info *info =
1658 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1659 	u16 did = domain_id_iommu(domain, iommu);
1660 	int translation = CONTEXT_TT_MULTI_LEVEL;
1661 	struct dma_pte *pgd = domain->pgd;
1662 	struct context_entry *context;
1663 	int agaw, ret;
1664 
1665 	if (hw_pass_through && domain_type_is_si(domain))
1666 		translation = CONTEXT_TT_PASS_THROUGH;
1667 
1668 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1669 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1670 
1671 	spin_lock(&iommu->lock);
1672 	ret = -ENOMEM;
1673 	context = iommu_context_addr(iommu, bus, devfn, 1);
1674 	if (!context)
1675 		goto out_unlock;
1676 
1677 	ret = 0;
1678 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1679 		goto out_unlock;
1680 
1681 	/*
1682 	 * For kdump cases, old valid entries may be cached due to the
1683 	 * in-flight DMA and copied pgtable, but there is no unmapping
1684 	 * behaviour for them, thus we need an explicit cache flush for
1685 	 * the newly-mapped device. For kdump, at this point, the device
1686 	 * is supposed to finish reset at its driver probe stage, so no
1687 	 * in-flight DMA will exist, and we don't need to worry anymore
1688 	 * hereafter.
1689 	 */
1690 	if (context_copied(iommu, bus, devfn)) {
1691 		u16 did_old = context_domain_id(context);
1692 
1693 		if (did_old < cap_ndoms(iommu->cap)) {
1694 			iommu->flush.flush_context(iommu, did_old,
1695 						   (((u16)bus) << 8) | devfn,
1696 						   DMA_CCMD_MASK_NOBIT,
1697 						   DMA_CCMD_DEVICE_INVL);
1698 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1699 						 DMA_TLB_DSI_FLUSH);
1700 		}
1701 
1702 		clear_context_copied(iommu, bus, devfn);
1703 	}
1704 
1705 	context_clear_entry(context);
1706 	context_set_domain_id(context, did);
1707 
1708 	if (translation != CONTEXT_TT_PASS_THROUGH) {
1709 		/*
1710 		 * Skip top levels of page tables for iommu which has
1711 		 * less agaw than default. Unnecessary for PT mode.
1712 		 */
1713 		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1714 			ret = -ENOMEM;
1715 			pgd = phys_to_virt(dma_pte_addr(pgd));
1716 			if (!dma_pte_present(pgd))
1717 				goto out_unlock;
1718 		}
1719 
1720 		if (info && info->ats_supported)
1721 			translation = CONTEXT_TT_DEV_IOTLB;
1722 		else
1723 			translation = CONTEXT_TT_MULTI_LEVEL;
1724 
1725 		context_set_address_root(context, virt_to_phys(pgd));
1726 		context_set_address_width(context, agaw);
1727 	} else {
1728 		/*
1729 		 * In pass through mode, AW must be programmed to
1730 		 * indicate the largest AGAW value supported by
1731 		 * hardware. And ASR is ignored by hardware.
1732 		 */
1733 		context_set_address_width(context, iommu->msagaw);
1734 	}
1735 
1736 	context_set_translation_type(context, translation);
1737 	context_set_fault_enable(context);
1738 	context_set_present(context);
1739 	if (!ecap_coherent(iommu->ecap))
1740 		clflush_cache_range(context, sizeof(*context));
1741 
1742 	/*
1743 	 * It's a non-present to present mapping. If hardware doesn't cache
1744 	 * non-present entry we only need to flush the write-buffer. If the
1745 	 * _does_ cache non-present entries, then it does so in the special
1746 	 * domain #0, which we have to flush:
1747 	 */
1748 	if (cap_caching_mode(iommu->cap)) {
1749 		iommu->flush.flush_context(iommu, 0,
1750 					   (((u16)bus) << 8) | devfn,
1751 					   DMA_CCMD_MASK_NOBIT,
1752 					   DMA_CCMD_DEVICE_INVL);
1753 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1754 	} else {
1755 		iommu_flush_write_buffer(iommu);
1756 	}
1757 
1758 	ret = 0;
1759 
1760 out_unlock:
1761 	spin_unlock(&iommu->lock);
1762 
1763 	return ret;
1764 }
1765 
1766 static int domain_context_mapping_cb(struct pci_dev *pdev,
1767 				     u16 alias, void *opaque)
1768 {
1769 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1770 	struct intel_iommu *iommu = info->iommu;
1771 	struct dmar_domain *domain = opaque;
1772 
1773 	return domain_context_mapping_one(domain, iommu,
1774 					  PCI_BUS_NUM(alias), alias & 0xff);
1775 }
1776 
1777 static int
1778 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1779 {
1780 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1781 	struct intel_iommu *iommu = info->iommu;
1782 	u8 bus = info->bus, devfn = info->devfn;
1783 
1784 	if (!dev_is_pci(dev))
1785 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1786 
1787 	return pci_for_each_dma_alias(to_pci_dev(dev),
1788 				      domain_context_mapping_cb, domain);
1789 }
1790 
1791 /* Return largest possible superpage level for a given mapping */
1792 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1793 				   unsigned long phy_pfn, unsigned long pages)
1794 {
1795 	int support, level = 1;
1796 	unsigned long pfnmerge;
1797 
1798 	support = domain->iommu_superpage;
1799 
1800 	/* To use a large page, the virtual *and* physical addresses
1801 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1802 	   of them will mean we have to use smaller pages. So just
1803 	   merge them and check both at once. */
1804 	pfnmerge = iov_pfn | phy_pfn;
1805 
1806 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1807 		pages >>= VTD_STRIDE_SHIFT;
1808 		if (!pages)
1809 			break;
1810 		pfnmerge >>= VTD_STRIDE_SHIFT;
1811 		level++;
1812 		support--;
1813 	}
1814 	return level;
1815 }
1816 
1817 /*
1818  * Ensure that old small page tables are removed to make room for superpage(s).
1819  * We're going to add new large pages, so make sure we don't remove their parent
1820  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1821  */
1822 static void switch_to_super_page(struct dmar_domain *domain,
1823 				 unsigned long start_pfn,
1824 				 unsigned long end_pfn, int level)
1825 {
1826 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1827 	struct dma_pte *pte = NULL;
1828 
1829 	while (start_pfn <= end_pfn) {
1830 		if (!pte)
1831 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1832 					     GFP_ATOMIC);
1833 
1834 		if (dma_pte_present(pte)) {
1835 			dma_pte_free_pagetable(domain, start_pfn,
1836 					       start_pfn + lvl_pages - 1,
1837 					       level + 1);
1838 
1839 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1840 					      end_pfn << VTD_PAGE_SHIFT, 0);
1841 		}
1842 
1843 		pte++;
1844 		start_pfn += lvl_pages;
1845 		if (first_pte_in_page(pte))
1846 			pte = NULL;
1847 	}
1848 }
1849 
1850 static int
1851 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1852 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1853 		 gfp_t gfp)
1854 {
1855 	struct dma_pte *first_pte = NULL, *pte = NULL;
1856 	unsigned int largepage_lvl = 0;
1857 	unsigned long lvl_pages = 0;
1858 	phys_addr_t pteval;
1859 	u64 attr;
1860 
1861 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1862 		return -EINVAL;
1863 
1864 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1865 		return -EINVAL;
1866 
1867 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1868 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1869 		return -EINVAL;
1870 	}
1871 
1872 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1873 	attr |= DMA_FL_PTE_PRESENT;
1874 	if (domain->use_first_level) {
1875 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1876 		if (prot & DMA_PTE_WRITE)
1877 			attr |= DMA_FL_PTE_DIRTY;
1878 	}
1879 
1880 	domain->has_mappings = true;
1881 
1882 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1883 
1884 	while (nr_pages > 0) {
1885 		uint64_t tmp;
1886 
1887 		if (!pte) {
1888 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1889 					phys_pfn, nr_pages);
1890 
1891 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1892 					     gfp);
1893 			if (!pte)
1894 				return -ENOMEM;
1895 			first_pte = pte;
1896 
1897 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1898 
1899 			/* It is large page*/
1900 			if (largepage_lvl > 1) {
1901 				unsigned long end_pfn;
1902 				unsigned long pages_to_remove;
1903 
1904 				pteval |= DMA_PTE_LARGE_PAGE;
1905 				pages_to_remove = min_t(unsigned long, nr_pages,
1906 							nr_pte_to_next_page(pte) * lvl_pages);
1907 				end_pfn = iov_pfn + pages_to_remove - 1;
1908 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1909 			} else {
1910 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1911 			}
1912 
1913 		}
1914 		/* We don't need lock here, nobody else
1915 		 * touches the iova range
1916 		 */
1917 		tmp = 0ULL;
1918 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1919 			static int dumps = 5;
1920 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1921 				iov_pfn, tmp, (unsigned long long)pteval);
1922 			if (dumps) {
1923 				dumps--;
1924 				debug_dma_dump_mappings(NULL);
1925 			}
1926 			WARN_ON(1);
1927 		}
1928 
1929 		nr_pages -= lvl_pages;
1930 		iov_pfn += lvl_pages;
1931 		phys_pfn += lvl_pages;
1932 		pteval += lvl_pages * VTD_PAGE_SIZE;
1933 
1934 		/* If the next PTE would be the first in a new page, then we
1935 		 * need to flush the cache on the entries we've just written.
1936 		 * And then we'll need to recalculate 'pte', so clear it and
1937 		 * let it get set again in the if (!pte) block above.
1938 		 *
1939 		 * If we're done (!nr_pages) we need to flush the cache too.
1940 		 *
1941 		 * Also if we've been setting superpages, we may need to
1942 		 * recalculate 'pte' and switch back to smaller pages for the
1943 		 * end of the mapping, if the trailing size is not enough to
1944 		 * use another superpage (i.e. nr_pages < lvl_pages).
1945 		 */
1946 		pte++;
1947 		if (!nr_pages || first_pte_in_page(pte) ||
1948 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1949 			domain_flush_cache(domain, first_pte,
1950 					   (void *)pte - (void *)first_pte);
1951 			pte = NULL;
1952 		}
1953 	}
1954 
1955 	return 0;
1956 }
1957 
1958 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1959 {
1960 	struct intel_iommu *iommu = info->iommu;
1961 	struct context_entry *context;
1962 	u16 did_old;
1963 
1964 	spin_lock(&iommu->lock);
1965 	context = iommu_context_addr(iommu, bus, devfn, 0);
1966 	if (!context) {
1967 		spin_unlock(&iommu->lock);
1968 		return;
1969 	}
1970 
1971 	did_old = context_domain_id(context);
1972 
1973 	context_clear_entry(context);
1974 	__iommu_flush_cache(iommu, context, sizeof(*context));
1975 	spin_unlock(&iommu->lock);
1976 	iommu->flush.flush_context(iommu,
1977 				   did_old,
1978 				   (((u16)bus) << 8) | devfn,
1979 				   DMA_CCMD_MASK_NOBIT,
1980 				   DMA_CCMD_DEVICE_INVL);
1981 
1982 	iommu->flush.flush_iotlb(iommu,
1983 				 did_old,
1984 				 0,
1985 				 0,
1986 				 DMA_TLB_DSI_FLUSH);
1987 
1988 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
1989 }
1990 
1991 static int domain_setup_first_level(struct intel_iommu *iommu,
1992 				    struct dmar_domain *domain,
1993 				    struct device *dev,
1994 				    u32 pasid)
1995 {
1996 	struct dma_pte *pgd = domain->pgd;
1997 	int agaw, level;
1998 	int flags = 0;
1999 
2000 	/*
2001 	 * Skip top levels of page tables for iommu which has
2002 	 * less agaw than default. Unnecessary for PT mode.
2003 	 */
2004 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 		pgd = phys_to_virt(dma_pte_addr(pgd));
2006 		if (!dma_pte_present(pgd))
2007 			return -ENOMEM;
2008 	}
2009 
2010 	level = agaw_to_level(agaw);
2011 	if (level != 4 && level != 5)
2012 		return -EINVAL;
2013 
2014 	if (level == 5)
2015 		flags |= PASID_FLAG_FL5LP;
2016 
2017 	if (domain->force_snooping)
2018 		flags |= PASID_FLAG_PAGE_SNOOP;
2019 
2020 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2021 					     domain_id_iommu(domain, iommu),
2022 					     flags);
2023 }
2024 
2025 static bool dev_is_real_dma_subdevice(struct device *dev)
2026 {
2027 	return dev && dev_is_pci(dev) &&
2028 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2029 }
2030 
2031 static int iommu_domain_identity_map(struct dmar_domain *domain,
2032 				     unsigned long first_vpfn,
2033 				     unsigned long last_vpfn)
2034 {
2035 	/*
2036 	 * RMRR range might have overlap with physical memory range,
2037 	 * clear it first
2038 	 */
2039 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2040 
2041 	return __domain_mapping(domain, first_vpfn,
2042 				first_vpfn, last_vpfn - first_vpfn + 1,
2043 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2044 }
2045 
2046 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2047 
2048 static int __init si_domain_init(int hw)
2049 {
2050 	struct dmar_rmrr_unit *rmrr;
2051 	struct device *dev;
2052 	int i, nid, ret;
2053 
2054 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2055 	if (!si_domain)
2056 		return -EFAULT;
2057 
2058 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2059 		domain_exit(si_domain);
2060 		si_domain = NULL;
2061 		return -EFAULT;
2062 	}
2063 
2064 	if (hw)
2065 		return 0;
2066 
2067 	for_each_online_node(nid) {
2068 		unsigned long start_pfn, end_pfn;
2069 		int i;
2070 
2071 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2072 			ret = iommu_domain_identity_map(si_domain,
2073 					mm_to_dma_pfn_start(start_pfn),
2074 					mm_to_dma_pfn_end(end_pfn));
2075 			if (ret)
2076 				return ret;
2077 		}
2078 	}
2079 
2080 	/*
2081 	 * Identity map the RMRRs so that devices with RMRRs could also use
2082 	 * the si_domain.
2083 	 */
2084 	for_each_rmrr_units(rmrr) {
2085 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2086 					  i, dev) {
2087 			unsigned long long start = rmrr->base_address;
2088 			unsigned long long end = rmrr->end_address;
2089 
2090 			if (WARN_ON(end < start ||
2091 				    end >> agaw_to_width(si_domain->agaw)))
2092 				continue;
2093 
2094 			ret = iommu_domain_identity_map(si_domain,
2095 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2096 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2097 			if (ret)
2098 				return ret;
2099 		}
2100 	}
2101 
2102 	return 0;
2103 }
2104 
2105 static int dmar_domain_attach_device(struct dmar_domain *domain,
2106 				     struct device *dev)
2107 {
2108 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2109 	struct intel_iommu *iommu = info->iommu;
2110 	unsigned long flags;
2111 	int ret;
2112 
2113 	ret = domain_attach_iommu(domain, iommu);
2114 	if (ret)
2115 		return ret;
2116 
2117 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
2118 	if (ret) {
2119 		domain_detach_iommu(domain, iommu);
2120 		return ret;
2121 	}
2122 
2123 	info->domain = domain;
2124 	spin_lock_irqsave(&domain->lock, flags);
2125 	list_add(&info->link, &domain->devices);
2126 	spin_unlock_irqrestore(&domain->lock, flags);
2127 
2128 	if (dev_is_real_dma_subdevice(dev))
2129 		return 0;
2130 
2131 	if (!sm_supported(iommu))
2132 		ret = domain_context_mapping(domain, dev);
2133 	else if (hw_pass_through && domain_type_is_si(domain))
2134 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2135 	else if (domain->use_first_level)
2136 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2137 	else
2138 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2139 
2140 	if (ret) {
2141 		device_block_translation(dev);
2142 		return ret;
2143 	}
2144 
2145 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2146 		iommu_enable_pci_caps(info);
2147 
2148 	return 0;
2149 }
2150 
2151 /**
2152  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2153  * is relaxable (ie. is allowed to be not enforced under some conditions)
2154  * @dev: device handle
2155  *
2156  * We assume that PCI USB devices with RMRRs have them largely
2157  * for historical reasons and that the RMRR space is not actively used post
2158  * boot.  This exclusion may change if vendors begin to abuse it.
2159  *
2160  * The same exception is made for graphics devices, with the requirement that
2161  * any use of the RMRR regions will be torn down before assigning the device
2162  * to a guest.
2163  *
2164  * Return: true if the RMRR is relaxable, false otherwise
2165  */
2166 static bool device_rmrr_is_relaxable(struct device *dev)
2167 {
2168 	struct pci_dev *pdev;
2169 
2170 	if (!dev_is_pci(dev))
2171 		return false;
2172 
2173 	pdev = to_pci_dev(dev);
2174 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2175 		return true;
2176 	else
2177 		return false;
2178 }
2179 
2180 /*
2181  * Return the required default domain type for a specific device.
2182  *
2183  * @dev: the device in query
2184  * @startup: true if this is during early boot
2185  *
2186  * Returns:
2187  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2188  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2189  *  - 0: both identity and dynamic domains work for this device
2190  */
2191 static int device_def_domain_type(struct device *dev)
2192 {
2193 	if (dev_is_pci(dev)) {
2194 		struct pci_dev *pdev = to_pci_dev(dev);
2195 
2196 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2197 			return IOMMU_DOMAIN_IDENTITY;
2198 	}
2199 
2200 	return 0;
2201 }
2202 
2203 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2204 {
2205 	/*
2206 	 * Start from the sane iommu hardware state.
2207 	 * If the queued invalidation is already initialized by us
2208 	 * (for example, while enabling interrupt-remapping) then
2209 	 * we got the things already rolling from a sane state.
2210 	 */
2211 	if (!iommu->qi) {
2212 		/*
2213 		 * Clear any previous faults.
2214 		 */
2215 		dmar_fault(-1, iommu);
2216 		/*
2217 		 * Disable queued invalidation if supported and already enabled
2218 		 * before OS handover.
2219 		 */
2220 		dmar_disable_qi(iommu);
2221 	}
2222 
2223 	if (dmar_enable_qi(iommu)) {
2224 		/*
2225 		 * Queued Invalidate not enabled, use Register Based Invalidate
2226 		 */
2227 		iommu->flush.flush_context = __iommu_flush_context;
2228 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2229 		pr_info("%s: Using Register based invalidation\n",
2230 			iommu->name);
2231 	} else {
2232 		iommu->flush.flush_context = qi_flush_context;
2233 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2234 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2235 	}
2236 }
2237 
/*
 * Copy the context table(s) referenced by one old root entry into newly
 * allocated pages.
 *
 * @iommu:  IOMMU the tables belong to
 * @old_re: root entry of the old root table for @bus
 * @tbl:    array receiving the newly allocated context tables; with @ext
 *          set, each bus uses two consecutive slots starting at bus * 2,
 *          otherwise one slot at index bus
 * @bus:    bus number being copied
 * @ext:    true when a bus has a lower and an upper context table and each
 *          context entry occupies two struct context_entry slots
 *
 * For every present context entry the domain id is marked as used in
 * iommu->domain_ids (when within cap_ndoms()) and the bus/devfn pair is
 * recorded with set_context_copied().
 *
 * Return: 0 on success (including "no context table present"), -ENOMEM if
 * mapping the old table or allocating the new one fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	/* Work on a local copy of the old root entry. */
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/*
		 * idx wraps to 0 at devfn 0 and, when ext is set, again at
		 * devfn 0x80: a new source table starts here.
		 */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			/* devfn below 0x80 uses the lower pointer, the rest the upper. */
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					/* The loop increment moves devfn on to 0x80. */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/* Reserve the domain id used by the old entry. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	/*
	 * Store the last table that was built.
	 * NOTE(review): pos only becomes 1 after a lower table has been
	 * saved above; when ext is set and only the upper table exists, the
	 * upper table appears to land in the lower slot - confirm intended.
	 */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2321 
/*
 * Copy the translation structures that the hardware is currently using
 * (as found through DMAR_RTADDR_REG) into this kernel's root entry table.
 *
 * Each bus is copied with copy_context_table(); a failure for one bus is
 * logged and the remaining buses are still processed. The collected
 * context tables are then written into iommu->root_entry under iommu->lock.
 *
 * Return: 0 on success, -EINVAL if the root table format would have to
 * change or no old root table address is set, -ENOMEM on allocation or
 * mapping failure.
 */
static int copy_translation_tables(struct intel_iommu *iommu)
{
	struct context_entry **ctxt_tbls;
	struct root_entry *old_rt;
	phys_addr_t old_rt_phys;
	int ctxt_table_entries;
	u64 rtaddr_reg;
	int bus, ret;
	bool new_ext, ext;

	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
	/* ext: format currently programmed; new_ext: format this kernel would use. */
	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
	new_ext    = !!sm_supported(iommu);

	/*
	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation means to open a window for data
	 * corruption. So bail out and don't copy anything if we would
	 * have to change the bit.
	 */
	if (new_ext != ext)
		return -EINVAL;

	/* One bit per entry of a 16-bit index space. */
	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
	if (!iommu->copied_tables)
		return -ENOMEM;

	/*
	 * NOTE(review): iommu->copied_tables is not released on the error
	 * returns below - presumably freed by the IOMMU teardown code;
	 * confirm.
	 */
	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
	if (!old_rt_phys)
		return -EINVAL;

	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
	if (!old_rt)
		return -ENOMEM;

	/* This is too big for the stack - allocate it from slab */
	ctxt_table_entries = ext ? 512 : 256;
	ret = -ENOMEM;
	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
	if (!ctxt_tbls)
		goto out_unmap;

	for (bus = 0; bus < 256; bus++) {
		ret = copy_context_table(iommu, &old_rt[bus],
					 ctxt_tbls, bus, ext);
		if (ret) {
			/* Best effort: report the failure and go on with the next bus. */
			pr_err("%s: Failed to copy context table for bus %d\n",
				iommu->name, bus);
			continue;
		}
	}

	spin_lock(&iommu->lock);

	/* Context tables are copied, now write them to the root_entry table */
	for (bus = 0; bus < 256; bus++) {
		int idx = ext ? bus * 2 : bus;
		u64 val;

		/* Bit 0 is set together with the table address in each half. */
		if (ctxt_tbls[idx]) {
			val = virt_to_phys(ctxt_tbls[idx]) | 1;
			iommu->root_entry[bus].lo = val;
		}

		/* The upper half is only used when ext is set. */
		if (!ext || !ctxt_tbls[idx + 1])
			continue;

		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
		iommu->root_entry[bus].hi = val;
	}

	spin_unlock(&iommu->lock);

	/* Only the pointer array is freed; the tables themselves stay in use. */
	kfree(ctxt_tbls);

	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);

	/* Per-bus copy failures above do not fail the whole operation. */
	ret = 0;

out_unmap:
	memunmap(old_rt);

	return ret;
}
2406 
/*
 * Boot-time initialisation of all DMAR units.
 *
 * First pass over all IOMMUs: set up invalidation, domain id tracking and
 * the root entry table, and either copy the translation tables left behind
 * by a previous kernel (kdump case) or disable pre-enabled translation.
 * Then the root entries are programmed, si_domain is created, and a final
 * pass enables page request queues (when built with SVM support) and fault
 * interrupts.
 *
 * On any failure all active IOMMUs are disabled and freed and si_domain is
 * released.
 *
 * Return: 0 on success or a negative error code.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Pre-enabled translation is only kept for a kdump kernel. */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* One unit without pass-through support turns it off globally. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			/* The lock is therefore dropped around the PRQ setup. */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Undo everything: tear down each active IOMMU and the identity domain. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2556 
/*
 * Mark DRHD units that need no remapping as ignored.
 *
 * A unit that is not include_all and has no active device in its scope is
 * ignored. A unit whose scope contains only PCI graphics devices is flagged
 * gfx_dedicated and is additionally ignored when disable_igfx_iommu is set.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			/*
			 * Stop at the first active device; i is then compared
			 * with devices_cnt to tell whether one was found.
			 */
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Stop at the first active device that is not a PCI graphics device. */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		/* The scan stopped early: a non-graphics device is present. */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		drhd->gfx_dedicated = 1;
		if (disable_igfx_iommu)
			drhd->ignored = 1;
	}
}
2592 
2593 #ifdef CONFIG_SUSPEND
2594 static int init_iommu_hw(void)
2595 {
2596 	struct dmar_drhd_unit *drhd;
2597 	struct intel_iommu *iommu = NULL;
2598 	int ret;
2599 
2600 	for_each_active_iommu(iommu, drhd) {
2601 		if (iommu->qi) {
2602 			ret = dmar_reenable_qi(iommu);
2603 			if (ret)
2604 				return ret;
2605 		}
2606 	}
2607 
2608 	for_each_iommu(iommu, drhd) {
2609 		if (drhd->ignored) {
2610 			/*
2611 			 * we always have to disable PMRs or DMA may fail on
2612 			 * this device
2613 			 */
2614 			if (force_on)
2615 				iommu_disable_protect_mem_regions(iommu);
2616 			continue;
2617 		}
2618 
2619 		iommu_flush_write_buffer(iommu);
2620 		iommu_set_root_entry(iommu);
2621 		iommu_enable_translation(iommu);
2622 		iommu_disable_protect_mem_regions(iommu);
2623 	}
2624 
2625 	return 0;
2626 }
2627 
/*
 * Issue a global context-cache invalidation followed by a global IOTLB
 * flush on every active IOMMU, through the callbacks installed in
 * iommu->flush.
 */
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
2640 
/*
 * Syscore suspend callback: flush all caches, then for every active IOMMU
 * disable translation and save the DMAR_FECTL, DMAR_FEDATA, DMAR_FEADDR
 * and DMAR_FEUADDR registers into iommu->iommu_state.
 *
 * Return: always 0.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		/* The register snapshot is taken under register_lock. */
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;
}
2667 
/*
 * Syscore resume callback: re-initialise the IOMMU hardware and write the
 * registers saved by iommu_suspend() back to every active IOMMU.
 *
 * If the hardware cannot be re-initialised, the kernel panics when
 * force_on is set and otherwise only warns; in both cases nothing is
 * restored.
 */
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		/* The saved registers are written back under register_lock. */
		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
}
2698 
/* Syscore callbacks that save and restore IOMMU state across suspend/resume. */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
2703 
/* Register the IOMMU suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2708 
2709 #else
/* Stub for builds without CONFIG_SUSPEND: no syscore callbacks are registered. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2712 
2713 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2714 {
2715 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2716 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2717 	    rmrr->end_address <= rmrr->base_address ||
2718 	    arch_rmrr_sanity_check(rmrr))
2719 		return -EINVAL;
2720 
2721 	return 0;
2722 }
2723 
2724 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2725 {
2726 	struct acpi_dmar_reserved_memory *rmrr;
2727 	struct dmar_rmrr_unit *rmrru;
2728 
2729 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2730 	if (rmrr_sanity_check(rmrr)) {
2731 		pr_warn(FW_BUG
2732 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2733 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2734 			   rmrr->base_address, rmrr->end_address,
2735 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2736 			   dmi_get_system_info(DMI_BIOS_VERSION),
2737 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2738 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2739 	}
2740 
2741 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2742 	if (!rmrru)
2743 		goto out;
2744 
2745 	rmrru->hdr = header;
2746 
2747 	rmrru->base_address = rmrr->base_address;
2748 	rmrru->end_address = rmrr->end_address;
2749 
2750 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2751 				((void *)rmrr) + rmrr->header.length,
2752 				&rmrru->devices_cnt);
2753 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2754 		goto free_rmrru;
2755 
2756 	list_add(&rmrru->list, &dmar_rmrr_units);
2757 
2758 	return 0;
2759 free_rmrru:
2760 	kfree(rmrru);
2761 out:
2762 	return -ENOMEM;
2763 }
2764 
2765 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2766 {
2767 	struct dmar_atsr_unit *atsru;
2768 	struct acpi_dmar_atsr *tmp;
2769 
2770 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2771 				dmar_rcu_check()) {
2772 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2773 		if (atsr->segment != tmp->segment)
2774 			continue;
2775 		if (atsr->header.length != tmp->header.length)
2776 			continue;
2777 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2778 			return atsru;
2779 	}
2780 
2781 	return NULL;
2782 }
2783 
2784 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2785 {
2786 	struct acpi_dmar_atsr *atsr;
2787 	struct dmar_atsr_unit *atsru;
2788 
2789 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2790 		return 0;
2791 
2792 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2793 	atsru = dmar_find_atsr(atsr);
2794 	if (atsru)
2795 		return 0;
2796 
2797 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2798 	if (!atsru)
2799 		return -ENOMEM;
2800 
2801 	/*
2802 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2803 	 * copy the memory content because the memory buffer will be freed
2804 	 * on return.
2805 	 */
2806 	atsru->hdr = (void *)(atsru + 1);
2807 	memcpy(atsru->hdr, hdr, hdr->length);
2808 	atsru->include_all = atsr->flags & 0x1;
2809 	if (!atsru->include_all) {
2810 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2811 				(void *)atsr + atsr->header.length,
2812 				&atsru->devices_cnt);
2813 		if (atsru->devices_cnt && atsru->devices == NULL) {
2814 			kfree(atsru);
2815 			return -ENOMEM;
2816 		}
2817 	}
2818 
2819 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2820 
2821 	return 0;
2822 }
2823 
/* Release the device scope array of @atsru and then the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
2829 
2830 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2831 {
2832 	struct acpi_dmar_atsr *atsr;
2833 	struct dmar_atsr_unit *atsru;
2834 
2835 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2836 	atsru = dmar_find_atsr(atsr);
2837 	if (atsru) {
2838 		list_del_rcu(&atsru->list);
2839 		synchronize_rcu();
2840 		intel_iommu_free_atsr(atsru);
2841 	}
2842 
2843 	return 0;
2844 }
2845 
2846 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2847 {
2848 	int i;
2849 	struct device *dev;
2850 	struct acpi_dmar_atsr *atsr;
2851 	struct dmar_atsr_unit *atsru;
2852 
2853 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2854 	atsru = dmar_find_atsr(atsr);
2855 	if (!atsru)
2856 		return 0;
2857 
2858 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2859 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2860 					  i, dev)
2861 			return -EBUSY;
2862 	}
2863 
2864 	return 0;
2865 }
2866 
2867 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2868 {
2869 	struct dmar_satc_unit *satcu;
2870 	struct acpi_dmar_satc *tmp;
2871 
2872 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2873 				dmar_rcu_check()) {
2874 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2875 		if (satc->segment != tmp->segment)
2876 			continue;
2877 		if (satc->header.length != tmp->header.length)
2878 			continue;
2879 		if (memcmp(satc, tmp, satc->header.length) == 0)
2880 			return satcu;
2881 	}
2882 
2883 	return NULL;
2884 }
2885 
2886 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2887 {
2888 	struct acpi_dmar_satc *satc;
2889 	struct dmar_satc_unit *satcu;
2890 
2891 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2892 		return 0;
2893 
2894 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2895 	satcu = dmar_find_satc(satc);
2896 	if (satcu)
2897 		return 0;
2898 
2899 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2900 	if (!satcu)
2901 		return -ENOMEM;
2902 
2903 	satcu->hdr = (void *)(satcu + 1);
2904 	memcpy(satcu->hdr, hdr, hdr->length);
2905 	satcu->atc_required = satc->flags & 0x1;
2906 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2907 					      (void *)satc + satc->header.length,
2908 					      &satcu->devices_cnt);
2909 	if (satcu->devices_cnt && !satcu->devices) {
2910 		kfree(satcu);
2911 		return -ENOMEM;
2912 	}
2913 	list_add_rcu(&satcu->list, &dmar_satc_units);
2914 
2915 	return 0;
2916 }
2917 
/*
 * Bring a hot-added DMAR unit into service.
 *
 * The unit's capabilities are audited and checked against the pass-through
 * and superpage features already in use. Its domain id tracking and root
 * entry table are then set up. Unless the unit is ignored, invalidation,
 * the page request queue (when built with SVM support) and the fault
 * interrupt are initialised before translation is enabled.
 *
 * Return: 0 on success, -ENXIO if the unit lacks a required feature, or
 * another negative error code from the setup steps.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}

	/* sp >= 0 only when domain_update_iommu_superpage() returned non-zero. */
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	/* The root entry is only allocated if domain init succeeded. */
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
2989 
2990 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2991 {
2992 	int ret = 0;
2993 	struct intel_iommu *iommu = dmaru->iommu;
2994 
2995 	if (!intel_iommu_enabled)
2996 		return 0;
2997 	if (iommu == NULL)
2998 		return -EINVAL;
2999 
3000 	if (insert) {
3001 		ret = intel_iommu_add(dmaru);
3002 	} else {
3003 		disable_dmar_iommu(iommu);
3004 		free_dmar_iommu(iommu);
3005 	}
3006 
3007 	return ret;
3008 }
3009 
3010 static void intel_iommu_free_dmars(void)
3011 {
3012 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3013 	struct dmar_atsr_unit *atsru, *atsr_n;
3014 	struct dmar_satc_unit *satcu, *satc_n;
3015 
3016 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3017 		list_del(&rmrru->list);
3018 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3019 		kfree(rmrru);
3020 	}
3021 
3022 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3023 		list_del(&atsru->list);
3024 		intel_iommu_free_atsr(atsru);
3025 	}
3026 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3027 		list_del(&satcu->list);
3028 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3029 		kfree(satcu);
3030 	}
3031 }
3032 
3033 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3034 {
3035 	struct dmar_satc_unit *satcu;
3036 	struct acpi_dmar_satc *satc;
3037 	struct device *tmp;
3038 	int i;
3039 
3040 	dev = pci_physfn(dev);
3041 	rcu_read_lock();
3042 
3043 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3044 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3045 		if (satc->segment != pci_domain_nr(dev->bus))
3046 			continue;
3047 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3048 			if (to_pci_dev(tmp) == dev)
3049 				goto out;
3050 	}
3051 	satcu = NULL;
3052 out:
3053 	rcu_read_unlock();
3054 	return satcu;
3055 }
3056 
3057 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3058 {
3059 	int i, ret = 1;
3060 	struct pci_bus *bus;
3061 	struct pci_dev *bridge = NULL;
3062 	struct device *tmp;
3063 	struct acpi_dmar_atsr *atsr;
3064 	struct dmar_atsr_unit *atsru;
3065 	struct dmar_satc_unit *satcu;
3066 
3067 	dev = pci_physfn(dev);
3068 	satcu = dmar_find_matched_satc_unit(dev);
3069 	if (satcu)
3070 		/*
3071 		 * This device supports ATS as it is in SATC table.
3072 		 * When IOMMU is in legacy mode, enabling ATS is done
3073 		 * automatically by HW for the device that requires
3074 		 * ATS, hence OS should not enable this device ATS
3075 		 * to avoid duplicated TLB invalidation.
3076 		 */
3077 		return !(satcu->atc_required && !sm_supported(iommu));
3078 
3079 	for (bus = dev->bus; bus; bus = bus->parent) {
3080 		bridge = bus->self;
3081 		/* If it's an integrated device, allow ATS */
3082 		if (!bridge)
3083 			return 1;
3084 		/* Connected via non-PCIe: no ATS */
3085 		if (!pci_is_pcie(bridge) ||
3086 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3087 			return 0;
3088 		/* If we found the root port, look it up in the ATSR */
3089 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3090 			break;
3091 	}
3092 
3093 	rcu_read_lock();
3094 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3095 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3096 		if (atsr->segment != pci_domain_nr(dev->bus))
3097 			continue;
3098 
3099 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3100 			if (tmp == &bridge->dev)
3101 				goto out;
3102 
3103 		if (atsru->include_all)
3104 			goto out;
3105 	}
3106 	ret = 0;
3107 out:
3108 	rcu_read_unlock();
3109 
3110 	return ret;
3111 }
3112 
/*
 * Update the device scope arrays of all RMRR, ATSR and SATC units for a PCI
 * device that is being added (BUS_NOTIFY_ADD_DEVICE) or has been removed
 * (BUS_NOTIFY_REMOVED_DEVICE).
 *
 * Nothing is done once the system is running with the Intel IOMMU disabled.
 *
 * Return: 0 on success, or the negative error returned by
 * dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;
	struct acpi_dmar_satc *satc;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	/* RMRR units: every unit is visited, with no early stop on a match. */
	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	/* ATSR units: include_all units are skipped; stop at the first match. */
	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}
	/* SATC units: stop at the first match. */
	list_for_each_entry(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
					(void *)satc + satc->header.length,
					satc->segment, satcu->devices,
					satcu->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, satc->segment,
					satcu->devices, satcu->devices_cnt))
				break;
		}
	}

	return 0;
}
3182 
3183 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3184 				       unsigned long val, void *v)
3185 {
3186 	struct memory_notify *mhp = v;
3187 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3188 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3189 			mhp->nr_pages - 1);
3190 
3191 	switch (val) {
3192 	case MEM_GOING_ONLINE:
3193 		if (iommu_domain_identity_map(si_domain,
3194 					      start_vpfn, last_vpfn)) {
3195 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3196 				start_vpfn, last_vpfn);
3197 			return NOTIFY_BAD;
3198 		}
3199 		break;
3200 
3201 	case MEM_OFFLINE:
3202 	case MEM_CANCEL_ONLINE:
3203 		{
3204 			LIST_HEAD(freelist);
3205 
3206 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3207 			iommu_put_pages_list(&freelist);
3208 		}
3209 		break;
3210 	}
3211 
3212 	return NOTIFY_OK;
3213 }
3214 
3215 static struct notifier_block intel_iommu_memory_nb = {
3216 	.notifier_call = intel_iommu_memory_notifier,
3217 	.priority = 0
3218 };
3219 
3220 static void intel_disable_iommus(void)
3221 {
3222 	struct intel_iommu *iommu = NULL;
3223 	struct dmar_drhd_unit *drhd;
3224 
3225 	for_each_iommu(iommu, drhd)
3226 		iommu_disable_translation(iommu);
3227 }
3228 
3229 void intel_iommu_shutdown(void)
3230 {
3231 	struct dmar_drhd_unit *drhd;
3232 	struct intel_iommu *iommu = NULL;
3233 
3234 	if (no_iommu || dmar_disabled)
3235 		return;
3236 
3237 	down_write(&dmar_global_lock);
3238 
3239 	/* Disable PMRs explicitly here. */
3240 	for_each_iommu(iommu, drhd)
3241 		iommu_disable_protect_mem_regions(iommu);
3242 
3243 	/* Make sure the IOMMUs are switched off */
3244 	intel_disable_iommus();
3245 
3246 	up_write(&dmar_global_lock);
3247 }
3248 
3249 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3250 {
3251 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3252 
3253 	return container_of(iommu_dev, struct intel_iommu, iommu);
3254 }
3255 
3256 static ssize_t version_show(struct device *dev,
3257 			    struct device_attribute *attr, char *buf)
3258 {
3259 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3260 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3261 	return sysfs_emit(buf, "%d:%d\n",
3262 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3263 }
3264 static DEVICE_ATTR_RO(version);
3265 
3266 static ssize_t address_show(struct device *dev,
3267 			    struct device_attribute *attr, char *buf)
3268 {
3269 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3270 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3271 }
3272 static DEVICE_ATTR_RO(address);
3273 
3274 static ssize_t cap_show(struct device *dev,
3275 			struct device_attribute *attr, char *buf)
3276 {
3277 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3278 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3279 }
3280 static DEVICE_ATTR_RO(cap);
3281 
3282 static ssize_t ecap_show(struct device *dev,
3283 			 struct device_attribute *attr, char *buf)
3284 {
3285 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3286 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3287 }
3288 static DEVICE_ATTR_RO(ecap);
3289 
3290 static ssize_t domains_supported_show(struct device *dev,
3291 				      struct device_attribute *attr, char *buf)
3292 {
3293 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3294 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3295 }
3296 static DEVICE_ATTR_RO(domains_supported);
3297 
3298 static ssize_t domains_used_show(struct device *dev,
3299 				 struct device_attribute *attr, char *buf)
3300 {
3301 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3302 	return sysfs_emit(buf, "%d\n",
3303 			  bitmap_weight(iommu->domain_ids,
3304 					cap_ndoms(iommu->cap)));
3305 }
3306 static DEVICE_ATTR_RO(domains_used);
3307 
/*
 * NULL-terminated list of the read-only sysfs attributes defined above;
 * referenced by intel_iommu_group.
 */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3317 
3318 static struct attribute_group intel_iommu_group = {
3319 	.name = "intel-iommu",
3320 	.attrs = intel_iommu_attrs,
3321 };
3322 
/*
 * NULL-terminated group list handed to iommu_device_sysfs_add() when each
 * IOMMU is registered in intel_iommu_init().
 */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3327 
3328 static bool has_external_pci(void)
3329 {
3330 	struct pci_dev *pdev = NULL;
3331 
3332 	for_each_pci_dev(pdev)
3333 		if (pdev->external_facing) {
3334 			pci_dev_put(pdev);
3335 			return true;
3336 		}
3337 
3338 	return false;
3339 }
3340 
3341 static int __init platform_optin_force_iommu(void)
3342 {
3343 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3344 		return 0;
3345 
3346 	if (no_iommu || dmar_disabled)
3347 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3348 
3349 	/*
3350 	 * If Intel-IOMMU is disabled by default, we will apply identity
3351 	 * map for all devices except those marked as being untrusted.
3352 	 */
3353 	if (dmar_disabled)
3354 		iommu_set_default_passthrough(false);
3355 
3356 	dmar_disabled = 0;
3357 	no_iommu = 0;
3358 
3359 	return 1;
3360 }
3361 
3362 static int __init probe_acpi_namespace_devices(void)
3363 {
3364 	struct dmar_drhd_unit *drhd;
3365 	/* To avoid a -Wunused-but-set-variable warning. */
3366 	struct intel_iommu *iommu __maybe_unused;
3367 	struct device *dev;
3368 	int i, ret = 0;
3369 
3370 	for_each_active_iommu(iommu, drhd) {
3371 		for_each_active_dev_scope(drhd->devices,
3372 					  drhd->devices_cnt, i, dev) {
3373 			struct acpi_device_physical_node *pn;
3374 			struct acpi_device *adev;
3375 
3376 			if (dev->bus != &acpi_bus_type)
3377 				continue;
3378 
3379 			adev = to_acpi_device(dev);
3380 			mutex_lock(&adev->physical_node_lock);
3381 			list_for_each_entry(pn,
3382 					    &adev->physical_node_list, node) {
3383 				ret = iommu_probe_device(pn->dev);
3384 				if (ret)
3385 					break;
3386 			}
3387 			mutex_unlock(&adev->physical_node_lock);
3388 
3389 			if (ret)
3390 				return ret;
3391 		}
3392 	}
3393 
3394 	return 0;
3395 }
3396 
3397 static __init int tboot_force_iommu(void)
3398 {
3399 	if (!tboot_enabled())
3400 		return 0;
3401 
3402 	if (no_iommu || dmar_disabled)
3403 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3404 
3405 	dmar_disabled = 0;
3406 	no_iommu = 0;
3407 
3408 	return 1;
3409 }
3410 
/*
 * Initialise the Intel IOMMU driver.
 *
 * Sequence, as implemented below:
 *  - decide whether the IOMMU must be forced on (tboot or platform opt-in);
 *  - parse the DMAR table and device scopes under dmar_global_lock;
 *  - register the bus notifier with the lock dropped;
 *  - if the IOMMU is disabled, switch the hardware off and bail out;
 *  - otherwise run init_dmars(), register each active IOMMU with sysfs,
 *    the IOMMU core and the PMU code, optionally register the memory
 *    hotplug notifier, probe ACPI namespace devices and finally enable
 *    translation.
 *
 * When force_on is set, a failure of table, scope or DMAR initialisation
 * panics instead of returning.
 *
 * Returns 0 on success. On failure returns -ENODEV (the initial value of
 * ret) or the error from init_dmars(), after intel_iommu_free_dmars().
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		/* ret still holds its initial -ENODEV on this path. */
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* From here on the lock is only taken for reading. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		/* NOTE(review): the return values of the two calls below are not checked. */
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}
	up_read(&dmar_global_lock);

	/* The hotplug notifier acts on si_domain; skip it with HW pass-through. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	/* A probe failure here is only warned about, not treated as fatal. */
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

	/* Every jump here arrives with dmar_global_lock held for writing. */
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3545 
3546 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3547 {
3548 	struct device_domain_info *info = opaque;
3549 
3550 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3551 	return 0;
3552 }
3553 
3554 /*
3555  * NB - intel-iommu lacks any sort of reference counting for the users of
3556  * dependent devices.  If multiple endpoints have intersecting dependent
3557  * devices, unbinding the driver from any one of them will possibly leave
3558  * the others unable to operate.
3559  */
3560 static void domain_context_clear(struct device_domain_info *info)
3561 {
3562 	if (!dev_is_pci(info->dev))
3563 		domain_context_clear_one(info, info->bus, info->devfn);
3564 
3565 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3566 			       &domain_context_clear_one_cb, info);
3567 }
3568 
3569 /*
3570  * Clear the page table pointer in context or pasid table entries so that
3571  * all DMA requests without PASID from the device are blocked. If the page
3572  * table has been set, clean up the data structures.
3573  */
3574 void device_block_translation(struct device *dev)
3575 {
3576 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3577 	struct intel_iommu *iommu = info->iommu;
3578 	unsigned long flags;
3579 
3580 	iommu_disable_pci_caps(info);
3581 	if (!dev_is_real_dma_subdevice(dev)) {
3582 		if (sm_supported(iommu))
3583 			intel_pasid_tear_down_entry(iommu, dev,
3584 						    IOMMU_NO_PASID, false);
3585 		else
3586 			domain_context_clear(info);
3587 	}
3588 
3589 	if (!info->domain)
3590 		return;
3591 
3592 	spin_lock_irqsave(&info->domain->lock, flags);
3593 	list_del(&info->link);
3594 	spin_unlock_irqrestore(&info->domain->lock, flags);
3595 
3596 	cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3597 	domain_detach_iommu(info->domain, iommu);
3598 	info->domain = NULL;
3599 }
3600 
3601 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3602 {
3603 	int adjust_width;
3604 
3605 	/* calculate AGAW */
3606 	domain->gaw = guest_width;
3607 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3608 	domain->agaw = width_to_agaw(adjust_width);
3609 
3610 	domain->iommu_coherency = false;
3611 	domain->iommu_superpage = 0;
3612 	domain->max_addr = 0;
3613 
3614 	/* always allocate the top pgd */
3615 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3616 	if (!domain->pgd)
3617 		return -ENOMEM;
3618 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3619 	return 0;
3620 }
3621 
/*
 * attach_dev op of the blocking domain: attaching simply blocks
 * translation for @dev. @domain is unused. Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
3628 
3629 static struct iommu_domain blocking_domain = {
3630 	.type = IOMMU_DOMAIN_BLOCKED,
3631 	.ops = &(const struct iommu_domain_ops) {
3632 		.attach_dev	= blocking_domain_attach_dev,
3633 	}
3634 };
3635 
3636 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3637 {
3638 	struct dmar_domain *dmar_domain;
3639 	struct iommu_domain *domain;
3640 
3641 	switch (type) {
3642 	case IOMMU_DOMAIN_DMA:
3643 	case IOMMU_DOMAIN_UNMANAGED:
3644 		dmar_domain = alloc_domain(type);
3645 		if (!dmar_domain) {
3646 			pr_err("Can't allocate dmar_domain\n");
3647 			return NULL;
3648 		}
3649 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3650 			pr_err("Domain initialization failed\n");
3651 			domain_exit(dmar_domain);
3652 			return NULL;
3653 		}
3654 
3655 		domain = &dmar_domain->domain;
3656 		domain->geometry.aperture_start = 0;
3657 		domain->geometry.aperture_end   =
3658 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3659 		domain->geometry.force_aperture = true;
3660 
3661 		return domain;
3662 	case IOMMU_DOMAIN_IDENTITY:
3663 		return &si_domain->domain;
3664 	default:
3665 		return NULL;
3666 	}
3667 
3668 	return NULL;
3669 }
3670 
3671 static struct iommu_domain *
3672 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3673 			      struct iommu_domain *parent,
3674 			      const struct iommu_user_data *user_data)
3675 {
3676 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3677 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3678 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3679 	struct intel_iommu *iommu = info->iommu;
3680 	struct dmar_domain *dmar_domain;
3681 	struct iommu_domain *domain;
3682 
3683 	/* Must be NESTING domain */
3684 	if (parent) {
3685 		if (!nested_supported(iommu) || flags)
3686 			return ERR_PTR(-EOPNOTSUPP);
3687 		return intel_nested_domain_alloc(parent, user_data);
3688 	}
3689 
3690 	if (flags &
3691 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3692 		return ERR_PTR(-EOPNOTSUPP);
3693 	if (nested_parent && !nested_supported(iommu))
3694 		return ERR_PTR(-EOPNOTSUPP);
3695 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3696 		return ERR_PTR(-EOPNOTSUPP);
3697 
3698 	/*
3699 	 * domain_alloc_user op needs to fully initialize a domain before
3700 	 * return, so uses iommu_domain_alloc() here for simple.
3701 	 */
3702 	domain = iommu_domain_alloc(dev->bus);
3703 	if (!domain)
3704 		return ERR_PTR(-ENOMEM);
3705 
3706 	dmar_domain = to_dmar_domain(domain);
3707 
3708 	if (nested_parent) {
3709 		dmar_domain->nested_parent = true;
3710 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3711 		spin_lock_init(&dmar_domain->s1_lock);
3712 	}
3713 
3714 	if (dirty_tracking) {
3715 		if (dmar_domain->use_first_level) {
3716 			iommu_domain_free(domain);
3717 			return ERR_PTR(-EOPNOTSUPP);
3718 		}
3719 		domain->dirty_ops = &intel_dirty_ops;
3720 	}
3721 
3722 	return domain;
3723 }
3724 
3725 static void intel_iommu_domain_free(struct iommu_domain *domain)
3726 {
3727 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3728 
3729 	WARN_ON(dmar_domain->nested_parent &&
3730 		!list_empty(&dmar_domain->s1_domains));
3731 	if (domain != &si_domain->domain)
3732 		domain_exit(dmar_domain);
3733 }
3734 
3735 int prepare_domain_attach_device(struct iommu_domain *domain,
3736 				 struct device *dev)
3737 {
3738 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3739 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3740 	struct intel_iommu *iommu = info->iommu;
3741 	int addr_width;
3742 
3743 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3744 		return -EINVAL;
3745 
3746 	if (domain->dirty_ops && !ssads_supported(iommu))
3747 		return -EINVAL;
3748 
3749 	/* check if this iommu agaw is sufficient for max mapped address */
3750 	addr_width = agaw_to_width(iommu->agaw);
3751 	if (addr_width > cap_mgaw(iommu->cap))
3752 		addr_width = cap_mgaw(iommu->cap);
3753 
3754 	if (dmar_domain->max_addr > (1LL << addr_width))
3755 		return -EINVAL;
3756 	dmar_domain->gaw = addr_width;
3757 
3758 	/*
3759 	 * Knock out extra levels of page tables if necessary
3760 	 */
3761 	while (iommu->agaw < dmar_domain->agaw) {
3762 		struct dma_pte *pte;
3763 
3764 		pte = dmar_domain->pgd;
3765 		if (dma_pte_present(pte)) {
3766 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3767 			iommu_free_page(pte);
3768 		}
3769 		dmar_domain->agaw--;
3770 	}
3771 
3772 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3773 	    context_copied(iommu, info->bus, info->devfn))
3774 		return intel_pasid_setup_sm_context(dev);
3775 
3776 	return 0;
3777 }
3778 
3779 static int intel_iommu_attach_device(struct iommu_domain *domain,
3780 				     struct device *dev)
3781 {
3782 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3783 	int ret;
3784 
3785 	if (info->domain)
3786 		device_block_translation(dev);
3787 
3788 	ret = prepare_domain_attach_device(domain, dev);
3789 	if (ret)
3790 		return ret;
3791 
3792 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3793 }
3794 
3795 static int intel_iommu_map(struct iommu_domain *domain,
3796 			   unsigned long iova, phys_addr_t hpa,
3797 			   size_t size, int iommu_prot, gfp_t gfp)
3798 {
3799 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3800 	u64 max_addr;
3801 	int prot = 0;
3802 
3803 	if (iommu_prot & IOMMU_READ)
3804 		prot |= DMA_PTE_READ;
3805 	if (iommu_prot & IOMMU_WRITE)
3806 		prot |= DMA_PTE_WRITE;
3807 	if (dmar_domain->set_pte_snp)
3808 		prot |= DMA_PTE_SNP;
3809 
3810 	max_addr = iova + size;
3811 	if (dmar_domain->max_addr < max_addr) {
3812 		u64 end;
3813 
3814 		/* check if minimum agaw is sufficient for mapped address */
3815 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3816 		if (end < max_addr) {
3817 			pr_err("%s: iommu width (%d) is not "
3818 			       "sufficient for the mapped address (%llx)\n",
3819 			       __func__, dmar_domain->gaw, max_addr);
3820 			return -EFAULT;
3821 		}
3822 		dmar_domain->max_addr = max_addr;
3823 	}
3824 	/* Round up size to next multiple of PAGE_SIZE, if it and
3825 	   the low bits of hpa would take us onto the next page */
3826 	size = aligned_nrpages(hpa, size);
3827 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3828 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3829 }
3830 
3831 static int intel_iommu_map_pages(struct iommu_domain *domain,
3832 				 unsigned long iova, phys_addr_t paddr,
3833 				 size_t pgsize, size_t pgcount,
3834 				 int prot, gfp_t gfp, size_t *mapped)
3835 {
3836 	unsigned long pgshift = __ffs(pgsize);
3837 	size_t size = pgcount << pgshift;
3838 	int ret;
3839 
3840 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3841 		return -EINVAL;
3842 
3843 	if (!IS_ALIGNED(iova | paddr, pgsize))
3844 		return -EINVAL;
3845 
3846 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3847 	if (!ret && mapped)
3848 		*mapped = size;
3849 
3850 	return ret;
3851 }
3852 
3853 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3854 				unsigned long iova, size_t size,
3855 				struct iommu_iotlb_gather *gather)
3856 {
3857 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3858 	unsigned long start_pfn, last_pfn;
3859 	int level = 0;
3860 
3861 	/* Cope with horrid API which requires us to unmap more than the
3862 	   size argument if it happens to be a large-page mapping. */
3863 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3864 				     &level, GFP_ATOMIC)))
3865 		return 0;
3866 
3867 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3868 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3869 
3870 	start_pfn = iova >> VTD_PAGE_SHIFT;
3871 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3872 
3873 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3874 
3875 	if (dmar_domain->max_addr == iova + size)
3876 		dmar_domain->max_addr = iova;
3877 
3878 	/*
3879 	 * We do not use page-selective IOTLB invalidation in flush queue,
3880 	 * so there is no need to track page and sync iotlb.
3881 	 */
3882 	if (!iommu_iotlb_gather_queued(gather))
3883 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3884 
3885 	return size;
3886 }
3887 
3888 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3889 				      unsigned long iova,
3890 				      size_t pgsize, size_t pgcount,
3891 				      struct iommu_iotlb_gather *gather)
3892 {
3893 	unsigned long pgshift = __ffs(pgsize);
3894 	size_t size = pgcount << pgshift;
3895 
3896 	return intel_iommu_unmap(domain, iova, size, gather);
3897 }
3898 
3899 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3900 				 struct iommu_iotlb_gather *gather)
3901 {
3902 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3903 			      gather->end, list_empty(&gather->freelist));
3904 	iommu_put_pages_list(&gather->freelist);
3905 }
3906 
3907 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3908 					    dma_addr_t iova)
3909 {
3910 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3911 	struct dma_pte *pte;
3912 	int level = 0;
3913 	u64 phys = 0;
3914 
3915 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3916 			     GFP_ATOMIC);
3917 	if (pte && dma_pte_present(pte))
3918 		phys = dma_pte_addr(pte) +
3919 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3920 						VTD_PAGE_SHIFT) - 1));
3921 
3922 	return phys;
3923 }
3924 
3925 static bool domain_support_force_snooping(struct dmar_domain *domain)
3926 {
3927 	struct device_domain_info *info;
3928 	bool support = true;
3929 
3930 	assert_spin_locked(&domain->lock);
3931 	list_for_each_entry(info, &domain->devices, link) {
3932 		if (!ecap_sc_support(info->iommu->ecap)) {
3933 			support = false;
3934 			break;
3935 		}
3936 	}
3937 
3938 	return support;
3939 }
3940 
3941 static void domain_set_force_snooping(struct dmar_domain *domain)
3942 {
3943 	struct device_domain_info *info;
3944 
3945 	assert_spin_locked(&domain->lock);
3946 	/*
3947 	 * Second level page table supports per-PTE snoop control. The
3948 	 * iommu_map() interface will handle this by setting SNP bit.
3949 	 */
3950 	if (!domain->use_first_level) {
3951 		domain->set_pte_snp = true;
3952 		return;
3953 	}
3954 
3955 	list_for_each_entry(info, &domain->devices, link)
3956 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3957 						     IOMMU_NO_PASID);
3958 }
3959 
3960 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3961 {
3962 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3963 	unsigned long flags;
3964 
3965 	if (dmar_domain->force_snooping)
3966 		return true;
3967 
3968 	spin_lock_irqsave(&dmar_domain->lock, flags);
3969 	if (!domain_support_force_snooping(dmar_domain) ||
3970 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3971 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3972 		return false;
3973 	}
3974 
3975 	domain_set_force_snooping(dmar_domain);
3976 	dmar_domain->force_snooping = true;
3977 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3978 
3979 	return true;
3980 }
3981 
3982 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3983 {
3984 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3985 
3986 	switch (cap) {
3987 	case IOMMU_CAP_CACHE_COHERENCY:
3988 	case IOMMU_CAP_DEFERRED_FLUSH:
3989 		return true;
3990 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3991 		return dmar_platform_optin();
3992 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3993 		return ecap_sc_support(info->iommu->ecap);
3994 	case IOMMU_CAP_DIRTY_TRACKING:
3995 		return ssads_supported(info->iommu);
3996 	default:
3997 		return false;
3998 	}
3999 }
4000 
4001 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4002 {
4003 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4004 	struct device_domain_info *info;
4005 	struct intel_iommu *iommu;
4006 	u8 bus, devfn;
4007 	int ret;
4008 
4009 	iommu = device_lookup_iommu(dev, &bus, &devfn);
4010 	if (!iommu || !iommu->iommu.ops)
4011 		return ERR_PTR(-ENODEV);
4012 
4013 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4014 	if (!info)
4015 		return ERR_PTR(-ENOMEM);
4016 
4017 	if (dev_is_real_dma_subdevice(dev)) {
4018 		info->bus = pdev->bus->number;
4019 		info->devfn = pdev->devfn;
4020 		info->segment = pci_domain_nr(pdev->bus);
4021 	} else {
4022 		info->bus = bus;
4023 		info->devfn = devfn;
4024 		info->segment = iommu->segment;
4025 	}
4026 
4027 	info->dev = dev;
4028 	info->iommu = iommu;
4029 	if (dev_is_pci(dev)) {
4030 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4031 		    pci_ats_supported(pdev) &&
4032 		    dmar_ats_supported(pdev, iommu)) {
4033 			info->ats_supported = 1;
4034 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4035 
4036 			/*
4037 			 * For IOMMU that supports device IOTLB throttling
4038 			 * (DIT), we assign PFSID to the invalidation desc
4039 			 * of a VF such that IOMMU HW can gauge queue depth
4040 			 * at PF level. If DIT is not set, PFSID will be
4041 			 * treated as reserved, which should be set to 0.
4042 			 */
4043 			if (ecap_dit(iommu->ecap))
4044 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4045 			info->ats_qdep = pci_ats_queue_depth(pdev);
4046 		}
4047 		if (sm_supported(iommu)) {
4048 			if (pasid_supported(iommu)) {
4049 				int features = pci_pasid_features(pdev);
4050 
4051 				if (features >= 0)
4052 					info->pasid_supported = features | 1;
4053 			}
4054 
4055 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4056 			    pci_pri_supported(pdev))
4057 				info->pri_supported = 1;
4058 		}
4059 	}
4060 
4061 	dev_iommu_priv_set(dev, info);
4062 	if (pdev && pci_ats_supported(pdev)) {
4063 		ret = device_rbtree_insert(iommu, info);
4064 		if (ret)
4065 			goto free;
4066 	}
4067 
4068 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4069 		ret = intel_pasid_alloc_table(dev);
4070 		if (ret) {
4071 			dev_err(dev, "PASID table allocation failed\n");
4072 			goto clear_rbtree;
4073 		}
4074 
4075 		if (!context_copied(iommu, info->bus, info->devfn)) {
4076 			ret = intel_pasid_setup_sm_context(dev);
4077 			if (ret)
4078 				goto free_table;
4079 		}
4080 	}
4081 
4082 	intel_iommu_debugfs_create_dev(info);
4083 
4084 	return &iommu->iommu;
4085 free_table:
4086 	intel_pasid_free_table(dev);
4087 clear_rbtree:
4088 	device_rbtree_remove(info);
4089 free:
4090 	kfree(info);
4091 
4092 	return ERR_PTR(ret);
4093 }
4094 
4095 static void intel_iommu_release_device(struct device *dev)
4096 {
4097 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4098 	struct intel_iommu *iommu = info->iommu;
4099 
4100 	mutex_lock(&iommu->iopf_lock);
4101 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4102 		device_rbtree_remove(info);
4103 	mutex_unlock(&iommu->iopf_lock);
4104 
4105 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4106 	    !context_copied(iommu, info->bus, info->devfn))
4107 		intel_pasid_teardown_sm_context(dev);
4108 
4109 	intel_pasid_free_table(dev);
4110 	intel_iommu_debugfs_remove_dev(info);
4111 	kfree(info);
4112 	set_dma_ops(dev, NULL);
4113 }
4114 
4115 static void intel_iommu_get_resv_regions(struct device *device,
4116 					 struct list_head *head)
4117 {
4118 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4119 	struct iommu_resv_region *reg;
4120 	struct dmar_rmrr_unit *rmrr;
4121 	struct device *i_dev;
4122 	int i;
4123 
4124 	rcu_read_lock();
4125 	for_each_rmrr_units(rmrr) {
4126 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4127 					  i, i_dev) {
4128 			struct iommu_resv_region *resv;
4129 			enum iommu_resv_type type;
4130 			size_t length;
4131 
4132 			if (i_dev != device &&
4133 			    !is_downstream_to_pci_bridge(device, i_dev))
4134 				continue;
4135 
4136 			length = rmrr->end_address - rmrr->base_address + 1;
4137 
4138 			type = device_rmrr_is_relaxable(device) ?
4139 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4140 
4141 			resv = iommu_alloc_resv_region(rmrr->base_address,
4142 						       length, prot, type,
4143 						       GFP_ATOMIC);
4144 			if (!resv)
4145 				break;
4146 
4147 			list_add_tail(&resv->list, head);
4148 		}
4149 	}
4150 	rcu_read_unlock();
4151 
4152 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4153 	if (dev_is_pci(device)) {
4154 		struct pci_dev *pdev = to_pci_dev(device);
4155 
4156 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4157 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4158 					IOMMU_RESV_DIRECT_RELAXABLE,
4159 					GFP_KERNEL);
4160 			if (reg)
4161 				list_add_tail(&reg->list, head);
4162 		}
4163 	}
4164 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4165 
4166 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4167 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4168 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4169 	if (!reg)
4170 		return;
4171 	list_add_tail(&reg->list, head);
4172 }
4173 
/*
 * device_group op: PCI devices use pci_device_group(); every other device
 * uses generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev)
			       : generic_device_group(dev);
}
4180 
4181 static int intel_iommu_enable_sva(struct device *dev)
4182 {
4183 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4184 	struct intel_iommu *iommu;
4185 
4186 	if (!info || dmar_disabled)
4187 		return -EINVAL;
4188 
4189 	iommu = info->iommu;
4190 	if (!iommu)
4191 		return -EINVAL;
4192 
4193 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4194 		return -ENODEV;
4195 
4196 	if (!info->pasid_enabled || !info->ats_enabled)
4197 		return -EINVAL;
4198 
4199 	/*
4200 	 * Devices having device-specific I/O fault handling should not
4201 	 * support PCI/PRI. The IOMMU side has no means to check the
4202 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4203 	 * default that if the device driver enables SVA on a non-PRI
4204 	 * device, it will handle IOPF in its own way.
4205 	 */
4206 	if (!info->pri_supported)
4207 		return 0;
4208 
4209 	/* Devices supporting PRI should have it enabled. */
4210 	if (!info->pri_enabled)
4211 		return -EINVAL;
4212 
4213 	return 0;
4214 }
4215 
4216 static int intel_iommu_enable_iopf(struct device *dev)
4217 {
4218 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4219 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4220 	struct intel_iommu *iommu;
4221 	int ret;
4222 
4223 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4224 		return -ENODEV;
4225 
4226 	if (info->pri_enabled)
4227 		return -EBUSY;
4228 
4229 	iommu = info->iommu;
4230 	if (!iommu)
4231 		return -EINVAL;
4232 
4233 	/* PASID is required in PRG Response Message. */
4234 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4235 		return -EINVAL;
4236 
4237 	ret = pci_reset_pri(pdev);
4238 	if (ret)
4239 		return ret;
4240 
4241 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4242 	if (ret)
4243 		return ret;
4244 
4245 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4246 	if (ret) {
4247 		iopf_queue_remove_device(iommu->iopf_queue, dev);
4248 		return ret;
4249 	}
4250 
4251 	info->pri_enabled = 1;
4252 
4253 	return 0;
4254 }
4255 
4256 static int intel_iommu_disable_iopf(struct device *dev)
4257 {
4258 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4259 	struct intel_iommu *iommu = info->iommu;
4260 
4261 	if (!info->pri_enabled)
4262 		return -EINVAL;
4263 
4264 	/*
4265 	 * PCIe spec states that by clearing PRI enable bit, the Page
4266 	 * Request Interface will not issue new page requests, but has
4267 	 * outstanding page requests that have been transmitted or are
4268 	 * queued for transmission. This is supposed to be called after
4269 	 * the device driver has stopped DMA, all PASIDs have been
4270 	 * unbound and the outstanding PRQs have been drained.
4271 	 */
4272 	pci_disable_pri(to_pci_dev(dev));
4273 	info->pri_enabled = 0;
4274 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4275 
4276 	return 0;
4277 }
4278 
4279 static int
4280 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4281 {
4282 	switch (feat) {
4283 	case IOMMU_DEV_FEAT_IOPF:
4284 		return intel_iommu_enable_iopf(dev);
4285 
4286 	case IOMMU_DEV_FEAT_SVA:
4287 		return intel_iommu_enable_sva(dev);
4288 
4289 	default:
4290 		return -ENODEV;
4291 	}
4292 }
4293 
4294 static int
4295 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4296 {
4297 	switch (feat) {
4298 	case IOMMU_DEV_FEAT_IOPF:
4299 		return intel_iommu_disable_iopf(dev);
4300 
4301 	case IOMMU_DEV_FEAT_SVA:
4302 		return 0;
4303 
4304 	default:
4305 		return -ENODEV;
4306 	}
4307 }
4308 
4309 static bool intel_iommu_is_attach_deferred(struct device *dev)
4310 {
4311 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4312 
4313 	return translation_pre_enabled(info->iommu) && !info->domain;
4314 }
4315 
4316 /*
4317  * Check that the device does not live on an external facing PCI port that is
4318  * marked as untrusted. Such devices should not be able to apply quirks and
4319  * thus not be able to bypass the IOMMU restrictions.
4320  */
4321 static bool risky_device(struct pci_dev *pdev)
4322 {
4323 	if (pdev->untrusted) {
4324 		pci_info(pdev,
4325 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4326 			 pdev->vendor, pdev->device);
4327 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4328 		return true;
4329 	}
4330 	return false;
4331 }
4332 
4333 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4334 				      unsigned long iova, size_t size)
4335 {
4336 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4337 
4338 	return 0;
4339 }
4340 
4341 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4342 					 struct iommu_domain *domain)
4343 {
4344 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4345 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4346 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4347 	struct intel_iommu *iommu = info->iommu;
4348 	unsigned long flags;
4349 
4350 	spin_lock_irqsave(&dmar_domain->lock, flags);
4351 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4352 		if (curr->dev == dev && curr->pasid == pasid) {
4353 			list_del(&curr->link_domain);
4354 			dev_pasid = curr;
4355 			break;
4356 		}
4357 	}
4358 	WARN_ON_ONCE(!dev_pasid);
4359 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4360 
4361 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4362 	domain_detach_iommu(dmar_domain, iommu);
4363 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4364 	kfree(dev_pasid);
4365 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4366 	intel_drain_pasid_prq(dev, pasid);
4367 }
4368 
/*
 * Attach @domain to @pasid of @dev.
 *
 * Returns 0 on success, -EOPNOTSUPP when the IOMMU lacks PASID support or
 * @dev is a real-DMA subdevice, -EINVAL when @domain has dirty_ops set,
 * -EBUSY when the device's context entry was copied, -ENOMEM when the
 * tracking structure cannot be allocated, or the error returned by one of
 * the setup helpers. On failure every step already taken is undone.
 */
static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
				     struct device *dev, ioasid_t pasid)
{
	struct device_domain_info *info = dev_iommu_priv_get(dev);
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu = info->iommu;
	struct dev_pasid_info *dev_pasid;
	unsigned long flags;
	int ret;

	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
		return -EOPNOTSUPP;

	/* Domains carrying dirty_ops are rejected for PASID attachment. */
	if (domain->dirty_ops)
		return -EINVAL;

	if (context_copied(iommu, info->bus, info->devfn))
		return -EBUSY;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	/* Tracking entry linking (dev, pasid) to the domain. */
	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
	if (!dev_pasid)
		return -ENOMEM;

	ret = domain_attach_iommu(dmar_domain, iommu);
	if (ret)
		goto out_free;

	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
	if (ret)
		goto out_detach_iommu;

	/*
	 * Program the PASID entry according to the domain flavour:
	 * pass-through for an si domain, first-level when the domain uses
	 * first-level translation, second-level otherwise.
	 */
	if (domain_type_is_si(dmar_domain))
		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
	else if (dmar_domain->use_first_level)
		ret = domain_setup_first_level(iommu, dmar_domain,
					       dev, pasid);
	else
		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
						     dev, pasid);
	if (ret)
		goto out_unassign_tag;

	/* Publish the entry on the domain's dev_pasids list. */
	dev_pasid->dev = dev;
	dev_pasid->pasid = pasid;
	spin_lock_irqsave(&dmar_domain->lock, flags);
	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
	spin_unlock_irqrestore(&dmar_domain->lock, flags);

	/* A debugfs entry is created for paging domains only. */
	if (domain->type & __IOMMU_DOMAIN_PAGING)
		intel_iommu_debugfs_create_dev_pasid(dev_pasid);

	return 0;
	/* Error unwinding, in reverse order of the setup steps above. */
out_unassign_tag:
	cache_tag_unassign_domain(dmar_domain, dev, pasid);
out_detach_iommu:
	domain_detach_iommu(dmar_domain, iommu);
out_free:
	kfree(dev_pasid);
	return ret;
}
4433 
4434 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4435 {
4436 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4437 	struct intel_iommu *iommu = info->iommu;
4438 	struct iommu_hw_info_vtd *vtd;
4439 
4440 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4441 	if (!vtd)
4442 		return ERR_PTR(-ENOMEM);
4443 
4444 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4445 	vtd->cap_reg = iommu->cap;
4446 	vtd->ecap_reg = iommu->ecap;
4447 	*length = sizeof(*vtd);
4448 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4449 	return vtd;
4450 }
4451 
4452 /*
4453  * Set dirty tracking for the device list of a domain. The caller must
4454  * hold the domain->lock when calling it.
4455  */
4456 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4457 {
4458 	struct device_domain_info *info;
4459 	int ret = 0;
4460 
4461 	list_for_each_entry(info, devices, link) {
4462 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4463 						       IOMMU_NO_PASID, enable);
4464 		if (ret)
4465 			break;
4466 	}
4467 
4468 	return ret;
4469 }
4470 
4471 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4472 					    bool enable)
4473 {
4474 	struct dmar_domain *s1_domain;
4475 	unsigned long flags;
4476 	int ret;
4477 
4478 	spin_lock(&domain->s1_lock);
4479 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4480 		spin_lock_irqsave(&s1_domain->lock, flags);
4481 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4482 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4483 		if (ret)
4484 			goto err_unwind;
4485 	}
4486 	spin_unlock(&domain->s1_lock);
4487 	return 0;
4488 
4489 err_unwind:
4490 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4491 		spin_lock_irqsave(&s1_domain->lock, flags);
4492 		device_set_dirty_tracking(&s1_domain->devices,
4493 					  domain->dirty_tracking);
4494 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4495 	}
4496 	spin_unlock(&domain->s1_lock);
4497 	return ret;
4498 }
4499 
4500 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4501 					  bool enable)
4502 {
4503 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4504 	int ret;
4505 
4506 	spin_lock(&dmar_domain->lock);
4507 	if (dmar_domain->dirty_tracking == enable)
4508 		goto out_unlock;
4509 
4510 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4511 	if (ret)
4512 		goto err_unwind;
4513 
4514 	if (dmar_domain->nested_parent) {
4515 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4516 		if (ret)
4517 			goto err_unwind;
4518 	}
4519 
4520 	dmar_domain->dirty_tracking = enable;
4521 out_unlock:
4522 	spin_unlock(&dmar_domain->lock);
4523 
4524 	return 0;
4525 
4526 err_unwind:
4527 	device_set_dirty_tracking(&dmar_domain->devices,
4528 				  dmar_domain->dirty_tracking);
4529 	spin_unlock(&dmar_domain->lock);
4530 	return ret;
4531 }
4532 
4533 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4534 					    unsigned long iova, size_t size,
4535 					    unsigned long flags,
4536 					    struct iommu_dirty_bitmap *dirty)
4537 {
4538 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4539 	unsigned long end = iova + size - 1;
4540 	unsigned long pgsize;
4541 
4542 	/*
4543 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4544 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4545 	 * have occurred when we stopped dirty tracking. This ensures that we
4546 	 * never inherit dirtied bits from a previous cycle.
4547 	 */
4548 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4549 		return -EINVAL;
4550 
4551 	do {
4552 		struct dma_pte *pte;
4553 		int lvl = 0;
4554 
4555 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4556 				     GFP_ATOMIC);
4557 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4558 		if (!pte || !dma_pte_present(pte)) {
4559 			iova += pgsize;
4560 			continue;
4561 		}
4562 
4563 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4564 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4565 		iova += pgsize;
4566 	} while (iova < end);
4567 
4568 	return 0;
4569 }
4570 
/*
 * Dirty tracking callbacks of this driver: one to switch tracking on or off
 * for a domain, one to read and clear the dirty bits of an IOVA range.
 */
static const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
4575 
/*
 * IOMMU operations table of the VT-d driver (non-static, so it is visible
 * outside this file). Device-level callbacks live in the outer structure;
 * the per-domain callbacks are supplied through default_domain_ops.
 */
const struct iommu_ops intel_iommu_ops = {
	/* The same blocking domain serves as blocked and release domain. */
	.blocked_domain		= &blocking_domain,
	.release_domain		= &blocking_domain,
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_alloc_user	= intel_iommu_domain_alloc_user,
	.domain_alloc_sva	= intel_svm_domain_alloc,
	.probe_device		= intel_iommu_probe_device,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	.pgsize_bitmap		= SZ_4K,
	/* Page response handling is only wired up when SVM is built in. */
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response		= intel_svm_page_response,
#endif
	/* Domain callbacks, provided as an anonymous constant structure. */
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4610 
4611 static void quirk_iommu_igfx(struct pci_dev *dev)
4612 {
4613 	if (risky_device(dev))
4614 		return;
4615 
4616 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4617 	disable_igfx_iommu = 1;
4618 }
4619 
/*
 * Register quirk_iommu_igfx() for the Intel device IDs listed below.
 *
 * G4x/GM45 integrated gfx dmar support is totally busted.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar; same quirk, Broadwell device IDs. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4654 
4655 static void quirk_iommu_rwbf(struct pci_dev *dev)
4656 {
4657 	if (risky_device(dev))
4658 		return;
4659 
4660 	/*
4661 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4662 	 * but needs it. Same seems to hold for the desktop versions.
4663 	 */
4664 	pci_info(dev, "Forcing write-buffer flush capability\n");
4665 	rwbf_quirk = 1;
4666 }
4667 
/* Register quirk_iommu_rwbf() for the Intel device IDs listed below. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4675 
/*
 * GGC: PCI config space offset (0x52) of the 16-bit word read by
 * quirk_calpella_no_shadow_gtt(). NOTE(review): presumably the graphics
 * control register of the host bridge; confirm against the chipset datasheet.
 *
 * The memory-size field occupies bits 11:8. GGC_MEMORY_VT_ENABLED is the top
 * bit of that field and is set in every *_VT encoding below.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4685 
4686 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4687 {
4688 	unsigned short ggc;
4689 
4690 	if (risky_device(dev))
4691 		return;
4692 
4693 	if (pci_read_config_word(dev, GGC, &ggc))
4694 		return;
4695 
4696 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4697 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4698 		disable_igfx_iommu = 1;
4699 	} else if (!disable_igfx_iommu) {
4700 		/* we have to ensure the gfx device is idle before we flush */
4701 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4702 		iommu_set_dma_strict();
4703 	}
4704 }
/* Register quirk_calpella_no_shadow_gtt() for the Intel device IDs below. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4709 
4710 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4711 {
4712 	unsigned short ver;
4713 
4714 	if (!IS_GFX_DEVICE(dev))
4715 		return;
4716 
4717 	ver = (dev->device >> 8) & 0xff;
4718 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4719 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4720 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4721 		return;
4722 
4723 	if (risky_device(dev))
4724 		return;
4725 
4726 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4727 	iommu_skip_te_disable = 1;
4728 }
/*
 * Registered for every Intel device ID (PCI_ANY_ID); the quirk itself
 * narrows the match down to the graphics devices it cares about.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4730 
4731 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4732    ISOCH DMAR unit for the Azalia sound device, but not give it any
4733    TLB entries, which causes it to deadlock. Check for that.  We do
4734    this in a function called from init_dmars(), instead of in a PCI
4735    quirk, because we don't want to print the obnoxious "BIOS broken"
4736    message if VT-d is actually disabled.
4737 */
4738 static void __init check_tylersburg_isoch(void)
4739 {
4740 	struct pci_dev *pdev;
4741 	uint32_t vtisochctrl;
4742 
4743 	/* If there's no Azalia in the system anyway, forget it. */
4744 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4745 	if (!pdev)
4746 		return;
4747 
4748 	if (risky_device(pdev)) {
4749 		pci_dev_put(pdev);
4750 		return;
4751 	}
4752 
4753 	pci_dev_put(pdev);
4754 
4755 	/* System Management Registers. Might be hidden, in which case
4756 	   we can't do the sanity check. But that's OK, because the
4757 	   known-broken BIOSes _don't_ actually hide it, so far. */
4758 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4759 	if (!pdev)
4760 		return;
4761 
4762 	if (risky_device(pdev)) {
4763 		pci_dev_put(pdev);
4764 		return;
4765 	}
4766 
4767 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4768 		pci_dev_put(pdev);
4769 		return;
4770 	}
4771 
4772 	pci_dev_put(pdev);
4773 
4774 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4775 	if (vtisochctrl & 1)
4776 		return;
4777 
4778 	/* Drop all bits other than the number of TLB entries */
4779 	vtisochctrl &= 0x1c;
4780 
4781 	/* If we have the recommended number of TLB entries (16), fine. */
4782 	if (vtisochctrl == 0x10)
4783 		return;
4784 
4785 	/* Zero TLB entries? You get to ride the short bus to school. */
4786 	if (!vtisochctrl) {
4787 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4788 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4789 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4790 		     dmi_get_system_info(DMI_BIOS_VERSION),
4791 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4792 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4793 		return;
4794 	}
4795 
4796 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4797 	       vtisochctrl);
4798 }
4799 
4800 /*
4801  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4802  * invalidation completion before posted writes initiated with translated address
4803  * that utilized translations matching the invalidation address range, violating
4804  * the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
 * vulnerable to this defect. In other words, any dTLB invalidation that is not
 * initiated under the control of the trusted/privileged host device driver
 * must use this quirk.
4809  * Device TLBs are invalidated under the following six conditions:
4810  * 1. Device driver does DMA API unmap IOVA
 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4812  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4813  *    exit_mmap() due to crash
4814  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4815  *    VM has to free pages that were unmapped
4816  * 5. Userspace driver unmaps a DMA buffer
4817  * 6. Cache invalidation in vSVA usage (upcoming)
4818  *
4819  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4820  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4821  * invalidate TLB the same way as normal user unmap which will use this quirk.
4822  * The dTLB invalidation after PASID cache flush does not need this quirk.
4823  *
4824  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4825  */
4826 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4827 			       unsigned long address, unsigned long mask,
4828 			       u32 pasid, u16 qdep)
4829 {
4830 	u16 sid;
4831 
4832 	if (likely(!info->dtlb_extra_inval))
4833 		return;
4834 
4835 	sid = PCI_DEVID(info->bus, info->devfn);
4836 	if (pasid == IOMMU_NO_PASID) {
4837 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4838 				   qdep, address, mask);
4839 	} else {
4840 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4841 					 pasid, qdep, address, mask);
4842 	}
4843 }
4844 
/* Status code of an enhanced command response: bits 7:1 of @res. */
#define ecmd_get_status_code(res)	(((res) >> 1) & 0x7f)
4846 
4847 /*
4848  * Function to submit a command to the enhanced command interface. The
4849  * valid enhanced command descriptions are defined in Table 47 of the
4850  * VT-d spec. The VT-d hardware implementation may support some but not
4851  * all commands, which can be determined by checking the Enhanced
4852  * Command Capability Register.
4853  *
4854  * Return values:
4855  *  - 0: Command successful without any error;
4856  *  - Negative: software error value;
4857  *  - Nonzero positive: failure status code defined in Table 48.
4858  */
4859 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4860 {
4861 	unsigned long flags;
4862 	u64 res;
4863 	int ret;
4864 
4865 	if (!cap_ecmds(iommu->cap))
4866 		return -ENODEV;
4867 
4868 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4869 
4870 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4871 	if (res & DMA_ECMD_ECRSP_IP) {
4872 		ret = -EBUSY;
4873 		goto err;
4874 	}
4875 
4876 	/*
4877 	 * Unconditionally write the operand B, because
4878 	 * - There is no side effect if an ecmd doesn't require an
4879 	 *   operand B, but we set the register to some value.
4880 	 * - It's not invoked in any critical path. The extra MMIO
4881 	 *   write doesn't bring any performance concerns.
4882 	 */
4883 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4884 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4885 
4886 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4887 		      !(res & DMA_ECMD_ECRSP_IP), res);
4888 
4889 	if (res & DMA_ECMD_ECRSP_IP) {
4890 		ret = -ETIMEDOUT;
4891 		goto err;
4892 	}
4893 
4894 	ret = ecmd_get_status_code(res);
4895 err:
4896 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4897 
4898 	return ret;
4899 }
4900