xref: /linux/drivers/iommu/intel/iommu.c (revision 6e7fd890f1d6ac83805409e9c346240de2705584)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
35 #define ROOT_SIZE		VTD_PAGE_SIZE
36 #define CONTEXT_SIZE		VTD_PAGE_SIZE
37 
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42 
43 #define IOAPIC_RANGE_START	(0xfee00000)
44 #define IOAPIC_RANGE_END	(0xfeefffff)
45 #define IOVA_START_ADDR		(0x1000)
46 
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
49 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51 
52 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
55 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
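
/*
 * Worked example (assuming VTD_PAGE_SHIFT is 12, i.e. 4KiB VT-d pages):
 * with the default 57-bit address width, __DOMAIN_MAX_PFN(57) is
 * (1ULL << 45) - 1 and __DOMAIN_MAX_ADDR(57) is (1ULL << 57) - 1. On a
 * 32-bit kernel, DOMAIN_MAX_PFN() clamps the former to ULONG_MAX.
 */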
57 
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60 
61 /*
62  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
63  * (used when the kernel is launched with TXT).
64  */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68 
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
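/*
 * Each IOMMU keeps an rbtree of probed devices keyed by PCI requester ID
 * (bus number in the high byte, devfn in the low byte). The two helpers
 * below are the comparison callbacks used with rb_find() and rb_find_add().
 */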
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if that is a possibility.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
170 /*
171  * This domain is a static identity mapping domain.
172  *	1. This domain creates a static 1:1 mapping to all usable memory.
173  *	2. It maps to each iommu if successful.
174  *	3. Each iommu maps to this domain if successful.
175  */
176 static struct dmar_domain *si_domain;
177 static int hw_pass_through = 1;
178 
179 struct dmar_rmrr_unit {
180 	struct list_head list;		/* list of rmrr units	*/
181 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
182 	u64	base_address;		/* reserved base address*/
183 	u64	end_address;		/* reserved end address */
184 	struct dmar_dev_scope *devices;	/* target devices */
185 	int	devices_cnt;		/* target device count */
186 };
187 
188 struct dmar_atsr_unit {
189 	struct list_head list;		/* list of ATSR units */
190 	struct acpi_dmar_header *hdr;	/* ACPI header */
191 	struct dmar_dev_scope *devices;	/* target devices */
192 	int devices_cnt;		/* target device count */
193 	u8 include_all:1;		/* include all ports */
194 };
195 
196 struct dmar_satc_unit {
197 	struct list_head list;		/* list of SATC units */
198 	struct acpi_dmar_header *hdr;	/* ACPI header */
199 	struct dmar_dev_scope *devices;	/* target devices */
200 	struct intel_iommu *iommu;	/* the corresponding iommu */
201 	int devices_cnt;		/* target device count */
202 	u8 atc_required:1;		/* ATS is required */
203 };
204 
205 static LIST_HEAD(dmar_atsr_units);
206 static LIST_HEAD(dmar_rmrr_units);
207 static LIST_HEAD(dmar_satc_units);
208 
209 #define for_each_rmrr_units(rmrr) \
210 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
211 
212 static void intel_iommu_domain_free(struct iommu_domain *domain);
213 
214 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
215 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
216 
217 int intel_iommu_enabled = 0;
218 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
219 
220 static int intel_iommu_superpage = 1;
221 static int iommu_identity_mapping;
222 static int iommu_skip_te_disable;
223 static int disable_igfx_iommu;
224 
225 #define IDENTMAP_AZALIA		4
226 
227 const struct iommu_ops intel_iommu_ops;
228 static const struct iommu_dirty_ops intel_dirty_ops;
229 
230 static bool translation_pre_enabled(struct intel_iommu *iommu)
231 {
232 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
233 }
234 
235 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
236 {
237 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
240 static void init_translation_status(struct intel_iommu *iommu)
241 {
242 	u32 gsts;
243 
244 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
245 	if (gsts & DMA_GSTS_TES)
246 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
247 }
248 
249 static int __init intel_iommu_setup(char *str)
250 {
251 	if (!str)
252 		return -EINVAL;
253 
254 	while (*str) {
255 		if (!strncmp(str, "on", 2)) {
256 			dmar_disabled = 0;
257 			pr_info("IOMMU enabled\n");
258 		} else if (!strncmp(str, "off", 3)) {
259 			dmar_disabled = 1;
260 			no_platform_optin = 1;
261 			pr_info("IOMMU disabled\n");
262 		} else if (!strncmp(str, "igfx_off", 8)) {
263 			disable_igfx_iommu = 1;
264 			pr_info("Disable GFX device mapping\n");
265 		} else if (!strncmp(str, "forcedac", 8)) {
266 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
267 			iommu_dma_forcedac = true;
268 		} else if (!strncmp(str, "strict", 6)) {
269 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
270 			iommu_set_dma_strict();
271 		} else if (!strncmp(str, "sp_off", 6)) {
272 			pr_info("Disable supported super page\n");
273 			intel_iommu_superpage = 0;
274 		} else if (!strncmp(str, "sm_on", 5)) {
275 			pr_info("Enable scalable mode if hardware supports\n");
276 			intel_iommu_sm = 1;
277 		} else if (!strncmp(str, "sm_off", 6)) {
278 			pr_info("Scalable mode is disallowed\n");
279 			intel_iommu_sm = 0;
280 		} else if (!strncmp(str, "tboot_noforce", 13)) {
281 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
282 			intel_iommu_tboot_noforce = 1;
283 		} else {
284 			pr_notice("Unknown option - '%s'\n", str);
285 		}
286 
287 		str += strcspn(str, ",");
288 		while (*str == ',')
289 			str++;
290 	}
291 
292 	return 1;
293 }
294 __setup("intel_iommu=", intel_iommu_setup);
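
/*
 * Example: booting with "intel_iommu=on,sm_on" enables DMA remapping and
 * opts in to scalable mode where the hardware supports it. Options are
 * comma-separated and handled in the order given.
 */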
295 
296 static int domain_type_is_si(struct dmar_domain *domain)
297 {
298 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
299 }
300 
301 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
302 {
303 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
304 
305 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
306 }
307 
308 /*
309  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
310  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
311  * the returned SAGAW.
312  */
313 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
314 {
315 	unsigned long fl_sagaw, sl_sagaw;
316 
317 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
318 	sl_sagaw = cap_sagaw(iommu->cap);
319 
320 	/* Second level only. */
321 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
322 		return sl_sagaw;
323 
324 	/* First level only. */
325 	if (!ecap_slts(iommu->ecap))
326 		return fl_sagaw;
327 
328 	return fl_sagaw & sl_sagaw;
329 }
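
/*
 * SAGAW bit positions correspond directly to AGAW values, so a unit that
 * reports only bit 2 supports a 48-bit (4-level) page table, while bit 3
 * indicates 57-bit (5-level) support; see the spec section referenced
 * above for the full encoding.
 */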
330 
331 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
332 {
333 	unsigned long sagaw;
334 	int agaw;
335 
336 	sagaw = __iommu_calculate_sagaw(iommu);
337 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
338 		if (test_bit(agaw, &sagaw))
339 			break;
340 	}
341 
342 	return agaw;
343 }
344 
345 /*
346  * Calculate max SAGAW for each iommu.
347  */
348 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
349 {
350 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
351 }
352 
353 /*
354  * Calculate the agaw for each iommu.
355  * "SAGAW" may be different across iommus, so use a default agaw and
356  * fall back to a smaller supported agaw for iommus that don't support the default.
357  */
358 int iommu_calculate_agaw(struct intel_iommu *iommu)
359 {
360 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
361 }
362 
363 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
364 {
365 	return sm_supported(iommu) ?
366 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
367 }
368 
369 static void domain_update_iommu_coherency(struct dmar_domain *domain)
370 {
371 	struct iommu_domain_info *info;
372 	struct dmar_drhd_unit *drhd;
373 	struct intel_iommu *iommu;
374 	bool found = false;
375 	unsigned long i;
376 
377 	domain->iommu_coherency = true;
378 	xa_for_each(&domain->iommu_array, i, info) {
379 		found = true;
380 		if (!iommu_paging_structure_coherency(info->iommu)) {
381 			domain->iommu_coherency = false;
382 			break;
383 		}
384 	}
385 	if (found)
386 		return;
387 
388 	/* No hardware attached; use lowest common denominator */
389 	rcu_read_lock();
390 	for_each_active_iommu(iommu, drhd) {
391 		if (!iommu_paging_structure_coherency(iommu)) {
392 			domain->iommu_coherency = false;
393 			break;
394 		}
395 	}
396 	rcu_read_unlock();
397 }
398 
399 static int domain_update_iommu_superpage(struct dmar_domain *domain,
400 					 struct intel_iommu *skip)
401 {
402 	struct dmar_drhd_unit *drhd;
403 	struct intel_iommu *iommu;
404 	int mask = 0x3;
405 
406 	if (!intel_iommu_superpage)
407 		return 0;
408 
409 	/* set iommu_superpage to the smallest common denominator */
410 	rcu_read_lock();
411 	for_each_active_iommu(iommu, drhd) {
412 		if (iommu != skip) {
413 			if (domain && domain->use_first_level) {
414 				if (!cap_fl1gp_support(iommu->cap))
415 					mask = 0x1;
416 			} else {
417 				mask &= cap_super_page_val(iommu->cap);
418 			}
419 
420 			if (!mask)
421 				break;
422 		}
423 	}
424 	rcu_read_unlock();
425 
426 	return fls(mask);
427 }
428 
429 static int domain_update_device_node(struct dmar_domain *domain)
430 {
431 	struct device_domain_info *info;
432 	int nid = NUMA_NO_NODE;
433 	unsigned long flags;
434 
435 	spin_lock_irqsave(&domain->lock, flags);
436 	list_for_each_entry(info, &domain->devices, link) {
437 		/*
438 		 * There could be multiple device NUMA nodes, as devices within
439 		 * the same domain may sit behind different IOMMUs. There is no
440 		 * perfect answer in such a situation, so we use a first-come,
441 		 * first-served policy.
442 		 */
443 		nid = dev_to_node(info->dev);
444 		if (nid != NUMA_NO_NODE)
445 			break;
446 	}
447 	spin_unlock_irqrestore(&domain->lock, flags);
448 
449 	return nid;
450 }
451 
452 /* Return the super pagesize bitmap if supported. */
453 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
454 {
455 	unsigned long bitmap = 0;
456 
457 	/*
458 	 * 1-level super page supports page size of 2MiB, 2-level super page
459 	 * supports page size of both 2MiB and 1GiB.
460 	 */
461 	if (domain->iommu_superpage == 1)
462 		bitmap |= SZ_2M;
463 	else if (domain->iommu_superpage == 2)
464 		bitmap |= SZ_2M | SZ_1G;
465 
466 	return bitmap;
467 }
468 
469 /* Some capabilities may be different across iommus */
470 void domain_update_iommu_cap(struct dmar_domain *domain)
471 {
472 	domain_update_iommu_coherency(domain);
473 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
474 
475 	/*
476 	 * If RHSA is missing, we should default to the device NUMA node
477 	 * as a fallback.
478 	 */
479 	if (domain->nid == NUMA_NO_NODE)
480 		domain->nid = domain_update_device_node(domain);
481 
482 	/*
483 	 * First-level translation restricts the input-address to a
484 	 * canonical address (i.e., address bits 63:N have the same
485 	 * value as address bit [N-1], where N is 48-bits with 4-level
486 	 * paging and 57-bits with 5-level paging). Hence, skip bit
487 	 * [N-1].
488 	 */
489 	if (domain->use_first_level)
490 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
491 	else
492 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
493 
494 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
495 	domain_update_iotlb(domain);
496 }
497 
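/*
 * Return the context entry for @bus/@devfn, optionally allocating the
 * context table page when @alloc is set. In scalable mode a context
 * entry is twice the legacy size, so the lower table (root->lo) covers
 * devfn 0x00-0x7f and the upper table (root->hi) covers 0x80-0xff,
 * which is why devfn is folded and doubled below.
 */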
498 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
499 					 u8 devfn, int alloc)
500 {
501 	struct root_entry *root = &iommu->root_entry[bus];
502 	struct context_entry *context;
503 	u64 *entry;
504 
505 	/*
506 	 * Unless the caller requested to allocate a new entry,
507 	 * returning a copied context entry makes no sense.
508 	 */
509 	if (!alloc && context_copied(iommu, bus, devfn))
510 		return NULL;
511 
512 	entry = &root->lo;
513 	if (sm_supported(iommu)) {
514 		if (devfn >= 0x80) {
515 			devfn -= 0x80;
516 			entry = &root->hi;
517 		}
518 		devfn *= 2;
519 	}
520 	if (*entry & 1)
521 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
522 	else {
523 		unsigned long phy_addr;
524 		if (!alloc)
525 			return NULL;
526 
527 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
528 		if (!context)
529 			return NULL;
530 
531 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
532 		phy_addr = virt_to_phys((void *)context);
533 		*entry = phy_addr | 1;
534 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
535 	}
536 	return &context[devfn];
537 }
538 
539 /**
540  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
541  *				 sub-hierarchy of a candidate PCI-PCI bridge
542  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
543  * @bridge: the candidate PCI-PCI bridge
544  *
545  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
546  */
547 static bool
548 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
549 {
550 	struct pci_dev *pdev, *pbridge;
551 
552 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
553 		return false;
554 
555 	pdev = to_pci_dev(dev);
556 	pbridge = to_pci_dev(bridge);
557 
558 	if (pbridge->subordinate &&
559 	    pbridge->subordinate->number <= pdev->bus->number &&
560 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
561 		return true;
562 
563 	return false;
564 }
565 
566 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
567 {
568 	struct dmar_drhd_unit *drhd;
569 	u32 vtbar;
570 	int rc;
571 
572 	/* We know that this device on this chipset has its own IOMMU.
573 	 * If we find it under a different IOMMU, then the BIOS is lying
574 	 * to us. Hope that the IOMMU for this device is actually
575 	 * disabled, and it needs no translation...
576 	 */
577 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
578 	if (rc) {
579 		/* "can't" happen */
580 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
581 		return false;
582 	}
583 	vtbar &= 0xffff0000;
584 
585 	/* we know that this iommu should be at offset 0xa000 from vtbar */
586 	drhd = dmar_find_matched_drhd_unit(pdev);
587 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
588 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
589 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
590 		return true;
591 	}
592 
593 	return false;
594 }
595 
596 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
597 {
598 	if (!iommu || iommu->drhd->ignored)
599 		return true;
600 
601 	if (dev_is_pci(dev)) {
602 		struct pci_dev *pdev = to_pci_dev(dev);
603 
604 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
605 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
606 		    quirk_ioat_snb_local_iommu(pdev))
607 			return true;
608 	}
609 
610 	return false;
611 }
612 
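/*
 * Walk the DMAR units to find the IOMMU that translates @dev and report
 * the bus/devfn it will present as its source ID. VFs are looked up via
 * their PF (VFs are not listed in the scope tables) but keep their own
 * bus/devfn; devices with an ACPI companion are matched through the
 * companion device. Returns NULL if no unit covers the device or the
 * matched unit is a dummy.
 */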
613 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
614 {
615 	struct dmar_drhd_unit *drhd = NULL;
616 	struct pci_dev *pdev = NULL;
617 	struct intel_iommu *iommu;
618 	struct device *tmp;
619 	u16 segment = 0;
620 	int i;
621 
622 	if (!dev)
623 		return NULL;
624 
625 	if (dev_is_pci(dev)) {
626 		struct pci_dev *pf_pdev;
627 
628 		pdev = pci_real_dma_dev(to_pci_dev(dev));
629 
630 		/* VFs aren't listed in scope tables; we need to look up
631 		 * the PF instead to find the IOMMU. */
632 		pf_pdev = pci_physfn(pdev);
633 		dev = &pf_pdev->dev;
634 		segment = pci_domain_nr(pdev->bus);
635 	} else if (has_acpi_companion(dev))
636 		dev = &ACPI_COMPANION(dev)->dev;
637 
638 	rcu_read_lock();
639 	for_each_iommu(iommu, drhd) {
640 		if (pdev && segment != drhd->segment)
641 			continue;
642 
643 		for_each_active_dev_scope(drhd->devices,
644 					  drhd->devices_cnt, i, tmp) {
645 			if (tmp == dev) {
646 				/* For a VF use its original BDF# not that of the PF
647 				 * which we used for the IOMMU lookup. Strictly speaking
648 				 * we could do this for all PCI devices; we only need to
649 				 * get the BDF# from the scope table for ACPI matches. */
650 				if (pdev && pdev->is_virtfn)
651 					goto got_pdev;
652 
653 				if (bus && devfn) {
654 					*bus = drhd->devices[i].bus;
655 					*devfn = drhd->devices[i].devfn;
656 				}
657 				goto out;
658 			}
659 
660 			if (is_downstream_to_pci_bridge(dev, tmp))
661 				goto got_pdev;
662 		}
663 
664 		if (pdev && drhd->include_all) {
665 got_pdev:
666 			if (bus && devfn) {
667 				*bus = pdev->bus->number;
668 				*devfn = pdev->devfn;
669 			}
670 			goto out;
671 		}
672 	}
673 	iommu = NULL;
674 out:
675 	if (iommu_is_dummy(iommu, dev))
676 		iommu = NULL;
677 
678 	rcu_read_unlock();
679 
680 	return iommu;
681 }
682 
683 static void domain_flush_cache(struct dmar_domain *domain,
684 			       void *addr, int size)
685 {
686 	if (!domain->iommu_coherency)
687 		clflush_cache_range(addr, size);
688 }
689 
690 static void free_context_table(struct intel_iommu *iommu)
691 {
692 	struct context_entry *context;
693 	int i;
694 
695 	if (!iommu->root_entry)
696 		return;
697 
698 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
699 		context = iommu_context_addr(iommu, i, 0, 0);
700 		if (context)
701 			iommu_free_page(context);
702 
703 		if (!sm_supported(iommu))
704 			continue;
705 
706 		context = iommu_context_addr(iommu, i, 0x80, 0);
707 		if (context)
708 			iommu_free_page(context);
709 	}
710 
711 	iommu_free_page(iommu->root_entry);
712 	iommu->root_entry = NULL;
713 }
714 
715 #ifdef CONFIG_DMAR_DEBUG
716 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
717 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
718 {
719 	struct dma_pte *pte;
720 	int offset;
721 
722 	while (1) {
723 		offset = pfn_level_offset(pfn, level);
724 		pte = &parent[offset];
725 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
726 			pr_info("PTE not present at level %d\n", level);
727 			break;
728 		}
729 
730 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
731 
732 		if (level == 1)
733 			break;
734 
735 		parent = phys_to_virt(dma_pte_addr(pte));
736 		level--;
737 	}
738 }
739 
740 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
741 			  unsigned long long addr, u32 pasid)
742 {
743 	struct pasid_dir_entry *dir, *pde;
744 	struct pasid_entry *entries, *pte;
745 	struct context_entry *ctx_entry;
746 	struct root_entry *rt_entry;
747 	int i, dir_index, index, level;
748 	u8 devfn = source_id & 0xff;
749 	u8 bus = source_id >> 8;
750 	struct dma_pte *pgtable;
751 
752 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
753 
754 	/* root entry dump */
755 	rt_entry = &iommu->root_entry[bus];
756 	if (!rt_entry) {
757 		pr_info("root table entry is not present\n");
758 		return;
759 	}
760 
761 	if (sm_supported(iommu))
762 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
763 			rt_entry->hi, rt_entry->lo);
764 	else
765 		pr_info("root entry: 0x%016llx", rt_entry->lo);
766 
767 	/* context entry dump */
768 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
769 	if (!ctx_entry) {
770 		pr_info("context table entry is not present\n");
771 		return;
772 	}
773 
774 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
775 		ctx_entry->hi, ctx_entry->lo);
776 
777 	/* legacy mode does not require PASID entries */
778 	if (!sm_supported(iommu)) {
779 		level = agaw_to_level(ctx_entry->hi & 7);
780 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
781 		goto pgtable_walk;
782 	}
783 
784 	/* get the pointer to pasid directory entry */
785 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
786 	if (!dir) {
787 		pr_info("pasid directory entry is not present\n");
788 		return;
789 	}
790 	/* For request-without-pasid, get the pasid from context entry */
791 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
792 		pasid = IOMMU_NO_PASID;
793 
794 	dir_index = pasid >> PASID_PDE_SHIFT;
795 	pde = &dir[dir_index];
796 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
797 
798 	/* get the pointer to the pasid table entry */
799 	entries = get_pasid_table_from_pde(pde);
800 	if (!entries) {
801 		pr_info("pasid table entry is not present\n");
802 		return;
803 	}
804 	index = pasid & PASID_PTE_MASK;
805 	pte = &entries[index];
806 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
807 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
808 
809 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
810 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
811 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
812 	} else {
813 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
814 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
815 	}
816 
817 pgtable_walk:
818 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
819 }
820 #endif
821 
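/*
 * Walk the page-table path for @pfn down to *target_level, allocating
 * missing levels with @gfp as needed. A *target_level of 0 means "find
 * the existing leaf": the walk stops at the first superpage or
 * non-present entry and *target_level is updated to the level actually
 * reached. Returns NULL if @pfn is beyond the domain's address width or
 * an allocation fails.
 */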
822 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
823 				      unsigned long pfn, int *target_level,
824 				      gfp_t gfp)
825 {
826 	struct dma_pte *parent, *pte;
827 	int level = agaw_to_level(domain->agaw);
828 	int offset;
829 
830 	if (!domain_pfn_supported(domain, pfn))
831 		/* Address beyond IOMMU's addressing capabilities. */
832 		return NULL;
833 
834 	parent = domain->pgd;
835 
836 	while (1) {
837 		void *tmp_page;
838 
839 		offset = pfn_level_offset(pfn, level);
840 		pte = &parent[offset];
841 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
842 			break;
843 		if (level == *target_level)
844 			break;
845 
846 		if (!dma_pte_present(pte)) {
847 			uint64_t pteval, tmp;
848 
849 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
850 
851 			if (!tmp_page)
852 				return NULL;
853 
854 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
855 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
856 			if (domain->use_first_level)
857 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
858 
859 			tmp = 0ULL;
860 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
861 				/* Someone else set it while we were thinking; use theirs. */
862 				iommu_free_page(tmp_page);
863 			else
864 				domain_flush_cache(domain, pte, sizeof(*pte));
865 		}
866 		if (level == 1)
867 			break;
868 
869 		parent = phys_to_virt(dma_pte_addr(pte));
870 		level--;
871 	}
872 
873 	if (!*target_level)
874 		*target_level = level;
875 
876 	return pte;
877 }
878 
879 /* return address's pte at specific level */
880 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
881 					 unsigned long pfn,
882 					 int level, int *large_page)
883 {
884 	struct dma_pte *parent, *pte;
885 	int total = agaw_to_level(domain->agaw);
886 	int offset;
887 
888 	parent = domain->pgd;
889 	while (level <= total) {
890 		offset = pfn_level_offset(pfn, total);
891 		pte = &parent[offset];
892 		if (level == total)
893 			return pte;
894 
895 		if (!dma_pte_present(pte)) {
896 			*large_page = total;
897 			break;
898 		}
899 
900 		if (dma_pte_superpage(pte)) {
901 			*large_page = total;
902 			return pte;
903 		}
904 
905 		parent = phys_to_virt(dma_pte_addr(pte));
906 		total--;
907 	}
908 	return NULL;
909 }
910 
911 /* clear last level pte; a tlb flush should follow */
912 static void dma_pte_clear_range(struct dmar_domain *domain,
913 				unsigned long start_pfn,
914 				unsigned long last_pfn)
915 {
916 	unsigned int large_page;
917 	struct dma_pte *first_pte, *pte;
918 
919 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
920 	    WARN_ON(start_pfn > last_pfn))
921 		return;
922 
923 	/* we don't need lock here; nobody else touches the iova range */
924 	do {
925 		large_page = 1;
926 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
927 		if (!pte) {
928 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
929 			continue;
930 		}
931 		do {
932 			dma_clear_pte(pte);
933 			start_pfn += lvl_to_nr_pages(large_page);
934 			pte++;
935 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
936 
937 		domain_flush_cache(domain, first_pte,
938 				   (void *)pte - (void *)first_pte);
939 
940 	} while (start_pfn && start_pfn <= last_pfn);
941 }
942 
943 static void dma_pte_free_level(struct dmar_domain *domain, int level,
944 			       int retain_level, struct dma_pte *pte,
945 			       unsigned long pfn, unsigned long start_pfn,
946 			       unsigned long last_pfn)
947 {
948 	pfn = max(start_pfn, pfn);
949 	pte = &pte[pfn_level_offset(pfn, level)];
950 
951 	do {
952 		unsigned long level_pfn;
953 		struct dma_pte *level_pte;
954 
955 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
956 			goto next;
957 
958 		level_pfn = pfn & level_mask(level);
959 		level_pte = phys_to_virt(dma_pte_addr(pte));
960 
961 		if (level > 2) {
962 			dma_pte_free_level(domain, level - 1, retain_level,
963 					   level_pte, level_pfn, start_pfn,
964 					   last_pfn);
965 		}
966 
967 		/*
968 		 * Free the page table if we're below the level we want to
969 		 * retain and the range covers the entire table.
970 		 */
971 		if (level < retain_level && !(start_pfn > level_pfn ||
972 		      last_pfn < level_pfn + level_size(level) - 1)) {
973 			dma_clear_pte(pte);
974 			domain_flush_cache(domain, pte, sizeof(*pte));
975 			iommu_free_page(level_pte);
976 		}
977 next:
978 		pfn += level_size(level);
979 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
980 }
981 
982 /*
983  * clear last level (leaf) ptes and free page table pages below the
984  * level we wish to keep intact.
985  */
986 static void dma_pte_free_pagetable(struct dmar_domain *domain,
987 				   unsigned long start_pfn,
988 				   unsigned long last_pfn,
989 				   int retain_level)
990 {
991 	dma_pte_clear_range(domain, start_pfn, last_pfn);
992 
993 	/* We don't need lock here; nobody else touches the iova range */
994 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
995 			   domain->pgd, 0, start_pfn, last_pfn);
996 
997 	/* free pgd */
998 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
999 		iommu_free_page(domain->pgd);
1000 		domain->pgd = NULL;
1001 	}
1002 }
1003 
1004 /* When a page at a given level is being unlinked from its parent, we don't
1005    need to *modify* it at all. All we need to do is make a list of all the
1006    pages which can be freed just as soon as we've flushed the IOTLB and we
1007    know the hardware page-walk will no longer touch them.
1008    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1009    be freed. */
1010 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1011 				    int level, struct dma_pte *pte,
1012 				    struct list_head *freelist)
1013 {
1014 	struct page *pg;
1015 
1016 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1017 	list_add_tail(&pg->lru, freelist);
1018 
1019 	if (level == 1)
1020 		return;
1021 
1022 	pte = page_address(pg);
1023 	do {
1024 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1025 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1026 		pte++;
1027 	} while (!first_pte_in_page(pte));
1028 }
1029 
1030 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1031 				struct dma_pte *pte, unsigned long pfn,
1032 				unsigned long start_pfn, unsigned long last_pfn,
1033 				struct list_head *freelist)
1034 {
1035 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1036 
1037 	pfn = max(start_pfn, pfn);
1038 	pte = &pte[pfn_level_offset(pfn, level)];
1039 
1040 	do {
1041 		unsigned long level_pfn = pfn & level_mask(level);
1042 
1043 		if (!dma_pte_present(pte))
1044 			goto next;
1045 
1046 		/* If range covers entire pagetable, free it */
1047 		if (start_pfn <= level_pfn &&
1048 		    last_pfn >= level_pfn + level_size(level) - 1) {
1049 			/* These subordinate page tables are going away entirely. Don't
1050 			   bother to clear them; we're just going to *free* them. */
1051 			if (level > 1 && !dma_pte_superpage(pte))
1052 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1053 
1054 			dma_clear_pte(pte);
1055 			if (!first_pte)
1056 				first_pte = pte;
1057 			last_pte = pte;
1058 		} else if (level > 1) {
1059 			/* Recurse down into a level that isn't *entirely* obsolete */
1060 			dma_pte_clear_level(domain, level - 1,
1061 					    phys_to_virt(dma_pte_addr(pte)),
1062 					    level_pfn, start_pfn, last_pfn,
1063 					    freelist);
1064 		}
1065 next:
1066 		pfn = level_pfn + level_size(level);
1067 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068 
1069 	if (first_pte)
1070 		domain_flush_cache(domain, first_pte,
1071 				   (void *)++last_pte - (void *)first_pte);
1072 }
1073 
1074 /* We can't just free the pages because the IOMMU may still be walking
1075    the page tables, and may have cached the intermediate levels. The
1076    pages can only be freed after the IOTLB flush has been done. */
1077 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1078 			 unsigned long last_pfn, struct list_head *freelist)
1079 {
1080 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1081 	    WARN_ON(start_pfn > last_pfn))
1082 		return;
1083 
1084 	/* we don't need lock here; nobody else touches the iova range */
1085 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1086 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1087 
1088 	/* free pgd */
1089 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 		struct page *pgd_page = virt_to_page(domain->pgd);
1091 		list_add_tail(&pgd_page->lru, freelist);
1092 		domain->pgd = NULL;
1093 	}
1094 }
1095 
1096 /* iommu handling */
1097 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1098 {
1099 	struct root_entry *root;
1100 
1101 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1102 	if (!root) {
1103 		pr_err("Allocating root entry for %s failed\n",
1104 			iommu->name);
1105 		return -ENOMEM;
1106 	}
1107 
1108 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1109 	iommu->root_entry = root;
1110 
1111 	return 0;
1112 }
1113 
1114 static void iommu_set_root_entry(struct intel_iommu *iommu)
1115 {
1116 	u64 addr;
1117 	u32 sts;
1118 	unsigned long flag;
1119 
1120 	addr = virt_to_phys(iommu->root_entry);
1121 	if (sm_supported(iommu))
1122 		addr |= DMA_RTADDR_SMT;
1123 
1124 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1125 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1126 
1127 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1128 
1129 	/* Make sure hardware complete it */
1130 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1131 		      readl, (sts & DMA_GSTS_RTPS), sts);
1132 
1133 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1134 
1135 	/*
1136 	 * Hardware invalidates all DMA remapping hardware translation
1137 	 * caches as part of SRTP flow.
1138 	 */
1139 	if (cap_esrtps(iommu->cap))
1140 		return;
1141 
1142 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1143 	if (sm_supported(iommu))
1144 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1145 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1146 }
1147 
1148 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1149 {
1150 	u32 val;
1151 	unsigned long flag;
1152 
1153 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1154 		return;
1155 
1156 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1157 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1158 
1159 	/* Make sure hardware complete it */
1160 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1161 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1162 
1163 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1164 }
1165 
1166 /* return value determines if we need a write buffer flush */
1167 static void __iommu_flush_context(struct intel_iommu *iommu,
1168 				  u16 did, u16 source_id, u8 function_mask,
1169 				  u64 type)
1170 {
1171 	u64 val = 0;
1172 	unsigned long flag;
1173 
1174 	switch (type) {
1175 	case DMA_CCMD_GLOBAL_INVL:
1176 		val = DMA_CCMD_GLOBAL_INVL;
1177 		break;
1178 	case DMA_CCMD_DOMAIN_INVL:
1179 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1180 		break;
1181 	case DMA_CCMD_DEVICE_INVL:
1182 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1183 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1184 		break;
1185 	default:
1186 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1187 			iommu->name, type);
1188 		return;
1189 	}
1190 	val |= DMA_CCMD_ICC;
1191 
1192 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1193 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1194 
1195 	/* Make sure hardware complete it */
1196 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1197 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1198 
1199 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1200 }
1201 
1202 /* return value determines if we need a write buffer flush */
1203 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1204 				u64 addr, unsigned int size_order, u64 type)
1205 {
1206 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1207 	u64 val = 0, val_iva = 0;
1208 	unsigned long flag;
1209 
1210 	switch (type) {
1211 	case DMA_TLB_GLOBAL_FLUSH:
1212 		/* global flush doesn't need to set IVA_REG */
1213 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1214 		break;
1215 	case DMA_TLB_DSI_FLUSH:
1216 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1217 		break;
1218 	case DMA_TLB_PSI_FLUSH:
1219 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1220 		/* IH bit is passed in as part of address */
1221 		val_iva = size_order | addr;
1222 		break;
1223 	default:
1224 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1225 			iommu->name, type);
1226 		return;
1227 	}
1228 
1229 	if (cap_write_drain(iommu->cap))
1230 		val |= DMA_TLB_WRITE_DRAIN;
1231 
1232 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233 	/* Note: Only uses first TLB reg currently */
1234 	if (val_iva)
1235 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1236 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1237 
1238 	/* Make sure hardware complete it */
1239 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1240 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1241 
1242 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243 
1244 	/* check IOTLB invalidation granularity */
1245 	if (DMA_TLB_IAIG(val) == 0)
1246 		pr_err("Flush IOTLB failed\n");
1247 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1248 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1249 			(unsigned long long)DMA_TLB_IIRG(type),
1250 			(unsigned long long)DMA_TLB_IAIG(val));
1251 }
1252 
1253 static struct device_domain_info *
1254 domain_lookup_dev_info(struct dmar_domain *domain,
1255 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1256 {
1257 	struct device_domain_info *info;
1258 	unsigned long flags;
1259 
1260 	spin_lock_irqsave(&domain->lock, flags);
1261 	list_for_each_entry(info, &domain->devices, link) {
1262 		if (info->iommu == iommu && info->bus == bus &&
1263 		    info->devfn == devfn) {
1264 			spin_unlock_irqrestore(&domain->lock, flags);
1265 			return info;
1266 		}
1267 	}
1268 	spin_unlock_irqrestore(&domain->lock, flags);
1269 
1270 	return NULL;
1271 }
1272 
1273 void domain_update_iotlb(struct dmar_domain *domain)
1274 {
1275 	struct dev_pasid_info *dev_pasid;
1276 	struct device_domain_info *info;
1277 	bool has_iotlb_device = false;
1278 	unsigned long flags;
1279 
1280 	spin_lock_irqsave(&domain->lock, flags);
1281 	list_for_each_entry(info, &domain->devices, link) {
1282 		if (info->ats_enabled) {
1283 			has_iotlb_device = true;
1284 			break;
1285 		}
1286 	}
1287 
1288 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1289 		info = dev_iommu_priv_get(dev_pasid->dev);
1290 		if (info->ats_enabled) {
1291 			has_iotlb_device = true;
1292 			break;
1293 		}
1294 	}
1295 	domain->has_iotlb_device = has_iotlb_device;
1296 	spin_unlock_irqrestore(&domain->lock, flags);
1297 }
1298 
1299 /*
1300  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1301  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1302  * check because it applies only to the built-in QAT devices and it doesn't
1303  * grant additional privileges.
1304  */
1305 #define BUGGY_QAT_DEVID_MASK 0x4940
1306 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1307 {
1308 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1309 		return false;
1310 
1311 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1312 		return false;
1313 
1314 	return true;
1315 }
1316 
1317 static void iommu_enable_pci_caps(struct device_domain_info *info)
1318 {
1319 	struct pci_dev *pdev;
1320 
1321 	if (!dev_is_pci(info->dev))
1322 		return;
1323 
1324 	pdev = to_pci_dev(info->dev);
1325 
1326 	/* The PCIe spec, in its wisdom, declares that a device's
1327 	   behaviour is undefined if you enable PASID support after ATS
1328 	   support. So always enable PASID support on devices which
1329 	   have it, even if we can't yet know if we're ever going to
1330 	   use it. */
1331 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1332 		info->pasid_enabled = 1;
1333 
1334 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1335 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1336 		info->ats_enabled = 1;
1337 		domain_update_iotlb(info->domain);
1338 	}
1339 }
1340 
1341 static void iommu_disable_pci_caps(struct device_domain_info *info)
1342 {
1343 	struct pci_dev *pdev;
1344 
1345 	if (!dev_is_pci(info->dev))
1346 		return;
1347 
1348 	pdev = to_pci_dev(info->dev);
1349 
1350 	if (info->ats_enabled) {
1351 		pci_disable_ats(pdev);
1352 		info->ats_enabled = 0;
1353 		domain_update_iotlb(info->domain);
1354 	}
1355 
1356 	if (info->pasid_enabled) {
1357 		pci_disable_pasid(pdev);
1358 		info->pasid_enabled = 0;
1359 	}
1360 }
1361 
1362 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1363 {
1364 	cache_tag_flush_all(to_dmar_domain(domain));
1365 }
1366 
1367 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1368 {
1369 	u32 pmen;
1370 	unsigned long flags;
1371 
1372 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1373 		return;
1374 
1375 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1376 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1377 	pmen &= ~DMA_PMEN_EPM;
1378 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1379 
1380 	/* wait for the protected region status bit to clear */
1381 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1382 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1383 
1384 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1385 }
1386 
1387 static void iommu_enable_translation(struct intel_iommu *iommu)
1388 {
1389 	u32 sts;
1390 	unsigned long flags;
1391 
1392 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1393 	iommu->gcmd |= DMA_GCMD_TE;
1394 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1395 
1396 	/* Make sure hardware complete it */
1397 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1398 		      readl, (sts & DMA_GSTS_TES), sts);
1399 
1400 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1401 }
1402 
1403 static void iommu_disable_translation(struct intel_iommu *iommu)
1404 {
1405 	u32 sts;
1406 	unsigned long flag;
1407 
1408 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1409 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1410 		return;
1411 
1412 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1413 	iommu->gcmd &= ~DMA_GCMD_TE;
1414 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1415 
1416 	/* Make sure hardware complete it */
1417 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1418 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1419 
1420 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1421 }
1422 
1423 static int iommu_init_domains(struct intel_iommu *iommu)
1424 {
1425 	u32 ndomains;
1426 
1427 	ndomains = cap_ndoms(iommu->cap);
1428 	pr_debug("%s: Number of Domains supported <%d>\n",
1429 		 iommu->name, ndomains);
1430 
1431 	spin_lock_init(&iommu->lock);
1432 
1433 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1434 	if (!iommu->domain_ids)
1435 		return -ENOMEM;
1436 
1437 	/*
1438 	 * If Caching mode is set, then invalid translations are tagged
1439 	 * with domain-id 0, hence we need to pre-allocate it. We also
1440 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1441 	 * make sure it is not used for a real domain.
1442 	 */
1443 	set_bit(0, iommu->domain_ids);
1444 
1445 	/*
1446 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1447 	 * entry for first-level or pass-through translation modes should
1448 	 * be programmed with a domain id different from those used for
1449 	 * second-level or nested translation. We reserve a domain id for
1450 	 * this purpose.
1451 	 */
1452 	if (sm_supported(iommu))
1453 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1454 
1455 	return 0;
1456 }
1457 
1458 static void disable_dmar_iommu(struct intel_iommu *iommu)
1459 {
1460 	if (!iommu->domain_ids)
1461 		return;
1462 
1463 	/*
1464 	 * All iommu domains must have been detached from the devices,
1465 	 * hence there should be no domain IDs in use.
1466 	 */
1467 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1468 		    > NUM_RESERVED_DID))
1469 		return;
1470 
1471 	if (iommu->gcmd & DMA_GCMD_TE)
1472 		iommu_disable_translation(iommu);
1473 }
1474 
1475 static void free_dmar_iommu(struct intel_iommu *iommu)
1476 {
1477 	if (iommu->domain_ids) {
1478 		bitmap_free(iommu->domain_ids);
1479 		iommu->domain_ids = NULL;
1480 	}
1481 
1482 	if (iommu->copied_tables) {
1483 		bitmap_free(iommu->copied_tables);
1484 		iommu->copied_tables = NULL;
1485 	}
1486 
1487 	/* free context mapping */
1488 	free_context_table(iommu);
1489 
1490 #ifdef CONFIG_INTEL_IOMMU_SVM
1491 	if (pasid_supported(iommu)) {
1492 		if (ecap_prs(iommu->ecap))
1493 			intel_svm_finish_prq(iommu);
1494 	}
1495 #endif
1496 }
1497 
1498 /*
1499  * Check and return whether first level is used by default for
1500  * DMA translation.
1501  */
1502 static bool first_level_by_default(unsigned int type)
1503 {
1504 	/* Only SL is available in legacy mode */
1505 	if (!scalable_mode_support())
1506 		return false;
1507 
1508 	/* Only one level (either FL or SL) is available, just use it */
1509 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1510 		return intel_cap_flts_sanity();
1511 
1512 	/* Both levels are available, decide it based on domain type */
1513 	return type != IOMMU_DOMAIN_UNMANAGED;
1514 }
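
/*
 * In practice this means that, when the hardware can use either level,
 * DMA API (default) domains use first-level translation while unmanaged
 * domains (e.g. those created for device assignment) stay on the second
 * level.
 */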
1515 
1516 static struct dmar_domain *alloc_domain(unsigned int type)
1517 {
1518 	struct dmar_domain *domain;
1519 
1520 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1521 	if (!domain)
1522 		return NULL;
1523 
1524 	domain->nid = NUMA_NO_NODE;
1525 	if (first_level_by_default(type))
1526 		domain->use_first_level = true;
1527 	domain->has_iotlb_device = false;
1528 	INIT_LIST_HEAD(&domain->devices);
1529 	INIT_LIST_HEAD(&domain->dev_pasids);
1530 	INIT_LIST_HEAD(&domain->cache_tags);
1531 	spin_lock_init(&domain->lock);
1532 	spin_lock_init(&domain->cache_lock);
1533 	xa_init(&domain->iommu_array);
1534 
1535 	return domain;
1536 }
1537 
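/*
 * Bind @domain to @iommu: allocate a domain ID on that unit (or bump the
 * refcount if one is already held) and record it in domain->iommu_array
 * so that per-IOMMU state such as the DID can be looked up later.
 */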
1538 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1539 {
1540 	struct iommu_domain_info *info, *curr;
1541 	unsigned long ndomains;
1542 	int num, ret = -ENOSPC;
1543 
1544 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1545 		return 0;
1546 
1547 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1548 	if (!info)
1549 		return -ENOMEM;
1550 
1551 	spin_lock(&iommu->lock);
1552 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1553 	if (curr) {
1554 		curr->refcnt++;
1555 		spin_unlock(&iommu->lock);
1556 		kfree(info);
1557 		return 0;
1558 	}
1559 
1560 	ndomains = cap_ndoms(iommu->cap);
1561 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1562 	if (num >= ndomains) {
1563 		pr_err("%s: No free domain ids\n", iommu->name);
1564 		goto err_unlock;
1565 	}
1566 
1567 	set_bit(num, iommu->domain_ids);
1568 	info->refcnt	= 1;
1569 	info->did	= num;
1570 	info->iommu	= iommu;
1571 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1572 			  NULL, info, GFP_ATOMIC);
1573 	if (curr) {
1574 		ret = xa_err(curr) ? : -EBUSY;
1575 		goto err_clear;
1576 	}
1577 	domain_update_iommu_cap(domain);
1578 
1579 	spin_unlock(&iommu->lock);
1580 	return 0;
1581 
1582 err_clear:
1583 	clear_bit(info->did, iommu->domain_ids);
1584 err_unlock:
1585 	spin_unlock(&iommu->lock);
1586 	kfree(info);
1587 	return ret;
1588 }
1589 
1590 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1591 {
1592 	struct iommu_domain_info *info;
1593 
1594 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1595 		return;
1596 
1597 	spin_lock(&iommu->lock);
1598 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1599 	if (--info->refcnt == 0) {
1600 		clear_bit(info->did, iommu->domain_ids);
1601 		xa_erase(&domain->iommu_array, iommu->seq_id);
1602 		domain->nid = NUMA_NO_NODE;
1603 		domain_update_iommu_cap(domain);
1604 		kfree(info);
1605 	}
1606 	spin_unlock(&iommu->lock);
1607 }
1608 
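/*
 * Round a guest address width up to an "adjusted" width that lands on a
 * page-table level boundary (12 bits of page offset plus a multiple of
 * 9-bit levels). For example, gaw 39 or 48 is already aligned and is
 * returned unchanged, while gaw 40 gives r = (40 - 12) % 9 = 1 and is
 * rounded up to 40 + 9 - 1 = 48.
 */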
1609 static int guestwidth_to_adjustwidth(int gaw)
1610 {
1611 	int agaw;
1612 	int r = (gaw - 12) % 9;
1613 
1614 	if (r == 0)
1615 		agaw = gaw;
1616 	else
1617 		agaw = gaw + 9 - r;
1618 	if (agaw > 64)
1619 		agaw = 64;
1620 	return agaw;
1621 }
1622 
1623 static void domain_exit(struct dmar_domain *domain)
1624 {
1625 	if (domain->pgd) {
1626 		LIST_HEAD(freelist);
1627 
1628 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1629 		iommu_put_pages_list(&freelist);
1630 	}
1631 
1632 	if (WARN_ON(!list_empty(&domain->devices)))
1633 		return;
1634 
1635 	kfree(domain);
1636 }
1637 
1638 static int domain_context_mapping_one(struct dmar_domain *domain,
1639 				      struct intel_iommu *iommu,
1640 				      u8 bus, u8 devfn)
1641 {
1642 	struct device_domain_info *info =
1643 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1644 	u16 did = domain_id_iommu(domain, iommu);
1645 	int translation = CONTEXT_TT_MULTI_LEVEL;
1646 	struct dma_pte *pgd = domain->pgd;
1647 	struct context_entry *context;
1648 	int agaw, ret;
1649 
1650 	if (hw_pass_through && domain_type_is_si(domain))
1651 		translation = CONTEXT_TT_PASS_THROUGH;
1652 
1653 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1654 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1655 
1656 	spin_lock(&iommu->lock);
1657 	ret = -ENOMEM;
1658 	context = iommu_context_addr(iommu, bus, devfn, 1);
1659 	if (!context)
1660 		goto out_unlock;
1661 
1662 	ret = 0;
1663 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1664 		goto out_unlock;
1665 
1666 	/*
1667 	 * For kdump cases, old valid entries may be cached due to the
1668 	 * in-flight DMA and copied pgtable, but there is no unmapping
1669 	 * behaviour for them, thus we need an explicit cache flush for
1670 	 * the newly-mapped device. For kdump, at this point, the device
1671 	 * is supposed to have finished its reset at driver probe stage,
1672 	 * so no in-flight DMA will exist, and we don't need to worry
1673 	 * about it hereafter.
1674 	 */
1675 	if (context_copied(iommu, bus, devfn)) {
1676 		u16 did_old = context_domain_id(context);
1677 
1678 		if (did_old < cap_ndoms(iommu->cap)) {
1679 			iommu->flush.flush_context(iommu, did_old,
1680 						   (((u16)bus) << 8) | devfn,
1681 						   DMA_CCMD_MASK_NOBIT,
1682 						   DMA_CCMD_DEVICE_INVL);
1683 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1684 						 DMA_TLB_DSI_FLUSH);
1685 		}
1686 
1687 		clear_context_copied(iommu, bus, devfn);
1688 	}
1689 
1690 	context_clear_entry(context);
1691 	context_set_domain_id(context, did);
1692 
1693 	if (translation != CONTEXT_TT_PASS_THROUGH) {
1694 		/*
1695 		 * Skip top levels of page tables for an iommu which has a
1696 		 * smaller agaw than the default. Unnecessary for PT mode.
1697 		 */
1698 		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1699 			ret = -ENOMEM;
1700 			pgd = phys_to_virt(dma_pte_addr(pgd));
1701 			if (!dma_pte_present(pgd))
1702 				goto out_unlock;
1703 		}
1704 
1705 		if (info && info->ats_supported)
1706 			translation = CONTEXT_TT_DEV_IOTLB;
1707 		else
1708 			translation = CONTEXT_TT_MULTI_LEVEL;
1709 
1710 		context_set_address_root(context, virt_to_phys(pgd));
1711 		context_set_address_width(context, agaw);
1712 	} else {
1713 		/*
1714 		 * In pass through mode, AW must be programmed to
1715 		 * indicate the largest AGAW value supported by
1716 		 * hardware. And ASR is ignored by hardware.
1717 		 */
1718 		context_set_address_width(context, iommu->msagaw);
1719 	}
1720 
1721 	context_set_translation_type(context, translation);
1722 	context_set_fault_enable(context);
1723 	context_set_present(context);
1724 	if (!ecap_coherent(iommu->ecap))
1725 		clflush_cache_range(context, sizeof(*context));
1726 
1727 	/*
1728 	 * It's a non-present to present mapping. If hardware doesn't cache
1729 	 * non-present entries we only need to flush the write-buffer. If it
1730 	 * _does_ cache non-present entries, then it does so in the special
1731 	 * domain #0, which we have to flush:
1732 	 */
1733 	if (cap_caching_mode(iommu->cap)) {
1734 		iommu->flush.flush_context(iommu, 0,
1735 					   (((u16)bus) << 8) | devfn,
1736 					   DMA_CCMD_MASK_NOBIT,
1737 					   DMA_CCMD_DEVICE_INVL);
1738 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1739 	} else {
1740 		iommu_flush_write_buffer(iommu);
1741 	}
1742 
1743 	ret = 0;
1744 
1745 out_unlock:
1746 	spin_unlock(&iommu->lock);
1747 
1748 	return ret;
1749 }
1750 
1751 static int domain_context_mapping_cb(struct pci_dev *pdev,
1752 				     u16 alias, void *opaque)
1753 {
1754 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1755 	struct intel_iommu *iommu = info->iommu;
1756 	struct dmar_domain *domain = opaque;
1757 
1758 	return domain_context_mapping_one(domain, iommu,
1759 					  PCI_BUS_NUM(alias), alias & 0xff);
1760 }
1761 
1762 static int
1763 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1764 {
1765 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1766 	struct intel_iommu *iommu = info->iommu;
1767 	u8 bus = info->bus, devfn = info->devfn;
1768 
1769 	if (!dev_is_pci(dev))
1770 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1771 
1772 	return pci_for_each_dma_alias(to_pci_dev(dev),
1773 				      domain_context_mapping_cb, domain);
1774 }
1775 
1776 /* Return largest possible superpage level for a given mapping */
1777 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1778 				   unsigned long phy_pfn, unsigned long pages)
1779 {
1780 	int support, level = 1;
1781 	unsigned long pfnmerge;
1782 
1783 	support = domain->iommu_superpage;
1784 
1785 	/* To use a large page, the virtual *and* physical addresses
1786 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1787 	   of them will mean we have to use smaller pages. So just
1788 	   merge them and check both at once. */
1789 	pfnmerge = iov_pfn | phy_pfn;
1790 
1791 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1792 		pages >>= VTD_STRIDE_SHIFT;
1793 		if (!pages)
1794 			break;
1795 		pfnmerge >>= VTD_STRIDE_SHIFT;
1796 		level++;
1797 		support--;
1798 	}
1799 	return level;
1800 }
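
/*
 * Example: mapping 512 contiguous 4KiB pages whose IOVA and physical
 * addresses are both 2MiB-aligned yields level 2 (a 2MiB superpage),
 * provided domain->iommu_superpage allows it; misaligned low bits in
 * either address force level 1.
 */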
1801 
1802 /*
1803  * Ensure that old small page tables are removed to make room for superpage(s).
1804  * We're going to add new large pages, so make sure we don't remove their parent
1805  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1806  */
1807 static void switch_to_super_page(struct dmar_domain *domain,
1808 				 unsigned long start_pfn,
1809 				 unsigned long end_pfn, int level)
1810 {
1811 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1812 	struct dma_pte *pte = NULL;
1813 
1814 	while (start_pfn <= end_pfn) {
1815 		if (!pte)
1816 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1817 					     GFP_ATOMIC);
1818 
1819 		if (dma_pte_present(pte)) {
1820 			dma_pte_free_pagetable(domain, start_pfn,
1821 					       start_pfn + lvl_pages - 1,
1822 					       level + 1);
1823 
1824 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1825 					      end_pfn << VTD_PAGE_SHIFT, 0);
1826 		}
1827 
1828 		pte++;
1829 		start_pfn += lvl_pages;
1830 		if (first_pte_in_page(pte))
1831 			pte = NULL;
1832 	}
1833 }
1834 
1835 static int
1836 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1837 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1838 		 gfp_t gfp)
1839 {
1840 	struct dma_pte *first_pte = NULL, *pte = NULL;
1841 	unsigned int largepage_lvl = 0;
1842 	unsigned long lvl_pages = 0;
1843 	phys_addr_t pteval;
1844 	u64 attr;
1845 
1846 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1847 		return -EINVAL;
1848 
1849 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1850 		return -EINVAL;
1851 
1852 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1853 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1854 		return -EINVAL;
1855 	}
1856 
1857 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1858 	attr |= DMA_FL_PTE_PRESENT;
1859 	if (domain->use_first_level) {
1860 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1861 		if (prot & DMA_PTE_WRITE)
1862 			attr |= DMA_FL_PTE_DIRTY;
1863 	}
1864 
1865 	domain->has_mappings = true;
1866 
1867 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1868 
1869 	while (nr_pages > 0) {
1870 		uint64_t tmp;
1871 
1872 		if (!pte) {
1873 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1874 					phys_pfn, nr_pages);
1875 
1876 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1877 					     gfp);
1878 			if (!pte)
1879 				return -ENOMEM;
1880 			first_pte = pte;
1881 
1882 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1883 
1884 			/* It is a large page */
1885 			if (largepage_lvl > 1) {
1886 				unsigned long end_pfn;
1887 				unsigned long pages_to_remove;
1888 
1889 				pteval |= DMA_PTE_LARGE_PAGE;
1890 				pages_to_remove = min_t(unsigned long, nr_pages,
1891 							nr_pte_to_next_page(pte) * lvl_pages);
1892 				end_pfn = iov_pfn + pages_to_remove - 1;
1893 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1894 			} else {
1895 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1896 			}
1897 
1898 		}
1899 		/* We don't need a lock here; nobody else
1900 		 * touches the iova range.
1901 		 */
1902 		tmp = 0ULL;
1903 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1904 			static int dumps = 5;
1905 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1906 				iov_pfn, tmp, (unsigned long long)pteval);
1907 			if (dumps) {
1908 				dumps--;
1909 				debug_dma_dump_mappings(NULL);
1910 			}
1911 			WARN_ON(1);
1912 		}
1913 
1914 		nr_pages -= lvl_pages;
1915 		iov_pfn += lvl_pages;
1916 		phys_pfn += lvl_pages;
1917 		pteval += lvl_pages * VTD_PAGE_SIZE;
1918 
1919 		/* If the next PTE would be the first in a new page, then we
1920 		 * need to flush the cache on the entries we've just written.
1921 		 * And then we'll need to recalculate 'pte', so clear it and
1922 		 * let it get set again in the if (!pte) block above.
1923 		 *
1924 		 * If we're done (!nr_pages) we need to flush the cache too.
1925 		 *
1926 		 * Also if we've been setting superpages, we may need to
1927 		 * recalculate 'pte' and switch back to smaller pages for the
1928 		 * end of the mapping, if the trailing size is not enough to
1929 		 * use another superpage (i.e. nr_pages < lvl_pages).
1930 		 */
1931 		pte++;
1932 		if (!nr_pages || first_pte_in_page(pte) ||
1933 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1934 			domain_flush_cache(domain, first_pte,
1935 					   (void *)pte - (void *)first_pte);
1936 			pte = NULL;
1937 		}
1938 	}
1939 
1940 	return 0;
1941 }
1942 
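/*
 * Clear the context entry for @bus/@devfn on the device's IOMMU and flush
 * the caches so that no stale translation remains for the device.
 */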
1943 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1944 {
1945 	struct intel_iommu *iommu = info->iommu;
1946 	struct context_entry *context;
1947 
1948 	spin_lock(&iommu->lock);
1949 	context = iommu_context_addr(iommu, bus, devfn, 0);
1950 	if (!context) {
1951 		spin_unlock(&iommu->lock);
1952 		return;
1953 	}
1954 
1955 	context_clear_entry(context);
1956 	__iommu_flush_cache(iommu, context, sizeof(*context));
1957 	spin_unlock(&iommu->lock);
1958 	intel_context_flush_present(info, context, true);
1959 }
1960 
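/*
 * Set up first-stage translation for @pasid on @dev, skipping any top
 * page-table levels that exceed the IOMMU's supported AGAW.
 */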
1961 static int domain_setup_first_level(struct intel_iommu *iommu,
1962 				    struct dmar_domain *domain,
1963 				    struct device *dev,
1964 				    u32 pasid)
1965 {
1966 	struct dma_pte *pgd = domain->pgd;
1967 	int agaw, level;
1968 	int flags = 0;
1969 
1970 	/*
1971 	 * Skip top levels of page tables for iommu which has
1972 	 * less agaw than default. Unnecessary for PT mode.
1973 	 */
1974 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1975 		pgd = phys_to_virt(dma_pte_addr(pgd));
1976 		if (!dma_pte_present(pgd))
1977 			return -ENOMEM;
1978 	}
1979 
1980 	level = agaw_to_level(agaw);
1981 	if (level != 4 && level != 5)
1982 		return -EINVAL;
1983 
1984 	if (level == 5)
1985 		flags |= PASID_FLAG_FL5LP;
1986 
1987 	if (domain->force_snooping)
1988 		flags |= PASID_FLAG_PAGE_SNOOP;
1989 
1990 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1991 					     domain_id_iommu(domain, iommu),
1992 					     flags);
1993 }
1994 
1995 static bool dev_is_real_dma_subdevice(struct device *dev)
1996 {
1997 	return dev && dev_is_pci(dev) &&
1998 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
1999 }
2000 
2001 static int iommu_domain_identity_map(struct dmar_domain *domain,
2002 				     unsigned long first_vpfn,
2003 				     unsigned long last_vpfn)
2004 {
2005 	/*
2006 	 * RMRR range might have overlap with physical memory range,
2007 	 * clear it first
2008 	 */
2009 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2010 
2011 	return __domain_mapping(domain, first_vpfn,
2012 				first_vpfn, last_vpfn - first_vpfn + 1,
2013 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2014 }
2015 
2016 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2017 
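/*
 * Build the static identity (si) domain, identity-mapping all usable
 * system memory and all RMRR regions so that attached devices see
 * IOVA == physical address. When hardware pass-through is used (@hw),
 * the page table is left unpopulated.
 */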
2018 static int __init si_domain_init(int hw)
2019 {
2020 	struct dmar_rmrr_unit *rmrr;
2021 	struct device *dev;
2022 	int i, nid, ret;
2023 
2024 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2025 	if (!si_domain)
2026 		return -EFAULT;
2027 
2028 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2029 		domain_exit(si_domain);
2030 		si_domain = NULL;
2031 		return -EFAULT;
2032 	}
2033 
2034 	if (hw)
2035 		return 0;
2036 
2037 	for_each_online_node(nid) {
2038 		unsigned long start_pfn, end_pfn;
2039 		int i;
2040 
2041 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2042 			ret = iommu_domain_identity_map(si_domain,
2043 					mm_to_dma_pfn_start(start_pfn),
2044 					mm_to_dma_pfn_end(end_pfn-1));
2045 			if (ret)
2046 				return ret;
2047 		}
2048 	}
2049 
2050 	/*
2051 	 * Identity map the RMRRs so that devices with RMRRs can also use
2052 	 * the si_domain.
2053 	 */
2054 	for_each_rmrr_units(rmrr) {
2055 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2056 					  i, dev) {
2057 			unsigned long long start = rmrr->base_address;
2058 			unsigned long long end = rmrr->end_address;
2059 
2060 			if (WARN_ON(end < start ||
2061 				    end >> agaw_to_width(si_domain->agaw)))
2062 				continue;
2063 
2064 			ret = iommu_domain_identity_map(si_domain,
2065 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2066 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2067 			if (ret)
2068 				return ret;
2069 		}
2070 	}
2071 
2072 	return 0;
2073 }
2074 
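/*
 * Attach @dev to @domain: allocate a domain ID on the device's IOMMU,
 * program the context entry (legacy mode) or PASID table entry (scalable
 * mode), and assign a cache tag. On failure, leave the device with
 * translation blocked.
 */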
2075 static int dmar_domain_attach_device(struct dmar_domain *domain,
2076 				     struct device *dev)
2077 {
2078 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2079 	struct intel_iommu *iommu = info->iommu;
2080 	unsigned long flags;
2081 	int ret;
2082 
2083 	ret = domain_attach_iommu(domain, iommu);
2084 	if (ret)
2085 		return ret;
2086 
2087 	info->domain = domain;
2088 	spin_lock_irqsave(&domain->lock, flags);
2089 	list_add(&info->link, &domain->devices);
2090 	spin_unlock_irqrestore(&domain->lock, flags);
2091 
2092 	if (dev_is_real_dma_subdevice(dev))
2093 		return 0;
2094 
2095 	if (!sm_supported(iommu))
2096 		ret = domain_context_mapping(domain, dev);
2097 	else if (hw_pass_through && domain_type_is_si(domain))
2098 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2099 	else if (domain->use_first_level)
2100 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2101 	else
2102 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2103 
2104 	if (ret)
2105 		goto out_block_translation;
2106 
2107 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2108 		iommu_enable_pci_caps(info);
2109 
2110 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
2111 	if (ret)
2112 		goto out_block_translation;
2113 
2114 	return 0;
2115 
2116 out_block_translation:
2117 	device_block_translation(dev);
2118 	return ret;
2119 }
2120 
2121 /**
2122  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2123  * is relaxable (i.e., is allowed to be not enforced under some conditions)
2124  * @dev: device handle
2125  *
2126  * We assume that PCI USB devices with RMRRs have them largely
2127  * for historical reasons and that the RMRR space is not actively used post
2128  * boot.  This exclusion may change if vendors begin to abuse it.
2129  *
2130  * The same exception is made for graphics devices, with the requirement that
2131  * any use of the RMRR regions will be torn down before assigning the device
2132  * to a guest.
2133  *
2134  * Return: true if the RMRR is relaxable, false otherwise
2135  */
2136 static bool device_rmrr_is_relaxable(struct device *dev)
2137 {
2138 	struct pci_dev *pdev;
2139 
2140 	if (!dev_is_pci(dev))
2141 		return false;
2142 
2143 	pdev = to_pci_dev(dev);
2144 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2145 		return true;
2146 	else
2147 		return false;
2148 }
2149 
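/*
 * Return the required default domain type for @dev, or 0 if the core
 * IOMMU code may pick one.
 */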
2150 static int device_def_domain_type(struct device *dev)
2151 {
2152 	if (dev_is_pci(dev)) {
2153 		struct pci_dev *pdev = to_pci_dev(dev);
2154 
2155 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2156 			return IOMMU_DOMAIN_IDENTITY;
2157 	}
2158 
2159 	return 0;
2160 }
2161 
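/*
 * Select the invalidation interface for @iommu: prefer queued
 * invalidation, falling back to register-based invalidation if QI
 * cannot be enabled.
 */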
2162 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2163 {
2164 	/*
2165 	 * Start from a sane IOMMU hardware state.
2166 	 * If queued invalidation has already been initialized by us
2167 	 * (for example, while enabling interrupt remapping), then
2168 	 * things are already rolling from a sane state.
2169 	 */
2170 	if (!iommu->qi) {
2171 		/*
2172 		 * Clear any previous faults.
2173 		 */
2174 		dmar_fault(-1, iommu);
2175 		/*
2176 		 * Disable queued invalidation if supported and already enabled
2177 		 * before OS handover.
2178 		 */
2179 		dmar_disable_qi(iommu);
2180 	}
2181 
2182 	if (dmar_enable_qi(iommu)) {
2183 		/*
2184 		 * Queued invalidation could not be enabled; use register-based invalidation.
2185 		 */
2186 		iommu->flush.flush_context = __iommu_flush_context;
2187 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2188 		pr_info("%s: Using Register based invalidation\n",
2189 			iommu->name);
2190 	} else {
2191 		iommu->flush.flush_context = qi_flush_context;
2192 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2193 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2194 	}
2195 }
2196 
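/*
 * Copy the previous kernel's context entries for @bus into newly
 * allocated table(s), marking each copied entry and reserving its
 * domain ID so this kernel will not reuse it.
 */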
2197 static int copy_context_table(struct intel_iommu *iommu,
2198 			      struct root_entry *old_re,
2199 			      struct context_entry **tbl,
2200 			      int bus, bool ext)
2201 {
2202 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2203 	struct context_entry *new_ce = NULL, ce;
2204 	struct context_entry *old_ce = NULL;
2205 	struct root_entry re;
2206 	phys_addr_t old_ce_phys;
2207 
2208 	tbl_idx = ext ? bus * 2 : bus;
2209 	memcpy(&re, old_re, sizeof(re));
2210 
2211 	for (devfn = 0; devfn < 256; devfn++) {
2212 		/* First calculate the correct index */
2213 		idx = (ext ? devfn * 2 : devfn) % 256;
2214 
2215 		if (idx == 0) {
2216 			/* First save what we may have and clean up */
2217 			if (new_ce) {
2218 				tbl[tbl_idx] = new_ce;
2219 				__iommu_flush_cache(iommu, new_ce,
2220 						    VTD_PAGE_SIZE);
2221 				pos = 1;
2222 			}
2223 
2224 			if (old_ce)
2225 				memunmap(old_ce);
2226 
2227 			ret = 0;
2228 			if (devfn < 0x80)
2229 				old_ce_phys = root_entry_lctp(&re);
2230 			else
2231 				old_ce_phys = root_entry_uctp(&re);
2232 
2233 			if (!old_ce_phys) {
2234 				if (ext && devfn == 0) {
2235 					/* No LCTP, try UCTP */
2236 					devfn = 0x7f;
2237 					continue;
2238 				} else {
2239 					goto out;
2240 				}
2241 			}
2242 
2243 			ret = -ENOMEM;
2244 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2245 					MEMREMAP_WB);
2246 			if (!old_ce)
2247 				goto out;
2248 
2249 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2250 			if (!new_ce)
2251 				goto out_unmap;
2252 
2253 			ret = 0;
2254 		}
2255 
2256 		/* Now copy the context entry */
2257 		memcpy(&ce, old_ce + idx, sizeof(ce));
2258 
2259 		if (!context_present(&ce))
2260 			continue;
2261 
2262 		did = context_domain_id(&ce);
2263 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2264 			set_bit(did, iommu->domain_ids);
2265 
2266 		set_context_copied(iommu, bus, devfn);
2267 		new_ce[idx] = ce;
2268 	}
2269 
2270 	tbl[tbl_idx + pos] = new_ce;
2271 
2272 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2273 
2274 out_unmap:
2275 	memunmap(old_ce);
2276 
2277 out:
2278 	return ret;
2279 }
2280 
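/*
 * Copy the context tables referenced by the previous kernel's root entry
 * table (e.g. in a kdump kernel) and install them in this kernel's root
 * entry table, bailing out if the table format would have to change.
 */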
2281 static int copy_translation_tables(struct intel_iommu *iommu)
2282 {
2283 	struct context_entry **ctxt_tbls;
2284 	struct root_entry *old_rt;
2285 	phys_addr_t old_rt_phys;
2286 	int ctxt_table_entries;
2287 	u64 rtaddr_reg;
2288 	int bus, ret;
2289 	bool new_ext, ext;
2290 
2291 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2292 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2293 	new_ext    = !!sm_supported(iommu);
2294 
2295 	/*
2296 	 * The RTT bit can only be changed when translation is disabled,
2297 	 * but disabling translation would open a window for data
2298 	 * corruption. So bail out and don't copy anything if we would
2299 	 * have to change the bit.
2300 	 */
2301 	if (new_ext != ext)
2302 		return -EINVAL;
2303 
2304 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2305 	if (!iommu->copied_tables)
2306 		return -ENOMEM;
2307 
2308 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2309 	if (!old_rt_phys)
2310 		return -EINVAL;
2311 
2312 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2313 	if (!old_rt)
2314 		return -ENOMEM;
2315 
2316 	/* This is too big for the stack - allocate it from slab */
2317 	ctxt_table_entries = ext ? 512 : 256;
2318 	ret = -ENOMEM;
2319 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2320 	if (!ctxt_tbls)
2321 		goto out_unmap;
2322 
2323 	for (bus = 0; bus < 256; bus++) {
2324 		ret = copy_context_table(iommu, &old_rt[bus],
2325 					 ctxt_tbls, bus, ext);
2326 		if (ret) {
2327 			pr_err("%s: Failed to copy context table for bus %d\n",
2328 				iommu->name, bus);
2329 			continue;
2330 		}
2331 	}
2332 
2333 	spin_lock(&iommu->lock);
2334 
2335 	/* Context tables are copied, now write them to the root_entry table */
2336 	for (bus = 0; bus < 256; bus++) {
2337 		int idx = ext ? bus * 2 : bus;
2338 		u64 val;
2339 
2340 		if (ctxt_tbls[idx]) {
2341 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2342 			iommu->root_entry[bus].lo = val;
2343 		}
2344 
2345 		if (!ext || !ctxt_tbls[idx + 1])
2346 			continue;
2347 
2348 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2349 		iommu->root_entry[bus].hi = val;
2350 	}
2351 
2352 	spin_unlock(&iommu->lock);
2353 
2354 	kfree(ctxt_tbls);
2355 
2356 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2357 
2358 	ret = 0;
2359 
2360 out_unmap:
2361 	memunmap(old_rt);
2362 
2363 	return ret;
2364 }
2365 
2366 static int __init init_dmars(void)
2367 {
2368 	struct dmar_drhd_unit *drhd;
2369 	struct intel_iommu *iommu;
2370 	int ret;
2371 
2372 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2373 	if (ret)
2374 		goto free_iommu;
2375 
2376 	for_each_iommu(iommu, drhd) {
2377 		if (drhd->ignored) {
2378 			iommu_disable_translation(iommu);
2379 			continue;
2380 		}
2381 
2382 		/*
2383 		 * Find the maximum PASID size of all IOMMUs in the system.
2384 		 * We need to ensure the system PASID table is no bigger
2385 		 * than the smallest supported size.
2386 		 */
2387 		if (pasid_supported(iommu)) {
2388 			u32 temp = 2 << ecap_pss(iommu->ecap);
2389 
2390 			intel_pasid_max_id = min_t(u32, temp,
2391 						   intel_pasid_max_id);
2392 		}
2393 
2394 		intel_iommu_init_qi(iommu);
2395 
2396 		ret = iommu_init_domains(iommu);
2397 		if (ret)
2398 			goto free_iommu;
2399 
2400 		init_translation_status(iommu);
2401 
2402 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2403 			iommu_disable_translation(iommu);
2404 			clear_translation_pre_enabled(iommu);
2405 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2406 				iommu->name);
2407 		}
2408 
2409 		/*
2410 		 * TBD:
2411 		 * we could share the same root & context tables
2412 		 * among all IOMMUs. Needs to be split out later.
2413 		 */
2414 		ret = iommu_alloc_root_entry(iommu);
2415 		if (ret)
2416 			goto free_iommu;
2417 
2418 		if (translation_pre_enabled(iommu)) {
2419 			pr_info("Translation already enabled - trying to copy translation structures\n");
2420 
2421 			ret = copy_translation_tables(iommu);
2422 			if (ret) {
2423 				/*
2424 				 * We found the IOMMU with translation
2425 				 * enabled - but failed to copy over the
2426 				 * old root-entry table. Try to proceed
2427 				 * by disabling translation now and
2428 				 * allocating a clean root-entry table.
2429 				 * This might cause DMAR faults, but
2430 				 * probably the dump will still succeed.
2431 				 */
2432 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2433 				       iommu->name);
2434 				iommu_disable_translation(iommu);
2435 				clear_translation_pre_enabled(iommu);
2436 			} else {
2437 				pr_info("Copied translation tables from previous kernel for %s\n",
2438 					iommu->name);
2439 			}
2440 		}
2441 
2442 		if (!ecap_pass_through(iommu->ecap))
2443 			hw_pass_through = 0;
2444 		intel_svm_check(iommu);
2445 	}
2446 
2447 	/*
2448 	 * Now that qi is enabled on all iommus, set the root entry and flush
2449 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2450 	 * flush_context function will loop forever and the boot hangs.
2451 	 */
2452 	for_each_active_iommu(iommu, drhd) {
2453 		iommu_flush_write_buffer(iommu);
2454 		iommu_set_root_entry(iommu);
2455 	}
2456 
2457 	check_tylersburg_isoch();
2458 
2459 	ret = si_domain_init(hw_pass_through);
2460 	if (ret)
2461 		goto free_iommu;
2462 
2463 	/*
2464 	 * for each drhd
2465 	 *   enable fault log
2466 	 *   global invalidate context cache
2467 	 *   global invalidate iotlb
2468 	 *   enable translation
2469 	 */
2470 	for_each_iommu(iommu, drhd) {
2471 		if (drhd->ignored) {
2472 			/*
2473 			 * we always have to disable PMRs or DMA may fail on
2474 			 * this device
2475 			 */
2476 			if (force_on)
2477 				iommu_disable_protect_mem_regions(iommu);
2478 			continue;
2479 		}
2480 
2481 		iommu_flush_write_buffer(iommu);
2482 
2483 #ifdef CONFIG_INTEL_IOMMU_SVM
2484 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2485 			/*
2486 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
2487 			 * could cause a lock race, so drop the lock around it.
2488 			 */
2489 			up_write(&dmar_global_lock);
2490 			ret = intel_svm_enable_prq(iommu);
2491 			down_write(&dmar_global_lock);
2492 			if (ret)
2493 				goto free_iommu;
2494 		}
2495 #endif
2496 		ret = dmar_set_interrupt(iommu);
2497 		if (ret)
2498 			goto free_iommu;
2499 	}
2500 
2501 	return 0;
2502 
2503 free_iommu:
2504 	for_each_active_iommu(iommu, drhd) {
2505 		disable_dmar_iommu(iommu);
2506 		free_dmar_iommu(iommu);
2507 	}
2508 	if (si_domain) {
2509 		domain_exit(si_domain);
2510 		si_domain = NULL;
2511 	}
2512 
2513 	return ret;
2514 }
2515 
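/*
 * Ignore DMAR units that have no devices in their scope, and mark units
 * that cover only graphics devices as gfx-dedicated so they can be
 * bypassed when the integrated graphics IOMMU is disabled.
 */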
2516 static void __init init_no_remapping_devices(void)
2517 {
2518 	struct dmar_drhd_unit *drhd;
2519 	struct device *dev;
2520 	int i;
2521 
2522 	for_each_drhd_unit(drhd) {
2523 		if (!drhd->include_all) {
2524 			for_each_active_dev_scope(drhd->devices,
2525 						  drhd->devices_cnt, i, dev)
2526 				break;
2527 			/* ignore DMAR unit if no devices exist */
2528 			if (i == drhd->devices_cnt)
2529 				drhd->ignored = 1;
2530 		}
2531 	}
2532 
2533 	for_each_active_drhd_unit(drhd) {
2534 		if (drhd->include_all)
2535 			continue;
2536 
2537 		for_each_active_dev_scope(drhd->devices,
2538 					  drhd->devices_cnt, i, dev)
2539 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2540 				break;
2541 		if (i < drhd->devices_cnt)
2542 			continue;
2543 
2544 		/* This IOMMU has *only* gfx devices. Either bypass it or
2545 		   set the gfx_dedicated flag, as appropriate. */
2546 		drhd->gfx_dedicated = 1;
2547 		if (disable_igfx_iommu)
2548 			drhd->ignored = 1;
2549 	}
2550 }
2551 
2552 #ifdef CONFIG_SUSPEND
2553 static int init_iommu_hw(void)
2554 {
2555 	struct dmar_drhd_unit *drhd;
2556 	struct intel_iommu *iommu = NULL;
2557 	int ret;
2558 
2559 	for_each_active_iommu(iommu, drhd) {
2560 		if (iommu->qi) {
2561 			ret = dmar_reenable_qi(iommu);
2562 			if (ret)
2563 				return ret;
2564 		}
2565 	}
2566 
2567 	for_each_iommu(iommu, drhd) {
2568 		if (drhd->ignored) {
2569 			/*
2570 			 * we always have to disable PMRs or DMA may fail on
2571 			 * this device
2572 			 */
2573 			if (force_on)
2574 				iommu_disable_protect_mem_regions(iommu);
2575 			continue;
2576 		}
2577 
2578 		iommu_flush_write_buffer(iommu);
2579 		iommu_set_root_entry(iommu);
2580 		iommu_enable_translation(iommu);
2581 		iommu_disable_protect_mem_regions(iommu);
2582 	}
2583 
2584 	return 0;
2585 }
2586 
2587 static void iommu_flush_all(void)
2588 {
2589 	struct dmar_drhd_unit *drhd;
2590 	struct intel_iommu *iommu;
2591 
2592 	for_each_active_iommu(iommu, drhd) {
2593 		iommu->flush.flush_context(iommu, 0, 0, 0,
2594 					   DMA_CCMD_GLOBAL_INVL);
2595 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2596 					 DMA_TLB_GLOBAL_FLUSH);
2597 	}
2598 }
2599 
2600 static int iommu_suspend(void)
2601 {
2602 	struct dmar_drhd_unit *drhd;
2603 	struct intel_iommu *iommu = NULL;
2604 	unsigned long flag;
2605 
2606 	iommu_flush_all();
2607 
2608 	for_each_active_iommu(iommu, drhd) {
2609 		iommu_disable_translation(iommu);
2610 
2611 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2612 
2613 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2614 			readl(iommu->reg + DMAR_FECTL_REG);
2615 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2616 			readl(iommu->reg + DMAR_FEDATA_REG);
2617 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2618 			readl(iommu->reg + DMAR_FEADDR_REG);
2619 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2620 			readl(iommu->reg + DMAR_FEUADDR_REG);
2621 
2622 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2623 	}
2624 	return 0;
2625 }
2626 
2627 static void iommu_resume(void)
2628 {
2629 	struct dmar_drhd_unit *drhd;
2630 	struct intel_iommu *iommu = NULL;
2631 	unsigned long flag;
2632 
2633 	if (init_iommu_hw()) {
2634 		if (force_on)
2635 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2636 		else
2637 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2638 		return;
2639 	}
2640 
2641 	for_each_active_iommu(iommu, drhd) {
2642 
2643 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2644 
2645 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2646 			iommu->reg + DMAR_FECTL_REG);
2647 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2648 			iommu->reg + DMAR_FEDATA_REG);
2649 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2650 			iommu->reg + DMAR_FEADDR_REG);
2651 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2652 			iommu->reg + DMAR_FEUADDR_REG);
2653 
2654 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2655 	}
2656 }
2657 
2658 static struct syscore_ops iommu_syscore_ops = {
2659 	.resume		= iommu_resume,
2660 	.suspend	= iommu_suspend,
2661 };
2662 
2663 static void __init init_iommu_pm_ops(void)
2664 {
2665 	register_syscore_ops(&iommu_syscore_ops);
2666 }
2667 
2668 #else
2669 static inline void init_iommu_pm_ops(void) {}
2670 #endif	/* CONFIG_SUSPEND */
2671 
2672 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2673 {
2674 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2675 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2676 	    rmrr->end_address <= rmrr->base_address ||
2677 	    arch_rmrr_sanity_check(rmrr))
2678 		return -EINVAL;
2679 
2680 	return 0;
2681 }
2682 
2683 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2684 {
2685 	struct acpi_dmar_reserved_memory *rmrr;
2686 	struct dmar_rmrr_unit *rmrru;
2687 
2688 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2689 	if (rmrr_sanity_check(rmrr)) {
2690 		pr_warn(FW_BUG
2691 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2692 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2693 			   rmrr->base_address, rmrr->end_address,
2694 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2695 			   dmi_get_system_info(DMI_BIOS_VERSION),
2696 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2697 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2698 	}
2699 
2700 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2701 	if (!rmrru)
2702 		goto out;
2703 
2704 	rmrru->hdr = header;
2705 
2706 	rmrru->base_address = rmrr->base_address;
2707 	rmrru->end_address = rmrr->end_address;
2708 
2709 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2710 				((void *)rmrr) + rmrr->header.length,
2711 				&rmrru->devices_cnt);
2712 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2713 		goto free_rmrru;
2714 
2715 	list_add(&rmrru->list, &dmar_rmrr_units);
2716 
2717 	return 0;
2718 free_rmrru:
2719 	kfree(rmrru);
2720 out:
2721 	return -ENOMEM;
2722 }
2723 
2724 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2725 {
2726 	struct dmar_atsr_unit *atsru;
2727 	struct acpi_dmar_atsr *tmp;
2728 
2729 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2730 				dmar_rcu_check()) {
2731 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2732 		if (atsr->segment != tmp->segment)
2733 			continue;
2734 		if (atsr->header.length != tmp->header.length)
2735 			continue;
2736 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2737 			return atsru;
2738 	}
2739 
2740 	return NULL;
2741 }
2742 
2743 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2744 {
2745 	struct acpi_dmar_atsr *atsr;
2746 	struct dmar_atsr_unit *atsru;
2747 
2748 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2749 		return 0;
2750 
2751 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2752 	atsru = dmar_find_atsr(atsr);
2753 	if (atsru)
2754 		return 0;
2755 
2756 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2757 	if (!atsru)
2758 		return -ENOMEM;
2759 
2760 	/*
2761 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2762 	 * copy the memory content because the memory buffer will be freed
2763 	 * on return.
2764 	 */
2765 	atsru->hdr = (void *)(atsru + 1);
2766 	memcpy(atsru->hdr, hdr, hdr->length);
2767 	atsru->include_all = atsr->flags & 0x1;
2768 	if (!atsru->include_all) {
2769 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2770 				(void *)atsr + atsr->header.length,
2771 				&atsru->devices_cnt);
2772 		if (atsru->devices_cnt && atsru->devices == NULL) {
2773 			kfree(atsru);
2774 			return -ENOMEM;
2775 		}
2776 	}
2777 
2778 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2779 
2780 	return 0;
2781 }
2782 
2783 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2784 {
2785 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2786 	kfree(atsru);
2787 }
2788 
2789 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2790 {
2791 	struct acpi_dmar_atsr *atsr;
2792 	struct dmar_atsr_unit *atsru;
2793 
2794 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2795 	atsru = dmar_find_atsr(atsr);
2796 	if (atsru) {
2797 		list_del_rcu(&atsru->list);
2798 		synchronize_rcu();
2799 		intel_iommu_free_atsr(atsru);
2800 	}
2801 
2802 	return 0;
2803 }
2804 
2805 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2806 {
2807 	int i;
2808 	struct device *dev;
2809 	struct acpi_dmar_atsr *atsr;
2810 	struct dmar_atsr_unit *atsru;
2811 
2812 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2813 	atsru = dmar_find_atsr(atsr);
2814 	if (!atsru)
2815 		return 0;
2816 
2817 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2818 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2819 					  i, dev)
2820 			return -EBUSY;
2821 	}
2822 
2823 	return 0;
2824 }
2825 
2826 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2827 {
2828 	struct dmar_satc_unit *satcu;
2829 	struct acpi_dmar_satc *tmp;
2830 
2831 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2832 				dmar_rcu_check()) {
2833 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2834 		if (satc->segment != tmp->segment)
2835 			continue;
2836 		if (satc->header.length != tmp->header.length)
2837 			continue;
2838 		if (memcmp(satc, tmp, satc->header.length) == 0)
2839 			return satcu;
2840 	}
2841 
2842 	return NULL;
2843 }
2844 
2845 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2846 {
2847 	struct acpi_dmar_satc *satc;
2848 	struct dmar_satc_unit *satcu;
2849 
2850 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2851 		return 0;
2852 
2853 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2854 	satcu = dmar_find_satc(satc);
2855 	if (satcu)
2856 		return 0;
2857 
2858 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2859 	if (!satcu)
2860 		return -ENOMEM;
2861 
2862 	satcu->hdr = (void *)(satcu + 1);
2863 	memcpy(satcu->hdr, hdr, hdr->length);
2864 	satcu->atc_required = satc->flags & 0x1;
2865 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2866 					      (void *)satc + satc->header.length,
2867 					      &satcu->devices_cnt);
2868 	if (satcu->devices_cnt && !satcu->devices) {
2869 		kfree(satcu);
2870 		return -ENOMEM;
2871 	}
2872 	list_add_rcu(&satcu->list, &dmar_satc_units);
2873 
2874 	return 0;
2875 }
2876 
2877 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2878 {
2879 	int sp, ret;
2880 	struct intel_iommu *iommu = dmaru->iommu;
2881 
2882 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2883 	if (ret)
2884 		goto out;
2885 
2886 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
2887 		pr_warn("%s: Doesn't support hardware pass through.\n",
2888 			iommu->name);
2889 		return -ENXIO;
2890 	}
2891 
2892 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2893 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2894 		pr_warn("%s: Doesn't support large page.\n",
2895 			iommu->name);
2896 		return -ENXIO;
2897 	}
2898 
2899 	/*
2900 	 * Disable translation if already enabled prior to OS handover.
2901 	 */
2902 	if (iommu->gcmd & DMA_GCMD_TE)
2903 		iommu_disable_translation(iommu);
2904 
2905 	ret = iommu_init_domains(iommu);
2906 	if (ret == 0)
2907 		ret = iommu_alloc_root_entry(iommu);
2908 	if (ret)
2909 		goto out;
2910 
2911 	intel_svm_check(iommu);
2912 
2913 	if (dmaru->ignored) {
2914 		/*
2915 		 * we always have to disable PMRs or DMA may fail on this device
2916 		 */
2917 		if (force_on)
2918 			iommu_disable_protect_mem_regions(iommu);
2919 		return 0;
2920 	}
2921 
2922 	intel_iommu_init_qi(iommu);
2923 	iommu_flush_write_buffer(iommu);
2924 
2925 #ifdef CONFIG_INTEL_IOMMU_SVM
2926 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2927 		ret = intel_svm_enable_prq(iommu);
2928 		if (ret)
2929 			goto disable_iommu;
2930 	}
2931 #endif
2932 	ret = dmar_set_interrupt(iommu);
2933 	if (ret)
2934 		goto disable_iommu;
2935 
2936 	iommu_set_root_entry(iommu);
2937 	iommu_enable_translation(iommu);
2938 
2939 	iommu_disable_protect_mem_regions(iommu);
2940 	return 0;
2941 
2942 disable_iommu:
2943 	disable_dmar_iommu(iommu);
2944 out:
2945 	free_dmar_iommu(iommu);
2946 	return ret;
2947 }
2948 
2949 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2950 {
2951 	int ret = 0;
2952 	struct intel_iommu *iommu = dmaru->iommu;
2953 
2954 	if (!intel_iommu_enabled)
2955 		return 0;
2956 	if (iommu == NULL)
2957 		return -EINVAL;
2958 
2959 	if (insert) {
2960 		ret = intel_iommu_add(dmaru);
2961 	} else {
2962 		disable_dmar_iommu(iommu);
2963 		free_dmar_iommu(iommu);
2964 	}
2965 
2966 	return ret;
2967 }
2968 
2969 static void intel_iommu_free_dmars(void)
2970 {
2971 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2972 	struct dmar_atsr_unit *atsru, *atsr_n;
2973 	struct dmar_satc_unit *satcu, *satc_n;
2974 
2975 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2976 		list_del(&rmrru->list);
2977 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2978 		kfree(rmrru);
2979 	}
2980 
2981 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2982 		list_del(&atsru->list);
2983 		intel_iommu_free_atsr(atsru);
2984 	}
2985 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2986 		list_del(&satcu->list);
2987 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2988 		kfree(satcu);
2989 	}
2990 }
2991 
2992 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2993 {
2994 	struct dmar_satc_unit *satcu;
2995 	struct acpi_dmar_satc *satc;
2996 	struct device *tmp;
2997 	int i;
2998 
2999 	dev = pci_physfn(dev);
3000 	rcu_read_lock();
3001 
3002 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3003 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3004 		if (satc->segment != pci_domain_nr(dev->bus))
3005 			continue;
3006 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3007 			if (to_pci_dev(tmp) == dev)
3008 				goto out;
3009 	}
3010 	satcu = NULL;
3011 out:
3012 	rcu_read_unlock();
3013 	return satcu;
3014 }
3015 
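/*
 * Decide whether ATS may be enabled for @dev behind @iommu: honour the
 * SATC table if the device is listed there, allow integrated devices
 * (no upstream bridge), and otherwise require the device's root port to
 * be covered by an ATSR unit for the segment.
 */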
3016 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3017 {
3018 	int i, ret = 1;
3019 	struct pci_bus *bus;
3020 	struct pci_dev *bridge = NULL;
3021 	struct device *tmp;
3022 	struct acpi_dmar_atsr *atsr;
3023 	struct dmar_atsr_unit *atsru;
3024 	struct dmar_satc_unit *satcu;
3025 
3026 	dev = pci_physfn(dev);
3027 	satcu = dmar_find_matched_satc_unit(dev);
3028 	if (satcu)
3029 		/*
3030 		 * This device supports ATS as it is in the SATC table.
3031 		 * When the IOMMU is in legacy mode, the hardware enables
3032 		 * ATS automatically for devices that require it, so the
3033 		 * OS should not enable ATS on this device, to avoid
3034 		 * duplicated TLB invalidations.
3035 		 */
3036 		return !(satcu->atc_required && !sm_supported(iommu));
3037 
3038 	for (bus = dev->bus; bus; bus = bus->parent) {
3039 		bridge = bus->self;
3040 		/* If it's an integrated device, allow ATS */
3041 		if (!bridge)
3042 			return 1;
3043 		/* Connected via non-PCIe: no ATS */
3044 		if (!pci_is_pcie(bridge) ||
3045 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3046 			return 0;
3047 		/* If we found the root port, look it up in the ATSR */
3048 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3049 			break;
3050 	}
3051 
3052 	rcu_read_lock();
3053 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3054 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3055 		if (atsr->segment != pci_domain_nr(dev->bus))
3056 			continue;
3057 
3058 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3059 			if (tmp == &bridge->dev)
3060 				goto out;
3061 
3062 		if (atsru->include_all)
3063 			goto out;
3064 	}
3065 	ret = 0;
3066 out:
3067 	rcu_read_unlock();
3068 
3069 	return ret;
3070 }
3071 
3072 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3073 {
3074 	int ret;
3075 	struct dmar_rmrr_unit *rmrru;
3076 	struct dmar_atsr_unit *atsru;
3077 	struct dmar_satc_unit *satcu;
3078 	struct acpi_dmar_atsr *atsr;
3079 	struct acpi_dmar_reserved_memory *rmrr;
3080 	struct acpi_dmar_satc *satc;
3081 
3082 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3083 		return 0;
3084 
3085 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3086 		rmrr = container_of(rmrru->hdr,
3087 				    struct acpi_dmar_reserved_memory, header);
3088 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3089 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3090 				((void *)rmrr) + rmrr->header.length,
3091 				rmrr->segment, rmrru->devices,
3092 				rmrru->devices_cnt);
3093 			if (ret < 0)
3094 				return ret;
3095 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3096 			dmar_remove_dev_scope(info, rmrr->segment,
3097 				rmrru->devices, rmrru->devices_cnt);
3098 		}
3099 	}
3100 
3101 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3102 		if (atsru->include_all)
3103 			continue;
3104 
3105 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3106 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3107 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3108 					(void *)atsr + atsr->header.length,
3109 					atsr->segment, atsru->devices,
3110 					atsru->devices_cnt);
3111 			if (ret > 0)
3112 				break;
3113 			else if (ret < 0)
3114 				return ret;
3115 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3116 			if (dmar_remove_dev_scope(info, atsr->segment,
3117 					atsru->devices, atsru->devices_cnt))
3118 				break;
3119 		}
3120 	}
3121 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3122 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3123 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3124 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3125 					(void *)satc + satc->header.length,
3126 					satc->segment, satcu->devices,
3127 					satcu->devices_cnt);
3128 			if (ret > 0)
3129 				break;
3130 			else if (ret < 0)
3131 				return ret;
3132 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3133 			if (dmar_remove_dev_scope(info, satc->segment,
3134 					satcu->devices, satcu->devices_cnt))
3135 				break;
3136 		}
3137 	}
3138 
3139 	return 0;
3140 }
3141 
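/*
 * Memory hotplug notifier: extend the identity map when a memory block
 * goes online and unmap the range again when it goes offline.
 */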
3142 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3143 				       unsigned long val, void *v)
3144 {
3145 	struct memory_notify *mhp = v;
3146 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3147 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3148 			mhp->nr_pages - 1);
3149 
3150 	switch (val) {
3151 	case MEM_GOING_ONLINE:
3152 		if (iommu_domain_identity_map(si_domain,
3153 					      start_vpfn, last_vpfn)) {
3154 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3155 				start_vpfn, last_vpfn);
3156 			return NOTIFY_BAD;
3157 		}
3158 		break;
3159 
3160 	case MEM_OFFLINE:
3161 	case MEM_CANCEL_ONLINE:
3162 		{
3163 			LIST_HEAD(freelist);
3164 
3165 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3166 			iommu_put_pages_list(&freelist);
3167 		}
3168 		break;
3169 	}
3170 
3171 	return NOTIFY_OK;
3172 }
3173 
3174 static struct notifier_block intel_iommu_memory_nb = {
3175 	.notifier_call = intel_iommu_memory_notifier,
3176 	.priority = 0
3177 };
3178 
3179 static void intel_disable_iommus(void)
3180 {
3181 	struct intel_iommu *iommu = NULL;
3182 	struct dmar_drhd_unit *drhd;
3183 
3184 	for_each_iommu(iommu, drhd)
3185 		iommu_disable_translation(iommu);
3186 }
3187 
3188 void intel_iommu_shutdown(void)
3189 {
3190 	struct dmar_drhd_unit *drhd;
3191 	struct intel_iommu *iommu = NULL;
3192 
3193 	if (no_iommu || dmar_disabled)
3194 		return;
3195 
3196 	down_write(&dmar_global_lock);
3197 
3198 	/* Disable PMRs explicitly here. */
3199 	for_each_iommu(iommu, drhd)
3200 		iommu_disable_protect_mem_regions(iommu);
3201 
3202 	/* Make sure the IOMMUs are switched off */
3203 	intel_disable_iommus();
3204 
3205 	up_write(&dmar_global_lock);
3206 }
3207 
3208 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3209 {
3210 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3211 
3212 	return container_of(iommu_dev, struct intel_iommu, iommu);
3213 }
3214 
3215 static ssize_t version_show(struct device *dev,
3216 			    struct device_attribute *attr, char *buf)
3217 {
3218 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3219 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3220 	return sysfs_emit(buf, "%d:%d\n",
3221 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3222 }
3223 static DEVICE_ATTR_RO(version);
3224 
3225 static ssize_t address_show(struct device *dev,
3226 			    struct device_attribute *attr, char *buf)
3227 {
3228 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3229 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3230 }
3231 static DEVICE_ATTR_RO(address);
3232 
3233 static ssize_t cap_show(struct device *dev,
3234 			struct device_attribute *attr, char *buf)
3235 {
3236 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3237 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3238 }
3239 static DEVICE_ATTR_RO(cap);
3240 
3241 static ssize_t ecap_show(struct device *dev,
3242 			 struct device_attribute *attr, char *buf)
3243 {
3244 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3245 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3246 }
3247 static DEVICE_ATTR_RO(ecap);
3248 
3249 static ssize_t domains_supported_show(struct device *dev,
3250 				      struct device_attribute *attr, char *buf)
3251 {
3252 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3253 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3254 }
3255 static DEVICE_ATTR_RO(domains_supported);
3256 
3257 static ssize_t domains_used_show(struct device *dev,
3258 				 struct device_attribute *attr, char *buf)
3259 {
3260 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3261 	return sysfs_emit(buf, "%d\n",
3262 			  bitmap_weight(iommu->domain_ids,
3263 					cap_ndoms(iommu->cap)));
3264 }
3265 static DEVICE_ATTR_RO(domains_used);
3266 
3267 static struct attribute *intel_iommu_attrs[] = {
3268 	&dev_attr_version.attr,
3269 	&dev_attr_address.attr,
3270 	&dev_attr_cap.attr,
3271 	&dev_attr_ecap.attr,
3272 	&dev_attr_domains_supported.attr,
3273 	&dev_attr_domains_used.attr,
3274 	NULL,
3275 };
3276 
3277 static struct attribute_group intel_iommu_group = {
3278 	.name = "intel-iommu",
3279 	.attrs = intel_iommu_attrs,
3280 };
3281 
3282 const struct attribute_group *intel_iommu_groups[] = {
3283 	&intel_iommu_group,
3284 	NULL,
3285 };
3286 
3287 static bool has_external_pci(void)
3288 {
3289 	struct pci_dev *pdev = NULL;
3290 
3291 	for_each_pci_dev(pdev)
3292 		if (pdev->external_facing) {
3293 			pci_dev_put(pdev);
3294 			return true;
3295 		}
3296 
3297 	return false;
3298 }
3299 
3300 static int __init platform_optin_force_iommu(void)
3301 {
3302 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3303 		return 0;
3304 
3305 	if (no_iommu || dmar_disabled)
3306 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3307 
3308 	/*
3309 	 * If Intel-IOMMU is disabled by default, we will apply identity
3310 	 * map for all devices except those marked as being untrusted.
3311 	 */
3312 	if (dmar_disabled)
3313 		iommu_set_default_passthrough(false);
3314 
3315 	dmar_disabled = 0;
3316 	no_iommu = 0;
3317 
3318 	return 1;
3319 }
3320 
3321 static int __init probe_acpi_namespace_devices(void)
3322 {
3323 	struct dmar_drhd_unit *drhd;
3324 	/* To avoid a -Wunused-but-set-variable warning. */
3325 	struct intel_iommu *iommu __maybe_unused;
3326 	struct device *dev;
3327 	int i, ret = 0;
3328 
3329 	for_each_active_iommu(iommu, drhd) {
3330 		for_each_active_dev_scope(drhd->devices,
3331 					  drhd->devices_cnt, i, dev) {
3332 			struct acpi_device_physical_node *pn;
3333 			struct acpi_device *adev;
3334 
3335 			if (dev->bus != &acpi_bus_type)
3336 				continue;
3337 
3338 			adev = to_acpi_device(dev);
3339 			mutex_lock(&adev->physical_node_lock);
3340 			list_for_each_entry(pn,
3341 					    &adev->physical_node_list, node) {
3342 				ret = iommu_probe_device(pn->dev);
3343 				if (ret)
3344 					break;
3345 			}
3346 			mutex_unlock(&adev->physical_node_lock);
3347 
3348 			if (ret)
3349 				return ret;
3350 		}
3351 	}
3352 
3353 	return 0;
3354 }
3355 
3356 static __init int tboot_force_iommu(void)
3357 {
3358 	if (!tboot_enabled())
3359 		return 0;
3360 
3361 	if (no_iommu || dmar_disabled)
3362 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3363 
3364 	dmar_disabled = 0;
3365 	no_iommu = 0;
3366 
3367 	return 1;
3368 }
3369 
3370 int __init intel_iommu_init(void)
3371 {
3372 	int ret = -ENODEV;
3373 	struct dmar_drhd_unit *drhd;
3374 	struct intel_iommu *iommu;
3375 
3376 	/*
3377 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3378 	 * opt in, so enforce that.
3379 	 */
3380 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3381 		    platform_optin_force_iommu();
3382 
3383 	down_write(&dmar_global_lock);
3384 	if (dmar_table_init()) {
3385 		if (force_on)
3386 			panic("tboot: Failed to initialize DMAR table\n");
3387 		goto out_free_dmar;
3388 	}
3389 
3390 	if (dmar_dev_scope_init() < 0) {
3391 		if (force_on)
3392 			panic("tboot: Failed to initialize DMAR device scope\n");
3393 		goto out_free_dmar;
3394 	}
3395 
3396 	up_write(&dmar_global_lock);
3397 
3398 	/*
3399 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3400 	 * complain later when we register it under the lock.
3401 	 */
3402 	dmar_register_bus_notifier();
3403 
3404 	down_write(&dmar_global_lock);
3405 
3406 	if (!no_iommu)
3407 		intel_iommu_debugfs_init();
3408 
3409 	if (no_iommu || dmar_disabled) {
3410 		/*
3411 		 * We exit the function here to ensure the IOMMU's remapping and
3412 		 * mempool aren't set up, which means that the IOMMU's PMRs
3413 		 * won't be disabled via the call to init_dmars(). So disable
3414 		 * them explicitly here. The PMRs were set up by tboot prior to
3415 		 * calling SENTER, but the kernel is expected to reset/tear
3416 		 * them down.
3417 		 */
3418 		if (intel_iommu_tboot_noforce) {
3419 			for_each_iommu(iommu, drhd)
3420 				iommu_disable_protect_mem_regions(iommu);
3421 		}
3422 
3423 		/*
3424 		 * Make sure the IOMMUs are switched off, even when we
3425 		 * boot into a kexec kernel and the previous kernel left
3426 		 * them enabled
3427 		 */
3428 		intel_disable_iommus();
3429 		goto out_free_dmar;
3430 	}
3431 
3432 	if (list_empty(&dmar_rmrr_units))
3433 		pr_info("No RMRR found\n");
3434 
3435 	if (list_empty(&dmar_atsr_units))
3436 		pr_info("No ATSR found\n");
3437 
3438 	if (list_empty(&dmar_satc_units))
3439 		pr_info("No SATC found\n");
3440 
3441 	init_no_remapping_devices();
3442 
3443 	ret = init_dmars();
3444 	if (ret) {
3445 		if (force_on)
3446 			panic("tboot: Failed to initialize DMARs\n");
3447 		pr_err("Initialization failed\n");
3448 		goto out_free_dmar;
3449 	}
3450 	up_write(&dmar_global_lock);
3451 
3452 	init_iommu_pm_ops();
3453 
3454 	down_read(&dmar_global_lock);
3455 	for_each_active_iommu(iommu, drhd) {
3456 		/*
3457 		 * The flush queue implementation does not perform
3458 		 * page-selective invalidations that are required for efficient
3459 		 * TLB flushes in virtual environments.  The benefit of batching
3460 		 * is likely to be much lower than the overhead of synchronizing
3461 		 * the virtual and physical IOMMU page-tables.
3462 		 */
3463 		if (cap_caching_mode(iommu->cap) &&
3464 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3465 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3466 			iommu_set_dma_strict();
3467 		}
3468 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3469 				       intel_iommu_groups,
3470 				       "%s", iommu->name);
3471 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3472 
3473 		iommu_pmu_register(iommu);
3474 	}
3475 	up_read(&dmar_global_lock);
3476 
3477 	if (si_domain && !hw_pass_through)
3478 		register_memory_notifier(&intel_iommu_memory_nb);
3479 
3480 	down_read(&dmar_global_lock);
3481 	if (probe_acpi_namespace_devices())
3482 		pr_warn("ACPI name space devices didn't probe correctly\n");
3483 
3484 	/* Finally, we enable the DMA remapping hardware. */
3485 	for_each_iommu(iommu, drhd) {
3486 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3487 			iommu_enable_translation(iommu);
3488 
3489 		iommu_disable_protect_mem_regions(iommu);
3490 	}
3491 	up_read(&dmar_global_lock);
3492 
3493 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3494 
3495 	intel_iommu_enabled = 1;
3496 
3497 	return 0;
3498 
3499 out_free_dmar:
3500 	intel_iommu_free_dmars();
3501 	up_write(&dmar_global_lock);
3502 	return ret;
3503 }
3504 
3505 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3506 {
3507 	struct device_domain_info *info = opaque;
3508 
3509 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3510 	return 0;
3511 }
3512 
3513 /*
3514  * NB - intel-iommu lacks any sort of reference counting for the users of
3515  * dependent devices.  If multiple endpoints have intersecting dependent
3516  * devices, unbinding the driver from any one of them will possibly leave
3517  * the others unable to operate.
3518  */
3519 static void domain_context_clear(struct device_domain_info *info)
3520 {
3521 	if (!dev_is_pci(info->dev)) {
3522 		domain_context_clear_one(info, info->bus, info->devfn);
		return;
	}
3523 
3524 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3525 			       &domain_context_clear_one_cb, info);
3526 }
3527 
3528 /*
3529  * Clear the page table pointer in context or pasid table entries so that
3530  * all DMA requests without PASID from the device are blocked. If the page
3531  * table has been set, clean up the data structures.
3532  */
3533 void device_block_translation(struct device *dev)
3534 {
3535 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3536 	struct intel_iommu *iommu = info->iommu;
3537 	unsigned long flags;
3538 
3539 	iommu_disable_pci_caps(info);
3540 	if (!dev_is_real_dma_subdevice(dev)) {
3541 		if (sm_supported(iommu))
3542 			intel_pasid_tear_down_entry(iommu, dev,
3543 						    IOMMU_NO_PASID, false);
3544 		else
3545 			domain_context_clear(info);
3546 	}
3547 
3548 	if (!info->domain)
3549 		return;
3550 
3551 	spin_lock_irqsave(&info->domain->lock, flags);
3552 	list_del(&info->link);
3553 	spin_unlock_irqrestore(&info->domain->lock, flags);
3554 
3555 	cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3556 	domain_detach_iommu(info->domain, iommu);
3557 	info->domain = NULL;
3558 }
3559 
3560 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3561 {
3562 	int adjust_width;
3563 
3564 	/* calculate AGAW */
3565 	domain->gaw = guest_width;
3566 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3567 	domain->agaw = width_to_agaw(adjust_width);
3568 
3569 	domain->iommu_coherency = false;
3570 	domain->iommu_superpage = 0;
3571 	domain->max_addr = 0;
3572 
3573 	/* always allocate the top pgd */
3574 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3575 	if (!domain->pgd)
3576 		return -ENOMEM;
3577 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3578 	return 0;
3579 }
3580 
3581 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3582 				      struct device *dev)
3583 {
3584 	device_block_translation(dev);
3585 	return 0;
3586 }
3587 
3588 static struct iommu_domain blocking_domain = {
3589 	.type = IOMMU_DOMAIN_BLOCKED,
3590 	.ops = &(const struct iommu_domain_ops) {
3591 		.attach_dev	= blocking_domain_attach_dev,
3592 	}
3593 };
3594 
3595 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3596 {
3597 	if (!intel_iommu_superpage)
3598 		return 0;
3599 
3600 	if (first_stage)
3601 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3602 
3603 	return fls(cap_super_page_val(iommu->cap));
3604 }
3605 
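/*
 * Allocate and initialize a paging domain for @dev, deriving the address
 * width, superpage support and IOVA aperture from the device's IOMMU
 * capabilities. @first_stage selects first- vs. second-stage translation.
 */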
3606 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3607 {
3608 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3609 	struct intel_iommu *iommu = info->iommu;
3610 	struct dmar_domain *domain;
3611 	int addr_width;
3612 
3613 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3614 	if (!domain)
3615 		return ERR_PTR(-ENOMEM);
3616 
3617 	INIT_LIST_HEAD(&domain->devices);
3618 	INIT_LIST_HEAD(&domain->dev_pasids);
3619 	INIT_LIST_HEAD(&domain->cache_tags);
3620 	spin_lock_init(&domain->lock);
3621 	spin_lock_init(&domain->cache_lock);
3622 	xa_init(&domain->iommu_array);
3623 
3624 	domain->nid = dev_to_node(dev);
3625 	domain->has_iotlb_device = info->ats_enabled;
3626 	domain->use_first_level = first_stage;
3627 
3628 	/* calculate the address width */
3629 	addr_width = agaw_to_width(iommu->agaw);
3630 	if (addr_width > cap_mgaw(iommu->cap))
3631 		addr_width = cap_mgaw(iommu->cap);
3632 	domain->gaw = addr_width;
3633 	domain->agaw = iommu->agaw;
3634 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3635 
3636 	/* iommu memory access coherency */
3637 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3638 
3639 	/* pagesize bitmap */
3640 	domain->domain.pgsize_bitmap = SZ_4K;
3641 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3642 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3643 
3644 	/*
3645 	 * IOVA aperture: First-level translation restricts the input-address
3646 	 * to a canonical address (i.e., address bits 63:N have the same value
3647 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3648 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3649 	 */
3650 	domain->domain.geometry.force_aperture = true;
3651 	domain->domain.geometry.aperture_start = 0;
3652 	if (first_stage)
3653 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3654 	else
3655 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3656 
3657 	/* always allocate the top pgd */
3658 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3659 	if (!domain->pgd) {
3660 		kfree(domain);
3661 		return ERR_PTR(-ENOMEM);
3662 	}
3663 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3664 
3665 	return domain;
3666 }
3667 
3668 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3669 {
3670 	struct dmar_domain *dmar_domain;
3671 	struct iommu_domain *domain;
3672 
3673 	switch (type) {
3674 	case IOMMU_DOMAIN_DMA:
3675 	case IOMMU_DOMAIN_UNMANAGED:
3676 		dmar_domain = alloc_domain(type);
3677 		if (!dmar_domain) {
3678 			pr_err("Can't allocate dmar_domain\n");
3679 			return NULL;
3680 		}
3681 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3682 			pr_err("Domain initialization failed\n");
3683 			domain_exit(dmar_domain);
3684 			return NULL;
3685 		}
3686 
3687 		domain = &dmar_domain->domain;
3688 		domain->geometry.aperture_start = 0;
3689 		domain->geometry.aperture_end   =
3690 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3691 		domain->geometry.force_aperture = true;
3692 
3693 		return domain;
3694 	case IOMMU_DOMAIN_IDENTITY:
3695 		return &si_domain->domain;
3696 	default:
3697 		return NULL;
3698 	}
3699 
3700 	return NULL;
3701 }
3702 
3703 static struct iommu_domain *
3704 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3705 			      struct iommu_domain *parent,
3706 			      const struct iommu_user_data *user_data)
3707 {
3708 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3709 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3710 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3711 	struct intel_iommu *iommu = info->iommu;
3712 	struct dmar_domain *dmar_domain;
3713 	struct iommu_domain *domain;
3714 
3715 	/* Must be NESTING domain */
3716 	if (parent) {
3717 		if (!nested_supported(iommu) || flags)
3718 			return ERR_PTR(-EOPNOTSUPP);
3719 		return intel_nested_domain_alloc(parent, user_data);
3720 	}
3721 
3722 	if (flags &
3723 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3724 		return ERR_PTR(-EOPNOTSUPP);
3725 	if (nested_parent && !nested_supported(iommu))
3726 		return ERR_PTR(-EOPNOTSUPP);
3727 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3728 		return ERR_PTR(-EOPNOTSUPP);
3729 
3730 	/* Do not use first stage for user domain translation. */
3731 	dmar_domain = paging_domain_alloc(dev, false);
3732 	if (IS_ERR(dmar_domain))
3733 		return ERR_CAST(dmar_domain);
3734 	domain = &dmar_domain->domain;
3735 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3736 	domain->owner = &intel_iommu_ops;
3737 	domain->ops = intel_iommu_ops.default_domain_ops;
3738 
3739 	if (nested_parent) {
3740 		dmar_domain->nested_parent = true;
3741 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3742 		spin_lock_init(&dmar_domain->s1_lock);
3743 	}
3744 
3745 	if (dirty_tracking) {
3746 		if (dmar_domain->use_first_level) {
3747 			iommu_domain_free(domain);
3748 			return ERR_PTR(-EOPNOTSUPP);
3749 		}
3750 		domain->dirty_ops = &intel_dirty_ops;
3751 	}
3752 
3753 	return domain;
3754 }
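/*
 * Illustrative usage sketch only, not a definitive call sequence: an iommufd
 * consumer such as a VMM would typically allocate a nesting parent with dirty
 * tracking by passing
 *
 *	flags = IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING
 *
 * and no user_data (which requires nested_supported() and ssads_supported()
 * per the checks above), and then allocate the stage-1 nested domain by
 * passing that domain as @parent together with the vendor user_data (for
 * VT-d, struct iommu_hwpt_vtd_s1 from include/uapi/linux/iommufd.h).
 */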
3755 
3756 static void intel_iommu_domain_free(struct iommu_domain *domain)
3757 {
3758 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3759 
3760 	WARN_ON(dmar_domain->nested_parent &&
3761 		!list_empty(&dmar_domain->s1_domains));
3762 	if (domain != &si_domain->domain)
3763 		domain_exit(dmar_domain);
3764 }
3765 
3766 int prepare_domain_attach_device(struct iommu_domain *domain,
3767 				 struct device *dev)
3768 {
3769 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3770 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3771 	struct intel_iommu *iommu = info->iommu;
3772 	int addr_width;
3773 
3774 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3775 		return -EINVAL;
3776 
3777 	if (domain->dirty_ops && !ssads_supported(iommu))
3778 		return -EINVAL;
3779 
3780 	/* check if this iommu agaw is sufficient for max mapped address */
3781 	addr_width = agaw_to_width(iommu->agaw);
3782 	if (addr_width > cap_mgaw(iommu->cap))
3783 		addr_width = cap_mgaw(iommu->cap);
3784 
3785 	if (dmar_domain->max_addr > (1LL << addr_width))
3786 		return -EINVAL;
3787 	dmar_domain->gaw = addr_width;
3788 
3789 	/*
3790 	 * Knock out extra levels of page tables if necessary
3791 	 */
3792 	while (iommu->agaw < dmar_domain->agaw) {
3793 		struct dma_pte *pte;
3794 
3795 		pte = dmar_domain->pgd;
3796 		if (dma_pte_present(pte)) {
3797 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3798 			iommu_free_page(pte);
3799 		}
3800 		dmar_domain->agaw--;
3801 	}
3802 
3803 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3804 	    context_copied(iommu, info->bus, info->devfn))
3805 		return intel_pasid_setup_sm_context(dev);
3806 
3807 	return 0;
3808 }
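/*
 * For example, a domain whose page table was sized for 5-level paging that
 * is attached to an IOMMU only capable of 4-level paging gets its gaw
 * clamped to 48 above, and the extra (possibly unpopulated) top level of the
 * page table is discarded until dmar_domain->agaw matches the IOMMU's agaw.
 */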
3809 
3810 static int intel_iommu_attach_device(struct iommu_domain *domain,
3811 				     struct device *dev)
3812 {
3813 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3814 	int ret;
3815 
3816 	if (info->domain)
3817 		device_block_translation(dev);
3818 
3819 	ret = prepare_domain_attach_device(domain, dev);
3820 	if (ret)
3821 		return ret;
3822 
3823 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3824 }
3825 
3826 static int intel_iommu_map(struct iommu_domain *domain,
3827 			   unsigned long iova, phys_addr_t hpa,
3828 			   size_t size, int iommu_prot, gfp_t gfp)
3829 {
3830 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3831 	u64 max_addr;
3832 	int prot = 0;
3833 
3834 	if (iommu_prot & IOMMU_READ)
3835 		prot |= DMA_PTE_READ;
3836 	if (iommu_prot & IOMMU_WRITE)
3837 		prot |= DMA_PTE_WRITE;
3838 	if (dmar_domain->set_pte_snp)
3839 		prot |= DMA_PTE_SNP;
3840 
3841 	max_addr = iova + size;
3842 	if (dmar_domain->max_addr < max_addr) {
3843 		u64 end;
3844 
3845 		/* check if minimum agaw is sufficient for mapped address */
3846 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3847 		if (end < max_addr) {
3848 			pr_err("%s: iommu width (%d) is not "
3849 			       "sufficient for the mapped address (%llx)\n",
3850 			       __func__, dmar_domain->gaw, max_addr);
3851 			return -EFAULT;
3852 		}
3853 		dmar_domain->max_addr = max_addr;
3854 	}
3855 	/* Round up size to next multiple of PAGE_SIZE, if it and
3856 	   the low bits of hpa would take us onto the next page */
3857 	size = aligned_nrpages(hpa, size);
3858 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3859 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3860 }
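/*
 * Illustration of the rounding above: a request of size 0x1000 with the low
 * bits of hpa at 0x800 straddles two 4KiB pages, so aligned_nrpages() yields
 * a page count of 2 and __domain_mapping() is asked for two page frames
 * rather than one.
 */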
3861 
3862 static int intel_iommu_map_pages(struct iommu_domain *domain,
3863 				 unsigned long iova, phys_addr_t paddr,
3864 				 size_t pgsize, size_t pgcount,
3865 				 int prot, gfp_t gfp, size_t *mapped)
3866 {
3867 	unsigned long pgshift = __ffs(pgsize);
3868 	size_t size = pgcount << pgshift;
3869 	int ret;
3870 
3871 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3872 		return -EINVAL;
3873 
3874 	if (!IS_ALIGNED(iova | paddr, pgsize))
3875 		return -EINVAL;
3876 
3877 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3878 	if (!ret && mapped)
3879 		*mapped = size;
3880 
3881 	return ret;
3882 }
3883 
3884 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3885 				unsigned long iova, size_t size,
3886 				struct iommu_iotlb_gather *gather)
3887 {
3888 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3889 	unsigned long start_pfn, last_pfn;
3890 	int level = 0;
3891 
3892 	/* Cope with horrid API which requires us to unmap more than the
3893 	   size argument if it happens to be a large-page mapping. */
3894 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3895 				     &level, GFP_ATOMIC)))
3896 		return 0;
3897 
3898 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3899 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3900 
3901 	start_pfn = iova >> VTD_PAGE_SHIFT;
3902 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3903 
3904 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3905 
3906 	if (dmar_domain->max_addr == iova + size)
3907 		dmar_domain->max_addr = iova;
3908 
3909 	/*
3910 	 * We do not use page-selective IOTLB invalidation in the flush queue,
3911 	 * so there is no need to track the pages and sync the IOTLB here.
3912 	 */
3913 	if (!iommu_iotlb_gather_queued(gather))
3914 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3915 
3916 	return size;
3917 }
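/*
 * Example of the size bump above: a 4KiB unmap request that lands inside a
 * 2MiB superpage finds a level-2 PTE, so size is raised to
 * VTD_PAGE_SIZE << level_to_offset_bits(2) == 2MiB and the whole superpage
 * is torn down; the returned size tells the core how much was actually
 * unmapped.
 */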
3918 
3919 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3920 				      unsigned long iova,
3921 				      size_t pgsize, size_t pgcount,
3922 				      struct iommu_iotlb_gather *gather)
3923 {
3924 	unsigned long pgshift = __ffs(pgsize);
3925 	size_t size = pgcount << pgshift;
3926 
3927 	return intel_iommu_unmap(domain, iova, size, gather);
3928 }
3929 
3930 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3931 				 struct iommu_iotlb_gather *gather)
3932 {
3933 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3934 			      gather->end, list_empty(&gather->freelist));
3935 	iommu_put_pages_list(&gather->freelist);
3936 }
3937 
3938 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3939 					    dma_addr_t iova)
3940 {
3941 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3942 	struct dma_pte *pte;
3943 	int level = 0;
3944 	u64 phys = 0;
3945 
3946 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3947 			     GFP_ATOMIC);
3948 	if (pte && dma_pte_present(pte))
3949 		phys = dma_pte_addr(pte) +
3950 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3951 						VTD_PAGE_SHIFT) - 1));
3952 
3953 	return phys;
3954 }
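/*
 * For instance, when the lookup above lands on a 2MiB superpage (level == 2,
 * i.e. 9 + VTD_PAGE_SHIFT == 21 offset bits), the low 21 bits of the IOVA
 * are carried over: phys = dma_pte_addr(pte) + (iova & 0x1fffff).
 */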
3955 
3956 static bool domain_support_force_snooping(struct dmar_domain *domain)
3957 {
3958 	struct device_domain_info *info;
3959 	bool support = true;
3960 
3961 	assert_spin_locked(&domain->lock);
3962 	list_for_each_entry(info, &domain->devices, link) {
3963 		if (!ecap_sc_support(info->iommu->ecap)) {
3964 			support = false;
3965 			break;
3966 		}
3967 	}
3968 
3969 	return support;
3970 }
3971 
3972 static void domain_set_force_snooping(struct dmar_domain *domain)
3973 {
3974 	struct device_domain_info *info;
3975 
3976 	assert_spin_locked(&domain->lock);
3977 	 * The second-level page table supports per-PTE snoop control. The
3978 	 * iommu_map() interface will handle this by setting the SNP bit.
3979 	 * iommu_map() interface will handle this by setting SNP bit.
3980 	 */
3981 	if (!domain->use_first_level) {
3982 		domain->set_pte_snp = true;
3983 		return;
3984 	}
3985 
3986 	list_for_each_entry(info, &domain->devices, link)
3987 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3988 						     IOMMU_NO_PASID);
3989 }
3990 
3991 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3992 {
3993 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3994 	unsigned long flags;
3995 
3996 	if (dmar_domain->force_snooping)
3997 		return true;
3998 
3999 	spin_lock_irqsave(&dmar_domain->lock, flags);
4000 	if (!domain_support_force_snooping(dmar_domain) ||
4001 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4002 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4003 		return false;
4004 	}
4005 
4006 	domain_set_force_snooping(dmar_domain);
4007 	dmar_domain->force_snooping = true;
4008 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4009 
4010 	return true;
4011 }
4012 
4013 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4014 {
4015 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4016 
4017 	switch (cap) {
4018 	case IOMMU_CAP_CACHE_COHERENCY:
4019 	case IOMMU_CAP_DEFERRED_FLUSH:
4020 		return true;
4021 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4022 		return dmar_platform_optin();
4023 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4024 		return ecap_sc_support(info->iommu->ecap);
4025 	case IOMMU_CAP_DIRTY_TRACKING:
4026 		return ssads_supported(info->iommu);
4027 	default:
4028 		return false;
4029 	}
4030 }
4031 
4032 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4033 {
4034 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4035 	struct device_domain_info *info;
4036 	struct intel_iommu *iommu;
4037 	u8 bus, devfn;
4038 	int ret;
4039 
4040 	iommu = device_lookup_iommu(dev, &bus, &devfn);
4041 	if (!iommu || !iommu->iommu.ops)
4042 		return ERR_PTR(-ENODEV);
4043 
4044 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4045 	if (!info)
4046 		return ERR_PTR(-ENOMEM);
4047 
4048 	if (dev_is_real_dma_subdevice(dev)) {
4049 		info->bus = pdev->bus->number;
4050 		info->devfn = pdev->devfn;
4051 		info->segment = pci_domain_nr(pdev->bus);
4052 	} else {
4053 		info->bus = bus;
4054 		info->devfn = devfn;
4055 		info->segment = iommu->segment;
4056 	}
4057 
4058 	info->dev = dev;
4059 	info->iommu = iommu;
4060 	if (dev_is_pci(dev)) {
4061 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4062 		    pci_ats_supported(pdev) &&
4063 		    dmar_ats_supported(pdev, iommu)) {
4064 			info->ats_supported = 1;
4065 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4066 
4067 			/*
4068 			 * For an IOMMU that supports device IOTLB throttling
4069 			 * (DIT), we assign a PFSID to the invalidation desc
4070 			 * of a VF so that the IOMMU HW can gauge queue depth
4071 			 * at the PF level. If DIT is not set, PFSID is treated
4072 			 * as reserved and should be set to 0.
4073 			 */
4074 			if (ecap_dit(iommu->ecap))
4075 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4076 			info->ats_qdep = pci_ats_queue_depth(pdev);
4077 		}
4078 		if (sm_supported(iommu)) {
4079 			if (pasid_supported(iommu)) {
4080 				int features = pci_pasid_features(pdev);
4081 
4082 				if (features >= 0)
4083 					info->pasid_supported = features | 1;
4084 			}
4085 
4086 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4087 			    pci_pri_supported(pdev))
4088 				info->pri_supported = 1;
4089 		}
4090 	}
4091 
4092 	dev_iommu_priv_set(dev, info);
4093 	if (pdev && pci_ats_supported(pdev)) {
4094 		ret = device_rbtree_insert(iommu, info);
4095 		if (ret)
4096 			goto free;
4097 	}
4098 
4099 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4100 		ret = intel_pasid_alloc_table(dev);
4101 		if (ret) {
4102 			dev_err(dev, "PASID table allocation failed\n");
4103 			goto clear_rbtree;
4104 		}
4105 
4106 		if (!context_copied(iommu, info->bus, info->devfn)) {
4107 			ret = intel_pasid_setup_sm_context(dev);
4108 			if (ret)
4109 				goto free_table;
4110 		}
4111 	}
4112 
4113 	intel_iommu_debugfs_create_dev(info);
4114 
4115 	return &iommu->iommu;
4116 free_table:
4117 	intel_pasid_free_table(dev);
4118 clear_rbtree:
4119 	device_rbtree_remove(info);
4120 free:
4121 	kfree(info);
4122 
4123 	return ERR_PTR(ret);
4124 }
4125 
4126 static void intel_iommu_release_device(struct device *dev)
4127 {
4128 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4129 	struct intel_iommu *iommu = info->iommu;
4130 
4131 	mutex_lock(&iommu->iopf_lock);
4132 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4133 		device_rbtree_remove(info);
4134 	mutex_unlock(&iommu->iopf_lock);
4135 
4136 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4137 	    !context_copied(iommu, info->bus, info->devfn))
4138 		intel_pasid_teardown_sm_context(dev);
4139 
4140 	intel_pasid_free_table(dev);
4141 	intel_iommu_debugfs_remove_dev(info);
4142 	kfree(info);
4143 	set_dma_ops(dev, NULL);
4144 }
4145 
4146 static void intel_iommu_get_resv_regions(struct device *device,
4147 					 struct list_head *head)
4148 {
4149 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4150 	struct iommu_resv_region *reg;
4151 	struct dmar_rmrr_unit *rmrr;
4152 	struct device *i_dev;
4153 	int i;
4154 
4155 	rcu_read_lock();
4156 	for_each_rmrr_units(rmrr) {
4157 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4158 					  i, i_dev) {
4159 			struct iommu_resv_region *resv;
4160 			enum iommu_resv_type type;
4161 			size_t length;
4162 
4163 			if (i_dev != device &&
4164 			    !is_downstream_to_pci_bridge(device, i_dev))
4165 				continue;
4166 
4167 			length = rmrr->end_address - rmrr->base_address + 1;
4168 
4169 			type = device_rmrr_is_relaxable(device) ?
4170 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4171 
4172 			resv = iommu_alloc_resv_region(rmrr->base_address,
4173 						       length, prot, type,
4174 						       GFP_ATOMIC);
4175 			if (!resv)
4176 				break;
4177 
4178 			list_add_tail(&resv->list, head);
4179 		}
4180 	}
4181 	rcu_read_unlock();
4182 
4183 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4184 	if (dev_is_pci(device)) {
4185 		struct pci_dev *pdev = to_pci_dev(device);
4186 
4187 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4188 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4189 					IOMMU_RESV_DIRECT_RELAXABLE,
4190 					GFP_KERNEL);
4191 			if (reg)
4192 				list_add_tail(&reg->list, head);
4193 		}
4194 	}
4195 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4196 
4197 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4198 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4199 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4200 	if (!reg)
4201 		return;
4202 	list_add_tail(&reg->list, head);
4203 }
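/*
 * Summary of what the callback above reports: each matching RMRR becomes a
 * direct-mapped region (relaxable when device_rmrr_is_relaxable()), the
 * optional ISA-bridge workaround direct-maps the first 1UL << 24 bytes
 * (16MiB) for legacy floppy DMA, and the final entry reserves the IOAPIC
 * window as an MSI region so it is never handed out as IOVA space.
 */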
4204 
4205 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4206 {
4207 	if (dev_is_pci(dev))
4208 		return pci_device_group(dev);
4209 	return generic_device_group(dev);
4210 }
4211 
4212 static int intel_iommu_enable_sva(struct device *dev)
4213 {
4214 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4215 	struct intel_iommu *iommu;
4216 
4217 	if (!info || dmar_disabled)
4218 		return -EINVAL;
4219 
4220 	iommu = info->iommu;
4221 	if (!iommu)
4222 		return -EINVAL;
4223 
4224 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4225 		return -ENODEV;
4226 
4227 	if (!info->pasid_enabled || !info->ats_enabled)
4228 		return -EINVAL;
4229 
4230 	/*
4231 	 * Devices with device-specific I/O fault handling should not
4232 	 * support PCI/PRI. The IOMMU side has no means to check the
4233 	 * capability of device-specific IOPF. Therefore, the IOMMU can only
4234 	 * assume that if the device driver enables SVA on a non-PRI device,
4235 	 * it will handle IOPF in its own way.
4236 	 */
4237 	if (!info->pri_supported)
4238 		return 0;
4239 
4240 	/* Devices supporting PRI should have it enabled. */
4241 	if (!info->pri_enabled)
4242 		return -EINVAL;
4243 
4244 	return 0;
4245 }
4246 
4247 static int context_flip_pri(struct device_domain_info *info, bool enable)
4248 {
4249 	struct intel_iommu *iommu = info->iommu;
4250 	u8 bus = info->bus, devfn = info->devfn;
4251 	struct context_entry *context;
4252 
4253 	spin_lock(&iommu->lock);
4254 	if (context_copied(iommu, bus, devfn)) {
4255 		spin_unlock(&iommu->lock);
4256 		return -EINVAL;
4257 	}
4258 
4259 	context = iommu_context_addr(iommu, bus, devfn, false);
4260 	if (!context || !context_present(context)) {
4261 		spin_unlock(&iommu->lock);
4262 		return -ENODEV;
4263 	}
4264 
4265 	if (enable)
4266 		context_set_sm_pre(context);
4267 	else
4268 		context_clear_sm_pre(context);
4269 
4270 	if (!ecap_coherent(iommu->ecap))
4271 		clflush_cache_range(context, sizeof(*context));
4272 	intel_context_flush_present(info, context, true);
4273 	spin_unlock(&iommu->lock);
4274 
4275 	return 0;
4276 }
4277 
4278 static int intel_iommu_enable_iopf(struct device *dev)
4279 {
4280 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4281 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4282 	struct intel_iommu *iommu;
4283 	int ret;
4284 
4285 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4286 		return -ENODEV;
4287 
4288 	if (info->pri_enabled)
4289 		return -EBUSY;
4290 
4291 	iommu = info->iommu;
4292 	if (!iommu)
4293 		return -EINVAL;
4294 
4295 	/* PASID is required in PRG Response Message. */
4296 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4297 		return -EINVAL;
4298 
4299 	ret = pci_reset_pri(pdev);
4300 	if (ret)
4301 		return ret;
4302 
4303 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4304 	if (ret)
4305 		return ret;
4306 
4307 	ret = context_flip_pri(info, true);
4308 	if (ret)
4309 		goto err_remove_device;
4310 
4311 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4312 	if (ret)
4313 		goto err_clear_pri;
4314 
4315 	info->pri_enabled = 1;
4316 
4317 	return 0;
4318 err_clear_pri:
4319 	context_flip_pri(info, false);
4320 err_remove_device:
4321 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4322 
4323 	return ret;
4324 }
4325 
4326 static int intel_iommu_disable_iopf(struct device *dev)
4327 {
4328 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4329 	struct intel_iommu *iommu = info->iommu;
4330 
4331 	if (!info->pri_enabled)
4332 		return -EINVAL;
4333 
4334 	/* Disable new PRI reception: */
4335 	context_flip_pri(info, false);
4336 
4337 	/*
4338 	 * Remove device from fault queue and acknowledge all outstanding
4339 	 * PRQs to the device:
4340 	 */
4341 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4342 
4343 	/*
4344 	 * The PCIe spec states that, once the PRI enable bit is cleared, the
4345 	 * Page Request Interface will not issue new page requests, but may
4346 	 * still have outstanding page requests that have been transmitted or
4347 	 * are queued for transmission. This is supposed to be called after
4348 	 * the device driver has stopped DMA, all PASIDs have been unbound
4349 	 * and the outstanding PRQs have been drained.
4350 	 */
4351 	pci_disable_pri(to_pci_dev(dev));
4352 	info->pri_enabled = 0;
4353 
4354 	return 0;
4355 }
4356 
4357 static int
4358 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4359 {
4360 	switch (feat) {
4361 	case IOMMU_DEV_FEAT_IOPF:
4362 		return intel_iommu_enable_iopf(dev);
4363 
4364 	case IOMMU_DEV_FEAT_SVA:
4365 		return intel_iommu_enable_sva(dev);
4366 
4367 	default:
4368 		return -ENODEV;
4369 	}
4370 }
4371 
4372 static int
4373 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4374 {
4375 	switch (feat) {
4376 	case IOMMU_DEV_FEAT_IOPF:
4377 		return intel_iommu_disable_iopf(dev);
4378 
4379 	case IOMMU_DEV_FEAT_SVA:
4380 		return 0;
4381 
4382 	default:
4383 		return -ENODEV;
4384 	}
4385 }
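/*
 * Illustrative sketch of how a driver is expected to reach the two callbacks
 * above through the core API, assuming a PRI-capable device with ATS and
 * PASID already enabled:
 *
 *	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 *	if (!ret)
 *		ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *
 * intel_iommu_enable_sva() insists that PRI is already enabled on
 * PRI-capable devices, so IOMMU_DEV_FEAT_IOPF must be enabled first and
 * disabled only after IOMMU_DEV_FEAT_SVA has been torn down.
 */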
4386 
4387 static bool intel_iommu_is_attach_deferred(struct device *dev)
4388 {
4389 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4390 
4391 	return translation_pre_enabled(info->iommu) && !info->domain;
4392 }
4393 
4394 /*
4395  * Check that the device does not live on an external-facing PCI port that
4396  * is marked as untrusted. Such devices must not be allowed to apply quirks,
4397  * and thus must not be able to bypass the IOMMU restrictions.
4398  */
4399 static bool risky_device(struct pci_dev *pdev)
4400 {
4401 	if (pdev->untrusted) {
4402 		pci_info(pdev,
4403 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4404 			 pdev->vendor, pdev->device);
4405 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4406 		return true;
4407 	}
4408 	return false;
4409 }
4410 
4411 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4412 				      unsigned long iova, size_t size)
4413 {
4414 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4415 
4416 	return 0;
4417 }
4418 
4419 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4420 					 struct iommu_domain *domain)
4421 {
4422 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4423 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4424 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4425 	struct intel_iommu *iommu = info->iommu;
4426 	unsigned long flags;
4427 
4428 	spin_lock_irqsave(&dmar_domain->lock, flags);
4429 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4430 		if (curr->dev == dev && curr->pasid == pasid) {
4431 			list_del(&curr->link_domain);
4432 			dev_pasid = curr;
4433 			break;
4434 		}
4435 	}
4436 	WARN_ON_ONCE(!dev_pasid);
4437 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4438 
4439 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4440 	domain_detach_iommu(dmar_domain, iommu);
4441 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4442 	kfree(dev_pasid);
4443 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4444 	intel_drain_pasid_prq(dev, pasid);
4445 }
4446 
4447 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4448 				     struct device *dev, ioasid_t pasid)
4449 {
4450 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4451 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4452 	struct intel_iommu *iommu = info->iommu;
4453 	struct dev_pasid_info *dev_pasid;
4454 	unsigned long flags;
4455 	int ret;
4456 
4457 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4458 		return -EOPNOTSUPP;
4459 
4460 	if (domain->dirty_ops)
4461 		return -EINVAL;
4462 
4463 	if (context_copied(iommu, info->bus, info->devfn))
4464 		return -EBUSY;
4465 
4466 	ret = prepare_domain_attach_device(domain, dev);
4467 	if (ret)
4468 		return ret;
4469 
4470 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4471 	if (!dev_pasid)
4472 		return -ENOMEM;
4473 
4474 	ret = domain_attach_iommu(dmar_domain, iommu);
4475 	if (ret)
4476 		goto out_free;
4477 
4478 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4479 	if (ret)
4480 		goto out_detach_iommu;
4481 
4482 	if (domain_type_is_si(dmar_domain))
4483 		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4484 	else if (dmar_domain->use_first_level)
4485 		ret = domain_setup_first_level(iommu, dmar_domain,
4486 					       dev, pasid);
4487 	else
4488 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4489 						     dev, pasid);
4490 	if (ret)
4491 		goto out_unassign_tag;
4492 
4493 	dev_pasid->dev = dev;
4494 	dev_pasid->pasid = pasid;
4495 	spin_lock_irqsave(&dmar_domain->lock, flags);
4496 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4497 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4498 
4499 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4500 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4501 
4502 	return 0;
4503 out_unassign_tag:
4504 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4505 out_detach_iommu:
4506 	domain_detach_iommu(dmar_domain, iommu);
4507 out_free:
4508 	kfree(dev_pasid);
4509 	return ret;
4510 }
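/*
 * Illustrative note: the callback above is reached when the IOMMU core binds
 * a paging domain to a specific PASID of a device, e.g. via
 * iommu_attach_device_pasid(). Depending on the domain it programs a
 * pass-through, first-level or second-level PASID table entry, mirroring the
 * teardown done in intel_iommu_remove_dev_pasid().
 */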
4511 
4512 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4513 {
4514 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4515 	struct intel_iommu *iommu = info->iommu;
4516 	struct iommu_hw_info_vtd *vtd;
4517 
4518 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4519 	if (!vtd)
4520 		return ERR_PTR(-ENOMEM);
4521 
4522 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4523 	vtd->cap_reg = iommu->cap;
4524 	vtd->ecap_reg = iommu->ecap;
4525 	*length = sizeof(*vtd);
4526 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4527 	return vtd;
4528 }
4529 
4530 /*
4531  * Set dirty tracking for the device list of a domain. The caller must
4532  * hold the domain->lock when calling it.
4533  */
4534 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4535 {
4536 	struct device_domain_info *info;
4537 	int ret = 0;
4538 
4539 	list_for_each_entry(info, devices, link) {
4540 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4541 						       IOMMU_NO_PASID, enable);
4542 		if (ret)
4543 			break;
4544 	}
4545 
4546 	return ret;
4547 }
4548 
4549 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4550 					    bool enable)
4551 {
4552 	struct dmar_domain *s1_domain;
4553 	unsigned long flags;
4554 	int ret;
4555 
4556 	spin_lock(&domain->s1_lock);
4557 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4558 		spin_lock_irqsave(&s1_domain->lock, flags);
4559 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4560 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4561 		if (ret)
4562 			goto err_unwind;
4563 	}
4564 	spin_unlock(&domain->s1_lock);
4565 	return 0;
4566 
4567 err_unwind:
4568 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4569 		spin_lock_irqsave(&s1_domain->lock, flags);
4570 		device_set_dirty_tracking(&s1_domain->devices,
4571 					  domain->dirty_tracking);
4572 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4573 	}
4574 	spin_unlock(&domain->s1_lock);
4575 	return ret;
4576 }
4577 
4578 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4579 					  bool enable)
4580 {
4581 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4582 	int ret;
4583 
4584 	spin_lock(&dmar_domain->lock);
4585 	if (dmar_domain->dirty_tracking == enable)
4586 		goto out_unlock;
4587 
4588 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4589 	if (ret)
4590 		goto err_unwind;
4591 
4592 	if (dmar_domain->nested_parent) {
4593 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4594 		if (ret)
4595 			goto err_unwind;
4596 	}
4597 
4598 	dmar_domain->dirty_tracking = enable;
4599 out_unlock:
4600 	spin_unlock(&dmar_domain->lock);
4601 
4602 	return 0;
4603 
4604 err_unwind:
4605 	device_set_dirty_tracking(&dmar_domain->devices,
4606 				  dmar_domain->dirty_tracking);
4607 	spin_unlock(&dmar_domain->lock);
4608 	return ret;
4609 }
4610 
4611 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4612 					    unsigned long iova, size_t size,
4613 					    unsigned long flags,
4614 					    struct iommu_dirty_bitmap *dirty)
4615 {
4616 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4617 	unsigned long end = iova + size - 1;
4618 	unsigned long pgsize;
4619 
4620 	/*
4621 	 * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4622 	 * an IOVA bitmap set in order to clear the dirty bits in all PTEs that
4623 	 * might have been set when we stopped dirty tracking. This ensures
4624 	 * that we never inherit dirtied bits from a previous cycle.
4625 	 */
4626 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4627 		return -EINVAL;
4628 
4629 	do {
4630 		struct dma_pte *pte;
4631 		int lvl = 0;
4632 
4633 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4634 				     GFP_ATOMIC);
4635 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4636 		if (!pte || !dma_pte_present(pte)) {
4637 			iova += pgsize;
4638 			continue;
4639 		}
4640 
4641 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4642 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4643 		iova += pgsize;
4644 	} while (iova < end);
4645 
4646 	return 0;
4647 }
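/*
 * Walk example for the loop above: with a dirty 4KiB PTE at IOVA 0x1000 and
 * a clean 2MiB superpage elsewhere in the range, only [0x1000, 0x2000) is
 * recorded in the dirty bitmap, that PTE's dirty bit is cleared (unless the
 * flags ask for it to be left set), and non-present ranges are skipped in
 * steps of the page size at the level where the walk stopped.
 */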
4648 
4649 static const struct iommu_dirty_ops intel_dirty_ops = {
4650 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4651 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4652 };
4653 
4654 const struct iommu_ops intel_iommu_ops = {
4655 	.blocked_domain		= &blocking_domain,
4656 	.release_domain		= &blocking_domain,
4657 	.capable		= intel_iommu_capable,
4658 	.hw_info		= intel_iommu_hw_info,
4659 	.domain_alloc		= intel_iommu_domain_alloc,
4660 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4661 	.domain_alloc_sva	= intel_svm_domain_alloc,
4662 	.probe_device		= intel_iommu_probe_device,
4663 	.release_device		= intel_iommu_release_device,
4664 	.get_resv_regions	= intel_iommu_get_resv_regions,
4665 	.device_group		= intel_iommu_device_group,
4666 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4667 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4668 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4669 	.def_domain_type	= device_def_domain_type,
4670 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4671 	.pgsize_bitmap		= SZ_4K,
4672 #ifdef CONFIG_INTEL_IOMMU_SVM
4673 	.page_response		= intel_svm_page_response,
4674 #endif
4675 	.default_domain_ops = &(const struct iommu_domain_ops) {
4676 		.attach_dev		= intel_iommu_attach_device,
4677 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4678 		.map_pages		= intel_iommu_map_pages,
4679 		.unmap_pages		= intel_iommu_unmap_pages,
4680 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4681 		.flush_iotlb_all        = intel_flush_iotlb_all,
4682 		.iotlb_sync		= intel_iommu_tlb_sync,
4683 		.iova_to_phys		= intel_iommu_iova_to_phys,
4684 		.free			= intel_iommu_domain_free,
4685 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4686 	}
4687 };
4688 
4689 static void quirk_iommu_igfx(struct pci_dev *dev)
4690 {
4691 	if (risky_device(dev))
4692 		return;
4693 
4694 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4695 	disable_igfx_iommu = 1;
4696 }
4697 
4698 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4706 
4707 /* Broadwell igfx malfunctions with dmar */
4708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4730 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4732 
4733 static void quirk_iommu_rwbf(struct pci_dev *dev)
4734 {
4735 	if (risky_device(dev))
4736 		return;
4737 
4738 	/*
4739 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4740 	 * but needs it. Same seems to hold for the desktop versions.
4741 	 */
4742 	pci_info(dev, "Forcing write-buffer flush capability\n");
4743 	rwbf_quirk = 1;
4744 }
4745 
4746 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4747 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4748 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4749 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4750 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4751 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4753 
4754 #define GGC 0x52
4755 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4756 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4757 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4758 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4759 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4760 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4761 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4762 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4763 
4764 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4765 {
4766 	unsigned short ggc;
4767 
4768 	if (risky_device(dev))
4769 		return;
4770 
4771 	if (pci_read_config_word(dev, GGC, &ggc))
4772 		return;
4773 
4774 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4775 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4776 		disable_igfx_iommu = 1;
4777 	} else if (!disable_igfx_iommu) {
4778 		/* we have to ensure the gfx device is idle before we flush */
4779 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4780 		iommu_set_dma_strict();
4781 	}
4782 }
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
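/*
 * Decode example for the quirk above: if bits 11:8 of GGC read as
 * GGC_MEMORY_SIZE_2M (VT bit clear), the BIOS allocated graphics stolen
 * memory but no VT-enabled shadow GTT, so the graphics IOMMU is disabled;
 * if they read as GGC_MEMORY_SIZE_2M_VT (VT bit set), the IOMMU stays on
 * but IOTLB flushing is switched to strict mode on these Ironlake parts.
 */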
4787 
4788 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4789 {
4790 	unsigned short ver;
4791 
4792 	if (!IS_GFX_DEVICE(dev))
4793 		return;
4794 
4795 	ver = (dev->device >> 8) & 0xff;
4796 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4797 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4798 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4799 		return;
4800 
4801 	if (risky_device(dev))
4802 		return;
4803 
4804 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4805 	iommu_skip_te_disable = 1;
4806 }
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4808 
4809 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4810    ISOCH DMAR unit for the Azalia sound device, but not give it any
4811    TLB entries, which causes it to deadlock. Check for that.  We do
4812    this in a function called from init_dmars(), instead of in a PCI
4813    quirk, because we don't want to print the obnoxious "BIOS broken"
4814    message if VT-d is actually disabled.
4815 */
4816 static void __init check_tylersburg_isoch(void)
4817 {
4818 	struct pci_dev *pdev;
4819 	uint32_t vtisochctrl;
4820 
4821 	/* If there's no Azalia in the system anyway, forget it. */
4822 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4823 	if (!pdev)
4824 		return;
4825 
4826 	if (risky_device(pdev)) {
4827 		pci_dev_put(pdev);
4828 		return;
4829 	}
4830 
4831 	pci_dev_put(pdev);
4832 
4833 	/* System Management Registers. Might be hidden, in which case
4834 	   we can't do the sanity check. But that's OK, because the
4835 	   known-broken BIOSes _don't_ actually hide it, so far. */
4836 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4837 	if (!pdev)
4838 		return;
4839 
4840 	if (risky_device(pdev)) {
4841 		pci_dev_put(pdev);
4842 		return;
4843 	}
4844 
4845 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4846 		pci_dev_put(pdev);
4847 		return;
4848 	}
4849 
4850 	pci_dev_put(pdev);
4851 
4852 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4853 	if (vtisochctrl & 1)
4854 		return;
4855 
4856 	/* Drop all bits other than the number of TLB entries */
4857 	vtisochctrl &= 0x1c;
4858 
4859 	/* If we have the recommended number of TLB entries (16), fine. */
4860 	if (vtisochctrl == 0x10)
4861 		return;
4862 
4863 	/* Zero TLB entries? You get to ride the short bus to school. */
4864 	if (!vtisochctrl) {
4865 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4866 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4867 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4868 		     dmi_get_system_info(DMI_BIOS_VERSION),
4869 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4870 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4871 		return;
4872 	}
4873 
4874 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4875 	       vtisochctrl);
4876 }
4877 
4878 /*
4879  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4880  * Here we deal with a device TLB defect where the device may inadvertently
4881  * issue an ATS invalidation completion before posted writes, initiated with
4882  * a translated address that utilized translations matching the invalidation
4883  * address range, have been committed, violating the invalidation completion
4884  * ordering. Therefore, any use case that cannot guarantee DMA is stopped
4885  * before unmap is vulnerable to this defect. In other words, any dTLB
4886  * invalidation initiated not under the control of the trusted/privileged
4887  * host device driver must use this quirk.
4888  * 1. Device driver does DMA API unmap IOVA
4889  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4890  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4891  *    exit_mmap() due to crash
4892  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4893  *    VM has to free pages that were unmapped
4894  * 5. Userspace driver unmaps a DMA buffer
4895  * 6. Cache invalidation in vSVA usage (upcoming)
4896  *
4897  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4898  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
4899  * invalidate the TLB the same way as a normal user unmap, which will use this
4900  * quirk. The dTLB invalidation after the PASID cache flush does not need it.
4901  *
4902  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4903  */
4904 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4905 			       unsigned long address, unsigned long mask,
4906 			       u32 pasid, u16 qdep)
4907 {
4908 	u16 sid;
4909 
4910 	if (likely(!info->dtlb_extra_inval))
4911 		return;
4912 
4913 	sid = PCI_DEVID(info->bus, info->devfn);
4914 	if (pasid == IOMMU_NO_PASID) {
4915 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4916 				   qdep, address, mask);
4917 	} else {
4918 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4919 					 pasid, qdep, address, mask);
4920 	}
4921 }
4922 
4923 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4924 
4925 /*
4926  * Function to submit a command to the enhanced command interface. The
4927  * valid enhanced command descriptions are defined in Table 47 of the
4928  * VT-d spec. The VT-d hardware implementation may support some but not
4929  * all commands, which can be determined by checking the Enhanced
4930  * Command Capability Register.
4931  *
4932  * Return values:
4933  *  - 0: Command successful without any error;
4934  *  - Negative: software error value;
4935  *  - Nonzero positive: failure status code defined in Table 48.
4936  */
4937 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4938 {
4939 	unsigned long flags;
4940 	u64 res;
4941 	int ret;
4942 
4943 	if (!cap_ecmds(iommu->cap))
4944 		return -ENODEV;
4945 
4946 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4947 
4948 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4949 	if (res & DMA_ECMD_ECRSP_IP) {
4950 		ret = -EBUSY;
4951 		goto err;
4952 	}
4953 
4954 	/*
4955 	 * Unconditionally write the operand B, because
4956 	 * - There is no side effect if an ecmd doesn't require an
4957 	 *   operand B, but we set the register to some value.
4958 	 * - It's not invoked in any critical path. The extra MMIO
4959 	 *   write doesn't bring any performance concerns.
4960 	 */
4961 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4962 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4963 
4964 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4965 		      !(res & DMA_ECMD_ECRSP_IP), res);
4966 
4967 	if (res & DMA_ECMD_ECRSP_IP) {
4968 		ret = -ETIMEDOUT;
4969 		goto err;
4970 	}
4971 
4972 	ret = ecmd_get_status_code(res);
4973 err:
4974 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4975 
4976 	return ret;
4977 }
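/*
 * Interpretation example for the return value above: a DMAR_ECRSP_REG
 * readback of 0x0 after the wait decodes to status code 0 (success), while
 * e.g. 0x2 decodes to failure status 1 as defined in Table 48; callers
 * should treat any positive return as a hardware-reported failure and any
 * negative return as a software error.
 */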
4978