xref: /linux/drivers/iommu/intel/iommu.c (revision 170aafe35cb98e0f3fbacb446ea86389fbce22ea)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers. @pdev is parenthesized in every
 * expansion so that any pointer-valued expression (e.g. "p + 1") can be
 * passed without operator-precedence surprises.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48 
/* Highest page frame number / byte address reachable with a @gaw-bit width. */
#define __DOMAIN_MAX_PFN(gaw)	((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw)	((((uint64_t)1) << (gaw)) - 1)

/*
 * DOMAIN_MAX_PFN is clamped so that it always fits in an unsigned long,
 * which lets the rest of the file use 'unsigned long' for PFNs freely.
 * DOMAIN_MAX_ADDR is the base address of the page at __DOMAIN_MAX_PFN.
 */
#define DOMAIN_MAX_PFN(gaw) \
	((unsigned long) min_t(uint64_t, __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw) \
	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57 
58 static void __init check_tylersburg_isoch(void);
59 static int rwbf_quirk;
60 
61 /*
62  * set to 1 to panic kernel if can't successfully enable VT-d
63  * (used when kernel is launched w/ TXT)
64  */
65 static int force_on = 0;
66 static int intel_iommu_tboot_noforce;
67 static int no_platform_optin;
68 
69 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if its possibly the case.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
176 static struct dmar_domain *si_domain;
177 static int hw_pass_through = 1;
178 
179 struct dmar_rmrr_unit {
180 	struct list_head list;		/* list of rmrr units	*/
181 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
182 	u64	base_address;		/* reserved base address*/
183 	u64	end_address;		/* reserved end address */
184 	struct dmar_dev_scope *devices;	/* target devices */
185 	int	devices_cnt;		/* target device count */
186 };
187 
188 struct dmar_atsr_unit {
189 	struct list_head list;		/* list of ATSR units */
190 	struct acpi_dmar_header *hdr;	/* ACPI header */
191 	struct dmar_dev_scope *devices;	/* target devices */
192 	int devices_cnt;		/* target device count */
193 	u8 include_all:1;		/* include all ports */
194 };
195 
196 struct dmar_satc_unit {
197 	struct list_head list;		/* list of SATC units */
198 	struct acpi_dmar_header *hdr;	/* ACPI header */
199 	struct dmar_dev_scope *devices;	/* target devices */
200 	struct intel_iommu *iommu;	/* the corresponding iommu */
201 	int devices_cnt;		/* target device count */
202 	u8 atc_required:1;		/* ATS is required */
203 };
204 
205 static LIST_HEAD(dmar_atsr_units);
206 static LIST_HEAD(dmar_rmrr_units);
207 static LIST_HEAD(dmar_satc_units);
208 
209 #define for_each_rmrr_units(rmrr) \
210 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
211 
212 static void intel_iommu_domain_free(struct iommu_domain *domain);
213 
214 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
215 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
216 
217 int intel_iommu_enabled = 0;
218 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
219 
220 static int intel_iommu_superpage = 1;
221 static int iommu_identity_mapping;
222 static int iommu_skip_te_disable;
223 static int disable_igfx_iommu;
224 
225 #define IDENTMAP_AZALIA		4
226 
227 const struct iommu_ops intel_iommu_ops;
228 static const struct iommu_dirty_ops intel_dirty_ops;
229 
230 static bool translation_pre_enabled(struct intel_iommu *iommu)
231 {
232 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
233 }
234 
235 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
236 {
237 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
240 static void init_translation_status(struct intel_iommu *iommu)
241 {
242 	u32 gsts;
243 
244 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
245 	if (gsts & DMA_GSTS_TES)
246 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
247 }
248 
249 static int __init intel_iommu_setup(char *str)
250 {
251 	if (!str)
252 		return -EINVAL;
253 
254 	while (*str) {
255 		if (!strncmp(str, "on", 2)) {
256 			dmar_disabled = 0;
257 			pr_info("IOMMU enabled\n");
258 		} else if (!strncmp(str, "off", 3)) {
259 			dmar_disabled = 1;
260 			no_platform_optin = 1;
261 			pr_info("IOMMU disabled\n");
262 		} else if (!strncmp(str, "igfx_off", 8)) {
263 			disable_igfx_iommu = 1;
264 			pr_info("Disable GFX device mapping\n");
265 		} else if (!strncmp(str, "forcedac", 8)) {
266 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
267 			iommu_dma_forcedac = true;
268 		} else if (!strncmp(str, "strict", 6)) {
269 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
270 			iommu_set_dma_strict();
271 		} else if (!strncmp(str, "sp_off", 6)) {
272 			pr_info("Disable supported super page\n");
273 			intel_iommu_superpage = 0;
274 		} else if (!strncmp(str, "sm_on", 5)) {
275 			pr_info("Enable scalable mode if hardware supports\n");
276 			intel_iommu_sm = 1;
277 		} else if (!strncmp(str, "sm_off", 6)) {
278 			pr_info("Scalable mode is disallowed\n");
279 			intel_iommu_sm = 0;
280 		} else if (!strncmp(str, "tboot_noforce", 13)) {
281 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
282 			intel_iommu_tboot_noforce = 1;
283 		} else {
284 			pr_notice("Unknown option - '%s'\n", str);
285 		}
286 
287 		str += strcspn(str, ",");
288 		while (*str == ',')
289 			str++;
290 	}
291 
292 	return 1;
293 }
294 __setup("intel_iommu=", intel_iommu_setup);
295 
296 static int domain_type_is_si(struct dmar_domain *domain)
297 {
298 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
299 }
300 
301 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
302 {
303 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
304 
305 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
306 }
307 
308 /*
309  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
310  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
311  * the returned SAGAW.
312  */
313 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
314 {
315 	unsigned long fl_sagaw, sl_sagaw;
316 
317 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
318 	sl_sagaw = cap_sagaw(iommu->cap);
319 
320 	/* Second level only. */
321 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
322 		return sl_sagaw;
323 
324 	/* First level only. */
325 	if (!ecap_slts(iommu->ecap))
326 		return fl_sagaw;
327 
328 	return fl_sagaw & sl_sagaw;
329 }
330 
/*
 * Return the largest AGAW not exceeding width_to_agaw(@max_gaw) whose bit
 * is set in the SAGAW of @iommu, or -1 if there is none.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long supported = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &supported))
		agaw--;

	return agaw;
}
344 
345 /*
346  * Calculate max SAGAW for each iommu.
347  */
348 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
349 {
350 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
351 }
352 
353 /*
354  * calculate agaw for each iommu.
355  * "SAGAW" may be different across iommus, use a default agaw, and
356  * get a supported less agaw for iommus that don't support the default agaw.
357  */
358 int iommu_calculate_agaw(struct intel_iommu *iommu)
359 {
360 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
361 }
362 
363 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
364 {
365 	return sm_supported(iommu) ?
366 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
367 }
368 
369 static void domain_update_iommu_coherency(struct dmar_domain *domain)
370 {
371 	struct iommu_domain_info *info;
372 	struct dmar_drhd_unit *drhd;
373 	struct intel_iommu *iommu;
374 	bool found = false;
375 	unsigned long i;
376 
377 	domain->iommu_coherency = true;
378 	xa_for_each(&domain->iommu_array, i, info) {
379 		found = true;
380 		if (!iommu_paging_structure_coherency(info->iommu)) {
381 			domain->iommu_coherency = false;
382 			break;
383 		}
384 	}
385 	if (found)
386 		return;
387 
388 	/* No hardware attached; use lowest common denominator */
389 	rcu_read_lock();
390 	for_each_active_iommu(iommu, drhd) {
391 		if (!iommu_paging_structure_coherency(iommu)) {
392 			domain->iommu_coherency = false;
393 			break;
394 		}
395 	}
396 	rcu_read_unlock();
397 }
398 
399 static int domain_update_iommu_superpage(struct dmar_domain *domain,
400 					 struct intel_iommu *skip)
401 {
402 	struct dmar_drhd_unit *drhd;
403 	struct intel_iommu *iommu;
404 	int mask = 0x3;
405 
406 	if (!intel_iommu_superpage)
407 		return 0;
408 
409 	/* set iommu_superpage to the smallest common denominator */
410 	rcu_read_lock();
411 	for_each_active_iommu(iommu, drhd) {
412 		if (iommu != skip) {
413 			if (domain && domain->use_first_level) {
414 				if (!cap_fl1gp_support(iommu->cap))
415 					mask = 0x1;
416 			} else {
417 				mask &= cap_super_page_val(iommu->cap);
418 			}
419 
420 			if (!mask)
421 				break;
422 		}
423 	}
424 	rcu_read_unlock();
425 
426 	return fls(mask);
427 }
428 
429 static int domain_update_device_node(struct dmar_domain *domain)
430 {
431 	struct device_domain_info *info;
432 	int nid = NUMA_NO_NODE;
433 	unsigned long flags;
434 
435 	spin_lock_irqsave(&domain->lock, flags);
436 	list_for_each_entry(info, &domain->devices, link) {
437 		/*
438 		 * There could possibly be multiple device numa nodes as devices
439 		 * within the same domain may sit behind different IOMMUs. There
440 		 * isn't perfect answer in such situation, so we select first
441 		 * come first served policy.
442 		 */
443 		nid = dev_to_node(info->dev);
444 		if (nid != NUMA_NO_NODE)
445 			break;
446 	}
447 	spin_unlock_irqrestore(&domain->lock, flags);
448 
449 	return nid;
450 }
451 
452 /* Return the super pagesize bitmap if supported. */
453 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
454 {
455 	unsigned long bitmap = 0;
456 
457 	/*
458 	 * 1-level super page supports page size of 2MiB, 2-level super page
459 	 * supports page size of both 2MiB and 1GiB.
460 	 */
461 	if (domain->iommu_superpage == 1)
462 		bitmap |= SZ_2M;
463 	else if (domain->iommu_superpage == 2)
464 		bitmap |= SZ_2M | SZ_1G;
465 
466 	return bitmap;
467 }
468 
469 /* Some capabilities may be different across iommus */
470 void domain_update_iommu_cap(struct dmar_domain *domain)
471 {
472 	domain_update_iommu_coherency(domain);
473 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
474 
475 	/*
476 	 * If RHSA is missing, we should default to the device numa domain
477 	 * as fall back.
478 	 */
479 	if (domain->nid == NUMA_NO_NODE)
480 		domain->nid = domain_update_device_node(domain);
481 
482 	/*
483 	 * First-level translation restricts the input-address to a
484 	 * canonical address (i.e., address bits 63:N have the same
485 	 * value as address bit [N-1], where N is 48-bits with 4-level
486 	 * paging and 57-bits with 5-level paging). Hence, skip bit
487 	 * [N-1].
488 	 */
489 	if (domain->use_first_level)
490 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
491 	else
492 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
493 
494 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
495 	domain_update_iotlb(domain);
496 }
497 
498 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
499 					 u8 devfn, int alloc)
500 {
501 	struct root_entry *root = &iommu->root_entry[bus];
502 	struct context_entry *context;
503 	u64 *entry;
504 
505 	/*
506 	 * Except that the caller requested to allocate a new entry,
507 	 * returning a copied context entry makes no sense.
508 	 */
509 	if (!alloc && context_copied(iommu, bus, devfn))
510 		return NULL;
511 
512 	entry = &root->lo;
513 	if (sm_supported(iommu)) {
514 		if (devfn >= 0x80) {
515 			devfn -= 0x80;
516 			entry = &root->hi;
517 		}
518 		devfn *= 2;
519 	}
520 	if (*entry & 1)
521 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
522 	else {
523 		unsigned long phy_addr;
524 		if (!alloc)
525 			return NULL;
526 
527 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
528 		if (!context)
529 			return NULL;
530 
531 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
532 		phy_addr = virt_to_phys((void *)context);
533 		*entry = phy_addr | 1;
534 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
535 	}
536 	return &context[devfn];
537 }
538 
539 /**
540  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
541  *				 sub-hierarchy of a candidate PCI-PCI bridge
542  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
543  * @bridge: the candidate PCI-PCI bridge
544  *
545  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
546  */
547 static bool
548 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
549 {
550 	struct pci_dev *pdev, *pbridge;
551 
552 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
553 		return false;
554 
555 	pdev = to_pci_dev(dev);
556 	pbridge = to_pci_dev(bridge);
557 
558 	if (pbridge->subordinate &&
559 	    pbridge->subordinate->number <= pdev->bus->number &&
560 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
561 		return true;
562 
563 	return false;
564 }
565 
566 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
567 {
568 	struct dmar_drhd_unit *drhd;
569 	u32 vtbar;
570 	int rc;
571 
572 	/* We know that this device on this chipset has its own IOMMU.
573 	 * If we find it under a different IOMMU, then the BIOS is lying
574 	 * to us. Hope that the IOMMU for this device is actually
575 	 * disabled, and it needs no translation...
576 	 */
577 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
578 	if (rc) {
579 		/* "can't" happen */
580 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
581 		return false;
582 	}
583 	vtbar &= 0xffff0000;
584 
585 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
586 	drhd = dmar_find_matched_drhd_unit(pdev);
587 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
588 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
589 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
590 		return true;
591 	}
592 
593 	return false;
594 }
595 
596 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
597 {
598 	if (!iommu || iommu->drhd->ignored)
599 		return true;
600 
601 	if (dev_is_pci(dev)) {
602 		struct pci_dev *pdev = to_pci_dev(dev);
603 
604 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
605 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
606 		    quirk_ioat_snb_local_iommu(pdev))
607 			return true;
608 	}
609 
610 	return false;
611 }
612 
613 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
614 {
615 	struct dmar_drhd_unit *drhd = NULL;
616 	struct pci_dev *pdev = NULL;
617 	struct intel_iommu *iommu;
618 	struct device *tmp;
619 	u16 segment = 0;
620 	int i;
621 
622 	if (!dev)
623 		return NULL;
624 
625 	if (dev_is_pci(dev)) {
626 		struct pci_dev *pf_pdev;
627 
628 		pdev = pci_real_dma_dev(to_pci_dev(dev));
629 
630 		/* VFs aren't listed in scope tables; we need to look up
631 		 * the PF instead to find the IOMMU. */
632 		pf_pdev = pci_physfn(pdev);
633 		dev = &pf_pdev->dev;
634 		segment = pci_domain_nr(pdev->bus);
635 	} else if (has_acpi_companion(dev))
636 		dev = &ACPI_COMPANION(dev)->dev;
637 
638 	rcu_read_lock();
639 	for_each_iommu(iommu, drhd) {
640 		if (pdev && segment != drhd->segment)
641 			continue;
642 
643 		for_each_active_dev_scope(drhd->devices,
644 					  drhd->devices_cnt, i, tmp) {
645 			if (tmp == dev) {
646 				/* For a VF use its original BDF# not that of the PF
647 				 * which we used for the IOMMU lookup. Strictly speaking
648 				 * we could do this for all PCI devices; we only need to
649 				 * get the BDF# from the scope table for ACPI matches. */
650 				if (pdev && pdev->is_virtfn)
651 					goto got_pdev;
652 
653 				if (bus && devfn) {
654 					*bus = drhd->devices[i].bus;
655 					*devfn = drhd->devices[i].devfn;
656 				}
657 				goto out;
658 			}
659 
660 			if (is_downstream_to_pci_bridge(dev, tmp))
661 				goto got_pdev;
662 		}
663 
664 		if (pdev && drhd->include_all) {
665 got_pdev:
666 			if (bus && devfn) {
667 				*bus = pdev->bus->number;
668 				*devfn = pdev->devfn;
669 			}
670 			goto out;
671 		}
672 	}
673 	iommu = NULL;
674 out:
675 	if (iommu_is_dummy(iommu, dev))
676 		iommu = NULL;
677 
678 	rcu_read_unlock();
679 
680 	return iommu;
681 }
682 
683 static void domain_flush_cache(struct dmar_domain *domain,
684 			       void *addr, int size)
685 {
686 	if (!domain->iommu_coherency)
687 		clflush_cache_range(addr, size);
688 }
689 
690 static void free_context_table(struct intel_iommu *iommu)
691 {
692 	struct context_entry *context;
693 	int i;
694 
695 	if (!iommu->root_entry)
696 		return;
697 
698 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
699 		context = iommu_context_addr(iommu, i, 0, 0);
700 		if (context)
701 			iommu_free_page(context);
702 
703 		if (!sm_supported(iommu))
704 			continue;
705 
706 		context = iommu_context_addr(iommu, i, 0x80, 0);
707 		if (context)
708 			iommu_free_page(context);
709 	}
710 
711 	iommu_free_page(iommu->root_entry);
712 	iommu->root_entry = NULL;
713 }
714 
715 #ifdef CONFIG_DMAR_DEBUG
716 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
717 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
718 {
719 	struct dma_pte *pte;
720 	int offset;
721 
722 	while (1) {
723 		offset = pfn_level_offset(pfn, level);
724 		pte = &parent[offset];
725 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
726 			pr_info("PTE not present at level %d\n", level);
727 			break;
728 		}
729 
730 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
731 
732 		if (level == 1)
733 			break;
734 
735 		parent = phys_to_virt(dma_pte_addr(pte));
736 		level--;
737 	}
738 }
739 
740 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
741 			  unsigned long long addr, u32 pasid)
742 {
743 	struct pasid_dir_entry *dir, *pde;
744 	struct pasid_entry *entries, *pte;
745 	struct context_entry *ctx_entry;
746 	struct root_entry *rt_entry;
747 	int i, dir_index, index, level;
748 	u8 devfn = source_id & 0xff;
749 	u8 bus = source_id >> 8;
750 	struct dma_pte *pgtable;
751 
752 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
753 
754 	/* root entry dump */
755 	rt_entry = &iommu->root_entry[bus];
756 	if (!rt_entry) {
757 		pr_info("root table entry is not present\n");
758 		return;
759 	}
760 
761 	if (sm_supported(iommu))
762 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
763 			rt_entry->hi, rt_entry->lo);
764 	else
765 		pr_info("root entry: 0x%016llx", rt_entry->lo);
766 
767 	/* context entry dump */
768 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
769 	if (!ctx_entry) {
770 		pr_info("context table entry is not present\n");
771 		return;
772 	}
773 
774 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
775 		ctx_entry->hi, ctx_entry->lo);
776 
777 	/* legacy mode does not require PASID entries */
778 	if (!sm_supported(iommu)) {
779 		level = agaw_to_level(ctx_entry->hi & 7);
780 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
781 		goto pgtable_walk;
782 	}
783 
784 	/* get the pointer to pasid directory entry */
785 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
786 	if (!dir) {
787 		pr_info("pasid directory entry is not present\n");
788 		return;
789 	}
790 	/* For request-without-pasid, get the pasid from context entry */
791 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
792 		pasid = IOMMU_NO_PASID;
793 
794 	dir_index = pasid >> PASID_PDE_SHIFT;
795 	pde = &dir[dir_index];
796 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
797 
798 	/* get the pointer to the pasid table entry */
799 	entries = get_pasid_table_from_pde(pde);
800 	if (!entries) {
801 		pr_info("pasid table entry is not present\n");
802 		return;
803 	}
804 	index = pasid & PASID_PTE_MASK;
805 	pte = &entries[index];
806 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
807 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
808 
809 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
810 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
811 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
812 	} else {
813 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
814 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
815 	}
816 
817 pgtable_walk:
818 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
819 }
820 #endif
821 
822 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
823 				      unsigned long pfn, int *target_level,
824 				      gfp_t gfp)
825 {
826 	struct dma_pte *parent, *pte;
827 	int level = agaw_to_level(domain->agaw);
828 	int offset;
829 
830 	if (!domain_pfn_supported(domain, pfn))
831 		/* Address beyond IOMMU's addressing capabilities. */
832 		return NULL;
833 
834 	parent = domain->pgd;
835 
836 	while (1) {
837 		void *tmp_page;
838 
839 		offset = pfn_level_offset(pfn, level);
840 		pte = &parent[offset];
841 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
842 			break;
843 		if (level == *target_level)
844 			break;
845 
846 		if (!dma_pte_present(pte)) {
847 			uint64_t pteval, tmp;
848 
849 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
850 
851 			if (!tmp_page)
852 				return NULL;
853 
854 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
855 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
856 			if (domain->use_first_level)
857 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
858 
859 			tmp = 0ULL;
860 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
861 				/* Someone else set it while we were thinking; use theirs. */
862 				iommu_free_page(tmp_page);
863 			else
864 				domain_flush_cache(domain, pte, sizeof(*pte));
865 		}
866 		if (level == 1)
867 			break;
868 
869 		parent = phys_to_virt(dma_pte_addr(pte));
870 		level--;
871 	}
872 
873 	if (!*target_level)
874 		*target_level = level;
875 
876 	return pte;
877 }
878 
879 /* return address's pte at specific level */
880 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
881 					 unsigned long pfn,
882 					 int level, int *large_page)
883 {
884 	struct dma_pte *parent, *pte;
885 	int total = agaw_to_level(domain->agaw);
886 	int offset;
887 
888 	parent = domain->pgd;
889 	while (level <= total) {
890 		offset = pfn_level_offset(pfn, total);
891 		pte = &parent[offset];
892 		if (level == total)
893 			return pte;
894 
895 		if (!dma_pte_present(pte)) {
896 			*large_page = total;
897 			break;
898 		}
899 
900 		if (dma_pte_superpage(pte)) {
901 			*large_page = total;
902 			return pte;
903 		}
904 
905 		parent = phys_to_virt(dma_pte_addr(pte));
906 		total--;
907 	}
908 	return NULL;
909 }
910 
911 /* clear last level pte, a tlb flush should be followed */
912 static void dma_pte_clear_range(struct dmar_domain *domain,
913 				unsigned long start_pfn,
914 				unsigned long last_pfn)
915 {
916 	unsigned int large_page;
917 	struct dma_pte *first_pte, *pte;
918 
919 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
920 	    WARN_ON(start_pfn > last_pfn))
921 		return;
922 
923 	/* we don't need lock here; nobody else touches the iova range */
924 	do {
925 		large_page = 1;
926 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
927 		if (!pte) {
928 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
929 			continue;
930 		}
931 		do {
932 			dma_clear_pte(pte);
933 			start_pfn += lvl_to_nr_pages(large_page);
934 			pte++;
935 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
936 
937 		domain_flush_cache(domain, first_pte,
938 				   (void *)pte - (void *)first_pte);
939 
940 	} while (start_pfn && start_pfn <= last_pfn);
941 }
942 
943 static void dma_pte_free_level(struct dmar_domain *domain, int level,
944 			       int retain_level, struct dma_pte *pte,
945 			       unsigned long pfn, unsigned long start_pfn,
946 			       unsigned long last_pfn)
947 {
948 	pfn = max(start_pfn, pfn);
949 	pte = &pte[pfn_level_offset(pfn, level)];
950 
951 	do {
952 		unsigned long level_pfn;
953 		struct dma_pte *level_pte;
954 
955 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
956 			goto next;
957 
958 		level_pfn = pfn & level_mask(level);
959 		level_pte = phys_to_virt(dma_pte_addr(pte));
960 
961 		if (level > 2) {
962 			dma_pte_free_level(domain, level - 1, retain_level,
963 					   level_pte, level_pfn, start_pfn,
964 					   last_pfn);
965 		}
966 
967 		/*
968 		 * Free the page table if we're below the level we want to
969 		 * retain and the range covers the entire table.
970 		 */
971 		if (level < retain_level && !(start_pfn > level_pfn ||
972 		      last_pfn < level_pfn + level_size(level) - 1)) {
973 			dma_clear_pte(pte);
974 			domain_flush_cache(domain, pte, sizeof(*pte));
975 			iommu_free_page(level_pte);
976 		}
977 next:
978 		pfn += level_size(level);
979 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
980 }
981 
982 /*
983  * clear last level (leaf) ptes and free page table pages below the
984  * level we wish to keep intact.
985  */
986 static void dma_pte_free_pagetable(struct dmar_domain *domain,
987 				   unsigned long start_pfn,
988 				   unsigned long last_pfn,
989 				   int retain_level)
990 {
991 	dma_pte_clear_range(domain, start_pfn, last_pfn);
992 
993 	/* We don't need lock here; nobody else touches the iova range */
994 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
995 			   domain->pgd, 0, start_pfn, last_pfn);
996 
997 	/* free pgd */
998 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
999 		iommu_free_page(domain->pgd);
1000 		domain->pgd = NULL;
1001 	}
1002 }
1003 
1004 /* When a page at a given level is being unlinked from its parent, we don't
1005    need to *modify* it at all. All we need to do is make a list of all the
1006    pages which can be freed just as soon as we've flushed the IOTLB and we
1007    know the hardware page-walk will no longer touch them.
1008    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1009    be freed. */
1010 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1011 				    int level, struct dma_pte *pte,
1012 				    struct list_head *freelist)
1013 {
1014 	struct page *pg;
1015 
1016 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1017 	list_add_tail(&pg->lru, freelist);
1018 
1019 	if (level == 1)
1020 		return;
1021 
1022 	pte = page_address(pg);
1023 	do {
1024 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1025 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1026 		pte++;
1027 	} while (!first_pte_in_page(pte));
1028 }
1029 
1030 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1031 				struct dma_pte *pte, unsigned long pfn,
1032 				unsigned long start_pfn, unsigned long last_pfn,
1033 				struct list_head *freelist)
1034 {
1035 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1036 
1037 	pfn = max(start_pfn, pfn);
1038 	pte = &pte[pfn_level_offset(pfn, level)];
1039 
1040 	do {
1041 		unsigned long level_pfn = pfn & level_mask(level);
1042 
1043 		if (!dma_pte_present(pte))
1044 			goto next;
1045 
1046 		/* If range covers entire pagetable, free it */
1047 		if (start_pfn <= level_pfn &&
1048 		    last_pfn >= level_pfn + level_size(level) - 1) {
1049 			/* These suborbinate page tables are going away entirely. Don't
1050 			   bother to clear them; we're just going to *free* them. */
1051 			if (level > 1 && !dma_pte_superpage(pte))
1052 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1053 
1054 			dma_clear_pte(pte);
1055 			if (!first_pte)
1056 				first_pte = pte;
1057 			last_pte = pte;
1058 		} else if (level > 1) {
1059 			/* Recurse down into a level that isn't *entirely* obsolete */
1060 			dma_pte_clear_level(domain, level - 1,
1061 					    phys_to_virt(dma_pte_addr(pte)),
1062 					    level_pfn, start_pfn, last_pfn,
1063 					    freelist);
1064 		}
1065 next:
1066 		pfn = level_pfn + level_size(level);
1067 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068 
1069 	if (first_pte)
1070 		domain_flush_cache(domain, first_pte,
1071 				   (void *)++last_pte - (void *)first_pte);
1072 }
1073 
1074 /* We can't just free the pages because the IOMMU may still be walking
1075    the page tables, and may have cached the intermediate levels. The
1076    pages can only be freed after the IOTLB flush has been done. */
1077 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1078 			 unsigned long last_pfn, struct list_head *freelist)
1079 {
1080 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1081 	    WARN_ON(start_pfn > last_pfn))
1082 		return;
1083 
1084 	/* we don't need lock here; nobody else touches the iova range */
1085 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1086 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1087 
1088 	/* free pgd */
1089 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 		struct page *pgd_page = virt_to_page(domain->pgd);
1091 		list_add_tail(&pgd_page->lru, freelist);
1092 		domain->pgd = NULL;
1093 	}
1094 }
1095 
1096 /* iommu handling */
1097 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1098 {
1099 	struct root_entry *root;
1100 
1101 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1102 	if (!root) {
1103 		pr_err("Allocating root entry for %s failed\n",
1104 			iommu->name);
1105 		return -ENOMEM;
1106 	}
1107 
1108 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1109 	iommu->root_entry = root;
1110 
1111 	return 0;
1112 }
1113 
1114 static void iommu_set_root_entry(struct intel_iommu *iommu)
1115 {
1116 	u64 addr;
1117 	u32 sts;
1118 	unsigned long flag;
1119 
1120 	addr = virt_to_phys(iommu->root_entry);
1121 	if (sm_supported(iommu))
1122 		addr |= DMA_RTADDR_SMT;
1123 
1124 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1125 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1126 
1127 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1128 
1129 	/* Make sure hardware complete it */
1130 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1131 		      readl, (sts & DMA_GSTS_RTPS), sts);
1132 
1133 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1134 
1135 	/*
1136 	 * Hardware invalidates all DMA remapping hardware translation
1137 	 * caches as part of SRTP flow.
1138 	 */
1139 	if (cap_esrtps(iommu->cap))
1140 		return;
1141 
1142 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1143 	if (sm_supported(iommu))
1144 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1145 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1146 }
1147 
1148 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1149 {
1150 	u32 val;
1151 	unsigned long flag;
1152 
1153 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1154 		return;
1155 
1156 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1157 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1158 
1159 	/* Make sure hardware complete it */
1160 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1161 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1162 
1163 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1164 }
1165 
1166 /* return value determine if we need a write buffer flush */
1167 static void __iommu_flush_context(struct intel_iommu *iommu,
1168 				  u16 did, u16 source_id, u8 function_mask,
1169 				  u64 type)
1170 {
1171 	u64 val = 0;
1172 	unsigned long flag;
1173 
1174 	switch (type) {
1175 	case DMA_CCMD_GLOBAL_INVL:
1176 		val = DMA_CCMD_GLOBAL_INVL;
1177 		break;
1178 	case DMA_CCMD_DOMAIN_INVL:
1179 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1180 		break;
1181 	case DMA_CCMD_DEVICE_INVL:
1182 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1183 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1184 		break;
1185 	default:
1186 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1187 			iommu->name, type);
1188 		return;
1189 	}
1190 	val |= DMA_CCMD_ICC;
1191 
1192 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1193 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1194 
1195 	/* Make sure hardware complete it */
1196 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1197 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1198 
1199 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1200 }
1201 
1202 /* return value determine if we need a write buffer flush */
1203 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1204 				u64 addr, unsigned int size_order, u64 type)
1205 {
1206 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1207 	u64 val = 0, val_iva = 0;
1208 	unsigned long flag;
1209 
1210 	switch (type) {
1211 	case DMA_TLB_GLOBAL_FLUSH:
1212 		/* global flush doesn't need set IVA_REG */
1213 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1214 		break;
1215 	case DMA_TLB_DSI_FLUSH:
1216 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1217 		break;
1218 	case DMA_TLB_PSI_FLUSH:
1219 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1220 		/* IH bit is passed in as part of address */
1221 		val_iva = size_order | addr;
1222 		break;
1223 	default:
1224 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1225 			iommu->name, type);
1226 		return;
1227 	}
1228 
1229 	if (cap_write_drain(iommu->cap))
1230 		val |= DMA_TLB_WRITE_DRAIN;
1231 
1232 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233 	/* Note: Only uses first TLB reg currently */
1234 	if (val_iva)
1235 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1236 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1237 
1238 	/* Make sure hardware complete it */
1239 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1240 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1241 
1242 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243 
1244 	/* check IOTLB invalidation granularity */
1245 	if (DMA_TLB_IAIG(val) == 0)
1246 		pr_err("Flush IOTLB failed\n");
1247 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1248 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1249 			(unsigned long long)DMA_TLB_IIRG(type),
1250 			(unsigned long long)DMA_TLB_IAIG(val));
1251 }
1252 
1253 static struct device_domain_info *
1254 domain_lookup_dev_info(struct dmar_domain *domain,
1255 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1256 {
1257 	struct device_domain_info *info;
1258 	unsigned long flags;
1259 
1260 	spin_lock_irqsave(&domain->lock, flags);
1261 	list_for_each_entry(info, &domain->devices, link) {
1262 		if (info->iommu == iommu && info->bus == bus &&
1263 		    info->devfn == devfn) {
1264 			spin_unlock_irqrestore(&domain->lock, flags);
1265 			return info;
1266 		}
1267 	}
1268 	spin_unlock_irqrestore(&domain->lock, flags);
1269 
1270 	return NULL;
1271 }
1272 
1273 void domain_update_iotlb(struct dmar_domain *domain)
1274 {
1275 	struct dev_pasid_info *dev_pasid;
1276 	struct device_domain_info *info;
1277 	bool has_iotlb_device = false;
1278 	unsigned long flags;
1279 
1280 	spin_lock_irqsave(&domain->lock, flags);
1281 	list_for_each_entry(info, &domain->devices, link) {
1282 		if (info->ats_enabled) {
1283 			has_iotlb_device = true;
1284 			break;
1285 		}
1286 	}
1287 
1288 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1289 		info = dev_iommu_priv_get(dev_pasid->dev);
1290 		if (info->ats_enabled) {
1291 			has_iotlb_device = true;
1292 			break;
1293 		}
1294 	}
1295 	domain->has_iotlb_device = has_iotlb_device;
1296 	spin_unlock_irqrestore(&domain->lock, flags);
1297 }
1298 
1299 /*
1300  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1301  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1302  * check because it applies only to the built-in QAT devices and it doesn't
1303  * grant additional privileges.
1304  */
1305 #define BUGGY_QAT_DEVID_MASK 0x4940
1306 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1307 {
1308 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1309 		return false;
1310 
1311 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1312 		return false;
1313 
1314 	return true;
1315 }
1316 
1317 static void iommu_enable_pci_caps(struct device_domain_info *info)
1318 {
1319 	struct pci_dev *pdev;
1320 
1321 	if (!dev_is_pci(info->dev))
1322 		return;
1323 
1324 	pdev = to_pci_dev(info->dev);
1325 
1326 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1327 	   the device if you enable PASID support after ATS support is
1328 	   undefined. So always enable PASID support on devices which
1329 	   have it, even if we can't yet know if we're ever going to
1330 	   use it. */
1331 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1332 		info->pasid_enabled = 1;
1333 
1334 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1335 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1336 		info->ats_enabled = 1;
1337 		domain_update_iotlb(info->domain);
1338 	}
1339 }
1340 
1341 static void iommu_disable_pci_caps(struct device_domain_info *info)
1342 {
1343 	struct pci_dev *pdev;
1344 
1345 	if (!dev_is_pci(info->dev))
1346 		return;
1347 
1348 	pdev = to_pci_dev(info->dev);
1349 
1350 	if (info->ats_enabled) {
1351 		pci_disable_ats(pdev);
1352 		info->ats_enabled = 0;
1353 		domain_update_iotlb(info->domain);
1354 	}
1355 
1356 	if (info->pasid_enabled) {
1357 		pci_disable_pasid(pdev);
1358 		info->pasid_enabled = 0;
1359 	}
1360 }
1361 
/* Flush every cache tag of the dmar_domain that wraps @domain. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	cache_tag_flush_all(dmar_domain);
}
1366 
1367 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1368 {
1369 	u32 pmen;
1370 	unsigned long flags;
1371 
1372 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1373 		return;
1374 
1375 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1376 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1377 	pmen &= ~DMA_PMEN_EPM;
1378 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1379 
1380 	/* wait for the protected region status bit to clear */
1381 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1382 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1383 
1384 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1385 }
1386 
1387 static void iommu_enable_translation(struct intel_iommu *iommu)
1388 {
1389 	u32 sts;
1390 	unsigned long flags;
1391 
1392 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1393 	iommu->gcmd |= DMA_GCMD_TE;
1394 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1395 
1396 	/* Make sure hardware complete it */
1397 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1398 		      readl, (sts & DMA_GSTS_TES), sts);
1399 
1400 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1401 }
1402 
1403 static void iommu_disable_translation(struct intel_iommu *iommu)
1404 {
1405 	u32 sts;
1406 	unsigned long flag;
1407 
1408 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1409 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1410 		return;
1411 
1412 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1413 	iommu->gcmd &= ~DMA_GCMD_TE;
1414 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1415 
1416 	/* Make sure hardware complete it */
1417 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1418 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1419 
1420 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1421 }
1422 
1423 static int iommu_init_domains(struct intel_iommu *iommu)
1424 {
1425 	u32 ndomains;
1426 
1427 	ndomains = cap_ndoms(iommu->cap);
1428 	pr_debug("%s: Number of Domains supported <%d>\n",
1429 		 iommu->name, ndomains);
1430 
1431 	spin_lock_init(&iommu->lock);
1432 
1433 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1434 	if (!iommu->domain_ids)
1435 		return -ENOMEM;
1436 
1437 	/*
1438 	 * If Caching mode is set, then invalid translations are tagged
1439 	 * with domain-id 0, hence we need to pre-allocate it. We also
1440 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1441 	 * make sure it is not used for a real domain.
1442 	 */
1443 	set_bit(0, iommu->domain_ids);
1444 
1445 	/*
1446 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1447 	 * entry for first-level or pass-through translation modes should
1448 	 * be programmed with a domain id different from those used for
1449 	 * second-level or nested translation. We reserve a domain id for
1450 	 * this purpose.
1451 	 */
1452 	if (sm_supported(iommu))
1453 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1454 
1455 	return 0;
1456 }
1457 
1458 static void disable_dmar_iommu(struct intel_iommu *iommu)
1459 {
1460 	if (!iommu->domain_ids)
1461 		return;
1462 
1463 	/*
1464 	 * All iommu domains must have been detached from the devices,
1465 	 * hence there should be no domain IDs in use.
1466 	 */
1467 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1468 		    > NUM_RESERVED_DID))
1469 		return;
1470 
1471 	if (iommu->gcmd & DMA_GCMD_TE)
1472 		iommu_disable_translation(iommu);
1473 }
1474 
1475 static void free_dmar_iommu(struct intel_iommu *iommu)
1476 {
1477 	if (iommu->domain_ids) {
1478 		bitmap_free(iommu->domain_ids);
1479 		iommu->domain_ids = NULL;
1480 	}
1481 
1482 	if (iommu->copied_tables) {
1483 		bitmap_free(iommu->copied_tables);
1484 		iommu->copied_tables = NULL;
1485 	}
1486 
1487 	/* free context mapping */
1488 	free_context_table(iommu);
1489 
1490 #ifdef CONFIG_INTEL_IOMMU_SVM
1491 	if (pasid_supported(iommu)) {
1492 		if (ecap_prs(iommu->ecap))
1493 			intel_svm_finish_prq(iommu);
1494 	}
1495 #endif
1496 }
1497 
1498 /*
1499  * Check and return whether first level is used by default for
1500  * DMA translation.
1501  */
1502 static bool first_level_by_default(unsigned int type)
1503 {
1504 	/* Only SL is available in legacy mode */
1505 	if (!scalable_mode_support())
1506 		return false;
1507 
1508 	/* Only level (either FL or SL) is available, just use it */
1509 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1510 		return intel_cap_flts_sanity();
1511 
1512 	/* Both levels are available, decide it based on domain type */
1513 	return type != IOMMU_DOMAIN_UNMANAGED;
1514 }
1515 
1516 static struct dmar_domain *alloc_domain(unsigned int type)
1517 {
1518 	struct dmar_domain *domain;
1519 
1520 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1521 	if (!domain)
1522 		return NULL;
1523 
1524 	domain->nid = NUMA_NO_NODE;
1525 	if (first_level_by_default(type))
1526 		domain->use_first_level = true;
1527 	domain->has_iotlb_device = false;
1528 	INIT_LIST_HEAD(&domain->devices);
1529 	INIT_LIST_HEAD(&domain->dev_pasids);
1530 	INIT_LIST_HEAD(&domain->cache_tags);
1531 	spin_lock_init(&domain->lock);
1532 	spin_lock_init(&domain->cache_lock);
1533 	xa_init(&domain->iommu_array);
1534 
1535 	return domain;
1536 }
1537 
1538 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1539 {
1540 	struct iommu_domain_info *info, *curr;
1541 	unsigned long ndomains;
1542 	int num, ret = -ENOSPC;
1543 
1544 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1545 		return 0;
1546 
1547 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1548 	if (!info)
1549 		return -ENOMEM;
1550 
1551 	spin_lock(&iommu->lock);
1552 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1553 	if (curr) {
1554 		curr->refcnt++;
1555 		spin_unlock(&iommu->lock);
1556 		kfree(info);
1557 		return 0;
1558 	}
1559 
1560 	ndomains = cap_ndoms(iommu->cap);
1561 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1562 	if (num >= ndomains) {
1563 		pr_err("%s: No free domain ids\n", iommu->name);
1564 		goto err_unlock;
1565 	}
1566 
1567 	set_bit(num, iommu->domain_ids);
1568 	info->refcnt	= 1;
1569 	info->did	= num;
1570 	info->iommu	= iommu;
1571 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1572 			  NULL, info, GFP_ATOMIC);
1573 	if (curr) {
1574 		ret = xa_err(curr) ? : -EBUSY;
1575 		goto err_clear;
1576 	}
1577 	domain_update_iommu_cap(domain);
1578 
1579 	spin_unlock(&iommu->lock);
1580 	return 0;
1581 
1582 err_clear:
1583 	clear_bit(info->did, iommu->domain_ids);
1584 err_unlock:
1585 	spin_unlock(&iommu->lock);
1586 	kfree(info);
1587 	return ret;
1588 }
1589 
1590 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1591 {
1592 	struct iommu_domain_info *info;
1593 
1594 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1595 		return;
1596 
1597 	spin_lock(&iommu->lock);
1598 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1599 	if (--info->refcnt == 0) {
1600 		clear_bit(info->did, iommu->domain_ids);
1601 		xa_erase(&domain->iommu_array, iommu->seq_id);
1602 		domain->nid = NUMA_NO_NODE;
1603 		domain_update_iommu_cap(domain);
1604 		kfree(info);
1605 	}
1606 	spin_unlock(&iommu->lock);
1607 }
1608 
/*
 * Round a guest address width up to the next value of the form 12 + 9 * n,
 * capped at 64. A width that already has that form is returned unchanged.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int adjusted = rem ? gaw + 9 - rem : gaw;

	return adjusted > 64 ? 64 : adjusted;
}
1622 
1623 static void domain_exit(struct dmar_domain *domain)
1624 {
1625 	if (domain->pgd) {
1626 		LIST_HEAD(freelist);
1627 
1628 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1629 		iommu_put_pages_list(&freelist);
1630 	}
1631 
1632 	if (WARN_ON(!list_empty(&domain->devices)))
1633 		return;
1634 
1635 	kfree(domain);
1636 }
1637 
1638 static int domain_context_mapping_one(struct dmar_domain *domain,
1639 				      struct intel_iommu *iommu,
1640 				      u8 bus, u8 devfn)
1641 {
1642 	struct device_domain_info *info =
1643 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1644 	u16 did = domain_id_iommu(domain, iommu);
1645 	int translation = CONTEXT_TT_MULTI_LEVEL;
1646 	struct dma_pte *pgd = domain->pgd;
1647 	struct context_entry *context;
1648 	int agaw, ret;
1649 
1650 	if (hw_pass_through && domain_type_is_si(domain))
1651 		translation = CONTEXT_TT_PASS_THROUGH;
1652 
1653 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1654 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1655 
1656 	spin_lock(&iommu->lock);
1657 	ret = -ENOMEM;
1658 	context = iommu_context_addr(iommu, bus, devfn, 1);
1659 	if (!context)
1660 		goto out_unlock;
1661 
1662 	ret = 0;
1663 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1664 		goto out_unlock;
1665 
1666 	/*
1667 	 * For kdump cases, old valid entries may be cached due to the
1668 	 * in-flight DMA and copied pgtable, but there is no unmapping
1669 	 * behaviour for them, thus we need an explicit cache flush for
1670 	 * the newly-mapped device. For kdump, at this point, the device
1671 	 * is supposed to finish reset at its driver probe stage, so no
1672 	 * in-flight DMA will exist, and we don't need to worry anymore
1673 	 * hereafter.
1674 	 */
1675 	if (context_copied(iommu, bus, devfn)) {
1676 		u16 did_old = context_domain_id(context);
1677 
1678 		if (did_old < cap_ndoms(iommu->cap)) {
1679 			iommu->flush.flush_context(iommu, did_old,
1680 						   (((u16)bus) << 8) | devfn,
1681 						   DMA_CCMD_MASK_NOBIT,
1682 						   DMA_CCMD_DEVICE_INVL);
1683 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1684 						 DMA_TLB_DSI_FLUSH);
1685 		}
1686 
1687 		clear_context_copied(iommu, bus, devfn);
1688 	}
1689 
1690 	context_clear_entry(context);
1691 	context_set_domain_id(context, did);
1692 
1693 	if (translation != CONTEXT_TT_PASS_THROUGH) {
1694 		/*
1695 		 * Skip top levels of page tables for iommu which has
1696 		 * less agaw than default. Unnecessary for PT mode.
1697 		 */
1698 		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1699 			ret = -ENOMEM;
1700 			pgd = phys_to_virt(dma_pte_addr(pgd));
1701 			if (!dma_pte_present(pgd))
1702 				goto out_unlock;
1703 		}
1704 
1705 		if (info && info->ats_supported)
1706 			translation = CONTEXT_TT_DEV_IOTLB;
1707 		else
1708 			translation = CONTEXT_TT_MULTI_LEVEL;
1709 
1710 		context_set_address_root(context, virt_to_phys(pgd));
1711 		context_set_address_width(context, agaw);
1712 	} else {
1713 		/*
1714 		 * In pass through mode, AW must be programmed to
1715 		 * indicate the largest AGAW value supported by
1716 		 * hardware. And ASR is ignored by hardware.
1717 		 */
1718 		context_set_address_width(context, iommu->msagaw);
1719 	}
1720 
1721 	context_set_translation_type(context, translation);
1722 	context_set_fault_enable(context);
1723 	context_set_present(context);
1724 	if (!ecap_coherent(iommu->ecap))
1725 		clflush_cache_range(context, sizeof(*context));
1726 
1727 	/*
1728 	 * It's a non-present to present mapping. If hardware doesn't cache
1729 	 * non-present entry we only need to flush the write-buffer. If the
1730 	 * _does_ cache non-present entries, then it does so in the special
1731 	 * domain #0, which we have to flush:
1732 	 */
1733 	if (cap_caching_mode(iommu->cap)) {
1734 		iommu->flush.flush_context(iommu, 0,
1735 					   (((u16)bus) << 8) | devfn,
1736 					   DMA_CCMD_MASK_NOBIT,
1737 					   DMA_CCMD_DEVICE_INVL);
1738 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1739 	} else {
1740 		iommu_flush_write_buffer(iommu);
1741 	}
1742 
1743 	ret = 0;
1744 
1745 out_unlock:
1746 	spin_unlock(&iommu->lock);
1747 
1748 	return ret;
1749 }
1750 
1751 static int domain_context_mapping_cb(struct pci_dev *pdev,
1752 				     u16 alias, void *opaque)
1753 {
1754 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1755 	struct intel_iommu *iommu = info->iommu;
1756 	struct dmar_domain *domain = opaque;
1757 
1758 	return domain_context_mapping_one(domain, iommu,
1759 					  PCI_BUS_NUM(alias), alias & 0xff);
1760 }
1761 
1762 static int
1763 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1764 {
1765 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1766 	struct intel_iommu *iommu = info->iommu;
1767 	u8 bus = info->bus, devfn = info->devfn;
1768 
1769 	if (!dev_is_pci(dev))
1770 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1771 
1772 	return pci_for_each_dma_alias(to_pci_dev(dev),
1773 				      domain_context_mapping_cb, domain);
1774 }
1775 
1776 /* Return largest possible superpage level for a given mapping */
1777 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1778 				   unsigned long phy_pfn, unsigned long pages)
1779 {
1780 	int support, level = 1;
1781 	unsigned long pfnmerge;
1782 
1783 	support = domain->iommu_superpage;
1784 
1785 	/* To use a large page, the virtual *and* physical addresses
1786 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1787 	   of them will mean we have to use smaller pages. So just
1788 	   merge them and check both at once. */
1789 	pfnmerge = iov_pfn | phy_pfn;
1790 
1791 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1792 		pages >>= VTD_STRIDE_SHIFT;
1793 		if (!pages)
1794 			break;
1795 		pfnmerge >>= VTD_STRIDE_SHIFT;
1796 		level++;
1797 		support--;
1798 	}
1799 	return level;
1800 }
1801 
1802 /*
1803  * Ensure that old small page tables are removed to make room for superpage(s).
1804  * We're going to add new large pages, so make sure we don't remove their parent
1805  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1806  */
1807 static void switch_to_super_page(struct dmar_domain *domain,
1808 				 unsigned long start_pfn,
1809 				 unsigned long end_pfn, int level)
1810 {
1811 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1812 	struct dma_pte *pte = NULL;
1813 
1814 	while (start_pfn <= end_pfn) {
1815 		if (!pte)
1816 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1817 					     GFP_ATOMIC);
1818 
1819 		if (dma_pte_present(pte)) {
1820 			dma_pte_free_pagetable(domain, start_pfn,
1821 					       start_pfn + lvl_pages - 1,
1822 					       level + 1);
1823 
1824 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1825 					      end_pfn << VTD_PAGE_SHIFT, 0);
1826 		}
1827 
1828 		pte++;
1829 		start_pfn += lvl_pages;
1830 		if (first_pte_in_page(pte))
1831 			pte = NULL;
1832 	}
1833 }
1834 
1835 static int
1836 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1837 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1838 		 gfp_t gfp)
1839 {
1840 	struct dma_pte *first_pte = NULL, *pte = NULL;
1841 	unsigned int largepage_lvl = 0;
1842 	unsigned long lvl_pages = 0;
1843 	phys_addr_t pteval;
1844 	u64 attr;
1845 
1846 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1847 		return -EINVAL;
1848 
1849 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1850 		return -EINVAL;
1851 
1852 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1853 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1854 		return -EINVAL;
1855 	}
1856 
1857 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1858 	attr |= DMA_FL_PTE_PRESENT;
1859 	if (domain->use_first_level) {
1860 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1861 		if (prot & DMA_PTE_WRITE)
1862 			attr |= DMA_FL_PTE_DIRTY;
1863 	}
1864 
1865 	domain->has_mappings = true;
1866 
1867 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1868 
1869 	while (nr_pages > 0) {
1870 		uint64_t tmp;
1871 
1872 		if (!pte) {
1873 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1874 					phys_pfn, nr_pages);
1875 
1876 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1877 					     gfp);
1878 			if (!pte)
1879 				return -ENOMEM;
1880 			first_pte = pte;
1881 
1882 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1883 
1884 			/* It is large page*/
1885 			if (largepage_lvl > 1) {
1886 				unsigned long end_pfn;
1887 				unsigned long pages_to_remove;
1888 
1889 				pteval |= DMA_PTE_LARGE_PAGE;
1890 				pages_to_remove = min_t(unsigned long, nr_pages,
1891 							nr_pte_to_next_page(pte) * lvl_pages);
1892 				end_pfn = iov_pfn + pages_to_remove - 1;
1893 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1894 			} else {
1895 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1896 			}
1897 
1898 		}
1899 		/* We don't need lock here, nobody else
1900 		 * touches the iova range
1901 		 */
1902 		tmp = 0ULL;
1903 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1904 			static int dumps = 5;
1905 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1906 				iov_pfn, tmp, (unsigned long long)pteval);
1907 			if (dumps) {
1908 				dumps--;
1909 				debug_dma_dump_mappings(NULL);
1910 			}
1911 			WARN_ON(1);
1912 		}
1913 
1914 		nr_pages -= lvl_pages;
1915 		iov_pfn += lvl_pages;
1916 		phys_pfn += lvl_pages;
1917 		pteval += lvl_pages * VTD_PAGE_SIZE;
1918 
1919 		/* If the next PTE would be the first in a new page, then we
1920 		 * need to flush the cache on the entries we've just written.
1921 		 * And then we'll need to recalculate 'pte', so clear it and
1922 		 * let it get set again in the if (!pte) block above.
1923 		 *
1924 		 * If we're done (!nr_pages) we need to flush the cache too.
1925 		 *
1926 		 * Also if we've been setting superpages, we may need to
1927 		 * recalculate 'pte' and switch back to smaller pages for the
1928 		 * end of the mapping, if the trailing size is not enough to
1929 		 * use another superpage (i.e. nr_pages < lvl_pages).
1930 		 */
1931 		pte++;
1932 		if (!nr_pages || first_pte_in_page(pte) ||
1933 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1934 			domain_flush_cache(domain, first_pte,
1935 					   (void *)pte - (void *)first_pte);
1936 			pte = NULL;
1937 		}
1938 	}
1939 
1940 	return 0;
1941 }
1942 
1943 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1944 {
1945 	struct intel_iommu *iommu = info->iommu;
1946 	struct context_entry *context;
1947 	u16 did;
1948 
1949 	spin_lock(&iommu->lock);
1950 	context = iommu_context_addr(iommu, bus, devfn, 0);
1951 	if (!context) {
1952 		spin_unlock(&iommu->lock);
1953 		return;
1954 	}
1955 
1956 	did = context_domain_id(context);
1957 	context_clear_entry(context);
1958 	__iommu_flush_cache(iommu, context, sizeof(*context));
1959 	spin_unlock(&iommu->lock);
1960 	intel_context_flush_present(info, context, did, true);
1961 }
1962 
1963 static int domain_setup_first_level(struct intel_iommu *iommu,
1964 				    struct dmar_domain *domain,
1965 				    struct device *dev,
1966 				    u32 pasid)
1967 {
1968 	struct dma_pte *pgd = domain->pgd;
1969 	int agaw, level;
1970 	int flags = 0;
1971 
1972 	/*
1973 	 * Skip top levels of page tables for iommu which has
1974 	 * less agaw than default. Unnecessary for PT mode.
1975 	 */
1976 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1977 		pgd = phys_to_virt(dma_pte_addr(pgd));
1978 		if (!dma_pte_present(pgd))
1979 			return -ENOMEM;
1980 	}
1981 
1982 	level = agaw_to_level(agaw);
1983 	if (level != 4 && level != 5)
1984 		return -EINVAL;
1985 
1986 	if (level == 5)
1987 		flags |= PASID_FLAG_FL5LP;
1988 
1989 	if (domain->force_snooping)
1990 		flags |= PASID_FLAG_PAGE_SNOOP;
1991 
1992 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
1993 					     domain_id_iommu(domain, iommu),
1994 					     flags);
1995 }
1996 
1997 static bool dev_is_real_dma_subdevice(struct device *dev)
1998 {
1999 	return dev && dev_is_pci(dev) &&
2000 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2001 }
2002 
2003 static int iommu_domain_identity_map(struct dmar_domain *domain,
2004 				     unsigned long first_vpfn,
2005 				     unsigned long last_vpfn)
2006 {
2007 	/*
2008 	 * RMRR range might have overlap with physical memory range,
2009 	 * clear it first
2010 	 */
2011 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2012 
2013 	return __domain_mapping(domain, first_vpfn,
2014 				first_vpfn, last_vpfn - first_vpfn + 1,
2015 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2016 }
2017 
2018 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2019 
2020 static int __init si_domain_init(int hw)
2021 {
2022 	struct dmar_rmrr_unit *rmrr;
2023 	struct device *dev;
2024 	int i, nid, ret;
2025 
2026 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2027 	if (!si_domain)
2028 		return -EFAULT;
2029 
2030 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2031 		domain_exit(si_domain);
2032 		si_domain = NULL;
2033 		return -EFAULT;
2034 	}
2035 
2036 	if (hw)
2037 		return 0;
2038 
2039 	for_each_online_node(nid) {
2040 		unsigned long start_pfn, end_pfn;
2041 		int i;
2042 
2043 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2044 			ret = iommu_domain_identity_map(si_domain,
2045 					mm_to_dma_pfn_start(start_pfn),
2046 					mm_to_dma_pfn_end(end_pfn-1));
2047 			if (ret)
2048 				return ret;
2049 		}
2050 	}
2051 
2052 	/*
2053 	 * Identity map the RMRRs so that devices with RMRRs could also use
2054 	 * the si_domain.
2055 	 */
2056 	for_each_rmrr_units(rmrr) {
2057 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2058 					  i, dev) {
2059 			unsigned long long start = rmrr->base_address;
2060 			unsigned long long end = rmrr->end_address;
2061 
2062 			if (WARN_ON(end < start ||
2063 				    end >> agaw_to_width(si_domain->agaw)))
2064 				continue;
2065 
2066 			ret = iommu_domain_identity_map(si_domain,
2067 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2068 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2069 			if (ret)
2070 				return ret;
2071 		}
2072 	}
2073 
2074 	return 0;
2075 }
2076 
2077 static int dmar_domain_attach_device(struct dmar_domain *domain,
2078 				     struct device *dev)
2079 {
2080 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2081 	struct intel_iommu *iommu = info->iommu;
2082 	unsigned long flags;
2083 	int ret;
2084 
2085 	ret = domain_attach_iommu(domain, iommu);
2086 	if (ret)
2087 		return ret;
2088 
2089 	info->domain = domain;
2090 	spin_lock_irqsave(&domain->lock, flags);
2091 	list_add(&info->link, &domain->devices);
2092 	spin_unlock_irqrestore(&domain->lock, flags);
2093 
2094 	if (dev_is_real_dma_subdevice(dev))
2095 		return 0;
2096 
2097 	if (!sm_supported(iommu))
2098 		ret = domain_context_mapping(domain, dev);
2099 	else if (hw_pass_through && domain_type_is_si(domain))
2100 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2101 	else if (domain->use_first_level)
2102 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2103 	else
2104 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2105 
2106 	if (ret)
2107 		goto out_block_translation;
2108 
2109 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2110 		iommu_enable_pci_caps(info);
2111 
2112 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
2113 	if (ret)
2114 		goto out_block_translation;
2115 
2116 	return 0;
2117 
2118 out_block_translation:
2119 	device_block_translation(dev);
2120 	return ret;
2121 }
2122 
2123 /**
2124  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2125  * is relaxable (ie. is allowed to be not enforced under some conditions)
2126  * @dev: device handle
2127  *
2128  * We assume that PCI USB devices with RMRRs have them largely
2129  * for historical reasons and that the RMRR space is not actively used post
2130  * boot.  This exclusion may change if vendors begin to abuse it.
2131  *
2132  * The same exception is made for graphics devices, with the requirement that
2133  * any use of the RMRR regions will be torn down before assigning the device
2134  * to a guest.
2135  *
2136  * Return: true if the RMRR is relaxable, false otherwise
2137  */
2138 static bool device_rmrr_is_relaxable(struct device *dev)
2139 {
2140 	struct pci_dev *pdev;
2141 
2142 	if (!dev_is_pci(dev))
2143 		return false;
2144 
2145 	pdev = to_pci_dev(dev);
2146 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2147 		return true;
2148 	else
2149 		return false;
2150 }
2151 
2152 static int device_def_domain_type(struct device *dev)
2153 {
2154 	if (dev_is_pci(dev)) {
2155 		struct pci_dev *pdev = to_pci_dev(dev);
2156 
2157 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2158 			return IOMMU_DOMAIN_IDENTITY;
2159 	}
2160 
2161 	return 0;
2162 }
2163 
2164 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2165 {
2166 	/*
2167 	 * Start from the sane iommu hardware state.
2168 	 * If the queued invalidation is already initialized by us
2169 	 * (for example, while enabling interrupt-remapping) then
2170 	 * we got the things already rolling from a sane state.
2171 	 */
2172 	if (!iommu->qi) {
2173 		/*
2174 		 * Clear any previous faults.
2175 		 */
2176 		dmar_fault(-1, iommu);
2177 		/*
2178 		 * Disable queued invalidation if supported and already enabled
2179 		 * before OS handover.
2180 		 */
2181 		dmar_disable_qi(iommu);
2182 	}
2183 
2184 	if (dmar_enable_qi(iommu)) {
2185 		/*
2186 		 * Queued Invalidate not enabled, use Register Based Invalidate
2187 		 */
2188 		iommu->flush.flush_context = __iommu_flush_context;
2189 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2190 		pr_info("%s: Using Register based invalidation\n",
2191 			iommu->name);
2192 	} else {
2193 		iommu->flush.flush_context = qi_flush_context;
2194 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2195 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2196 	}
2197 }
2198 
/*
 * Copy the context table(s) referenced by one old root entry.
 *
 * @iommu:  IOMMU whose domain-id bitmap and copied-context bitmap are updated
 * @old_re: root entry for @bus, read from the old root table
 * @tbl:    array receiving the newly allocated context table pages
 * @bus:    bus number the root entry belongs to
 * @ext:    true when each bus uses two context tables; entries are then
 *          twice as wide (idx = devfn * 2) and devfn 0x80 starts the
 *          second (upper) table
 *
 * Every present entry of the old table is copied into a freshly allocated
 * page, its domain id is marked as used (if within cap_ndoms()) and the
 * (bus, devfn) pair is flagged as copied. New pages are stored at
 * tbl[bus] (!ext) or tbl[bus * 2 (+ pos)] (ext).
 *
 * Returns 0 on success, also when the root entry carries no context-table
 * pointer, and -ENOMEM when memremap() or the page allocation fails.
 *
 * NOTE(review): in the @ext case with no lower table but an upper one,
 * 'pos' is still 0 when the final store runs, so the upper table appears
 * to land in the lower slot tbl[tbl_idx] - confirm whether that is
 * intended.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	/* Work on a private copy of the old root entry */
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/* idx wraps to 0 at the start of each context table */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			/* Lower table pointer for devfn < 0x80, upper one otherwise */
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					/* The loop increment turns 0x7f into 0x80 */
					devfn = 0x7f;
					continue;
				} else {
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/* Reserve the domain id used by the old entry */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	/* Store the last table that was being filled */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2282 
2283 static int copy_translation_tables(struct intel_iommu *iommu)
2284 {
2285 	struct context_entry **ctxt_tbls;
2286 	struct root_entry *old_rt;
2287 	phys_addr_t old_rt_phys;
2288 	int ctxt_table_entries;
2289 	u64 rtaddr_reg;
2290 	int bus, ret;
2291 	bool new_ext, ext;
2292 
2293 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2294 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2295 	new_ext    = !!sm_supported(iommu);
2296 
2297 	/*
2298 	 * The RTT bit can only be changed when translation is disabled,
2299 	 * but disabling translation means to open a window for data
2300 	 * corruption. So bail out and don't copy anything if we would
2301 	 * have to change the bit.
2302 	 */
2303 	if (new_ext != ext)
2304 		return -EINVAL;
2305 
2306 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2307 	if (!iommu->copied_tables)
2308 		return -ENOMEM;
2309 
2310 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2311 	if (!old_rt_phys)
2312 		return -EINVAL;
2313 
2314 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2315 	if (!old_rt)
2316 		return -ENOMEM;
2317 
2318 	/* This is too big for the stack - allocate it from slab */
2319 	ctxt_table_entries = ext ? 512 : 256;
2320 	ret = -ENOMEM;
2321 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2322 	if (!ctxt_tbls)
2323 		goto out_unmap;
2324 
2325 	for (bus = 0; bus < 256; bus++) {
2326 		ret = copy_context_table(iommu, &old_rt[bus],
2327 					 ctxt_tbls, bus, ext);
2328 		if (ret) {
2329 			pr_err("%s: Failed to copy context table for bus %d\n",
2330 				iommu->name, bus);
2331 			continue;
2332 		}
2333 	}
2334 
2335 	spin_lock(&iommu->lock);
2336 
2337 	/* Context tables are copied, now write them to the root_entry table */
2338 	for (bus = 0; bus < 256; bus++) {
2339 		int idx = ext ? bus * 2 : bus;
2340 		u64 val;
2341 
2342 		if (ctxt_tbls[idx]) {
2343 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2344 			iommu->root_entry[bus].lo = val;
2345 		}
2346 
2347 		if (!ext || !ctxt_tbls[idx + 1])
2348 			continue;
2349 
2350 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2351 		iommu->root_entry[bus].hi = val;
2352 	}
2353 
2354 	spin_unlock(&iommu->lock);
2355 
2356 	kfree(ctxt_tbls);
2357 
2358 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2359 
2360 	ret = 0;
2361 
2362 out_unmap:
2363 	memunmap(old_rt);
2364 
2365 	return ret;
2366 }
2367 
/*
 * Boot-time bring-up of all DMAR units.
 *
 * After the static capability audit, each non-ignored IOMMU gets its
 * invalidation interface, domain-id bookkeeping and root entry table set
 * up. If translation was left enabled, it is switched off unless this is a
 * kdump kernel, in which case the old translation tables are copied. The
 * root entries are then programmed, si_domain is initialised and, per
 * IOMMU, the page request queue (CONFIG_INTEL_IOMMU_SVM) and the fault
 * interrupt are set up.
 *
 * Returns 0 on success. On failure every active IOMMU is disabled and
 * freed, si_domain is released, and the error code is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/* Only a kdump kernel keeps pre-enabled translation alive */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* One IOMMU without pass-through disables it globally */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	/*
	 * NOTE(review): the loop below does not call
	 * iommu_enable_translation(); presumably translation is enabled by
	 * the caller - confirm.
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Common error exit: undo the per-IOMMU setup and drop si_domain */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2517 
2518 static void __init init_no_remapping_devices(void)
2519 {
2520 	struct dmar_drhd_unit *drhd;
2521 	struct device *dev;
2522 	int i;
2523 
2524 	for_each_drhd_unit(drhd) {
2525 		if (!drhd->include_all) {
2526 			for_each_active_dev_scope(drhd->devices,
2527 						  drhd->devices_cnt, i, dev)
2528 				break;
2529 			/* ignore DMAR unit if no devices exist */
2530 			if (i == drhd->devices_cnt)
2531 				drhd->ignored = 1;
2532 		}
2533 	}
2534 
2535 	for_each_active_drhd_unit(drhd) {
2536 		if (drhd->include_all)
2537 			continue;
2538 
2539 		for_each_active_dev_scope(drhd->devices,
2540 					  drhd->devices_cnt, i, dev)
2541 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2542 				break;
2543 		if (i < drhd->devices_cnt)
2544 			continue;
2545 
2546 		/* This IOMMU has *only* gfx devices. Either bypass it or
2547 		   set the gfx_mapped flag, as appropriate */
2548 		drhd->gfx_dedicated = 1;
2549 		if (disable_igfx_iommu)
2550 			drhd->ignored = 1;
2551 	}
2552 }
2553 
2554 #ifdef CONFIG_SUSPEND
2555 static int init_iommu_hw(void)
2556 {
2557 	struct dmar_drhd_unit *drhd;
2558 	struct intel_iommu *iommu = NULL;
2559 	int ret;
2560 
2561 	for_each_active_iommu(iommu, drhd) {
2562 		if (iommu->qi) {
2563 			ret = dmar_reenable_qi(iommu);
2564 			if (ret)
2565 				return ret;
2566 		}
2567 	}
2568 
2569 	for_each_iommu(iommu, drhd) {
2570 		if (drhd->ignored) {
2571 			/*
2572 			 * we always have to disable PMRs or DMA may fail on
2573 			 * this device
2574 			 */
2575 			if (force_on)
2576 				iommu_disable_protect_mem_regions(iommu);
2577 			continue;
2578 		}
2579 
2580 		iommu_flush_write_buffer(iommu);
2581 		iommu_set_root_entry(iommu);
2582 		iommu_enable_translation(iommu);
2583 		iommu_disable_protect_mem_regions(iommu);
2584 	}
2585 
2586 	return 0;
2587 }
2588 
2589 static void iommu_flush_all(void)
2590 {
2591 	struct dmar_drhd_unit *drhd;
2592 	struct intel_iommu *iommu;
2593 
2594 	for_each_active_iommu(iommu, drhd) {
2595 		iommu->flush.flush_context(iommu, 0, 0, 0,
2596 					   DMA_CCMD_GLOBAL_INVL);
2597 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2598 					 DMA_TLB_GLOBAL_FLUSH);
2599 	}
2600 }
2601 
2602 static int iommu_suspend(void)
2603 {
2604 	struct dmar_drhd_unit *drhd;
2605 	struct intel_iommu *iommu = NULL;
2606 	unsigned long flag;
2607 
2608 	iommu_flush_all();
2609 
2610 	for_each_active_iommu(iommu, drhd) {
2611 		iommu_disable_translation(iommu);
2612 
2613 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2614 
2615 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2616 			readl(iommu->reg + DMAR_FECTL_REG);
2617 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2618 			readl(iommu->reg + DMAR_FEDATA_REG);
2619 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2620 			readl(iommu->reg + DMAR_FEADDR_REG);
2621 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2622 			readl(iommu->reg + DMAR_FEUADDR_REG);
2623 
2624 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2625 	}
2626 	return 0;
2627 }
2628 
2629 static void iommu_resume(void)
2630 {
2631 	struct dmar_drhd_unit *drhd;
2632 	struct intel_iommu *iommu = NULL;
2633 	unsigned long flag;
2634 
2635 	if (init_iommu_hw()) {
2636 		if (force_on)
2637 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2638 		else
2639 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2640 		return;
2641 	}
2642 
2643 	for_each_active_iommu(iommu, drhd) {
2644 
2645 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2646 
2647 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2648 			iommu->reg + DMAR_FECTL_REG);
2649 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2650 			iommu->reg + DMAR_FEDATA_REG);
2651 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2652 			iommu->reg + DMAR_FEADDR_REG);
2653 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2654 			iommu->reg + DMAR_FEUADDR_REG);
2655 
2656 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2657 	}
2658 }
2659 
/* Suspend/resume hooks, registered by init_iommu_pm_ops() */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
2664 
/* Register the IOMMU suspend/resume callbacks as syscore ops. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2669 
2670 #else
/* No-op stub for builds without CONFIG_SUSPEND: nothing to register. */
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2673 
2674 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2675 {
2676 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2677 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2678 	    rmrr->end_address <= rmrr->base_address ||
2679 	    arch_rmrr_sanity_check(rmrr))
2680 		return -EINVAL;
2681 
2682 	return 0;
2683 }
2684 
2685 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2686 {
2687 	struct acpi_dmar_reserved_memory *rmrr;
2688 	struct dmar_rmrr_unit *rmrru;
2689 
2690 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2691 	if (rmrr_sanity_check(rmrr)) {
2692 		pr_warn(FW_BUG
2693 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2694 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2695 			   rmrr->base_address, rmrr->end_address,
2696 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2697 			   dmi_get_system_info(DMI_BIOS_VERSION),
2698 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2699 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2700 	}
2701 
2702 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2703 	if (!rmrru)
2704 		goto out;
2705 
2706 	rmrru->hdr = header;
2707 
2708 	rmrru->base_address = rmrr->base_address;
2709 	rmrru->end_address = rmrr->end_address;
2710 
2711 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2712 				((void *)rmrr) + rmrr->header.length,
2713 				&rmrru->devices_cnt);
2714 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2715 		goto free_rmrru;
2716 
2717 	list_add(&rmrru->list, &dmar_rmrr_units);
2718 
2719 	return 0;
2720 free_rmrru:
2721 	kfree(rmrru);
2722 out:
2723 	return -ENOMEM;
2724 }
2725 
2726 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2727 {
2728 	struct dmar_atsr_unit *atsru;
2729 	struct acpi_dmar_atsr *tmp;
2730 
2731 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2732 				dmar_rcu_check()) {
2733 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2734 		if (atsr->segment != tmp->segment)
2735 			continue;
2736 		if (atsr->header.length != tmp->header.length)
2737 			continue;
2738 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2739 			return atsru;
2740 	}
2741 
2742 	return NULL;
2743 }
2744 
2745 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2746 {
2747 	struct acpi_dmar_atsr *atsr;
2748 	struct dmar_atsr_unit *atsru;
2749 
2750 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2751 		return 0;
2752 
2753 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2754 	atsru = dmar_find_atsr(atsr);
2755 	if (atsru)
2756 		return 0;
2757 
2758 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2759 	if (!atsru)
2760 		return -ENOMEM;
2761 
2762 	/*
2763 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2764 	 * copy the memory content because the memory buffer will be freed
2765 	 * on return.
2766 	 */
2767 	atsru->hdr = (void *)(atsru + 1);
2768 	memcpy(atsru->hdr, hdr, hdr->length);
2769 	atsru->include_all = atsr->flags & 0x1;
2770 	if (!atsru->include_all) {
2771 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2772 				(void *)atsr + atsr->header.length,
2773 				&atsru->devices_cnt);
2774 		if (atsru->devices_cnt && atsru->devices == NULL) {
2775 			kfree(atsru);
2776 			return -ENOMEM;
2777 		}
2778 	}
2779 
2780 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2781 
2782 	return 0;
2783 }
2784 
/* Release the device scope of @atsru and then the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
2790 
2791 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2792 {
2793 	struct acpi_dmar_atsr *atsr;
2794 	struct dmar_atsr_unit *atsru;
2795 
2796 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2797 	atsru = dmar_find_atsr(atsr);
2798 	if (atsru) {
2799 		list_del_rcu(&atsru->list);
2800 		synchronize_rcu();
2801 		intel_iommu_free_atsr(atsru);
2802 	}
2803 
2804 	return 0;
2805 }
2806 
2807 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2808 {
2809 	int i;
2810 	struct device *dev;
2811 	struct acpi_dmar_atsr *atsr;
2812 	struct dmar_atsr_unit *atsru;
2813 
2814 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2815 	atsru = dmar_find_atsr(atsr);
2816 	if (!atsru)
2817 		return 0;
2818 
2819 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2820 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2821 					  i, dev)
2822 			return -EBUSY;
2823 	}
2824 
2825 	return 0;
2826 }
2827 
2828 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2829 {
2830 	struct dmar_satc_unit *satcu;
2831 	struct acpi_dmar_satc *tmp;
2832 
2833 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2834 				dmar_rcu_check()) {
2835 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2836 		if (satc->segment != tmp->segment)
2837 			continue;
2838 		if (satc->header.length != tmp->header.length)
2839 			continue;
2840 		if (memcmp(satc, tmp, satc->header.length) == 0)
2841 			return satcu;
2842 	}
2843 
2844 	return NULL;
2845 }
2846 
2847 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2848 {
2849 	struct acpi_dmar_satc *satc;
2850 	struct dmar_satc_unit *satcu;
2851 
2852 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2853 		return 0;
2854 
2855 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2856 	satcu = dmar_find_satc(satc);
2857 	if (satcu)
2858 		return 0;
2859 
2860 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2861 	if (!satcu)
2862 		return -ENOMEM;
2863 
2864 	satcu->hdr = (void *)(satcu + 1);
2865 	memcpy(satcu->hdr, hdr, hdr->length);
2866 	satcu->atc_required = satc->flags & 0x1;
2867 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2868 					      (void *)satc + satc->header.length,
2869 					      &satcu->devices_cnt);
2870 	if (satcu->devices_cnt && !satcu->devices) {
2871 		kfree(satcu);
2872 		return -ENOMEM;
2873 	}
2874 	list_add_rcu(&satcu->list, &dmar_satc_units);
2875 
2876 	return 0;
2877 }
2878 
2879 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2880 {
2881 	int sp, ret;
2882 	struct intel_iommu *iommu = dmaru->iommu;
2883 
2884 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2885 	if (ret)
2886 		goto out;
2887 
2888 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
2889 		pr_warn("%s: Doesn't support hardware pass through.\n",
2890 			iommu->name);
2891 		return -ENXIO;
2892 	}
2893 
2894 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2895 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2896 		pr_warn("%s: Doesn't support large page.\n",
2897 			iommu->name);
2898 		return -ENXIO;
2899 	}
2900 
2901 	/*
2902 	 * Disable translation if already enabled prior to OS handover.
2903 	 */
2904 	if (iommu->gcmd & DMA_GCMD_TE)
2905 		iommu_disable_translation(iommu);
2906 
2907 	ret = iommu_init_domains(iommu);
2908 	if (ret == 0)
2909 		ret = iommu_alloc_root_entry(iommu);
2910 	if (ret)
2911 		goto out;
2912 
2913 	intel_svm_check(iommu);
2914 
2915 	if (dmaru->ignored) {
2916 		/*
2917 		 * we always have to disable PMRs or DMA may fail on this device
2918 		 */
2919 		if (force_on)
2920 			iommu_disable_protect_mem_regions(iommu);
2921 		return 0;
2922 	}
2923 
2924 	intel_iommu_init_qi(iommu);
2925 	iommu_flush_write_buffer(iommu);
2926 
2927 #ifdef CONFIG_INTEL_IOMMU_SVM
2928 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2929 		ret = intel_svm_enable_prq(iommu);
2930 		if (ret)
2931 			goto disable_iommu;
2932 	}
2933 #endif
2934 	ret = dmar_set_interrupt(iommu);
2935 	if (ret)
2936 		goto disable_iommu;
2937 
2938 	iommu_set_root_entry(iommu);
2939 	iommu_enable_translation(iommu);
2940 
2941 	iommu_disable_protect_mem_regions(iommu);
2942 	return 0;
2943 
2944 disable_iommu:
2945 	disable_dmar_iommu(iommu);
2946 out:
2947 	free_dmar_iommu(iommu);
2948 	return ret;
2949 }
2950 
2951 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2952 {
2953 	int ret = 0;
2954 	struct intel_iommu *iommu = dmaru->iommu;
2955 
2956 	if (!intel_iommu_enabled)
2957 		return 0;
2958 	if (iommu == NULL)
2959 		return -EINVAL;
2960 
2961 	if (insert) {
2962 		ret = intel_iommu_add(dmaru);
2963 	} else {
2964 		disable_dmar_iommu(iommu);
2965 		free_dmar_iommu(iommu);
2966 	}
2967 
2968 	return ret;
2969 }
2970 
2971 static void intel_iommu_free_dmars(void)
2972 {
2973 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2974 	struct dmar_atsr_unit *atsru, *atsr_n;
2975 	struct dmar_satc_unit *satcu, *satc_n;
2976 
2977 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2978 		list_del(&rmrru->list);
2979 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2980 		kfree(rmrru);
2981 	}
2982 
2983 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2984 		list_del(&atsru->list);
2985 		intel_iommu_free_atsr(atsru);
2986 	}
2987 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2988 		list_del(&satcu->list);
2989 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2990 		kfree(satcu);
2991 	}
2992 }
2993 
2994 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2995 {
2996 	struct dmar_satc_unit *satcu;
2997 	struct acpi_dmar_satc *satc;
2998 	struct device *tmp;
2999 	int i;
3000 
3001 	dev = pci_physfn(dev);
3002 	rcu_read_lock();
3003 
3004 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3005 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3006 		if (satc->segment != pci_domain_nr(dev->bus))
3007 			continue;
3008 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3009 			if (to_pci_dev(tmp) == dev)
3010 				goto out;
3011 	}
3012 	satcu = NULL;
3013 out:
3014 	rcu_read_unlock();
3015 	return satcu;
3016 }
3017 
3018 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3019 {
3020 	int i, ret = 1;
3021 	struct pci_bus *bus;
3022 	struct pci_dev *bridge = NULL;
3023 	struct device *tmp;
3024 	struct acpi_dmar_atsr *atsr;
3025 	struct dmar_atsr_unit *atsru;
3026 	struct dmar_satc_unit *satcu;
3027 
3028 	dev = pci_physfn(dev);
3029 	satcu = dmar_find_matched_satc_unit(dev);
3030 	if (satcu)
3031 		/*
3032 		 * This device supports ATS as it is in SATC table.
3033 		 * When IOMMU is in legacy mode, enabling ATS is done
3034 		 * automatically by HW for the device that requires
3035 		 * ATS, hence OS should not enable this device ATS
3036 		 * to avoid duplicated TLB invalidation.
3037 		 */
3038 		return !(satcu->atc_required && !sm_supported(iommu));
3039 
3040 	for (bus = dev->bus; bus; bus = bus->parent) {
3041 		bridge = bus->self;
3042 		/* If it's an integrated device, allow ATS */
3043 		if (!bridge)
3044 			return 1;
3045 		/* Connected via non-PCIe: no ATS */
3046 		if (!pci_is_pcie(bridge) ||
3047 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3048 			return 0;
3049 		/* If we found the root port, look it up in the ATSR */
3050 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3051 			break;
3052 	}
3053 
3054 	rcu_read_lock();
3055 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3056 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3057 		if (atsr->segment != pci_domain_nr(dev->bus))
3058 			continue;
3059 
3060 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3061 			if (tmp == &bridge->dev)
3062 				goto out;
3063 
3064 		if (atsru->include_all)
3065 			goto out;
3066 	}
3067 	ret = 0;
3068 out:
3069 	rcu_read_unlock();
3070 
3071 	return ret;
3072 }
3073 
3074 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3075 {
3076 	int ret;
3077 	struct dmar_rmrr_unit *rmrru;
3078 	struct dmar_atsr_unit *atsru;
3079 	struct dmar_satc_unit *satcu;
3080 	struct acpi_dmar_atsr *atsr;
3081 	struct acpi_dmar_reserved_memory *rmrr;
3082 	struct acpi_dmar_satc *satc;
3083 
3084 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3085 		return 0;
3086 
3087 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3088 		rmrr = container_of(rmrru->hdr,
3089 				    struct acpi_dmar_reserved_memory, header);
3090 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3091 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3092 				((void *)rmrr) + rmrr->header.length,
3093 				rmrr->segment, rmrru->devices,
3094 				rmrru->devices_cnt);
3095 			if (ret < 0)
3096 				return ret;
3097 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3098 			dmar_remove_dev_scope(info, rmrr->segment,
3099 				rmrru->devices, rmrru->devices_cnt);
3100 		}
3101 	}
3102 
3103 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3104 		if (atsru->include_all)
3105 			continue;
3106 
3107 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3108 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3109 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3110 					(void *)atsr + atsr->header.length,
3111 					atsr->segment, atsru->devices,
3112 					atsru->devices_cnt);
3113 			if (ret > 0)
3114 				break;
3115 			else if (ret < 0)
3116 				return ret;
3117 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3118 			if (dmar_remove_dev_scope(info, atsr->segment,
3119 					atsru->devices, atsru->devices_cnt))
3120 				break;
3121 		}
3122 	}
3123 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3124 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3125 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3126 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3127 					(void *)satc + satc->header.length,
3128 					satc->segment, satcu->devices,
3129 					satcu->devices_cnt);
3130 			if (ret > 0)
3131 				break;
3132 			else if (ret < 0)
3133 				return ret;
3134 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3135 			if (dmar_remove_dev_scope(info, satc->segment,
3136 					satcu->devices, satcu->devices_cnt))
3137 				break;
3138 		}
3139 	}
3140 
3141 	return 0;
3142 }
3143 
3144 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3145 				       unsigned long val, void *v)
3146 {
3147 	struct memory_notify *mhp = v;
3148 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3149 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3150 			mhp->nr_pages - 1);
3151 
3152 	switch (val) {
3153 	case MEM_GOING_ONLINE:
3154 		if (iommu_domain_identity_map(si_domain,
3155 					      start_vpfn, last_vpfn)) {
3156 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3157 				start_vpfn, last_vpfn);
3158 			return NOTIFY_BAD;
3159 		}
3160 		break;
3161 
3162 	case MEM_OFFLINE:
3163 	case MEM_CANCEL_ONLINE:
3164 		{
3165 			LIST_HEAD(freelist);
3166 
3167 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3168 			iommu_put_pages_list(&freelist);
3169 		}
3170 		break;
3171 	}
3172 
3173 	return NOTIFY_OK;
3174 }
3175 
/*
 * Notifier block that routes memory hotplug events to
 * intel_iommu_memory_notifier().  intel_iommu_init() registers it only when
 * si_domain exists and hardware pass-through is not in use.
 */
static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
3180 
3181 static void intel_disable_iommus(void)
3182 {
3183 	struct intel_iommu *iommu = NULL;
3184 	struct dmar_drhd_unit *drhd;
3185 
3186 	for_each_iommu(iommu, drhd)
3187 		iommu_disable_translation(iommu);
3188 }
3189 
3190 void intel_iommu_shutdown(void)
3191 {
3192 	struct dmar_drhd_unit *drhd;
3193 	struct intel_iommu *iommu = NULL;
3194 
3195 	if (no_iommu || dmar_disabled)
3196 		return;
3197 
3198 	down_write(&dmar_global_lock);
3199 
3200 	/* Disable PMRs explicitly here. */
3201 	for_each_iommu(iommu, drhd)
3202 		iommu_disable_protect_mem_regions(iommu);
3203 
3204 	/* Make sure the IOMMUs are switched off */
3205 	intel_disable_iommus();
3206 
3207 	up_write(&dmar_global_lock);
3208 }
3209 
3210 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3211 {
3212 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3213 
3214 	return container_of(iommu_dev, struct intel_iommu, iommu);
3215 }
3216 
3217 static ssize_t version_show(struct device *dev,
3218 			    struct device_attribute *attr, char *buf)
3219 {
3220 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3221 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3222 	return sysfs_emit(buf, "%d:%d\n",
3223 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3224 }
3225 static DEVICE_ATTR_RO(version);
3226 
3227 static ssize_t address_show(struct device *dev,
3228 			    struct device_attribute *attr, char *buf)
3229 {
3230 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3231 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3232 }
3233 static DEVICE_ATTR_RO(address);
3234 
3235 static ssize_t cap_show(struct device *dev,
3236 			struct device_attribute *attr, char *buf)
3237 {
3238 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3239 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3240 }
3241 static DEVICE_ATTR_RO(cap);
3242 
3243 static ssize_t ecap_show(struct device *dev,
3244 			 struct device_attribute *attr, char *buf)
3245 {
3246 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3247 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3248 }
3249 static DEVICE_ATTR_RO(ecap);
3250 
3251 static ssize_t domains_supported_show(struct device *dev,
3252 				      struct device_attribute *attr, char *buf)
3253 {
3254 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3255 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3256 }
3257 static DEVICE_ATTR_RO(domains_supported);
3258 
3259 static ssize_t domains_used_show(struct device *dev,
3260 				 struct device_attribute *attr, char *buf)
3261 {
3262 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3263 	return sysfs_emit(buf, "%d\n",
3264 			  bitmap_weight(iommu->domain_ids,
3265 					cap_ndoms(iommu->cap)));
3266 }
3267 static DEVICE_ATTR_RO(domains_used);
3268 
/*
 * NULL-terminated list of the read-only sysfs attributes defined above;
 * referenced by intel_iommu_group.
 */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3278 
/* Attribute group named "intel-iommu" holding intel_iommu_attrs. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3283 
/*
 * NULL-terminated group list; intel_iommu_init() hands it to
 * iommu_device_sysfs_add() for each active IOMMU.
 */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3288 
3289 static bool has_external_pci(void)
3290 {
3291 	struct pci_dev *pdev = NULL;
3292 
3293 	for_each_pci_dev(pdev)
3294 		if (pdev->external_facing) {
3295 			pci_dev_put(pdev);
3296 			return true;
3297 		}
3298 
3299 	return false;
3300 }
3301 
3302 static int __init platform_optin_force_iommu(void)
3303 {
3304 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3305 		return 0;
3306 
3307 	if (no_iommu || dmar_disabled)
3308 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3309 
3310 	/*
3311 	 * If Intel-IOMMU is disabled by default, we will apply identity
3312 	 * map for all devices except those marked as being untrusted.
3313 	 */
3314 	if (dmar_disabled)
3315 		iommu_set_default_passthrough(false);
3316 
3317 	dmar_disabled = 0;
3318 	no_iommu = 0;
3319 
3320 	return 1;
3321 }
3322 
3323 static int __init probe_acpi_namespace_devices(void)
3324 {
3325 	struct dmar_drhd_unit *drhd;
3326 	/* To avoid a -Wunused-but-set-variable warning. */
3327 	struct intel_iommu *iommu __maybe_unused;
3328 	struct device *dev;
3329 	int i, ret = 0;
3330 
3331 	for_each_active_iommu(iommu, drhd) {
3332 		for_each_active_dev_scope(drhd->devices,
3333 					  drhd->devices_cnt, i, dev) {
3334 			struct acpi_device_physical_node *pn;
3335 			struct acpi_device *adev;
3336 
3337 			if (dev->bus != &acpi_bus_type)
3338 				continue;
3339 
3340 			adev = to_acpi_device(dev);
3341 			mutex_lock(&adev->physical_node_lock);
3342 			list_for_each_entry(pn,
3343 					    &adev->physical_node_list, node) {
3344 				ret = iommu_probe_device(pn->dev);
3345 				if (ret)
3346 					break;
3347 			}
3348 			mutex_unlock(&adev->physical_node_lock);
3349 
3350 			if (ret)
3351 				return ret;
3352 		}
3353 	}
3354 
3355 	return 0;
3356 }
3357 
3358 static __init int tboot_force_iommu(void)
3359 {
3360 	if (!tboot_enabled())
3361 		return 0;
3362 
3363 	if (no_iommu || dmar_disabled)
3364 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3365 
3366 	dmar_disabled = 0;
3367 	no_iommu = 0;
3368 
3369 	return 1;
3370 }
3371 
/*
 * Top-level initialization of the Intel IOMMU driver.
 *
 * Sequence, as visible below:
 *  1. decide whether the IOMMU must be forced on (tboot / platform opt-in);
 *  2. parse the DMAR table and device scopes under dmar_global_lock;
 *  3. register the DMAR bus notifier with the lock dropped;
 *  4. if the IOMMU is disabled, switch the hardware off and bail out;
 *  5. run init_dmars(), register PM ops, then add sysfs entries, register
 *     each active IOMMU with the core and register its PMU;
 *  6. optionally register the memory hotplug notifier, probe ACPI namespace
 *     devices, enable translation and disable the PMRs.
 *
 * Returns 0 on success.  On the error paths, intel_iommu_free_dmars() is
 * called and the function returns either the initial -ENODEV or the error
 * from init_dmars().  When force_on is set, failures in steps 2 and 5
 * panic instead of returning.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	/* Re-take the lock; every out_free_dmar exit expects it held for write. */
	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		/* ret is still -ENODEV here. */
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		/* NOTE(review): the return values of the next two calls are not checked. */
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}
	up_read(&dmar_global_lock);

	/* The hotplug notifier is only needed for a software identity map. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	/* A probe failure here is only warned about, not treated as fatal. */
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	/* Reached with dmar_global_lock held for writing. */
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3506 
3507 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3508 {
3509 	struct device_domain_info *info = opaque;
3510 
3511 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3512 	return 0;
3513 }
3514 
3515 /*
3516  * NB - intel-iommu lacks any sort of reference counting for the users of
3517  * dependent devices.  If multiple endpoints have intersecting dependent
3518  * devices, unbinding the driver from any one of them will possibly leave
3519  * the others unable to operate.
3520  */
3521 static void domain_context_clear(struct device_domain_info *info)
3522 {
3523 	if (!dev_is_pci(info->dev))
3524 		domain_context_clear_one(info, info->bus, info->devfn);
3525 
3526 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3527 			       &domain_context_clear_one_cb, info);
3528 }
3529 
3530 /*
3531  * Clear the page table pointer in context or pasid table entries so that
3532  * all DMA requests without PASID from the device are blocked. If the page
3533  * table has been set, clean up the data structures.
3534  */
3535 void device_block_translation(struct device *dev)
3536 {
3537 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3538 	struct intel_iommu *iommu = info->iommu;
3539 	unsigned long flags;
3540 
3541 	iommu_disable_pci_caps(info);
3542 	if (!dev_is_real_dma_subdevice(dev)) {
3543 		if (sm_supported(iommu))
3544 			intel_pasid_tear_down_entry(iommu, dev,
3545 						    IOMMU_NO_PASID, false);
3546 		else
3547 			domain_context_clear(info);
3548 	}
3549 
3550 	if (!info->domain)
3551 		return;
3552 
3553 	spin_lock_irqsave(&info->domain->lock, flags);
3554 	list_del(&info->link);
3555 	spin_unlock_irqrestore(&info->domain->lock, flags);
3556 
3557 	cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3558 	domain_detach_iommu(info->domain, iommu);
3559 	info->domain = NULL;
3560 }
3561 
3562 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3563 {
3564 	int adjust_width;
3565 
3566 	/* calculate AGAW */
3567 	domain->gaw = guest_width;
3568 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3569 	domain->agaw = width_to_agaw(adjust_width);
3570 
3571 	domain->iommu_coherency = false;
3572 	domain->iommu_superpage = 0;
3573 	domain->max_addr = 0;
3574 
3575 	/* always allocate the top pgd */
3576 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3577 	if (!domain->pgd)
3578 		return -ENOMEM;
3579 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3580 	return 0;
3581 }
3582 
/*
 * attach_dev handler of the blocking domain: blocks translation for @dev.
 * @domain is unused.  Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
3589 
/*
 * Static domain of type IOMMU_DOMAIN_BLOCKED.  Its only operation is
 * attach_dev, which blocks translation for the attached device.
 */
static struct iommu_domain blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev	= blocking_domain_attach_dev,
	}
};
3596 
3597 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3598 {
3599 	if (!intel_iommu_superpage)
3600 		return 0;
3601 
3602 	if (first_stage)
3603 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3604 
3605 	return fls(cap_super_page_val(iommu->cap));
3606 }
3607 
3608 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3609 {
3610 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3611 	struct intel_iommu *iommu = info->iommu;
3612 	struct dmar_domain *domain;
3613 	int addr_width;
3614 
3615 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3616 	if (!domain)
3617 		return ERR_PTR(-ENOMEM);
3618 
3619 	INIT_LIST_HEAD(&domain->devices);
3620 	INIT_LIST_HEAD(&domain->dev_pasids);
3621 	INIT_LIST_HEAD(&domain->cache_tags);
3622 	spin_lock_init(&domain->lock);
3623 	spin_lock_init(&domain->cache_lock);
3624 	xa_init(&domain->iommu_array);
3625 
3626 	domain->nid = dev_to_node(dev);
3627 	domain->has_iotlb_device = info->ats_enabled;
3628 	domain->use_first_level = first_stage;
3629 
3630 	/* calculate the address width */
3631 	addr_width = agaw_to_width(iommu->agaw);
3632 	if (addr_width > cap_mgaw(iommu->cap))
3633 		addr_width = cap_mgaw(iommu->cap);
3634 	domain->gaw = addr_width;
3635 	domain->agaw = iommu->agaw;
3636 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3637 
3638 	/* iommu memory access coherency */
3639 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3640 
3641 	/* pagesize bitmap */
3642 	domain->domain.pgsize_bitmap = SZ_4K;
3643 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3644 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3645 
3646 	/*
3647 	 * IOVA aperture: First-level translation restricts the input-address
3648 	 * to a canonical address (i.e., address bits 63:N have the same value
3649 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3650 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3651 	 */
3652 	domain->domain.geometry.force_aperture = true;
3653 	domain->domain.geometry.aperture_start = 0;
3654 	if (first_stage)
3655 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3656 	else
3657 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3658 
3659 	/* always allocate the top pgd */
3660 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3661 	if (!domain->pgd) {
3662 		kfree(domain);
3663 		return ERR_PTR(-ENOMEM);
3664 	}
3665 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3666 
3667 	return domain;
3668 }
3669 
3670 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3671 {
3672 	struct dmar_domain *dmar_domain;
3673 	struct iommu_domain *domain;
3674 
3675 	switch (type) {
3676 	case IOMMU_DOMAIN_DMA:
3677 	case IOMMU_DOMAIN_UNMANAGED:
3678 		dmar_domain = alloc_domain(type);
3679 		if (!dmar_domain) {
3680 			pr_err("Can't allocate dmar_domain\n");
3681 			return NULL;
3682 		}
3683 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3684 			pr_err("Domain initialization failed\n");
3685 			domain_exit(dmar_domain);
3686 			return NULL;
3687 		}
3688 
3689 		domain = &dmar_domain->domain;
3690 		domain->geometry.aperture_start = 0;
3691 		domain->geometry.aperture_end   =
3692 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3693 		domain->geometry.force_aperture = true;
3694 
3695 		return domain;
3696 	case IOMMU_DOMAIN_IDENTITY:
3697 		return &si_domain->domain;
3698 	default:
3699 		return NULL;
3700 	}
3701 
3702 	return NULL;
3703 }
3704 
3705 static struct iommu_domain *
3706 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3707 			      struct iommu_domain *parent,
3708 			      const struct iommu_user_data *user_data)
3709 {
3710 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3711 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3712 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3713 	struct intel_iommu *iommu = info->iommu;
3714 	struct dmar_domain *dmar_domain;
3715 	struct iommu_domain *domain;
3716 
3717 	/* Must be NESTING domain */
3718 	if (parent) {
3719 		if (!nested_supported(iommu) || flags)
3720 			return ERR_PTR(-EOPNOTSUPP);
3721 		return intel_nested_domain_alloc(parent, user_data);
3722 	}
3723 
3724 	if (flags &
3725 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3726 		return ERR_PTR(-EOPNOTSUPP);
3727 	if (nested_parent && !nested_supported(iommu))
3728 		return ERR_PTR(-EOPNOTSUPP);
3729 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3730 		return ERR_PTR(-EOPNOTSUPP);
3731 
3732 	/* Do not use first stage for user domain translation. */
3733 	dmar_domain = paging_domain_alloc(dev, false);
3734 	if (IS_ERR(dmar_domain))
3735 		return ERR_CAST(dmar_domain);
3736 	domain = &dmar_domain->domain;
3737 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3738 	domain->owner = &intel_iommu_ops;
3739 	domain->ops = intel_iommu_ops.default_domain_ops;
3740 
3741 	if (nested_parent) {
3742 		dmar_domain->nested_parent = true;
3743 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3744 		spin_lock_init(&dmar_domain->s1_lock);
3745 	}
3746 
3747 	if (dirty_tracking) {
3748 		if (dmar_domain->use_first_level) {
3749 			iommu_domain_free(domain);
3750 			return ERR_PTR(-EOPNOTSUPP);
3751 		}
3752 		domain->dirty_ops = &intel_dirty_ops;
3753 	}
3754 
3755 	return domain;
3756 }
3757 
3758 static void intel_iommu_domain_free(struct iommu_domain *domain)
3759 {
3760 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3761 
3762 	WARN_ON(dmar_domain->nested_parent &&
3763 		!list_empty(&dmar_domain->s1_domains));
3764 	if (domain != &si_domain->domain)
3765 		domain_exit(dmar_domain);
3766 }
3767 
3768 int prepare_domain_attach_device(struct iommu_domain *domain,
3769 				 struct device *dev)
3770 {
3771 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3772 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3773 	struct intel_iommu *iommu = info->iommu;
3774 	int addr_width;
3775 
3776 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3777 		return -EINVAL;
3778 
3779 	if (domain->dirty_ops && !ssads_supported(iommu))
3780 		return -EINVAL;
3781 
3782 	/* check if this iommu agaw is sufficient for max mapped address */
3783 	addr_width = agaw_to_width(iommu->agaw);
3784 	if (addr_width > cap_mgaw(iommu->cap))
3785 		addr_width = cap_mgaw(iommu->cap);
3786 
3787 	if (dmar_domain->max_addr > (1LL << addr_width))
3788 		return -EINVAL;
3789 	dmar_domain->gaw = addr_width;
3790 
3791 	/*
3792 	 * Knock out extra levels of page tables if necessary
3793 	 */
3794 	while (iommu->agaw < dmar_domain->agaw) {
3795 		struct dma_pte *pte;
3796 
3797 		pte = dmar_domain->pgd;
3798 		if (dma_pte_present(pte)) {
3799 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3800 			iommu_free_page(pte);
3801 		}
3802 		dmar_domain->agaw--;
3803 	}
3804 
3805 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3806 	    context_copied(iommu, info->bus, info->devfn))
3807 		return intel_pasid_setup_sm_context(dev);
3808 
3809 	return 0;
3810 }
3811 
3812 static int intel_iommu_attach_device(struct iommu_domain *domain,
3813 				     struct device *dev)
3814 {
3815 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3816 	int ret;
3817 
3818 	if (info->domain)
3819 		device_block_translation(dev);
3820 
3821 	ret = prepare_domain_attach_device(domain, dev);
3822 	if (ret)
3823 		return ret;
3824 
3825 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3826 }
3827 
3828 static int intel_iommu_map(struct iommu_domain *domain,
3829 			   unsigned long iova, phys_addr_t hpa,
3830 			   size_t size, int iommu_prot, gfp_t gfp)
3831 {
3832 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3833 	u64 max_addr;
3834 	int prot = 0;
3835 
3836 	if (iommu_prot & IOMMU_READ)
3837 		prot |= DMA_PTE_READ;
3838 	if (iommu_prot & IOMMU_WRITE)
3839 		prot |= DMA_PTE_WRITE;
3840 	if (dmar_domain->set_pte_snp)
3841 		prot |= DMA_PTE_SNP;
3842 
3843 	max_addr = iova + size;
3844 	if (dmar_domain->max_addr < max_addr) {
3845 		u64 end;
3846 
3847 		/* check if minimum agaw is sufficient for mapped address */
3848 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3849 		if (end < max_addr) {
3850 			pr_err("%s: iommu width (%d) is not "
3851 			       "sufficient for the mapped address (%llx)\n",
3852 			       __func__, dmar_domain->gaw, max_addr);
3853 			return -EFAULT;
3854 		}
3855 		dmar_domain->max_addr = max_addr;
3856 	}
3857 	/* Round up size to next multiple of PAGE_SIZE, if it and
3858 	   the low bits of hpa would take us onto the next page */
3859 	size = aligned_nrpages(hpa, size);
3860 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3861 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3862 }
3863 
3864 static int intel_iommu_map_pages(struct iommu_domain *domain,
3865 				 unsigned long iova, phys_addr_t paddr,
3866 				 size_t pgsize, size_t pgcount,
3867 				 int prot, gfp_t gfp, size_t *mapped)
3868 {
3869 	unsigned long pgshift = __ffs(pgsize);
3870 	size_t size = pgcount << pgshift;
3871 	int ret;
3872 
3873 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3874 		return -EINVAL;
3875 
3876 	if (!IS_ALIGNED(iova | paddr, pgsize))
3877 		return -EINVAL;
3878 
3879 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3880 	if (!ret && mapped)
3881 		*mapped = size;
3882 
3883 	return ret;
3884 }
3885 
3886 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3887 				unsigned long iova, size_t size,
3888 				struct iommu_iotlb_gather *gather)
3889 {
3890 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3891 	unsigned long start_pfn, last_pfn;
3892 	int level = 0;
3893 
3894 	/* Cope with horrid API which requires us to unmap more than the
3895 	   size argument if it happens to be a large-page mapping. */
3896 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3897 				     &level, GFP_ATOMIC)))
3898 		return 0;
3899 
3900 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3901 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3902 
3903 	start_pfn = iova >> VTD_PAGE_SHIFT;
3904 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3905 
3906 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3907 
3908 	if (dmar_domain->max_addr == iova + size)
3909 		dmar_domain->max_addr = iova;
3910 
3911 	/*
3912 	 * We do not use page-selective IOTLB invalidation in flush queue,
3913 	 * so there is no need to track page and sync iotlb.
3914 	 */
3915 	if (!iommu_iotlb_gather_queued(gather))
3916 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3917 
3918 	return size;
3919 }
3920 
3921 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3922 				      unsigned long iova,
3923 				      size_t pgsize, size_t pgcount,
3924 				      struct iommu_iotlb_gather *gather)
3925 {
3926 	unsigned long pgshift = __ffs(pgsize);
3927 	size_t size = pgcount << pgshift;
3928 
3929 	return intel_iommu_unmap(domain, iova, size, gather);
3930 }
3931 
3932 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3933 				 struct iommu_iotlb_gather *gather)
3934 {
3935 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3936 			      gather->end, list_empty(&gather->freelist));
3937 	iommu_put_pages_list(&gather->freelist);
3938 }
3939 
3940 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3941 					    dma_addr_t iova)
3942 {
3943 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3944 	struct dma_pte *pte;
3945 	int level = 0;
3946 	u64 phys = 0;
3947 
3948 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3949 			     GFP_ATOMIC);
3950 	if (pte && dma_pte_present(pte))
3951 		phys = dma_pte_addr(pte) +
3952 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3953 						VTD_PAGE_SHIFT) - 1));
3954 
3955 	return phys;
3956 }
3957 
3958 static bool domain_support_force_snooping(struct dmar_domain *domain)
3959 {
3960 	struct device_domain_info *info;
3961 	bool support = true;
3962 
3963 	assert_spin_locked(&domain->lock);
3964 	list_for_each_entry(info, &domain->devices, link) {
3965 		if (!ecap_sc_support(info->iommu->ecap)) {
3966 			support = false;
3967 			break;
3968 		}
3969 	}
3970 
3971 	return support;
3972 }
3973 
3974 static void domain_set_force_snooping(struct dmar_domain *domain)
3975 {
3976 	struct device_domain_info *info;
3977 
3978 	assert_spin_locked(&domain->lock);
3979 	/*
3980 	 * Second level page table supports per-PTE snoop control. The
3981 	 * iommu_map() interface will handle this by setting SNP bit.
3982 	 */
3983 	if (!domain->use_first_level) {
3984 		domain->set_pte_snp = true;
3985 		return;
3986 	}
3987 
3988 	list_for_each_entry(info, &domain->devices, link)
3989 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3990 						     IOMMU_NO_PASID);
3991 }
3992 
3993 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3994 {
3995 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3996 	unsigned long flags;
3997 
3998 	if (dmar_domain->force_snooping)
3999 		return true;
4000 
4001 	spin_lock_irqsave(&dmar_domain->lock, flags);
4002 	if (!domain_support_force_snooping(dmar_domain) ||
4003 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4004 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4005 		return false;
4006 	}
4007 
4008 	domain_set_force_snooping(dmar_domain);
4009 	dmar_domain->force_snooping = true;
4010 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4011 
4012 	return true;
4013 }
4014 
4015 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4016 {
4017 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4018 
4019 	switch (cap) {
4020 	case IOMMU_CAP_CACHE_COHERENCY:
4021 	case IOMMU_CAP_DEFERRED_FLUSH:
4022 		return true;
4023 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4024 		return dmar_platform_optin();
4025 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4026 		return ecap_sc_support(info->iommu->ecap);
4027 	case IOMMU_CAP_DIRTY_TRACKING:
4028 		return ssads_supported(info->iommu);
4029 	default:
4030 		return false;
4031 	}
4032 }
4033 
4034 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4035 {
4036 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4037 	struct device_domain_info *info;
4038 	struct intel_iommu *iommu;
4039 	u8 bus, devfn;
4040 	int ret;
4041 
4042 	iommu = device_lookup_iommu(dev, &bus, &devfn);
4043 	if (!iommu || !iommu->iommu.ops)
4044 		return ERR_PTR(-ENODEV);
4045 
4046 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4047 	if (!info)
4048 		return ERR_PTR(-ENOMEM);
4049 
4050 	if (dev_is_real_dma_subdevice(dev)) {
4051 		info->bus = pdev->bus->number;
4052 		info->devfn = pdev->devfn;
4053 		info->segment = pci_domain_nr(pdev->bus);
4054 	} else {
4055 		info->bus = bus;
4056 		info->devfn = devfn;
4057 		info->segment = iommu->segment;
4058 	}
4059 
4060 	info->dev = dev;
4061 	info->iommu = iommu;
4062 	if (dev_is_pci(dev)) {
4063 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4064 		    pci_ats_supported(pdev) &&
4065 		    dmar_ats_supported(pdev, iommu)) {
4066 			info->ats_supported = 1;
4067 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4068 
4069 			/*
4070 			 * For IOMMU that supports device IOTLB throttling
4071 			 * (DIT), we assign PFSID to the invalidation desc
4072 			 * of a VF such that IOMMU HW can gauge queue depth
4073 			 * at PF level. If DIT is not set, PFSID will be
4074 			 * treated as reserved, which should be set to 0.
4075 			 */
4076 			if (ecap_dit(iommu->ecap))
4077 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4078 			info->ats_qdep = pci_ats_queue_depth(pdev);
4079 		}
4080 		if (sm_supported(iommu)) {
4081 			if (pasid_supported(iommu)) {
4082 				int features = pci_pasid_features(pdev);
4083 
4084 				if (features >= 0)
4085 					info->pasid_supported = features | 1;
4086 			}
4087 
4088 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4089 			    pci_pri_supported(pdev))
4090 				info->pri_supported = 1;
4091 		}
4092 	}
4093 
4094 	dev_iommu_priv_set(dev, info);
4095 	if (pdev && pci_ats_supported(pdev)) {
4096 		ret = device_rbtree_insert(iommu, info);
4097 		if (ret)
4098 			goto free;
4099 	}
4100 
4101 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4102 		ret = intel_pasid_alloc_table(dev);
4103 		if (ret) {
4104 			dev_err(dev, "PASID table allocation failed\n");
4105 			goto clear_rbtree;
4106 		}
4107 
4108 		if (!context_copied(iommu, info->bus, info->devfn)) {
4109 			ret = intel_pasid_setup_sm_context(dev);
4110 			if (ret)
4111 				goto free_table;
4112 		}
4113 	}
4114 
4115 	intel_iommu_debugfs_create_dev(info);
4116 
4117 	return &iommu->iommu;
4118 free_table:
4119 	intel_pasid_free_table(dev);
4120 clear_rbtree:
4121 	device_rbtree_remove(info);
4122 free:
4123 	kfree(info);
4124 
4125 	return ERR_PTR(ret);
4126 }
4127 
4128 static void intel_iommu_release_device(struct device *dev)
4129 {
4130 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4131 	struct intel_iommu *iommu = info->iommu;
4132 
4133 	mutex_lock(&iommu->iopf_lock);
4134 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4135 		device_rbtree_remove(info);
4136 	mutex_unlock(&iommu->iopf_lock);
4137 
4138 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4139 	    !context_copied(iommu, info->bus, info->devfn))
4140 		intel_pasid_teardown_sm_context(dev);
4141 
4142 	intel_pasid_free_table(dev);
4143 	intel_iommu_debugfs_remove_dev(info);
4144 	kfree(info);
4145 	set_dma_ops(dev, NULL);
4146 }
4147 
4148 static void intel_iommu_get_resv_regions(struct device *device,
4149 					 struct list_head *head)
4150 {
4151 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4152 	struct iommu_resv_region *reg;
4153 	struct dmar_rmrr_unit *rmrr;
4154 	struct device *i_dev;
4155 	int i;
4156 
4157 	rcu_read_lock();
4158 	for_each_rmrr_units(rmrr) {
4159 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4160 					  i, i_dev) {
4161 			struct iommu_resv_region *resv;
4162 			enum iommu_resv_type type;
4163 			size_t length;
4164 
4165 			if (i_dev != device &&
4166 			    !is_downstream_to_pci_bridge(device, i_dev))
4167 				continue;
4168 
4169 			length = rmrr->end_address - rmrr->base_address + 1;
4170 
4171 			type = device_rmrr_is_relaxable(device) ?
4172 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4173 
4174 			resv = iommu_alloc_resv_region(rmrr->base_address,
4175 						       length, prot, type,
4176 						       GFP_ATOMIC);
4177 			if (!resv)
4178 				break;
4179 
4180 			list_add_tail(&resv->list, head);
4181 		}
4182 	}
4183 	rcu_read_unlock();
4184 
4185 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4186 	if (dev_is_pci(device)) {
4187 		struct pci_dev *pdev = to_pci_dev(device);
4188 
4189 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4190 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4191 					IOMMU_RESV_DIRECT_RELAXABLE,
4192 					GFP_KERNEL);
4193 			if (reg)
4194 				list_add_tail(&reg->list, head);
4195 		}
4196 	}
4197 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4198 
4199 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4200 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4201 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4202 	if (!reg)
4203 		return;
4204 	list_add_tail(&reg->list, head);
4205 }
4206 
/*
 * Pick the IOMMU group for @dev: PCI devices go through
 * pci_device_group(), everything else through generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4213 
4214 static int intel_iommu_enable_sva(struct device *dev)
4215 {
4216 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4217 	struct intel_iommu *iommu;
4218 
4219 	if (!info || dmar_disabled)
4220 		return -EINVAL;
4221 
4222 	iommu = info->iommu;
4223 	if (!iommu)
4224 		return -EINVAL;
4225 
4226 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4227 		return -ENODEV;
4228 
4229 	if (!info->pasid_enabled || !info->ats_enabled)
4230 		return -EINVAL;
4231 
4232 	/*
4233 	 * Devices having device-specific I/O fault handling should not
4234 	 * support PCI/PRI. The IOMMU side has no means to check the
4235 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4236 	 * default that if the device driver enables SVA on a non-PRI
4237 	 * device, it will handle IOPF in its own way.
4238 	 */
4239 	if (!info->pri_supported)
4240 		return 0;
4241 
4242 	/* Devices supporting PRI should have it enabled. */
4243 	if (!info->pri_enabled)
4244 		return -EINVAL;
4245 
4246 	return 0;
4247 }
4248 
4249 static int context_flip_pri(struct device_domain_info *info, bool enable)
4250 {
4251 	struct intel_iommu *iommu = info->iommu;
4252 	u8 bus = info->bus, devfn = info->devfn;
4253 	struct context_entry *context;
4254 	u16 did;
4255 
4256 	spin_lock(&iommu->lock);
4257 	if (context_copied(iommu, bus, devfn)) {
4258 		spin_unlock(&iommu->lock);
4259 		return -EINVAL;
4260 	}
4261 
4262 	context = iommu_context_addr(iommu, bus, devfn, false);
4263 	if (!context || !context_present(context)) {
4264 		spin_unlock(&iommu->lock);
4265 		return -ENODEV;
4266 	}
4267 	did = context_domain_id(context);
4268 
4269 	if (enable)
4270 		context_set_sm_pre(context);
4271 	else
4272 		context_clear_sm_pre(context);
4273 
4274 	if (!ecap_coherent(iommu->ecap))
4275 		clflush_cache_range(context, sizeof(*context));
4276 	intel_context_flush_present(info, context, did, true);
4277 	spin_unlock(&iommu->lock);
4278 
4279 	return 0;
4280 }
4281 
4282 static int intel_iommu_enable_iopf(struct device *dev)
4283 {
4284 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4285 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4286 	struct intel_iommu *iommu;
4287 	int ret;
4288 
4289 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4290 		return -ENODEV;
4291 
4292 	if (info->pri_enabled)
4293 		return -EBUSY;
4294 
4295 	iommu = info->iommu;
4296 	if (!iommu)
4297 		return -EINVAL;
4298 
4299 	/* PASID is required in PRG Response Message. */
4300 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4301 		return -EINVAL;
4302 
4303 	ret = pci_reset_pri(pdev);
4304 	if (ret)
4305 		return ret;
4306 
4307 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4308 	if (ret)
4309 		return ret;
4310 
4311 	ret = context_flip_pri(info, true);
4312 	if (ret)
4313 		goto err_remove_device;
4314 
4315 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4316 	if (ret)
4317 		goto err_clear_pri;
4318 
4319 	info->pri_enabled = 1;
4320 
4321 	return 0;
4322 err_clear_pri:
4323 	context_flip_pri(info, false);
4324 err_remove_device:
4325 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4326 
4327 	return ret;
4328 }
4329 
4330 static int intel_iommu_disable_iopf(struct device *dev)
4331 {
4332 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4333 	struct intel_iommu *iommu = info->iommu;
4334 
4335 	if (!info->pri_enabled)
4336 		return -EINVAL;
4337 
4338 	/* Disable new PRI reception: */
4339 	context_flip_pri(info, false);
4340 
4341 	/*
4342 	 * Remove device from fault queue and acknowledge all outstanding
4343 	 * PRQs to the device:
4344 	 */
4345 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4346 
4347 	/*
4348 	 * PCIe spec states that by clearing PRI enable bit, the Page
4349 	 * Request Interface will not issue new page requests, but has
4350 	 * outstanding page requests that have been transmitted or are
4351 	 * queued for transmission. This is supposed to be called after
4352 	 * the device driver has stopped DMA, all PASIDs have been
4353 	 * unbound and the outstanding PRQs have been drained.
4354 	 */
4355 	pci_disable_pri(to_pci_dev(dev));
4356 	info->pri_enabled = 0;
4357 
4358 	return 0;
4359 }
4360 
4361 static int
4362 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4363 {
4364 	switch (feat) {
4365 	case IOMMU_DEV_FEAT_IOPF:
4366 		return intel_iommu_enable_iopf(dev);
4367 
4368 	case IOMMU_DEV_FEAT_SVA:
4369 		return intel_iommu_enable_sva(dev);
4370 
4371 	default:
4372 		return -ENODEV;
4373 	}
4374 }
4375 
4376 static int
4377 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4378 {
4379 	switch (feat) {
4380 	case IOMMU_DEV_FEAT_IOPF:
4381 		return intel_iommu_disable_iopf(dev);
4382 
4383 	case IOMMU_DEV_FEAT_SVA:
4384 		return 0;
4385 
4386 	default:
4387 		return -ENODEV;
4388 	}
4389 }
4390 
4391 static bool intel_iommu_is_attach_deferred(struct device *dev)
4392 {
4393 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4394 
4395 	return translation_pre_enabled(info->iommu) && !info->domain;
4396 }
4397 
4398 /*
4399  * Check that the device does not live on an external facing PCI port that is
4400  * marked as untrusted. Such devices should not be able to apply quirks and
4401  * thus not be able to bypass the IOMMU restrictions.
4402  */
4403 static bool risky_device(struct pci_dev *pdev)
4404 {
4405 	if (pdev->untrusted) {
4406 		pci_info(pdev,
4407 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4408 			 pdev->vendor, pdev->device);
4409 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4410 		return true;
4411 	}
4412 	return false;
4413 }
4414 
4415 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4416 				      unsigned long iova, size_t size)
4417 {
4418 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4419 
4420 	return 0;
4421 }
4422 
4423 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4424 					 struct iommu_domain *domain)
4425 {
4426 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4427 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4428 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4429 	struct intel_iommu *iommu = info->iommu;
4430 	unsigned long flags;
4431 
4432 	spin_lock_irqsave(&dmar_domain->lock, flags);
4433 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4434 		if (curr->dev == dev && curr->pasid == pasid) {
4435 			list_del(&curr->link_domain);
4436 			dev_pasid = curr;
4437 			break;
4438 		}
4439 	}
4440 	WARN_ON_ONCE(!dev_pasid);
4441 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4442 
4443 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4444 	domain_detach_iommu(dmar_domain, iommu);
4445 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4446 	kfree(dev_pasid);
4447 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4448 	intel_drain_pasid_prq(dev, pasid);
4449 }
4450 
4451 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4452 				     struct device *dev, ioasid_t pasid)
4453 {
4454 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4455 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4456 	struct intel_iommu *iommu = info->iommu;
4457 	struct dev_pasid_info *dev_pasid;
4458 	unsigned long flags;
4459 	int ret;
4460 
4461 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4462 		return -EOPNOTSUPP;
4463 
4464 	if (domain->dirty_ops)
4465 		return -EINVAL;
4466 
4467 	if (context_copied(iommu, info->bus, info->devfn))
4468 		return -EBUSY;
4469 
4470 	ret = prepare_domain_attach_device(domain, dev);
4471 	if (ret)
4472 		return ret;
4473 
4474 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4475 	if (!dev_pasid)
4476 		return -ENOMEM;
4477 
4478 	ret = domain_attach_iommu(dmar_domain, iommu);
4479 	if (ret)
4480 		goto out_free;
4481 
4482 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4483 	if (ret)
4484 		goto out_detach_iommu;
4485 
4486 	if (domain_type_is_si(dmar_domain))
4487 		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4488 	else if (dmar_domain->use_first_level)
4489 		ret = domain_setup_first_level(iommu, dmar_domain,
4490 					       dev, pasid);
4491 	else
4492 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4493 						     dev, pasid);
4494 	if (ret)
4495 		goto out_unassign_tag;
4496 
4497 	dev_pasid->dev = dev;
4498 	dev_pasid->pasid = pasid;
4499 	spin_lock_irqsave(&dmar_domain->lock, flags);
4500 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4501 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4502 
4503 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4504 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4505 
4506 	return 0;
4507 out_unassign_tag:
4508 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4509 out_detach_iommu:
4510 	domain_detach_iommu(dmar_domain, iommu);
4511 out_free:
4512 	kfree(dev_pasid);
4513 	return ret;
4514 }
4515 
4516 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4517 {
4518 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4519 	struct intel_iommu *iommu = info->iommu;
4520 	struct iommu_hw_info_vtd *vtd;
4521 
4522 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4523 	if (!vtd)
4524 		return ERR_PTR(-ENOMEM);
4525 
4526 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4527 	vtd->cap_reg = iommu->cap;
4528 	vtd->ecap_reg = iommu->ecap;
4529 	*length = sizeof(*vtd);
4530 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4531 	return vtd;
4532 }
4533 
4534 /*
4535  * Set dirty tracking for the device list of a domain. The caller must
4536  * hold the domain->lock when calling it.
4537  */
4538 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4539 {
4540 	struct device_domain_info *info;
4541 	int ret = 0;
4542 
4543 	list_for_each_entry(info, devices, link) {
4544 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4545 						       IOMMU_NO_PASID, enable);
4546 		if (ret)
4547 			break;
4548 	}
4549 
4550 	return ret;
4551 }
4552 
4553 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4554 					    bool enable)
4555 {
4556 	struct dmar_domain *s1_domain;
4557 	unsigned long flags;
4558 	int ret;
4559 
4560 	spin_lock(&domain->s1_lock);
4561 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4562 		spin_lock_irqsave(&s1_domain->lock, flags);
4563 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4564 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4565 		if (ret)
4566 			goto err_unwind;
4567 	}
4568 	spin_unlock(&domain->s1_lock);
4569 	return 0;
4570 
4571 err_unwind:
4572 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4573 		spin_lock_irqsave(&s1_domain->lock, flags);
4574 		device_set_dirty_tracking(&s1_domain->devices,
4575 					  domain->dirty_tracking);
4576 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4577 	}
4578 	spin_unlock(&domain->s1_lock);
4579 	return ret;
4580 }
4581 
4582 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4583 					  bool enable)
4584 {
4585 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4586 	int ret;
4587 
4588 	spin_lock(&dmar_domain->lock);
4589 	if (dmar_domain->dirty_tracking == enable)
4590 		goto out_unlock;
4591 
4592 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4593 	if (ret)
4594 		goto err_unwind;
4595 
4596 	if (dmar_domain->nested_parent) {
4597 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4598 		if (ret)
4599 			goto err_unwind;
4600 	}
4601 
4602 	dmar_domain->dirty_tracking = enable;
4603 out_unlock:
4604 	spin_unlock(&dmar_domain->lock);
4605 
4606 	return 0;
4607 
4608 err_unwind:
4609 	device_set_dirty_tracking(&dmar_domain->devices,
4610 				  dmar_domain->dirty_tracking);
4611 	spin_unlock(&dmar_domain->lock);
4612 	return ret;
4613 }
4614 
4615 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4616 					    unsigned long iova, size_t size,
4617 					    unsigned long flags,
4618 					    struct iommu_dirty_bitmap *dirty)
4619 {
4620 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4621 	unsigned long end = iova + size - 1;
4622 	unsigned long pgsize;
4623 
4624 	/*
4625 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4626 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4627 	 * have occurred when we stopped dirty tracking. This ensures that we
4628 	 * never inherit dirtied bits from a previous cycle.
4629 	 */
4630 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4631 		return -EINVAL;
4632 
4633 	do {
4634 		struct dma_pte *pte;
4635 		int lvl = 0;
4636 
4637 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4638 				     GFP_ATOMIC);
4639 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4640 		if (!pte || !dma_pte_present(pte)) {
4641 			iova += pgsize;
4642 			continue;
4643 		}
4644 
4645 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4646 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4647 		iova += pgsize;
4648 	} while (iova < end);
4649 
4650 	return 0;
4651 }
4652 
4653 static const struct iommu_dirty_ops intel_dirty_ops = {
4654 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4655 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4656 };
4657 
4658 const struct iommu_ops intel_iommu_ops = {
4659 	.blocked_domain		= &blocking_domain,
4660 	.release_domain		= &blocking_domain,
4661 	.capable		= intel_iommu_capable,
4662 	.hw_info		= intel_iommu_hw_info,
4663 	.domain_alloc		= intel_iommu_domain_alloc,
4664 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4665 	.domain_alloc_sva	= intel_svm_domain_alloc,
4666 	.probe_device		= intel_iommu_probe_device,
4667 	.release_device		= intel_iommu_release_device,
4668 	.get_resv_regions	= intel_iommu_get_resv_regions,
4669 	.device_group		= intel_iommu_device_group,
4670 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4671 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4672 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4673 	.def_domain_type	= device_def_domain_type,
4674 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4675 	.pgsize_bitmap		= SZ_4K,
4676 #ifdef CONFIG_INTEL_IOMMU_SVM
4677 	.page_response		= intel_svm_page_response,
4678 #endif
4679 	.default_domain_ops = &(const struct iommu_domain_ops) {
4680 		.attach_dev		= intel_iommu_attach_device,
4681 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4682 		.map_pages		= intel_iommu_map_pages,
4683 		.unmap_pages		= intel_iommu_unmap_pages,
4684 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4685 		.flush_iotlb_all        = intel_flush_iotlb_all,
4686 		.iotlb_sync		= intel_iommu_tlb_sync,
4687 		.iova_to_phys		= intel_iommu_iova_to_phys,
4688 		.free			= intel_iommu_domain_free,
4689 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4690 	}
4691 };
4692 
4693 static void quirk_iommu_igfx(struct pci_dev *dev)
4694 {
4695 	if (risky_device(dev))
4696 		return;
4697 
4698 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4699 	disable_igfx_iommu = 1;
4700 }
4701 
4702 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4707 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4710 
4711 /* Broadwell igfx malfunctions with dmar */
4712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4715 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4716 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4717 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4725 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4730 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4733 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4734 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4735 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4736 
4737 static void quirk_iommu_rwbf(struct pci_dev *dev)
4738 {
4739 	if (risky_device(dev))
4740 		return;
4741 
4742 	/*
4743 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4744 	 * but needs it. Same seems to hold for the desktop versions.
4745 	 */
4746 	pci_info(dev, "Forcing write-buffer flush capability\n");
4747 	rwbf_quirk = 1;
4748 }
4749 
4750 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4751 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4753 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4754 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4755 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4756 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4757 
4758 #define GGC 0x52
4759 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4760 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4761 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4762 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4763 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4764 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4765 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4766 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4767 
4768 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4769 {
4770 	unsigned short ggc;
4771 
4772 	if (risky_device(dev))
4773 		return;
4774 
4775 	if (pci_read_config_word(dev, GGC, &ggc))
4776 		return;
4777 
4778 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4779 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4780 		disable_igfx_iommu = 1;
4781 	} else if (!disable_igfx_iommu) {
4782 		/* we have to ensure the gfx device is idle before we flush */
4783 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4784 		iommu_set_dma_strict();
4785 	}
4786 }
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4791 
4792 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4793 {
4794 	unsigned short ver;
4795 
4796 	if (!IS_GFX_DEVICE(dev))
4797 		return;
4798 
4799 	ver = (dev->device >> 8) & 0xff;
4800 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4801 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4802 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4803 		return;
4804 
4805 	if (risky_device(dev))
4806 		return;
4807 
4808 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4809 	iommu_skip_te_disable = 1;
4810 }
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4812 
4813 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4814    ISOCH DMAR unit for the Azalia sound device, but not give it any
4815    TLB entries, which causes it to deadlock. Check for that.  We do
4816    this in a function called from init_dmars(), instead of in a PCI
4817    quirk, because we don't want to print the obnoxious "BIOS broken"
4818    message if VT-d is actually disabled.
4819 */
4820 static void __init check_tylersburg_isoch(void)
4821 {
4822 	struct pci_dev *pdev;
4823 	uint32_t vtisochctrl;
4824 
4825 	/* If there's no Azalia in the system anyway, forget it. */
4826 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4827 	if (!pdev)
4828 		return;
4829 
4830 	if (risky_device(pdev)) {
4831 		pci_dev_put(pdev);
4832 		return;
4833 	}
4834 
4835 	pci_dev_put(pdev);
4836 
4837 	/* System Management Registers. Might be hidden, in which case
4838 	   we can't do the sanity check. But that's OK, because the
4839 	   known-broken BIOSes _don't_ actually hide it, so far. */
4840 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4841 	if (!pdev)
4842 		return;
4843 
4844 	if (risky_device(pdev)) {
4845 		pci_dev_put(pdev);
4846 		return;
4847 	}
4848 
4849 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4850 		pci_dev_put(pdev);
4851 		return;
4852 	}
4853 
4854 	pci_dev_put(pdev);
4855 
4856 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4857 	if (vtisochctrl & 1)
4858 		return;
4859 
4860 	/* Drop all bits other than the number of TLB entries */
4861 	vtisochctrl &= 0x1c;
4862 
4863 	/* If we have the recommended number of TLB entries (16), fine. */
4864 	if (vtisochctrl == 0x10)
4865 		return;
4866 
4867 	/* Zero TLB entries? You get to ride the short bus to school. */
4868 	if (!vtisochctrl) {
4869 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4870 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4871 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4872 		     dmi_get_system_info(DMI_BIOS_VERSION),
4873 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4874 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4875 		return;
4876 	}
4877 
4878 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4879 	       vtisochctrl);
4880 }
4881 
4882 /*
4883  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4884  * invalidation completion before posted writes initiated with translated address
4885  * that utilized translations matching the invalidation address range, violating
4886  * the invalidation completion ordering.
4887  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4888  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4889  * under the control of the trusted/privileged host device driver must use this
4890  * quirk.
4891  * Device TLBs are invalidated under the following six conditions:
4892  * 1. Device driver does DMA API unmap IOVA
4893  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4894  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4895  *    exit_mmap() due to crash
4896  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4897  *    VM has to free pages that were unmapped
4898  * 5. Userspace driver unmaps a DMA buffer
4899  * 6. Cache invalidation in vSVA usage (upcoming)
4900  *
4901  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4902  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4903  * invalidate TLB the same way as normal user unmap which will use this quirk.
4904  * The dTLB invalidation after PASID cache flush does not need this quirk.
4905  *
4906  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4907  */
4908 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4909 			       unsigned long address, unsigned long mask,
4910 			       u32 pasid, u16 qdep)
4911 {
4912 	u16 sid;
4913 
4914 	if (likely(!info->dtlb_extra_inval))
4915 		return;
4916 
4917 	sid = PCI_DEVID(info->bus, info->devfn);
4918 	if (pasid == IOMMU_NO_PASID) {
4919 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4920 				   qdep, address, mask);
4921 	} else {
4922 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4923 					 pasid, qdep, address, mask);
4924 	}
4925 }
4926 
/* Status code of an enhanced command response: bits 7:1 of @res. */
#define ecmd_get_status_code(res)	(((res) >> 1) & 0x7f)
4928 
4929 /*
4930  * Function to submit a command to the enhanced command interface. The
4931  * valid enhanced command descriptions are defined in Table 47 of the
4932  * VT-d spec. The VT-d hardware implementation may support some but not
4933  * all commands, which can be determined by checking the Enhanced
4934  * Command Capability Register.
4935  *
4936  * Return values:
4937  *  - 0: Command successful without any error;
4938  *  - Negative: software error value;
4939  *  - Nonzero positive: failure status code defined in Table 48.
4940  */
4941 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4942 {
4943 	unsigned long flags;
4944 	u64 res;
4945 	int ret;
4946 
4947 	if (!cap_ecmds(iommu->cap))
4948 		return -ENODEV;
4949 
4950 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4951 
4952 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4953 	if (res & DMA_ECMD_ECRSP_IP) {
4954 		ret = -EBUSY;
4955 		goto err;
4956 	}
4957 
4958 	/*
4959 	 * Unconditionally write the operand B, because
4960 	 * - There is no side effect if an ecmd doesn't require an
4961 	 *   operand B, but we set the register to some value.
4962 	 * - It's not invoked in any critical path. The extra MMIO
4963 	 *   write doesn't bring any performance concerns.
4964 	 */
4965 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4966 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4967 
4968 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4969 		      !(res & DMA_ECMD_ECRSP_IP), res);
4970 
4971 	if (res & DMA_ECMD_ECRSP_IP) {
4972 		ret = -ETIMEDOUT;
4973 		goto err;
4974 	}
4975 
4976 	ret = ecmd_get_status_code(res);
4977 err:
4978 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4979 
4980 	return ret;
4981 }
4982