xref: /linux/drivers/iommu/intel/iommu.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/*
 * Message prefix for this file: pr_fmt() prepends "DMAR: " to the format
 * string, and dev_fmt() reuses the same prefix.
 */
#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33 
/* The root table and each context table are one VT-d page in size. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI device classification helpers. @pdev is any expression yielding a
 * pointer to a structure with class/vendor/device members; the argument is
 * parenthesized so that expressions such as "p + 1" expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
/* Physical address window named after the IOAPIC range. */
#define IOAPIC_RANGE_START	0xfee00000
#define IOAPIC_RANGE_END	0xfeefffff
/* Lowest IOVA handed out (byte address). */
#define IOVA_START_ADDR		0x1000

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Highest page frame number / byte address representable in @gaw bits. */
#define __DOMAIN_MAX_PFN(gaw)  (((uint64_t)1 << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) (((uint64_t)1 << (gaw)) - 1)

/*
 * DOMAIN_MAX_PFN is clamped so that it always fits in an unsigned long,
 * and DOMAIN_MAX_ADDR is derived from __DOMAIN_MAX_PFN shifted back up to
 * a byte address. PFNs can therefore be carried in 'unsigned long'.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)

/* First page frame number of the IO virtual address space. */
#define IOVA_START_PFN		1

/* Convert a byte address into a page frame number (PAGE_SHIFT granule). */
#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
61 
62 static void __init check_tylersburg_isoch(void);
63 static int rwbf_quirk;
64 
65 /*
66  * set to 1 to panic kernel if can't successfully enable VT-d
67  * (used when kernel is launched w/ TXT)
68  */
69 static int force_on = 0;
70 static int intel_iommu_tboot_noforce;
71 static int no_platform_optin;
72 
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74 
75 /*
76  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
77  * if marked present.
78  */
79 static phys_addr_t root_entry_lctp(struct root_entry *re)
80 {
81 	if (!(re->lo & 1))
82 		return 0;
83 
84 	return re->lo & VTD_PAGE_MASK;
85 }
86 
87 /*
88  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
89  * if marked present.
90  */
91 static phys_addr_t root_entry_uctp(struct root_entry *re)
92 {
93 	if (!(re->hi & 1))
94 		return 0;
95 
96 	return re->hi & VTD_PAGE_MASK;
97 }
98 
99 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
100 {
101 	struct device_domain_info *info =
102 		rb_entry(node, struct device_domain_info, node);
103 	const u16 *rid_lhs = key;
104 
105 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
106 		return -1;
107 
108 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
109 		return 1;
110 
111 	return 0;
112 }
113 
114 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
115 {
116 	struct device_domain_info *info =
117 		rb_entry(lhs, struct device_domain_info, node);
118 	u16 key = PCI_DEVID(info->bus, info->devfn);
119 
120 	return device_rid_cmp_key(&key, rhs);
121 }
122 
123 /*
124  * Looks up an IOMMU-probed device using its source ID.
125  *
126  * Returns the pointer to the device if there is a match. Otherwise,
127  * returns NULL.
128  *
129  * Note that this helper doesn't guarantee that the device won't be
130  * released by the iommu subsystem after being returned. The caller
131  * should use its own synchronization mechanism to avoid the device
132  * being released during its use if its possibly the case.
133  */
134 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
135 {
136 	struct device_domain_info *info = NULL;
137 	struct rb_node *node;
138 	unsigned long flags;
139 
140 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
141 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
142 	if (node)
143 		info = rb_entry(node, struct device_domain_info, node);
144 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
145 
146 	return info ? info->dev : NULL;
147 }
148 
149 static int device_rbtree_insert(struct intel_iommu *iommu,
150 				struct device_domain_info *info)
151 {
152 	struct rb_node *curr;
153 	unsigned long flags;
154 
155 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
156 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
157 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
158 	if (WARN_ON(curr))
159 		return -EEXIST;
160 
161 	return 0;
162 }
163 
164 static void device_rbtree_remove(struct device_domain_info *info)
165 {
166 	struct intel_iommu *iommu = info->iommu;
167 	unsigned long flags;
168 
169 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
170 	rb_erase(&info->node, &iommu->device_rbtree);
171 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
172 }
173 
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;
182 
/* One RMRR unit; instances are linked on the dmar_rmrr_units list. */
struct dmar_rmrr_unit {
	struct list_head list;		/* link in the list of RMRR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	u64	base_address;		/* reserved base address */
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
191 
/* One ATSR unit; instances are linked on the dmar_atsr_units list. */
struct dmar_atsr_unit {
	struct list_head list;		/* link in the list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
199 
/* One SATC unit; instances are linked on the dmar_satc_units list. */
struct dmar_satc_unit {
	struct list_head list;		/* link in the list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
208 
/* List heads for the ATSR, RMRR and SATC unit structures declared above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every struct dmar_rmrr_unit on dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
215 
216 static void intel_iommu_domain_free(struct iommu_domain *domain);
217 
218 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
219 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
220 
221 int intel_iommu_enabled = 0;
222 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
223 
224 static int dmar_map_gfx = 1;
225 static int intel_iommu_superpage = 1;
226 static int iommu_identity_mapping;
227 static int iommu_skip_te_disable;
228 
229 #define IDENTMAP_GFX		2
230 #define IDENTMAP_AZALIA		4
231 
232 const struct iommu_ops intel_iommu_ops;
233 static const struct iommu_dirty_ops intel_dirty_ops;
234 
235 static bool translation_pre_enabled(struct intel_iommu *iommu)
236 {
237 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
238 }
239 
240 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
241 {
242 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
243 }
244 
245 static void init_translation_status(struct intel_iommu *iommu)
246 {
247 	u32 gsts;
248 
249 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
250 	if (gsts & DMA_GSTS_TES)
251 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
252 }
253 
254 static int __init intel_iommu_setup(char *str)
255 {
256 	if (!str)
257 		return -EINVAL;
258 
259 	while (*str) {
260 		if (!strncmp(str, "on", 2)) {
261 			dmar_disabled = 0;
262 			pr_info("IOMMU enabled\n");
263 		} else if (!strncmp(str, "off", 3)) {
264 			dmar_disabled = 1;
265 			no_platform_optin = 1;
266 			pr_info("IOMMU disabled\n");
267 		} else if (!strncmp(str, "igfx_off", 8)) {
268 			dmar_map_gfx = 0;
269 			pr_info("Disable GFX device mapping\n");
270 		} else if (!strncmp(str, "forcedac", 8)) {
271 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
272 			iommu_dma_forcedac = true;
273 		} else if (!strncmp(str, "strict", 6)) {
274 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
275 			iommu_set_dma_strict();
276 		} else if (!strncmp(str, "sp_off", 6)) {
277 			pr_info("Disable supported super page\n");
278 			intel_iommu_superpage = 0;
279 		} else if (!strncmp(str, "sm_on", 5)) {
280 			pr_info("Enable scalable mode if hardware supports\n");
281 			intel_iommu_sm = 1;
282 		} else if (!strncmp(str, "sm_off", 6)) {
283 			pr_info("Scalable mode is disallowed\n");
284 			intel_iommu_sm = 0;
285 		} else if (!strncmp(str, "tboot_noforce", 13)) {
286 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
287 			intel_iommu_tboot_noforce = 1;
288 		} else {
289 			pr_notice("Unknown option - '%s'\n", str);
290 		}
291 
292 		str += strcspn(str, ",");
293 		while (*str == ',')
294 			str++;
295 	}
296 
297 	return 1;
298 }
299 __setup("intel_iommu=", intel_iommu_setup);
300 
301 void *alloc_pgtable_page(int node, gfp_t gfp)
302 {
303 	struct page *page;
304 	void *vaddr = NULL;
305 
306 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
307 	if (page)
308 		vaddr = page_address(page);
309 	return vaddr;
310 }
311 
/* Release the single page at virtual address @vaddr via free_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
316 
317 static int domain_type_is_si(struct dmar_domain *domain)
318 {
319 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
320 }
321 
322 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
323 {
324 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
325 
326 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
327 }
328 
329 /*
330  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
331  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
332  * the returned SAGAW.
333  */
334 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
335 {
336 	unsigned long fl_sagaw, sl_sagaw;
337 
338 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
339 	sl_sagaw = cap_sagaw(iommu->cap);
340 
341 	/* Second level only. */
342 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
343 		return sl_sagaw;
344 
345 	/* First level only. */
346 	if (!ecap_slts(iommu->ecap))
347 		return fl_sagaw;
348 
349 	return fl_sagaw & sl_sagaw;
350 }
351 
/*
 * Pick the largest supported agaw not exceeding the one corresponding to
 * @max_gaw. Returns -1 when no bit at or below that position is set in
 * the SAGAW bitmap of @iommu.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long supported = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &supported))
		agaw--;

	return agaw;
}
365 
366 /*
367  * Calculate max SAGAW for each iommu.
368  */
369 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
370 {
371 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
372 }
373 
374 /*
375  * calculate agaw for each iommu.
376  * "SAGAW" may be different across iommus, use a default agaw, and
377  * get a supported less agaw for iommus that don't support the default agaw.
378  */
379 int iommu_calculate_agaw(struct intel_iommu *iommu)
380 {
381 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
382 }
383 
384 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
385 {
386 	return sm_supported(iommu) ?
387 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
388 }
389 
390 static void domain_update_iommu_coherency(struct dmar_domain *domain)
391 {
392 	struct iommu_domain_info *info;
393 	struct dmar_drhd_unit *drhd;
394 	struct intel_iommu *iommu;
395 	bool found = false;
396 	unsigned long i;
397 
398 	domain->iommu_coherency = true;
399 	xa_for_each(&domain->iommu_array, i, info) {
400 		found = true;
401 		if (!iommu_paging_structure_coherency(info->iommu)) {
402 			domain->iommu_coherency = false;
403 			break;
404 		}
405 	}
406 	if (found)
407 		return;
408 
409 	/* No hardware attached; use lowest common denominator */
410 	rcu_read_lock();
411 	for_each_active_iommu(iommu, drhd) {
412 		if (!iommu_paging_structure_coherency(iommu)) {
413 			domain->iommu_coherency = false;
414 			break;
415 		}
416 	}
417 	rcu_read_unlock();
418 }
419 
420 static int domain_update_iommu_superpage(struct dmar_domain *domain,
421 					 struct intel_iommu *skip)
422 {
423 	struct dmar_drhd_unit *drhd;
424 	struct intel_iommu *iommu;
425 	int mask = 0x3;
426 
427 	if (!intel_iommu_superpage)
428 		return 0;
429 
430 	/* set iommu_superpage to the smallest common denominator */
431 	rcu_read_lock();
432 	for_each_active_iommu(iommu, drhd) {
433 		if (iommu != skip) {
434 			if (domain && domain->use_first_level) {
435 				if (!cap_fl1gp_support(iommu->cap))
436 					mask = 0x1;
437 			} else {
438 				mask &= cap_super_page_val(iommu->cap);
439 			}
440 
441 			if (!mask)
442 				break;
443 		}
444 	}
445 	rcu_read_unlock();
446 
447 	return fls(mask);
448 }
449 
450 static int domain_update_device_node(struct dmar_domain *domain)
451 {
452 	struct device_domain_info *info;
453 	int nid = NUMA_NO_NODE;
454 	unsigned long flags;
455 
456 	spin_lock_irqsave(&domain->lock, flags);
457 	list_for_each_entry(info, &domain->devices, link) {
458 		/*
459 		 * There could possibly be multiple device numa nodes as devices
460 		 * within the same domain may sit behind different IOMMUs. There
461 		 * isn't perfect answer in such situation, so we select first
462 		 * come first served policy.
463 		 */
464 		nid = dev_to_node(info->dev);
465 		if (nid != NUMA_NO_NODE)
466 			break;
467 	}
468 	spin_unlock_irqrestore(&domain->lock, flags);
469 
470 	return nid;
471 }
472 
473 /* Return the super pagesize bitmap if supported. */
474 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
475 {
476 	unsigned long bitmap = 0;
477 
478 	/*
479 	 * 1-level super page supports page size of 2MiB, 2-level super page
480 	 * supports page size of both 2MiB and 1GiB.
481 	 */
482 	if (domain->iommu_superpage == 1)
483 		bitmap |= SZ_2M;
484 	else if (domain->iommu_superpage == 2)
485 		bitmap |= SZ_2M | SZ_1G;
486 
487 	return bitmap;
488 }
489 
490 /* Some capabilities may be different across iommus */
491 void domain_update_iommu_cap(struct dmar_domain *domain)
492 {
493 	domain_update_iommu_coherency(domain);
494 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
495 
496 	/*
497 	 * If RHSA is missing, we should default to the device numa domain
498 	 * as fall back.
499 	 */
500 	if (domain->nid == NUMA_NO_NODE)
501 		domain->nid = domain_update_device_node(domain);
502 
503 	/*
504 	 * First-level translation restricts the input-address to a
505 	 * canonical address (i.e., address bits 63:N have the same
506 	 * value as address bit [N-1], where N is 48-bits with 4-level
507 	 * paging and 57-bits with 5-level paging). Hence, skip bit
508 	 * [N-1].
509 	 */
510 	if (domain->use_first_level)
511 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
512 	else
513 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
514 
515 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
516 	domain_update_iotlb(domain);
517 }
518 
519 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
520 					 u8 devfn, int alloc)
521 {
522 	struct root_entry *root = &iommu->root_entry[bus];
523 	struct context_entry *context;
524 	u64 *entry;
525 
526 	/*
527 	 * Except that the caller requested to allocate a new entry,
528 	 * returning a copied context entry makes no sense.
529 	 */
530 	if (!alloc && context_copied(iommu, bus, devfn))
531 		return NULL;
532 
533 	entry = &root->lo;
534 	if (sm_supported(iommu)) {
535 		if (devfn >= 0x80) {
536 			devfn -= 0x80;
537 			entry = &root->hi;
538 		}
539 		devfn *= 2;
540 	}
541 	if (*entry & 1)
542 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
543 	else {
544 		unsigned long phy_addr;
545 		if (!alloc)
546 			return NULL;
547 
548 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
549 		if (!context)
550 			return NULL;
551 
552 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
553 		phy_addr = virt_to_phys((void *)context);
554 		*entry = phy_addr | 1;
555 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
556 	}
557 	return &context[devfn];
558 }
559 
560 /**
561  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
562  *				 sub-hierarchy of a candidate PCI-PCI bridge
563  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
564  * @bridge: the candidate PCI-PCI bridge
565  *
566  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
567  */
568 static bool
569 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
570 {
571 	struct pci_dev *pdev, *pbridge;
572 
573 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
574 		return false;
575 
576 	pdev = to_pci_dev(dev);
577 	pbridge = to_pci_dev(bridge);
578 
579 	if (pbridge->subordinate &&
580 	    pbridge->subordinate->number <= pdev->bus->number &&
581 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
582 		return true;
583 
584 	return false;
585 }
586 
587 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
588 {
589 	struct dmar_drhd_unit *drhd;
590 	u32 vtbar;
591 	int rc;
592 
593 	/* We know that this device on this chipset has its own IOMMU.
594 	 * If we find it under a different IOMMU, then the BIOS is lying
595 	 * to us. Hope that the IOMMU for this device is actually
596 	 * disabled, and it needs no translation...
597 	 */
598 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
599 	if (rc) {
600 		/* "can't" happen */
601 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
602 		return false;
603 	}
604 	vtbar &= 0xffff0000;
605 
606 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
607 	drhd = dmar_find_matched_drhd_unit(pdev);
608 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
609 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
610 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
611 		return true;
612 	}
613 
614 	return false;
615 }
616 
617 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
618 {
619 	if (!iommu || iommu->drhd->ignored)
620 		return true;
621 
622 	if (dev_is_pci(dev)) {
623 		struct pci_dev *pdev = to_pci_dev(dev);
624 
625 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
626 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
627 		    quirk_ioat_snb_local_iommu(pdev))
628 			return true;
629 	}
630 
631 	return false;
632 }
633 
634 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
635 {
636 	struct dmar_drhd_unit *drhd = NULL;
637 	struct pci_dev *pdev = NULL;
638 	struct intel_iommu *iommu;
639 	struct device *tmp;
640 	u16 segment = 0;
641 	int i;
642 
643 	if (!dev)
644 		return NULL;
645 
646 	if (dev_is_pci(dev)) {
647 		struct pci_dev *pf_pdev;
648 
649 		pdev = pci_real_dma_dev(to_pci_dev(dev));
650 
651 		/* VFs aren't listed in scope tables; we need to look up
652 		 * the PF instead to find the IOMMU. */
653 		pf_pdev = pci_physfn(pdev);
654 		dev = &pf_pdev->dev;
655 		segment = pci_domain_nr(pdev->bus);
656 	} else if (has_acpi_companion(dev))
657 		dev = &ACPI_COMPANION(dev)->dev;
658 
659 	rcu_read_lock();
660 	for_each_iommu(iommu, drhd) {
661 		if (pdev && segment != drhd->segment)
662 			continue;
663 
664 		for_each_active_dev_scope(drhd->devices,
665 					  drhd->devices_cnt, i, tmp) {
666 			if (tmp == dev) {
667 				/* For a VF use its original BDF# not that of the PF
668 				 * which we used for the IOMMU lookup. Strictly speaking
669 				 * we could do this for all PCI devices; we only need to
670 				 * get the BDF# from the scope table for ACPI matches. */
671 				if (pdev && pdev->is_virtfn)
672 					goto got_pdev;
673 
674 				if (bus && devfn) {
675 					*bus = drhd->devices[i].bus;
676 					*devfn = drhd->devices[i].devfn;
677 				}
678 				goto out;
679 			}
680 
681 			if (is_downstream_to_pci_bridge(dev, tmp))
682 				goto got_pdev;
683 		}
684 
685 		if (pdev && drhd->include_all) {
686 got_pdev:
687 			if (bus && devfn) {
688 				*bus = pdev->bus->number;
689 				*devfn = pdev->devfn;
690 			}
691 			goto out;
692 		}
693 	}
694 	iommu = NULL;
695 out:
696 	if (iommu_is_dummy(iommu, dev))
697 		iommu = NULL;
698 
699 	rcu_read_unlock();
700 
701 	return iommu;
702 }
703 
704 static void domain_flush_cache(struct dmar_domain *domain,
705 			       void *addr, int size)
706 {
707 	if (!domain->iommu_coherency)
708 		clflush_cache_range(addr, size);
709 }
710 
711 static void free_context_table(struct intel_iommu *iommu)
712 {
713 	struct context_entry *context;
714 	int i;
715 
716 	if (!iommu->root_entry)
717 		return;
718 
719 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
720 		context = iommu_context_addr(iommu, i, 0, 0);
721 		if (context)
722 			free_pgtable_page(context);
723 
724 		if (!sm_supported(iommu))
725 			continue;
726 
727 		context = iommu_context_addr(iommu, i, 0x80, 0);
728 		if (context)
729 			free_pgtable_page(context);
730 	}
731 
732 	free_pgtable_page(iommu->root_entry);
733 	iommu->root_entry = NULL;
734 }
735 
736 #ifdef CONFIG_DMAR_DEBUG
737 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
738 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
739 {
740 	struct dma_pte *pte;
741 	int offset;
742 
743 	while (1) {
744 		offset = pfn_level_offset(pfn, level);
745 		pte = &parent[offset];
746 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
747 			pr_info("PTE not present at level %d\n", level);
748 			break;
749 		}
750 
751 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
752 
753 		if (level == 1)
754 			break;
755 
756 		parent = phys_to_virt(dma_pte_addr(pte));
757 		level--;
758 	}
759 }
760 
761 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
762 			  unsigned long long addr, u32 pasid)
763 {
764 	struct pasid_dir_entry *dir, *pde;
765 	struct pasid_entry *entries, *pte;
766 	struct context_entry *ctx_entry;
767 	struct root_entry *rt_entry;
768 	int i, dir_index, index, level;
769 	u8 devfn = source_id & 0xff;
770 	u8 bus = source_id >> 8;
771 	struct dma_pte *pgtable;
772 
773 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
774 
775 	/* root entry dump */
776 	rt_entry = &iommu->root_entry[bus];
777 	if (!rt_entry) {
778 		pr_info("root table entry is not present\n");
779 		return;
780 	}
781 
782 	if (sm_supported(iommu))
783 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
784 			rt_entry->hi, rt_entry->lo);
785 	else
786 		pr_info("root entry: 0x%016llx", rt_entry->lo);
787 
788 	/* context entry dump */
789 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
790 	if (!ctx_entry) {
791 		pr_info("context table entry is not present\n");
792 		return;
793 	}
794 
795 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
796 		ctx_entry->hi, ctx_entry->lo);
797 
798 	/* legacy mode does not require PASID entries */
799 	if (!sm_supported(iommu)) {
800 		level = agaw_to_level(ctx_entry->hi & 7);
801 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
802 		goto pgtable_walk;
803 	}
804 
805 	/* get the pointer to pasid directory entry */
806 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
807 	if (!dir) {
808 		pr_info("pasid directory entry is not present\n");
809 		return;
810 	}
811 	/* For request-without-pasid, get the pasid from context entry */
812 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
813 		pasid = IOMMU_NO_PASID;
814 
815 	dir_index = pasid >> PASID_PDE_SHIFT;
816 	pde = &dir[dir_index];
817 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
818 
819 	/* get the pointer to the pasid table entry */
820 	entries = get_pasid_table_from_pde(pde);
821 	if (!entries) {
822 		pr_info("pasid table entry is not present\n");
823 		return;
824 	}
825 	index = pasid & PASID_PTE_MASK;
826 	pte = &entries[index];
827 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
828 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
829 
830 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
831 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
832 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
833 	} else {
834 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
835 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
836 	}
837 
838 pgtable_walk:
839 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
840 }
841 #endif
842 
843 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
844 				      unsigned long pfn, int *target_level,
845 				      gfp_t gfp)
846 {
847 	struct dma_pte *parent, *pte;
848 	int level = agaw_to_level(domain->agaw);
849 	int offset;
850 
851 	if (!domain_pfn_supported(domain, pfn))
852 		/* Address beyond IOMMU's addressing capabilities. */
853 		return NULL;
854 
855 	parent = domain->pgd;
856 
857 	while (1) {
858 		void *tmp_page;
859 
860 		offset = pfn_level_offset(pfn, level);
861 		pte = &parent[offset];
862 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
863 			break;
864 		if (level == *target_level)
865 			break;
866 
867 		if (!dma_pte_present(pte)) {
868 			uint64_t pteval;
869 
870 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
871 
872 			if (!tmp_page)
873 				return NULL;
874 
875 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
876 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
877 			if (domain->use_first_level)
878 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
879 
880 			if (cmpxchg64(&pte->val, 0ULL, pteval))
881 				/* Someone else set it while we were thinking; use theirs. */
882 				free_pgtable_page(tmp_page);
883 			else
884 				domain_flush_cache(domain, pte, sizeof(*pte));
885 		}
886 		if (level == 1)
887 			break;
888 
889 		parent = phys_to_virt(dma_pte_addr(pte));
890 		level--;
891 	}
892 
893 	if (!*target_level)
894 		*target_level = level;
895 
896 	return pte;
897 }
898 
899 /* return address's pte at specific level */
900 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
901 					 unsigned long pfn,
902 					 int level, int *large_page)
903 {
904 	struct dma_pte *parent, *pte;
905 	int total = agaw_to_level(domain->agaw);
906 	int offset;
907 
908 	parent = domain->pgd;
909 	while (level <= total) {
910 		offset = pfn_level_offset(pfn, total);
911 		pte = &parent[offset];
912 		if (level == total)
913 			return pte;
914 
915 		if (!dma_pte_present(pte)) {
916 			*large_page = total;
917 			break;
918 		}
919 
920 		if (dma_pte_superpage(pte)) {
921 			*large_page = total;
922 			return pte;
923 		}
924 
925 		parent = phys_to_virt(dma_pte_addr(pte));
926 		total--;
927 	}
928 	return NULL;
929 }
930 
931 /* clear last level pte, a tlb flush should be followed */
932 static void dma_pte_clear_range(struct dmar_domain *domain,
933 				unsigned long start_pfn,
934 				unsigned long last_pfn)
935 {
936 	unsigned int large_page;
937 	struct dma_pte *first_pte, *pte;
938 
939 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
940 	    WARN_ON(start_pfn > last_pfn))
941 		return;
942 
943 	/* we don't need lock here; nobody else touches the iova range */
944 	do {
945 		large_page = 1;
946 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
947 		if (!pte) {
948 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
949 			continue;
950 		}
951 		do {
952 			dma_clear_pte(pte);
953 			start_pfn += lvl_to_nr_pages(large_page);
954 			pte++;
955 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
956 
957 		domain_flush_cache(domain, first_pte,
958 				   (void *)pte - (void *)first_pte);
959 
960 	} while (start_pfn && start_pfn <= last_pfn);
961 }
962 
963 static void dma_pte_free_level(struct dmar_domain *domain, int level,
964 			       int retain_level, struct dma_pte *pte,
965 			       unsigned long pfn, unsigned long start_pfn,
966 			       unsigned long last_pfn)
967 {
968 	pfn = max(start_pfn, pfn);
969 	pte = &pte[pfn_level_offset(pfn, level)];
970 
971 	do {
972 		unsigned long level_pfn;
973 		struct dma_pte *level_pte;
974 
975 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
976 			goto next;
977 
978 		level_pfn = pfn & level_mask(level);
979 		level_pte = phys_to_virt(dma_pte_addr(pte));
980 
981 		if (level > 2) {
982 			dma_pte_free_level(domain, level - 1, retain_level,
983 					   level_pte, level_pfn, start_pfn,
984 					   last_pfn);
985 		}
986 
987 		/*
988 		 * Free the page table if we're below the level we want to
989 		 * retain and the range covers the entire table.
990 		 */
991 		if (level < retain_level && !(start_pfn > level_pfn ||
992 		      last_pfn < level_pfn + level_size(level) - 1)) {
993 			dma_clear_pte(pte);
994 			domain_flush_cache(domain, pte, sizeof(*pte));
995 			free_pgtable_page(level_pte);
996 		}
997 next:
998 		pfn += level_size(level);
999 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1000 }
1001 
1002 /*
1003  * clear last level (leaf) ptes and free page table pages below the
1004  * level we wish to keep intact.
1005  */
1006 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1007 				   unsigned long start_pfn,
1008 				   unsigned long last_pfn,
1009 				   int retain_level)
1010 {
1011 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1012 
1013 	/* We don't need lock here; nobody else touches the iova range */
1014 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1015 			   domain->pgd, 0, start_pfn, last_pfn);
1016 
1017 	/* free pgd */
1018 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019 		free_pgtable_page(domain->pgd);
1020 		domain->pgd = NULL;
1021 	}
1022 }
1023 
1024 /* When a page at a given level is being unlinked from its parent, we don't
1025    need to *modify* it at all. All we need to do is make a list of all the
1026    pages which can be freed just as soon as we've flushed the IOTLB and we
1027    know the hardware page-walk will no longer touch them.
1028    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1029    be freed. */
1030 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1031 				    int level, struct dma_pte *pte,
1032 				    struct list_head *freelist)
1033 {
1034 	struct page *pg;
1035 
1036 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037 	list_add_tail(&pg->lru, freelist);
1038 
1039 	if (level == 1)
1040 		return;
1041 
1042 	pte = page_address(pg);
1043 	do {
1044 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1045 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1046 		pte++;
1047 	} while (!first_pte_in_page(pte));
1048 }
1049 
/*
 * Clear the entries covering [start_pfn, last_pfn] in one page-table page
 * at @level, recursing into lower-level tables that are only partially
 * covered by the range.
 *
 * @pte points at the first entry of the table page being processed and
 * @pfn is the pfn handed down for that page (the caller in this file
 * passes 0 for the top level; recursion passes the covering entry's
 * level_pfn). Lower-level tables whose parent entry lies entirely inside
 * the range are not modified: they are queued on @freelist through
 * dma_pte_list_pagetables() and only the parent entry is cleared.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	/* Bounds of the run of entries cleared at this level, for one flush */
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	/* Begin at the entry that covers the first pfn of interest */
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		/* First pfn covered by the current entry at this level */
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	/*
	 * One cache flush spanning from the first cleared entry up to and
	 * including the last cleared entry of this table page.
	 */
	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
1093 
1094 /* We can't just free the pages because the IOMMU may still be walking
1095    the page tables, and may have cached the intermediate levels. The
1096    pages can only be freed after the IOTLB flush has been done. */
1097 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1098 			 unsigned long last_pfn, struct list_head *freelist)
1099 {
1100 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1101 	    WARN_ON(start_pfn > last_pfn))
1102 		return;
1103 
1104 	/* we don't need lock here; nobody else touches the iova range */
1105 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1106 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1107 
1108 	/* free pgd */
1109 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1110 		struct page *pgd_page = virt_to_page(domain->pgd);
1111 		list_add_tail(&pgd_page->lru, freelist);
1112 		domain->pgd = NULL;
1113 	}
1114 }
1115 
1116 /* iommu handling */
1117 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1118 {
1119 	struct root_entry *root;
1120 
1121 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1122 	if (!root) {
1123 		pr_err("Allocating root entry for %s failed\n",
1124 			iommu->name);
1125 		return -ENOMEM;
1126 	}
1127 
1128 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1129 	iommu->root_entry = root;
1130 
1131 	return 0;
1132 }
1133 
/*
 * Program the physical address of iommu->root_entry into the root table
 * address register and issue the Set Root Table Pointer command, waiting
 * for the status register to report completion.
 *
 * Unless cap_esrtps() is set, this is followed by a global context-cache
 * invalidation, a global PASID-cache invalidation (scalable mode only)
 * and a global IOTLB flush.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	/* Scalable mode is signalled through a flag bit in the address */
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	/* SRTP is OR-ed into the command transiently; iommu->gcmd is not updated */
	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/*
	 * Hardware invalidates all DMA remapping hardware translation
	 * caches as part of SRTP flow.
	 */
	if (cap_esrtps(iommu->cap))
		return;

	/* Otherwise invalidate everything explicitly: context, PASID, IOTLB */
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	if (sm_supported(iommu))
		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1167 
1168 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1169 {
1170 	u32 val;
1171 	unsigned long flag;
1172 
1173 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1174 		return;
1175 
1176 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1178 
1179 	/* Make sure hardware complete it */
1180 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1181 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1182 
1183 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1184 }
1185 
/*
 * Register-based context-cache invalidation.
 *
 * @type selects the granularity: global, domain-selective (uses @did) or
 * device-selective (uses @did, @source_id and @function_mask). The command
 * is written to DMAR_CCMD_REG and the function waits until hardware clears
 * DMA_CCMD_ICC. Nothing is returned; an unrecognised @type is logged with
 * pr_warn() and no command is issued.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	/* Build the command word for the requested granularity */
	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}
	/* ICC is set to start the command and polled below for completion */
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1221 
/*
 * Register-based IOTLB invalidation.
 *
 * @type selects a global, domain-selective (@did) or page-selective
 * (@did, @addr, @size_order) flush. The function waits until hardware
 * clears DMA_TLB_IVT, then inspects the granularity reported back by the
 * hardware. Nothing is returned; an unrecognised @type is logged with
 * pr_warn() and no command is issued.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	/* Register offset of the IOTLB registers, taken from the ecap value */
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
			iommu->name, type);
		return;
	}

	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	/* The address register is only written when val_iva is non-zero */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	/* val now holds the register value read back by the wait loop */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1272 
1273 static struct device_domain_info *
1274 domain_lookup_dev_info(struct dmar_domain *domain,
1275 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1276 {
1277 	struct device_domain_info *info;
1278 	unsigned long flags;
1279 
1280 	spin_lock_irqsave(&domain->lock, flags);
1281 	list_for_each_entry(info, &domain->devices, link) {
1282 		if (info->iommu == iommu && info->bus == bus &&
1283 		    info->devfn == devfn) {
1284 			spin_unlock_irqrestore(&domain->lock, flags);
1285 			return info;
1286 		}
1287 	}
1288 	spin_unlock_irqrestore(&domain->lock, flags);
1289 
1290 	return NULL;
1291 }
1292 
1293 void domain_update_iotlb(struct dmar_domain *domain)
1294 {
1295 	struct dev_pasid_info *dev_pasid;
1296 	struct device_domain_info *info;
1297 	bool has_iotlb_device = false;
1298 	unsigned long flags;
1299 
1300 	spin_lock_irqsave(&domain->lock, flags);
1301 	list_for_each_entry(info, &domain->devices, link) {
1302 		if (info->ats_enabled) {
1303 			has_iotlb_device = true;
1304 			break;
1305 		}
1306 	}
1307 
1308 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1309 		info = dev_iommu_priv_get(dev_pasid->dev);
1310 		if (info->ats_enabled) {
1311 			has_iotlb_device = true;
1312 			break;
1313 		}
1314 	}
1315 	domain->has_iotlb_device = has_iotlb_device;
1316 	spin_unlock_irqrestore(&domain->lock, flags);
1317 }
1318 
1319 /*
1320  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1321  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1322  * check because it applies only to the built-in QAT devices and it doesn't
1323  * grant additional privileges.
1324  */
1325 #define BUGGY_QAT_DEVID_MASK 0x4940
1326 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1327 {
1328 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1329 		return false;
1330 
1331 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1332 		return false;
1333 
1334 	return true;
1335 }
1336 
1337 static void iommu_enable_pci_caps(struct device_domain_info *info)
1338 {
1339 	struct pci_dev *pdev;
1340 
1341 	if (!dev_is_pci(info->dev))
1342 		return;
1343 
1344 	pdev = to_pci_dev(info->dev);
1345 
1346 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1347 	   the device if you enable PASID support after ATS support is
1348 	   undefined. So always enable PASID support on devices which
1349 	   have it, even if we can't yet know if we're ever going to
1350 	   use it. */
1351 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1352 		info->pasid_enabled = 1;
1353 
1354 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1355 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1356 		info->ats_enabled = 1;
1357 		domain_update_iotlb(info->domain);
1358 	}
1359 }
1360 
1361 static void iommu_disable_pci_caps(struct device_domain_info *info)
1362 {
1363 	struct pci_dev *pdev;
1364 
1365 	if (!dev_is_pci(info->dev))
1366 		return;
1367 
1368 	pdev = to_pci_dev(info->dev);
1369 
1370 	if (info->ats_enabled) {
1371 		pci_disable_ats(pdev);
1372 		info->ats_enabled = 0;
1373 		domain_update_iotlb(info->domain);
1374 	}
1375 
1376 	if (info->pasid_enabled) {
1377 		pci_disable_pasid(pdev);
1378 		info->pasid_enabled = 0;
1379 	}
1380 }
1381 
1382 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1383 				    u64 addr, unsigned int mask)
1384 {
1385 	u16 sid, qdep;
1386 
1387 	if (!info || !info->ats_enabled)
1388 		return;
1389 
1390 	sid = info->bus << 8 | info->devfn;
1391 	qdep = info->ats_qdep;
1392 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1393 			   qdep, addr, mask);
1394 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1395 }
1396 
1397 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1398 				  u64 addr, unsigned mask)
1399 {
1400 	struct dev_pasid_info *dev_pasid;
1401 	struct device_domain_info *info;
1402 	unsigned long flags;
1403 
1404 	if (!domain->has_iotlb_device)
1405 		return;
1406 
1407 	spin_lock_irqsave(&domain->lock, flags);
1408 	list_for_each_entry(info, &domain->devices, link)
1409 		__iommu_flush_dev_iotlb(info, addr, mask);
1410 
1411 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1412 		info = dev_iommu_priv_get(dev_pasid->dev);
1413 
1414 		if (!info->ats_enabled)
1415 			continue;
1416 
1417 		qi_flush_dev_iotlb_pasid(info->iommu,
1418 					 PCI_DEVID(info->bus, info->devfn),
1419 					 info->pfsid, dev_pasid->pasid,
1420 					 info->ats_qdep, addr,
1421 					 mask);
1422 	}
1423 	spin_unlock_irqrestore(&domain->lock, flags);
1424 }
1425 
1426 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1427 				     struct dmar_domain *domain, u64 addr,
1428 				     unsigned long npages, bool ih)
1429 {
1430 	u16 did = domain_id_iommu(domain, iommu);
1431 	struct dev_pasid_info *dev_pasid;
1432 	unsigned long flags;
1433 
1434 	spin_lock_irqsave(&domain->lock, flags);
1435 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1436 		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1437 
1438 	if (!list_empty(&domain->devices))
1439 		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1440 	spin_unlock_irqrestore(&domain->lock, flags);
1441 }
1442 
/*
 * Page-selective IOTLB invalidation of @pages pages starting at @pfn for
 * domain id @did on @iommu.
 *
 * The page count is rounded up to a power of two to derive the address
 * mask. Falls back to a domain-selective flush when the hardware has no
 * page-selective support or the mask exceeds cap_max_amask_val(). @ih is
 * OR-ed into the address handed to the flush callback.
 *
 * NOTE(review): @pages is not checked against 0 here; callers are
 * expected to pass a non-zero count.
 */
static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				    unsigned long pfn, unsigned int pages,
				    int ih)
{
	unsigned int aligned_pages = __roundup_pow_of_two(pages);
	unsigned long bitmask = aligned_pages - 1;
	unsigned int mask = ilog2(aligned_pages);
	u64 addr = (u64)pfn << VTD_PAGE_SHIFT;

	/*
	 * PSI masks the low order bits of the base address. If the
	 * address isn't aligned to the mask, then compute a mask value
	 * needed to ensure the target range is flushed.
	 */
	if (unlikely(bitmask & pfn)) {
		unsigned long end_pfn = pfn + pages - 1, shared_bits;

		/*
		 * Since end_pfn <= pfn + bitmask, the only way bits
		 * higher than bitmask can differ in pfn and end_pfn is
		 * by carrying. This means after masking out bitmask,
		 * high bits starting with the first set bit in
		 * shared_bits are all equal in both pfn and end_pfn.
		 */
		shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
		/* No shared high bit at all: BITS_PER_LONG forces the fallback below */
		mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
	}

	/*
	 * Fallback to domain selective flush if no PSI support or
	 * the size is too big.
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
					 DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
					 DMA_TLB_PSI_FLUSH);
}
1482 
1483 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484 				  struct dmar_domain *domain,
1485 				  unsigned long pfn, unsigned int pages,
1486 				  int ih, int map)
1487 {
1488 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1489 	unsigned int mask = ilog2(aligned_pages);
1490 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1491 	u16 did = domain_id_iommu(domain, iommu);
1492 
1493 	if (WARN_ON(!pages))
1494 		return;
1495 
1496 	if (ih)
1497 		ih = 1 << 6;
1498 
1499 	if (domain->use_first_level)
1500 		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1501 	else
1502 		__iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1503 
1504 	/*
1505 	 * In caching mode, changes of pages from non-present to present require
1506 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1507 	 */
1508 	if (!cap_caching_mode(iommu->cap) || !map)
1509 		iommu_flush_dev_iotlb(domain, addr, mask);
1510 }
1511 
1512 /* Notification for newly created mappings */
1513 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1514 				 unsigned long pfn, unsigned int pages)
1515 {
1516 	/*
1517 	 * It's a non-present to present mapping. Only flush if caching mode
1518 	 * and second level.
1519 	 */
1520 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1521 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1522 	else
1523 		iommu_flush_write_buffer(iommu);
1524 }
1525 
/*
 * Flush the relevant caches in nested translation if the domain
 * also serves as a parent
 *
 * For every stage-1 domain linked on @domain's s1_domains list, issue a
 * page-selective IOTLB flush of [@pfn, @pfn + @pages) on each IOMMU the
 * stage-1 domain is attached to, then flush the whole device TLB of each
 * of its devices. The list is walked under domain->s1_lock.
 *
 * NOTE(review): @pages is unsigned long here but is passed on to
 * __iommu_flush_iotlb_psi(), whose parameter is unsigned int, so larger
 * values (such as the -1 passed by intel_flush_iotlb_all()) are truncated.
 */
static void parent_domain_flush(struct dmar_domain *domain,
				unsigned long pfn,
				unsigned long pages, int ih)
{
	struct dmar_domain *s1_domain;

	spin_lock(&domain->s1_lock);
	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
		struct device_domain_info *device_info;
		struct iommu_domain_info *info;
		unsigned long flags;
		unsigned long i;

		/* IOTLB flush on each IOMMU, using the stage-1 domain's own id */
		xa_for_each(&s1_domain->iommu_array, i, info)
			__iommu_flush_iotlb_psi(info->iommu, info->did,
						pfn, pages, ih);

		/* No ATS-enabled device: nothing cached on the device side */
		if (!s1_domain->has_iotlb_device)
			continue;

		spin_lock_irqsave(&s1_domain->lock, flags);
		list_for_each_entry(device_info, &s1_domain->devices, link)
			/*
			 * Address translation cache in device side caches the
			 * result of nested translation. There is no easy way
			 * to identify the exact set of nested translations
			 * affected by a change in S2. So just flush the entire
			 * device cache.
			 */
			__iommu_flush_dev_iotlb(device_info, 0,
						MAX_AGAW_PFN_WIDTH);
		spin_unlock_irqrestore(&s1_domain->lock, flags);
	}
	spin_unlock(&domain->s1_lock);
}
1565 
1566 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1567 {
1568 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1569 	struct iommu_domain_info *info;
1570 	unsigned long idx;
1571 
1572 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1573 		struct intel_iommu *iommu = info->iommu;
1574 		u16 did = domain_id_iommu(dmar_domain, iommu);
1575 
1576 		if (dmar_domain->use_first_level)
1577 			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1578 		else
1579 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1580 						 DMA_TLB_DSI_FLUSH);
1581 
1582 		if (!cap_caching_mode(iommu->cap))
1583 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1584 	}
1585 
1586 	if (dmar_domain->nested_parent)
1587 		parent_domain_flush(dmar_domain, 0, -1, 0);
1588 }
1589 
1590 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1591 {
1592 	u32 pmen;
1593 	unsigned long flags;
1594 
1595 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1596 		return;
1597 
1598 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1599 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1600 	pmen &= ~DMA_PMEN_EPM;
1601 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1602 
1603 	/* wait for the protected region status bit to clear */
1604 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1605 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1606 
1607 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1608 }
1609 
1610 static void iommu_enable_translation(struct intel_iommu *iommu)
1611 {
1612 	u32 sts;
1613 	unsigned long flags;
1614 
1615 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1616 	iommu->gcmd |= DMA_GCMD_TE;
1617 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1618 
1619 	/* Make sure hardware complete it */
1620 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1621 		      readl, (sts & DMA_GSTS_TES), sts);
1622 
1623 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1624 }
1625 
1626 static void iommu_disable_translation(struct intel_iommu *iommu)
1627 {
1628 	u32 sts;
1629 	unsigned long flag;
1630 
1631 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1632 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1633 		return;
1634 
1635 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636 	iommu->gcmd &= ~DMA_GCMD_TE;
1637 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1638 
1639 	/* Make sure hardware complete it */
1640 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1642 
1643 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1644 }
1645 
1646 static int iommu_init_domains(struct intel_iommu *iommu)
1647 {
1648 	u32 ndomains;
1649 
1650 	ndomains = cap_ndoms(iommu->cap);
1651 	pr_debug("%s: Number of Domains supported <%d>\n",
1652 		 iommu->name, ndomains);
1653 
1654 	spin_lock_init(&iommu->lock);
1655 
1656 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1657 	if (!iommu->domain_ids)
1658 		return -ENOMEM;
1659 
1660 	/*
1661 	 * If Caching mode is set, then invalid translations are tagged
1662 	 * with domain-id 0, hence we need to pre-allocate it. We also
1663 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1664 	 * make sure it is not used for a real domain.
1665 	 */
1666 	set_bit(0, iommu->domain_ids);
1667 
1668 	/*
1669 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1670 	 * entry for first-level or pass-through translation modes should
1671 	 * be programmed with a domain id different from those used for
1672 	 * second-level or nested translation. We reserve a domain id for
1673 	 * this purpose.
1674 	 */
1675 	if (sm_supported(iommu))
1676 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1677 
1678 	return 0;
1679 }
1680 
1681 static void disable_dmar_iommu(struct intel_iommu *iommu)
1682 {
1683 	if (!iommu->domain_ids)
1684 		return;
1685 
1686 	/*
1687 	 * All iommu domains must have been detached from the devices,
1688 	 * hence there should be no domain IDs in use.
1689 	 */
1690 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1691 		    > NUM_RESERVED_DID))
1692 		return;
1693 
1694 	if (iommu->gcmd & DMA_GCMD_TE)
1695 		iommu_disable_translation(iommu);
1696 }
1697 
1698 static void free_dmar_iommu(struct intel_iommu *iommu)
1699 {
1700 	if (iommu->domain_ids) {
1701 		bitmap_free(iommu->domain_ids);
1702 		iommu->domain_ids = NULL;
1703 	}
1704 
1705 	if (iommu->copied_tables) {
1706 		bitmap_free(iommu->copied_tables);
1707 		iommu->copied_tables = NULL;
1708 	}
1709 
1710 	/* free context mapping */
1711 	free_context_table(iommu);
1712 
1713 #ifdef CONFIG_INTEL_IOMMU_SVM
1714 	if (pasid_supported(iommu)) {
1715 		if (ecap_prs(iommu->ecap))
1716 			intel_svm_finish_prq(iommu);
1717 	}
1718 #endif
1719 }
1720 
1721 /*
1722  * Check and return whether first level is used by default for
1723  * DMA translation.
1724  */
1725 static bool first_level_by_default(unsigned int type)
1726 {
1727 	/* Only SL is available in legacy mode */
1728 	if (!scalable_mode_support())
1729 		return false;
1730 
1731 	/* Only level (either FL or SL) is available, just use it */
1732 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1733 		return intel_cap_flts_sanity();
1734 
1735 	/* Both levels are available, decide it based on domain type */
1736 	return type != IOMMU_DOMAIN_UNMANAGED;
1737 }
1738 
1739 static struct dmar_domain *alloc_domain(unsigned int type)
1740 {
1741 	struct dmar_domain *domain;
1742 
1743 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1744 	if (!domain)
1745 		return NULL;
1746 
1747 	domain->nid = NUMA_NO_NODE;
1748 	if (first_level_by_default(type))
1749 		domain->use_first_level = true;
1750 	domain->has_iotlb_device = false;
1751 	INIT_LIST_HEAD(&domain->devices);
1752 	INIT_LIST_HEAD(&domain->dev_pasids);
1753 	spin_lock_init(&domain->lock);
1754 	xa_init(&domain->iommu_array);
1755 
1756 	return domain;
1757 }
1758 
1759 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1760 {
1761 	struct iommu_domain_info *info, *curr;
1762 	unsigned long ndomains;
1763 	int num, ret = -ENOSPC;
1764 
1765 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1766 	if (!info)
1767 		return -ENOMEM;
1768 
1769 	spin_lock(&iommu->lock);
1770 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1771 	if (curr) {
1772 		curr->refcnt++;
1773 		spin_unlock(&iommu->lock);
1774 		kfree(info);
1775 		return 0;
1776 	}
1777 
1778 	ndomains = cap_ndoms(iommu->cap);
1779 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1780 	if (num >= ndomains) {
1781 		pr_err("%s: No free domain ids\n", iommu->name);
1782 		goto err_unlock;
1783 	}
1784 
1785 	set_bit(num, iommu->domain_ids);
1786 	info->refcnt	= 1;
1787 	info->did	= num;
1788 	info->iommu	= iommu;
1789 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1790 			  NULL, info, GFP_ATOMIC);
1791 	if (curr) {
1792 		ret = xa_err(curr) ? : -EBUSY;
1793 		goto err_clear;
1794 	}
1795 	domain_update_iommu_cap(domain);
1796 
1797 	spin_unlock(&iommu->lock);
1798 	return 0;
1799 
1800 err_clear:
1801 	clear_bit(info->did, iommu->domain_ids);
1802 err_unlock:
1803 	spin_unlock(&iommu->lock);
1804 	kfree(info);
1805 	return ret;
1806 }
1807 
1808 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1809 {
1810 	struct iommu_domain_info *info;
1811 
1812 	spin_lock(&iommu->lock);
1813 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1814 	if (--info->refcnt == 0) {
1815 		clear_bit(info->did, iommu->domain_ids);
1816 		xa_erase(&domain->iommu_array, iommu->seq_id);
1817 		domain->nid = NUMA_NO_NODE;
1818 		domain_update_iommu_cap(domain);
1819 		kfree(info);
1820 	}
1821 	spin_unlock(&iommu->lock);
1822 }
1823 
/*
 * Round a guest address width up to the next width of the form
 * 12 + 9 * k, capped at 64. A width already of that form is returned
 * unchanged.
 */
static int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1837 
1838 static void domain_exit(struct dmar_domain *domain)
1839 {
1840 	if (domain->pgd) {
1841 		LIST_HEAD(freelist);
1842 
1843 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1844 		put_pages_list(&freelist);
1845 	}
1846 
1847 	if (WARN_ON(!list_empty(&domain->devices)))
1848 		return;
1849 
1850 	kfree(domain);
1851 }
1852 
/*
 * Install the context entry for (@bus, @devfn) on @iommu so that it
 * refers to @domain, then flush whatever the hardware may have cached for
 * the previous state of the entry.
 *
 * Returns 0 on success, and also when a present entry that was not copied
 * already exists (it is left untouched). Returns -ENOMEM when the context
 * entry cannot be obtained or when a page-table level that has to be
 * skipped is not present.
 */
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      u8 bus, u8 devfn)
{
	/* May be NULL; only consulted below to decide on device-IOTLB mode */
	struct device_domain_info *info =
			domain_lookup_dev_info(domain, iommu, bus, devfn);
	u16 did = domain_id_iommu(domain, iommu);
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct dma_pte *pgd = domain->pgd;
	struct context_entry *context;
	int agaw, ret;

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	spin_lock(&iommu->lock);
	ret = -ENOMEM;
	/* NOTE(review): the trailing 1 presumably requests allocation — confirm */
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	/* An entry that is present and was not copied is kept as it is */
	ret = 0;
	if (context_present(context) && !context_copied(iommu, bus, devfn))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(iommu, bus, devfn)) {
		u16 did_old = context_domain_id(context);

		/* Only flush for an old domain id the hardware can have used */
		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}

		clear_context_copied(iommu, bus, devfn);
	}

	/* Start from a clean entry and fill it in field by field */
	context_clear_entry(context);
	context_set_domain_id(context, did);

	if (translation != CONTEXT_TT_PASS_THROUGH) {
		/*
		 * Skip top levels of page tables for iommu which has
		 * less agaw than default. Unnecessary for PT mode.
		 */
		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
			ret = -ENOMEM;
			pgd = phys_to_virt(dma_pte_addr(pgd));
			if (!dma_pte_present(pgd))
				goto out_unlock;
		}

		if (info && info->ats_supported)
			translation = CONTEXT_TT_DEV_IOTLB;
		else
			translation = CONTEXT_TT_MULTI_LEVEL;

		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, agaw);
	} else {
		/*
		 * In pass through mode, AW must be programmed to
		 * indicate the largest AGAW value supported by
		 * hardware. And ASR is ignored by hardware.
		 */
		context_set_address_width(context, iommu->msagaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	/* Write the entry back explicitly when ecap_coherent() is not set */
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);

	return ret;
}
1965 
1966 static int domain_context_mapping_cb(struct pci_dev *pdev,
1967 				     u16 alias, void *opaque)
1968 {
1969 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1970 	struct intel_iommu *iommu = info->iommu;
1971 	struct dmar_domain *domain = opaque;
1972 
1973 	return domain_context_mapping_one(domain, iommu,
1974 					  PCI_BUS_NUM(alias), alias & 0xff);
1975 }
1976 
1977 static int
1978 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1979 {
1980 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 	struct intel_iommu *iommu = info->iommu;
1982 	u8 bus = info->bus, devfn = info->devfn;
1983 
1984 	if (!dev_is_pci(dev))
1985 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1986 
1987 	return pci_for_each_dma_alias(to_pci_dev(dev),
1988 				      domain_context_mapping_cb, domain);
1989 }
1990 
1991 /* Returns a number of VTD pages, but aligned to MM page size */
1992 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1993 {
1994 	host_addr &= ~PAGE_MASK;
1995 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1996 }
1997 
1998 /* Return largest possible superpage level for a given mapping */
1999 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
2000 				   unsigned long phy_pfn, unsigned long pages)
2001 {
2002 	int support, level = 1;
2003 	unsigned long pfnmerge;
2004 
2005 	support = domain->iommu_superpage;
2006 
2007 	/* To use a large page, the virtual *and* physical addresses
2008 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2009 	   of them will mean we have to use smaller pages. So just
2010 	   merge them and check both at once. */
2011 	pfnmerge = iov_pfn | phy_pfn;
2012 
2013 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2014 		pages >>= VTD_STRIDE_SHIFT;
2015 		if (!pages)
2016 			break;
2017 		pfnmerge >>= VTD_STRIDE_SHIFT;
2018 		level++;
2019 		support--;
2020 	}
2021 	return level;
2022 }
2023 
/*
 * Ensure that old small page tables are removed to make room for superpage(s).
 * We're going to add new large pages, so make sure we don't remove their parent
 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
 *
 * Walks [start_pfn, end_pfn] in steps of one superpage of the given
 * @level. Wherever the entry at that level is present, the page tables
 * below it are freed and the range is flushed on every IOMMU the domain
 * is attached to, plus its stage-1 children if it is a nested parent.
 *
 * NOTE(review): the result of pfn_to_dma_pte() is used without a NULL
 * check — confirm it cannot fail here.
 */
static void switch_to_super_page(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long end_pfn, int level)
{
	/* Number of pages covered by one entry at @level */
	unsigned long lvl_pages = lvl_to_nr_pages(level);
	struct iommu_domain_info *info;
	struct dma_pte *pte = NULL;
	unsigned long i;

	while (start_pfn <= end_pfn) {
		/* Look the entry up again at the start and at each table-page boundary */
		if (!pte)
			pte = pfn_to_dma_pte(domain, start_pfn, &level,
					     GFP_ATOMIC);

		if (dma_pte_present(pte)) {
			/* Keep tables at level + 1 and above; free what lies below */
			dma_pte_free_pagetable(domain, start_pfn,
					       start_pfn + lvl_pages - 1,
					       level + 1);

			xa_for_each(&domain->iommu_array, i, info)
				iommu_flush_iotlb_psi(info->iommu, domain,
						      start_pfn, lvl_pages,
						      0, 0);
			if (domain->nested_parent)
				parent_domain_flush(domain, start_pfn,
						    lvl_pages, 0);
		}

		pte++;
		start_pfn += lvl_pages;
		/* Ran off the end of this table page: force a fresh lookup */
		if (first_pte_in_page(pte))
			pte = NULL;
	}
}
2063 
/*
 * Install page-table entries in @domain that map @nr_pages VT-d pages of
 * IOVA space, starting at page frame @iov_pfn, onto physical page frames
 * starting at @phys_pfn.
 *
 * @prot: DMA_PTE_READ / DMA_PTE_WRITE / DMA_PTE_SNP bits; at least one of
 *        READ or WRITE must be set.
 * @gfp:  allocation flags passed to pfn_to_dma_pte() for new table pages.
 *
 * Superpages are used where hardware_largepage_caps() allows them; before a
 * superpage entry is written, switch_to_super_page() clears out any smaller
 * tables already occupying the range.
 *
 * Return: 0 on success, -EINVAL if the last IOVA pfn is not supported by
 * the domain, if @prot grants neither read nor write, or if a read-only
 * mapping is requested on a nested parent domain; -ENOMEM if a page-table
 * entry cannot be obtained.
 *
 * Note that finding an already-populated PTE is reported (pr_crit, WARN_ON
 * and a limited number of mapping dumps) but is not treated as a failure:
 * the existing entry is left in place and the loop carries on.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	/* Build the attribute bits shared by every PTE of this mapping. */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	/* Recorded before any PTE is written, and not undone on failure. */
	domain->has_mappings = true;

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		/*
		 * No current PTE cursor: (re)compute the best page size for
		 * what is left and look up the matching entry.
		 */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			/* May adjust largepage_lvl through the pointer. */
			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			/* Start of the run that will be cache-flushed later. */
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* Mapping with a superpage entry. */
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				/*
				 * Only clear as far as the entries that remain
				 * in this table page, or the end of the
				 * mapping, whichever comes first.
				 */
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				/* Back to small pages: drop the large-page bit. */
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		/* Only an empty (zero) entry is replaced; tmp is the old value. */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* Limits the number of full mapping dumps. */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		/* Advance every cursor by the size of the entry just handled. */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
2171 
2172 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2173 {
2174 	struct intel_iommu *iommu = info->iommu;
2175 	struct context_entry *context;
2176 	u16 did_old;
2177 
2178 	spin_lock(&iommu->lock);
2179 	context = iommu_context_addr(iommu, bus, devfn, 0);
2180 	if (!context) {
2181 		spin_unlock(&iommu->lock);
2182 		return;
2183 	}
2184 
2185 	did_old = context_domain_id(context);
2186 
2187 	context_clear_entry(context);
2188 	__iommu_flush_cache(iommu, context, sizeof(*context));
2189 	spin_unlock(&iommu->lock);
2190 	iommu->flush.flush_context(iommu,
2191 				   did_old,
2192 				   (((u16)bus) << 8) | devfn,
2193 				   DMA_CCMD_MASK_NOBIT,
2194 				   DMA_CCMD_DEVICE_INVL);
2195 
2196 	iommu->flush.flush_iotlb(iommu,
2197 				 did_old,
2198 				 0,
2199 				 0,
2200 				 DMA_TLB_DSI_FLUSH);
2201 
2202 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2203 }
2204 
2205 static int domain_setup_first_level(struct intel_iommu *iommu,
2206 				    struct dmar_domain *domain,
2207 				    struct device *dev,
2208 				    u32 pasid)
2209 {
2210 	struct dma_pte *pgd = domain->pgd;
2211 	int agaw, level;
2212 	int flags = 0;
2213 
2214 	/*
2215 	 * Skip top levels of page tables for iommu which has
2216 	 * less agaw than default. Unnecessary for PT mode.
2217 	 */
2218 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2219 		pgd = phys_to_virt(dma_pte_addr(pgd));
2220 		if (!dma_pte_present(pgd))
2221 			return -ENOMEM;
2222 	}
2223 
2224 	level = agaw_to_level(agaw);
2225 	if (level != 4 && level != 5)
2226 		return -EINVAL;
2227 
2228 	if (level == 5)
2229 		flags |= PASID_FLAG_FL5LP;
2230 
2231 	if (domain->force_snooping)
2232 		flags |= PASID_FLAG_PAGE_SNOOP;
2233 
2234 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2235 					     domain_id_iommu(domain, iommu),
2236 					     flags);
2237 }
2238 
2239 static bool dev_is_real_dma_subdevice(struct device *dev)
2240 {
2241 	return dev && dev_is_pci(dev) &&
2242 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2243 }
2244 
2245 static int iommu_domain_identity_map(struct dmar_domain *domain,
2246 				     unsigned long first_vpfn,
2247 				     unsigned long last_vpfn)
2248 {
2249 	/*
2250 	 * RMRR range might have overlap with physical memory range,
2251 	 * clear it first
2252 	 */
2253 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2254 
2255 	return __domain_mapping(domain, first_vpfn,
2256 				first_vpfn, last_vpfn - first_vpfn + 1,
2257 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2258 }
2259 
2260 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2261 
2262 static int __init si_domain_init(int hw)
2263 {
2264 	struct dmar_rmrr_unit *rmrr;
2265 	struct device *dev;
2266 	int i, nid, ret;
2267 
2268 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2269 	if (!si_domain)
2270 		return -EFAULT;
2271 
2272 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2273 		domain_exit(si_domain);
2274 		si_domain = NULL;
2275 		return -EFAULT;
2276 	}
2277 
2278 	if (hw)
2279 		return 0;
2280 
2281 	for_each_online_node(nid) {
2282 		unsigned long start_pfn, end_pfn;
2283 		int i;
2284 
2285 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2286 			ret = iommu_domain_identity_map(si_domain,
2287 					mm_to_dma_pfn_start(start_pfn),
2288 					mm_to_dma_pfn_end(end_pfn));
2289 			if (ret)
2290 				return ret;
2291 		}
2292 	}
2293 
2294 	/*
2295 	 * Identity map the RMRRs so that devices with RMRRs could also use
2296 	 * the si_domain.
2297 	 */
2298 	for_each_rmrr_units(rmrr) {
2299 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2300 					  i, dev) {
2301 			unsigned long long start = rmrr->base_address;
2302 			unsigned long long end = rmrr->end_address;
2303 
2304 			if (WARN_ON(end < start ||
2305 				    end >> agaw_to_width(si_domain->agaw)))
2306 				continue;
2307 
2308 			ret = iommu_domain_identity_map(si_domain,
2309 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2310 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2311 			if (ret)
2312 				return ret;
2313 		}
2314 	}
2315 
2316 	return 0;
2317 }
2318 
2319 static int dmar_domain_attach_device(struct dmar_domain *domain,
2320 				     struct device *dev)
2321 {
2322 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2323 	struct intel_iommu *iommu = info->iommu;
2324 	unsigned long flags;
2325 	int ret;
2326 
2327 	ret = domain_attach_iommu(domain, iommu);
2328 	if (ret)
2329 		return ret;
2330 	info->domain = domain;
2331 	spin_lock_irqsave(&domain->lock, flags);
2332 	list_add(&info->link, &domain->devices);
2333 	spin_unlock_irqrestore(&domain->lock, flags);
2334 
2335 	if (dev_is_real_dma_subdevice(dev))
2336 		return 0;
2337 
2338 	if (!sm_supported(iommu))
2339 		ret = domain_context_mapping(domain, dev);
2340 	else if (hw_pass_through && domain_type_is_si(domain))
2341 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2342 	else if (domain->use_first_level)
2343 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2344 	else
2345 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2346 
2347 	if (ret) {
2348 		device_block_translation(dev);
2349 		return ret;
2350 	}
2351 
2352 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2353 		iommu_enable_pci_caps(info);
2354 
2355 	return 0;
2356 }
2357 
2358 /**
2359  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2360  * is relaxable (ie. is allowed to be not enforced under some conditions)
2361  * @dev: device handle
2362  *
2363  * We assume that PCI USB devices with RMRRs have them largely
2364  * for historical reasons and that the RMRR space is not actively used post
2365  * boot.  This exclusion may change if vendors begin to abuse it.
2366  *
2367  * The same exception is made for graphics devices, with the requirement that
2368  * any use of the RMRR regions will be torn down before assigning the device
2369  * to a guest.
2370  *
2371  * Return: true if the RMRR is relaxable, false otherwise
2372  */
2373 static bool device_rmrr_is_relaxable(struct device *dev)
2374 {
2375 	struct pci_dev *pdev;
2376 
2377 	if (!dev_is_pci(dev))
2378 		return false;
2379 
2380 	pdev = to_pci_dev(dev);
2381 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2382 		return true;
2383 	else
2384 		return false;
2385 }
2386 
2387 /*
2388  * Return the required default domain type for a specific device.
2389  *
2390  * @dev: the device in query
2391  * @startup: true if this is during early boot
2392  *
2393  * Returns:
2394  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2395  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2396  *  - 0: both identity and dynamic domains work for this device
2397  */
2398 static int device_def_domain_type(struct device *dev)
2399 {
2400 	if (dev_is_pci(dev)) {
2401 		struct pci_dev *pdev = to_pci_dev(dev);
2402 
2403 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2404 			return IOMMU_DOMAIN_IDENTITY;
2405 
2406 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2407 			return IOMMU_DOMAIN_IDENTITY;
2408 	}
2409 
2410 	return 0;
2411 }
2412 
2413 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2414 {
2415 	/*
2416 	 * Start from the sane iommu hardware state.
2417 	 * If the queued invalidation is already initialized by us
2418 	 * (for example, while enabling interrupt-remapping) then
2419 	 * we got the things already rolling from a sane state.
2420 	 */
2421 	if (!iommu->qi) {
2422 		/*
2423 		 * Clear any previous faults.
2424 		 */
2425 		dmar_fault(-1, iommu);
2426 		/*
2427 		 * Disable queued invalidation if supported and already enabled
2428 		 * before OS handover.
2429 		 */
2430 		dmar_disable_qi(iommu);
2431 	}
2432 
2433 	if (dmar_enable_qi(iommu)) {
2434 		/*
2435 		 * Queued Invalidate not enabled, use Register Based Invalidate
2436 		 */
2437 		iommu->flush.flush_context = __iommu_flush_context;
2438 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2439 		pr_info("%s: Using Register based invalidation\n",
2440 			iommu->name);
2441 	} else {
2442 		iommu->flush.flush_context = qi_flush_context;
2443 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2444 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2445 	}
2446 }
2447 
/*
 * Copy the context table(s) referenced by one old root entry into freshly
 * allocated pages.
 *
 * @iommu:  IOMMU the tables belong to
 * @old_re: root entry of the previous kernel for @bus
 * @tbl:    array receiving the new context-table pages; slot @bus is used
 *          when @ext is false, slots @bus * 2 and @bus * 2 + 1 when true
 * @bus:    bus number being copied
 * @ext:    when true, each devfn occupies two context-entry slots, so one
 *          bus is spread over a lower table (devfn 0x00-0x7f) and an upper
 *          table (devfn 0x80-0xff)
 *
 * For every present entry copied, the domain id is marked as in use (when
 * it is within cap_ndoms()) and the (bus, devfn) pair is flagged via
 * set_context_copied().
 *
 * Return: 0 on success, including the case where the old root entry has no
 * table pointer; -ENOMEM if mapping the old table or allocating a new page
 * fails.
 */
static int copy_context_table(struct intel_iommu *iommu,
			      struct root_entry *old_re,
			      struct context_entry **tbl,
			      int bus, bool ext)
{
	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
	struct context_entry *new_ce = NULL, ce;
	struct context_entry *old_ce = NULL;
	struct root_entry re;
	phys_addr_t old_ce_phys;

	tbl_idx = ext ? bus * 2 : bus;
	/* Work on a local snapshot of the old root entry. */
	memcpy(&re, old_re, sizeof(re));

	for (devfn = 0; devfn < 256; devfn++) {
		/* First calculate the correct index */
		idx = (ext ? devfn * 2 : devfn) % 256;

		/*
		 * idx wraps to 0 at the start of each table: once at devfn 0
		 * and, with ext, a second time at devfn 0x80.
		 */
		if (idx == 0) {
			/* First save what we may have and clean up */
			if (new_ce) {
				tbl[tbl_idx] = new_ce;
				__iommu_flush_cache(iommu, new_ce,
						    VTD_PAGE_SIZE);
				/* The table stored at the end goes one slot up. */
				pos = 1;
			}

			if (old_ce)
				memunmap(old_ce);

			ret = 0;
			/* Lower half uses the lower table pointer, else upper. */
			if (devfn < 0x80)
				old_ce_phys = root_entry_lctp(&re);
			else
				old_ce_phys = root_entry_uctp(&re);

			if (!old_ce_phys) {
				if (ext && devfn == 0) {
					/* No LCTP, try UCTP */
					/*
					 * The loop increment moves devfn to
					 * 0x80, where idx is 0 again.
					 */
					devfn = 0x7f;
					continue;
				} else {
					/* Nothing (more) to copy; ret is 0. */
					goto out;
				}
			}

			ret = -ENOMEM;
			old_ce = memremap(old_ce_phys, PAGE_SIZE,
					MEMREMAP_WB);
			if (!old_ce)
				goto out;

			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
			if (!new_ce)
				goto out_unmap;

			ret = 0;
		}

		/* Now copy the context entry */
		memcpy(&ce, old_ce + idx, sizeof(ce));

		if (!context_present(&ce))
			continue;

		/* Reserve the domain id used by the old entry. */
		did = context_domain_id(&ce);
		if (did >= 0 && did < cap_ndoms(iommu->cap))
			set_bit(did, iommu->domain_ids);

		set_context_copied(iommu, bus, devfn);
		new_ce[idx] = ce;
	}

	/* Store the last table filled: slot +0, or +1 if one was saved above. */
	tbl[tbl_idx + pos] = new_ce;

	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);

out_unmap:
	memunmap(old_ce);

out:
	return ret;
}
2531 
2532 static int copy_translation_tables(struct intel_iommu *iommu)
2533 {
2534 	struct context_entry **ctxt_tbls;
2535 	struct root_entry *old_rt;
2536 	phys_addr_t old_rt_phys;
2537 	int ctxt_table_entries;
2538 	u64 rtaddr_reg;
2539 	int bus, ret;
2540 	bool new_ext, ext;
2541 
2542 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2543 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2544 	new_ext    = !!sm_supported(iommu);
2545 
2546 	/*
2547 	 * The RTT bit can only be changed when translation is disabled,
2548 	 * but disabling translation means to open a window for data
2549 	 * corruption. So bail out and don't copy anything if we would
2550 	 * have to change the bit.
2551 	 */
2552 	if (new_ext != ext)
2553 		return -EINVAL;
2554 
2555 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2556 	if (!iommu->copied_tables)
2557 		return -ENOMEM;
2558 
2559 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2560 	if (!old_rt_phys)
2561 		return -EINVAL;
2562 
2563 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2564 	if (!old_rt)
2565 		return -ENOMEM;
2566 
2567 	/* This is too big for the stack - allocate it from slab */
2568 	ctxt_table_entries = ext ? 512 : 256;
2569 	ret = -ENOMEM;
2570 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2571 	if (!ctxt_tbls)
2572 		goto out_unmap;
2573 
2574 	for (bus = 0; bus < 256; bus++) {
2575 		ret = copy_context_table(iommu, &old_rt[bus],
2576 					 ctxt_tbls, bus, ext);
2577 		if (ret) {
2578 			pr_err("%s: Failed to copy context table for bus %d\n",
2579 				iommu->name, bus);
2580 			continue;
2581 		}
2582 	}
2583 
2584 	spin_lock(&iommu->lock);
2585 
2586 	/* Context tables are copied, now write them to the root_entry table */
2587 	for (bus = 0; bus < 256; bus++) {
2588 		int idx = ext ? bus * 2 : bus;
2589 		u64 val;
2590 
2591 		if (ctxt_tbls[idx]) {
2592 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2593 			iommu->root_entry[bus].lo = val;
2594 		}
2595 
2596 		if (!ext || !ctxt_tbls[idx + 1])
2597 			continue;
2598 
2599 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2600 		iommu->root_entry[bus].hi = val;
2601 	}
2602 
2603 	spin_unlock(&iommu->lock);
2604 
2605 	kfree(ctxt_tbls);
2606 
2607 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2608 
2609 	ret = 0;
2610 
2611 out_unmap:
2612 	memunmap(old_rt);
2613 
2614 	return ret;
2615 }
2616 
/*
 * Boot-time initialisation of all DMA remapping units.
 *
 * In order: audit capabilities; per IOMMU, set up invalidation, domain id
 * bookkeeping and the root entry (copying the previous kernel's tables when
 * translation was left enabled and this is a kdump kernel); program the root
 * entries; create the static identity domain; finally enable page-request
 * handling (if configured) and the fault interrupt on every non-ignored
 * unit.
 *
 * Return: 0 on success. On any failure every active IOMMU is disabled and
 * freed, si_domain is destroyed if it exists, and the error is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
	if (ret)
		goto free_iommu;

	for_each_iommu(iommu, drhd) {
		/* Ignored units only get translation switched off. */
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/*
		 * Translation left on by whoever ran before us is only kept
		 * for a kdump kernel; otherwise switch it off.
		 */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		/* Pass-through is used only if every unit supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

	/* With graphics mapping disabled, graphics devices are identity-mapped. */
	if (!dmar_map_gfx)
		iommu_identity_mapping |= IDENTMAP_GFX;

	check_tylersburg_isoch();

	ret = si_domain_init(hw_pass_through);
	if (ret)
		goto free_iommu;

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
			 * could cause possible lock race condition.
			 */
			/* So the lock is dropped around the PRQ setup. */
			up_write(&dmar_global_lock);
			ret = intel_svm_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Common failure path: undo everything set up so far. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	if (si_domain) {
		domain_exit(si_domain);
		si_domain = NULL;
	}

	return ret;
}
2769 
2770 static void __init init_no_remapping_devices(void)
2771 {
2772 	struct dmar_drhd_unit *drhd;
2773 	struct device *dev;
2774 	int i;
2775 
2776 	for_each_drhd_unit(drhd) {
2777 		if (!drhd->include_all) {
2778 			for_each_active_dev_scope(drhd->devices,
2779 						  drhd->devices_cnt, i, dev)
2780 				break;
2781 			/* ignore DMAR unit if no devices exist */
2782 			if (i == drhd->devices_cnt)
2783 				drhd->ignored = 1;
2784 		}
2785 	}
2786 
2787 	for_each_active_drhd_unit(drhd) {
2788 		if (drhd->include_all)
2789 			continue;
2790 
2791 		for_each_active_dev_scope(drhd->devices,
2792 					  drhd->devices_cnt, i, dev)
2793 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2794 				break;
2795 		if (i < drhd->devices_cnt)
2796 			continue;
2797 
2798 		/* This IOMMU has *only* gfx devices. Either bypass it or
2799 		   set the gfx_mapped flag, as appropriate */
2800 		drhd->gfx_dedicated = 1;
2801 		if (!dmar_map_gfx)
2802 			drhd->ignored = 1;
2803 	}
2804 }
2805 
2806 #ifdef CONFIG_SUSPEND
2807 static int init_iommu_hw(void)
2808 {
2809 	struct dmar_drhd_unit *drhd;
2810 	struct intel_iommu *iommu = NULL;
2811 	int ret;
2812 
2813 	for_each_active_iommu(iommu, drhd) {
2814 		if (iommu->qi) {
2815 			ret = dmar_reenable_qi(iommu);
2816 			if (ret)
2817 				return ret;
2818 		}
2819 	}
2820 
2821 	for_each_iommu(iommu, drhd) {
2822 		if (drhd->ignored) {
2823 			/*
2824 			 * we always have to disable PMRs or DMA may fail on
2825 			 * this device
2826 			 */
2827 			if (force_on)
2828 				iommu_disable_protect_mem_regions(iommu);
2829 			continue;
2830 		}
2831 
2832 		iommu_flush_write_buffer(iommu);
2833 		iommu_set_root_entry(iommu);
2834 		iommu_enable_translation(iommu);
2835 		iommu_disable_protect_mem_regions(iommu);
2836 	}
2837 
2838 	return 0;
2839 }
2840 
2841 static void iommu_flush_all(void)
2842 {
2843 	struct dmar_drhd_unit *drhd;
2844 	struct intel_iommu *iommu;
2845 
2846 	for_each_active_iommu(iommu, drhd) {
2847 		iommu->flush.flush_context(iommu, 0, 0, 0,
2848 					   DMA_CCMD_GLOBAL_INVL);
2849 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850 					 DMA_TLB_GLOBAL_FLUSH);
2851 	}
2852 }
2853 
2854 static int iommu_suspend(void)
2855 {
2856 	struct dmar_drhd_unit *drhd;
2857 	struct intel_iommu *iommu = NULL;
2858 	unsigned long flag;
2859 
2860 	iommu_flush_all();
2861 
2862 	for_each_active_iommu(iommu, drhd) {
2863 		iommu_disable_translation(iommu);
2864 
2865 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2866 
2867 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2868 			readl(iommu->reg + DMAR_FECTL_REG);
2869 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2870 			readl(iommu->reg + DMAR_FEDATA_REG);
2871 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2872 			readl(iommu->reg + DMAR_FEADDR_REG);
2873 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2874 			readl(iommu->reg + DMAR_FEUADDR_REG);
2875 
2876 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2877 	}
2878 	return 0;
2879 }
2880 
2881 static void iommu_resume(void)
2882 {
2883 	struct dmar_drhd_unit *drhd;
2884 	struct intel_iommu *iommu = NULL;
2885 	unsigned long flag;
2886 
2887 	if (init_iommu_hw()) {
2888 		if (force_on)
2889 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2890 		else
2891 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2892 		return;
2893 	}
2894 
2895 	for_each_active_iommu(iommu, drhd) {
2896 
2897 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2898 
2899 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2900 			iommu->reg + DMAR_FECTL_REG);
2901 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2902 			iommu->reg + DMAR_FEDATA_REG);
2903 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2904 			iommu->reg + DMAR_FEADDR_REG);
2905 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2906 			iommu->reg + DMAR_FEUADDR_REG);
2907 
2908 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2909 	}
2910 }
2911 
2912 static struct syscore_ops iommu_syscore_ops = {
2913 	.resume		= iommu_resume,
2914 	.suspend	= iommu_suspend,
2915 };
2916 
2917 static void __init init_iommu_pm_ops(void)
2918 {
2919 	register_syscore_ops(&iommu_syscore_ops);
2920 }
2921 
2922 #else
/* No suspend support configured: there is nothing to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif	/* CONFIG_SUSPEND */
2925 
2926 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2927 {
2928 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2929 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2930 	    rmrr->end_address <= rmrr->base_address ||
2931 	    arch_rmrr_sanity_check(rmrr))
2932 		return -EINVAL;
2933 
2934 	return 0;
2935 }
2936 
2937 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2938 {
2939 	struct acpi_dmar_reserved_memory *rmrr;
2940 	struct dmar_rmrr_unit *rmrru;
2941 
2942 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2943 	if (rmrr_sanity_check(rmrr)) {
2944 		pr_warn(FW_BUG
2945 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2946 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2947 			   rmrr->base_address, rmrr->end_address,
2948 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2949 			   dmi_get_system_info(DMI_BIOS_VERSION),
2950 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2951 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2952 	}
2953 
2954 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2955 	if (!rmrru)
2956 		goto out;
2957 
2958 	rmrru->hdr = header;
2959 
2960 	rmrru->base_address = rmrr->base_address;
2961 	rmrru->end_address = rmrr->end_address;
2962 
2963 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2964 				((void *)rmrr) + rmrr->header.length,
2965 				&rmrru->devices_cnt);
2966 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2967 		goto free_rmrru;
2968 
2969 	list_add(&rmrru->list, &dmar_rmrr_units);
2970 
2971 	return 0;
2972 free_rmrru:
2973 	kfree(rmrru);
2974 out:
2975 	return -ENOMEM;
2976 }
2977 
2978 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2979 {
2980 	struct dmar_atsr_unit *atsru;
2981 	struct acpi_dmar_atsr *tmp;
2982 
2983 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2984 				dmar_rcu_check()) {
2985 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2986 		if (atsr->segment != tmp->segment)
2987 			continue;
2988 		if (atsr->header.length != tmp->header.length)
2989 			continue;
2990 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2991 			return atsru;
2992 	}
2993 
2994 	return NULL;
2995 }
2996 
2997 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2998 {
2999 	struct acpi_dmar_atsr *atsr;
3000 	struct dmar_atsr_unit *atsru;
3001 
3002 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3003 		return 0;
3004 
3005 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3006 	atsru = dmar_find_atsr(atsr);
3007 	if (atsru)
3008 		return 0;
3009 
3010 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3011 	if (!atsru)
3012 		return -ENOMEM;
3013 
3014 	/*
3015 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3016 	 * copy the memory content because the memory buffer will be freed
3017 	 * on return.
3018 	 */
3019 	atsru->hdr = (void *)(atsru + 1);
3020 	memcpy(atsru->hdr, hdr, hdr->length);
3021 	atsru->include_all = atsr->flags & 0x1;
3022 	if (!atsru->include_all) {
3023 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3024 				(void *)atsr + atsr->header.length,
3025 				&atsru->devices_cnt);
3026 		if (atsru->devices_cnt && atsru->devices == NULL) {
3027 			kfree(atsru);
3028 			return -ENOMEM;
3029 		}
3030 	}
3031 
3032 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3033 
3034 	return 0;
3035 }
3036 
3037 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3038 {
3039 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3040 	kfree(atsru);
3041 }
3042 
3043 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3044 {
3045 	struct acpi_dmar_atsr *atsr;
3046 	struct dmar_atsr_unit *atsru;
3047 
3048 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3049 	atsru = dmar_find_atsr(atsr);
3050 	if (atsru) {
3051 		list_del_rcu(&atsru->list);
3052 		synchronize_rcu();
3053 		intel_iommu_free_atsr(atsru);
3054 	}
3055 
3056 	return 0;
3057 }
3058 
3059 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3060 {
3061 	int i;
3062 	struct device *dev;
3063 	struct acpi_dmar_atsr *atsr;
3064 	struct dmar_atsr_unit *atsru;
3065 
3066 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3067 	atsru = dmar_find_atsr(atsr);
3068 	if (!atsru)
3069 		return 0;
3070 
3071 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3072 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3073 					  i, dev)
3074 			return -EBUSY;
3075 	}
3076 
3077 	return 0;
3078 }
3079 
3080 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3081 {
3082 	struct dmar_satc_unit *satcu;
3083 	struct acpi_dmar_satc *tmp;
3084 
3085 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3086 				dmar_rcu_check()) {
3087 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3088 		if (satc->segment != tmp->segment)
3089 			continue;
3090 		if (satc->header.length != tmp->header.length)
3091 			continue;
3092 		if (memcmp(satc, tmp, satc->header.length) == 0)
3093 			return satcu;
3094 	}
3095 
3096 	return NULL;
3097 }
3098 
3099 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3100 {
3101 	struct acpi_dmar_satc *satc;
3102 	struct dmar_satc_unit *satcu;
3103 
3104 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3105 		return 0;
3106 
3107 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3108 	satcu = dmar_find_satc(satc);
3109 	if (satcu)
3110 		return 0;
3111 
3112 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3113 	if (!satcu)
3114 		return -ENOMEM;
3115 
3116 	satcu->hdr = (void *)(satcu + 1);
3117 	memcpy(satcu->hdr, hdr, hdr->length);
3118 	satcu->atc_required = satc->flags & 0x1;
3119 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3120 					      (void *)satc + satc->header.length,
3121 					      &satcu->devices_cnt);
3122 	if (satcu->devices_cnt && !satcu->devices) {
3123 		kfree(satcu);
3124 		return -ENOMEM;
3125 	}
3126 	list_add_rcu(&satcu->list, &dmar_satc_units);
3127 
3128 	return 0;
3129 }
3130 
/*
 * Bring up the IOMMU of a hot-added DRHD unit.
 *
 * Returns 0 on success, including the case where the unit is marked as
 * ignored, or a negative errno.  Failures that jump to the labels at the
 * bottom release the unit through free_dmar_iommu(); the two -ENXIO
 * capability checks return directly without passing through those labels.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret;
	struct intel_iommu *iommu = dmaru->iommu;

	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
	if (ret)
		goto out;

	/* hw_pass_through is set but this unit's ecap lacks pass-through. */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * sp >= 0 means a superpage level is in use; reject the unit when
	 * the matching bit is missing from its superpage capability field.
	 */
	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	/* The root entry is only allocated once domain init has succeeded. */
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

	intel_svm_check(iommu);

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The page request queue is only set up with PASID and PRS support. */
	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	/* Install the root entry before switching translation on. */
	iommu_set_root_entry(iommu);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
3202 
3203 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3204 {
3205 	int ret = 0;
3206 	struct intel_iommu *iommu = dmaru->iommu;
3207 
3208 	if (!intel_iommu_enabled)
3209 		return 0;
3210 	if (iommu == NULL)
3211 		return -EINVAL;
3212 
3213 	if (insert) {
3214 		ret = intel_iommu_add(dmaru);
3215 	} else {
3216 		disable_dmar_iommu(iommu);
3217 		free_dmar_iommu(iommu);
3218 	}
3219 
3220 	return ret;
3221 }
3222 
3223 static void intel_iommu_free_dmars(void)
3224 {
3225 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3226 	struct dmar_atsr_unit *atsru, *atsr_n;
3227 	struct dmar_satc_unit *satcu, *satc_n;
3228 
3229 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3230 		list_del(&rmrru->list);
3231 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3232 		kfree(rmrru);
3233 	}
3234 
3235 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3236 		list_del(&atsru->list);
3237 		intel_iommu_free_atsr(atsru);
3238 	}
3239 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3240 		list_del(&satcu->list);
3241 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3242 		kfree(satcu);
3243 	}
3244 }
3245 
3246 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3247 {
3248 	struct dmar_satc_unit *satcu;
3249 	struct acpi_dmar_satc *satc;
3250 	struct device *tmp;
3251 	int i;
3252 
3253 	dev = pci_physfn(dev);
3254 	rcu_read_lock();
3255 
3256 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3257 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3258 		if (satc->segment != pci_domain_nr(dev->bus))
3259 			continue;
3260 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3261 			if (to_pci_dev(tmp) == dev)
3262 				goto out;
3263 	}
3264 	satcu = NULL;
3265 out:
3266 	rcu_read_unlock();
3267 	return satcu;
3268 }
3269 
3270 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3271 {
3272 	int i, ret = 1;
3273 	struct pci_bus *bus;
3274 	struct pci_dev *bridge = NULL;
3275 	struct device *tmp;
3276 	struct acpi_dmar_atsr *atsr;
3277 	struct dmar_atsr_unit *atsru;
3278 	struct dmar_satc_unit *satcu;
3279 
3280 	dev = pci_physfn(dev);
3281 	satcu = dmar_find_matched_satc_unit(dev);
3282 	if (satcu)
3283 		/*
3284 		 * This device supports ATS as it is in SATC table.
3285 		 * When IOMMU is in legacy mode, enabling ATS is done
3286 		 * automatically by HW for the device that requires
3287 		 * ATS, hence OS should not enable this device ATS
3288 		 * to avoid duplicated TLB invalidation.
3289 		 */
3290 		return !(satcu->atc_required && !sm_supported(iommu));
3291 
3292 	for (bus = dev->bus; bus; bus = bus->parent) {
3293 		bridge = bus->self;
3294 		/* If it's an integrated device, allow ATS */
3295 		if (!bridge)
3296 			return 1;
3297 		/* Connected via non-PCIe: no ATS */
3298 		if (!pci_is_pcie(bridge) ||
3299 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3300 			return 0;
3301 		/* If we found the root port, look it up in the ATSR */
3302 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3303 			break;
3304 	}
3305 
3306 	rcu_read_lock();
3307 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3308 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3309 		if (atsr->segment != pci_domain_nr(dev->bus))
3310 			continue;
3311 
3312 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3313 			if (tmp == &bridge->dev)
3314 				goto out;
3315 
3316 		if (atsru->include_all)
3317 			goto out;
3318 	}
3319 	ret = 0;
3320 out:
3321 	rcu_read_unlock();
3322 
3323 	return ret;
3324 }
3325 
3326 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3327 {
3328 	int ret;
3329 	struct dmar_rmrr_unit *rmrru;
3330 	struct dmar_atsr_unit *atsru;
3331 	struct dmar_satc_unit *satcu;
3332 	struct acpi_dmar_atsr *atsr;
3333 	struct acpi_dmar_reserved_memory *rmrr;
3334 	struct acpi_dmar_satc *satc;
3335 
3336 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3337 		return 0;
3338 
3339 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3340 		rmrr = container_of(rmrru->hdr,
3341 				    struct acpi_dmar_reserved_memory, header);
3342 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3343 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3344 				((void *)rmrr) + rmrr->header.length,
3345 				rmrr->segment, rmrru->devices,
3346 				rmrru->devices_cnt);
3347 			if (ret < 0)
3348 				return ret;
3349 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3350 			dmar_remove_dev_scope(info, rmrr->segment,
3351 				rmrru->devices, rmrru->devices_cnt);
3352 		}
3353 	}
3354 
3355 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3356 		if (atsru->include_all)
3357 			continue;
3358 
3359 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3360 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3361 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3362 					(void *)atsr + atsr->header.length,
3363 					atsr->segment, atsru->devices,
3364 					atsru->devices_cnt);
3365 			if (ret > 0)
3366 				break;
3367 			else if (ret < 0)
3368 				return ret;
3369 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3370 			if (dmar_remove_dev_scope(info, atsr->segment,
3371 					atsru->devices, atsru->devices_cnt))
3372 				break;
3373 		}
3374 	}
3375 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3376 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3377 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3378 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3379 					(void *)satc + satc->header.length,
3380 					satc->segment, satcu->devices,
3381 					satcu->devices_cnt);
3382 			if (ret > 0)
3383 				break;
3384 			else if (ret < 0)
3385 				return ret;
3386 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3387 			if (dmar_remove_dev_scope(info, satc->segment,
3388 					satcu->devices, satcu->devices_cnt))
3389 				break;
3390 		}
3391 	}
3392 
3393 	return 0;
3394 }
3395 
3396 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3397 				       unsigned long val, void *v)
3398 {
3399 	struct memory_notify *mhp = v;
3400 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3401 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3402 			mhp->nr_pages - 1);
3403 
3404 	switch (val) {
3405 	case MEM_GOING_ONLINE:
3406 		if (iommu_domain_identity_map(si_domain,
3407 					      start_vpfn, last_vpfn)) {
3408 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3409 				start_vpfn, last_vpfn);
3410 			return NOTIFY_BAD;
3411 		}
3412 		break;
3413 
3414 	case MEM_OFFLINE:
3415 	case MEM_CANCEL_ONLINE:
3416 		{
3417 			struct dmar_drhd_unit *drhd;
3418 			struct intel_iommu *iommu;
3419 			LIST_HEAD(freelist);
3420 
3421 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3422 
3423 			rcu_read_lock();
3424 			for_each_active_iommu(iommu, drhd)
3425 				iommu_flush_iotlb_psi(iommu, si_domain,
3426 					start_vpfn, mhp->nr_pages,
3427 					list_empty(&freelist), 0);
3428 			rcu_read_unlock();
3429 			put_pages_list(&freelist);
3430 		}
3431 		break;
3432 	}
3433 
3434 	return NOTIFY_OK;
3435 }
3436 
/*
 * Notifier block that routes memory hotplug events to
 * intel_iommu_memory_notifier().
 */
static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
3441 
3442 static void intel_disable_iommus(void)
3443 {
3444 	struct intel_iommu *iommu = NULL;
3445 	struct dmar_drhd_unit *drhd;
3446 
3447 	for_each_iommu(iommu, drhd)
3448 		iommu_disable_translation(iommu);
3449 }
3450 
3451 void intel_iommu_shutdown(void)
3452 {
3453 	struct dmar_drhd_unit *drhd;
3454 	struct intel_iommu *iommu = NULL;
3455 
3456 	if (no_iommu || dmar_disabled)
3457 		return;
3458 
3459 	down_write(&dmar_global_lock);
3460 
3461 	/* Disable PMRs explicitly here. */
3462 	for_each_iommu(iommu, drhd)
3463 		iommu_disable_protect_mem_regions(iommu);
3464 
3465 	/* Make sure the IOMMUs are switched off */
3466 	intel_disable_iommus();
3467 
3468 	up_write(&dmar_global_lock);
3469 }
3470 
3471 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3472 {
3473 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3474 
3475 	return container_of(iommu_dev, struct intel_iommu, iommu);
3476 }
3477 
3478 static ssize_t version_show(struct device *dev,
3479 			    struct device_attribute *attr, char *buf)
3480 {
3481 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3482 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3483 	return sysfs_emit(buf, "%d:%d\n",
3484 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3485 }
3486 static DEVICE_ATTR_RO(version);
3487 
3488 static ssize_t address_show(struct device *dev,
3489 			    struct device_attribute *attr, char *buf)
3490 {
3491 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3492 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3493 }
3494 static DEVICE_ATTR_RO(address);
3495 
3496 static ssize_t cap_show(struct device *dev,
3497 			struct device_attribute *attr, char *buf)
3498 {
3499 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3500 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3501 }
3502 static DEVICE_ATTR_RO(cap);
3503 
3504 static ssize_t ecap_show(struct device *dev,
3505 			 struct device_attribute *attr, char *buf)
3506 {
3507 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3508 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3509 }
3510 static DEVICE_ATTR_RO(ecap);
3511 
3512 static ssize_t domains_supported_show(struct device *dev,
3513 				      struct device_attribute *attr, char *buf)
3514 {
3515 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3516 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3517 }
3518 static DEVICE_ATTR_RO(domains_supported);
3519 
3520 static ssize_t domains_used_show(struct device *dev,
3521 				 struct device_attribute *attr, char *buf)
3522 {
3523 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3524 	return sysfs_emit(buf, "%d\n",
3525 			  bitmap_weight(iommu->domain_ids,
3526 					cap_ndoms(iommu->cap)));
3527 }
3528 static DEVICE_ATTR_RO(domains_used);
3529 
/* Read-only sysfs attributes exposed for each IOMMU; NULL-terminated. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
3539 
/* Groups the attributes above under an "intel-iommu" sysfs directory. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
3544 
/* NULL-terminated list of attribute groups passed to iommu_device_sysfs_add(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3549 
3550 static bool has_external_pci(void)
3551 {
3552 	struct pci_dev *pdev = NULL;
3553 
3554 	for_each_pci_dev(pdev)
3555 		if (pdev->external_facing) {
3556 			pci_dev_put(pdev);
3557 			return true;
3558 		}
3559 
3560 	return false;
3561 }
3562 
3563 static int __init platform_optin_force_iommu(void)
3564 {
3565 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3566 		return 0;
3567 
3568 	if (no_iommu || dmar_disabled)
3569 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3570 
3571 	/*
3572 	 * If Intel-IOMMU is disabled by default, we will apply identity
3573 	 * map for all devices except those marked as being untrusted.
3574 	 */
3575 	if (dmar_disabled)
3576 		iommu_set_default_passthrough(false);
3577 
3578 	dmar_disabled = 0;
3579 	no_iommu = 0;
3580 
3581 	return 1;
3582 }
3583 
3584 static int __init probe_acpi_namespace_devices(void)
3585 {
3586 	struct dmar_drhd_unit *drhd;
3587 	/* To avoid a -Wunused-but-set-variable warning. */
3588 	struct intel_iommu *iommu __maybe_unused;
3589 	struct device *dev;
3590 	int i, ret = 0;
3591 
3592 	for_each_active_iommu(iommu, drhd) {
3593 		for_each_active_dev_scope(drhd->devices,
3594 					  drhd->devices_cnt, i, dev) {
3595 			struct acpi_device_physical_node *pn;
3596 			struct acpi_device *adev;
3597 
3598 			if (dev->bus != &acpi_bus_type)
3599 				continue;
3600 
3601 			adev = to_acpi_device(dev);
3602 			mutex_lock(&adev->physical_node_lock);
3603 			list_for_each_entry(pn,
3604 					    &adev->physical_node_list, node) {
3605 				ret = iommu_probe_device(pn->dev);
3606 				if (ret)
3607 					break;
3608 			}
3609 			mutex_unlock(&adev->physical_node_lock);
3610 
3611 			if (ret)
3612 				return ret;
3613 		}
3614 	}
3615 
3616 	return 0;
3617 }
3618 
3619 static __init int tboot_force_iommu(void)
3620 {
3621 	if (!tboot_enabled())
3622 		return 0;
3623 
3624 	if (no_iommu || dmar_disabled)
3625 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3626 
3627 	dmar_disabled = 0;
3628 	no_iommu = 0;
3629 
3630 	return 1;
3631 }
3632 
/*
 * Driver entry point: parse the DMAR table, initialise all DMAR units,
 * register every active IOMMU with the IOMMU core and finally enable DMA
 * translation.
 *
 * Returns 0 on success.  On failure the parsed RMRR/ATSR/SATC units are
 * freed and either -ENODEV (the initial value of ret) or the error from
 * init_dmars() is returned.  When force_on is set, failures panic instead
 * of returning.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* From here on the DMAR lists are only read, so a read lock suffices. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		/* NOTE(review): the return values of the next two calls are ignored. */
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);

		iommu_pmu_register(iommu);
	}
	up_read(&dmar_global_lock);

	/* The memory notifier is registered outside of dmar_global_lock. */
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	down_read(&dmar_global_lock);
	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/* Finally, we enable the DMA remapping hardware. */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	/* Every jump to this label is made with dmar_global_lock write-held. */
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3767 
3768 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3769 {
3770 	struct device_domain_info *info = opaque;
3771 
3772 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3773 	return 0;
3774 }
3775 
3776 /*
3777  * NB - intel-iommu lacks any sort of reference counting for the users of
3778  * dependent devices.  If multiple endpoints have intersecting dependent
3779  * devices, unbinding the driver from any one of them will possibly leave
3780  * the others unable to operate.
3781  */
3782 static void domain_context_clear(struct device_domain_info *info)
3783 {
3784 	if (!dev_is_pci(info->dev))
3785 		domain_context_clear_one(info, info->bus, info->devfn);
3786 
3787 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3788 			       &domain_context_clear_one_cb, info);
3789 }
3790 
3791 /*
3792  * Clear the page table pointer in context or pasid table entries so that
3793  * all DMA requests without PASID from the device are blocked. If the page
3794  * table has been set, clean up the data structures.
3795  */
3796 void device_block_translation(struct device *dev)
3797 {
3798 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3799 	struct intel_iommu *iommu = info->iommu;
3800 	unsigned long flags;
3801 
3802 	iommu_disable_pci_caps(info);
3803 	if (!dev_is_real_dma_subdevice(dev)) {
3804 		if (sm_supported(iommu))
3805 			intel_pasid_tear_down_entry(iommu, dev,
3806 						    IOMMU_NO_PASID, false);
3807 		else
3808 			domain_context_clear(info);
3809 	}
3810 
3811 	if (!info->domain)
3812 		return;
3813 
3814 	spin_lock_irqsave(&info->domain->lock, flags);
3815 	list_del(&info->link);
3816 	spin_unlock_irqrestore(&info->domain->lock, flags);
3817 
3818 	domain_detach_iommu(info->domain, iommu);
3819 	info->domain = NULL;
3820 }
3821 
3822 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3823 {
3824 	int adjust_width;
3825 
3826 	/* calculate AGAW */
3827 	domain->gaw = guest_width;
3828 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3829 	domain->agaw = width_to_agaw(adjust_width);
3830 
3831 	domain->iommu_coherency = false;
3832 	domain->iommu_superpage = 0;
3833 	domain->max_addr = 0;
3834 
3835 	/* always allocate the top pgd */
3836 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3837 	if (!domain->pgd)
3838 		return -ENOMEM;
3839 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3840 	return 0;
3841 }
3842 
/*
 * attach_dev handler of the blocking domain: block translation for @dev.
 * @domain is not used.  Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
3849 
/*
 * Statically allocated domain of type IOMMU_DOMAIN_BLOCKED; its only
 * operation is attach_dev, handled by blocking_domain_attach_dev().
 */
static struct iommu_domain blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &(const struct iommu_domain_ops) {
		.attach_dev	= blocking_domain_attach_dev,
	}
};
3856 
3857 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3858 {
3859 	struct dmar_domain *dmar_domain;
3860 	struct iommu_domain *domain;
3861 
3862 	switch (type) {
3863 	case IOMMU_DOMAIN_DMA:
3864 	case IOMMU_DOMAIN_UNMANAGED:
3865 		dmar_domain = alloc_domain(type);
3866 		if (!dmar_domain) {
3867 			pr_err("Can't allocate dmar_domain\n");
3868 			return NULL;
3869 		}
3870 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3871 			pr_err("Domain initialization failed\n");
3872 			domain_exit(dmar_domain);
3873 			return NULL;
3874 		}
3875 
3876 		domain = &dmar_domain->domain;
3877 		domain->geometry.aperture_start = 0;
3878 		domain->geometry.aperture_end   =
3879 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3880 		domain->geometry.force_aperture = true;
3881 
3882 		return domain;
3883 	case IOMMU_DOMAIN_IDENTITY:
3884 		return &si_domain->domain;
3885 	case IOMMU_DOMAIN_SVA:
3886 		return intel_svm_domain_alloc();
3887 	default:
3888 		return NULL;
3889 	}
3890 
3891 	return NULL;
3892 }
3893 
3894 static struct iommu_domain *
3895 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3896 			      struct iommu_domain *parent,
3897 			      const struct iommu_user_data *user_data)
3898 {
3899 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3900 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3901 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3902 	struct intel_iommu *iommu = info->iommu;
3903 	struct dmar_domain *dmar_domain;
3904 	struct iommu_domain *domain;
3905 
3906 	/* Must be NESTING domain */
3907 	if (parent) {
3908 		if (!nested_supported(iommu) || flags)
3909 			return ERR_PTR(-EOPNOTSUPP);
3910 		return intel_nested_domain_alloc(parent, user_data);
3911 	}
3912 
3913 	if (flags &
3914 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3915 		return ERR_PTR(-EOPNOTSUPP);
3916 	if (nested_parent && !nested_supported(iommu))
3917 		return ERR_PTR(-EOPNOTSUPP);
3918 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3919 		return ERR_PTR(-EOPNOTSUPP);
3920 
3921 	/*
3922 	 * domain_alloc_user op needs to fully initialize a domain before
3923 	 * return, so uses iommu_domain_alloc() here for simple.
3924 	 */
3925 	domain = iommu_domain_alloc(dev->bus);
3926 	if (!domain)
3927 		return ERR_PTR(-ENOMEM);
3928 
3929 	dmar_domain = to_dmar_domain(domain);
3930 
3931 	if (nested_parent) {
3932 		dmar_domain->nested_parent = true;
3933 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3934 		spin_lock_init(&dmar_domain->s1_lock);
3935 	}
3936 
3937 	if (dirty_tracking) {
3938 		if (dmar_domain->use_first_level) {
3939 			iommu_domain_free(domain);
3940 			return ERR_PTR(-EOPNOTSUPP);
3941 		}
3942 		domain->dirty_ops = &intel_dirty_ops;
3943 	}
3944 
3945 	return domain;
3946 }
3947 
3948 static void intel_iommu_domain_free(struct iommu_domain *domain)
3949 {
3950 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3951 
3952 	WARN_ON(dmar_domain->nested_parent &&
3953 		!list_empty(&dmar_domain->s1_domains));
3954 	if (domain != &si_domain->domain)
3955 		domain_exit(dmar_domain);
3956 }
3957 
3958 int prepare_domain_attach_device(struct iommu_domain *domain,
3959 				 struct device *dev)
3960 {
3961 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3962 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3963 	struct intel_iommu *iommu = info->iommu;
3964 	int addr_width;
3965 
3966 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3967 		return -EINVAL;
3968 
3969 	if (domain->dirty_ops && !ssads_supported(iommu))
3970 		return -EINVAL;
3971 
3972 	/* check if this iommu agaw is sufficient for max mapped address */
3973 	addr_width = agaw_to_width(iommu->agaw);
3974 	if (addr_width > cap_mgaw(iommu->cap))
3975 		addr_width = cap_mgaw(iommu->cap);
3976 
3977 	if (dmar_domain->max_addr > (1LL << addr_width))
3978 		return -EINVAL;
3979 	dmar_domain->gaw = addr_width;
3980 
3981 	/*
3982 	 * Knock out extra levels of page tables if necessary
3983 	 */
3984 	while (iommu->agaw < dmar_domain->agaw) {
3985 		struct dma_pte *pte;
3986 
3987 		pte = dmar_domain->pgd;
3988 		if (dma_pte_present(pte)) {
3989 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3990 			free_pgtable_page(pte);
3991 		}
3992 		dmar_domain->agaw--;
3993 	}
3994 
3995 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3996 	    context_copied(iommu, info->bus, info->devfn))
3997 		return intel_pasid_setup_sm_context(dev);
3998 
3999 	return 0;
4000 }
4001 
4002 static int intel_iommu_attach_device(struct iommu_domain *domain,
4003 				     struct device *dev)
4004 {
4005 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4006 	int ret;
4007 
4008 	if (info->domain)
4009 		device_block_translation(dev);
4010 
4011 	ret = prepare_domain_attach_device(domain, dev);
4012 	if (ret)
4013 		return ret;
4014 
4015 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4016 }
4017 
4018 static int intel_iommu_map(struct iommu_domain *domain,
4019 			   unsigned long iova, phys_addr_t hpa,
4020 			   size_t size, int iommu_prot, gfp_t gfp)
4021 {
4022 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4023 	u64 max_addr;
4024 	int prot = 0;
4025 
4026 	if (iommu_prot & IOMMU_READ)
4027 		prot |= DMA_PTE_READ;
4028 	if (iommu_prot & IOMMU_WRITE)
4029 		prot |= DMA_PTE_WRITE;
4030 	if (dmar_domain->set_pte_snp)
4031 		prot |= DMA_PTE_SNP;
4032 
4033 	max_addr = iova + size;
4034 	if (dmar_domain->max_addr < max_addr) {
4035 		u64 end;
4036 
4037 		/* check if minimum agaw is sufficient for mapped address */
4038 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4039 		if (end < max_addr) {
4040 			pr_err("%s: iommu width (%d) is not "
4041 			       "sufficient for the mapped address (%llx)\n",
4042 			       __func__, dmar_domain->gaw, max_addr);
4043 			return -EFAULT;
4044 		}
4045 		dmar_domain->max_addr = max_addr;
4046 	}
4047 	/* Round up size to next multiple of PAGE_SIZE, if it and
4048 	   the low bits of hpa would take us onto the next page */
4049 	size = aligned_nrpages(hpa, size);
4050 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4051 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4052 }
4053 
4054 static int intel_iommu_map_pages(struct iommu_domain *domain,
4055 				 unsigned long iova, phys_addr_t paddr,
4056 				 size_t pgsize, size_t pgcount,
4057 				 int prot, gfp_t gfp, size_t *mapped)
4058 {
4059 	unsigned long pgshift = __ffs(pgsize);
4060 	size_t size = pgcount << pgshift;
4061 	int ret;
4062 
4063 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4064 		return -EINVAL;
4065 
4066 	if (!IS_ALIGNED(iova | paddr, pgsize))
4067 		return -EINVAL;
4068 
4069 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4070 	if (!ret && mapped)
4071 		*mapped = size;
4072 
4073 	return ret;
4074 }
4075 
4076 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4077 				unsigned long iova, size_t size,
4078 				struct iommu_iotlb_gather *gather)
4079 {
4080 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4081 	unsigned long start_pfn, last_pfn;
4082 	int level = 0;
4083 
4084 	/* Cope with horrid API which requires us to unmap more than the
4085 	   size argument if it happens to be a large-page mapping. */
4086 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4087 				     &level, GFP_ATOMIC)))
4088 		return 0;
4089 
4090 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4091 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4092 
4093 	start_pfn = iova >> VTD_PAGE_SHIFT;
4094 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4095 
4096 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4097 
4098 	if (dmar_domain->max_addr == iova + size)
4099 		dmar_domain->max_addr = iova;
4100 
4101 	/*
4102 	 * We do not use page-selective IOTLB invalidation in flush queue,
4103 	 * so there is no need to track page and sync iotlb.
4104 	 */
4105 	if (!iommu_iotlb_gather_queued(gather))
4106 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4107 
4108 	return size;
4109 }
4110 
4111 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4112 				      unsigned long iova,
4113 				      size_t pgsize, size_t pgcount,
4114 				      struct iommu_iotlb_gather *gather)
4115 {
4116 	unsigned long pgshift = __ffs(pgsize);
4117 	size_t size = pgcount << pgshift;
4118 
4119 	return intel_iommu_unmap(domain, iova, size, gather);
4120 }
4121 
4122 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4123 				 struct iommu_iotlb_gather *gather)
4124 {
4125 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4126 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4127 	size_t size = gather->end - gather->start;
4128 	struct iommu_domain_info *info;
4129 	unsigned long start_pfn;
4130 	unsigned long nrpages;
4131 	unsigned long i;
4132 
4133 	nrpages = aligned_nrpages(gather->start, size);
4134 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4135 
4136 	xa_for_each(&dmar_domain->iommu_array, i, info)
4137 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4138 				      start_pfn, nrpages,
4139 				      list_empty(&gather->freelist), 0);
4140 
4141 	if (dmar_domain->nested_parent)
4142 		parent_domain_flush(dmar_domain, start_pfn, nrpages,
4143 				    list_empty(&gather->freelist));
4144 	put_pages_list(&gather->freelist);
4145 }
4146 
4147 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4148 					    dma_addr_t iova)
4149 {
4150 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4151 	struct dma_pte *pte;
4152 	int level = 0;
4153 	u64 phys = 0;
4154 
4155 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4156 			     GFP_ATOMIC);
4157 	if (pte && dma_pte_present(pte))
4158 		phys = dma_pte_addr(pte) +
4159 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4160 						VTD_PAGE_SHIFT) - 1));
4161 
4162 	return phys;
4163 }
4164 
4165 static bool domain_support_force_snooping(struct dmar_domain *domain)
4166 {
4167 	struct device_domain_info *info;
4168 	bool support = true;
4169 
4170 	assert_spin_locked(&domain->lock);
4171 	list_for_each_entry(info, &domain->devices, link) {
4172 		if (!ecap_sc_support(info->iommu->ecap)) {
4173 			support = false;
4174 			break;
4175 		}
4176 	}
4177 
4178 	return support;
4179 }
4180 
4181 static void domain_set_force_snooping(struct dmar_domain *domain)
4182 {
4183 	struct device_domain_info *info;
4184 
4185 	assert_spin_locked(&domain->lock);
4186 	/*
4187 	 * Second level page table supports per-PTE snoop control. The
4188 	 * iommu_map() interface will handle this by setting SNP bit.
4189 	 */
4190 	if (!domain->use_first_level) {
4191 		domain->set_pte_snp = true;
4192 		return;
4193 	}
4194 
4195 	list_for_each_entry(info, &domain->devices, link)
4196 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4197 						     IOMMU_NO_PASID);
4198 }
4199 
4200 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4201 {
4202 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4203 	unsigned long flags;
4204 
4205 	if (dmar_domain->force_snooping)
4206 		return true;
4207 
4208 	spin_lock_irqsave(&dmar_domain->lock, flags);
4209 	if (!domain_support_force_snooping(dmar_domain) ||
4210 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4211 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4212 		return false;
4213 	}
4214 
4215 	domain_set_force_snooping(dmar_domain);
4216 	dmar_domain->force_snooping = true;
4217 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4218 
4219 	return true;
4220 }
4221 
4222 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4223 {
4224 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4225 
4226 	switch (cap) {
4227 	case IOMMU_CAP_CACHE_COHERENCY:
4228 	case IOMMU_CAP_DEFERRED_FLUSH:
4229 		return true;
4230 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4231 		return dmar_platform_optin();
4232 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4233 		return ecap_sc_support(info->iommu->ecap);
4234 	case IOMMU_CAP_DIRTY_TRACKING:
4235 		return ssads_supported(info->iommu);
4236 	default:
4237 		return false;
4238 	}
4239 }
4240 
4241 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4242 {
4243 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4244 	struct device_domain_info *info;
4245 	struct intel_iommu *iommu;
4246 	u8 bus, devfn;
4247 	int ret;
4248 
4249 	iommu = device_lookup_iommu(dev, &bus, &devfn);
4250 	if (!iommu || !iommu->iommu.ops)
4251 		return ERR_PTR(-ENODEV);
4252 
4253 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4254 	if (!info)
4255 		return ERR_PTR(-ENOMEM);
4256 
4257 	if (dev_is_real_dma_subdevice(dev)) {
4258 		info->bus = pdev->bus->number;
4259 		info->devfn = pdev->devfn;
4260 		info->segment = pci_domain_nr(pdev->bus);
4261 	} else {
4262 		info->bus = bus;
4263 		info->devfn = devfn;
4264 		info->segment = iommu->segment;
4265 	}
4266 
4267 	info->dev = dev;
4268 	info->iommu = iommu;
4269 	if (dev_is_pci(dev)) {
4270 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4271 		    pci_ats_supported(pdev) &&
4272 		    dmar_ats_supported(pdev, iommu)) {
4273 			info->ats_supported = 1;
4274 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4275 
4276 			/*
4277 			 * For IOMMU that supports device IOTLB throttling
4278 			 * (DIT), we assign PFSID to the invalidation desc
4279 			 * of a VF such that IOMMU HW can gauge queue depth
4280 			 * at PF level. If DIT is not set, PFSID will be
4281 			 * treated as reserved, which should be set to 0.
4282 			 */
4283 			if (ecap_dit(iommu->ecap))
4284 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4285 			info->ats_qdep = pci_ats_queue_depth(pdev);
4286 		}
4287 		if (sm_supported(iommu)) {
4288 			if (pasid_supported(iommu)) {
4289 				int features = pci_pasid_features(pdev);
4290 
4291 				if (features >= 0)
4292 					info->pasid_supported = features | 1;
4293 			}
4294 
4295 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4296 			    pci_pri_supported(pdev))
4297 				info->pri_supported = 1;
4298 		}
4299 	}
4300 
4301 	dev_iommu_priv_set(dev, info);
4302 	ret = device_rbtree_insert(iommu, info);
4303 	if (ret)
4304 		goto free;
4305 
4306 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4307 		ret = intel_pasid_alloc_table(dev);
4308 		if (ret) {
4309 			dev_err(dev, "PASID table allocation failed\n");
4310 			goto clear_rbtree;
4311 		}
4312 
4313 		if (!context_copied(iommu, info->bus, info->devfn)) {
4314 			ret = intel_pasid_setup_sm_context(dev);
4315 			if (ret)
4316 				goto free_table;
4317 		}
4318 	}
4319 
4320 	intel_iommu_debugfs_create_dev(info);
4321 
4322 	return &iommu->iommu;
4323 free_table:
4324 	intel_pasid_free_table(dev);
4325 clear_rbtree:
4326 	device_rbtree_remove(info);
4327 free:
4328 	kfree(info);
4329 
4330 	return ERR_PTR(ret);
4331 }
4332 
4333 static void intel_iommu_release_device(struct device *dev)
4334 {
4335 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4336 	struct intel_iommu *iommu = info->iommu;
4337 
4338 	mutex_lock(&iommu->iopf_lock);
4339 	device_rbtree_remove(info);
4340 	mutex_unlock(&iommu->iopf_lock);
4341 
4342 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4343 	    !context_copied(iommu, info->bus, info->devfn))
4344 		intel_pasid_teardown_sm_context(dev);
4345 
4346 	intel_pasid_free_table(dev);
4347 	intel_iommu_debugfs_remove_dev(info);
4348 	kfree(info);
4349 	set_dma_ops(dev, NULL);
4350 }
4351 
/*
 * Final probe step for @dev: clear any DMA ops already set on the device,
 * then let iommu_setup_dma_ops() install the IOMMU DMA ops, passing 0 and
 * U64_MAX as the range arguments.
 */
static void intel_iommu_probe_finalize(struct device *dev)
{
	set_dma_ops(dev, NULL);
	iommu_setup_dma_ops(dev, 0, U64_MAX);
}
4357 
4358 static void intel_iommu_get_resv_regions(struct device *device,
4359 					 struct list_head *head)
4360 {
4361 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4362 	struct iommu_resv_region *reg;
4363 	struct dmar_rmrr_unit *rmrr;
4364 	struct device *i_dev;
4365 	int i;
4366 
4367 	rcu_read_lock();
4368 	for_each_rmrr_units(rmrr) {
4369 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4370 					  i, i_dev) {
4371 			struct iommu_resv_region *resv;
4372 			enum iommu_resv_type type;
4373 			size_t length;
4374 
4375 			if (i_dev != device &&
4376 			    !is_downstream_to_pci_bridge(device, i_dev))
4377 				continue;
4378 
4379 			length = rmrr->end_address - rmrr->base_address + 1;
4380 
4381 			type = device_rmrr_is_relaxable(device) ?
4382 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4383 
4384 			resv = iommu_alloc_resv_region(rmrr->base_address,
4385 						       length, prot, type,
4386 						       GFP_ATOMIC);
4387 			if (!resv)
4388 				break;
4389 
4390 			list_add_tail(&resv->list, head);
4391 		}
4392 	}
4393 	rcu_read_unlock();
4394 
4395 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4396 	if (dev_is_pci(device)) {
4397 		struct pci_dev *pdev = to_pci_dev(device);
4398 
4399 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4400 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4401 					IOMMU_RESV_DIRECT_RELAXABLE,
4402 					GFP_KERNEL);
4403 			if (reg)
4404 				list_add_tail(&reg->list, head);
4405 		}
4406 	}
4407 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4408 
4409 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4410 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4411 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4412 	if (!reg)
4413 		return;
4414 	list_add_tail(&reg->list, head);
4415 }
4416 
/*
 * Pick the IOMMU group for @dev: PCI devices go through
 * pci_device_group(), everything else through generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
4423 
4424 static int intel_iommu_enable_sva(struct device *dev)
4425 {
4426 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4427 	struct intel_iommu *iommu;
4428 
4429 	if (!info || dmar_disabled)
4430 		return -EINVAL;
4431 
4432 	iommu = info->iommu;
4433 	if (!iommu)
4434 		return -EINVAL;
4435 
4436 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4437 		return -ENODEV;
4438 
4439 	if (!info->pasid_enabled || !info->ats_enabled)
4440 		return -EINVAL;
4441 
4442 	/*
4443 	 * Devices having device-specific I/O fault handling should not
4444 	 * support PCI/PRI. The IOMMU side has no means to check the
4445 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
4446 	 * default that if the device driver enables SVA on a non-PRI
4447 	 * device, it will handle IOPF in its own way.
4448 	 */
4449 	if (!info->pri_supported)
4450 		return 0;
4451 
4452 	/* Devices supporting PRI should have it enabled. */
4453 	if (!info->pri_enabled)
4454 		return -EINVAL;
4455 
4456 	return 0;
4457 }
4458 
4459 static int intel_iommu_enable_iopf(struct device *dev)
4460 {
4461 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4462 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4463 	struct intel_iommu *iommu;
4464 	int ret;
4465 
4466 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4467 		return -ENODEV;
4468 
4469 	if (info->pri_enabled)
4470 		return -EBUSY;
4471 
4472 	iommu = info->iommu;
4473 	if (!iommu)
4474 		return -EINVAL;
4475 
4476 	/* PASID is required in PRG Response Message. */
4477 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4478 		return -EINVAL;
4479 
4480 	ret = pci_reset_pri(pdev);
4481 	if (ret)
4482 		return ret;
4483 
4484 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4485 	if (ret)
4486 		return ret;
4487 
4488 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4489 	if (ret) {
4490 		iopf_queue_remove_device(iommu->iopf_queue, dev);
4491 		return ret;
4492 	}
4493 
4494 	info->pri_enabled = 1;
4495 
4496 	return 0;
4497 }
4498 
4499 static int intel_iommu_disable_iopf(struct device *dev)
4500 {
4501 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4502 	struct intel_iommu *iommu = info->iommu;
4503 
4504 	if (!info->pri_enabled)
4505 		return -EINVAL;
4506 
4507 	/*
4508 	 * PCIe spec states that by clearing PRI enable bit, the Page
4509 	 * Request Interface will not issue new page requests, but has
4510 	 * outstanding page requests that have been transmitted or are
4511 	 * queued for transmission. This is supposed to be called after
4512 	 * the device driver has stopped DMA, all PASIDs have been
4513 	 * unbound and the outstanding PRQs have been drained.
4514 	 */
4515 	pci_disable_pri(to_pci_dev(dev));
4516 	info->pri_enabled = 0;
4517 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4518 
4519 	return 0;
4520 }
4521 
4522 static int
4523 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4524 {
4525 	switch (feat) {
4526 	case IOMMU_DEV_FEAT_IOPF:
4527 		return intel_iommu_enable_iopf(dev);
4528 
4529 	case IOMMU_DEV_FEAT_SVA:
4530 		return intel_iommu_enable_sva(dev);
4531 
4532 	default:
4533 		return -ENODEV;
4534 	}
4535 }
4536 
4537 static int
4538 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4539 {
4540 	switch (feat) {
4541 	case IOMMU_DEV_FEAT_IOPF:
4542 		return intel_iommu_disable_iopf(dev);
4543 
4544 	case IOMMU_DEV_FEAT_SVA:
4545 		return 0;
4546 
4547 	default:
4548 		return -ENODEV;
4549 	}
4550 }
4551 
4552 static bool intel_iommu_is_attach_deferred(struct device *dev)
4553 {
4554 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4555 
4556 	return translation_pre_enabled(info->iommu) && !info->domain;
4557 }
4558 
4559 /*
4560  * Check that the device does not live on an external facing PCI port that is
4561  * marked as untrusted. Such devices should not be able to apply quirks and
4562  * thus not be able to bypass the IOMMU restrictions.
4563  */
4564 static bool risky_device(struct pci_dev *pdev)
4565 {
4566 	if (pdev->untrusted) {
4567 		pci_info(pdev,
4568 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4569 			 pdev->vendor, pdev->device);
4570 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4571 		return true;
4572 	}
4573 	return false;
4574 }
4575 
4576 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4577 				      unsigned long iova, size_t size)
4578 {
4579 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4580 	unsigned long pages = aligned_nrpages(iova, size);
4581 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4582 	struct iommu_domain_info *info;
4583 	unsigned long i;
4584 
4585 	xa_for_each(&dmar_domain->iommu_array, i, info)
4586 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4587 	return 0;
4588 }
4589 
4590 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4591 {
4592 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4593 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4594 	struct intel_iommu *iommu = info->iommu;
4595 	struct dmar_domain *dmar_domain;
4596 	struct iommu_domain *domain;
4597 	unsigned long flags;
4598 
4599 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4600 	if (WARN_ON_ONCE(!domain))
4601 		goto out_tear_down;
4602 
4603 	/*
4604 	 * The SVA implementation needs to handle its own stuffs like the mm
4605 	 * notification. Before consolidating that code into iommu core, let
4606 	 * the intel sva code handle it.
4607 	 */
4608 	if (domain->type == IOMMU_DOMAIN_SVA) {
4609 		intel_svm_remove_dev_pasid(dev, pasid);
4610 		goto out_tear_down;
4611 	}
4612 
4613 	dmar_domain = to_dmar_domain(domain);
4614 	spin_lock_irqsave(&dmar_domain->lock, flags);
4615 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4616 		if (curr->dev == dev && curr->pasid == pasid) {
4617 			list_del(&curr->link_domain);
4618 			dev_pasid = curr;
4619 			break;
4620 		}
4621 	}
4622 	WARN_ON_ONCE(!dev_pasid);
4623 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4624 
4625 	domain_detach_iommu(dmar_domain, iommu);
4626 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4627 	kfree(dev_pasid);
4628 out_tear_down:
4629 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4630 	intel_drain_pasid_prq(dev, pasid);
4631 }
4632 
4633 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4634 				     struct device *dev, ioasid_t pasid)
4635 {
4636 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4637 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4638 	struct intel_iommu *iommu = info->iommu;
4639 	struct dev_pasid_info *dev_pasid;
4640 	unsigned long flags;
4641 	int ret;
4642 
4643 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4644 		return -EOPNOTSUPP;
4645 
4646 	if (domain->dirty_ops)
4647 		return -EINVAL;
4648 
4649 	if (context_copied(iommu, info->bus, info->devfn))
4650 		return -EBUSY;
4651 
4652 	ret = prepare_domain_attach_device(domain, dev);
4653 	if (ret)
4654 		return ret;
4655 
4656 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4657 	if (!dev_pasid)
4658 		return -ENOMEM;
4659 
4660 	ret = domain_attach_iommu(dmar_domain, iommu);
4661 	if (ret)
4662 		goto out_free;
4663 
4664 	if (domain_type_is_si(dmar_domain))
4665 		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4666 	else if (dmar_domain->use_first_level)
4667 		ret = domain_setup_first_level(iommu, dmar_domain,
4668 					       dev, pasid);
4669 	else
4670 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4671 						     dev, pasid);
4672 	if (ret)
4673 		goto out_detach_iommu;
4674 
4675 	dev_pasid->dev = dev;
4676 	dev_pasid->pasid = pasid;
4677 	spin_lock_irqsave(&dmar_domain->lock, flags);
4678 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4679 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4680 
4681 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4682 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4683 
4684 	return 0;
4685 out_detach_iommu:
4686 	domain_detach_iommu(dmar_domain, iommu);
4687 out_free:
4688 	kfree(dev_pasid);
4689 	return ret;
4690 }
4691 
4692 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4693 {
4694 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4695 	struct intel_iommu *iommu = info->iommu;
4696 	struct iommu_hw_info_vtd *vtd;
4697 
4698 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4699 	if (!vtd)
4700 		return ERR_PTR(-ENOMEM);
4701 
4702 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4703 	vtd->cap_reg = iommu->cap;
4704 	vtd->ecap_reg = iommu->ecap;
4705 	*length = sizeof(*vtd);
4706 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4707 	return vtd;
4708 }
4709 
4710 /*
4711  * Set dirty tracking for the device list of a domain. The caller must
4712  * hold the domain->lock when calling it.
4713  */
4714 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4715 {
4716 	struct device_domain_info *info;
4717 	int ret = 0;
4718 
4719 	list_for_each_entry(info, devices, link) {
4720 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4721 						       IOMMU_NO_PASID, enable);
4722 		if (ret)
4723 			break;
4724 	}
4725 
4726 	return ret;
4727 }
4728 
4729 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4730 					    bool enable)
4731 {
4732 	struct dmar_domain *s1_domain;
4733 	unsigned long flags;
4734 	int ret;
4735 
4736 	spin_lock(&domain->s1_lock);
4737 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4738 		spin_lock_irqsave(&s1_domain->lock, flags);
4739 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4740 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4741 		if (ret)
4742 			goto err_unwind;
4743 	}
4744 	spin_unlock(&domain->s1_lock);
4745 	return 0;
4746 
4747 err_unwind:
4748 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4749 		spin_lock_irqsave(&s1_domain->lock, flags);
4750 		device_set_dirty_tracking(&s1_domain->devices,
4751 					  domain->dirty_tracking);
4752 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4753 	}
4754 	spin_unlock(&domain->s1_lock);
4755 	return ret;
4756 }
4757 
4758 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4759 					  bool enable)
4760 {
4761 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4762 	int ret;
4763 
4764 	spin_lock(&dmar_domain->lock);
4765 	if (dmar_domain->dirty_tracking == enable)
4766 		goto out_unlock;
4767 
4768 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4769 	if (ret)
4770 		goto err_unwind;
4771 
4772 	if (dmar_domain->nested_parent) {
4773 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4774 		if (ret)
4775 			goto err_unwind;
4776 	}
4777 
4778 	dmar_domain->dirty_tracking = enable;
4779 out_unlock:
4780 	spin_unlock(&dmar_domain->lock);
4781 
4782 	return 0;
4783 
4784 err_unwind:
4785 	device_set_dirty_tracking(&dmar_domain->devices,
4786 				  dmar_domain->dirty_tracking);
4787 	spin_unlock(&dmar_domain->lock);
4788 	return ret;
4789 }
4790 
4791 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4792 					    unsigned long iova, size_t size,
4793 					    unsigned long flags,
4794 					    struct iommu_dirty_bitmap *dirty)
4795 {
4796 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4797 	unsigned long end = iova + size - 1;
4798 	unsigned long pgsize;
4799 
4800 	/*
4801 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4802 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4803 	 * have occurred when we stopped dirty tracking. This ensures that we
4804 	 * never inherit dirtied bits from a previous cycle.
4805 	 */
4806 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4807 		return -EINVAL;
4808 
4809 	do {
4810 		struct dma_pte *pte;
4811 		int lvl = 0;
4812 
4813 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4814 				     GFP_ATOMIC);
4815 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4816 		if (!pte || !dma_pte_present(pte)) {
4817 			iova += pgsize;
4818 			continue;
4819 		}
4820 
4821 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4822 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4823 		iova += pgsize;
4824 	} while (iova < end);
4825 
4826 	return 0;
4827 }
4828 
/*
 * Dirty tracking callbacks of this driver: one to switch tracking on or
 * off for a domain, one to read and clear the dirty state of an IOVA
 * range.
 */
static const struct iommu_dirty_ops intel_dirty_ops = {
	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
};
4833 
/*
 * IOMMU operations table of the Intel VT-d driver.
 *
 * The top-level members are the device- and driver-wide callbacks; the
 * nested default_domain_ops holds the per-domain callbacks. The
 * page_response callback is only present when CONFIG_INTEL_IOMMU_SVM is
 * set.
 */
const struct iommu_ops intel_iommu_ops = {
	/* The same blocking domain serves as blocked and release domain. */
	.blocked_domain		= &blocking_domain,
	.release_domain		= &blocking_domain,
	.capable		= intel_iommu_capable,
	.hw_info		= intel_iommu_hw_info,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_alloc_user	= intel_iommu_domain_alloc_user,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.device_group		= intel_iommu_device_group,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
	.pgsize_bitmap		= SZ_4K,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.page_response		= intel_svm_page_response,
#endif
	/* Per-domain callbacks. */
	.default_domain_ops = &(const struct iommu_domain_ops) {
		.attach_dev		= intel_iommu_attach_device,
		.set_dev_pasid		= intel_iommu_set_dev_pasid,
		.map_pages		= intel_iommu_map_pages,
		.unmap_pages		= intel_iommu_unmap_pages,
		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
		.flush_iotlb_all        = intel_flush_iotlb_all,
		.iotlb_sync		= intel_iommu_tlb_sync,
		.iova_to_phys		= intel_iommu_iova_to_phys,
		.free			= intel_iommu_domain_free,
		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
	}
};
4868 
4869 static void quirk_iommu_igfx(struct pci_dev *dev)
4870 {
4871 	if (risky_device(dev))
4872 		return;
4873 
4874 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4875 	dmar_map_gfx = 0;
4876 }
4877 
/*
 * Header fixup registrations for quirk_iommu_igfx(), one per affected
 * Intel device ID.
 */

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4912 
4913 static void quirk_iommu_rwbf(struct pci_dev *dev)
4914 {
4915 	if (risky_device(dev))
4916 		return;
4917 
4918 	/*
4919 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4920 	 * but needs it. Same seems to hold for the desktop versions.
4921 	 */
4922 	pci_info(dev, "Forcing write-buffer flush capability\n");
4923 	rwbf_quirk = 1;
4924 }
4925 
/*
 * Header fixup registrations for quirk_iommu_rwbf(), one per affected
 * Intel device ID.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4933 
/*
 * GGC is the PCI config space offset of the 16-bit word read by
 * quirk_calpella_no_shadow_gtt(). The remaining macros describe bits 11:8
 * of that word: GGC_MEMORY_SIZE_MASK selects the field, the others name
 * individual values or bits within it.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
/* The only bit tested in this chunk; set in all of the *_VT values below. */
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4943 
4944 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4945 {
4946 	unsigned short ggc;
4947 
4948 	if (risky_device(dev))
4949 		return;
4950 
4951 	if (pci_read_config_word(dev, GGC, &ggc))
4952 		return;
4953 
4954 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4955 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4956 		dmar_map_gfx = 0;
4957 	} else if (dmar_map_gfx) {
4958 		/* we have to ensure the gfx device is idle before we flush */
4959 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4960 		iommu_set_dma_strict();
4961 	}
4962 }
/*
 * Header fixup registrations for quirk_calpella_no_shadow_gtt(), one per
 * affected Intel device ID.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4967 
4968 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4969 {
4970 	unsigned short ver;
4971 
4972 	if (!IS_GFX_DEVICE(dev))
4973 		return;
4974 
4975 	ver = (dev->device >> 8) & 0xff;
4976 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4977 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4978 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4979 		return;
4980 
4981 	if (risky_device(dev))
4982 		return;
4983 
4984 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4985 	iommu_skip_te_disable = 1;
4986 }
/*
 * Registered for any Intel device ID; quirk_igfx_skip_te_disable() itself
 * filters by device class and device ID.
 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4988 
4989 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4990    ISOCH DMAR unit for the Azalia sound device, but not give it any
4991    TLB entries, which causes it to deadlock. Check for that.  We do
4992    this in a function called from init_dmars(), instead of in a PCI
4993    quirk, because we don't want to print the obnoxious "BIOS broken"
4994    message if VT-d is actually disabled.
4995 */
4996 static void __init check_tylersburg_isoch(void)
4997 {
4998 	struct pci_dev *pdev;
4999 	uint32_t vtisochctrl;
5000 
5001 	/* If there's no Azalia in the system anyway, forget it. */
5002 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5003 	if (!pdev)
5004 		return;
5005 
5006 	if (risky_device(pdev)) {
5007 		pci_dev_put(pdev);
5008 		return;
5009 	}
5010 
5011 	pci_dev_put(pdev);
5012 
5013 	/* System Management Registers. Might be hidden, in which case
5014 	   we can't do the sanity check. But that's OK, because the
5015 	   known-broken BIOSes _don't_ actually hide it, so far. */
5016 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5017 	if (!pdev)
5018 		return;
5019 
5020 	if (risky_device(pdev)) {
5021 		pci_dev_put(pdev);
5022 		return;
5023 	}
5024 
5025 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5026 		pci_dev_put(pdev);
5027 		return;
5028 	}
5029 
5030 	pci_dev_put(pdev);
5031 
5032 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5033 	if (vtisochctrl & 1)
5034 		return;
5035 
5036 	/* Drop all bits other than the number of TLB entries */
5037 	vtisochctrl &= 0x1c;
5038 
5039 	/* If we have the recommended number of TLB entries (16), fine. */
5040 	if (vtisochctrl == 0x10)
5041 		return;
5042 
5043 	/* Zero TLB entries? You get to ride the short bus to school. */
5044 	if (!vtisochctrl) {
5045 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5046 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5047 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5048 		     dmi_get_system_info(DMI_BIOS_VERSION),
5049 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5050 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5051 		return;
5052 	}
5053 
5054 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5055 	       vtisochctrl);
5056 }
5057 
5058 /*
5059  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5060  * invalidation completion before posted writes initiated with translated address
5061  * that utilized translations matching the invalidation address range, violating
5062  * the invalidation completion ordering.
5063  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5064  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5065  * under the control of the trusted/privileged host device driver must use this
5066  * quirk.
5067  * Device TLBs are invalidated under the following six conditions:
5068  * 1. Device driver does DMA API unmap IOVA
5069  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5070  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5071  *    exit_mmap() due to crash
5072  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5073  *    VM has to free pages that were unmapped
5074  * 5. Userspace driver unmaps a DMA buffer
5075  * 6. Cache invalidation in vSVA usage (upcoming)
5076  *
5077  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5078  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5079  * invalidate TLB the same way as normal user unmap which will use this quirk.
5080  * The dTLB invalidation after PASID cache flush does not need this quirk.
5081  *
5082  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5083  */
5084 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5085 			       unsigned long address, unsigned long mask,
5086 			       u32 pasid, u16 qdep)
5087 {
5088 	u16 sid;
5089 
5090 	if (likely(!info->dtlb_extra_inval))
5091 		return;
5092 
5093 	sid = PCI_DEVID(info->bus, info->devfn);
5094 	if (pasid == IOMMU_NO_PASID) {
5095 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5096 				   qdep, address, mask);
5097 	} else {
5098 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5099 					 pasid, qdep, address, mask);
5100 	}
5101 }
5102 
/* Extract bits 7:1 of an enhanced command response value. */
#define ecmd_get_status_code(res)	(((res) >> 1) & 0x7f)
5104 
5105 /*
5106  * Function to submit a command to the enhanced command interface. The
5107  * valid enhanced command descriptions are defined in Table 47 of the
5108  * VT-d spec. The VT-d hardware implementation may support some but not
5109  * all commands, which can be determined by checking the Enhanced
5110  * Command Capability Register.
5111  *
5112  * Return values:
5113  *  - 0: Command successful without any error;
5114  *  - Negative: software error value;
5115  *  - Nonzero positive: failure status code defined in Table 48.
5116  */
5117 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5118 {
5119 	unsigned long flags;
5120 	u64 res;
5121 	int ret;
5122 
5123 	if (!cap_ecmds(iommu->cap))
5124 		return -ENODEV;
5125 
5126 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5127 
5128 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5129 	if (res & DMA_ECMD_ECRSP_IP) {
5130 		ret = -EBUSY;
5131 		goto err;
5132 	}
5133 
5134 	/*
5135 	 * Unconditionally write the operand B, because
5136 	 * - There is no side effect if an ecmd doesn't require an
5137 	 *   operand B, but we set the register to some value.
5138 	 * - It's not invoked in any critical path. The extra MMIO
5139 	 *   write doesn't bring any performance concerns.
5140 	 */
5141 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5142 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5143 
5144 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5145 		      !(res & DMA_ECMD_ECRSP_IP), res);
5146 
5147 	if (res & DMA_ECMD_ECRSP_IP) {
5148 		ret = -ETIMEDOUT;
5149 		goto err;
5150 	}
5151 
5152 	ret = ecmd_get_status_code(res);
5153 err:
5154 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5155 
5156 	return ret;
5157 }
5158