xref: /linux/drivers/iommu/intel/iommu.c (revision 173b0b5b0e865348684c02bd9cb1d22b5d46e458)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "pasid.h"
31 #include "cap_audit.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
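/* 57 bits corresponds to 5-level paging; 4-level paging would cover 48 bits. */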
47 
48 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50 
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
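/*
 * Worked example (illustrative): with gaw = 48 and VTD_PAGE_SHIFT = 12,
 * __DOMAIN_MAX_PFN(48) = 2^36 - 1 and __DOMAIN_MAX_ADDR(48) = 2^48 - 1,
 * i.e. a 256 TiB input-address space addressed in 4 KiB pages.
 */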
56 
57 /* IO virtual address start page frame number */
58 #define IOVA_START_PFN		(1)
59 
60 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
61 
62 static void __init check_tylersburg_isoch(void);
63 static int rwbf_quirk;
64 
65 /*
66  * set to 1 to panic kernel if can't successfully enable VT-d
67  * (used when kernel is launched w/ TXT)
68  */
69 static int force_on = 0;
70 static int intel_iommu_tboot_noforce;
71 static int no_platform_optin;
72 
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
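/*
 * With a 4 KiB root table and 16-byte root entries this works out to 256
 * entries, one per PCI bus number.
 */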
74 
75 /*
76  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
77  * if marked present.
78  */
79 static phys_addr_t root_entry_lctp(struct root_entry *re)
80 {
81 	if (!(re->lo & 1))
82 		return 0;
83 
84 	return re->lo & VTD_PAGE_MASK;
85 }
86 
87 /*
88  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
89  * if marked present.
90  */
91 static phys_addr_t root_entry_uctp(struct root_entry *re)
92 {
93 	if (!(re->hi & 1))
94 		return 0;
95 
96 	return re->hi & VTD_PAGE_MASK;
97 }
98 
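/*
 * Devices probed on an iommu are tracked in an rbtree keyed by PCI
 * requester ID, i.e. PCI_DEVID(bus, devfn) = (bus << 8) | devfn; for
 * example bus 0x3a, devfn 0x10 yields RID 0x3a10.
 */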
99 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
100 {
101 	struct device_domain_info *info =
102 		rb_entry(node, struct device_domain_info, node);
103 	const u16 *rid_lhs = key;
104 
105 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
106 		return -1;
107 
108 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
109 		return 1;
110 
111 	return 0;
112 }
113 
114 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
115 {
116 	struct device_domain_info *info =
117 		rb_entry(lhs, struct device_domain_info, node);
118 	u16 key = PCI_DEVID(info->bus, info->devfn);
119 
120 	return device_rid_cmp_key(&key, rhs);
121 }
122 
123 /*
124  * Looks up an IOMMU-probed device using its source ID.
125  *
126  * Returns the pointer to the device if there is a match. Otherwise,
127  * returns NULL.
128  *
129  * Note that this helper doesn't guarantee that the device won't be
130  * released by the iommu subsystem after being returned. The caller
131  * should use its own synchronization mechanism to avoid the device
132  * being released during its use if that is possibly the case.
133  */
134 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
135 {
136 	struct device_domain_info *info = NULL;
137 	struct rb_node *node;
138 	unsigned long flags;
139 
140 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
141 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
142 	if (node)
143 		info = rb_entry(node, struct device_domain_info, node);
144 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
145 
146 	return info ? info->dev : NULL;
147 }
148 
149 static int device_rbtree_insert(struct intel_iommu *iommu,
150 				struct device_domain_info *info)
151 {
152 	struct rb_node *curr;
153 	unsigned long flags;
154 
155 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
156 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
157 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
158 	if (WARN_ON(curr))
159 		return -EEXIST;
160 
161 	return 0;
162 }
163 
164 static void device_rbtree_remove(struct device_domain_info *info)
165 {
166 	struct intel_iommu *iommu = info->iommu;
167 	unsigned long flags;
168 
169 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
170 	rb_erase(&info->node, &iommu->device_rbtree);
171 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
172 }
173 
174 /*
175  * This domain is a static identity-mapping domain.
176  *	1. This domain creates a static 1:1 mapping of all usable memory.
177  *	2. It maps to each iommu if successful.
178  *	3. Each iommu maps to this domain if successful.
179  */
180 static struct dmar_domain *si_domain;
181 static int hw_pass_through = 1;
182 
183 struct dmar_rmrr_unit {
184 	struct list_head list;		/* list of rmrr units	*/
185 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
186 	u64	base_address;		/* reserved base address*/
187 	u64	end_address;		/* reserved end address */
188 	struct dmar_dev_scope *devices;	/* target devices */
189 	int	devices_cnt;		/* target device count */
190 };
191 
192 struct dmar_atsr_unit {
193 	struct list_head list;		/* list of ATSR units */
194 	struct acpi_dmar_header *hdr;	/* ACPI header */
195 	struct dmar_dev_scope *devices;	/* target devices */
196 	int devices_cnt;		/* target device count */
197 	u8 include_all:1;		/* include all ports */
198 };
199 
200 struct dmar_satc_unit {
201 	struct list_head list;		/* list of SATC units */
202 	struct acpi_dmar_header *hdr;	/* ACPI header */
203 	struct dmar_dev_scope *devices;	/* target devices */
204 	struct intel_iommu *iommu;	/* the corresponding iommu */
205 	int devices_cnt;		/* target device count */
206 	u8 atc_required:1;		/* ATS is required */
207 };
208 
209 static LIST_HEAD(dmar_atsr_units);
210 static LIST_HEAD(dmar_rmrr_units);
211 static LIST_HEAD(dmar_satc_units);
212 
213 #define for_each_rmrr_units(rmrr) \
214 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
215 
216 static void intel_iommu_domain_free(struct iommu_domain *domain);
217 
218 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
219 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
220 
221 int intel_iommu_enabled = 0;
222 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
223 
224 static int dmar_map_gfx = 1;
225 static int intel_iommu_superpage = 1;
226 static int iommu_identity_mapping;
227 static int iommu_skip_te_disable;
228 
229 #define IDENTMAP_GFX		2
230 #define IDENTMAP_AZALIA		4
231 
232 const struct iommu_ops intel_iommu_ops;
233 static const struct iommu_dirty_ops intel_dirty_ops;
234 
235 static bool translation_pre_enabled(struct intel_iommu *iommu)
236 {
237 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
238 }
239 
240 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
241 {
242 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
243 }
244 
245 static void init_translation_status(struct intel_iommu *iommu)
246 {
247 	u32 gsts;
248 
249 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
250 	if (gsts & DMA_GSTS_TES)
251 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
252 }
253 
254 static int __init intel_iommu_setup(char *str)
255 {
256 	if (!str)
257 		return -EINVAL;
258 
259 	while (*str) {
260 		if (!strncmp(str, "on", 2)) {
261 			dmar_disabled = 0;
262 			pr_info("IOMMU enabled\n");
263 		} else if (!strncmp(str, "off", 3)) {
264 			dmar_disabled = 1;
265 			no_platform_optin = 1;
266 			pr_info("IOMMU disabled\n");
267 		} else if (!strncmp(str, "igfx_off", 8)) {
268 			dmar_map_gfx = 0;
269 			pr_info("Disable GFX device mapping\n");
270 		} else if (!strncmp(str, "forcedac", 8)) {
271 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
272 			iommu_dma_forcedac = true;
273 		} else if (!strncmp(str, "strict", 6)) {
274 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
275 			iommu_set_dma_strict();
276 		} else if (!strncmp(str, "sp_off", 6)) {
277 			pr_info("Disable supported super page\n");
278 			intel_iommu_superpage = 0;
279 		} else if (!strncmp(str, "sm_on", 5)) {
280 			pr_info("Enable scalable mode if hardware supports\n");
281 			intel_iommu_sm = 1;
282 		} else if (!strncmp(str, "sm_off", 6)) {
283 			pr_info("Scalable mode is disallowed\n");
284 			intel_iommu_sm = 0;
285 		} else if (!strncmp(str, "tboot_noforce", 13)) {
286 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
287 			intel_iommu_tboot_noforce = 1;
288 		} else {
289 			pr_notice("Unknown option - '%s'\n", str);
290 		}
291 
292 		str += strcspn(str, ",");
293 		while (*str == ',')
294 			str++;
295 	}
296 
297 	return 1;
298 }
299 __setup("intel_iommu=", intel_iommu_setup);
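/*
 * The option string is comma-separated, so for example booting with
 * "intel_iommu=on,sm_on" both enables the IOMMU and requests scalable mode.
 */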
300 
301 void *alloc_pgtable_page(int node, gfp_t gfp)
302 {
303 	struct page *page;
304 	void *vaddr = NULL;
305 
306 	page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
307 	if (page)
308 		vaddr = page_address(page);
309 	return vaddr;
310 }
311 
312 void free_pgtable_page(void *vaddr)
313 {
314 	free_page((unsigned long)vaddr);
315 }
316 
317 static int domain_type_is_si(struct dmar_domain *domain)
318 {
319 	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
320 }
321 
322 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
323 {
324 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
325 
326 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
327 }
328 
329 /*
330  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
331  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
332  * the returned SAGAW.
333  */
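/*
 * In that encoding, bit 2 stands for 4-level (48-bit) and bit 3 for 5-level
 * (57-bit) paging, which is why first-level support below is expressed as
 * BIT(2) plus, when the hardware supports 5-level paging, BIT(3).
 */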
334 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
335 {
336 	unsigned long fl_sagaw, sl_sagaw;
337 
338 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
339 	sl_sagaw = cap_sagaw(iommu->cap);
340 
341 	/* Second level only. */
342 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
343 		return sl_sagaw;
344 
345 	/* First level only. */
346 	if (!ecap_slts(iommu->ecap))
347 		return fl_sagaw;
348 
349 	return fl_sagaw & sl_sagaw;
350 }
351 
352 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
353 {
354 	unsigned long sagaw;
355 	int agaw;
356 
357 	sagaw = __iommu_calculate_sagaw(iommu);
358 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
359 		if (test_bit(agaw, &sagaw))
360 			break;
361 	}
362 
363 	return agaw;
364 }
365 
366 /*
367  * Calculate max SAGAW for each iommu.
368  */
369 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
370 {
371 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
372 }
373 
374 /*
375  * Calculate the agaw for each iommu.
376  * "SAGAW" may differ across iommus, so use a default agaw and fall back
377  * to a smaller supported agaw for iommus that don't support the default.
378  */
379 int iommu_calculate_agaw(struct intel_iommu *iommu)
380 {
381 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
382 }
383 
384 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
385 {
386 	return sm_supported(iommu) ?
387 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
388 }
389 
390 static void domain_update_iommu_coherency(struct dmar_domain *domain)
391 {
392 	struct iommu_domain_info *info;
393 	struct dmar_drhd_unit *drhd;
394 	struct intel_iommu *iommu;
395 	bool found = false;
396 	unsigned long i;
397 
398 	domain->iommu_coherency = true;
399 	xa_for_each(&domain->iommu_array, i, info) {
400 		found = true;
401 		if (!iommu_paging_structure_coherency(info->iommu)) {
402 			domain->iommu_coherency = false;
403 			break;
404 		}
405 	}
406 	if (found)
407 		return;
408 
409 	/* No hardware attached; use lowest common denominator */
410 	rcu_read_lock();
411 	for_each_active_iommu(iommu, drhd) {
412 		if (!iommu_paging_structure_coherency(iommu)) {
413 			domain->iommu_coherency = false;
414 			break;
415 		}
416 	}
417 	rcu_read_unlock();
418 }
419 
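/*
 * The 2-bit mask mirrors the capability register's super-page field: bit 0
 * means 2MiB and bit 1 means 1GiB pages are supported, so fls(mask) yields
 * 0 (none), 1 (2MiB only) or 2 (2MiB and 1GiB), matching what
 * domain_super_pgsize_bitmap() expects.
 */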
420 static int domain_update_iommu_superpage(struct dmar_domain *domain,
421 					 struct intel_iommu *skip)
422 {
423 	struct dmar_drhd_unit *drhd;
424 	struct intel_iommu *iommu;
425 	int mask = 0x3;
426 
427 	if (!intel_iommu_superpage)
428 		return 0;
429 
430 	/* set iommu_superpage to the smallest common denominator */
431 	rcu_read_lock();
432 	for_each_active_iommu(iommu, drhd) {
433 		if (iommu != skip) {
434 			if (domain && domain->use_first_level) {
435 				if (!cap_fl1gp_support(iommu->cap))
436 					mask = 0x1;
437 			} else {
438 				mask &= cap_super_page_val(iommu->cap);
439 			}
440 
441 			if (!mask)
442 				break;
443 		}
444 	}
445 	rcu_read_unlock();
446 
447 	return fls(mask);
448 }
449 
450 static int domain_update_device_node(struct dmar_domain *domain)
451 {
452 	struct device_domain_info *info;
453 	int nid = NUMA_NO_NODE;
454 	unsigned long flags;
455 
456 	spin_lock_irqsave(&domain->lock, flags);
457 	list_for_each_entry(info, &domain->devices, link) {
458 		/*
459 		 * There could possibly be multiple device numa nodes as devices
460 		 * within the same domain may sit behind different IOMMUs. There
461 		 * isn't a perfect answer in such a situation, so we use a
462 		 * first-come, first-served policy.
463 		 */
464 		nid = dev_to_node(info->dev);
465 		if (nid != NUMA_NO_NODE)
466 			break;
467 	}
468 	spin_unlock_irqrestore(&domain->lock, flags);
469 
470 	return nid;
471 }
472 
473 /* Return the super pagesize bitmap if supported. */
474 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
475 {
476 	unsigned long bitmap = 0;
477 
478 	/*
479 	 * 1-level super page supports page size of 2MiB, 2-level super page
480 	 * supports page size of both 2MiB and 1GiB.
481 	 */
482 	if (domain->iommu_superpage == 1)
483 		bitmap |= SZ_2M;
484 	else if (domain->iommu_superpage == 2)
485 		bitmap |= SZ_2M | SZ_1G;
486 
487 	return bitmap;
488 }
489 
490 /* Some capabilities may be different across iommus */
491 void domain_update_iommu_cap(struct dmar_domain *domain)
492 {
493 	domain_update_iommu_coherency(domain);
494 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
495 
496 	/*
497 	 * If RHSA is missing, default to the device's NUMA node as a
498 	 * fallback.
499 	 */
500 	if (domain->nid == NUMA_NO_NODE)
501 		domain->nid = domain_update_device_node(domain);
502 
503 	/*
504 	 * First-level translation restricts the input-address to a
505 	 * canonical address (i.e., address bits 63:N have the same
506 	 * value as address bit [N-1], where N is 48-bits with 4-level
507 	 * paging and 57-bits with 5-level paging). Hence, skip bit
508 	 * [N-1].
509 	 */
510 	if (domain->use_first_level)
511 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
512 	else
513 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
514 
515 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
516 	domain_update_iotlb(domain);
517 }
518 
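/*
 * In scalable mode a root entry carries two context-table pointers: the
 * lower half covers devfn 0x00-0x7f and the upper half covers devfn
 * 0x80-0xff, and each scalable-mode context entry is twice the size of a
 * legacy one, hence the devfn folding and doubling below.
 */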
519 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
520 					 u8 devfn, int alloc)
521 {
522 	struct root_entry *root = &iommu->root_entry[bus];
523 	struct context_entry *context;
524 	u64 *entry;
525 
526 	/*
527 	 * Unless the caller explicitly requested allocation of a new entry,
528 	 * returning a copied context entry makes no sense.
529 	 */
530 	if (!alloc && context_copied(iommu, bus, devfn))
531 		return NULL;
532 
533 	entry = &root->lo;
534 	if (sm_supported(iommu)) {
535 		if (devfn >= 0x80) {
536 			devfn -= 0x80;
537 			entry = &root->hi;
538 		}
539 		devfn *= 2;
540 	}
541 	if (*entry & 1)
542 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
543 	else {
544 		unsigned long phy_addr;
545 		if (!alloc)
546 			return NULL;
547 
548 		context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
549 		if (!context)
550 			return NULL;
551 
552 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
553 		phy_addr = virt_to_phys((void *)context);
554 		*entry = phy_addr | 1;
555 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
556 	}
557 	return &context[devfn];
558 }
559 
560 /**
561  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
562  *				 sub-hierarchy of a candidate PCI-PCI bridge
563  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
564  * @bridge: the candidate PCI-PCI bridge
565  *
566  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
567  */
568 static bool
569 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
570 {
571 	struct pci_dev *pdev, *pbridge;
572 
573 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
574 		return false;
575 
576 	pdev = to_pci_dev(dev);
577 	pbridge = to_pci_dev(bridge);
578 
579 	if (pbridge->subordinate &&
580 	    pbridge->subordinate->number <= pdev->bus->number &&
581 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
582 		return true;
583 
584 	return false;
585 }
586 
587 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
588 {
589 	struct dmar_drhd_unit *drhd;
590 	u32 vtbar;
591 	int rc;
592 
593 	/* We know that this device on this chipset has its own IOMMU.
594 	 * If we find it under a different IOMMU, then the BIOS is lying
595 	 * to us. Hope that the IOMMU for this device is actually
596 	 * disabled, and it needs no translation...
597 	 */
598 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
599 	if (rc) {
600 		/* "can't" happen */
601 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
602 		return false;
603 	}
604 	vtbar &= 0xffff0000;
605 
606 	/* we know that this iommu should be at offset 0xa000 from vtbar */
607 	drhd = dmar_find_matched_drhd_unit(pdev);
608 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
609 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
610 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
611 		return true;
612 	}
613 
614 	return false;
615 }
616 
617 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
618 {
619 	if (!iommu || iommu->drhd->ignored)
620 		return true;
621 
622 	if (dev_is_pci(dev)) {
623 		struct pci_dev *pdev = to_pci_dev(dev);
624 
625 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
626 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
627 		    quirk_ioat_snb_local_iommu(pdev))
628 			return true;
629 	}
630 
631 	return false;
632 }
633 
634 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
635 {
636 	struct dmar_drhd_unit *drhd = NULL;
637 	struct pci_dev *pdev = NULL;
638 	struct intel_iommu *iommu;
639 	struct device *tmp;
640 	u16 segment = 0;
641 	int i;
642 
643 	if (!dev)
644 		return NULL;
645 
646 	if (dev_is_pci(dev)) {
647 		struct pci_dev *pf_pdev;
648 
649 		pdev = pci_real_dma_dev(to_pci_dev(dev));
650 
651 		/* VFs aren't listed in scope tables; we need to look up
652 		 * the PF instead to find the IOMMU. */
653 		pf_pdev = pci_physfn(pdev);
654 		dev = &pf_pdev->dev;
655 		segment = pci_domain_nr(pdev->bus);
656 	} else if (has_acpi_companion(dev))
657 		dev = &ACPI_COMPANION(dev)->dev;
658 
659 	rcu_read_lock();
660 	for_each_iommu(iommu, drhd) {
661 		if (pdev && segment != drhd->segment)
662 			continue;
663 
664 		for_each_active_dev_scope(drhd->devices,
665 					  drhd->devices_cnt, i, tmp) {
666 			if (tmp == dev) {
667 				/* For a VF use its original BDF# not that of the PF
668 				 * which we used for the IOMMU lookup. Strictly speaking
669 				 * we could do this for all PCI devices; we only need to
670 				 * get the BDF# from the scope table for ACPI matches. */
671 				if (pdev && pdev->is_virtfn)
672 					goto got_pdev;
673 
674 				if (bus && devfn) {
675 					*bus = drhd->devices[i].bus;
676 					*devfn = drhd->devices[i].devfn;
677 				}
678 				goto out;
679 			}
680 
681 			if (is_downstream_to_pci_bridge(dev, tmp))
682 				goto got_pdev;
683 		}
684 
685 		if (pdev && drhd->include_all) {
686 got_pdev:
687 			if (bus && devfn) {
688 				*bus = pdev->bus->number;
689 				*devfn = pdev->devfn;
690 			}
691 			goto out;
692 		}
693 	}
694 	iommu = NULL;
695 out:
696 	if (iommu_is_dummy(iommu, dev))
697 		iommu = NULL;
698 
699 	rcu_read_unlock();
700 
701 	return iommu;
702 }
703 
704 static void domain_flush_cache(struct dmar_domain *domain,
705 			       void *addr, int size)
706 {
707 	if (!domain->iommu_coherency)
708 		clflush_cache_range(addr, size);
709 }
710 
711 static void free_context_table(struct intel_iommu *iommu)
712 {
713 	struct context_entry *context;
714 	int i;
715 
716 	if (!iommu->root_entry)
717 		return;
718 
719 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
720 		context = iommu_context_addr(iommu, i, 0, 0);
721 		if (context)
722 			free_pgtable_page(context);
723 
724 		if (!sm_supported(iommu))
725 			continue;
726 
727 		context = iommu_context_addr(iommu, i, 0x80, 0);
728 		if (context)
729 			free_pgtable_page(context);
730 	}
731 
732 	free_pgtable_page(iommu->root_entry);
733 	iommu->root_entry = NULL;
734 }
735 
736 #ifdef CONFIG_DMAR_DEBUG
737 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
738 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
739 {
740 	struct dma_pte *pte;
741 	int offset;
742 
743 	while (1) {
744 		offset = pfn_level_offset(pfn, level);
745 		pte = &parent[offset];
746 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
747 			pr_info("PTE not present at level %d\n", level);
748 			break;
749 		}
750 
751 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
752 
753 		if (level == 1)
754 			break;
755 
756 		parent = phys_to_virt(dma_pte_addr(pte));
757 		level--;
758 	}
759 }
760 
761 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
762 			  unsigned long long addr, u32 pasid)
763 {
764 	struct pasid_dir_entry *dir, *pde;
765 	struct pasid_entry *entries, *pte;
766 	struct context_entry *ctx_entry;
767 	struct root_entry *rt_entry;
768 	int i, dir_index, index, level;
769 	u8 devfn = source_id & 0xff;
770 	u8 bus = source_id >> 8;
771 	struct dma_pte *pgtable;
772 
773 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
774 
775 	/* root entry dump */
776 	rt_entry = &iommu->root_entry[bus];
777 	if (!rt_entry) {
778 		pr_info("root table entry is not present\n");
779 		return;
780 	}
781 
782 	if (sm_supported(iommu))
783 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
784 			rt_entry->hi, rt_entry->lo);
785 	else
786 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
787 
788 	/* context entry dump */
789 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
790 	if (!ctx_entry) {
791 		pr_info("context table entry is not present\n");
792 		return;
793 	}
794 
795 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
796 		ctx_entry->hi, ctx_entry->lo);
797 
798 	/* legacy mode does not require PASID entries */
799 	if (!sm_supported(iommu)) {
800 		level = agaw_to_level(ctx_entry->hi & 7);
801 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
802 		goto pgtable_walk;
803 	}
804 
805 	/* get the pointer to pasid directory entry */
806 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
807 	if (!dir) {
808 		pr_info("pasid directory entry is not present\n");
809 		return;
810 	}
811 	/* For request-without-pasid, get the pasid from context entry */
812 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
813 		pasid = IOMMU_NO_PASID;
814 
815 	dir_index = pasid >> PASID_PDE_SHIFT;
816 	pde = &dir[dir_index];
817 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
818 
819 	/* get the pointer to the pasid table entry */
820 	entries = get_pasid_table_from_pde(pde);
821 	if (!entries) {
822 		pr_info("pasid table entry is not present\n");
823 		return;
824 	}
825 	index = pasid & PASID_PTE_MASK;
826 	pte = &entries[index];
827 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
828 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
829 
830 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
831 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
832 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
833 	} else {
834 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
835 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
836 	}
837 
838 pgtable_walk:
839 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
840 }
841 #endif
842 
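/*
 * Look up (and possibly build) the page-table entry for @pfn. A
 * *target_level of 0 means "find the leaf": the walk stops at the first
 * superpage or non-present entry and reports the level reached back through
 * *target_level. A non-zero *target_level returns the PTE at exactly that
 * level, allocating intermediate tables on the way down.
 */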
843 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
844 				      unsigned long pfn, int *target_level,
845 				      gfp_t gfp)
846 {
847 	struct dma_pte *parent, *pte;
848 	int level = agaw_to_level(domain->agaw);
849 	int offset;
850 
851 	if (!domain_pfn_supported(domain, pfn))
852 		/* Address beyond IOMMU's addressing capabilities. */
853 		return NULL;
854 
855 	parent = domain->pgd;
856 
857 	while (1) {
858 		void *tmp_page;
859 
860 		offset = pfn_level_offset(pfn, level);
861 		pte = &parent[offset];
862 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
863 			break;
864 		if (level == *target_level)
865 			break;
866 
867 		if (!dma_pte_present(pte)) {
868 			uint64_t pteval;
869 
870 			tmp_page = alloc_pgtable_page(domain->nid, gfp);
871 
872 			if (!tmp_page)
873 				return NULL;
874 
875 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
876 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
877 			if (domain->use_first_level)
878 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
879 
880 			if (cmpxchg64(&pte->val, 0ULL, pteval))
881 				/* Someone else set it while we were thinking; use theirs. */
882 				free_pgtable_page(tmp_page);
883 			else
884 				domain_flush_cache(domain, pte, sizeof(*pte));
885 		}
886 		if (level == 1)
887 			break;
888 
889 		parent = phys_to_virt(dma_pte_addr(pte));
890 		level--;
891 	}
892 
893 	if (!*target_level)
894 		*target_level = level;
895 
896 	return pte;
897 }
898 
899 /* return address's pte at specific level */
900 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
901 					 unsigned long pfn,
902 					 int level, int *large_page)
903 {
904 	struct dma_pte *parent, *pte;
905 	int total = agaw_to_level(domain->agaw);
906 	int offset;
907 
908 	parent = domain->pgd;
909 	while (level <= total) {
910 		offset = pfn_level_offset(pfn, total);
911 		pte = &parent[offset];
912 		if (level == total)
913 			return pte;
914 
915 		if (!dma_pte_present(pte)) {
916 			*large_page = total;
917 			break;
918 		}
919 
920 		if (dma_pte_superpage(pte)) {
921 			*large_page = total;
922 			return pte;
923 		}
924 
925 		parent = phys_to_virt(dma_pte_addr(pte));
926 		total--;
927 	}
928 	return NULL;
929 }
930 
931 /* clear last level pte, a tlb flush should follow */
932 static void dma_pte_clear_range(struct dmar_domain *domain,
933 				unsigned long start_pfn,
934 				unsigned long last_pfn)
935 {
936 	unsigned int large_page;
937 	struct dma_pte *first_pte, *pte;
938 
939 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
940 	    WARN_ON(start_pfn > last_pfn))
941 		return;
942 
943 	/* we don't need lock here; nobody else touches the iova range */
944 	do {
945 		large_page = 1;
946 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
947 		if (!pte) {
948 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
949 			continue;
950 		}
951 		do {
952 			dma_clear_pte(pte);
953 			start_pfn += lvl_to_nr_pages(large_page);
954 			pte++;
955 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
956 
957 		domain_flush_cache(domain, first_pte,
958 				   (void *)pte - (void *)first_pte);
959 
960 	} while (start_pfn && start_pfn <= last_pfn);
961 }
962 
963 static void dma_pte_free_level(struct dmar_domain *domain, int level,
964 			       int retain_level, struct dma_pte *pte,
965 			       unsigned long pfn, unsigned long start_pfn,
966 			       unsigned long last_pfn)
967 {
968 	pfn = max(start_pfn, pfn);
969 	pte = &pte[pfn_level_offset(pfn, level)];
970 
971 	do {
972 		unsigned long level_pfn;
973 		struct dma_pte *level_pte;
974 
975 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
976 			goto next;
977 
978 		level_pfn = pfn & level_mask(level);
979 		level_pte = phys_to_virt(dma_pte_addr(pte));
980 
981 		if (level > 2) {
982 			dma_pte_free_level(domain, level - 1, retain_level,
983 					   level_pte, level_pfn, start_pfn,
984 					   last_pfn);
985 		}
986 
987 		/*
988 		 * Free the page table if we're below the level we want to
989 		 * retain and the range covers the entire table.
990 		 */
991 		if (level < retain_level && !(start_pfn > level_pfn ||
992 		      last_pfn < level_pfn + level_size(level) - 1)) {
993 			dma_clear_pte(pte);
994 			domain_flush_cache(domain, pte, sizeof(*pte));
995 			free_pgtable_page(level_pte);
996 		}
997 next:
998 		pfn += level_size(level);
999 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1000 }
1001 
1002 /*
1003  * clear last level (leaf) ptes and free page table pages below the
1004  * level we wish to keep intact.
1005  */
1006 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1007 				   unsigned long start_pfn,
1008 				   unsigned long last_pfn,
1009 				   int retain_level)
1010 {
1011 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1012 
1013 	/* We don't need lock here; nobody else touches the iova range */
1014 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1015 			   domain->pgd, 0, start_pfn, last_pfn);
1016 
1017 	/* free pgd */
1018 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1019 		free_pgtable_page(domain->pgd);
1020 		domain->pgd = NULL;
1021 	}
1022 }
1023 
1024 /* When a page at a given level is being unlinked from its parent, we don't
1025    need to *modify* it at all. All we need to do is make a list of all the
1026    pages which can be freed just as soon as we've flushed the IOTLB and we
1027    know the hardware page-walk will no longer touch them.
1028    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1029    be freed. */
1030 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1031 				    int level, struct dma_pte *pte,
1032 				    struct list_head *freelist)
1033 {
1034 	struct page *pg;
1035 
1036 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1037 	list_add_tail(&pg->lru, freelist);
1038 
1039 	if (level == 1)
1040 		return;
1041 
1042 	pte = page_address(pg);
1043 	do {
1044 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1045 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1046 		pte++;
1047 	} while (!first_pte_in_page(pte));
1048 }
1049 
1050 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1051 				struct dma_pte *pte, unsigned long pfn,
1052 				unsigned long start_pfn, unsigned long last_pfn,
1053 				struct list_head *freelist)
1054 {
1055 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1056 
1057 	pfn = max(start_pfn, pfn);
1058 	pte = &pte[pfn_level_offset(pfn, level)];
1059 
1060 	do {
1061 		unsigned long level_pfn = pfn & level_mask(level);
1062 
1063 		if (!dma_pte_present(pte))
1064 			goto next;
1065 
1066 		/* If range covers entire pagetable, free it */
1067 		if (start_pfn <= level_pfn &&
1068 		    last_pfn >= level_pfn + level_size(level) - 1) {
1069 			/* These subordinate page tables are going away entirely. Don't
1070 			   bother to clear them; we're just going to *free* them. */
1071 			if (level > 1 && !dma_pte_superpage(pte))
1072 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1073 
1074 			dma_clear_pte(pte);
1075 			if (!first_pte)
1076 				first_pte = pte;
1077 			last_pte = pte;
1078 		} else if (level > 1) {
1079 			/* Recurse down into a level that isn't *entirely* obsolete */
1080 			dma_pte_clear_level(domain, level - 1,
1081 					    phys_to_virt(dma_pte_addr(pte)),
1082 					    level_pfn, start_pfn, last_pfn,
1083 					    freelist);
1084 		}
1085 next:
1086 		pfn = level_pfn + level_size(level);
1087 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1088 
1089 	if (first_pte)
1090 		domain_flush_cache(domain, first_pte,
1091 				   (void *)++last_pte - (void *)first_pte);
1092 }
1093 
1094 /* We can't just free the pages because the IOMMU may still be walking
1095    the page tables, and may have cached the intermediate levels. The
1096    pages can only be freed after the IOTLB flush has been done. */
1097 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1098 			 unsigned long last_pfn, struct list_head *freelist)
1099 {
1100 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1101 	    WARN_ON(start_pfn > last_pfn))
1102 		return;
1103 
1104 	/* we don't need lock here; nobody else touches the iova range */
1105 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1106 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1107 
1108 	/* free pgd */
1109 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1110 		struct page *pgd_page = virt_to_page(domain->pgd);
1111 		list_add_tail(&pgd_page->lru, freelist);
1112 		domain->pgd = NULL;
1113 	}
1114 }
1115 
1116 /* iommu handling */
1117 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1118 {
1119 	struct root_entry *root;
1120 
1121 	root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1122 	if (!root) {
1123 		pr_err("Allocating root entry for %s failed\n",
1124 			iommu->name);
1125 		return -ENOMEM;
1126 	}
1127 
1128 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1129 	iommu->root_entry = root;
1130 
1131 	return 0;
1132 }
1133 
1134 static void iommu_set_root_entry(struct intel_iommu *iommu)
1135 {
1136 	u64 addr;
1137 	u32 sts;
1138 	unsigned long flag;
1139 
1140 	addr = virt_to_phys(iommu->root_entry);
1141 	if (sm_supported(iommu))
1142 		addr |= DMA_RTADDR_SMT;
1143 
1144 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1145 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1146 
1147 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1148 
1149 	/* Make sure hardware complete it */
1150 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1151 		      readl, (sts & DMA_GSTS_RTPS), sts);
1152 
1153 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1154 
1155 	/*
1156 	 * Hardware invalidates all DMA remapping hardware translation
1157 	 * caches as part of SRTP flow.
1158 	 */
1159 	if (cap_esrtps(iommu->cap))
1160 		return;
1161 
1162 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1163 	if (sm_supported(iommu))
1164 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1165 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1166 }
1167 
1168 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1169 {
1170 	u32 val;
1171 	unsigned long flag;
1172 
1173 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1174 		return;
1175 
1176 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1177 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1178 
1179 	/* Make sure hardware complete it */
1180 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1181 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1182 
1183 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1184 }
1185 
1186 /* Invalidate the context-cache through the register-based (CCMD) interface. */
1187 static void __iommu_flush_context(struct intel_iommu *iommu,
1188 				  u16 did, u16 source_id, u8 function_mask,
1189 				  u64 type)
1190 {
1191 	u64 val = 0;
1192 	unsigned long flag;
1193 
1194 	switch (type) {
1195 	case DMA_CCMD_GLOBAL_INVL:
1196 		val = DMA_CCMD_GLOBAL_INVL;
1197 		break;
1198 	case DMA_CCMD_DOMAIN_INVL:
1199 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1200 		break;
1201 	case DMA_CCMD_DEVICE_INVL:
1202 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1203 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1204 		break;
1205 	default:
1206 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1207 			iommu->name, type);
1208 		return;
1209 	}
1210 	val |= DMA_CCMD_ICC;
1211 
1212 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1214 
1215 	/* Make sure hardware complete it */
1216 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1217 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1218 
1219 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 }
1221 
1222 /* Invalidate the IOTLB through the register-based invalidation interface. */
1223 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1224 				u64 addr, unsigned int size_order, u64 type)
1225 {
1226 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1227 	u64 val = 0, val_iva = 0;
1228 	unsigned long flag;
1229 
1230 	switch (type) {
1231 	case DMA_TLB_GLOBAL_FLUSH:
1232 		/* global flush doesn't need set IVA_REG */
1233 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1234 		break;
1235 	case DMA_TLB_DSI_FLUSH:
1236 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1237 		break;
1238 	case DMA_TLB_PSI_FLUSH:
1239 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1240 		/* IH bit is passed in as part of address */
1241 		val_iva = size_order | addr;
1242 		break;
1243 	default:
1244 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1245 			iommu->name, type);
1246 		return;
1247 	}
1248 
1249 	if (cap_write_drain(iommu->cap))
1250 		val |= DMA_TLB_WRITE_DRAIN;
1251 
1252 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 	/* Note: Only uses first TLB reg currently */
1254 	if (val_iva)
1255 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1256 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1257 
1258 	/* Make sure hardware complete it */
1259 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1260 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1261 
1262 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263 
1264 	/* check IOTLB invalidation granularity */
1265 	if (DMA_TLB_IAIG(val) == 0)
1266 		pr_err("Flush IOTLB failed\n");
1267 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1268 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1269 			(unsigned long long)DMA_TLB_IIRG(type),
1270 			(unsigned long long)DMA_TLB_IAIG(val));
1271 }
1272 
1273 static struct device_domain_info *
1274 domain_lookup_dev_info(struct dmar_domain *domain,
1275 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1276 {
1277 	struct device_domain_info *info;
1278 	unsigned long flags;
1279 
1280 	spin_lock_irqsave(&domain->lock, flags);
1281 	list_for_each_entry(info, &domain->devices, link) {
1282 		if (info->iommu == iommu && info->bus == bus &&
1283 		    info->devfn == devfn) {
1284 			spin_unlock_irqrestore(&domain->lock, flags);
1285 			return info;
1286 		}
1287 	}
1288 	spin_unlock_irqrestore(&domain->lock, flags);
1289 
1290 	return NULL;
1291 }
1292 
1293 void domain_update_iotlb(struct dmar_domain *domain)
1294 {
1295 	struct dev_pasid_info *dev_pasid;
1296 	struct device_domain_info *info;
1297 	bool has_iotlb_device = false;
1298 	unsigned long flags;
1299 
1300 	spin_lock_irqsave(&domain->lock, flags);
1301 	list_for_each_entry(info, &domain->devices, link) {
1302 		if (info->ats_enabled) {
1303 			has_iotlb_device = true;
1304 			break;
1305 		}
1306 	}
1307 
1308 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1309 		info = dev_iommu_priv_get(dev_pasid->dev);
1310 		if (info->ats_enabled) {
1311 			has_iotlb_device = true;
1312 			break;
1313 		}
1314 	}
1315 	domain->has_iotlb_device = has_iotlb_device;
1316 	spin_unlock_irqrestore(&domain->lock, flags);
1317 }
1318 
1319 /*
1320  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1321  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1322  * check because it applies only to the built-in QAT devices and it doesn't
1323  * grant additional privileges.
1324  */
1325 #define BUGGY_QAT_DEVID_MASK 0x4940
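/* (device & 0xfffc) == 0x4940 matches exactly the IDs 0x4940-0x4943 above. */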
1326 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1327 {
1328 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1329 		return false;
1330 
1331 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1332 		return false;
1333 
1334 	return true;
1335 }
1336 
1337 static void iommu_enable_pci_caps(struct device_domain_info *info)
1338 {
1339 	struct pci_dev *pdev;
1340 
1341 	if (!dev_is_pci(info->dev))
1342 		return;
1343 
1344 	pdev = to_pci_dev(info->dev);
1345 
1346 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1347 	   the device if you enable PASID support after ATS support is
1348 	   undefined. So always enable PASID support on devices which
1349 	   have it, even if we can't yet know if we're ever going to
1350 	   use it. */
1351 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1352 		info->pasid_enabled = 1;
1353 
1354 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1355 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1356 		info->ats_enabled = 1;
1357 		domain_update_iotlb(info->domain);
1358 	}
1359 }
1360 
1361 static void iommu_disable_pci_caps(struct device_domain_info *info)
1362 {
1363 	struct pci_dev *pdev;
1364 
1365 	if (!dev_is_pci(info->dev))
1366 		return;
1367 
1368 	pdev = to_pci_dev(info->dev);
1369 
1370 	if (info->ats_enabled) {
1371 		pci_disable_ats(pdev);
1372 		info->ats_enabled = 0;
1373 		domain_update_iotlb(info->domain);
1374 	}
1375 
1376 	if (info->pasid_enabled) {
1377 		pci_disable_pasid(pdev);
1378 		info->pasid_enabled = 0;
1379 	}
1380 }
1381 
1382 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1383 				    u64 addr, unsigned int mask)
1384 {
1385 	u16 sid, qdep;
1386 
1387 	if (!info || !info->ats_enabled)
1388 		return;
1389 
1390 	sid = info->bus << 8 | info->devfn;
1391 	qdep = info->ats_qdep;
1392 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1393 			   qdep, addr, mask);
1394 	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1395 }
1396 
1397 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1398 				  u64 addr, unsigned mask)
1399 {
1400 	struct dev_pasid_info *dev_pasid;
1401 	struct device_domain_info *info;
1402 	unsigned long flags;
1403 
1404 	if (!domain->has_iotlb_device)
1405 		return;
1406 
1407 	spin_lock_irqsave(&domain->lock, flags);
1408 	list_for_each_entry(info, &domain->devices, link)
1409 		__iommu_flush_dev_iotlb(info, addr, mask);
1410 
1411 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1412 		info = dev_iommu_priv_get(dev_pasid->dev);
1413 
1414 		if (!info->ats_enabled)
1415 			continue;
1416 
1417 		qi_flush_dev_iotlb_pasid(info->iommu,
1418 					 PCI_DEVID(info->bus, info->devfn),
1419 					 info->pfsid, dev_pasid->pasid,
1420 					 info->ats_qdep, addr,
1421 					 mask);
1422 	}
1423 	spin_unlock_irqrestore(&domain->lock, flags);
1424 }
1425 
1426 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1427 				     struct dmar_domain *domain, u64 addr,
1428 				     unsigned long npages, bool ih)
1429 {
1430 	u16 did = domain_id_iommu(domain, iommu);
1431 	struct dev_pasid_info *dev_pasid;
1432 	unsigned long flags;
1433 
1434 	spin_lock_irqsave(&domain->lock, flags);
1435 	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1436 		qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1437 
1438 	if (!list_empty(&domain->devices))
1439 		qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1440 	spin_unlock_irqrestore(&domain->lock, flags);
1441 }
1442 
1443 static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1444 				    unsigned long pfn, unsigned int pages,
1445 				    int ih)
1446 {
1447 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1448 	unsigned long bitmask = aligned_pages - 1;
1449 	unsigned int mask = ilog2(aligned_pages);
1450 	u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1451 
1452 	/*
1453 	 * PSI masks the low order bits of the base address. If the
1454 	 * address isn't aligned to the mask, then compute a mask value
1455 	 * needed to ensure the target range is flushed.
1456 	 */
1457 	if (unlikely(bitmask & pfn)) {
1458 		unsigned long end_pfn = pfn + pages - 1, shared_bits;
1459 
1460 		/*
1461 		 * Since end_pfn <= pfn + bitmask, the only way bits
1462 		 * higher than bitmask can differ in pfn and end_pfn is
1463 		 * by carrying. This means after masking out bitmask,
1464 		 * high bits starting with the first set bit in
1465 		 * shared_bits are all equal in both pfn and end_pfn.
1466 		 */
1467 		shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1468 		mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1469 	}
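	/*
	 * Worked example (illustrative numbers): pfn = 3 and pages = 2 give
	 * aligned_pages = 2 and bitmask = 1, so pfn is misaligned; end_pfn = 4,
	 * the lowest bit set in shared_bits is bit 3, and mask becomes 3, i.e.
	 * one PSI covering pfn 0-7, which contains the requested range.
	 */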
1470 
1471 	/*
1472 	 * Fall back to domain-selective flush if there is no PSI support or
1473 	 * the size is too big.
1474 	 */
1475 	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1476 		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1477 					 DMA_TLB_DSI_FLUSH);
1478 	else
1479 		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1480 					 DMA_TLB_PSI_FLUSH);
1481 }
1482 
1483 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484 				  struct dmar_domain *domain,
1485 				  unsigned long pfn, unsigned int pages,
1486 				  int ih, int map)
1487 {
1488 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1489 	unsigned int mask = ilog2(aligned_pages);
1490 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1491 	u16 did = domain_id_iommu(domain, iommu);
1492 
1493 	if (WARN_ON(!pages))
1494 		return;
1495 
1496 	if (ih)
1497 		ih = 1 << 6;
1498 
1499 	if (domain->use_first_level)
1500 		domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1501 	else
1502 		__iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1503 
1504 	/*
1505 	 * In caching mode, changing a page from non-present to present requires
1506 	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1507 	 */
1508 	if (!cap_caching_mode(iommu->cap) || !map)
1509 		iommu_flush_dev_iotlb(domain, addr, mask);
1510 }
1511 
1512 /* Notification for newly created mappings */
1513 static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1514 				 unsigned long pfn, unsigned int pages)
1515 {
1516 	/*
1517 	 * It's a non-present to present mapping. Only flush if caching mode
1518 	 * and second level.
1519 	 */
1520 	if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1521 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1522 	else
1523 		iommu_flush_write_buffer(iommu);
1524 }
1525 
1526 /*
1527  * Flush the relevant caches in nested translation if the domain
1528  * also serves as a parent
1529  */
1530 static void parent_domain_flush(struct dmar_domain *domain,
1531 				unsigned long pfn,
1532 				unsigned long pages, int ih)
1533 {
1534 	struct dmar_domain *s1_domain;
1535 
1536 	spin_lock(&domain->s1_lock);
1537 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1538 		struct device_domain_info *device_info;
1539 		struct iommu_domain_info *info;
1540 		unsigned long flags;
1541 		unsigned long i;
1542 
1543 		xa_for_each(&s1_domain->iommu_array, i, info)
1544 			__iommu_flush_iotlb_psi(info->iommu, info->did,
1545 						pfn, pages, ih);
1546 
1547 		if (!s1_domain->has_iotlb_device)
1548 			continue;
1549 
1550 		spin_lock_irqsave(&s1_domain->lock, flags);
1551 		list_for_each_entry(device_info, &s1_domain->devices, link)
1552 			/*
1553 			 * The device-side address translation cache (ATC) caches the
1554 			 * result of nested translation. There is no easy way
1555 			 * to identify the exact set of nested translations
1556 			 * affected by a change in S2. So just flush the entire
1557 			 * device cache.
1558 			 */
1559 			__iommu_flush_dev_iotlb(device_info, 0,
1560 						MAX_AGAW_PFN_WIDTH);
1561 		spin_unlock_irqrestore(&s1_domain->lock, flags);
1562 	}
1563 	spin_unlock(&domain->s1_lock);
1564 }
1565 
1566 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1567 {
1568 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1569 	struct iommu_domain_info *info;
1570 	unsigned long idx;
1571 
1572 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1573 		struct intel_iommu *iommu = info->iommu;
1574 		u16 did = domain_id_iommu(dmar_domain, iommu);
1575 
1576 		if (dmar_domain->use_first_level)
1577 			domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1578 		else
1579 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1580 						 DMA_TLB_DSI_FLUSH);
1581 
1582 		if (!cap_caching_mode(iommu->cap))
1583 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1584 	}
1585 
1586 	if (dmar_domain->nested_parent)
1587 		parent_domain_flush(dmar_domain, 0, -1, 0);
1588 }
1589 
1590 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1591 {
1592 	u32 pmen;
1593 	unsigned long flags;
1594 
1595 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1596 		return;
1597 
1598 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1599 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1600 	pmen &= ~DMA_PMEN_EPM;
1601 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1602 
1603 	/* wait for the protected region status bit to clear */
1604 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1605 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1606 
1607 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1608 }
1609 
1610 static void iommu_enable_translation(struct intel_iommu *iommu)
1611 {
1612 	u32 sts;
1613 	unsigned long flags;
1614 
1615 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1616 	iommu->gcmd |= DMA_GCMD_TE;
1617 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1618 
1619 	/* Make sure hardware complete it */
1620 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1621 		      readl, (sts & DMA_GSTS_TES), sts);
1622 
1623 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1624 }
1625 
1626 static void iommu_disable_translation(struct intel_iommu *iommu)
1627 {
1628 	u32 sts;
1629 	unsigned long flag;
1630 
1631 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1632 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1633 		return;
1634 
1635 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1636 	iommu->gcmd &= ~DMA_GCMD_TE;
1637 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1638 
1639 	/* Make sure hardware complete it */
1640 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1641 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1642 
1643 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1644 }
1645 
1646 static int iommu_init_domains(struct intel_iommu *iommu)
1647 {
1648 	u32 ndomains;
1649 
1650 	ndomains = cap_ndoms(iommu->cap);
1651 	pr_debug("%s: Number of Domains supported <%d>\n",
1652 		 iommu->name, ndomains);
1653 
1654 	spin_lock_init(&iommu->lock);
1655 
1656 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1657 	if (!iommu->domain_ids)
1658 		return -ENOMEM;
1659 
1660 	/*
1661 	 * If Caching mode is set, then invalid translations are tagged
1662 	 * with domain-id 0, hence we need to pre-allocate it. We also
1663 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1664 	 * make sure it is not used for a real domain.
1665 	 */
1666 	set_bit(0, iommu->domain_ids);
1667 
1668 	/*
1669 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1670 	 * entry for first-level or pass-through translation modes should
1671 	 * be programmed with a domain id different from those used for
1672 	 * second-level or nested translation. We reserve a domain id for
1673 	 * this purpose.
1674 	 */
1675 	if (sm_supported(iommu))
1676 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1677 
1678 	return 0;
1679 }
1680 
1681 static void disable_dmar_iommu(struct intel_iommu *iommu)
1682 {
1683 	if (!iommu->domain_ids)
1684 		return;
1685 
1686 	/*
1687 	 * All iommu domains must have been detached from the devices,
1688 	 * hence there should be no domain IDs in use.
1689 	 */
1690 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1691 		    > NUM_RESERVED_DID))
1692 		return;
1693 
1694 	if (iommu->gcmd & DMA_GCMD_TE)
1695 		iommu_disable_translation(iommu);
1696 }
1697 
1698 static void free_dmar_iommu(struct intel_iommu *iommu)
1699 {
1700 	if (iommu->domain_ids) {
1701 		bitmap_free(iommu->domain_ids);
1702 		iommu->domain_ids = NULL;
1703 	}
1704 
1705 	if (iommu->copied_tables) {
1706 		bitmap_free(iommu->copied_tables);
1707 		iommu->copied_tables = NULL;
1708 	}
1709 
1710 	/* free context mapping */
1711 	free_context_table(iommu);
1712 
1713 #ifdef CONFIG_INTEL_IOMMU_SVM
1714 	if (pasid_supported(iommu)) {
1715 		if (ecap_prs(iommu->ecap))
1716 			intel_svm_finish_prq(iommu);
1717 	}
1718 #endif
1719 }
1720 
1721 /*
1722  * Check and return whether first level is used by default for
1723  * DMA translation.
1724  */
1725 static bool first_level_by_default(unsigned int type)
1726 {
1727 	/* Only SL is available in legacy mode */
1728 	if (!scalable_mode_support())
1729 		return false;
1730 
1731 	/* Only level (either FL or SL) is available, just use it */
1732 	/* Only one level (either FL or SL) is available, just use it */
1733 		return intel_cap_flts_sanity();
1734 
1735 	/* Both levels are available, decide it based on domain type */
1736 	return type != IOMMU_DOMAIN_UNMANAGED;
1737 }
1738 
1739 static struct dmar_domain *alloc_domain(unsigned int type)
1740 {
1741 	struct dmar_domain *domain;
1742 
1743 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1744 	if (!domain)
1745 		return NULL;
1746 
1747 	domain->nid = NUMA_NO_NODE;
1748 	if (first_level_by_default(type))
1749 		domain->use_first_level = true;
1750 	domain->has_iotlb_device = false;
1751 	INIT_LIST_HEAD(&domain->devices);
1752 	INIT_LIST_HEAD(&domain->dev_pasids);
1753 	spin_lock_init(&domain->lock);
1754 	xa_init(&domain->iommu_array);
1755 
1756 	return domain;
1757 }
1758 
1759 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1760 {
1761 	struct iommu_domain_info *info, *curr;
1762 	unsigned long ndomains;
1763 	int num, ret = -ENOSPC;
1764 
1765 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1766 	if (!info)
1767 		return -ENOMEM;
1768 
1769 	spin_lock(&iommu->lock);
1770 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1771 	if (curr) {
1772 		curr->refcnt++;
1773 		spin_unlock(&iommu->lock);
1774 		kfree(info);
1775 		return 0;
1776 	}
1777 
1778 	ndomains = cap_ndoms(iommu->cap);
1779 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1780 	if (num >= ndomains) {
1781 		pr_err("%s: No free domain ids\n", iommu->name);
1782 		goto err_unlock;
1783 	}
1784 
1785 	set_bit(num, iommu->domain_ids);
1786 	info->refcnt	= 1;
1787 	info->did	= num;
1788 	info->iommu	= iommu;
1789 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1790 			  NULL, info, GFP_ATOMIC);
1791 	if (curr) {
1792 		ret = xa_err(curr) ? : -EBUSY;
1793 		goto err_clear;
1794 	}
1795 	domain_update_iommu_cap(domain);
1796 
1797 	spin_unlock(&iommu->lock);
1798 	return 0;
1799 
1800 err_clear:
1801 	clear_bit(info->did, iommu->domain_ids);
1802 err_unlock:
1803 	spin_unlock(&iommu->lock);
1804 	kfree(info);
1805 	return ret;
1806 }
1807 
1808 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1809 {
1810 	struct iommu_domain_info *info;
1811 
1812 	spin_lock(&iommu->lock);
1813 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1814 	if (--info->refcnt == 0) {
1815 		clear_bit(info->did, iommu->domain_ids);
1816 		xa_erase(&domain->iommu_array, iommu->seq_id);
1817 		domain->nid = NUMA_NO_NODE;
1818 		domain_update_iommu_cap(domain);
1819 		kfree(info);
1820 	}
1821 	spin_unlock(&iommu->lock);
1822 }
1823 
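/*
 * Round a guest address width up to the nearest width of the form 12 + 9*n
 * (a 12-bit page offset plus 9 bits per page-table level), capped at 64.
 * For example, gaw = 48 stays 48 while gaw = 50 rounds up to 57.
 */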
1824 static int guestwidth_to_adjustwidth(int gaw)
1825 {
1826 	int agaw;
1827 	int r = (gaw - 12) % 9;
1828 
1829 	if (r == 0)
1830 		agaw = gaw;
1831 	else
1832 		agaw = gaw + 9 - r;
1833 	if (agaw > 64)
1834 		agaw = 64;
1835 	return agaw;
1836 }
1837 
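/*
 * Free a dmar_domain: unmap everything and release the page tables, then
 * free the domain itself. The domain must not have any devices attached.
 */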
1838 static void domain_exit(struct dmar_domain *domain)
1839 {
1840 	if (domain->pgd) {
1841 		LIST_HEAD(freelist);
1842 
1843 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1844 		put_pages_list(&freelist);
1845 	}
1846 
1847 	if (WARN_ON(!list_empty(&domain->devices)))
1848 		return;
1849 
1850 	kfree(domain);
1851 }
1852 
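/*
 * Program the legacy mode context entry for (@bus, @devfn) on @iommu so
 * that DMA is translated through @domain's page table, or passed through
 * for the static identity domain, then flush the caches as required.
 */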
1853 static int domain_context_mapping_one(struct dmar_domain *domain,
1854 				      struct intel_iommu *iommu,
1855 				      u8 bus, u8 devfn)
1856 {
1857 	struct device_domain_info *info =
1858 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1859 	u16 did = domain_id_iommu(domain, iommu);
1860 	int translation = CONTEXT_TT_MULTI_LEVEL;
1861 	struct dma_pte *pgd = domain->pgd;
1862 	struct context_entry *context;
1863 	int agaw, ret;
1864 
1865 	if (hw_pass_through && domain_type_is_si(domain))
1866 		translation = CONTEXT_TT_PASS_THROUGH;
1867 
1868 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1869 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1870 
1871 	spin_lock(&iommu->lock);
1872 	ret = -ENOMEM;
1873 	context = iommu_context_addr(iommu, bus, devfn, 1);
1874 	if (!context)
1875 		goto out_unlock;
1876 
1877 	ret = 0;
1878 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1879 		goto out_unlock;
1880 
	/*
	 * For kdump cases, old valid entries may be cached due to
	 * in-flight DMA and the copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to have finished reset at its driver probe stage,
	 * so no in-flight DMA will exist, and we don't need to worry
	 * about it hereafter.
	 */
1890 	if (context_copied(iommu, bus, devfn)) {
1891 		u16 did_old = context_domain_id(context);
1892 
1893 		if (did_old < cap_ndoms(iommu->cap)) {
1894 			iommu->flush.flush_context(iommu, did_old,
1895 						   (((u16)bus) << 8) | devfn,
1896 						   DMA_CCMD_MASK_NOBIT,
1897 						   DMA_CCMD_DEVICE_INVL);
1898 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1899 						 DMA_TLB_DSI_FLUSH);
1900 		}
1901 
1902 		clear_context_copied(iommu, bus, devfn);
1903 	}
1904 
1905 	context_clear_entry(context);
1906 	context_set_domain_id(context, did);
1907 
1908 	if (translation != CONTEXT_TT_PASS_THROUGH) {
		/*
		 * Skip the top levels of the page tables for IOMMUs whose
		 * agaw is less than the default. Unnecessary for PT mode.
		 */
1913 		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1914 			ret = -ENOMEM;
1915 			pgd = phys_to_virt(dma_pte_addr(pgd));
1916 			if (!dma_pte_present(pgd))
1917 				goto out_unlock;
1918 		}
1919 
1920 		if (info && info->ats_supported)
1921 			translation = CONTEXT_TT_DEV_IOTLB;
1922 		else
1923 			translation = CONTEXT_TT_MULTI_LEVEL;
1924 
1925 		context_set_address_root(context, virt_to_phys(pgd));
1926 		context_set_address_width(context, agaw);
1927 	} else {
1928 		/*
1929 		 * In pass through mode, AW must be programmed to
1930 		 * indicate the largest AGAW value supported by
1931 		 * hardware. And ASR is ignored by hardware.
1932 		 */
1933 		context_set_address_width(context, iommu->msagaw);
1934 	}
1935 
1936 	context_set_translation_type(context, translation);
1937 	context_set_fault_enable(context);
1938 	context_set_present(context);
1939 	if (!ecap_coherent(iommu->ecap))
1940 		clflush_cache_range(context, sizeof(*context));
1941 
	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
1948 	if (cap_caching_mode(iommu->cap)) {
1949 		iommu->flush.flush_context(iommu, 0,
1950 					   (((u16)bus) << 8) | devfn,
1951 					   DMA_CCMD_MASK_NOBIT,
1952 					   DMA_CCMD_DEVICE_INVL);
1953 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1954 	} else {
1955 		iommu_flush_write_buffer(iommu);
1956 	}
1957 
1958 	ret = 0;
1959 
1960 out_unlock:
1961 	spin_unlock(&iommu->lock);
1962 
1963 	return ret;
1964 }
1965 
1966 static int domain_context_mapping_cb(struct pci_dev *pdev,
1967 				     u16 alias, void *opaque)
1968 {
1969 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1970 	struct intel_iommu *iommu = info->iommu;
1971 	struct dmar_domain *domain = opaque;
1972 
1973 	return domain_context_mapping_one(domain, iommu,
1974 					  PCI_BUS_NUM(alias), alias & 0xff);
1975 }
1976 
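/*
 * Set up context entries for @dev and, for PCI devices, for all of its
 * DMA aliases.
 */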
1977 static int
1978 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1979 {
1980 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1981 	struct intel_iommu *iommu = info->iommu;
1982 	u8 bus = info->bus, devfn = info->devfn;
1983 
1984 	if (!dev_is_pci(dev))
1985 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1986 
1987 	return pci_for_each_dma_alias(to_pci_dev(dev),
1988 				      domain_context_mapping_cb, domain);
1989 }
1990 
/* Return the number of VT-d pages needed, with the length rounded up to the MM page size */
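/* e.g. with 4KiB pages: host_addr 0x1234, size 0x2000 -> 3 VT-d pages. */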
1992 static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1993 {
1994 	host_addr &= ~PAGE_MASK;
1995 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1996 }
1997 
1998 /* Return largest possible superpage level for a given mapping */
1999 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
2000 				   unsigned long phy_pfn, unsigned long pages)
2001 {
2002 	int support, level = 1;
2003 	unsigned long pfnmerge;
2004 
2005 	support = domain->iommu_superpage;
2006 
	/*
	 * To use a large page, the virtual *and* physical addresses must be
	 * aligned to 2MiB/1GiB/etc. Lower bits set in either of them will
	 * mean we have to use smaller pages. So just merge them and check
	 * both at once.
	 */
2011 	pfnmerge = iov_pfn | phy_pfn;
2012 
2013 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2014 		pages >>= VTD_STRIDE_SHIFT;
2015 		if (!pages)
2016 			break;
2017 		pfnmerge >>= VTD_STRIDE_SHIFT;
2018 		level++;
2019 		support--;
2020 	}
2021 	return level;
2022 }
2023 
2024 /*
2025  * Ensure that old small page tables are removed to make room for superpage(s).
2026  * We're going to add new large pages, so make sure we don't remove their parent
2027  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2028  */
2029 static void switch_to_super_page(struct dmar_domain *domain,
2030 				 unsigned long start_pfn,
2031 				 unsigned long end_pfn, int level)
2032 {
2033 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2034 	struct iommu_domain_info *info;
2035 	struct dma_pte *pte = NULL;
2036 	unsigned long i;
2037 
2038 	while (start_pfn <= end_pfn) {
2039 		if (!pte)
2040 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
2041 					     GFP_ATOMIC);
2042 
2043 		if (dma_pte_present(pte)) {
2044 			dma_pte_free_pagetable(domain, start_pfn,
2045 					       start_pfn + lvl_pages - 1,
2046 					       level + 1);
2047 
2048 			xa_for_each(&domain->iommu_array, i, info)
2049 				iommu_flush_iotlb_psi(info->iommu, domain,
2050 						      start_pfn, lvl_pages,
2051 						      0, 0);
2052 			if (domain->nested_parent)
2053 				parent_domain_flush(domain, start_pfn,
2054 						    lvl_pages, 0);
2055 		}
2056 
2057 		pte++;
2058 		start_pfn += lvl_pages;
2059 		if (first_pte_in_page(pte))
2060 			pte = NULL;
2061 	}
2062 }
2063 
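/*
 * Map @nr_pages contiguous physical pages starting at @phys_pfn to
 * @iov_pfn in @domain's page table, using superpages whenever the
 * hardware and the alignment of the range allow it.
 */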
2064 static int
2065 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2066 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2067 		 gfp_t gfp)
2068 {
2069 	struct dma_pte *first_pte = NULL, *pte = NULL;
2070 	unsigned int largepage_lvl = 0;
2071 	unsigned long lvl_pages = 0;
2072 	phys_addr_t pteval;
2073 	u64 attr;
2074 
2075 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2076 		return -EINVAL;
2077 
2078 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2079 		return -EINVAL;
2080 
2081 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2082 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2083 		return -EINVAL;
2084 	}
2085 
2086 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2087 	attr |= DMA_FL_PTE_PRESENT;
2088 	if (domain->use_first_level) {
2089 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2090 		if (prot & DMA_PTE_WRITE)
2091 			attr |= DMA_FL_PTE_DIRTY;
2092 	}
2093 
2094 	domain->has_mappings = true;
2095 
2096 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2097 
2098 	while (nr_pages > 0) {
2099 		uint64_t tmp;
2100 
2101 		if (!pte) {
2102 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2103 					phys_pfn, nr_pages);
2104 
2105 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2106 					     gfp);
2107 			if (!pte)
2108 				return -ENOMEM;
2109 			first_pte = pte;
2110 
2111 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2112 
			/* It is a large page */
2114 			if (largepage_lvl > 1) {
2115 				unsigned long end_pfn;
2116 				unsigned long pages_to_remove;
2117 
2118 				pteval |= DMA_PTE_LARGE_PAGE;
2119 				pages_to_remove = min_t(unsigned long, nr_pages,
2120 							nr_pte_to_next_page(pte) * lvl_pages);
2121 				end_pfn = iov_pfn + pages_to_remove - 1;
2122 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2123 			} else {
2124 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2125 			}
2126 
2127 		}
		/*
		 * We don't need a lock here, nobody else touches the
		 * iova range.
		 */
2131 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2132 		if (tmp) {
2133 			static int dumps = 5;
2134 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2135 				iov_pfn, tmp, (unsigned long long)pteval);
2136 			if (dumps) {
2137 				dumps--;
2138 				debug_dma_dump_mappings(NULL);
2139 			}
2140 			WARN_ON(1);
2141 		}
2142 
2143 		nr_pages -= lvl_pages;
2144 		iov_pfn += lvl_pages;
2145 		phys_pfn += lvl_pages;
2146 		pteval += lvl_pages * VTD_PAGE_SIZE;
2147 
		/*
		 * If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
2160 		pte++;
2161 		if (!nr_pages || first_pte_in_page(pte) ||
2162 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2163 			domain_flush_cache(domain, first_pte,
2164 					   (void *)pte - (void *)first_pte);
2165 			pte = NULL;
2166 		}
2167 	}
2168 
2169 	return 0;
2170 }
2171 
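/*
 * Clear the context entry for (@bus, @devfn) and flush the context
 * cache, the IOTLB and the device-TLB for the old domain id.
 */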
2172 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2173 {
2174 	struct intel_iommu *iommu = info->iommu;
2175 	struct context_entry *context;
2176 	u16 did_old;
2177 
2178 	spin_lock(&iommu->lock);
2179 	context = iommu_context_addr(iommu, bus, devfn, 0);
2180 	if (!context) {
2181 		spin_unlock(&iommu->lock);
2182 		return;
2183 	}
2184 
2185 	did_old = context_domain_id(context);
2186 
2187 	context_clear_entry(context);
2188 	__iommu_flush_cache(iommu, context, sizeof(*context));
2189 	spin_unlock(&iommu->lock);
2190 	iommu->flush.flush_context(iommu,
2191 				   did_old,
2192 				   (((u16)bus) << 8) | devfn,
2193 				   DMA_CCMD_MASK_NOBIT,
2194 				   DMA_CCMD_DEVICE_INVL);
2195 
2196 	iommu->flush.flush_iotlb(iommu,
2197 				 did_old,
2198 				 0,
2199 				 0,
2200 				 DMA_TLB_DSI_FLUSH);
2201 
2202 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2203 }
2204 
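/*
 * Program a first level (scalable mode) PASID table entry for @dev that
 * points at @domain's page table, using 4- or 5-level paging as dictated
 * by the domain's address width.
 */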
2205 static int domain_setup_first_level(struct intel_iommu *iommu,
2206 				    struct dmar_domain *domain,
2207 				    struct device *dev,
2208 				    u32 pasid)
2209 {
2210 	struct dma_pte *pgd = domain->pgd;
2211 	int agaw, level;
2212 	int flags = 0;
2213 
	/*
	 * Skip the top levels of the page tables for IOMMUs whose agaw is
	 * less than the default. Unnecessary for PT mode.
	 */
2218 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2219 		pgd = phys_to_virt(dma_pte_addr(pgd));
2220 		if (!dma_pte_present(pgd))
2221 			return -ENOMEM;
2222 	}
2223 
2224 	level = agaw_to_level(agaw);
2225 	if (level != 4 && level != 5)
2226 		return -EINVAL;
2227 
2228 	if (level == 5)
2229 		flags |= PASID_FLAG_FL5LP;
2230 
2231 	if (domain->force_snooping)
2232 		flags |= PASID_FLAG_PAGE_SNOOP;
2233 
2234 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2235 					     domain_id_iommu(domain, iommu),
2236 					     flags);
2237 }
2238 
2239 static bool dev_is_real_dma_subdevice(struct device *dev)
2240 {
2241 	return dev && dev_is_pci(dev) &&
2242 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2243 }
2244 
2245 static int iommu_domain_identity_map(struct dmar_domain *domain,
2246 				     unsigned long first_vpfn,
2247 				     unsigned long last_vpfn)
2248 {
	/*
	 * The RMRR range might overlap with the physical memory range,
	 * so clear it first.
	 */
2253 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2254 
2255 	return __domain_mapping(domain, first_vpfn,
2256 				first_vpfn, last_vpfn - first_vpfn + 1,
2257 				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2258 }
2259 
2260 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2261 
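/*
 * Build the static identity (si) domain. Unless hardware pass-through is
 * in use, identity map all usable physical memory as well as the RMRR
 * regions so that devices with RMRRs can use the si_domain too.
 */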
2262 static int __init si_domain_init(int hw)
2263 {
2264 	struct dmar_rmrr_unit *rmrr;
2265 	struct device *dev;
2266 	int i, nid, ret;
2267 
2268 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2269 	if (!si_domain)
2270 		return -EFAULT;
2271 
2272 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2273 		domain_exit(si_domain);
2274 		si_domain = NULL;
2275 		return -EFAULT;
2276 	}
2277 
2278 	if (hw)
2279 		return 0;
2280 
2281 	for_each_online_node(nid) {
2282 		unsigned long start_pfn, end_pfn;
2283 		int i;
2284 
2285 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2286 			ret = iommu_domain_identity_map(si_domain,
2287 					mm_to_dma_pfn_start(start_pfn),
2288 					mm_to_dma_pfn_end(end_pfn));
2289 			if (ret)
2290 				return ret;
2291 		}
2292 	}
2293 
2294 	/*
	 * Identity map the RMRRs so that devices with RMRRs can also use
2296 	 * the si_domain.
2297 	 */
2298 	for_each_rmrr_units(rmrr) {
2299 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2300 					  i, dev) {
2301 			unsigned long long start = rmrr->base_address;
2302 			unsigned long long end = rmrr->end_address;
2303 
2304 			if (WARN_ON(end < start ||
2305 				    end >> agaw_to_width(si_domain->agaw)))
2306 				continue;
2307 
2308 			ret = iommu_domain_identity_map(si_domain,
2309 					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2310 					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2311 			if (ret)
2312 				return ret;
2313 		}
2314 	}
2315 
2316 	return 0;
2317 }
2318 
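/*
 * Attach @dev to @domain: bind the domain to the device's IOMMU, link the
 * device into the domain, and program the context entry (legacy mode) or
 * PASID table entry (scalable mode) that enables translation.
 */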
2319 static int dmar_domain_attach_device(struct dmar_domain *domain,
2320 				     struct device *dev)
2321 {
2322 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2323 	struct intel_iommu *iommu = info->iommu;
2324 	unsigned long flags;
2325 	int ret;
2326 
2327 	ret = domain_attach_iommu(domain, iommu);
2328 	if (ret)
2329 		return ret;
2330 	info->domain = domain;
2331 	spin_lock_irqsave(&domain->lock, flags);
2332 	list_add(&info->link, &domain->devices);
2333 	spin_unlock_irqrestore(&domain->lock, flags);
2334 
2335 	if (dev_is_real_dma_subdevice(dev))
2336 		return 0;
2337 
2338 	if (!sm_supported(iommu))
2339 		ret = domain_context_mapping(domain, dev);
2340 	else if (hw_pass_through && domain_type_is_si(domain))
2341 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2342 	else if (domain->use_first_level)
2343 		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2344 	else
2345 		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2346 
2347 	if (ret) {
2348 		device_block_translation(dev);
2349 		return ret;
2350 	}
2351 
2352 	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2353 		iommu_enable_pci_caps(info);
2354 
2355 	return 0;
2356 }
2357 
2358 /**
2359  * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2361  * @dev: device handle
2362  *
2363  * We assume that PCI USB devices with RMRRs have them largely
2364  * for historical reasons and that the RMRR space is not actively used post
2365  * boot.  This exclusion may change if vendors begin to abuse it.
2366  *
2367  * The same exception is made for graphics devices, with the requirement that
2368  * any use of the RMRR regions will be torn down before assigning the device
2369  * to a guest.
2370  *
2371  * Return: true if the RMRR is relaxable, false otherwise
2372  */
2373 static bool device_rmrr_is_relaxable(struct device *dev)
2374 {
2375 	struct pci_dev *pdev;
2376 
2377 	if (!dev_is_pci(dev))
2378 		return false;
2379 
2380 	pdev = to_pci_dev(dev);
2381 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2382 		return true;
2383 	else
2384 		return false;
2385 }
2386 
/*
 * Return the required default domain type for a specific device.
 *
 * @dev: the device in question
 *
 * Returns:
 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
 *  - 0: both identity and dynamic domains work for this device
 */
2398 static int device_def_domain_type(struct device *dev)
2399 {
2400 	if (dev_is_pci(dev)) {
2401 		struct pci_dev *pdev = to_pci_dev(dev);
2402 
2403 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2404 			return IOMMU_DOMAIN_IDENTITY;
2405 
2406 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2407 			return IOMMU_DOMAIN_IDENTITY;
2408 	}
2409 
2410 	return 0;
2411 }
2412 
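/*
 * Set up the invalidation interface for @iommu: prefer queued
 * invalidation and fall back to register based invalidation if it
 * cannot be enabled.
 */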
2413 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2414 {
	/*
	 * Start from a sane IOMMU hardware state.
	 * If queued invalidation has already been initialized by us
	 * (for example, while enabling interrupt remapping), then
	 * things are already rolling from a sane state.
	 */
2421 	if (!iommu->qi) {
2422 		/*
2423 		 * Clear any previous faults.
2424 		 */
2425 		dmar_fault(-1, iommu);
2426 		/*
2427 		 * Disable queued invalidation if supported and already enabled
2428 		 * before OS handover.
2429 		 */
2430 		dmar_disable_qi(iommu);
2431 	}
2432 
2433 	if (dmar_enable_qi(iommu)) {
2434 		/*
2435 		 * Queued Invalidate not enabled, use Register Based Invalidate
2436 		 */
2437 		iommu->flush.flush_context = __iommu_flush_context;
2438 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2439 		pr_info("%s: Using Register based invalidation\n",
2440 			iommu->name);
2441 	} else {
2442 		iommu->flush.flush_context = qi_flush_context;
2443 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2444 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2445 	}
2446 }
2447 
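/*
 * Copy the context table(s) of one bus from the previous kernel (kdump
 * case) into newly allocated pages, reserving the domain ids found there
 * and marking the entries as copied.
 */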
2448 static int copy_context_table(struct intel_iommu *iommu,
2449 			      struct root_entry *old_re,
2450 			      struct context_entry **tbl,
2451 			      int bus, bool ext)
2452 {
2453 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2454 	struct context_entry *new_ce = NULL, ce;
2455 	struct context_entry *old_ce = NULL;
2456 	struct root_entry re;
2457 	phys_addr_t old_ce_phys;
2458 
2459 	tbl_idx = ext ? bus * 2 : bus;
2460 	memcpy(&re, old_re, sizeof(re));
2461 
2462 	for (devfn = 0; devfn < 256; devfn++) {
2463 		/* First calculate the correct index */
2464 		idx = (ext ? devfn * 2 : devfn) % 256;
2465 
2466 		if (idx == 0) {
2467 			/* First save what we may have and clean up */
2468 			if (new_ce) {
2469 				tbl[tbl_idx] = new_ce;
2470 				__iommu_flush_cache(iommu, new_ce,
2471 						    VTD_PAGE_SIZE);
2472 				pos = 1;
2473 			}
2474 
2475 			if (old_ce)
2476 				memunmap(old_ce);
2477 
2478 			ret = 0;
2479 			if (devfn < 0x80)
2480 				old_ce_phys = root_entry_lctp(&re);
2481 			else
2482 				old_ce_phys = root_entry_uctp(&re);
2483 
2484 			if (!old_ce_phys) {
2485 				if (ext && devfn == 0) {
2486 					/* No LCTP, try UCTP */
2487 					devfn = 0x7f;
2488 					continue;
2489 				} else {
2490 					goto out;
2491 				}
2492 			}
2493 
2494 			ret = -ENOMEM;
2495 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2496 					MEMREMAP_WB);
2497 			if (!old_ce)
2498 				goto out;
2499 
2500 			new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2501 			if (!new_ce)
2502 				goto out_unmap;
2503 
2504 			ret = 0;
2505 		}
2506 
2507 		/* Now copy the context entry */
2508 		memcpy(&ce, old_ce + idx, sizeof(ce));
2509 
2510 		if (!context_present(&ce))
2511 			continue;
2512 
2513 		did = context_domain_id(&ce);
2514 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2515 			set_bit(did, iommu->domain_ids);
2516 
2517 		set_context_copied(iommu, bus, devfn);
2518 		new_ce[idx] = ce;
2519 	}
2520 
2521 	tbl[tbl_idx + pos] = new_ce;
2522 
2523 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2524 
2525 out_unmap:
2526 	memunmap(old_ce);
2527 
2528 out:
2529 	return ret;
2530 }
2531 
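/*
 * In a kdump kernel, copy the root and context tables left over by the
 * previous kernel so that DMA set up before the crash keeps working
 * until the devices are reset by their drivers.
 */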
2532 static int copy_translation_tables(struct intel_iommu *iommu)
2533 {
2534 	struct context_entry **ctxt_tbls;
2535 	struct root_entry *old_rt;
2536 	phys_addr_t old_rt_phys;
2537 	int ctxt_table_entries;
2538 	u64 rtaddr_reg;
2539 	int bus, ret;
2540 	bool new_ext, ext;
2541 
2542 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2543 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2544 	new_ext    = !!sm_supported(iommu);
2545 
2546 	/*
2547 	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation would open a window for data
	 * corruption. So bail out and don't copy anything if we would
2550 	 * have to change the bit.
2551 	 */
2552 	if (new_ext != ext)
2553 		return -EINVAL;
2554 
2555 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2556 	if (!iommu->copied_tables)
2557 		return -ENOMEM;
2558 
2559 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2560 	if (!old_rt_phys)
2561 		return -EINVAL;
2562 
2563 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2564 	if (!old_rt)
2565 		return -ENOMEM;
2566 
2567 	/* This is too big for the stack - allocate it from slab */
2568 	ctxt_table_entries = ext ? 512 : 256;
2569 	ret = -ENOMEM;
2570 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2571 	if (!ctxt_tbls)
2572 		goto out_unmap;
2573 
2574 	for (bus = 0; bus < 256; bus++) {
2575 		ret = copy_context_table(iommu, &old_rt[bus],
2576 					 ctxt_tbls, bus, ext);
2577 		if (ret) {
2578 			pr_err("%s: Failed to copy context table for bus %d\n",
2579 				iommu->name, bus);
2580 			continue;
2581 		}
2582 	}
2583 
2584 	spin_lock(&iommu->lock);
2585 
2586 	/* Context tables are copied, now write them to the root_entry table */
2587 	for (bus = 0; bus < 256; bus++) {
2588 		int idx = ext ? bus * 2 : bus;
2589 		u64 val;
2590 
2591 		if (ctxt_tbls[idx]) {
2592 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2593 			iommu->root_entry[bus].lo = val;
2594 		}
2595 
2596 		if (!ext || !ctxt_tbls[idx + 1])
2597 			continue;
2598 
2599 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2600 		iommu->root_entry[bus].hi = val;
2601 	}
2602 
2603 	spin_unlock(&iommu->lock);
2604 
2605 	kfree(ctxt_tbls);
2606 
2607 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2608 
2609 	ret = 0;
2610 
2611 out_unmap:
2612 	memunmap(old_rt);
2613 
2614 	return ret;
2615 }
2616 
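/*
 * Boot-time initialization of all DMAR units: set up domain id bitmaps,
 * invalidation queues and root entries (copying the old tables in the
 * kdump case), build the si_domain if needed, and enable fault and page
 * request reporting.
 */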
2617 static int __init init_dmars(void)
2618 {
2619 	struct dmar_drhd_unit *drhd;
2620 	struct intel_iommu *iommu;
2621 	int ret;
2622 
2623 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2624 	if (ret)
2625 		goto free_iommu;
2626 
2627 	for_each_iommu(iommu, drhd) {
2628 		if (drhd->ignored) {
2629 			iommu_disable_translation(iommu);
2630 			continue;
2631 		}
2632 
2633 		/*
		 * Find the max PASID size of all IOMMUs in the system.
		 * We need to ensure the system PASID table is no bigger
		 * than the smallest supported size.
2637 		 */
2638 		if (pasid_supported(iommu)) {
2639 			u32 temp = 2 << ecap_pss(iommu->ecap);
2640 
2641 			intel_pasid_max_id = min_t(u32, temp,
2642 						   intel_pasid_max_id);
2643 		}
2644 
2645 		intel_iommu_init_qi(iommu);
2646 
2647 		ret = iommu_init_domains(iommu);
2648 		if (ret)
2649 			goto free_iommu;
2650 
2651 		init_translation_status(iommu);
2652 
2653 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2654 			iommu_disable_translation(iommu);
2655 			clear_translation_pre_enabled(iommu);
2656 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2657 				iommu->name);
2658 		}
2659 
2660 		/*
2661 		 * TBD:
2662 		 * we could share the same root & context tables
		 * among all IOMMUs. Needs to be split out later.
2664 		 */
2665 		ret = iommu_alloc_root_entry(iommu);
2666 		if (ret)
2667 			goto free_iommu;
2668 
2669 		if (translation_pre_enabled(iommu)) {
2670 			pr_info("Translation already enabled - trying to copy translation structures\n");
2671 
2672 			ret = copy_translation_tables(iommu);
2673 			if (ret) {
2674 				/*
2675 				 * We found the IOMMU with translation
2676 				 * enabled - but failed to copy over the
2677 				 * old root-entry table. Try to proceed
2678 				 * by disabling translation now and
2679 				 * allocating a clean root-entry table.
2680 				 * This might cause DMAR faults, but
2681 				 * probably the dump will still succeed.
2682 				 */
2683 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2684 				       iommu->name);
2685 				iommu_disable_translation(iommu);
2686 				clear_translation_pre_enabled(iommu);
2687 			} else {
2688 				pr_info("Copied translation tables from previous kernel for %s\n",
2689 					iommu->name);
2690 			}
2691 		}
2692 
2693 		if (!ecap_pass_through(iommu->ecap))
2694 			hw_pass_through = 0;
2695 		intel_svm_check(iommu);
2696 	}
2697 
2698 	/*
2699 	 * Now that qi is enabled on all iommus, set the root entry and flush
2700 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2701 	 * flush_context function will loop forever and the boot hangs.
2702 	 */
2703 	for_each_active_iommu(iommu, drhd) {
2704 		iommu_flush_write_buffer(iommu);
2705 		iommu_set_root_entry(iommu);
2706 	}
2707 
2708 	if (!dmar_map_gfx)
2709 		iommu_identity_mapping |= IDENTMAP_GFX;
2710 
2711 	check_tylersburg_isoch();
2712 
2713 	ret = si_domain_init(hw_pass_through);
2714 	if (ret)
2715 		goto free_iommu;
2716 
2717 	/*
2718 	 * for each drhd
2719 	 *   enable fault log
2720 	 *   global invalidate context cache
2721 	 *   global invalidate iotlb
2722 	 *   enable translation
2723 	 */
2724 	for_each_iommu(iommu, drhd) {
2725 		if (drhd->ignored) {
2726 			/*
2727 			 * we always have to disable PMRs or DMA may fail on
2728 			 * this device
2729 			 */
2730 			if (force_on)
2731 				iommu_disable_protect_mem_regions(iommu);
2732 			continue;
2733 		}
2734 
2735 		iommu_flush_write_buffer(iommu);
2736 
2737 #ifdef CONFIG_INTEL_IOMMU_SVM
2738 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Calling dmar_alloc_hwirq() with dmar_global_lock
			 * held could cause a lock race condition, so drop
			 * the lock around intel_svm_enable_prq().
			 */
2743 			up_write(&dmar_global_lock);
2744 			ret = intel_svm_enable_prq(iommu);
2745 			down_write(&dmar_global_lock);
2746 			if (ret)
2747 				goto free_iommu;
2748 		}
2749 #endif
2750 		ret = dmar_set_interrupt(iommu);
2751 		if (ret)
2752 			goto free_iommu;
2753 	}
2754 
2755 	return 0;
2756 
2757 free_iommu:
2758 	for_each_active_iommu(iommu, drhd) {
2759 		disable_dmar_iommu(iommu);
2760 		free_dmar_iommu(iommu);
2761 	}
2762 	if (si_domain) {
2763 		domain_exit(si_domain);
2764 		si_domain = NULL;
2765 	}
2766 
2767 	return ret;
2768 }
2769 
2770 static void __init init_no_remapping_devices(void)
2771 {
2772 	struct dmar_drhd_unit *drhd;
2773 	struct device *dev;
2774 	int i;
2775 
2776 	for_each_drhd_unit(drhd) {
2777 		if (!drhd->include_all) {
2778 			for_each_active_dev_scope(drhd->devices,
2779 						  drhd->devices_cnt, i, dev)
2780 				break;
2781 			/* ignore DMAR unit if no devices exist */
2782 			if (i == drhd->devices_cnt)
2783 				drhd->ignored = 1;
2784 		}
2785 	}
2786 
2787 	for_each_active_drhd_unit(drhd) {
2788 		if (drhd->include_all)
2789 			continue;
2790 
2791 		for_each_active_dev_scope(drhd->devices,
2792 					  drhd->devices_cnt, i, dev)
2793 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2794 				break;
2795 		if (i < drhd->devices_cnt)
2796 			continue;
2797 
		/*
		 * This IOMMU has *only* gfx devices. Either bypass it or
		 * set the gfx_dedicated flag, as appropriate.
		 */
2800 		drhd->gfx_dedicated = 1;
2801 		if (!dmar_map_gfx)
2802 			drhd->ignored = 1;
2803 	}
2804 }
2805 
2806 #ifdef CONFIG_SUSPEND
2807 static int init_iommu_hw(void)
2808 {
2809 	struct dmar_drhd_unit *drhd;
2810 	struct intel_iommu *iommu = NULL;
2811 	int ret;
2812 
2813 	for_each_active_iommu(iommu, drhd) {
2814 		if (iommu->qi) {
2815 			ret = dmar_reenable_qi(iommu);
2816 			if (ret)
2817 				return ret;
2818 		}
2819 	}
2820 
2821 	for_each_iommu(iommu, drhd) {
2822 		if (drhd->ignored) {
2823 			/*
2824 			 * we always have to disable PMRs or DMA may fail on
2825 			 * this device
2826 			 */
2827 			if (force_on)
2828 				iommu_disable_protect_mem_regions(iommu);
2829 			continue;
2830 		}
2831 
2832 		iommu_flush_write_buffer(iommu);
2833 		iommu_set_root_entry(iommu);
2834 		iommu_enable_translation(iommu);
2835 		iommu_disable_protect_mem_regions(iommu);
2836 	}
2837 
2838 	return 0;
2839 }
2840 
2841 static void iommu_flush_all(void)
2842 {
2843 	struct dmar_drhd_unit *drhd;
2844 	struct intel_iommu *iommu;
2845 
2846 	for_each_active_iommu(iommu, drhd) {
2847 		iommu->flush.flush_context(iommu, 0, 0, 0,
2848 					   DMA_CCMD_GLOBAL_INVL);
2849 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2850 					 DMA_TLB_GLOBAL_FLUSH);
2851 	}
2852 }
2853 
2854 static int iommu_suspend(void)
2855 {
2856 	struct dmar_drhd_unit *drhd;
2857 	struct intel_iommu *iommu = NULL;
2858 	unsigned long flag;
2859 
2860 	iommu_flush_all();
2861 
2862 	for_each_active_iommu(iommu, drhd) {
2863 		iommu_disable_translation(iommu);
2864 
2865 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2866 
2867 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2868 			readl(iommu->reg + DMAR_FECTL_REG);
2869 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2870 			readl(iommu->reg + DMAR_FEDATA_REG);
2871 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2872 			readl(iommu->reg + DMAR_FEADDR_REG);
2873 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2874 			readl(iommu->reg + DMAR_FEUADDR_REG);
2875 
2876 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2877 	}
2878 	return 0;
2879 }
2880 
2881 static void iommu_resume(void)
2882 {
2883 	struct dmar_drhd_unit *drhd;
2884 	struct intel_iommu *iommu = NULL;
2885 	unsigned long flag;
2886 
2887 	if (init_iommu_hw()) {
2888 		if (force_on)
2889 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2890 		else
2891 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2892 		return;
2893 	}
2894 
2895 	for_each_active_iommu(iommu, drhd) {
2896 
2897 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2898 
2899 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2900 			iommu->reg + DMAR_FECTL_REG);
2901 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2902 			iommu->reg + DMAR_FEDATA_REG);
2903 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2904 			iommu->reg + DMAR_FEADDR_REG);
2905 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2906 			iommu->reg + DMAR_FEUADDR_REG);
2907 
2908 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2909 	}
2910 }
2911 
2912 static struct syscore_ops iommu_syscore_ops = {
2913 	.resume		= iommu_resume,
2914 	.suspend	= iommu_suspend,
2915 };
2916 
2917 static void __init init_iommu_pm_ops(void)
2918 {
2919 	register_syscore_ops(&iommu_syscore_ops);
2920 }
2921 
2922 #else
2923 static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2925 
2926 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2927 {
2928 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2929 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2930 	    rmrr->end_address <= rmrr->base_address ||
2931 	    arch_rmrr_sanity_check(rmrr))
2932 		return -EINVAL;
2933 
2934 	return 0;
2935 }
2936 
2937 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2938 {
2939 	struct acpi_dmar_reserved_memory *rmrr;
2940 	struct dmar_rmrr_unit *rmrru;
2941 
2942 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2943 	if (rmrr_sanity_check(rmrr)) {
2944 		pr_warn(FW_BUG
2945 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2946 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2947 			   rmrr->base_address, rmrr->end_address,
2948 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2949 			   dmi_get_system_info(DMI_BIOS_VERSION),
2950 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2951 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2952 	}
2953 
2954 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2955 	if (!rmrru)
2956 		goto out;
2957 
2958 	rmrru->hdr = header;
2959 
2960 	rmrru->base_address = rmrr->base_address;
2961 	rmrru->end_address = rmrr->end_address;
2962 
2963 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2964 				((void *)rmrr) + rmrr->header.length,
2965 				&rmrru->devices_cnt);
2966 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2967 		goto free_rmrru;
2968 
2969 	list_add(&rmrru->list, &dmar_rmrr_units);
2970 
2971 	return 0;
2972 free_rmrru:
2973 	kfree(rmrru);
2974 out:
2975 	return -ENOMEM;
2976 }
2977 
2978 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2979 {
2980 	struct dmar_atsr_unit *atsru;
2981 	struct acpi_dmar_atsr *tmp;
2982 
2983 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2984 				dmar_rcu_check()) {
2985 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2986 		if (atsr->segment != tmp->segment)
2987 			continue;
2988 		if (atsr->header.length != tmp->header.length)
2989 			continue;
2990 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2991 			return atsru;
2992 	}
2993 
2994 	return NULL;
2995 }
2996 
2997 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2998 {
2999 	struct acpi_dmar_atsr *atsr;
3000 	struct dmar_atsr_unit *atsru;
3001 
3002 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3003 		return 0;
3004 
3005 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3006 	atsru = dmar_find_atsr(atsr);
3007 	if (atsru)
3008 		return 0;
3009 
3010 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3011 	if (!atsru)
3012 		return -ENOMEM;
3013 
3014 	/*
3015 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3016 	 * copy the memory content because the memory buffer will be freed
3017 	 * on return.
3018 	 */
3019 	atsru->hdr = (void *)(atsru + 1);
3020 	memcpy(atsru->hdr, hdr, hdr->length);
3021 	atsru->include_all = atsr->flags & 0x1;
3022 	if (!atsru->include_all) {
3023 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3024 				(void *)atsr + atsr->header.length,
3025 				&atsru->devices_cnt);
3026 		if (atsru->devices_cnt && atsru->devices == NULL) {
3027 			kfree(atsru);
3028 			return -ENOMEM;
3029 		}
3030 	}
3031 
3032 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3033 
3034 	return 0;
3035 }
3036 
3037 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3038 {
3039 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3040 	kfree(atsru);
3041 }
3042 
3043 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3044 {
3045 	struct acpi_dmar_atsr *atsr;
3046 	struct dmar_atsr_unit *atsru;
3047 
3048 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3049 	atsru = dmar_find_atsr(atsr);
3050 	if (atsru) {
3051 		list_del_rcu(&atsru->list);
3052 		synchronize_rcu();
3053 		intel_iommu_free_atsr(atsru);
3054 	}
3055 
3056 	return 0;
3057 }
3058 
3059 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3060 {
3061 	int i;
3062 	struct device *dev;
3063 	struct acpi_dmar_atsr *atsr;
3064 	struct dmar_atsr_unit *atsru;
3065 
3066 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3067 	atsru = dmar_find_atsr(atsr);
3068 	if (!atsru)
3069 		return 0;
3070 
3071 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3072 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3073 					  i, dev)
3074 			return -EBUSY;
3075 	}
3076 
3077 	return 0;
3078 }
3079 
3080 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3081 {
3082 	struct dmar_satc_unit *satcu;
3083 	struct acpi_dmar_satc *tmp;
3084 
3085 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3086 				dmar_rcu_check()) {
3087 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3088 		if (satc->segment != tmp->segment)
3089 			continue;
3090 		if (satc->header.length != tmp->header.length)
3091 			continue;
3092 		if (memcmp(satc, tmp, satc->header.length) == 0)
3093 			return satcu;
3094 	}
3095 
3096 	return NULL;
3097 }
3098 
3099 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3100 {
3101 	struct acpi_dmar_satc *satc;
3102 	struct dmar_satc_unit *satcu;
3103 
3104 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3105 		return 0;
3106 
3107 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3108 	satcu = dmar_find_satc(satc);
3109 	if (satcu)
3110 		return 0;
3111 
3112 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3113 	if (!satcu)
3114 		return -ENOMEM;
3115 
3116 	satcu->hdr = (void *)(satcu + 1);
3117 	memcpy(satcu->hdr, hdr, hdr->length);
3118 	satcu->atc_required = satc->flags & 0x1;
3119 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3120 					      (void *)satc + satc->header.length,
3121 					      &satcu->devices_cnt);
3122 	if (satcu->devices_cnt && !satcu->devices) {
3123 		kfree(satcu);
3124 		return -ENOMEM;
3125 	}
3126 	list_add_rcu(&satcu->list, &dmar_satc_units);
3127 
3128 	return 0;
3129 }
3130 
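/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate its
 * domain id bitmap and root entry, and enable translation unless the
 * unit is ignored.
 */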
3131 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3132 {
3133 	int sp, ret;
3134 	struct intel_iommu *iommu = dmaru->iommu;
3135 
3136 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3137 	if (ret)
3138 		goto out;
3139 
3140 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3141 		pr_warn("%s: Doesn't support hardware pass through.\n",
3142 			iommu->name);
3143 		return -ENXIO;
3144 	}
3145 
3146 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3147 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3148 		pr_warn("%s: Doesn't support large page.\n",
3149 			iommu->name);
3150 		return -ENXIO;
3151 	}
3152 
3153 	/*
3154 	 * Disable translation if already enabled prior to OS handover.
3155 	 */
3156 	if (iommu->gcmd & DMA_GCMD_TE)
3157 		iommu_disable_translation(iommu);
3158 
3159 	ret = iommu_init_domains(iommu);
3160 	if (ret == 0)
3161 		ret = iommu_alloc_root_entry(iommu);
3162 	if (ret)
3163 		goto out;
3164 
3165 	intel_svm_check(iommu);
3166 
3167 	if (dmaru->ignored) {
3168 		/*
3169 		 * we always have to disable PMRs or DMA may fail on this device
3170 		 */
3171 		if (force_on)
3172 			iommu_disable_protect_mem_regions(iommu);
3173 		return 0;
3174 	}
3175 
3176 	intel_iommu_init_qi(iommu);
3177 	iommu_flush_write_buffer(iommu);
3178 
3179 #ifdef CONFIG_INTEL_IOMMU_SVM
3180 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3181 		ret = intel_svm_enable_prq(iommu);
3182 		if (ret)
3183 			goto disable_iommu;
3184 	}
3185 #endif
3186 	ret = dmar_set_interrupt(iommu);
3187 	if (ret)
3188 		goto disable_iommu;
3189 
3190 	iommu_set_root_entry(iommu);
3191 	iommu_enable_translation(iommu);
3192 
3193 	iommu_disable_protect_mem_regions(iommu);
3194 	return 0;
3195 
3196 disable_iommu:
3197 	disable_dmar_iommu(iommu);
3198 out:
3199 	free_dmar_iommu(iommu);
3200 	return ret;
3201 }
3202 
3203 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3204 {
3205 	int ret = 0;
3206 	struct intel_iommu *iommu = dmaru->iommu;
3207 
3208 	if (!intel_iommu_enabled)
3209 		return 0;
3210 	if (iommu == NULL)
3211 		return -EINVAL;
3212 
3213 	if (insert) {
3214 		ret = intel_iommu_add(dmaru);
3215 	} else {
3216 		disable_dmar_iommu(iommu);
3217 		free_dmar_iommu(iommu);
3218 	}
3219 
3220 	return ret;
3221 }
3222 
3223 static void intel_iommu_free_dmars(void)
3224 {
3225 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3226 	struct dmar_atsr_unit *atsru, *atsr_n;
3227 	struct dmar_satc_unit *satcu, *satc_n;
3228 
3229 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3230 		list_del(&rmrru->list);
3231 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3232 		kfree(rmrru);
3233 	}
3234 
3235 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3236 		list_del(&atsru->list);
3237 		intel_iommu_free_atsr(atsru);
3238 	}
3239 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3240 		list_del(&satcu->list);
3241 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3242 		kfree(satcu);
3243 	}
3244 }
3245 
3246 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3247 {
3248 	struct dmar_satc_unit *satcu;
3249 	struct acpi_dmar_satc *satc;
3250 	struct device *tmp;
3251 	int i;
3252 
3253 	dev = pci_physfn(dev);
3254 	rcu_read_lock();
3255 
3256 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3257 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3258 		if (satc->segment != pci_domain_nr(dev->bus))
3259 			continue;
3260 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3261 			if (to_pci_dev(tmp) == dev)
3262 				goto out;
3263 	}
3264 	satcu = NULL;
3265 out:
3266 	rcu_read_unlock();
3267 	return satcu;
3268 }
3269 
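/*
 * Decide whether ATS may be enabled for @dev. A SATC entry, if present,
 * is authoritative; otherwise the device must sit below a root port that
 * is covered by an ATSR unit.
 */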
3270 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3271 {
3272 	int i, ret = 1;
3273 	struct pci_bus *bus;
3274 	struct pci_dev *bridge = NULL;
3275 	struct device *tmp;
3276 	struct acpi_dmar_atsr *atsr;
3277 	struct dmar_atsr_unit *atsru;
3278 	struct dmar_satc_unit *satcu;
3279 
3280 	dev = pci_physfn(dev);
3281 	satcu = dmar_find_matched_satc_unit(dev);
3282 	if (satcu)
3283 		/*
		 * This device supports ATS as it is listed in the SATC table.
		 * When the IOMMU is in legacy mode, hardware enables ATS
		 * automatically for a device that requires it, so the OS
		 * should not enable ATS on this device, to avoid duplicate
		 * TLB invalidations.
3289 		 */
3290 		return !(satcu->atc_required && !sm_supported(iommu));
3291 
3292 	for (bus = dev->bus; bus; bus = bus->parent) {
3293 		bridge = bus->self;
3294 		/* If it's an integrated device, allow ATS */
3295 		if (!bridge)
3296 			return 1;
3297 		/* Connected via non-PCIe: no ATS */
3298 		if (!pci_is_pcie(bridge) ||
3299 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3300 			return 0;
3301 		/* If we found the root port, look it up in the ATSR */
3302 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3303 			break;
3304 	}
3305 
3306 	rcu_read_lock();
3307 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3308 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3309 		if (atsr->segment != pci_domain_nr(dev->bus))
3310 			continue;
3311 
3312 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3313 			if (tmp == &bridge->dev)
3314 				goto out;
3315 
3316 		if (atsru->include_all)
3317 			goto out;
3318 	}
3319 	ret = 0;
3320 out:
3321 	rcu_read_unlock();
3322 
3323 	return ret;
3324 }
3325 
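/*
 * Keep the RMRR, ATSR and SATC device scope lists in sync when PCI
 * devices are added to or removed from the system.
 */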
3326 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3327 {
3328 	int ret;
3329 	struct dmar_rmrr_unit *rmrru;
3330 	struct dmar_atsr_unit *atsru;
3331 	struct dmar_satc_unit *satcu;
3332 	struct acpi_dmar_atsr *atsr;
3333 	struct acpi_dmar_reserved_memory *rmrr;
3334 	struct acpi_dmar_satc *satc;
3335 
3336 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3337 		return 0;
3338 
3339 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3340 		rmrr = container_of(rmrru->hdr,
3341 				    struct acpi_dmar_reserved_memory, header);
3342 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3343 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3344 				((void *)rmrr) + rmrr->header.length,
3345 				rmrr->segment, rmrru->devices,
3346 				rmrru->devices_cnt);
3347 			if (ret < 0)
3348 				return ret;
3349 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3350 			dmar_remove_dev_scope(info, rmrr->segment,
3351 				rmrru->devices, rmrru->devices_cnt);
3352 		}
3353 	}
3354 
3355 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3356 		if (atsru->include_all)
3357 			continue;
3358 
3359 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3360 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3361 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3362 					(void *)atsr + atsr->header.length,
3363 					atsr->segment, atsru->devices,
3364 					atsru->devices_cnt);
3365 			if (ret > 0)
3366 				break;
3367 			else if (ret < 0)
3368 				return ret;
3369 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3370 			if (dmar_remove_dev_scope(info, atsr->segment,
3371 					atsru->devices, atsru->devices_cnt))
3372 				break;
3373 		}
3374 	}
3375 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3376 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3377 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3378 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3379 					(void *)satc + satc->header.length,
3380 					satc->segment, satcu->devices,
3381 					satcu->devices_cnt);
3382 			if (ret > 0)
3383 				break;
3384 			else if (ret < 0)
3385 				return ret;
3386 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3387 			if (dmar_remove_dev_scope(info, satc->segment,
3388 					satcu->devices, satcu->devices_cnt))
3389 				break;
3390 		}
3391 	}
3392 
3393 	return 0;
3394 }
3395 
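/*
 * Memory hotplug notifier: extend the si_domain identity map when memory
 * goes online and tear the mappings down again when it goes offline.
 */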
3396 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3397 				       unsigned long val, void *v)
3398 {
3399 	struct memory_notify *mhp = v;
3400 	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3401 	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3402 			mhp->nr_pages - 1);
3403 
3404 	switch (val) {
3405 	case MEM_GOING_ONLINE:
3406 		if (iommu_domain_identity_map(si_domain,
3407 					      start_vpfn, last_vpfn)) {
3408 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3409 				start_vpfn, last_vpfn);
3410 			return NOTIFY_BAD;
3411 		}
3412 		break;
3413 
3414 	case MEM_OFFLINE:
3415 	case MEM_CANCEL_ONLINE:
3416 		{
3417 			struct dmar_drhd_unit *drhd;
3418 			struct intel_iommu *iommu;
3419 			LIST_HEAD(freelist);
3420 
3421 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3422 
3423 			rcu_read_lock();
3424 			for_each_active_iommu(iommu, drhd)
3425 				iommu_flush_iotlb_psi(iommu, si_domain,
3426 					start_vpfn, mhp->nr_pages,
3427 					list_empty(&freelist), 0);
3428 			rcu_read_unlock();
3429 			put_pages_list(&freelist);
3430 		}
3431 		break;
3432 	}
3433 
3434 	return NOTIFY_OK;
3435 }
3436 
3437 static struct notifier_block intel_iommu_memory_nb = {
3438 	.notifier_call = intel_iommu_memory_notifier,
3439 	.priority = 0
3440 };
3441 
3442 static void intel_disable_iommus(void)
3443 {
3444 	struct intel_iommu *iommu = NULL;
3445 	struct dmar_drhd_unit *drhd;
3446 
3447 	for_each_iommu(iommu, drhd)
3448 		iommu_disable_translation(iommu);
3449 }
3450 
3451 void intel_iommu_shutdown(void)
3452 {
3453 	struct dmar_drhd_unit *drhd;
3454 	struct intel_iommu *iommu = NULL;
3455 
3456 	if (no_iommu || dmar_disabled)
3457 		return;
3458 
3459 	down_write(&dmar_global_lock);
3460 
3461 	/* Disable PMRs explicitly here. */
3462 	for_each_iommu(iommu, drhd)
3463 		iommu_disable_protect_mem_regions(iommu);
3464 
3465 	/* Make sure the IOMMUs are switched off */
3466 	intel_disable_iommus();
3467 
3468 	up_write(&dmar_global_lock);
3469 }
3470 
3471 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3472 {
3473 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3474 
3475 	return container_of(iommu_dev, struct intel_iommu, iommu);
3476 }
3477 
3478 static ssize_t version_show(struct device *dev,
3479 			    struct device_attribute *attr, char *buf)
3480 {
3481 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3482 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3483 	return sysfs_emit(buf, "%d:%d\n",
3484 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3485 }
3486 static DEVICE_ATTR_RO(version);
3487 
3488 static ssize_t address_show(struct device *dev,
3489 			    struct device_attribute *attr, char *buf)
3490 {
3491 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3492 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3493 }
3494 static DEVICE_ATTR_RO(address);
3495 
3496 static ssize_t cap_show(struct device *dev,
3497 			struct device_attribute *attr, char *buf)
3498 {
3499 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3500 	return sysfs_emit(buf, "%llx\n", iommu->cap);
3501 }
3502 static DEVICE_ATTR_RO(cap);
3503 
3504 static ssize_t ecap_show(struct device *dev,
3505 			 struct device_attribute *attr, char *buf)
3506 {
3507 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3508 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3509 }
3510 static DEVICE_ATTR_RO(ecap);
3511 
3512 static ssize_t domains_supported_show(struct device *dev,
3513 				      struct device_attribute *attr, char *buf)
3514 {
3515 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3516 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3517 }
3518 static DEVICE_ATTR_RO(domains_supported);
3519 
3520 static ssize_t domains_used_show(struct device *dev,
3521 				 struct device_attribute *attr, char *buf)
3522 {
3523 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3524 	return sysfs_emit(buf, "%d\n",
3525 			  bitmap_weight(iommu->domain_ids,
3526 					cap_ndoms(iommu->cap)));
3527 }
3528 static DEVICE_ATTR_RO(domains_used);
3529 
3530 static struct attribute *intel_iommu_attrs[] = {
3531 	&dev_attr_version.attr,
3532 	&dev_attr_address.attr,
3533 	&dev_attr_cap.attr,
3534 	&dev_attr_ecap.attr,
3535 	&dev_attr_domains_supported.attr,
3536 	&dev_attr_domains_used.attr,
3537 	NULL,
3538 };
3539 
3540 static struct attribute_group intel_iommu_group = {
3541 	.name = "intel-iommu",
3542 	.attrs = intel_iommu_attrs,
3543 };
3544 
3545 const struct attribute_group *intel_iommu_groups[] = {
3546 	&intel_iommu_group,
3547 	NULL,
3548 };
3549 
3550 static bool has_external_pci(void)
3551 {
3552 	struct pci_dev *pdev = NULL;
3553 
3554 	for_each_pci_dev(pdev)
3555 		if (pdev->external_facing) {
3556 			pci_dev_put(pdev);
3557 			return true;
3558 		}
3559 
3560 	return false;
3561 }
3562 
3563 static int __init platform_optin_force_iommu(void)
3564 {
3565 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3566 		return 0;
3567 
3568 	if (no_iommu || dmar_disabled)
3569 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3570 
3571 	/*
	 * If Intel-IOMMU is disabled by default, we will apply an identity
	 * map for all devices except those marked as untrusted.
3574 	 */
3575 	if (dmar_disabled)
3576 		iommu_set_default_passthrough(false);
3577 
3578 	dmar_disabled = 0;
3579 	no_iommu = 0;
3580 
3581 	return 1;
3582 }
3583 
3584 static int __init probe_acpi_namespace_devices(void)
3585 {
3586 	struct dmar_drhd_unit *drhd;
3587 	/* To avoid a -Wunused-but-set-variable warning. */
3588 	struct intel_iommu *iommu __maybe_unused;
3589 	struct device *dev;
3590 	int i, ret = 0;
3591 
3592 	for_each_active_iommu(iommu, drhd) {
3593 		for_each_active_dev_scope(drhd->devices,
3594 					  drhd->devices_cnt, i, dev) {
3595 			struct acpi_device_physical_node *pn;
3596 			struct acpi_device *adev;
3597 
3598 			if (dev->bus != &acpi_bus_type)
3599 				continue;
3600 
3601 			adev = to_acpi_device(dev);
3602 			mutex_lock(&adev->physical_node_lock);
3603 			list_for_each_entry(pn,
3604 					    &adev->physical_node_list, node) {
3605 				ret = iommu_probe_device(pn->dev);
3606 				if (ret)
3607 					break;
3608 			}
3609 			mutex_unlock(&adev->physical_node_lock);
3610 
3611 			if (ret)
3612 				return ret;
3613 		}
3614 	}
3615 
3616 	return 0;
3617 }
3618 
3619 static __init int tboot_force_iommu(void)
3620 {
3621 	if (!tboot_enabled())
3622 		return 0;
3623 
3624 	if (no_iommu || dmar_disabled)
3625 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3626 
3627 	dmar_disabled = 0;
3628 	no_iommu = 0;
3629 
3630 	return 1;
3631 }
3632 
3633 int __init intel_iommu_init(void)
3634 {
3635 	int ret = -ENODEV;
3636 	struct dmar_drhd_unit *drhd;
3637 	struct intel_iommu *iommu;
3638 
3639 	/*
3640 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3641 	 * opt in, so enforce that.
3642 	 */
3643 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3644 		    platform_optin_force_iommu();
3645 
3646 	down_write(&dmar_global_lock);
3647 	if (dmar_table_init()) {
3648 		if (force_on)
3649 			panic("tboot: Failed to initialize DMAR table\n");
3650 		goto out_free_dmar;
3651 	}
3652 
3653 	if (dmar_dev_scope_init() < 0) {
3654 		if (force_on)
3655 			panic("tboot: Failed to initialize DMAR device scope\n");
3656 		goto out_free_dmar;
3657 	}
3658 
3659 	up_write(&dmar_global_lock);
3660 
3661 	/*
	 * The bus notifier takes dmar_global_lock, so lockdep would complain
	 * if we registered it while still holding the lock; register it here
	 * with the lock dropped.
3664 	 */
3665 	dmar_register_bus_notifier();
3666 
3667 	down_write(&dmar_global_lock);
3668 
3669 	if (!no_iommu)
3670 		intel_iommu_debugfs_init();
3671 
3672 	if (no_iommu || dmar_disabled) {
3673 		/*
		 * We exit the function here to ensure the IOMMU's remapping and
		 * mempool aren't set up, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * them explicitly here. The PMRs were set up by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
3680 		 */
3681 		if (intel_iommu_tboot_noforce) {
3682 			for_each_iommu(iommu, drhd)
3683 				iommu_disable_protect_mem_regions(iommu);
3684 		}
3685 
3686 		/*
3687 		 * Make sure the IOMMUs are switched off, even when we
3688 		 * boot into a kexec kernel and the previous kernel left
3689 		 * them enabled
3690 		 */
3691 		intel_disable_iommus();
3692 		goto out_free_dmar;
3693 	}
3694 
3695 	if (list_empty(&dmar_rmrr_units))
3696 		pr_info("No RMRR found\n");
3697 
3698 	if (list_empty(&dmar_atsr_units))
3699 		pr_info("No ATSR found\n");
3700 
3701 	if (list_empty(&dmar_satc_units))
3702 		pr_info("No SATC found\n");
3703 
3704 	init_no_remapping_devices();
3705 
3706 	ret = init_dmars();
3707 	if (ret) {
3708 		if (force_on)
3709 			panic("tboot: Failed to initialize DMARs\n");
3710 		pr_err("Initialization failed\n");
3711 		goto out_free_dmar;
3712 	}
3713 	up_write(&dmar_global_lock);
3714 
3715 	init_iommu_pm_ops();
3716 
3717 	down_read(&dmar_global_lock);
3718 	for_each_active_iommu(iommu, drhd) {
3719 		/*
3720 		 * The flush queue implementation does not perform
3721 		 * page-selective invalidations that are required for efficient
3722 		 * TLB flushes in virtual environments.  The benefit of batching
3723 		 * is likely to be much lower than the overhead of synchronizing
3724 		 * the virtual and physical IOMMU page-tables.
3725 		 */
3726 		if (cap_caching_mode(iommu->cap) &&
3727 		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3728 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3729 			iommu_set_dma_strict();
3730 		}
3731 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3732 				       intel_iommu_groups,
3733 				       "%s", iommu->name);
3734 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3735 
3736 		iommu_pmu_register(iommu);
3737 	}
3738 	up_read(&dmar_global_lock);
3739 
3740 	if (si_domain && !hw_pass_through)
3741 		register_memory_notifier(&intel_iommu_memory_nb);
3742 
3743 	down_read(&dmar_global_lock);
3744 	if (probe_acpi_namespace_devices())
3745 		pr_warn("ACPI name space devices didn't probe correctly\n");
3746 
3747 	/* Finally, we enable the DMA remapping hardware. */
3748 	for_each_iommu(iommu, drhd) {
3749 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3750 			iommu_enable_translation(iommu);
3751 
3752 		iommu_disable_protect_mem_regions(iommu);
3753 	}
3754 	up_read(&dmar_global_lock);
3755 
3756 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3757 
3758 	intel_iommu_enabled = 1;
3759 
3760 	return 0;
3761 
3762 out_free_dmar:
3763 	intel_iommu_free_dmars();
3764 	up_write(&dmar_global_lock);
3765 	return ret;
3766 }
3767 
3768 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3769 {
3770 	struct device_domain_info *info = opaque;
3771 
3772 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3773 	return 0;
3774 }
3775 
3776 /*
3777  * NB - intel-iommu lacks any sort of reference counting for the users of
3778  * dependent devices.  If multiple endpoints have intersecting dependent
3779  * devices, unbinding the driver from any one of them will possibly leave
3780  * the others unable to operate.
3781  */
3782 static void domain_context_clear(struct device_domain_info *info)
3783 {
3784 	if (!dev_is_pci(info->dev)) {
3785 		domain_context_clear_one(info, info->bus, info->devfn);
		/* A non-PCI device has no DMA aliases to walk. */
		return;
	}
3786 
3787 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3788 			       &domain_context_clear_one_cb, info);
3789 }
3790 
3791 /*
3792  * Clear the page table pointer in context or pasid table entries so that
3793  * all DMA requests without PASID from the device are blocked. If the page
3794  * table has been set, clean up the data structures.
3795  */
3796 void device_block_translation(struct device *dev)
3797 {
3798 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3799 	struct intel_iommu *iommu = info->iommu;
3800 	unsigned long flags;
3801 
3802 	iommu_disable_pci_caps(info);
3803 	if (!dev_is_real_dma_subdevice(dev)) {
3804 		if (sm_supported(iommu))
3805 			intel_pasid_tear_down_entry(iommu, dev,
3806 						    IOMMU_NO_PASID, false);
3807 		else
3808 			domain_context_clear(info);
3809 	}
3810 
3811 	if (!info->domain)
3812 		return;
3813 
3814 	spin_lock_irqsave(&info->domain->lock, flags);
3815 	list_del(&info->link);
3816 	spin_unlock_irqrestore(&info->domain->lock, flags);
3817 
3818 	domain_detach_iommu(info->domain, iommu);
3819 	info->domain = NULL;
3820 }
3821 
3822 static int md_domain_init(struct dmar_domain *domain, int guest_width)
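/*
 * A worked example of the AGAW calculation below (illustrative only): for
 * the DEFAULT_DOMAIN_ADDRESS_WIDTH of 57 passed by intel_iommu_domain_alloc(),
 * guestwidth_to_adjustwidth() leaves the width at 57 because (57 - 12) is a
 * multiple of the 9-bit level stride, and width_to_agaw(57) then yields
 * AGAW 3, i.e. a 5-level page table. A 48-bit guest width would map to
 * AGAW 2 (4-level) in the same way.
 */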
3823 {
3824 	int adjust_width;
3825 
3826 	/* calculate AGAW */
3827 	domain->gaw = guest_width;
3828 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3829 	domain->agaw = width_to_agaw(adjust_width);
3830 
3831 	domain->iommu_coherency = false;
3832 	domain->iommu_superpage = 0;
3833 	domain->max_addr = 0;
3834 
3835 	/* always allocate the top pgd */
3836 	domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3837 	if (!domain->pgd)
3838 		return -ENOMEM;
3839 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3840 	return 0;
3841 }
3842 
3843 static int blocking_domain_attach_dev(struct iommu_domain *domain,
3844 				      struct device *dev)
3845 {
3846 	device_block_translation(dev);
3847 	return 0;
3848 }
3849 
3850 static struct iommu_domain blocking_domain = {
3851 	.type = IOMMU_DOMAIN_BLOCKED,
3852 	.ops = &(const struct iommu_domain_ops) {
3853 		.attach_dev	= blocking_domain_attach_dev,
3854 	}
3855 };
3856 
3857 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3858 {
3859 	struct dmar_domain *dmar_domain;
3860 	struct iommu_domain *domain;
3861 
3862 	switch (type) {
3863 	case IOMMU_DOMAIN_DMA:
3864 	case IOMMU_DOMAIN_UNMANAGED:
3865 		dmar_domain = alloc_domain(type);
3866 		if (!dmar_domain) {
3867 			pr_err("Can't allocate dmar_domain\n");
3868 			return NULL;
3869 		}
3870 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3871 			pr_err("Domain initialization failed\n");
3872 			domain_exit(dmar_domain);
3873 			return NULL;
3874 		}
3875 
3876 		domain = &dmar_domain->domain;
3877 		domain->geometry.aperture_start = 0;
3878 		domain->geometry.aperture_end   =
3879 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3880 		domain->geometry.force_aperture = true;
3881 
3882 		return domain;
3883 	case IOMMU_DOMAIN_IDENTITY:
3884 		return &si_domain->domain;
3885 	case IOMMU_DOMAIN_SVA:
3886 		return intel_svm_domain_alloc();
3887 	default:
3888 		return NULL;
3889 	}
3890 
3891 	return NULL;
3892 }
3893 
3894 static struct iommu_domain *
3895 intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3896 			      struct iommu_domain *parent,
3897 			      const struct iommu_user_data *user_data)
3898 {
3899 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3900 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3901 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3902 	struct intel_iommu *iommu = info->iommu;
3903 	struct dmar_domain *dmar_domain;
3904 	struct iommu_domain *domain;
3905 
3906 	/* Must be NESTING domain */
3907 	if (parent) {
3908 		if (!nested_supported(iommu) || flags)
3909 			return ERR_PTR(-EOPNOTSUPP);
3910 		return intel_nested_domain_alloc(parent, user_data);
3911 	}
3912 
3913 	if (flags &
3914 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3915 		return ERR_PTR(-EOPNOTSUPP);
3916 	if (nested_parent && !nested_supported(iommu))
3917 		return ERR_PTR(-EOPNOTSUPP);
3918 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3919 		return ERR_PTR(-EOPNOTSUPP);
3920 
3921 	/*
3922 	 * The domain_alloc_user op needs to fully initialize a domain before
3923 	 * returning, so use iommu_domain_alloc() here for simplicity.
3924 	 */
3925 	domain = iommu_domain_alloc(dev->bus);
3926 	if (!domain)
3927 		return ERR_PTR(-ENOMEM);
3928 
3929 	dmar_domain = to_dmar_domain(domain);
3930 
3931 	if (nested_parent) {
3932 		dmar_domain->nested_parent = true;
3933 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3934 		spin_lock_init(&dmar_domain->s1_lock);
3935 	}
3936 
3937 	if (dirty_tracking) {
3938 		if (dmar_domain->use_first_level) {
3939 			iommu_domain_free(domain);
3940 			return ERR_PTR(-EOPNOTSUPP);
3941 		}
3942 		domain->dirty_ops = &intel_dirty_ops;
3943 	}
3944 
3945 	return domain;
3946 }
3947 
3948 static void intel_iommu_domain_free(struct iommu_domain *domain)
3949 {
3950 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3951 
3952 	WARN_ON(dmar_domain->nested_parent &&
3953 		!list_empty(&dmar_domain->s1_domains));
3954 	if (domain != &si_domain->domain)
3955 		domain_exit(dmar_domain);
3956 }
3957 
3958 int prepare_domain_attach_device(struct iommu_domain *domain,
3959 				 struct device *dev)
3960 {
3961 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3962 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3963 	struct intel_iommu *iommu = info->iommu;
3964 	int addr_width;
3965 
3966 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3967 		return -EINVAL;
3968 
3969 	if (domain->dirty_ops && !ssads_supported(iommu))
3970 		return -EINVAL;
3971 
3972 	/* check if this iommu agaw is sufficient for max mapped address */
3973 	addr_width = agaw_to_width(iommu->agaw);
3974 	if (addr_width > cap_mgaw(iommu->cap))
3975 		addr_width = cap_mgaw(iommu->cap);
3976 
3977 	if (dmar_domain->max_addr > (1LL << addr_width))
3978 		return -EINVAL;
3979 	dmar_domain->gaw = addr_width;
3980 
3981 	/*
3982 	 * Knock out extra levels of page tables if necessary
3983 	 */
3984 	while (iommu->agaw < dmar_domain->agaw) {
3985 		struct dma_pte *pte;
3986 
3987 		pte = dmar_domain->pgd;
3988 		if (dma_pte_present(pte)) {
3989 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3990 			free_pgtable_page(pte);
3991 		}
3992 		dmar_domain->agaw--;
3993 	}
3994 
3995 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3996 	    context_copied(iommu, info->bus, info->devfn))
3997 		return intel_pasid_setup_sm_context(dev);
3998 
3999 	return 0;
4000 }
4001 
4002 static int intel_iommu_attach_device(struct iommu_domain *domain,
4003 				     struct device *dev)
4004 {
4005 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4006 	int ret;
4007 
4008 	if (info->domain)
4009 		device_block_translation(dev);
4010 
4011 	ret = prepare_domain_attach_device(domain, dev);
4012 	if (ret)
4013 		return ret;
4014 
4015 	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4016 }
4017 
4018 static int intel_iommu_map(struct iommu_domain *domain,
4019 			   unsigned long iova, phys_addr_t hpa,
4020 			   size_t size, int iommu_prot, gfp_t gfp)
4021 {
4022 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4023 	u64 max_addr;
4024 	int prot = 0;
4025 
4026 	if (iommu_prot & IOMMU_READ)
4027 		prot |= DMA_PTE_READ;
4028 	if (iommu_prot & IOMMU_WRITE)
4029 		prot |= DMA_PTE_WRITE;
4030 	if (dmar_domain->set_pte_snp)
4031 		prot |= DMA_PTE_SNP;
4032 
4033 	max_addr = iova + size;
4034 	if (dmar_domain->max_addr < max_addr) {
4035 		u64 end;
4036 
4037 		/* check if minimum agaw is sufficient for mapped address */
4038 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4039 		if (end < max_addr) {
4040 			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4042 			       __func__, dmar_domain->gaw, max_addr);
4043 			return -EFAULT;
4044 		}
4045 		dmar_domain->max_addr = max_addr;
4046 	}
4047 	/* Round up size to the next multiple of PAGE_SIZE if it and
4048 	   the low bits of hpa would take us onto the next page. */
4049 	size = aligned_nrpages(hpa, size);
4050 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4051 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4052 }
4053 
4054 static int intel_iommu_map_pages(struct iommu_domain *domain,
4055 				 unsigned long iova, phys_addr_t paddr,
4056 				 size_t pgsize, size_t pgcount,
4057 				 int prot, gfp_t gfp, size_t *mapped)
4058 {
4059 	unsigned long pgshift = __ffs(pgsize);
4060 	size_t size = pgcount << pgshift;
4061 	int ret;
4062 
4063 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4064 		return -EINVAL;
4065 
4066 	if (!IS_ALIGNED(iova | paddr, pgsize))
4067 		return -EINVAL;
4068 
4069 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4070 	if (!ret && mapped)
4071 		*mapped = size;
4072 
4073 	return ret;
4074 }
4075 
4076 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4077 				unsigned long iova, size_t size,
4078 				struct iommu_iotlb_gather *gather)
4079 {
4080 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4081 	unsigned long start_pfn, last_pfn;
4082 	int level = 0;
4083 
4084 	/* Cope with horrid API which requires us to unmap more than the
4085 	   size argument if it happens to be a large-page mapping. */
4086 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4087 				     &level, GFP_ATOMIC)))
4088 		return 0;
4089 
4090 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4091 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4092 
4093 	start_pfn = iova >> VTD_PAGE_SHIFT;
4094 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4095 
4096 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4097 
4098 	if (dmar_domain->max_addr == iova + size)
4099 		dmar_domain->max_addr = iova;
4100 
4101 	/*
4102 	 * We do not use page-selective IOTLB invalidation in the flush
4103 	 * queue, so there is no need to track pages and sync the IOTLB.
4104 	 */
4105 	if (!iommu_iotlb_gather_queued(gather))
4106 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
4107 
4108 	return size;
4109 }
4110 
4111 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4112 				      unsigned long iova,
4113 				      size_t pgsize, size_t pgcount,
4114 				      struct iommu_iotlb_gather *gather)
4115 {
4116 	unsigned long pgshift = __ffs(pgsize);
4117 	size_t size = pgcount << pgshift;
4118 
4119 	return intel_iommu_unmap(domain, iova, size, gather);
4120 }
4121 
4122 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4123 				 struct iommu_iotlb_gather *gather)
4124 {
4125 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4126 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4127 	size_t size = gather->end - gather->start;
4128 	struct iommu_domain_info *info;
4129 	unsigned long start_pfn;
4130 	unsigned long nrpages;
4131 	unsigned long i;
4132 
4133 	nrpages = aligned_nrpages(gather->start, size);
4134 	start_pfn = mm_to_dma_pfn_start(iova_pfn);
4135 
4136 	xa_for_each(&dmar_domain->iommu_array, i, info)
4137 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4138 				      start_pfn, nrpages,
4139 				      list_empty(&gather->freelist), 0);
4140 
4141 	if (dmar_domain->nested_parent)
4142 		parent_domain_flush(dmar_domain, start_pfn, nrpages,
4143 				    list_empty(&gather->freelist));
4144 	put_pages_list(&gather->freelist);
4145 }
4146 
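/*
 * A worked example of the offset arithmetic below (illustrative only): for
 * an IOVA covered by a 2MiB superpage, the PTE is found at level 2,
 * level_to_offset_bits(2) is 9, so the low 9 + 12 = 21 bits of the IOVA are
 * kept as the offset within the superpage and added to the page-frame
 * address taken from the PTE.
 */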
4147 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4148 					    dma_addr_t iova)
4149 {
4150 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4151 	struct dma_pte *pte;
4152 	int level = 0;
4153 	u64 phys = 0;
4154 
4155 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4156 			     GFP_ATOMIC);
4157 	if (pte && dma_pte_present(pte))
4158 		phys = dma_pte_addr(pte) +
4159 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4160 						VTD_PAGE_SHIFT) - 1));
4161 
4162 	return phys;
4163 }
4164 
4165 static bool domain_support_force_snooping(struct dmar_domain *domain)
4166 {
4167 	struct device_domain_info *info;
4168 	bool support = true;
4169 
4170 	assert_spin_locked(&domain->lock);
4171 	list_for_each_entry(info, &domain->devices, link) {
4172 		if (!ecap_sc_support(info->iommu->ecap)) {
4173 			support = false;
4174 			break;
4175 		}
4176 	}
4177 
4178 	return support;
4179 }
4180 
4181 static void domain_set_force_snooping(struct dmar_domain *domain)
4182 {
4183 	struct device_domain_info *info;
4184 
4185 	assert_spin_locked(&domain->lock);
4186 	/*
4187 	 * The second-level page table supports per-PTE snoop control. The
4188 	 * iommu_map() interface will handle this by setting the SNP bit.
4189 	 */
4190 	if (!domain->use_first_level) {
4191 		domain->set_pte_snp = true;
4192 		return;
4193 	}
4194 
4195 	list_for_each_entry(info, &domain->devices, link)
4196 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4197 						     IOMMU_NO_PASID);
4198 }
4199 
4200 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4201 {
4202 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4203 	unsigned long flags;
4204 
4205 	if (dmar_domain->force_snooping)
4206 		return true;
4207 
4208 	spin_lock_irqsave(&dmar_domain->lock, flags);
4209 	if (!domain_support_force_snooping(dmar_domain) ||
4210 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4211 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4212 		return false;
4213 	}
4214 
4215 	domain_set_force_snooping(dmar_domain);
4216 	dmar_domain->force_snooping = true;
4217 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4218 
4219 	return true;
4220 }
4221 
4222 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4223 {
4224 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4225 
4226 	switch (cap) {
4227 	case IOMMU_CAP_CACHE_COHERENCY:
4228 	case IOMMU_CAP_DEFERRED_FLUSH:
4229 		return true;
4230 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
4231 		return dmar_platform_optin();
4232 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4233 		return ecap_sc_support(info->iommu->ecap);
4234 	case IOMMU_CAP_DIRTY_TRACKING:
4235 		return ssads_supported(info->iommu);
4236 	default:
4237 		return false;
4238 	}
4239 }
4240 
4241 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4242 {
4243 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4244 	struct device_domain_info *info;
4245 	struct intel_iommu *iommu;
4246 	u8 bus, devfn;
4247 	int ret;
4248 
4249 	iommu = device_lookup_iommu(dev, &bus, &devfn);
4250 	if (!iommu || !iommu->iommu.ops)
4251 		return ERR_PTR(-ENODEV);
4252 
4253 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4254 	if (!info)
4255 		return ERR_PTR(-ENOMEM);
4256 
4257 	if (dev_is_real_dma_subdevice(dev)) {
4258 		info->bus = pdev->bus->number;
4259 		info->devfn = pdev->devfn;
4260 		info->segment = pci_domain_nr(pdev->bus);
4261 	} else {
4262 		info->bus = bus;
4263 		info->devfn = devfn;
4264 		info->segment = iommu->segment;
4265 	}
4266 
4267 	info->dev = dev;
4268 	info->iommu = iommu;
4269 	if (dev_is_pci(dev)) {
4270 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4271 		    pci_ats_supported(pdev) &&
4272 		    dmar_ats_supported(pdev, iommu)) {
4273 			info->ats_supported = 1;
4274 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4275 
4276 			/*
4277 			 * For an IOMMU that supports device IOTLB throttling
4278 			 * (DIT), we assign the PFSID to the invalidation
4279 			 * descriptor of a VF so that the IOMMU HW can gauge
4280 			 * queue depth at the PF level. If DIT is not supported,
4281 			 * the PFSID field is treated as reserved and should be
			 * set to 0.
4282 			 */
4283 			if (ecap_dit(iommu->ecap))
4284 				info->pfsid = pci_dev_id(pci_physfn(pdev));
4285 			info->ats_qdep = pci_ats_queue_depth(pdev);
4286 		}
4287 		if (sm_supported(iommu)) {
4288 			if (pasid_supported(iommu)) {
4289 				int features = pci_pasid_features(pdev);
4290 
4291 				if (features >= 0)
4292 					info->pasid_supported = features | 1;
4293 			}
4294 
4295 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4296 			    pci_pri_supported(pdev))
4297 				info->pri_supported = 1;
4298 		}
4299 	}
4300 
4301 	dev_iommu_priv_set(dev, info);
4302 	if (pdev && pci_ats_supported(pdev)) {
4303 		ret = device_rbtree_insert(iommu, info);
4304 		if (ret)
4305 			goto free;
4306 	}
4307 
4308 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4309 		ret = intel_pasid_alloc_table(dev);
4310 		if (ret) {
4311 			dev_err(dev, "PASID table allocation failed\n");
4312 			goto clear_rbtree;
4313 		}
4314 
4315 		if (!context_copied(iommu, info->bus, info->devfn)) {
4316 			ret = intel_pasid_setup_sm_context(dev);
4317 			if (ret)
4318 				goto free_table;
4319 		}
4320 	}
4321 
4322 	intel_iommu_debugfs_create_dev(info);
4323 
4324 	return &iommu->iommu;
4325 free_table:
4326 	intel_pasid_free_table(dev);
4327 clear_rbtree:
4328 	device_rbtree_remove(info);
4329 free:
4330 	kfree(info);
4331 
4332 	return ERR_PTR(ret);
4333 }
4334 
4335 static void intel_iommu_release_device(struct device *dev)
4336 {
4337 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4338 	struct intel_iommu *iommu = info->iommu;
4339 
4340 	mutex_lock(&iommu->iopf_lock);
4341 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4342 		device_rbtree_remove(info);
4343 	mutex_unlock(&iommu->iopf_lock);
4344 
4345 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4346 	    !context_copied(iommu, info->bus, info->devfn))
4347 		intel_pasid_teardown_sm_context(dev);
4348 
4349 	intel_pasid_free_table(dev);
4350 	intel_iommu_debugfs_remove_dev(info);
4351 	kfree(info);
4352 	set_dma_ops(dev, NULL);
4353 }
4354 
4355 static void intel_iommu_probe_finalize(struct device *dev)
4356 {
4357 	set_dma_ops(dev, NULL);
4358 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4359 }
4360 
4361 static void intel_iommu_get_resv_regions(struct device *device,
4362 					 struct list_head *head)
4363 {
4364 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4365 	struct iommu_resv_region *reg;
4366 	struct dmar_rmrr_unit *rmrr;
4367 	struct device *i_dev;
4368 	int i;
4369 
4370 	rcu_read_lock();
4371 	for_each_rmrr_units(rmrr) {
4372 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4373 					  i, i_dev) {
4374 			struct iommu_resv_region *resv;
4375 			enum iommu_resv_type type;
4376 			size_t length;
4377 
4378 			if (i_dev != device &&
4379 			    !is_downstream_to_pci_bridge(device, i_dev))
4380 				continue;
4381 
4382 			length = rmrr->end_address - rmrr->base_address + 1;
4383 
4384 			type = device_rmrr_is_relaxable(device) ?
4385 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4386 
4387 			resv = iommu_alloc_resv_region(rmrr->base_address,
4388 						       length, prot, type,
4389 						       GFP_ATOMIC);
4390 			if (!resv)
4391 				break;
4392 
4393 			list_add_tail(&resv->list, head);
4394 		}
4395 	}
4396 	rcu_read_unlock();
4397 
4398 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4399 	if (dev_is_pci(device)) {
4400 		struct pci_dev *pdev = to_pci_dev(device);
4401 
4402 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4403 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4404 					IOMMU_RESV_DIRECT_RELAXABLE,
4405 					GFP_KERNEL);
4406 			if (reg)
4407 				list_add_tail(&reg->list, head);
4408 		}
4409 	}
4410 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4411 
4412 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4413 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4414 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4415 	if (!reg)
4416 		return;
4417 	list_add_tail(&reg->list, head);
4418 }
4419 
4420 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4421 {
4422 	if (dev_is_pci(dev))
4423 		return pci_device_group(dev);
4424 	return generic_device_group(dev);
4425 }
4426 
4427 static int intel_iommu_enable_sva(struct device *dev)
4428 {
4429 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4430 	struct intel_iommu *iommu;
4431 
4432 	if (!info || dmar_disabled)
4433 		return -EINVAL;
4434 
4435 	iommu = info->iommu;
4436 	if (!iommu)
4437 		return -EINVAL;
4438 
4439 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4440 		return -ENODEV;
4441 
4442 	if (!info->pasid_enabled || !info->ats_enabled)
4443 		return -EINVAL;
4444 
4445 	/*
4446 	 * Devices with device-specific I/O fault handling should not
4447 	 * support PCI/PRI. The IOMMU side has no means to check the
4448 	 * capability of device-specific IOPF. Therefore, the IOMMU can only
4449 	 * assume that if the device driver enables SVA on a non-PRI
4450 	 * device, it will handle IOPF in its own way.
4451 	 */
4452 	if (!info->pri_supported)
4453 		return 0;
4454 
4455 	/* Devices supporting PRI should have it enabled. */
4456 	if (!info->pri_enabled)
4457 		return -EINVAL;
4458 
4459 	return 0;
4460 }
4461 
4462 static int intel_iommu_enable_iopf(struct device *dev)
4463 {
4464 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4465 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4466 	struct intel_iommu *iommu;
4467 	int ret;
4468 
4469 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4470 		return -ENODEV;
4471 
4472 	if (info->pri_enabled)
4473 		return -EBUSY;
4474 
4475 	iommu = info->iommu;
4476 	if (!iommu)
4477 		return -EINVAL;
4478 
4479 	/* PASID is required in PRG Response Message. */
4480 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4481 		return -EINVAL;
4482 
4483 	ret = pci_reset_pri(pdev);
4484 	if (ret)
4485 		return ret;
4486 
4487 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4488 	if (ret)
4489 		return ret;
4490 
4491 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4492 	if (ret) {
4493 		iopf_queue_remove_device(iommu->iopf_queue, dev);
4494 		return ret;
4495 	}
4496 
4497 	info->pri_enabled = 1;
4498 
4499 	return 0;
4500 }
4501 
4502 static int intel_iommu_disable_iopf(struct device *dev)
4503 {
4504 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4505 	struct intel_iommu *iommu = info->iommu;
4506 
4507 	if (!info->pri_enabled)
4508 		return -EINVAL;
4509 
4510 	/*
4511 	 * The PCIe spec states that after the PRI enable bit is cleared, the
4512 	 * Page Request Interface will not issue new page requests, but may
4513 	 * still have outstanding page requests that have been transmitted or
4514 	 * are queued for transmission. This is supposed to be called after
4515 	 * the device driver has stopped DMA, all PASIDs have been
4516 	 * unbound and the outstanding PRQs have been drained.
4517 	 */
4518 	pci_disable_pri(to_pci_dev(dev));
4519 	info->pri_enabled = 0;
4520 	iopf_queue_remove_device(iommu->iopf_queue, dev);
4521 
4522 	return 0;
4523 }
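
/*
 * A minimal teardown-order sketch from the device driver's side, matching
 * the expectation documented in intel_iommu_disable_iopf() above. This is
 * illustrative only, uses the generic iommu APIs, and elides error handling
 * and the driver-specific quiescing steps:
 *
 *	... stop the device from issuing DMA and new page requests ...
 *	iommu_sva_unbind_device(handle);	(for each bound PASID)
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 */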
4524 
4525 static int
4526 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4527 {
4528 	switch (feat) {
4529 	case IOMMU_DEV_FEAT_IOPF:
4530 		return intel_iommu_enable_iopf(dev);
4531 
4532 	case IOMMU_DEV_FEAT_SVA:
4533 		return intel_iommu_enable_sva(dev);
4534 
4535 	default:
4536 		return -ENODEV;
4537 	}
4538 }
4539 
4540 static int
4541 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4542 {
4543 	switch (feat) {
4544 	case IOMMU_DEV_FEAT_IOPF:
4545 		return intel_iommu_disable_iopf(dev);
4546 
4547 	case IOMMU_DEV_FEAT_SVA:
4548 		return 0;
4549 
4550 	default:
4551 		return -ENODEV;
4552 	}
4553 }
4554 
4555 static bool intel_iommu_is_attach_deferred(struct device *dev)
4556 {
4557 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4558 
4559 	return translation_pre_enabled(info->iommu) && !info->domain;
4560 }
4561 
4562 /*
4563  * Check that the device does not live on an external facing PCI port that is
4564  * marked as untrusted. Such devices should not be able to apply quirks and
4565  * thus not be able to bypass the IOMMU restrictions.
4566  */
4567 static bool risky_device(struct pci_dev *pdev)
4568 {
4569 	if (pdev->untrusted) {
4570 		pci_info(pdev,
4571 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4572 			 pdev->vendor, pdev->device);
4573 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4574 		return true;
4575 	}
4576 	return false;
4577 }
4578 
4579 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4580 				      unsigned long iova, size_t size)
4581 {
4582 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4583 	unsigned long pages = aligned_nrpages(iova, size);
4584 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4585 	struct iommu_domain_info *info;
4586 	unsigned long i;
4587 
4588 	xa_for_each(&dmar_domain->iommu_array, i, info)
4589 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4590 	return 0;
4591 }
4592 
4593 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4594 {
4595 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4596 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4597 	struct intel_iommu *iommu = info->iommu;
4598 	struct dmar_domain *dmar_domain;
4599 	struct iommu_domain *domain;
4600 	unsigned long flags;
4601 
4602 	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4603 	if (WARN_ON_ONCE(!domain))
4604 		goto out_tear_down;
4605 
4606 	/*
4607 	 * The SVA implementation needs to handle its own details, such as the
4608 	 * mm notification. Until that code is consolidated into the iommu
4609 	 * core, let the intel sva code handle it.
4610 	 */
4611 	if (domain->type == IOMMU_DOMAIN_SVA) {
4612 		intel_svm_remove_dev_pasid(dev, pasid);
4613 		goto out_tear_down;
4614 	}
4615 
4616 	dmar_domain = to_dmar_domain(domain);
4617 	spin_lock_irqsave(&dmar_domain->lock, flags);
4618 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4619 		if (curr->dev == dev && curr->pasid == pasid) {
4620 			list_del(&curr->link_domain);
4621 			dev_pasid = curr;
4622 			break;
4623 		}
4624 	}
4625 	WARN_ON_ONCE(!dev_pasid);
4626 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4627 
4628 	domain_detach_iommu(dmar_domain, iommu);
4629 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4630 	kfree(dev_pasid);
4631 out_tear_down:
4632 	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4633 	intel_drain_pasid_prq(dev, pasid);
4634 }
4635 
4636 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4637 				     struct device *dev, ioasid_t pasid)
4638 {
4639 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4640 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4641 	struct intel_iommu *iommu = info->iommu;
4642 	struct dev_pasid_info *dev_pasid;
4643 	unsigned long flags;
4644 	int ret;
4645 
4646 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4647 		return -EOPNOTSUPP;
4648 
4649 	if (domain->dirty_ops)
4650 		return -EINVAL;
4651 
4652 	if (context_copied(iommu, info->bus, info->devfn))
4653 		return -EBUSY;
4654 
4655 	ret = prepare_domain_attach_device(domain, dev);
4656 	if (ret)
4657 		return ret;
4658 
4659 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4660 	if (!dev_pasid)
4661 		return -ENOMEM;
4662 
4663 	ret = domain_attach_iommu(dmar_domain, iommu);
4664 	if (ret)
4665 		goto out_free;
4666 
4667 	if (domain_type_is_si(dmar_domain))
4668 		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4669 	else if (dmar_domain->use_first_level)
4670 		ret = domain_setup_first_level(iommu, dmar_domain,
4671 					       dev, pasid);
4672 	else
4673 		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4674 						     dev, pasid);
4675 	if (ret)
4676 		goto out_detach_iommu;
4677 
4678 	dev_pasid->dev = dev;
4679 	dev_pasid->pasid = pasid;
4680 	spin_lock_irqsave(&dmar_domain->lock, flags);
4681 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4682 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4683 
4684 	if (domain->type & __IOMMU_DOMAIN_PAGING)
4685 		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4686 
4687 	return 0;
4688 out_detach_iommu:
4689 	domain_detach_iommu(dmar_domain, iommu);
4690 out_free:
4691 	kfree(dev_pasid);
4692 	return ret;
4693 }
4694 
4695 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4696 {
4697 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4698 	struct intel_iommu *iommu = info->iommu;
4699 	struct iommu_hw_info_vtd *vtd;
4700 
4701 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4702 	if (!vtd)
4703 		return ERR_PTR(-ENOMEM);
4704 
4705 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4706 	vtd->cap_reg = iommu->cap;
4707 	vtd->ecap_reg = iommu->ecap;
4708 	*length = sizeof(*vtd);
4709 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4710 	return vtd;
4711 }
4712 
4713 /*
4714  * Set dirty tracking for the device list of a domain. The caller must
4715  * hold the domain->lock when calling it.
4716  */
4717 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4718 {
4719 	struct device_domain_info *info;
4720 	int ret = 0;
4721 
4722 	list_for_each_entry(info, devices, link) {
4723 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4724 						       IOMMU_NO_PASID, enable);
4725 		if (ret)
4726 			break;
4727 	}
4728 
4729 	return ret;
4730 }
4731 
4732 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4733 					    bool enable)
4734 {
4735 	struct dmar_domain *s1_domain;
4736 	unsigned long flags;
4737 	int ret;
4738 
4739 	spin_lock(&domain->s1_lock);
4740 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4741 		spin_lock_irqsave(&s1_domain->lock, flags);
4742 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4743 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4744 		if (ret)
4745 			goto err_unwind;
4746 	}
4747 	spin_unlock(&domain->s1_lock);
4748 	return 0;
4749 
4750 err_unwind:
4751 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4752 		spin_lock_irqsave(&s1_domain->lock, flags);
4753 		device_set_dirty_tracking(&s1_domain->devices,
4754 					  domain->dirty_tracking);
4755 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4756 	}
4757 	spin_unlock(&domain->s1_lock);
4758 	return ret;
4759 }
4760 
4761 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4762 					  bool enable)
4763 {
4764 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4765 	int ret;
4766 
4767 	spin_lock(&dmar_domain->lock);
4768 	if (dmar_domain->dirty_tracking == enable)
4769 		goto out_unlock;
4770 
4771 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4772 	if (ret)
4773 		goto err_unwind;
4774 
4775 	if (dmar_domain->nested_parent) {
4776 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4777 		if (ret)
4778 			goto err_unwind;
4779 	}
4780 
4781 	dmar_domain->dirty_tracking = enable;
4782 out_unlock:
4783 	spin_unlock(&dmar_domain->lock);
4784 
4785 	return 0;
4786 
4787 err_unwind:
4788 	device_set_dirty_tracking(&dmar_domain->devices,
4789 				  dmar_domain->dirty_tracking);
4790 	spin_unlock(&dmar_domain->lock);
4791 	return ret;
4792 }
4793 
4794 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4795 					    unsigned long iova, size_t size,
4796 					    unsigned long flags,
4797 					    struct iommu_dirty_bitmap *dirty)
4798 {
4799 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4800 	unsigned long end = iova + size - 1;
4801 	unsigned long pgsize;
4802 
4803 	/*
4804 	 * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4805 	 * an IOVA bitmap set in order to clear any dirty bits that might have
4806 	 * been left set when dirty tracking was stopped. This ensures that we
4807 	 * never inherit dirtied bits from a previous cycle.
4808 	 */
4809 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4810 		return -EINVAL;
4811 
4812 	do {
4813 		struct dma_pte *pte;
4814 		int lvl = 0;
4815 
4816 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4817 				     GFP_ATOMIC);
4818 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4819 		if (!pte || !dma_pte_present(pte)) {
4820 			iova += pgsize;
4821 			continue;
4822 		}
4823 
4824 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4825 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4826 		iova += pgsize;
4827 	} while (iova < end);
4828 
4829 	return 0;
4830 }
4831 
4832 static const struct iommu_dirty_ops intel_dirty_ops = {
4833 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4834 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4835 };
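
/*
 * A rough sketch of how these ops are driven, e.g. by the iommufd
 * dirty-tracking path (illustrative only; the iova_bitmap setup is
 * simplified and error handling is elided):
 *
 *	struct iommu_iotlb_gather gather;
 *	struct iommu_dirty_bitmap dirty;
 *
 *	domain->dirty_ops->set_dirty_tracking(domain, true);
 *	...
 *	iommu_iotlb_gather_init(&gather);
 *	iommu_dirty_bitmap_init(&dirty, iova_bitmap, &gather);
 *	domain->dirty_ops->read_and_clear_dirty(domain, iova, size, 0, &dirty);
 *	iommu_iotlb_sync(domain, &gather);
 */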
4836 
4837 const struct iommu_ops intel_iommu_ops = {
4838 	.blocked_domain		= &blocking_domain,
4839 	.release_domain		= &blocking_domain,
4840 	.capable		= intel_iommu_capable,
4841 	.hw_info		= intel_iommu_hw_info,
4842 	.domain_alloc		= intel_iommu_domain_alloc,
4843 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4844 	.probe_device		= intel_iommu_probe_device,
4845 	.probe_finalize		= intel_iommu_probe_finalize,
4846 	.release_device		= intel_iommu_release_device,
4847 	.get_resv_regions	= intel_iommu_get_resv_regions,
4848 	.device_group		= intel_iommu_device_group,
4849 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4850 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4851 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4852 	.def_domain_type	= device_def_domain_type,
4853 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4854 	.pgsize_bitmap		= SZ_4K,
4855 #ifdef CONFIG_INTEL_IOMMU_SVM
4856 	.page_response		= intel_svm_page_response,
4857 #endif
4858 	.default_domain_ops = &(const struct iommu_domain_ops) {
4859 		.attach_dev		= intel_iommu_attach_device,
4860 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4861 		.map_pages		= intel_iommu_map_pages,
4862 		.unmap_pages		= intel_iommu_unmap_pages,
4863 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4864 		.flush_iotlb_all        = intel_flush_iotlb_all,
4865 		.iotlb_sync		= intel_iommu_tlb_sync,
4866 		.iova_to_phys		= intel_iommu_iova_to_phys,
4867 		.free			= intel_iommu_domain_free,
4868 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4869 	}
4870 };
4871 
4872 static void quirk_iommu_igfx(struct pci_dev *dev)
4873 {
4874 	if (risky_device(dev))
4875 		return;
4876 
4877 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4878 	dmar_map_gfx = 0;
4879 }
4880 
4881 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4882 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4884 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4889 
4890 /* Broadwell igfx malfunctions with dmar */
4891 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4892 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4893 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4894 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4902 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4903 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4904 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4905 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4906 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4907 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4908 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4909 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4910 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4911 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4912 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4913 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4914 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4915 
4916 static void quirk_iommu_rwbf(struct pci_dev *dev)
4917 {
4918 	if (risky_device(dev))
4919 		return;
4920 
4921 	/*
4922 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4923 	 * but needs it. Same seems to hold for the desktop versions.
4924 	 */
4925 	pci_info(dev, "Forcing write-buffer flush capability\n");
4926 	rwbf_quirk = 1;
4927 }
4928 
4929 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4930 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4931 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4936 
4937 #define GGC 0x52
4938 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4939 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4940 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4941 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4942 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4943 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4944 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4945 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4946 
4947 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4948 {
4949 	unsigned short ggc;
4950 
4951 	if (risky_device(dev))
4952 		return;
4953 
4954 	if (pci_read_config_word(dev, GGC, &ggc))
4955 		return;
4956 
4957 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4958 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4959 		dmar_map_gfx = 0;
4960 	} else if (dmar_map_gfx) {
4961 		/* we have to ensure the gfx device is idle before we flush */
4962 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4963 		iommu_set_dma_strict();
4964 	}
4965 }
4966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4968 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4969 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4970 
4971 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4972 {
4973 	unsigned short ver;
4974 
4975 	if (!IS_GFX_DEVICE(dev))
4976 		return;
4977 
4978 	ver = (dev->device >> 8) & 0xff;
4979 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4980 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4981 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4982 		return;
4983 
4984 	if (risky_device(dev))
4985 		return;
4986 
4987 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4988 	iommu_skip_te_disable = 1;
4989 }
4990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4991 
4992 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4993    ISOCH DMAR unit for the Azalia sound device, but not give it any
4994    TLB entries, which causes it to deadlock. Check for that.  We do
4995    this in a function called from init_dmars(), instead of in a PCI
4996    quirk, because we don't want to print the obnoxious "BIOS broken"
4997    message if VT-d is actually disabled.
4998 */
4999 static void __init check_tylersburg_isoch(void)
5000 {
5001 	struct pci_dev *pdev;
5002 	uint32_t vtisochctrl;
5003 
5004 	/* If there's no Azalia in the system anyway, forget it. */
5005 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5006 	if (!pdev)
5007 		return;
5008 
5009 	if (risky_device(pdev)) {
5010 		pci_dev_put(pdev);
5011 		return;
5012 	}
5013 
5014 	pci_dev_put(pdev);
5015 
5016 	/* System Management Registers. Might be hidden, in which case
5017 	   we can't do the sanity check. But that's OK, because the
5018 	   known-broken BIOSes _don't_ actually hide it, so far. */
5019 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5020 	if (!pdev)
5021 		return;
5022 
5023 	if (risky_device(pdev)) {
5024 		pci_dev_put(pdev);
5025 		return;
5026 	}
5027 
5028 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5029 		pci_dev_put(pdev);
5030 		return;
5031 	}
5032 
5033 	pci_dev_put(pdev);
5034 
5035 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5036 	if (vtisochctrl & 1)
5037 		return;
5038 
5039 	/* Drop all bits other than the number of TLB entries */
5040 	vtisochctrl &= 0x1c;
5041 
5042 	/* If we have the recommended number of TLB entries (16), fine. */
5043 	if (vtisochctrl == 0x10)
5044 		return;
5045 
5046 	/* Zero TLB entries? You get to ride the short bus to school. */
5047 	if (!vtisochctrl) {
5048 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5049 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5050 		     dmi_get_system_info(DMI_BIOS_VENDOR),
5051 		     dmi_get_system_info(DMI_BIOS_VERSION),
5052 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5053 		iommu_identity_mapping |= IDENTMAP_AZALIA;
5054 		return;
5055 	}
5056 
5057 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5058 	       vtisochctrl);
5059 }
5060 
5061 /*
5062  * Here we deal with a device TLB defect where the device may inadvertently
5063  * issue an ATS invalidation completion before posted writes that were initiated
5064  * with a translated address and used translations matching the invalidation
5065  * address range, violating the invalidation completion ordering.
5066  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
5067  * vulnerable to this defect. In other words, any dTLB invalidation that is not
5068  * initiated under the control of the trusted/privileged host device driver must
5069  * use this quirk.
5070  * Device TLBs are invalidated under the following six conditions:
5071  * 1. Device driver does DMA API unmap IOVA
5072  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5073  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5074  *    exit_mmap() due to crash
5075  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5076  *    VM has to free pages that were unmapped
5077  * 5. Userspace driver unmaps a DMA buffer
5078  * 6. Cache invalidation in vSVA usage (upcoming)
5079  *
5080  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5081  * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
5082  * invalidate the TLB the same way as a normal user unmap, which will use this
5083  * quirk. The dTLB invalidation after a PASID cache flush does not need this quirk.
5084  *
5085  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5086  */
5087 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5088 			       unsigned long address, unsigned long mask,
5089 			       u32 pasid, u16 qdep)
5090 {
5091 	u16 sid;
5092 
5093 	if (likely(!info->dtlb_extra_inval))
5094 		return;
5095 
5096 	sid = PCI_DEVID(info->bus, info->devfn);
5097 	if (pasid == IOMMU_NO_PASID) {
5098 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5099 				   qdep, address, mask);
5100 	} else {
5101 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5102 					 pasid, qdep, address, mask);
5103 	}
5104 }
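
/*
 * A sketch of how callers pair this quirk with the regular device IOTLB
 * invalidation, roughly as the dev-IOTLB flush paths in this driver do
 * (illustrative only):
 *
 *	sid = PCI_DEVID(info->bus, info->devfn);
 *	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, info->ats_qdep,
 *			   addr, mask);
 *	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID,
 *				  info->ats_qdep);
 */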
5105 
5106 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
5107 
5108 /*
5109  * Function to submit a command to the enhanced command interface. The
5110  * valid enhanced command descriptions are defined in Table 47 of the
5111  * VT-d spec. The VT-d hardware implementation may support some but not
5112  * all commands, which can be determined by checking the Enhanced
5113  * Command Capability Register.
5114  *
5115  * Return values:
5116  *  - 0: Command successful without any error;
5117  *  - Negative: software error value;
5118  *  - Nonzero positive: failure status code defined in Table 48.
5119  */
5120 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5121 {
5122 	unsigned long flags;
5123 	u64 res;
5124 	int ret;
5125 
5126 	if (!cap_ecmds(iommu->cap))
5127 		return -ENODEV;
5128 
5129 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
5130 
5131 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5132 	if (res & DMA_ECMD_ECRSP_IP) {
5133 		ret = -EBUSY;
5134 		goto err;
5135 	}
5136 
5137 	/*
5138 	 * Unconditionally write operand B, because:
5139 	 * - There is no side effect if an ecmd doesn't require an
5140 	 *   operand B, but we set the register to some value.
5141 	 * - It's not invoked in any critical path. The extra MMIO
5142 	 *   write doesn't raise any performance concerns.
5143 	 */
5144 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5145 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5146 
5147 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5148 		      !(res & DMA_ECMD_ECRSP_IP), res);
5149 
5150 	if (res & DMA_ECMD_ECRSP_IP) {
5151 		ret = -ETIMEDOUT;
5152 		goto err;
5153 	}
5154 
5155 	ret = ecmd_get_status_code(res);
5156 err:
5157 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5158 
5159 	return ret;
5160 }
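
/*
 * A minimal usage sketch (illustrative only), roughly how the perfmon code
 * freezes and unfreezes its counters through this interface. A positive
 * return value is a hardware status code from Table 48:
 *
 *	ret = ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
 *	if (ret)
 *		return ret < 0 ? ret : -EIO;
 *	...
 *	ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, 0);
 */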
5161