xref: /linux/drivers/iommu/intel/iommu.c (revision 17e548405a81665fd14cee960db7d093d1396400)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
/* Prefix applied to pr_*() messages emitted from this file. */
13 #define pr_fmt(fmt)     "DMAR: " fmt
/* dev_*() messages reuse the same "DMAR: " prefix. */
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33 
/*
 * Sizes passed to the cache-flush helpers for a root table
 * (iommu_alloc_root_entry()) and a context table (iommu_context_addr()):
 * one VT-d page each.
 */
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
/*
 * PCI device predicates. Each macro takes an expression that evaluates to a
 * pointer to a PCI device structure with 'class', 'vendor' and 'device'
 * members.
 *
 * The argument is parenthesized in every expansion so that compound
 * expressions such as "p + 1" or "&devs[i]" bind correctly to "->".
 *
 * IS_GFX_DEVICE compares the base class (class >> 16); IS_USB_DEVICE and
 * IS_ISA_DEVICE compare base class + sub-class (class >> 8); IS_AZALIA
 * matches one specific vendor/device ID pair (0x8086:0x3a3e).
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
/*
 * Address range named after the IOAPIC; neither bound is referenced in the
 * part of the file visible here.
 */
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
/* NOTE(review): by its name, the lowest IOVA handed out - usage not visible here. */
44 #define IOVA_START_ADDR		(0x1000)
45 
/* Address width (in bits) requested by iommu_calculate_agaw(). */
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
/*
 * Highest page frame number / highest byte address representable with a
 * guest address width of 'gaw' bits, computed in 64-bit arithmetic.
 */
48 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50 
51 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52    to match. That way, we can use 'unsigned long' for PFNs with impunity.
       NOTE(review): DOMAIN_MAX_ADDR below is derived from the unclamped
       __DOMAIN_MAX_PFN(), not from DOMAIN_MAX_PFN(); the two only "match"
       where unsigned long is at least as wide as the PFN - confirm intent. */
53 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
54 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
56 
/* Forward declaration; the definition is outside the part of the file shown here. */
57 static void __init check_tylersburg_isoch(void);
/*
 * When non-zero, iommu_flush_write_buffer() performs the write-buffer flush
 * even if the capability register does not report cap_rwbf().
 */
58 static int rwbf_quirk;
59 
60 /*
61  * set to 1 to panic kernel if can't successfully enable VT-d
62  * (used when kernel is launched w/ TXT)
63  */
64 static int force_on = 0;
/* Set to 1 by the "tboot_noforce" option of the intel_iommu= boot parameter. */
65 static int intel_iommu_tboot_noforce;
/* Set to 1 by the "off" option of the intel_iommu= boot parameter. */
66 static int no_platform_optin;
67 
/* Number of struct root_entry slots that fit in one VT-d page. */
68 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
69 
70 /*
71  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
72  * if marked present.
73  */
74 static phys_addr_t root_entry_lctp(struct root_entry *re)
75 {
76 	if (!(re->lo & 1))
77 		return 0;
78 
79 	return re->lo & VTD_PAGE_MASK;
80 }
81 
82 /*
83  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
84  * if marked present.
85  */
86 static phys_addr_t root_entry_uctp(struct root_entry *re)
87 {
88 	if (!(re->hi & 1))
89 		return 0;
90 
91 	return re->hi & VTD_PAGE_MASK;
92 }
93 
94 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
95 {
96 	struct device_domain_info *info =
97 		rb_entry(node, struct device_domain_info, node);
98 	const u16 *rid_lhs = key;
99 
100 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
101 		return -1;
102 
103 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
104 		return 1;
105 
106 	return 0;
107 }
108 
109 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
110 {
111 	struct device_domain_info *info =
112 		rb_entry(lhs, struct device_domain_info, node);
113 	u16 key = PCI_DEVID(info->bus, info->devfn);
114 
115 	return device_rid_cmp_key(&key, rhs);
116 }
117 
118 /*
119  * Looks up an IOMMU-probed device using its source ID.
120  *
121  * Returns the pointer to the device if there is a match. Otherwise,
122  * returns NULL.
123  *
124  * Note that this helper doesn't guarantee that the device won't be
125  * released by the iommu subsystem after being returned. The caller
126  * should use its own synchronization mechanism to avoid the device
127  * being released during its use if its possibly the case.
128  */
129 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
130 {
131 	struct device_domain_info *info = NULL;
132 	struct rb_node *node;
133 	unsigned long flags;
134 
135 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
136 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
137 	if (node)
138 		info = rb_entry(node, struct device_domain_info, node);
139 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
140 
141 	return info ? info->dev : NULL;
142 }
143 
144 static int device_rbtree_insert(struct intel_iommu *iommu,
145 				struct device_domain_info *info)
146 {
147 	struct rb_node *curr;
148 	unsigned long flags;
149 
150 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
151 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
152 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
153 	if (WARN_ON(curr))
154 		return -EEXIST;
155 
156 	return 0;
157 }
158 
159 static void device_rbtree_remove(struct device_domain_info *info)
160 {
161 	struct intel_iommu *iommu = info->iommu;
162 	unsigned long flags;
163 
164 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
165 	rb_erase(&info->node, &iommu->device_rbtree);
166 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
167 }
168 
/*
 * One RMRR unit: a reserved address range [base_address, end_address]
 * together with the device scope it applies to. Instances are chained
 * through @list; for_each_rmrr_units() walks them on dmar_rmrr_units.
 */
169 struct dmar_rmrr_unit {
170 	struct list_head list;		/* list of rmrr units	*/
171 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
172 	u64	base_address;		/* reserved base address*/
173 	u64	end_address;		/* reserved end address */
174 	struct dmar_dev_scope *devices;	/* target devices */
175 	int	devices_cnt;		/* target device count */
176 };
177 
/*
 * One ATSR unit and its device scope.
 * NOTE(review): presumably linked on dmar_atsr_units via @list - the code
 * that does so is not visible here.
 */
178 struct dmar_atsr_unit {
179 	struct list_head list;		/* list of ATSR units */
180 	struct acpi_dmar_header *hdr;	/* ACPI header */
181 	struct dmar_dev_scope *devices;	/* target devices */
182 	int devices_cnt;		/* target device count */
183 	u8 include_all:1;		/* include all ports */
184 };
185 
/*
 * One SATC unit, its device scope and the IOMMU it corresponds to.
 * NOTE(review): presumably linked on dmar_satc_units via @list - the code
 * that does so is not visible here.
 */
186 struct dmar_satc_unit {
187 	struct list_head list;		/* list of SATC units */
188 	struct acpi_dmar_header *hdr;	/* ACPI header */
189 	struct dmar_dev_scope *devices;	/* target devices */
190 	struct intel_iommu *iommu;	/* the corresponding iommu */
191 	int devices_cnt;		/* target device count */
192 	u8 atc_required:1;		/* ATS is required */
193 };
194 
/* File-local list heads, one per unit type declared above. */
195 static LIST_HEAD(dmar_atsr_units);
196 static LIST_HEAD(dmar_rmrr_units);
197 static LIST_HEAD(dmar_satc_units);
198 
/* Iterate over every struct dmar_rmrr_unit on dmar_rmrr_units; @rmrr is the cursor. */
199 #define for_each_rmrr_units(rmrr) \
200 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
201 
/* Forward declaration; the definition is outside the part of the file shown here. */
202 static void intel_iommu_domain_free(struct iommu_domain *domain);
203 
/*
 * Build-time defaults taken from Kconfig. Both can be overridden from the
 * intel_iommu= boot parameter ("on"/"off" and "sm_on"/"sm_off").
 */
204 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
205 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
206 
/* Exported to GPL modules; initialised to 0 here. */
207 int intel_iommu_enabled = 0;
208 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
209 
/* Superpage support is on by default; cleared by the "sp_off" boot option. */
210 static int intel_iommu_superpage = 1;
211 static int iommu_identity_mapping;
212 static int iommu_skip_te_disable;
/* Set to 1 by the "igfx_off" boot option. */
213 static int disable_igfx_iommu;
214 
/* NOTE(review): looks like a flag bit for iommu_identity_mapping - confirm against its users. */
215 #define IDENTMAP_AZALIA		4
216 
/* Tentative definitions; the initialisers are outside the part of the file shown here. */
217 const struct iommu_ops intel_iommu_ops;
218 static const struct iommu_dirty_ops intel_dirty_ops;
219 
220 static bool translation_pre_enabled(struct intel_iommu *iommu)
221 {
222 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
223 }
224 
225 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
226 {
227 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
228 }
229 
230 static void init_translation_status(struct intel_iommu *iommu)
231 {
232 	u32 gsts;
233 
234 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
235 	if (gsts & DMA_GSTS_TES)
236 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
237 }
238 
239 static int __init intel_iommu_setup(char *str)
240 {
241 	if (!str)
242 		return -EINVAL;
243 
244 	while (*str) {
245 		if (!strncmp(str, "on", 2)) {
246 			dmar_disabled = 0;
247 			pr_info("IOMMU enabled\n");
248 		} else if (!strncmp(str, "off", 3)) {
249 			dmar_disabled = 1;
250 			no_platform_optin = 1;
251 			pr_info("IOMMU disabled\n");
252 		} else if (!strncmp(str, "igfx_off", 8)) {
253 			disable_igfx_iommu = 1;
254 			pr_info("Disable GFX device mapping\n");
255 		} else if (!strncmp(str, "forcedac", 8)) {
256 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
257 			iommu_dma_forcedac = true;
258 		} else if (!strncmp(str, "strict", 6)) {
259 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
260 			iommu_set_dma_strict();
261 		} else if (!strncmp(str, "sp_off", 6)) {
262 			pr_info("Disable supported super page\n");
263 			intel_iommu_superpage = 0;
264 		} else if (!strncmp(str, "sm_on", 5)) {
265 			pr_info("Enable scalable mode if hardware supports\n");
266 			intel_iommu_sm = 1;
267 		} else if (!strncmp(str, "sm_off", 6)) {
268 			pr_info("Scalable mode is disallowed\n");
269 			intel_iommu_sm = 0;
270 		} else if (!strncmp(str, "tboot_noforce", 13)) {
271 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
272 			intel_iommu_tboot_noforce = 1;
273 		} else {
274 			pr_notice("Unknown option - '%s'\n", str);
275 		}
276 
277 		str += strcspn(str, ",");
278 		while (*str == ',')
279 			str++;
280 	}
281 
282 	return 1;
283 }
284 __setup("intel_iommu=", intel_iommu_setup);
285 
286 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
287 {
288 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
289 
290 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
291 }
292 
293 /*
294  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
295  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
296  * the returned SAGAW.
297  */
298 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
299 {
300 	unsigned long fl_sagaw, sl_sagaw;
301 
302 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
303 	sl_sagaw = cap_sagaw(iommu->cap);
304 
305 	/* Second level only. */
306 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
307 		return sl_sagaw;
308 
309 	/* First level only. */
310 	if (!ecap_slts(iommu->ecap))
311 		return fl_sagaw;
312 
313 	return fl_sagaw & sl_sagaw;
314 }
315 
/*
 * Pick the largest supported AGAW that does not exceed the one
 * corresponding to @max_gaw.
 *
 * Returns the AGAW value, or -1 when no bit at or below the starting AGAW
 * is set in the supported bitmap.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long supported = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	while (agaw >= 0 && !test_bit(agaw, &supported))
		agaw--;

	return agaw;
}
329 
330 /*
331  * Calculate max SAGAW for each iommu.
332  */
333 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
334 {
335 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
336 }
337 
338 /*
339  * calculate agaw for each iommu.
340  * "SAGAW" may be different across iommus, use a default agaw, and
341  * get a supported less agaw for iommus that don't support the default agaw.
342  */
343 int iommu_calculate_agaw(struct intel_iommu *iommu)
344 {
345 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
346 }
347 
348 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
349 {
350 	return sm_supported(iommu) ?
351 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
352 }
353 
354 /* Return the super pagesize bitmap if supported. */
355 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
356 {
357 	unsigned long bitmap = 0;
358 
359 	/*
360 	 * 1-level super page supports page size of 2MiB, 2-level super page
361 	 * supports page size of both 2MiB and 1GiB.
362 	 */
363 	if (domain->iommu_superpage == 1)
364 		bitmap |= SZ_2M;
365 	else if (domain->iommu_superpage == 2)
366 		bitmap |= SZ_2M | SZ_1G;
367 
368 	return bitmap;
369 }
370 
371 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
372 					 u8 devfn, int alloc)
373 {
374 	struct root_entry *root = &iommu->root_entry[bus];
375 	struct context_entry *context;
376 	u64 *entry;
377 
378 	/*
379 	 * Except that the caller requested to allocate a new entry,
380 	 * returning a copied context entry makes no sense.
381 	 */
382 	if (!alloc && context_copied(iommu, bus, devfn))
383 		return NULL;
384 
385 	entry = &root->lo;
386 	if (sm_supported(iommu)) {
387 		if (devfn >= 0x80) {
388 			devfn -= 0x80;
389 			entry = &root->hi;
390 		}
391 		devfn *= 2;
392 	}
393 	if (*entry & 1)
394 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
395 	else {
396 		unsigned long phy_addr;
397 		if (!alloc)
398 			return NULL;
399 
400 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
401 		if (!context)
402 			return NULL;
403 
404 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
405 		phy_addr = virt_to_phys((void *)context);
406 		*entry = phy_addr | 1;
407 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
408 	}
409 	return &context[devfn];
410 }
411 
412 /**
413  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
414  *				 sub-hierarchy of a candidate PCI-PCI bridge
415  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
416  * @bridge: the candidate PCI-PCI bridge
417  *
418  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
419  */
420 static bool
421 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
422 {
423 	struct pci_dev *pdev, *pbridge;
424 
425 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
426 		return false;
427 
428 	pdev = to_pci_dev(dev);
429 	pbridge = to_pci_dev(bridge);
430 
431 	if (pbridge->subordinate &&
432 	    pbridge->subordinate->number <= pdev->bus->number &&
433 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
434 		return true;
435 
436 	return false;
437 }
438 
439 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
440 {
441 	struct dmar_drhd_unit *drhd;
442 	u32 vtbar;
443 	int rc;
444 
445 	/* We know that this device on this chipset has its own IOMMU.
446 	 * If we find it under a different IOMMU, then the BIOS is lying
447 	 * to us. Hope that the IOMMU for this device is actually
448 	 * disabled, and it needs no translation...
449 	 */
450 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
451 	if (rc) {
452 		/* "can't" happen */
453 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
454 		return false;
455 	}
456 	vtbar &= 0xffff0000;
457 
458 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
459 	drhd = dmar_find_matched_drhd_unit(pdev);
460 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
461 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
462 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
463 		return true;
464 	}
465 
466 	return false;
467 }
468 
469 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
470 {
471 	if (!iommu || iommu->drhd->ignored)
472 		return true;
473 
474 	if (dev_is_pci(dev)) {
475 		struct pci_dev *pdev = to_pci_dev(dev);
476 
477 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
478 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
479 		    quirk_ioat_snb_local_iommu(pdev))
480 			return true;
481 	}
482 
483 	return false;
484 }
485 
/*
 * Find the IOMMU whose DRHD unit covers @dev and optionally report the
 * bus/devfn to use for it.
 *
 * @dev:   device to look up; NULL yields NULL.
 * @bus:   optional output for the bus number.
 * @devfn: optional output for the device/function number.
 *         The outputs are written only when BOTH pointers are non-NULL.
 *
 * A unit matches when @dev is listed in its device scope, when @dev is
 * downstream of a bridge listed in its scope, or - for PCI devices - when
 * the unit is flagged include_all. For PCI devices, units in a different
 * segment are skipped. The walk runs under rcu_read_lock().
 *
 * Returns the matching IOMMU, or NULL when nothing matches or
 * iommu_is_dummy() rejects the match.
 */
486 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
487 {
488 	struct dmar_drhd_unit *drhd = NULL;
489 	struct pci_dev *pdev = NULL;
490 	struct intel_iommu *iommu;
491 	struct device *tmp;
492 	u16 segment = 0;
493 	int i;
494 
495 	if (!dev)
496 		return NULL;
497 
498 	if (dev_is_pci(dev)) {
499 		struct pci_dev *pf_pdev;
500 
		/* pdev keeps the device's own identity; dev is switched to the PF below. */
501 		pdev = pci_real_dma_dev(to_pci_dev(dev));
502 
503 		/* VFs aren't listed in scope tables; we need to look up
504 		 * the PF instead to find the IOMMU. */
505 		pf_pdev = pci_physfn(pdev);
506 		dev = &pf_pdev->dev;
507 		segment = pci_domain_nr(pdev->bus);
508 	} else if (has_acpi_companion(dev))
		/* Non-PCI device: match on its ACPI companion's struct device. */
509 		dev = &ACPI_COMPANION(dev)->dev;
510 
511 	rcu_read_lock();
512 	for_each_iommu(iommu, drhd) {
513 		if (pdev && segment != drhd->segment)
514 			continue;
515 
516 		for_each_active_dev_scope(drhd->devices,
517 					  drhd->devices_cnt, i, tmp) {
518 			if (tmp == dev) {
519 				/* For a VF use its original BDF# not that of the PF
520 				 * which we used for the IOMMU lookup. Strictly speaking
521 				 * we could do this for all PCI devices; we only need to
522 				 * get the BDF# from the scope table for ACPI matches. */
523 				if (pdev && pdev->is_virtfn)
524 					goto got_pdev;
525 
526 				if (bus && devfn) {
527 					*bus = drhd->devices[i].bus;
528 					*devfn = drhd->devices[i].devfn;
529 				}
530 				goto out;
531 			}
532 
			/*
			 * is_downstream_to_pci_bridge() only returns true when
			 * dev is a PCI device, in which case pdev was set above.
			 */
533 			if (is_downstream_to_pci_bridge(dev, tmp))
534 				goto got_pdev;
535 		}
536 
537 		if (pdev && drhd->include_all) {
			/* Also the jump target for the VF and bridge matches above. */
538 got_pdev:
539 			if (bus && devfn) {
540 				*bus = pdev->bus->number;
541 				*devfn = pdev->devfn;
542 			}
543 			goto out;
544 		}
545 	}
	/* Loop ran to completion without a match. */
546 	iommu = NULL;
547 out:
548 	if (iommu_is_dummy(iommu, dev))
549 		iommu = NULL;
550 
551 	rcu_read_unlock();
552 
553 	return iommu;
554 }
555 
556 static void domain_flush_cache(struct dmar_domain *domain,
557 			       void *addr, int size)
558 {
559 	if (!domain->iommu_coherency)
560 		clflush_cache_range(addr, size);
561 }
562 
563 static void free_context_table(struct intel_iommu *iommu)
564 {
565 	struct context_entry *context;
566 	int i;
567 
568 	if (!iommu->root_entry)
569 		return;
570 
571 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
572 		context = iommu_context_addr(iommu, i, 0, 0);
573 		if (context)
574 			iommu_free_page(context);
575 
576 		if (!sm_supported(iommu))
577 			continue;
578 
579 		context = iommu_context_addr(iommu, i, 0x80, 0);
580 		if (context)
581 			iommu_free_page(context);
582 	}
583 
584 	iommu_free_page(iommu->root_entry);
585 	iommu->root_entry = NULL;
586 }
587 
588 #ifdef CONFIG_DMAR_DEBUG
589 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
590 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
591 {
592 	struct dma_pte *pte;
593 	int offset;
594 
595 	while (1) {
596 		offset = pfn_level_offset(pfn, level);
597 		pte = &parent[offset];
598 
599 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
600 
601 		if (!dma_pte_present(pte)) {
602 			pr_info("page table not present at level %d\n", level - 1);
603 			break;
604 		}
605 
606 		if (level == 1 || dma_pte_superpage(pte))
607 			break;
608 
609 		parent = phys_to_virt(dma_pte_addr(pte));
610 		level--;
611 	}
612 }
613 
614 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
615 			  unsigned long long addr, u32 pasid)
616 {
617 	struct pasid_dir_entry *dir, *pde;
618 	struct pasid_entry *entries, *pte;
619 	struct context_entry *ctx_entry;
620 	struct root_entry *rt_entry;
621 	int i, dir_index, index, level;
622 	u8 devfn = source_id & 0xff;
623 	u8 bus = source_id >> 8;
624 	struct dma_pte *pgtable;
625 
626 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
627 
628 	/* root entry dump */
629 	if (!iommu->root_entry) {
630 		pr_info("root table is not present\n");
631 		return;
632 	}
633 	rt_entry = &iommu->root_entry[bus];
634 
635 	if (sm_supported(iommu))
636 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
637 			rt_entry->hi, rt_entry->lo);
638 	else
639 		pr_info("root entry: 0x%016llx", rt_entry->lo);
640 
641 	/* context entry dump */
642 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
643 	if (!ctx_entry) {
644 		pr_info("context table is not present\n");
645 		return;
646 	}
647 
648 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
649 		ctx_entry->hi, ctx_entry->lo);
650 
651 	/* legacy mode does not require PASID entries */
652 	if (!sm_supported(iommu)) {
653 		if (!context_present(ctx_entry)) {
654 			pr_info("legacy mode page table is not present\n");
655 			return;
656 		}
657 		level = agaw_to_level(ctx_entry->hi & 7);
658 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
659 		goto pgtable_walk;
660 	}
661 
662 	if (!context_present(ctx_entry)) {
663 		pr_info("pasid directory table is not present\n");
664 		return;
665 	}
666 
667 	/* get the pointer to pasid directory entry */
668 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
669 
670 	/* For request-without-pasid, get the pasid from context entry */
671 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
672 		pasid = IOMMU_NO_PASID;
673 
674 	dir_index = pasid >> PASID_PDE_SHIFT;
675 	pde = &dir[dir_index];
676 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
677 
678 	/* get the pointer to the pasid table entry */
679 	entries = get_pasid_table_from_pde(pde);
680 	if (!entries) {
681 		pr_info("pasid table is not present\n");
682 		return;
683 	}
684 	index = pasid & PASID_PTE_MASK;
685 	pte = &entries[index];
686 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
687 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
688 
689 	if (!pasid_pte_is_present(pte)) {
690 		pr_info("scalable mode page table is not present\n");
691 		return;
692 	}
693 
694 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
695 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
696 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
697 	} else {
698 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
699 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
700 	}
701 
702 pgtable_walk:
703 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
704 }
705 #endif
706 
/*
 * Walk the page table of @domain towards @pfn and return the PTE reached.
 *
 * @domain:       domain whose page table (domain->pgd) is walked
 * @pfn:          page frame number to look up
 * @target_level: in/out. If *target_level is 0 on entry the walk stops at
 *                the first superpage or non-present entry and the level
 *                reached is written back. Otherwise the walk continues down
 *                to that level, allocating missing intermediate tables.
 * @gfp:          allocation flags for intermediate tables
 *
 * Returns the PTE, or NULL when @pfn is outside the domain's address width
 * or a table allocation fails.
 */
707 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
708 				      unsigned long pfn, int *target_level,
709 				      gfp_t gfp)
710 {
711 	struct dma_pte *parent, *pte;
712 	int level = agaw_to_level(domain->agaw);
713 	int offset;
714 
715 	if (!domain_pfn_supported(domain, pfn))
716 		/* Address beyond IOMMU's addressing capabilities. */
717 		return NULL;
718 
719 	parent = domain->pgd;
720 
721 	while (1) {
722 		void *tmp_page;
723 
724 		offset = pfn_level_offset(pfn, level);
725 		pte = &parent[offset];
		/* Lookup-only mode: stop at whatever terminates the walk. */
726 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
727 			break;
728 		if (level == *target_level)
729 			break;
730 
731 		if (!dma_pte_present(pte)) {
732 			uint64_t pteval, tmp;
733 
734 			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
735 
736 			if (!tmp_page)
737 				return NULL;
738 
			/* Flush the new table before it becomes reachable. */
739 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
740 			pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
741 				 DMA_PTE_WRITE;
742 			if (domain->use_first_level)
743 				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
744 
			/* Install only if the slot is still zero; no lock is held. */
745 			tmp = 0ULL;
746 			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
747 				/* Someone else set it while we were thinking; use theirs. */
748 				iommu_free_page(tmp_page);
749 			else
750 				domain_flush_cache(domain, pte, sizeof(*pte));
751 		}
752 		if (level == 1)
753 			break;
754 
		/* Descend using whichever table the entry now points to. */
755 		parent = phys_to_virt(dma_pte_addr(pte));
756 		level--;
757 	}
758 
759 	if (!*target_level)
760 		*target_level = level;
761 
762 	return pte;
763 }
764 
765 /* return address's pte at specific level */
766 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
767 					 unsigned long pfn,
768 					 int level, int *large_page)
769 {
770 	struct dma_pte *parent, *pte;
771 	int total = agaw_to_level(domain->agaw);
772 	int offset;
773 
774 	parent = domain->pgd;
775 	while (level <= total) {
776 		offset = pfn_level_offset(pfn, total);
777 		pte = &parent[offset];
778 		if (level == total)
779 			return pte;
780 
781 		if (!dma_pte_present(pte)) {
782 			*large_page = total;
783 			break;
784 		}
785 
786 		if (dma_pte_superpage(pte)) {
787 			*large_page = total;
788 			return pte;
789 		}
790 
791 		parent = phys_to_virt(dma_pte_addr(pte));
792 		total--;
793 	}
794 	return NULL;
795 }
796 
797 /* clear last level pte, a tlb flush should be followed */
798 static void dma_pte_clear_range(struct dmar_domain *domain,
799 				unsigned long start_pfn,
800 				unsigned long last_pfn)
801 {
802 	unsigned int large_page;
803 	struct dma_pte *first_pte, *pte;
804 
805 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
806 	    WARN_ON(start_pfn > last_pfn))
807 		return;
808 
809 	/* we don't need lock here; nobody else touches the iova range */
810 	do {
811 		large_page = 1;
812 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
813 		if (!pte) {
814 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
815 			continue;
816 		}
817 		do {
818 			dma_clear_pte(pte);
819 			start_pfn += lvl_to_nr_pages(large_page);
820 			pte++;
821 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
822 
823 		domain_flush_cache(domain, first_pte,
824 				   (void *)pte - (void *)first_pte);
825 
826 	} while (start_pfn && start_pfn <= last_pfn);
827 }
828 
/*
 * Recursively free page-table pages below @retain_level whose whole range
 * lies inside [@start_pfn, @last_pfn].
 *
 * @level:        level of the table that @pte points into
 * @retain_level: tables are freed only when referenced from an entry at a
 *                level strictly below this one
 * @pte:          first entry of the table at @level
 * @pfn:          first PFN covered by that table
 *
 * Non-present and superpage entries are skipped. A freed table's parent
 * entry is cleared and flushed before the page is released.
 */
829 static void dma_pte_free_level(struct dmar_domain *domain, int level,
830 			       int retain_level, struct dma_pte *pte,
831 			       unsigned long pfn, unsigned long start_pfn,
832 			       unsigned long last_pfn)
833 {
	/* Start at the first entry of this table that intersects the range. */
834 	pfn = max(start_pfn, pfn);
835 	pte = &pte[pfn_level_offset(pfn, level)];
836 
837 	do {
838 		unsigned long level_pfn;
839 		struct dma_pte *level_pte;
840 
841 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
842 			goto next;
843 
		/* First PFN covered by this entry, and the table it points to. */
844 		level_pfn = pfn & level_mask(level);
845 		level_pte = phys_to_virt(dma_pte_addr(pte));
846 
		/* Children are handled first so they are freed before their parent. */
847 		if (level > 2) {
848 			dma_pte_free_level(domain, level - 1, retain_level,
849 					   level_pte, level_pfn, start_pfn,
850 					   last_pfn);
851 		}
852 
853 		/*
854 		 * Free the page table if we're below the level we want to
855 		 * retain and the range covers the entire table.
856 		 */
857 		if (level < retain_level && !(start_pfn > level_pfn ||
858 		      last_pfn < level_pfn + level_size(level) - 1)) {
859 			dma_clear_pte(pte);
860 			domain_flush_cache(domain, pte, sizeof(*pte));
861 			iommu_free_page(level_pte);
862 		}
863 next:
864 		pfn += level_size(level);
	/* Stop at the end of this table page or once past the range. */
865 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
866 }
867 
868 /*
869  * clear last level (leaf) ptes and free page table pages below the
870  * level we wish to keep intact.
871  */
872 static void dma_pte_free_pagetable(struct dmar_domain *domain,
873 				   unsigned long start_pfn,
874 				   unsigned long last_pfn,
875 				   int retain_level)
876 {
877 	dma_pte_clear_range(domain, start_pfn, last_pfn);
878 
879 	/* We don't need lock here; nobody else touches the iova range */
880 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
881 			   domain->pgd, 0, start_pfn, last_pfn);
882 
883 	/* free pgd */
884 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
885 		iommu_free_page(domain->pgd);
886 		domain->pgd = NULL;
887 	}
888 }
889 
890 /* When a page at a given level is being unlinked from its parent, we don't
891    need to *modify* it at all. All we need to do is make a list of all the
892    pages which can be freed just as soon as we've flushed the IOTLB and we
893    know the hardware page-walk will no longer touch them.
894    The 'pte' argument is the *parent* PTE, pointing to the page that is to
895    be freed. */
896 static void dma_pte_list_pagetables(struct dmar_domain *domain,
897 				    int level, struct dma_pte *pte,
898 				    struct list_head *freelist)
899 {
900 	struct page *pg;
901 
902 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
903 	list_add_tail(&pg->lru, freelist);
904 
905 	if (level == 1)
906 		return;
907 
908 	pte = page_address(pg);
909 	do {
910 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 		pte++;
913 	} while (!first_pte_in_page(pte));
914 }
915 
/*
 * Clear the entries of one page-table level that fall inside
 * [@start_pfn, @last_pfn], collecting page-table pages that become
 * unreachable on @freelist.
 *
 * @level: level of the table that @pte points into
 * @pte:   first entry of that table
 * @pfn:   first PFN covered by that table
 *
 * An entry whose whole span lies inside the range is cleared, and any table
 * hierarchy below it is queued for freeing. An entry that is only partly
 * covered is descended into instead. The cleared span of this table is
 * flushed from the CPU cache once at the end.
 */
916 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
917 				struct dma_pte *pte, unsigned long pfn,
918 				unsigned long start_pfn, unsigned long last_pfn,
919 				struct list_head *freelist)
920 {
921 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
922 
	/* Start at the first entry of this table that intersects the range. */
923 	pfn = max(start_pfn, pfn);
924 	pte = &pte[pfn_level_offset(pfn, level)];
925 
926 	do {
927 		unsigned long level_pfn = pfn & level_mask(level);
928 
929 		if (!dma_pte_present(pte))
930 			goto next;
931 
932 		/* If range covers entire pagetable, free it */
933 		if (start_pfn <= level_pfn &&
934 		    last_pfn >= level_pfn + level_size(level) - 1) {
935 			/* These subordinate page tables are going away entirely. Don't
936 			   bother to clear them; we're just going to *free* them. */
937 			if (level > 1 && !dma_pte_superpage(pte))
938 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
939 
940 			dma_clear_pte(pte);
			/* Track the first and last cleared entry for the flush below. */
941 			if (!first_pte)
942 				first_pte = pte;
943 			last_pte = pte;
944 		} else if (level > 1) {
945 			/* Recurse down into a level that isn't *entirely* obsolete */
946 			dma_pte_clear_level(domain, level - 1,
947 					    phys_to_virt(dma_pte_addr(pte)),
948 					    level_pfn, start_pfn, last_pfn,
949 					    freelist);
950 		}
951 next:
952 		pfn = level_pfn + level_size(level);
953 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954 
	/* Flush from the first cleared entry through the end of the last one. */
955 	if (first_pte)
956 		domain_flush_cache(domain, first_pte,
957 				   (void *)++last_pte - (void *)first_pte);
958 }
959 
960 /* We can't just free the pages because the IOMMU may still be walking
961    the page tables, and may have cached the intermediate levels. The
962    pages can only be freed after the IOTLB flush has been done. */
963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 			 unsigned long last_pfn, struct list_head *freelist)
965 {
966 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
967 	    WARN_ON(start_pfn > last_pfn))
968 		return;
969 
970 	/* we don't need lock here; nobody else touches the iova range */
971 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
972 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
973 
974 	/* free pgd */
975 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
976 		struct page *pgd_page = virt_to_page(domain->pgd);
977 		list_add_tail(&pgd_page->lru, freelist);
978 		domain->pgd = NULL;
979 	}
980 }
981 
982 /* iommu handling */
983 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984 {
985 	struct root_entry *root;
986 
987 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
988 	if (!root) {
989 		pr_err("Allocating root entry for %s failed\n",
990 			iommu->name);
991 		return -ENOMEM;
992 	}
993 
994 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
995 	iommu->root_entry = root;
996 
997 	return 0;
998 }
999 
/*
 * Program the root table address of @iommu into the hardware.
 *
 * Writes the physical address of iommu->root_entry (with DMA_RTADDR_SMT set
 * in scalable mode) to the root table address register, issues the
 * set-root-table-pointer command and waits for the status bit. Unless the
 * hardware reports cap_esrtps(), the context cache, the PASID cache
 * (scalable mode only) and the IOTLB are then invalidated globally.
 */
1000 static void iommu_set_root_entry(struct intel_iommu *iommu)
1001 {
1002 	u64 addr;
1003 	u32 sts;
1004 	unsigned long flag;
1005 
1006 	addr = virt_to_phys(iommu->root_entry);
1007 	if (sm_supported(iommu))
1008 		addr |= DMA_RTADDR_SMT;
1009 
	/* The register write, command and status poll are done under register_lock. */
1010 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012 
1013 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014 
1015 	/* Make sure hardware complete it */
1016 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 		      readl, (sts & DMA_GSTS_RTPS), sts);
1018 
1019 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020 
1021 	/*
1022 	 * Hardware invalidates all DMA remapping hardware translation
1023 	 * caches as part of SRTP flow.
1024 	 */
1025 	if (cap_esrtps(iommu->cap))
1026 		return;
1027 
	/* Otherwise invalidate the caches explicitly. */
1028 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 	if (sm_supported(iommu))
1030 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032 }
1033 
1034 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035 {
1036 	u32 val;
1037 	unsigned long flag;
1038 
1039 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 		return;
1041 
1042 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044 
1045 	/* Make sure hardware complete it */
1046 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1048 
1049 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 }
1051 
1052 /* return value determine if we need a write buffer flush */
1053 static void __iommu_flush_context(struct intel_iommu *iommu,
1054 				  u16 did, u16 source_id, u8 function_mask,
1055 				  u64 type)
1056 {
1057 	u64 val = 0;
1058 	unsigned long flag;
1059 
1060 	switch (type) {
1061 	case DMA_CCMD_GLOBAL_INVL:
1062 		val = DMA_CCMD_GLOBAL_INVL;
1063 		break;
1064 	case DMA_CCMD_DOMAIN_INVL:
1065 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 		break;
1067 	case DMA_CCMD_DEVICE_INVL:
1068 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 		break;
1071 	default:
1072 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 			iommu->name, type);
1074 		return;
1075 	}
1076 	val |= DMA_CCMD_ICC;
1077 
1078 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080 
1081 	/* Make sure hardware complete it */
1082 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084 
1085 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086 }
1087 
1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 			 unsigned int size_order, u64 type)
1090 {
1091 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 	u64 val = 0, val_iva = 0;
1093 	unsigned long flag;
1094 
1095 	switch (type) {
1096 	case DMA_TLB_GLOBAL_FLUSH:
1097 		/* global flush doesn't need set IVA_REG */
1098 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 		break;
1100 	case DMA_TLB_DSI_FLUSH:
1101 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 		break;
1103 	case DMA_TLB_PSI_FLUSH:
1104 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 		/* IH bit is passed in as part of address */
1106 		val_iva = size_order | addr;
1107 		break;
1108 	default:
1109 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 			iommu->name, type);
1111 		return;
1112 	}
1113 
1114 	if (cap_write_drain(iommu->cap))
1115 		val |= DMA_TLB_WRITE_DRAIN;
1116 
1117 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 	/* Note: Only uses first TLB reg currently */
1119 	if (val_iva)
1120 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122 
1123 	/* Make sure hardware complete it */
1124 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126 
1127 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128 
1129 	/* check IOTLB invalidation granularity */
1130 	if (DMA_TLB_IAIG(val) == 0)
1131 		pr_err("Flush IOTLB failed\n");
1132 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 			(unsigned long long)DMA_TLB_IIRG(type),
1135 			(unsigned long long)DMA_TLB_IAIG(val));
1136 }
1137 
1138 static struct device_domain_info *
1139 domain_lookup_dev_info(struct dmar_domain *domain,
1140 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1141 {
1142 	struct device_domain_info *info;
1143 	unsigned long flags;
1144 
1145 	spin_lock_irqsave(&domain->lock, flags);
1146 	list_for_each_entry(info, &domain->devices, link) {
1147 		if (info->iommu == iommu && info->bus == bus &&
1148 		    info->devfn == devfn) {
1149 			spin_unlock_irqrestore(&domain->lock, flags);
1150 			return info;
1151 		}
1152 	}
1153 	spin_unlock_irqrestore(&domain->lock, flags);
1154 
1155 	return NULL;
1156 }
1157 
1158 /*
1159  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161  * check because it applies only to the built-in QAT devices and it doesn't
1162  * grant additional privileges.
1163  */
1164 #define BUGGY_QAT_DEVID_MASK 0x4940
1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166 {
1167 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 		return false;
1169 
1170 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 		return false;
1172 
1173 	return true;
1174 }
1175 
1176 static void iommu_enable_pci_ats(struct device_domain_info *info)
1177 {
1178 	struct pci_dev *pdev;
1179 
1180 	if (!info->ats_supported)
1181 		return;
1182 
1183 	pdev = to_pci_dev(info->dev);
1184 	if (!pci_ats_page_aligned(pdev))
1185 		return;
1186 
1187 	if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1188 		info->ats_enabled = 1;
1189 }
1190 
1191 static void iommu_disable_pci_ats(struct device_domain_info *info)
1192 {
1193 	if (!info->ats_enabled)
1194 		return;
1195 
1196 	pci_disable_ats(to_pci_dev(info->dev));
1197 	info->ats_enabled = 0;
1198 }
1199 
1200 static void iommu_enable_pci_pri(struct device_domain_info *info)
1201 {
1202 	struct pci_dev *pdev;
1203 
1204 	if (!info->ats_enabled || !info->pri_supported)
1205 		return;
1206 
1207 	pdev = to_pci_dev(info->dev);
1208 	/* PASID is required in PRG Response Message. */
1209 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1210 		return;
1211 
1212 	if (pci_reset_pri(pdev))
1213 		return;
1214 
1215 	if (!pci_enable_pri(pdev, PRQ_DEPTH))
1216 		info->pri_enabled = 1;
1217 }
1218 
1219 static void iommu_disable_pci_pri(struct device_domain_info *info)
1220 {
1221 	if (!info->pri_enabled)
1222 		return;
1223 
1224 	if (WARN_ON(info->iopf_refcount))
1225 		iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
1226 
1227 	pci_disable_pri(to_pci_dev(info->dev));
1228 	info->pri_enabled = 0;
1229 }
1230 
/*
 * Flush everything cached for @domain by forwarding to
 * cache_tag_flush_all() on the enclosing dmar_domain.
 */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	cache_tag_flush_all(dmar_domain);
}
1235 
1236 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1237 {
1238 	u32 pmen;
1239 	unsigned long flags;
1240 
1241 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1242 		return;
1243 
1244 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1245 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1246 	pmen &= ~DMA_PMEN_EPM;
1247 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1248 
1249 	/* wait for the protected region status bit to clear */
1250 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1251 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1252 
1253 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1254 }
1255 
1256 static void iommu_enable_translation(struct intel_iommu *iommu)
1257 {
1258 	u32 sts;
1259 	unsigned long flags;
1260 
1261 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1262 	iommu->gcmd |= DMA_GCMD_TE;
1263 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1264 
1265 	/* Make sure hardware complete it */
1266 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1267 		      readl, (sts & DMA_GSTS_TES), sts);
1268 
1269 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1270 }
1271 
1272 static void iommu_disable_translation(struct intel_iommu *iommu)
1273 {
1274 	u32 sts;
1275 	unsigned long flag;
1276 
1277 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1278 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1279 		return;
1280 
1281 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 	iommu->gcmd &= ~DMA_GCMD_TE;
1283 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1284 
1285 	/* Make sure hardware complete it */
1286 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1287 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1288 
1289 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1290 }
1291 
1292 static int iommu_init_domains(struct intel_iommu *iommu)
1293 {
1294 	u32 ndomains;
1295 
1296 	ndomains = cap_ndoms(iommu->cap);
1297 	pr_debug("%s: Number of Domains supported <%d>\n",
1298 		 iommu->name, ndomains);
1299 
1300 	spin_lock_init(&iommu->lock);
1301 
1302 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1303 	if (!iommu->domain_ids)
1304 		return -ENOMEM;
1305 
1306 	/*
1307 	 * If Caching mode is set, then invalid translations are tagged
1308 	 * with domain-id 0, hence we need to pre-allocate it. We also
1309 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1310 	 * make sure it is not used for a real domain.
1311 	 */
1312 	set_bit(0, iommu->domain_ids);
1313 
1314 	/*
1315 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1316 	 * entry for first-level or pass-through translation modes should
1317 	 * be programmed with a domain id different from those used for
1318 	 * second-level or nested translation. We reserve a domain id for
1319 	 * this purpose. This domain id is also used for identity domain
1320 	 * in legacy mode.
1321 	 */
1322 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1323 
1324 	return 0;
1325 }
1326 
1327 static void disable_dmar_iommu(struct intel_iommu *iommu)
1328 {
1329 	if (!iommu->domain_ids)
1330 		return;
1331 
1332 	/*
1333 	 * All iommu domains must have been detached from the devices,
1334 	 * hence there should be no domain IDs in use.
1335 	 */
1336 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1337 		    > NUM_RESERVED_DID))
1338 		return;
1339 
1340 	if (iommu->gcmd & DMA_GCMD_TE)
1341 		iommu_disable_translation(iommu);
1342 }
1343 
1344 static void free_dmar_iommu(struct intel_iommu *iommu)
1345 {
1346 	if (iommu->domain_ids) {
1347 		bitmap_free(iommu->domain_ids);
1348 		iommu->domain_ids = NULL;
1349 	}
1350 
1351 	if (iommu->copied_tables) {
1352 		bitmap_free(iommu->copied_tables);
1353 		iommu->copied_tables = NULL;
1354 	}
1355 
1356 	/* free context mapping */
1357 	free_context_table(iommu);
1358 
1359 	if (ecap_prs(iommu->ecap))
1360 		intel_iommu_finish_prq(iommu);
1361 }
1362 
1363 /*
1364  * Check and return whether first level is used by default for
1365  * DMA translation.
1366  */
1367 static bool first_level_by_default(struct intel_iommu *iommu)
1368 {
1369 	/* Only SL is available in legacy mode */
1370 	if (!sm_supported(iommu))
1371 		return false;
1372 
1373 	/* Only level (either FL or SL) is available, just use it */
1374 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1375 		return ecap_flts(iommu->ecap);
1376 
1377 	return true;
1378 }
1379 
1380 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1381 {
1382 	struct iommu_domain_info *info, *curr;
1383 	unsigned long ndomains;
1384 	int num, ret = -ENOSPC;
1385 
1386 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1387 		return 0;
1388 
1389 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1390 	if (!info)
1391 		return -ENOMEM;
1392 
1393 	spin_lock(&iommu->lock);
1394 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1395 	if (curr) {
1396 		curr->refcnt++;
1397 		spin_unlock(&iommu->lock);
1398 		kfree(info);
1399 		return 0;
1400 	}
1401 
1402 	ndomains = cap_ndoms(iommu->cap);
1403 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1404 	if (num >= ndomains) {
1405 		pr_err("%s: No free domain ids\n", iommu->name);
1406 		goto err_unlock;
1407 	}
1408 
1409 	set_bit(num, iommu->domain_ids);
1410 	info->refcnt	= 1;
1411 	info->did	= num;
1412 	info->iommu	= iommu;
1413 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1414 			  NULL, info, GFP_ATOMIC);
1415 	if (curr) {
1416 		ret = xa_err(curr) ? : -EBUSY;
1417 		goto err_clear;
1418 	}
1419 
1420 	spin_unlock(&iommu->lock);
1421 	return 0;
1422 
1423 err_clear:
1424 	clear_bit(info->did, iommu->domain_ids);
1425 err_unlock:
1426 	spin_unlock(&iommu->lock);
1427 	kfree(info);
1428 	return ret;
1429 }
1430 
1431 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1432 {
1433 	struct iommu_domain_info *info;
1434 
1435 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1436 		return;
1437 
1438 	spin_lock(&iommu->lock);
1439 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1440 	if (--info->refcnt == 0) {
1441 		clear_bit(info->did, iommu->domain_ids);
1442 		xa_erase(&domain->iommu_array, iommu->seq_id);
1443 		domain->nid = NUMA_NO_NODE;
1444 		kfree(info);
1445 	}
1446 	spin_unlock(&iommu->lock);
1447 }
1448 
1449 static void domain_exit(struct dmar_domain *domain)
1450 {
1451 	if (domain->pgd) {
1452 		LIST_HEAD(freelist);
1453 
1454 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1455 		iommu_put_pages_list(&freelist);
1456 	}
1457 
1458 	if (WARN_ON(!list_empty(&domain->devices)))
1459 		return;
1460 
1461 	kfree(domain->qi_batch);
1462 	kfree(domain);
1463 }
1464 
1465 /*
1466  * For kdump cases, old valid entries may be cached due to the
1467  * in-flight DMA and copied pgtable, but there is no unmapping
1468  * behaviour for them, thus we need an explicit cache flush for
1469  * the newly-mapped device. For kdump, at this point, the device
1470  * is supposed to finish reset at its driver probe stage, so no
1471  * in-flight DMA will exist, and we don't need to worry anymore
1472  * hereafter.
1473  */
1474 static void copied_context_tear_down(struct intel_iommu *iommu,
1475 				     struct context_entry *context,
1476 				     u8 bus, u8 devfn)
1477 {
1478 	u16 did_old;
1479 
1480 	if (!context_copied(iommu, bus, devfn))
1481 		return;
1482 
1483 	assert_spin_locked(&iommu->lock);
1484 
1485 	did_old = context_domain_id(context);
1486 	context_clear_entry(context);
1487 
1488 	if (did_old < cap_ndoms(iommu->cap)) {
1489 		iommu->flush.flush_context(iommu, did_old,
1490 					   PCI_DEVID(bus, devfn),
1491 					   DMA_CCMD_MASK_NOBIT,
1492 					   DMA_CCMD_DEVICE_INVL);
1493 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1494 					 DMA_TLB_DSI_FLUSH);
1495 	}
1496 
1497 	clear_context_copied(iommu, bus, devfn);
1498 }
1499 
1500 /*
1501  * It's a non-present to present mapping. If hardware doesn't cache
1502  * non-present entry we only need to flush the write-buffer. If the
1503  * _does_ cache non-present entries, then it does so in the special
1504  * domain #0, which we have to flush:
1505  */
1506 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1507 					u8 bus, u8 devfn)
1508 {
1509 	if (cap_caching_mode(iommu->cap)) {
1510 		iommu->flush.flush_context(iommu, 0,
1511 					   PCI_DEVID(bus, devfn),
1512 					   DMA_CCMD_MASK_NOBIT,
1513 					   DMA_CCMD_DEVICE_INVL);
1514 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1515 	} else {
1516 		iommu_flush_write_buffer(iommu);
1517 	}
1518 }
1519 
1520 static int domain_context_mapping_one(struct dmar_domain *domain,
1521 				      struct intel_iommu *iommu,
1522 				      u8 bus, u8 devfn)
1523 {
1524 	struct device_domain_info *info =
1525 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1526 	u16 did = domain_id_iommu(domain, iommu);
1527 	int translation = CONTEXT_TT_MULTI_LEVEL;
1528 	struct dma_pte *pgd = domain->pgd;
1529 	struct context_entry *context;
1530 	int ret;
1531 
1532 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1533 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1534 
1535 	spin_lock(&iommu->lock);
1536 	ret = -ENOMEM;
1537 	context = iommu_context_addr(iommu, bus, devfn, 1);
1538 	if (!context)
1539 		goto out_unlock;
1540 
1541 	ret = 0;
1542 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1543 		goto out_unlock;
1544 
1545 	copied_context_tear_down(iommu, context, bus, devfn);
1546 	context_clear_entry(context);
1547 	context_set_domain_id(context, did);
1548 
1549 	if (info && info->ats_supported)
1550 		translation = CONTEXT_TT_DEV_IOTLB;
1551 	else
1552 		translation = CONTEXT_TT_MULTI_LEVEL;
1553 
1554 	context_set_address_root(context, virt_to_phys(pgd));
1555 	context_set_address_width(context, domain->agaw);
1556 	context_set_translation_type(context, translation);
1557 	context_set_fault_enable(context);
1558 	context_set_present(context);
1559 	if (!ecap_coherent(iommu->ecap))
1560 		clflush_cache_range(context, sizeof(*context));
1561 	context_present_cache_flush(iommu, did, bus, devfn);
1562 	ret = 0;
1563 
1564 out_unlock:
1565 	spin_unlock(&iommu->lock);
1566 
1567 	return ret;
1568 }
1569 
1570 static int domain_context_mapping_cb(struct pci_dev *pdev,
1571 				     u16 alias, void *opaque)
1572 {
1573 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1574 	struct intel_iommu *iommu = info->iommu;
1575 	struct dmar_domain *domain = opaque;
1576 
1577 	return domain_context_mapping_one(domain, iommu,
1578 					  PCI_BUS_NUM(alias), alias & 0xff);
1579 }
1580 
1581 static int
1582 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1583 {
1584 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1585 	struct intel_iommu *iommu = info->iommu;
1586 	u8 bus = info->bus, devfn = info->devfn;
1587 	int ret;
1588 
1589 	if (!dev_is_pci(dev))
1590 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1591 
1592 	ret = pci_for_each_dma_alias(to_pci_dev(dev),
1593 				     domain_context_mapping_cb, domain);
1594 	if (ret)
1595 		return ret;
1596 
1597 	iommu_enable_pci_ats(info);
1598 
1599 	return 0;
1600 }
1601 
1602 /* Return largest possible superpage level for a given mapping */
1603 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1604 				   unsigned long phy_pfn, unsigned long pages)
1605 {
1606 	int support, level = 1;
1607 	unsigned long pfnmerge;
1608 
1609 	support = domain->iommu_superpage;
1610 
1611 	/* To use a large page, the virtual *and* physical addresses
1612 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1613 	   of them will mean we have to use smaller pages. So just
1614 	   merge them and check both at once. */
1615 	pfnmerge = iov_pfn | phy_pfn;
1616 
1617 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1618 		pages >>= VTD_STRIDE_SHIFT;
1619 		if (!pages)
1620 			break;
1621 		pfnmerge >>= VTD_STRIDE_SHIFT;
1622 		level++;
1623 		support--;
1624 	}
1625 	return level;
1626 }
1627 
1628 /*
1629  * Ensure that old small page tables are removed to make room for superpage(s).
1630  * We're going to add new large pages, so make sure we don't remove their parent
1631  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1632  */
1633 static void switch_to_super_page(struct dmar_domain *domain,
1634 				 unsigned long start_pfn,
1635 				 unsigned long end_pfn, int level)
1636 {
1637 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1638 	struct dma_pte *pte = NULL;
1639 
1640 	while (start_pfn <= end_pfn) {
1641 		if (!pte)
1642 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1643 					     GFP_ATOMIC);
1644 
1645 		if (dma_pte_present(pte)) {
1646 			dma_pte_free_pagetable(domain, start_pfn,
1647 					       start_pfn + lvl_pages - 1,
1648 					       level + 1);
1649 
1650 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1651 					      end_pfn << VTD_PAGE_SHIFT, 0);
1652 		}
1653 
1654 		pte++;
1655 		start_pfn += lvl_pages;
1656 		if (first_pte_in_page(pte))
1657 			pte = NULL;
1658 	}
1659 }
1660 
1661 static int
1662 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1663 		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1664 		 gfp_t gfp)
1665 {
1666 	struct dma_pte *first_pte = NULL, *pte = NULL;
1667 	unsigned int largepage_lvl = 0;
1668 	unsigned long lvl_pages = 0;
1669 	phys_addr_t pteval;
1670 	u64 attr;
1671 
1672 	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1673 		return -EINVAL;
1674 
1675 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1676 		return -EINVAL;
1677 
1678 	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1679 		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1680 		return -EINVAL;
1681 	}
1682 
1683 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1684 	attr |= DMA_FL_PTE_PRESENT;
1685 	if (domain->use_first_level) {
1686 		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1687 		if (prot & DMA_PTE_WRITE)
1688 			attr |= DMA_FL_PTE_DIRTY;
1689 	}
1690 
1691 	domain->has_mappings = true;
1692 
1693 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1694 
1695 	while (nr_pages > 0) {
1696 		uint64_t tmp;
1697 
1698 		if (!pte) {
1699 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1700 					phys_pfn, nr_pages);
1701 
1702 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1703 					     gfp);
1704 			if (!pte)
1705 				return -ENOMEM;
1706 			first_pte = pte;
1707 
1708 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1709 
1710 			/* It is large page*/
1711 			if (largepage_lvl > 1) {
1712 				unsigned long end_pfn;
1713 				unsigned long pages_to_remove;
1714 
1715 				pteval |= DMA_PTE_LARGE_PAGE;
1716 				pages_to_remove = min_t(unsigned long, nr_pages,
1717 							nr_pte_to_next_page(pte) * lvl_pages);
1718 				end_pfn = iov_pfn + pages_to_remove - 1;
1719 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1720 			} else {
1721 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1722 			}
1723 
1724 		}
1725 		/* We don't need lock here, nobody else
1726 		 * touches the iova range
1727 		 */
1728 		tmp = 0ULL;
1729 		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1730 			static int dumps = 5;
1731 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1732 				iov_pfn, tmp, (unsigned long long)pteval);
1733 			if (dumps) {
1734 				dumps--;
1735 				debug_dma_dump_mappings(NULL);
1736 			}
1737 			WARN_ON(1);
1738 		}
1739 
1740 		nr_pages -= lvl_pages;
1741 		iov_pfn += lvl_pages;
1742 		phys_pfn += lvl_pages;
1743 		pteval += lvl_pages * VTD_PAGE_SIZE;
1744 
1745 		/* If the next PTE would be the first in a new page, then we
1746 		 * need to flush the cache on the entries we've just written.
1747 		 * And then we'll need to recalculate 'pte', so clear it and
1748 		 * let it get set again in the if (!pte) block above.
1749 		 *
1750 		 * If we're done (!nr_pages) we need to flush the cache too.
1751 		 *
1752 		 * Also if we've been setting superpages, we may need to
1753 		 * recalculate 'pte' and switch back to smaller pages for the
1754 		 * end of the mapping, if the trailing size is not enough to
1755 		 * use another superpage (i.e. nr_pages < lvl_pages).
1756 		 */
1757 		pte++;
1758 		if (!nr_pages || first_pte_in_page(pte) ||
1759 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1760 			domain_flush_cache(domain, first_pte,
1761 					   (void *)pte - (void *)first_pte);
1762 			pte = NULL;
1763 		}
1764 	}
1765 
1766 	return 0;
1767 }
1768 
1769 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1770 {
1771 	struct intel_iommu *iommu = info->iommu;
1772 	struct context_entry *context;
1773 	u16 did;
1774 
1775 	spin_lock(&iommu->lock);
1776 	context = iommu_context_addr(iommu, bus, devfn, 0);
1777 	if (!context) {
1778 		spin_unlock(&iommu->lock);
1779 		return;
1780 	}
1781 
1782 	did = context_domain_id(context);
1783 	context_clear_entry(context);
1784 	__iommu_flush_cache(iommu, context, sizeof(*context));
1785 	spin_unlock(&iommu->lock);
1786 	intel_context_flush_no_pasid(info, context, did);
1787 }
1788 
1789 int __domain_setup_first_level(struct intel_iommu *iommu,
1790 			       struct device *dev, ioasid_t pasid,
1791 			       u16 did, pgd_t *pgd, int flags,
1792 			       struct iommu_domain *old)
1793 {
1794 	if (!old)
1795 		return intel_pasid_setup_first_level(iommu, dev, pgd,
1796 						     pasid, did, flags);
1797 	return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1798 					       iommu_domain_did(old, iommu),
1799 					       flags);
1800 }
1801 
1802 static int domain_setup_second_level(struct intel_iommu *iommu,
1803 				     struct dmar_domain *domain,
1804 				     struct device *dev, ioasid_t pasid,
1805 				     struct iommu_domain *old)
1806 {
1807 	if (!old)
1808 		return intel_pasid_setup_second_level(iommu, domain,
1809 						      dev, pasid);
1810 	return intel_pasid_replace_second_level(iommu, domain, dev,
1811 						iommu_domain_did(old, iommu),
1812 						pasid);
1813 }
1814 
1815 static int domain_setup_passthrough(struct intel_iommu *iommu,
1816 				    struct device *dev, ioasid_t pasid,
1817 				    struct iommu_domain *old)
1818 {
1819 	if (!old)
1820 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1821 	return intel_pasid_replace_pass_through(iommu, dev,
1822 						iommu_domain_did(old, iommu),
1823 						pasid);
1824 }
1825 
1826 static int domain_setup_first_level(struct intel_iommu *iommu,
1827 				    struct dmar_domain *domain,
1828 				    struct device *dev,
1829 				    u32 pasid, struct iommu_domain *old)
1830 {
1831 	struct dma_pte *pgd = domain->pgd;
1832 	int level, flags = 0;
1833 
1834 	level = agaw_to_level(domain->agaw);
1835 	if (level != 4 && level != 5)
1836 		return -EINVAL;
1837 
1838 	if (level == 5)
1839 		flags |= PASID_FLAG_FL5LP;
1840 
1841 	if (domain->force_snooping)
1842 		flags |= PASID_FLAG_PAGE_SNOOP;
1843 
1844 	return __domain_setup_first_level(iommu, dev, pasid,
1845 					  domain_id_iommu(domain, iommu),
1846 					  (pgd_t *)pgd, flags, old);
1847 }
1848 
1849 static int dmar_domain_attach_device(struct dmar_domain *domain,
1850 				     struct device *dev)
1851 {
1852 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1853 	struct intel_iommu *iommu = info->iommu;
1854 	unsigned long flags;
1855 	int ret;
1856 
1857 	ret = domain_attach_iommu(domain, iommu);
1858 	if (ret)
1859 		return ret;
1860 
1861 	info->domain = domain;
1862 	spin_lock_irqsave(&domain->lock, flags);
1863 	list_add(&info->link, &domain->devices);
1864 	spin_unlock_irqrestore(&domain->lock, flags);
1865 
1866 	if (dev_is_real_dma_subdevice(dev))
1867 		return 0;
1868 
1869 	if (!sm_supported(iommu))
1870 		ret = domain_context_mapping(domain, dev);
1871 	else if (domain->use_first_level)
1872 		ret = domain_setup_first_level(iommu, domain, dev,
1873 					       IOMMU_NO_PASID, NULL);
1874 	else
1875 		ret = domain_setup_second_level(iommu, domain, dev,
1876 						IOMMU_NO_PASID, NULL);
1877 
1878 	if (ret)
1879 		goto out_block_translation;
1880 
1881 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1882 	if (ret)
1883 		goto out_block_translation;
1884 
1885 	return 0;
1886 
1887 out_block_translation:
1888 	device_block_translation(dev);
1889 	return ret;
1890 }
1891 
1892 /**
1893  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1894  * is relaxable (ie. is allowed to be not enforced under some conditions)
1895  * @dev: device handle
1896  *
1897  * We assume that PCI USB devices with RMRRs have them largely
1898  * for historical reasons and that the RMRR space is not actively used post
1899  * boot.  This exclusion may change if vendors begin to abuse it.
1900  *
1901  * The same exception is made for graphics devices, with the requirement that
1902  * any use of the RMRR regions will be torn down before assigning the device
1903  * to a guest.
1904  *
1905  * Return: true if the RMRR is relaxable, false otherwise
1906  */
1907 static bool device_rmrr_is_relaxable(struct device *dev)
1908 {
1909 	struct pci_dev *pdev;
1910 
1911 	if (!dev_is_pci(dev))
1912 		return false;
1913 
1914 	pdev = to_pci_dev(dev);
1915 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1916 		return true;
1917 	else
1918 		return false;
1919 }
1920 
1921 static int device_def_domain_type(struct device *dev)
1922 {
1923 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1924 	struct intel_iommu *iommu = info->iommu;
1925 
1926 	/*
1927 	 * Hardware does not support the passthrough translation mode.
1928 	 * Always use a dynamaic mapping domain.
1929 	 */
1930 	if (!ecap_pass_through(iommu->ecap))
1931 		return IOMMU_DOMAIN_DMA;
1932 
1933 	if (dev_is_pci(dev)) {
1934 		struct pci_dev *pdev = to_pci_dev(dev);
1935 
1936 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1937 			return IOMMU_DOMAIN_IDENTITY;
1938 	}
1939 
1940 	return 0;
1941 }
1942 
1943 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1944 {
1945 	/*
1946 	 * Start from the sane iommu hardware state.
1947 	 * If the queued invalidation is already initialized by us
1948 	 * (for example, while enabling interrupt-remapping) then
1949 	 * we got the things already rolling from a sane state.
1950 	 */
1951 	if (!iommu->qi) {
1952 		/*
1953 		 * Clear any previous faults.
1954 		 */
1955 		dmar_fault(-1, iommu);
1956 		/*
1957 		 * Disable queued invalidation if supported and already enabled
1958 		 * before OS handover.
1959 		 */
1960 		dmar_disable_qi(iommu);
1961 	}
1962 
1963 	if (dmar_enable_qi(iommu)) {
1964 		/*
1965 		 * Queued Invalidate not enabled, use Register Based Invalidate
1966 		 */
1967 		iommu->flush.flush_context = __iommu_flush_context;
1968 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1969 		pr_info("%s: Using Register based invalidation\n",
1970 			iommu->name);
1971 	} else {
1972 		iommu->flush.flush_context = qi_flush_context;
1973 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1974 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1975 	}
1976 }
1977 
1978 static int copy_context_table(struct intel_iommu *iommu,
1979 			      struct root_entry *old_re,
1980 			      struct context_entry **tbl,
1981 			      int bus, bool ext)
1982 {
1983 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1984 	struct context_entry *new_ce = NULL, ce;
1985 	struct context_entry *old_ce = NULL;
1986 	struct root_entry re;
1987 	phys_addr_t old_ce_phys;
1988 
1989 	tbl_idx = ext ? bus * 2 : bus;
1990 	memcpy(&re, old_re, sizeof(re));
1991 
1992 	for (devfn = 0; devfn < 256; devfn++) {
1993 		/* First calculate the correct index */
1994 		idx = (ext ? devfn * 2 : devfn) % 256;
1995 
1996 		if (idx == 0) {
1997 			/* First save what we may have and clean up */
1998 			if (new_ce) {
1999 				tbl[tbl_idx] = new_ce;
2000 				__iommu_flush_cache(iommu, new_ce,
2001 						    VTD_PAGE_SIZE);
2002 				pos = 1;
2003 			}
2004 
2005 			if (old_ce)
2006 				memunmap(old_ce);
2007 
2008 			ret = 0;
2009 			if (devfn < 0x80)
2010 				old_ce_phys = root_entry_lctp(&re);
2011 			else
2012 				old_ce_phys = root_entry_uctp(&re);
2013 
2014 			if (!old_ce_phys) {
2015 				if (ext && devfn == 0) {
2016 					/* No LCTP, try UCTP */
2017 					devfn = 0x7f;
2018 					continue;
2019 				} else {
2020 					goto out;
2021 				}
2022 			}
2023 
2024 			ret = -ENOMEM;
2025 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2026 					MEMREMAP_WB);
2027 			if (!old_ce)
2028 				goto out;
2029 
2030 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2031 			if (!new_ce)
2032 				goto out_unmap;
2033 
2034 			ret = 0;
2035 		}
2036 
2037 		/* Now copy the context entry */
2038 		memcpy(&ce, old_ce + idx, sizeof(ce));
2039 
2040 		if (!context_present(&ce))
2041 			continue;
2042 
2043 		did = context_domain_id(&ce);
2044 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2045 			set_bit(did, iommu->domain_ids);
2046 
2047 		set_context_copied(iommu, bus, devfn);
2048 		new_ce[idx] = ce;
2049 	}
2050 
2051 	tbl[tbl_idx + pos] = new_ce;
2052 
2053 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2054 
2055 out_unmap:
2056 	memunmap(old_ce);
2057 
2058 out:
2059 	return ret;
2060 }
2061 
2062 static int copy_translation_tables(struct intel_iommu *iommu)
2063 {
2064 	struct context_entry **ctxt_tbls;
2065 	struct root_entry *old_rt;
2066 	phys_addr_t old_rt_phys;
2067 	int ctxt_table_entries;
2068 	u64 rtaddr_reg;
2069 	int bus, ret;
2070 	bool new_ext, ext;
2071 
2072 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2073 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2074 	new_ext    = !!sm_supported(iommu);
2075 
2076 	/*
2077 	 * The RTT bit can only be changed when translation is disabled,
2078 	 * but disabling translation means to open a window for data
2079 	 * corruption. So bail out and don't copy anything if we would
2080 	 * have to change the bit.
2081 	 */
2082 	if (new_ext != ext)
2083 		return -EINVAL;
2084 
2085 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2086 	if (!iommu->copied_tables)
2087 		return -ENOMEM;
2088 
2089 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2090 	if (!old_rt_phys)
2091 		return -EINVAL;
2092 
2093 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2094 	if (!old_rt)
2095 		return -ENOMEM;
2096 
2097 	/* This is too big for the stack - allocate it from slab */
2098 	ctxt_table_entries = ext ? 512 : 256;
2099 	ret = -ENOMEM;
2100 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2101 	if (!ctxt_tbls)
2102 		goto out_unmap;
2103 
2104 	for (bus = 0; bus < 256; bus++) {
2105 		ret = copy_context_table(iommu, &old_rt[bus],
2106 					 ctxt_tbls, bus, ext);
2107 		if (ret) {
2108 			pr_err("%s: Failed to copy context table for bus %d\n",
2109 				iommu->name, bus);
2110 			continue;
2111 		}
2112 	}
2113 
2114 	spin_lock(&iommu->lock);
2115 
2116 	/* Context tables are copied, now write them to the root_entry table */
2117 	for (bus = 0; bus < 256; bus++) {
2118 		int idx = ext ? bus * 2 : bus;
2119 		u64 val;
2120 
2121 		if (ctxt_tbls[idx]) {
2122 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2123 			iommu->root_entry[bus].lo = val;
2124 		}
2125 
2126 		if (!ext || !ctxt_tbls[idx + 1])
2127 			continue;
2128 
2129 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2130 		iommu->root_entry[bus].hi = val;
2131 	}
2132 
2133 	spin_unlock(&iommu->lock);
2134 
2135 	kfree(ctxt_tbls);
2136 
2137 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2138 
2139 	ret = 0;
2140 
2141 out_unmap:
2142 	memunmap(old_rt);
2143 
2144 	return ret;
2145 }
2146 
/*
 * Boot-time setup of all DMAR units.
 *
 * First loop: for every IOMMU that is not ignored, narrow
 * intel_pasid_max_id, set up invalidation (intel_iommu_init_qi()), domain
 * tracking and the root-entry table, and either take over the translation
 * tables that are still live in hardware or disable such translation.
 * Second loop: program the root entry on every active IOMMU.
 * Third loop: set up page-request handling (when ecap_prs is set) and the
 * interrupt via dmar_set_interrupt().
 *
 * dmar_global_lock is dropped and re-taken (write) around
 * intel_iommu_enable_prq().
 *
 * Returns 0 on success. On failure all active IOMMUs are disabled and freed
 * and the error of the failing step is returned.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret;

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			iommu_disable_translation(iommu);
			continue;
		}

		/*
		 * Find the max pasid size of all IOMMU's in the system.
		 * We need to ensure the system pasid table is no bigger
		 * than the smallest supported.
		 */
		if (pasid_supported(iommu)) {
			u32 temp = 2 << ecap_pss(iommu->ecap);

			intel_pasid_max_id = min_t(u32, temp,
						   intel_pasid_max_id);
		}

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/*
		 * Translation left enabled outside of a kdump kernel is not
		 * taken over: switch it off and forget the pre-enabled state.
		 */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
			}
		}

		intel_svm_check(iommu);
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
	}

	check_tylersburg_isoch();

	/*
	 * for each drhd that is not ignored:
	 *   flush the write buffer
	 *   enable the page request queue if the hardware reports ecap_prs
	 *   set up the interrupt via dmar_set_interrupt()
	 *
	 * Translation is not enabled in this loop; intel_iommu_init() does
	 * that after init_dmars() has returned.
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		if (ecap_prs(iommu->ecap)) {
			/*
			 * Calling dmar_alloc_hwirq() with dmar_global_lock
			 * held could cause a lock race condition, so drop
			 * the lock around enabling the page request queue.
			 */
			up_write(&dmar_global_lock);
			ret = intel_iommu_enable_prq(iommu);
			down_write(&dmar_global_lock);
			if (ret)
				goto free_iommu;
		}

		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;
	}

	return 0;

free_iommu:
	/* Unwind: tear down every active IOMMU, not only the failing one. */
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}

	return ret;
}
2281 
2282 static void __init init_no_remapping_devices(void)
2283 {
2284 	struct dmar_drhd_unit *drhd;
2285 	struct device *dev;
2286 	int i;
2287 
2288 	for_each_drhd_unit(drhd) {
2289 		if (!drhd->include_all) {
2290 			for_each_active_dev_scope(drhd->devices,
2291 						  drhd->devices_cnt, i, dev)
2292 				break;
2293 			/* ignore DMAR unit if no devices exist */
2294 			if (i == drhd->devices_cnt)
2295 				drhd->ignored = 1;
2296 		}
2297 	}
2298 
2299 	for_each_active_drhd_unit(drhd) {
2300 		if (drhd->include_all)
2301 			continue;
2302 
2303 		for_each_active_dev_scope(drhd->devices,
2304 					  drhd->devices_cnt, i, dev)
2305 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2306 				break;
2307 		if (i < drhd->devices_cnt)
2308 			continue;
2309 
2310 		/* This IOMMU has *only* gfx devices. Either bypass it or
2311 		   set the gfx_mapped flag, as appropriate */
2312 		drhd->gfx_dedicated = 1;
2313 		if (disable_igfx_iommu)
2314 			drhd->ignored = 1;
2315 	}
2316 }
2317 
2318 #ifdef CONFIG_SUSPEND
2319 static int init_iommu_hw(void)
2320 {
2321 	struct dmar_drhd_unit *drhd;
2322 	struct intel_iommu *iommu = NULL;
2323 	int ret;
2324 
2325 	for_each_active_iommu(iommu, drhd) {
2326 		if (iommu->qi) {
2327 			ret = dmar_reenable_qi(iommu);
2328 			if (ret)
2329 				return ret;
2330 		}
2331 	}
2332 
2333 	for_each_iommu(iommu, drhd) {
2334 		if (drhd->ignored) {
2335 			/*
2336 			 * we always have to disable PMRs or DMA may fail on
2337 			 * this device
2338 			 */
2339 			if (force_on)
2340 				iommu_disable_protect_mem_regions(iommu);
2341 			continue;
2342 		}
2343 
2344 		iommu_flush_write_buffer(iommu);
2345 		iommu_set_root_entry(iommu);
2346 		iommu_enable_translation(iommu);
2347 		iommu_disable_protect_mem_regions(iommu);
2348 	}
2349 
2350 	return 0;
2351 }
2352 
2353 static void iommu_flush_all(void)
2354 {
2355 	struct dmar_drhd_unit *drhd;
2356 	struct intel_iommu *iommu;
2357 
2358 	for_each_active_iommu(iommu, drhd) {
2359 		iommu->flush.flush_context(iommu, 0, 0, 0,
2360 					   DMA_CCMD_GLOBAL_INVL);
2361 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2362 					 DMA_TLB_GLOBAL_FLUSH);
2363 	}
2364 }
2365 
2366 static int iommu_suspend(void)
2367 {
2368 	struct dmar_drhd_unit *drhd;
2369 	struct intel_iommu *iommu = NULL;
2370 	unsigned long flag;
2371 
2372 	iommu_flush_all();
2373 
2374 	for_each_active_iommu(iommu, drhd) {
2375 		iommu_disable_translation(iommu);
2376 
2377 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2378 
2379 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2380 			readl(iommu->reg + DMAR_FECTL_REG);
2381 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2382 			readl(iommu->reg + DMAR_FEDATA_REG);
2383 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2384 			readl(iommu->reg + DMAR_FEADDR_REG);
2385 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2386 			readl(iommu->reg + DMAR_FEUADDR_REG);
2387 
2388 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2389 	}
2390 	return 0;
2391 }
2392 
2393 static void iommu_resume(void)
2394 {
2395 	struct dmar_drhd_unit *drhd;
2396 	struct intel_iommu *iommu = NULL;
2397 	unsigned long flag;
2398 
2399 	if (init_iommu_hw()) {
2400 		if (force_on)
2401 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2402 		else
2403 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2404 		return;
2405 	}
2406 
2407 	for_each_active_iommu(iommu, drhd) {
2408 
2409 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2410 
2411 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2412 			iommu->reg + DMAR_FECTL_REG);
2413 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2414 			iommu->reg + DMAR_FEDATA_REG);
2415 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2416 			iommu->reg + DMAR_FEADDR_REG);
2417 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2418 			iommu->reg + DMAR_FEUADDR_REG);
2419 
2420 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2421 	}
2422 }
2423 
2424 static struct syscore_ops iommu_syscore_ops = {
2425 	.resume		= iommu_resume,
2426 	.suspend	= iommu_suspend,
2427 };
2428 
/* Register iommu_syscore_ops so iommu_suspend()/iommu_resume() get called. */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
2433 
2434 #else
/* Suspend/resume support is compiled out: there is nothing to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif	/* CONFIG_SUSPEND */
2437 
2438 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2439 {
2440 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2441 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2442 	    rmrr->end_address <= rmrr->base_address ||
2443 	    arch_rmrr_sanity_check(rmrr))
2444 		return -EINVAL;
2445 
2446 	return 0;
2447 }
2448 
2449 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2450 {
2451 	struct acpi_dmar_reserved_memory *rmrr;
2452 	struct dmar_rmrr_unit *rmrru;
2453 
2454 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2455 	if (rmrr_sanity_check(rmrr)) {
2456 		pr_warn(FW_BUG
2457 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2458 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2459 			   rmrr->base_address, rmrr->end_address,
2460 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2461 			   dmi_get_system_info(DMI_BIOS_VERSION),
2462 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2463 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2464 	}
2465 
2466 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2467 	if (!rmrru)
2468 		goto out;
2469 
2470 	rmrru->hdr = header;
2471 
2472 	rmrru->base_address = rmrr->base_address;
2473 	rmrru->end_address = rmrr->end_address;
2474 
2475 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2476 				((void *)rmrr) + rmrr->header.length,
2477 				&rmrru->devices_cnt);
2478 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2479 		goto free_rmrru;
2480 
2481 	list_add(&rmrru->list, &dmar_rmrr_units);
2482 
2483 	return 0;
2484 free_rmrru:
2485 	kfree(rmrru);
2486 out:
2487 	return -ENOMEM;
2488 }
2489 
2490 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2491 {
2492 	struct dmar_atsr_unit *atsru;
2493 	struct acpi_dmar_atsr *tmp;
2494 
2495 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2496 				dmar_rcu_check()) {
2497 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2498 		if (atsr->segment != tmp->segment)
2499 			continue;
2500 		if (atsr->header.length != tmp->header.length)
2501 			continue;
2502 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2503 			return atsru;
2504 	}
2505 
2506 	return NULL;
2507 }
2508 
2509 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2510 {
2511 	struct acpi_dmar_atsr *atsr;
2512 	struct dmar_atsr_unit *atsru;
2513 
2514 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2515 		return 0;
2516 
2517 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2518 	atsru = dmar_find_atsr(atsr);
2519 	if (atsru)
2520 		return 0;
2521 
2522 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2523 	if (!atsru)
2524 		return -ENOMEM;
2525 
2526 	/*
2527 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2528 	 * copy the memory content because the memory buffer will be freed
2529 	 * on return.
2530 	 */
2531 	atsru->hdr = (void *)(atsru + 1);
2532 	memcpy(atsru->hdr, hdr, hdr->length);
2533 	atsru->include_all = atsr->flags & 0x1;
2534 	if (!atsru->include_all) {
2535 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2536 				(void *)atsr + atsr->header.length,
2537 				&atsru->devices_cnt);
2538 		if (atsru->devices_cnt && atsru->devices == NULL) {
2539 			kfree(atsru);
2540 			return -ENOMEM;
2541 		}
2542 	}
2543 
2544 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2545 
2546 	return 0;
2547 }
2548 
/*
 * Release an ATSR unit: free its device scope array, then the unit itself.
 * The caller must already have unlinked it from dmar_atsr_units.
 */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
2554 
2555 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2556 {
2557 	struct acpi_dmar_atsr *atsr;
2558 	struct dmar_atsr_unit *atsru;
2559 
2560 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2561 	atsru = dmar_find_atsr(atsr);
2562 	if (atsru) {
2563 		list_del_rcu(&atsru->list);
2564 		synchronize_rcu();
2565 		intel_iommu_free_atsr(atsru);
2566 	}
2567 
2568 	return 0;
2569 }
2570 
2571 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2572 {
2573 	int i;
2574 	struct device *dev;
2575 	struct acpi_dmar_atsr *atsr;
2576 	struct dmar_atsr_unit *atsru;
2577 
2578 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2579 	atsru = dmar_find_atsr(atsr);
2580 	if (!atsru)
2581 		return 0;
2582 
2583 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2584 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2585 					  i, dev)
2586 			return -EBUSY;
2587 	}
2588 
2589 	return 0;
2590 }
2591 
2592 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2593 {
2594 	struct dmar_satc_unit *satcu;
2595 	struct acpi_dmar_satc *tmp;
2596 
2597 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2598 				dmar_rcu_check()) {
2599 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2600 		if (satc->segment != tmp->segment)
2601 			continue;
2602 		if (satc->header.length != tmp->header.length)
2603 			continue;
2604 		if (memcmp(satc, tmp, satc->header.length) == 0)
2605 			return satcu;
2606 	}
2607 
2608 	return NULL;
2609 }
2610 
2611 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2612 {
2613 	struct acpi_dmar_satc *satc;
2614 	struct dmar_satc_unit *satcu;
2615 
2616 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2617 		return 0;
2618 
2619 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2620 	satcu = dmar_find_satc(satc);
2621 	if (satcu)
2622 		return 0;
2623 
2624 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2625 	if (!satcu)
2626 		return -ENOMEM;
2627 
2628 	satcu->hdr = (void *)(satcu + 1);
2629 	memcpy(satcu->hdr, hdr, hdr->length);
2630 	satcu->atc_required = satc->flags & 0x1;
2631 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2632 					      (void *)satc + satc->header.length,
2633 					      &satcu->devices_cnt);
2634 	if (satcu->devices_cnt && !satcu->devices) {
2635 		kfree(satcu);
2636 		return -ENOMEM;
2637 	}
2638 	list_add_rcu(&satcu->list, &dmar_satc_units);
2639 
2640 	return 0;
2641 }
2642 
2643 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2644 {
2645 	struct intel_iommu *iommu = dmaru->iommu;
2646 	int ret;
2647 
2648 	/*
2649 	 * Disable translation if already enabled prior to OS handover.
2650 	 */
2651 	if (iommu->gcmd & DMA_GCMD_TE)
2652 		iommu_disable_translation(iommu);
2653 
2654 	ret = iommu_init_domains(iommu);
2655 	if (ret == 0)
2656 		ret = iommu_alloc_root_entry(iommu);
2657 	if (ret)
2658 		goto out;
2659 
2660 	intel_svm_check(iommu);
2661 
2662 	if (dmaru->ignored) {
2663 		/*
2664 		 * we always have to disable PMRs or DMA may fail on this device
2665 		 */
2666 		if (force_on)
2667 			iommu_disable_protect_mem_regions(iommu);
2668 		return 0;
2669 	}
2670 
2671 	intel_iommu_init_qi(iommu);
2672 	iommu_flush_write_buffer(iommu);
2673 
2674 	if (ecap_prs(iommu->ecap)) {
2675 		ret = intel_iommu_enable_prq(iommu);
2676 		if (ret)
2677 			goto disable_iommu;
2678 	}
2679 
2680 	ret = dmar_set_interrupt(iommu);
2681 	if (ret)
2682 		goto disable_iommu;
2683 
2684 	iommu_set_root_entry(iommu);
2685 	iommu_enable_translation(iommu);
2686 
2687 	iommu_disable_protect_mem_regions(iommu);
2688 	return 0;
2689 
2690 disable_iommu:
2691 	disable_dmar_iommu(iommu);
2692 out:
2693 	free_dmar_iommu(iommu);
2694 	return ret;
2695 }
2696 
2697 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2698 {
2699 	int ret = 0;
2700 	struct intel_iommu *iommu = dmaru->iommu;
2701 
2702 	if (!intel_iommu_enabled)
2703 		return 0;
2704 	if (iommu == NULL)
2705 		return -EINVAL;
2706 
2707 	if (insert) {
2708 		ret = intel_iommu_add(dmaru);
2709 	} else {
2710 		disable_dmar_iommu(iommu);
2711 		free_dmar_iommu(iommu);
2712 	}
2713 
2714 	return ret;
2715 }
2716 
2717 static void intel_iommu_free_dmars(void)
2718 {
2719 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2720 	struct dmar_atsr_unit *atsru, *atsr_n;
2721 	struct dmar_satc_unit *satcu, *satc_n;
2722 
2723 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2724 		list_del(&rmrru->list);
2725 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2726 		kfree(rmrru);
2727 	}
2728 
2729 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2730 		list_del(&atsru->list);
2731 		intel_iommu_free_atsr(atsru);
2732 	}
2733 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2734 		list_del(&satcu->list);
2735 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2736 		kfree(satcu);
2737 	}
2738 }
2739 
2740 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2741 {
2742 	struct dmar_satc_unit *satcu;
2743 	struct acpi_dmar_satc *satc;
2744 	struct device *tmp;
2745 	int i;
2746 
2747 	dev = pci_physfn(dev);
2748 	rcu_read_lock();
2749 
2750 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2751 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2752 		if (satc->segment != pci_domain_nr(dev->bus))
2753 			continue;
2754 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2755 			if (to_pci_dev(tmp) == dev)
2756 				goto out;
2757 	}
2758 	satcu = NULL;
2759 out:
2760 	rcu_read_unlock();
2761 	return satcu;
2762 }
2763 
2764 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2765 {
2766 	int i, ret = 1;
2767 	struct pci_bus *bus;
2768 	struct pci_dev *bridge = NULL;
2769 	struct device *tmp;
2770 	struct acpi_dmar_atsr *atsr;
2771 	struct dmar_atsr_unit *atsru;
2772 	struct dmar_satc_unit *satcu;
2773 
2774 	dev = pci_physfn(dev);
2775 	satcu = dmar_find_matched_satc_unit(dev);
2776 	if (satcu)
2777 		/*
2778 		 * This device supports ATS as it is in SATC table.
2779 		 * When IOMMU is in legacy mode, enabling ATS is done
2780 		 * automatically by HW for the device that requires
2781 		 * ATS, hence OS should not enable this device ATS
2782 		 * to avoid duplicated TLB invalidation.
2783 		 */
2784 		return !(satcu->atc_required && !sm_supported(iommu));
2785 
2786 	for (bus = dev->bus; bus; bus = bus->parent) {
2787 		bridge = bus->self;
2788 		/* If it's an integrated device, allow ATS */
2789 		if (!bridge)
2790 			return 1;
2791 		/* Connected via non-PCIe: no ATS */
2792 		if (!pci_is_pcie(bridge) ||
2793 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2794 			return 0;
2795 		/* If we found the root port, look it up in the ATSR */
2796 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2797 			break;
2798 	}
2799 
2800 	rcu_read_lock();
2801 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2802 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2803 		if (atsr->segment != pci_domain_nr(dev->bus))
2804 			continue;
2805 
2806 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2807 			if (tmp == &bridge->dev)
2808 				goto out;
2809 
2810 		if (atsru->include_all)
2811 			goto out;
2812 	}
2813 	ret = 0;
2814 out:
2815 	rcu_read_unlock();
2816 
2817 	return ret;
2818 }
2819 
/*
 * Update the device scopes of the recorded RMRR, ATSR and SATC units for a
 * PCI bus notification.
 *
 * On BUS_NOTIFY_ADD_DEVICE the device is offered to each unit's scope via
 * dmar_insert_dev_scope(); on BUS_NOTIFY_REMOVED_DEVICE it is dropped via
 * dmar_remove_dev_scope(). Other events are ignored. Nothing is done once
 * the system is running with the Intel IOMMU not enabled.
 *
 * Returns 0, or the negative error from dmar_insert_dev_scope().
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct dmar_satc_unit *satcu;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;
	struct acpi_dmar_satc *satc;

	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
		return 0;

	/* RMRR: every unit is visited; only a negative result stops the walk. */
	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	/*
	 * ATSR: units with include_all have no device scope to update. The
	 * walk stops at the first unit for which the insert returns a
	 * positive value or the remove returns non-zero (presumably meaning
	 * the device belonged to that unit - confirm against the helpers).
	 */
	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}
	/* SATC: same stop-at-first-hit handling as for ATSR above. */
	list_for_each_entry(satcu, &dmar_satc_units, list) {
		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
					(void *)satc + satc->header.length,
					satc->segment, satcu->devices,
					satcu->devices_cnt);
			if (ret > 0)
				break;
			else if (ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, satc->segment,
					satcu->devices, satcu->devices_cnt))
				break;
		}
	}

	return 0;
}
2889 
2890 static void intel_disable_iommus(void)
2891 {
2892 	struct intel_iommu *iommu = NULL;
2893 	struct dmar_drhd_unit *drhd;
2894 
2895 	for_each_iommu(iommu, drhd)
2896 		iommu_disable_translation(iommu);
2897 }
2898 
2899 void intel_iommu_shutdown(void)
2900 {
2901 	struct dmar_drhd_unit *drhd;
2902 	struct intel_iommu *iommu = NULL;
2903 
2904 	if (no_iommu || dmar_disabled)
2905 		return;
2906 
2907 	/*
2908 	 * All other CPUs were brought down, hotplug interrupts were disabled,
2909 	 * no lock and RCU checking needed anymore
2910 	 */
2911 	list_for_each_entry(drhd, &dmar_drhd_units, list) {
2912 		iommu = drhd->iommu;
2913 
2914 		/* Disable PMRs explicitly here. */
2915 		iommu_disable_protect_mem_regions(iommu);
2916 
2917 		/* Make sure the IOMMUs are switched off */
2918 		iommu_disable_translation(iommu);
2919 	}
2920 }
2921 
2922 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2923 {
2924 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2925 
2926 	return container_of(iommu_dev, struct intel_iommu, iommu);
2927 }
2928 
2929 static ssize_t version_show(struct device *dev,
2930 			    struct device_attribute *attr, char *buf)
2931 {
2932 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2933 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2934 	return sysfs_emit(buf, "%d:%d\n",
2935 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2936 }
2937 static DEVICE_ATTR_RO(version);
2938 
2939 static ssize_t address_show(struct device *dev,
2940 			    struct device_attribute *attr, char *buf)
2941 {
2942 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2943 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2944 }
2945 static DEVICE_ATTR_RO(address);
2946 
2947 static ssize_t cap_show(struct device *dev,
2948 			struct device_attribute *attr, char *buf)
2949 {
2950 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2951 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2952 }
2953 static DEVICE_ATTR_RO(cap);
2954 
2955 static ssize_t ecap_show(struct device *dev,
2956 			 struct device_attribute *attr, char *buf)
2957 {
2958 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2959 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2960 }
2961 static DEVICE_ATTR_RO(ecap);
2962 
2963 static ssize_t domains_supported_show(struct device *dev,
2964 				      struct device_attribute *attr, char *buf)
2965 {
2966 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2967 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2968 }
2969 static DEVICE_ATTR_RO(domains_supported);
2970 
2971 static ssize_t domains_used_show(struct device *dev,
2972 				 struct device_attribute *attr, char *buf)
2973 {
2974 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2975 	return sysfs_emit(buf, "%d\n",
2976 			  bitmap_weight(iommu->domain_ids,
2977 					cap_ndoms(iommu->cap)));
2978 }
2979 static DEVICE_ATTR_RO(domains_used);
2980 
/* Read-only attributes defined by the *_show() handlers above. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};

/* The attributes are grouped under the name "intel-iommu". */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};

/* NULL-terminated group list passed to iommu_device_sysfs_add(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
3000 
3001 static bool has_external_pci(void)
3002 {
3003 	struct pci_dev *pdev = NULL;
3004 
3005 	for_each_pci_dev(pdev)
3006 		if (pdev->external_facing) {
3007 			pci_dev_put(pdev);
3008 			return true;
3009 		}
3010 
3011 	return false;
3012 }
3013 
3014 static int __init platform_optin_force_iommu(void)
3015 {
3016 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3017 		return 0;
3018 
3019 	if (no_iommu || dmar_disabled)
3020 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3021 
3022 	/*
3023 	 * If Intel-IOMMU is disabled by default, we will apply identity
3024 	 * map for all devices except those marked as being untrusted.
3025 	 */
3026 	if (dmar_disabled)
3027 		iommu_set_default_passthrough(false);
3028 
3029 	dmar_disabled = 0;
3030 	no_iommu = 0;
3031 
3032 	return 1;
3033 }
3034 
3035 static int __init probe_acpi_namespace_devices(void)
3036 {
3037 	struct dmar_drhd_unit *drhd;
3038 	/* To avoid a -Wunused-but-set-variable warning. */
3039 	struct intel_iommu *iommu __maybe_unused;
3040 	struct device *dev;
3041 	int i, ret = 0;
3042 
3043 	for_each_active_iommu(iommu, drhd) {
3044 		for_each_active_dev_scope(drhd->devices,
3045 					  drhd->devices_cnt, i, dev) {
3046 			struct acpi_device_physical_node *pn;
3047 			struct acpi_device *adev;
3048 
3049 			if (dev->bus != &acpi_bus_type)
3050 				continue;
3051 
3052 			up_read(&dmar_global_lock);
3053 			adev = to_acpi_device(dev);
3054 			mutex_lock(&adev->physical_node_lock);
3055 			list_for_each_entry(pn,
3056 					    &adev->physical_node_list, node) {
3057 				ret = iommu_probe_device(pn->dev);
3058 				if (ret)
3059 					break;
3060 			}
3061 			mutex_unlock(&adev->physical_node_lock);
3062 			down_read(&dmar_global_lock);
3063 
3064 			if (ret)
3065 				return ret;
3066 		}
3067 	}
3068 
3069 	return 0;
3070 }
3071 
3072 static __init int tboot_force_iommu(void)
3073 {
3074 	if (!tboot_enabled())
3075 		return 0;
3076 
3077 	if (no_iommu || dmar_disabled)
3078 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3079 
3080 	dmar_disabled = 0;
3081 	no_iommu = 0;
3082 
3083 	return 1;
3084 }
3085 
/*
 * Main initialization entry point of the Intel IOMMU driver.
 *
 * Parses the DMAR table and device scopes, registers the bus notifier and,
 * unless the IOMMU is disabled, initializes all DMAR units via init_dmars(),
 * registers each active IOMMU with sysfs and the IOMMU core, probes ACPI
 * namespace devices and finally enables translation.
 *
 * When force_on is set (tboot or platform opt-in), failures of the table,
 * scope or DMAR initialization panic instead of returning.
 *
 * Returns 0 on success. Returns -ENODEV if the DMAR table or device scope
 * cannot be initialized or the IOMMU is disabled, or the error from
 * init_dmars(); in those cases the recorded RMRR/ATSR/SATC units are freed.
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/*
	 * Intel IOMMU is required for a TXT/tboot launch or platform
	 * opt in, so enforce that.
	 */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		    platform_optin_force_iommu();

	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	up_write(&dmar_global_lock);

	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain later when we register it under the lock.
	 */
	dmar_register_bus_notifier();

	down_write(&dmar_global_lock);

	if (!no_iommu)
		intel_iommu_debugfs_init();

	if (no_iommu || dmar_disabled) {
		/*
		 * We exit the function here to ensure IOMMU's remapping and
		 * mempool aren't setup, which means that the IOMMU's PMRs
		 * won't be disabled via the call to init_dmars(). So disable
		 * it explicitly here. The PMRs were setup by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
		 */
		if (intel_iommu_tboot_noforce) {
			for_each_iommu(iommu, drhd)
				iommu_disable_protect_mem_regions(iommu);
		}

		/*
		 * Make sure the IOMMUs are switched off, even when we
		 * boot into a kexec kernel and the previous kernel left
		 * them enabled
		 */
		intel_disable_iommus();
		goto out_free_dmar;
	}

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (list_empty(&dmar_satc_units))
		pr_info("No SATC found\n");

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_dmar;
	}
	up_write(&dmar_global_lock);

	init_iommu_pm_ops();

	/* From here on the lock is only needed for reading. */
	down_read(&dmar_global_lock);
	for_each_active_iommu(iommu, drhd) {
		/*
		 * The flush queue implementation does not perform
		 * page-selective invalidations that are required for efficient
		 * TLB flushes in virtual environments.  The benefit of batching
		 * is likely to be much lower than the overhead of synchronizing
		 * the virtual and physical IOMMU page-tables.
		 */
		if (cap_caching_mode(iommu->cap) &&
		    !first_level_by_default(iommu)) {
			pr_info_once("IOMMU batching disallowed due to virtualization\n");
			iommu_set_dma_strict();
		}
		iommu_device_sysfs_add(&iommu->iommu, NULL,
				       intel_iommu_groups,
				       "%s", iommu->name);
		/*
		 * The iommu device probe is protected by the iommu_probe_device_lock.
		 * Release the dmar_global_lock before entering the device probe path
		 * to avoid unnecessary lock order splat.
		 */
		up_read(&dmar_global_lock);
		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
		down_read(&dmar_global_lock);

		iommu_pmu_register(iommu);
	}

	if (probe_acpi_namespace_devices())
		pr_warn("ACPI name space devices didn't probe correctly\n");

	/*
	 * Finally, we enable the DMA remapping hardware. Units whose
	 * translation was already enabled before (pre-enabled) are left as
	 * they are; PMRs are disabled on every unit.
	 */
	for_each_iommu(iommu, drhd) {
		if (!drhd->ignored && !translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}
	up_read(&dmar_global_lock);

	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	intel_iommu_enabled = 1;

	return 0;

out_free_dmar:
	/* Reached with dmar_global_lock held for writing. */
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	return ret;
}
3222 
3223 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3224 {
3225 	struct device_domain_info *info = opaque;
3226 
3227 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3228 	return 0;
3229 }
3230 
3231 /*
3232  * NB - intel-iommu lacks any sort of reference counting for the users of
3233  * dependent devices.  If multiple endpoints have intersecting dependent
3234  * devices, unbinding the driver from any one of them will possibly leave
3235  * the others unable to operate.
3236  */
3237 static void domain_context_clear(struct device_domain_info *info)
3238 {
3239 	if (!dev_is_pci(info->dev)) {
3240 		domain_context_clear_one(info, info->bus, info->devfn);
3241 		return;
3242 	}
3243 
3244 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3245 			       &domain_context_clear_one_cb, info);
3246 	iommu_disable_pci_ats(info);
3247 }
3248 
3249 /*
3250  * Clear the page table pointer in context or pasid table entries so that
3251  * all DMA requests without PASID from the device are blocked. If the page
3252  * table has been set, clean up the data structures.
3253  */
3254 void device_block_translation(struct device *dev)
3255 {
3256 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3257 	struct intel_iommu *iommu = info->iommu;
3258 	unsigned long flags;
3259 
3260 	if (info->domain)
3261 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3262 
3263 	if (!dev_is_real_dma_subdevice(dev)) {
3264 		if (sm_supported(iommu))
3265 			intel_pasid_tear_down_entry(iommu, dev,
3266 						    IOMMU_NO_PASID, false);
3267 		else
3268 			domain_context_clear(info);
3269 	}
3270 
3271 	if (!info->domain)
3272 		return;
3273 
3274 	spin_lock_irqsave(&info->domain->lock, flags);
3275 	list_del(&info->link);
3276 	spin_unlock_irqrestore(&info->domain->lock, flags);
3277 
3278 	domain_detach_iommu(info->domain, iommu);
3279 	info->domain = NULL;
3280 }
3281 
/*
 * attach_dev operation of the blocking domain: block translation for @dev.
 * @domain is not used. Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
3288 
3289 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3290 					 struct device *dev, ioasid_t pasid,
3291 					 struct iommu_domain *old);
3292 
3293 static struct iommu_domain blocking_domain = {
3294 	.type = IOMMU_DOMAIN_BLOCKED,
3295 	.ops = &(const struct iommu_domain_ops) {
3296 		.attach_dev	= blocking_domain_attach_dev,
3297 		.set_dev_pasid	= blocking_domain_set_dev_pasid,
3298 	}
3299 };
3300 
3301 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3302 {
3303 	if (!intel_iommu_superpage)
3304 		return 0;
3305 
3306 	if (first_stage)
3307 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3308 
3309 	return fls(cap_super_page_val(iommu->cap));
3310 }
3311 
3312 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3313 {
3314 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3315 	struct intel_iommu *iommu = info->iommu;
3316 	struct dmar_domain *domain;
3317 	int addr_width;
3318 
3319 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3320 	if (!domain)
3321 		return ERR_PTR(-ENOMEM);
3322 
3323 	INIT_LIST_HEAD(&domain->devices);
3324 	INIT_LIST_HEAD(&domain->dev_pasids);
3325 	INIT_LIST_HEAD(&domain->cache_tags);
3326 	spin_lock_init(&domain->lock);
3327 	spin_lock_init(&domain->cache_lock);
3328 	xa_init(&domain->iommu_array);
3329 
3330 	domain->nid = dev_to_node(dev);
3331 	domain->use_first_level = first_stage;
3332 
3333 	/* calculate the address width */
3334 	addr_width = agaw_to_width(iommu->agaw);
3335 	if (addr_width > cap_mgaw(iommu->cap))
3336 		addr_width = cap_mgaw(iommu->cap);
3337 	domain->gaw = addr_width;
3338 	domain->agaw = iommu->agaw;
3339 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3340 
3341 	/* iommu memory access coherency */
3342 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3343 
3344 	/* pagesize bitmap */
3345 	domain->domain.pgsize_bitmap = SZ_4K;
3346 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3347 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3348 
3349 	/*
3350 	 * IOVA aperture: First-level translation restricts the input-address
3351 	 * to a canonical address (i.e., address bits 63:N have the same value
3352 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3353 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3354 	 */
3355 	domain->domain.geometry.force_aperture = true;
3356 	domain->domain.geometry.aperture_start = 0;
3357 	if (first_stage)
3358 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3359 	else
3360 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3361 
3362 	/* always allocate the top pgd */
3363 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3364 	if (!domain->pgd) {
3365 		kfree(domain);
3366 		return ERR_PTR(-ENOMEM);
3367 	}
3368 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3369 
3370 	return domain;
3371 }
3372 
3373 static struct iommu_domain *
3374 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3375 				      const struct iommu_user_data *user_data)
3376 {
3377 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3378 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3379 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3380 	struct intel_iommu *iommu = info->iommu;
3381 	struct dmar_domain *dmar_domain;
3382 	struct iommu_domain *domain;
3383 	bool first_stage;
3384 
3385 	if (flags &
3386 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
3387 	       IOMMU_HWPT_ALLOC_PASID)))
3388 		return ERR_PTR(-EOPNOTSUPP);
3389 	if (nested_parent && !nested_supported(iommu))
3390 		return ERR_PTR(-EOPNOTSUPP);
3391 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3392 		return ERR_PTR(-EOPNOTSUPP);
3393 
3394 	/*
3395 	 * Always allocate the guest compatible page table unless
3396 	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3397 	 * is specified.
3398 	 */
3399 	if (nested_parent || dirty_tracking) {
3400 		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3401 			return ERR_PTR(-EOPNOTSUPP);
3402 		first_stage = false;
3403 	} else {
3404 		first_stage = first_level_by_default(iommu);
3405 	}
3406 
3407 	dmar_domain = paging_domain_alloc(dev, first_stage);
3408 	if (IS_ERR(dmar_domain))
3409 		return ERR_CAST(dmar_domain);
3410 	domain = &dmar_domain->domain;
3411 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3412 	domain->owner = &intel_iommu_ops;
3413 	domain->ops = intel_iommu_ops.default_domain_ops;
3414 
3415 	if (nested_parent) {
3416 		dmar_domain->nested_parent = true;
3417 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3418 		spin_lock_init(&dmar_domain->s1_lock);
3419 	}
3420 
3421 	if (dirty_tracking) {
3422 		if (dmar_domain->use_first_level) {
3423 			iommu_domain_free(domain);
3424 			return ERR_PTR(-EOPNOTSUPP);
3425 		}
3426 		domain->dirty_ops = &intel_dirty_ops;
3427 	}
3428 
3429 	return domain;
3430 }
3431 
3432 static void intel_iommu_domain_free(struct iommu_domain *domain)
3433 {
3434 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3435 
3436 	WARN_ON(dmar_domain->nested_parent &&
3437 		!list_empty(&dmar_domain->s1_domains));
3438 	domain_exit(dmar_domain);
3439 }
3440 
3441 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3442 {
3443 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3444 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3445 	struct intel_iommu *iommu = info->iommu;
3446 	int addr_width;
3447 
3448 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3449 		return -EPERM;
3450 
3451 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3452 		return -EINVAL;
3453 
3454 	if (domain->dirty_ops && !ssads_supported(iommu))
3455 		return -EINVAL;
3456 
3457 	if (dmar_domain->iommu_coherency !=
3458 			iommu_paging_structure_coherency(iommu))
3459 		return -EINVAL;
3460 
3461 	if (dmar_domain->iommu_superpage !=
3462 			iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3463 		return -EINVAL;
3464 
3465 	if (dmar_domain->use_first_level &&
3466 	    (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3467 		return -EINVAL;
3468 
3469 	/* check if this iommu agaw is sufficient for max mapped address */
3470 	addr_width = agaw_to_width(iommu->agaw);
3471 	if (addr_width > cap_mgaw(iommu->cap))
3472 		addr_width = cap_mgaw(iommu->cap);
3473 
3474 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3475 		return -EINVAL;
3476 
3477 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3478 	    context_copied(iommu, info->bus, info->devfn))
3479 		return intel_pasid_setup_sm_context(dev);
3480 
3481 	return 0;
3482 }
3483 
/*
 * Attach @dev to the paging domain @domain.
 *
 * Translation for the device is blocked first, before the compatibility
 * check runs. Returns 0 on success or the error reported by
 * paging_domain_compatible() / dmar_domain_attach_device().
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int err;

	device_block_translation(dev);

	err = paging_domain_compatible(domain, dev);
	if (!err)
		err = dmar_domain_attach_device(to_dmar_domain(domain), dev);

	return err;
}
3497 
3498 static int intel_iommu_map(struct iommu_domain *domain,
3499 			   unsigned long iova, phys_addr_t hpa,
3500 			   size_t size, int iommu_prot, gfp_t gfp)
3501 {
3502 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3503 	u64 max_addr;
3504 	int prot = 0;
3505 
3506 	if (iommu_prot & IOMMU_READ)
3507 		prot |= DMA_PTE_READ;
3508 	if (iommu_prot & IOMMU_WRITE)
3509 		prot |= DMA_PTE_WRITE;
3510 	if (dmar_domain->set_pte_snp)
3511 		prot |= DMA_PTE_SNP;
3512 
3513 	max_addr = iova + size;
3514 	if (dmar_domain->max_addr < max_addr) {
3515 		u64 end;
3516 
3517 		/* check if minimum agaw is sufficient for mapped address */
3518 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3519 		if (end < max_addr) {
3520 			pr_err("%s: iommu width (%d) is not "
3521 			       "sufficient for the mapped address (%llx)\n",
3522 			       __func__, dmar_domain->gaw, max_addr);
3523 			return -EFAULT;
3524 		}
3525 		dmar_domain->max_addr = max_addr;
3526 	}
3527 	/* Round up size to next multiple of PAGE_SIZE, if it and
3528 	   the low bits of hpa would take us onto the next page */
3529 	size = aligned_nrpages(hpa, size);
3530 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3531 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3532 }
3533 
3534 static int intel_iommu_map_pages(struct iommu_domain *domain,
3535 				 unsigned long iova, phys_addr_t paddr,
3536 				 size_t pgsize, size_t pgcount,
3537 				 int prot, gfp_t gfp, size_t *mapped)
3538 {
3539 	unsigned long pgshift = __ffs(pgsize);
3540 	size_t size = pgcount << pgshift;
3541 	int ret;
3542 
3543 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3544 		return -EINVAL;
3545 
3546 	if (!IS_ALIGNED(iova | paddr, pgsize))
3547 		return -EINVAL;
3548 
3549 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3550 	if (!ret && mapped)
3551 		*mapped = size;
3552 
3553 	return ret;
3554 }
3555 
3556 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3557 				unsigned long iova, size_t size,
3558 				struct iommu_iotlb_gather *gather)
3559 {
3560 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3561 	unsigned long start_pfn, last_pfn;
3562 	int level = 0;
3563 
3564 	/* Cope with horrid API which requires us to unmap more than the
3565 	   size argument if it happens to be a large-page mapping. */
3566 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3567 				     &level, GFP_ATOMIC)))
3568 		return 0;
3569 
3570 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3571 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3572 
3573 	start_pfn = iova >> VTD_PAGE_SHIFT;
3574 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3575 
3576 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3577 
3578 	if (dmar_domain->max_addr == iova + size)
3579 		dmar_domain->max_addr = iova;
3580 
3581 	/*
3582 	 * We do not use page-selective IOTLB invalidation in flush queue,
3583 	 * so there is no need to track page and sync iotlb.
3584 	 */
3585 	if (!iommu_iotlb_gather_queued(gather))
3586 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3587 
3588 	return size;
3589 }
3590 
3591 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3592 				      unsigned long iova,
3593 				      size_t pgsize, size_t pgcount,
3594 				      struct iommu_iotlb_gather *gather)
3595 {
3596 	unsigned long pgshift = __ffs(pgsize);
3597 	size_t size = pgcount << pgshift;
3598 
3599 	return intel_iommu_unmap(domain, iova, size, gather);
3600 }
3601 
3602 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3603 				 struct iommu_iotlb_gather *gather)
3604 {
3605 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3606 			      gather->end, list_empty(&gather->freelist));
3607 	iommu_put_pages_list(&gather->freelist);
3608 }
3609 
3610 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3611 					    dma_addr_t iova)
3612 {
3613 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3614 	struct dma_pte *pte;
3615 	int level = 0;
3616 	u64 phys = 0;
3617 
3618 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3619 			     GFP_ATOMIC);
3620 	if (pte && dma_pte_present(pte))
3621 		phys = dma_pte_addr(pte) +
3622 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3623 						VTD_PAGE_SHIFT) - 1));
3624 
3625 	return phys;
3626 }
3627 
3628 static bool domain_support_force_snooping(struct dmar_domain *domain)
3629 {
3630 	struct device_domain_info *info;
3631 	bool support = true;
3632 
3633 	assert_spin_locked(&domain->lock);
3634 	list_for_each_entry(info, &domain->devices, link) {
3635 		if (!ecap_sc_support(info->iommu->ecap)) {
3636 			support = false;
3637 			break;
3638 		}
3639 	}
3640 
3641 	return support;
3642 }
3643 
3644 static void domain_set_force_snooping(struct dmar_domain *domain)
3645 {
3646 	struct device_domain_info *info;
3647 
3648 	assert_spin_locked(&domain->lock);
3649 	/*
3650 	 * Second level page table supports per-PTE snoop control. The
3651 	 * iommu_map() interface will handle this by setting SNP bit.
3652 	 */
3653 	if (!domain->use_first_level) {
3654 		domain->set_pte_snp = true;
3655 		return;
3656 	}
3657 
3658 	list_for_each_entry(info, &domain->devices, link)
3659 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3660 						     IOMMU_NO_PASID);
3661 }
3662 
3663 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3664 {
3665 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3666 	unsigned long flags;
3667 
3668 	if (dmar_domain->force_snooping)
3669 		return true;
3670 
3671 	spin_lock_irqsave(&dmar_domain->lock, flags);
3672 	if (!domain_support_force_snooping(dmar_domain) ||
3673 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3674 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3675 		return false;
3676 	}
3677 
3678 	domain_set_force_snooping(dmar_domain);
3679 	dmar_domain->force_snooping = true;
3680 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3681 
3682 	return true;
3683 }
3684 
3685 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3686 {
3687 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3688 
3689 	switch (cap) {
3690 	case IOMMU_CAP_CACHE_COHERENCY:
3691 	case IOMMU_CAP_DEFERRED_FLUSH:
3692 		return true;
3693 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3694 		return dmar_platform_optin();
3695 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3696 		return ecap_sc_support(info->iommu->ecap);
3697 	case IOMMU_CAP_DIRTY_TRACKING:
3698 		return ssads_supported(info->iommu);
3699 	default:
3700 		return false;
3701 	}
3702 }
3703 
3704 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3705 {
3706 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3707 	struct device_domain_info *info;
3708 	struct intel_iommu *iommu;
3709 	u8 bus, devfn;
3710 	int ret;
3711 
3712 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3713 	if (!iommu || !iommu->iommu.ops)
3714 		return ERR_PTR(-ENODEV);
3715 
3716 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3717 	if (!info)
3718 		return ERR_PTR(-ENOMEM);
3719 
3720 	if (dev_is_real_dma_subdevice(dev)) {
3721 		info->bus = pdev->bus->number;
3722 		info->devfn = pdev->devfn;
3723 		info->segment = pci_domain_nr(pdev->bus);
3724 	} else {
3725 		info->bus = bus;
3726 		info->devfn = devfn;
3727 		info->segment = iommu->segment;
3728 	}
3729 
3730 	info->dev = dev;
3731 	info->iommu = iommu;
3732 	if (dev_is_pci(dev)) {
3733 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3734 		    pci_ats_supported(pdev) &&
3735 		    dmar_ats_supported(pdev, iommu)) {
3736 			info->ats_supported = 1;
3737 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3738 
3739 			/*
3740 			 * For IOMMU that supports device IOTLB throttling
3741 			 * (DIT), we assign PFSID to the invalidation desc
3742 			 * of a VF such that IOMMU HW can gauge queue depth
3743 			 * at PF level. If DIT is not set, PFSID will be
3744 			 * treated as reserved, which should be set to 0.
3745 			 */
3746 			if (ecap_dit(iommu->ecap))
3747 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3748 			info->ats_qdep = pci_ats_queue_depth(pdev);
3749 		}
3750 		if (sm_supported(iommu)) {
3751 			if (pasid_supported(iommu)) {
3752 				int features = pci_pasid_features(pdev);
3753 
3754 				if (features >= 0)
3755 					info->pasid_supported = features | 1;
3756 			}
3757 
3758 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3759 			    pci_pri_supported(pdev))
3760 				info->pri_supported = 1;
3761 		}
3762 	}
3763 
3764 	dev_iommu_priv_set(dev, info);
3765 	if (pdev && pci_ats_supported(pdev)) {
3766 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3767 		ret = device_rbtree_insert(iommu, info);
3768 		if (ret)
3769 			goto free;
3770 	}
3771 
3772 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3773 		ret = intel_pasid_alloc_table(dev);
3774 		if (ret) {
3775 			dev_err(dev, "PASID table allocation failed\n");
3776 			goto clear_rbtree;
3777 		}
3778 
3779 		if (!context_copied(iommu, info->bus, info->devfn)) {
3780 			ret = intel_pasid_setup_sm_context(dev);
3781 			if (ret)
3782 				goto free_table;
3783 		}
3784 	}
3785 
3786 	intel_iommu_debugfs_create_dev(info);
3787 
3788 	return &iommu->iommu;
3789 free_table:
3790 	intel_pasid_free_table(dev);
3791 clear_rbtree:
3792 	device_rbtree_remove(info);
3793 free:
3794 	kfree(info);
3795 
3796 	return ERR_PTR(ret);
3797 }
3798 
3799 static void intel_iommu_probe_finalize(struct device *dev)
3800 {
3801 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3802 	struct intel_iommu *iommu = info->iommu;
3803 
3804 	/*
3805 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3806 	 * device is undefined if you enable PASID support after ATS support.
3807 	 * So always enable PASID support on devices which have it, even if
3808 	 * we can't yet know if we're ever going to use it.
3809 	 */
3810 	if (info->pasid_supported &&
3811 	    !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
3812 		info->pasid_enabled = 1;
3813 
3814 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev))
3815 		iommu_enable_pci_ats(info);
3816 	iommu_enable_pci_pri(info);
3817 }
3818 
3819 static void intel_iommu_release_device(struct device *dev)
3820 {
3821 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3822 	struct intel_iommu *iommu = info->iommu;
3823 
3824 	iommu_disable_pci_pri(info);
3825 	iommu_disable_pci_ats(info);
3826 
3827 	if (info->pasid_enabled) {
3828 		pci_disable_pasid(to_pci_dev(dev));
3829 		info->pasid_enabled = 0;
3830 	}
3831 
3832 	mutex_lock(&iommu->iopf_lock);
3833 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3834 		device_rbtree_remove(info);
3835 	mutex_unlock(&iommu->iopf_lock);
3836 
3837 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3838 	    !context_copied(iommu, info->bus, info->devfn))
3839 		intel_pasid_teardown_sm_context(dev);
3840 
3841 	intel_pasid_free_table(dev);
3842 	intel_iommu_debugfs_remove_dev(info);
3843 	kfree(info);
3844 }
3845 
3846 static void intel_iommu_get_resv_regions(struct device *device,
3847 					 struct list_head *head)
3848 {
3849 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3850 	struct iommu_resv_region *reg;
3851 	struct dmar_rmrr_unit *rmrr;
3852 	struct device *i_dev;
3853 	int i;
3854 
3855 	rcu_read_lock();
3856 	for_each_rmrr_units(rmrr) {
3857 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3858 					  i, i_dev) {
3859 			struct iommu_resv_region *resv;
3860 			enum iommu_resv_type type;
3861 			size_t length;
3862 
3863 			if (i_dev != device &&
3864 			    !is_downstream_to_pci_bridge(device, i_dev))
3865 				continue;
3866 
3867 			length = rmrr->end_address - rmrr->base_address + 1;
3868 
3869 			type = device_rmrr_is_relaxable(device) ?
3870 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3871 
3872 			resv = iommu_alloc_resv_region(rmrr->base_address,
3873 						       length, prot, type,
3874 						       GFP_ATOMIC);
3875 			if (!resv)
3876 				break;
3877 
3878 			list_add_tail(&resv->list, head);
3879 		}
3880 	}
3881 	rcu_read_unlock();
3882 
3883 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3884 	if (dev_is_pci(device)) {
3885 		struct pci_dev *pdev = to_pci_dev(device);
3886 
3887 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3888 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3889 					IOMMU_RESV_DIRECT_RELAXABLE,
3890 					GFP_KERNEL);
3891 			if (reg)
3892 				list_add_tail(&reg->list, head);
3893 		}
3894 	}
3895 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3896 
3897 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3898 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3899 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3900 	if (!reg)
3901 		return;
3902 	list_add_tail(&reg->list, head);
3903 }
3904 
/*
 * Return the IOMMU group for @dev: PCI devices go through
 * pci_device_group(), all other devices through generic_device_group().
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
3911 
3912 int intel_iommu_enable_iopf(struct device *dev)
3913 {
3914 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3915 	struct intel_iommu *iommu = info->iommu;
3916 	int ret;
3917 
3918 	if (!info->pri_enabled)
3919 		return -ENODEV;
3920 
3921 	if (info->iopf_refcount) {
3922 		info->iopf_refcount++;
3923 		return 0;
3924 	}
3925 
3926 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3927 	if (ret)
3928 		return ret;
3929 
3930 	info->iopf_refcount = 1;
3931 
3932 	return 0;
3933 }
3934 
3935 void intel_iommu_disable_iopf(struct device *dev)
3936 {
3937 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3938 	struct intel_iommu *iommu = info->iommu;
3939 
3940 	if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
3941 		return;
3942 
3943 	if (--info->iopf_refcount)
3944 		return;
3945 
3946 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3947 }
3948 
3949 static int
3950 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
3951 {
3952 	switch (feat) {
3953 	case IOMMU_DEV_FEAT_IOPF:
3954 		return intel_iommu_enable_iopf(dev);
3955 
3956 	case IOMMU_DEV_FEAT_SVA:
3957 		return 0;
3958 
3959 	default:
3960 		return -ENODEV;
3961 	}
3962 }
3963 
3964 static int
3965 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
3966 {
3967 	switch (feat) {
3968 	case IOMMU_DEV_FEAT_IOPF:
3969 		intel_iommu_disable_iopf(dev);
3970 		return 0;
3971 
3972 	case IOMMU_DEV_FEAT_SVA:
3973 		return 0;
3974 
3975 	default:
3976 		return -ENODEV;
3977 	}
3978 }
3979 
3980 static bool intel_iommu_is_attach_deferred(struct device *dev)
3981 {
3982 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3983 
3984 	return translation_pre_enabled(info->iommu) && !info->domain;
3985 }
3986 
3987 /*
3988  * Check that the device does not live on an external facing PCI port that is
3989  * marked as untrusted. Such devices should not be able to apply quirks and
3990  * thus not be able to bypass the IOMMU restrictions.
3991  */
3992 static bool risky_device(struct pci_dev *pdev)
3993 {
3994 	if (pdev->untrusted) {
3995 		pci_info(pdev,
3996 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
3997 			 pdev->vendor, pdev->device);
3998 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
3999 		return true;
4000 	}
4001 	return false;
4002 }
4003 
4004 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4005 				      unsigned long iova, size_t size)
4006 {
4007 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4008 
4009 	return 0;
4010 }
4011 
4012 void domain_remove_dev_pasid(struct iommu_domain *domain,
4013 			     struct device *dev, ioasid_t pasid)
4014 {
4015 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4016 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4017 	struct intel_iommu *iommu = info->iommu;
4018 	struct dmar_domain *dmar_domain;
4019 	unsigned long flags;
4020 
4021 	if (!domain)
4022 		return;
4023 
4024 	/* Identity domain has no meta data for pasid. */
4025 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
4026 		return;
4027 
4028 	dmar_domain = to_dmar_domain(domain);
4029 	spin_lock_irqsave(&dmar_domain->lock, flags);
4030 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4031 		if (curr->dev == dev && curr->pasid == pasid) {
4032 			list_del(&curr->link_domain);
4033 			dev_pasid = curr;
4034 			break;
4035 		}
4036 	}
4037 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4038 
4039 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4040 	domain_detach_iommu(dmar_domain, iommu);
4041 	if (!WARN_ON_ONCE(!dev_pasid)) {
4042 		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4043 		kfree(dev_pasid);
4044 	}
4045 }
4046 
4047 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4048 					 struct device *dev, ioasid_t pasid,
4049 					 struct iommu_domain *old)
4050 {
4051 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4052 
4053 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4054 	domain_remove_dev_pasid(old, dev, pasid);
4055 
4056 	return 0;
4057 }
4058 
4059 struct dev_pasid_info *
4060 domain_add_dev_pasid(struct iommu_domain *domain,
4061 		     struct device *dev, ioasid_t pasid)
4062 {
4063 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4064 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4065 	struct intel_iommu *iommu = info->iommu;
4066 	struct dev_pasid_info *dev_pasid;
4067 	unsigned long flags;
4068 	int ret;
4069 
4070 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4071 	if (!dev_pasid)
4072 		return ERR_PTR(-ENOMEM);
4073 
4074 	ret = domain_attach_iommu(dmar_domain, iommu);
4075 	if (ret)
4076 		goto out_free;
4077 
4078 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4079 	if (ret)
4080 		goto out_detach_iommu;
4081 
4082 	dev_pasid->dev = dev;
4083 	dev_pasid->pasid = pasid;
4084 	spin_lock_irqsave(&dmar_domain->lock, flags);
4085 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4086 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4087 
4088 	return dev_pasid;
4089 out_detach_iommu:
4090 	domain_detach_iommu(dmar_domain, iommu);
4091 out_free:
4092 	kfree(dev_pasid);
4093 	return ERR_PTR(ret);
4094 }
4095 
4096 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4097 				     struct device *dev, ioasid_t pasid,
4098 				     struct iommu_domain *old)
4099 {
4100 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4101 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4102 	struct intel_iommu *iommu = info->iommu;
4103 	struct dev_pasid_info *dev_pasid;
4104 	int ret;
4105 
4106 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4107 		return -EINVAL;
4108 
4109 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4110 		return -EOPNOTSUPP;
4111 
4112 	if (domain->dirty_ops)
4113 		return -EINVAL;
4114 
4115 	if (context_copied(iommu, info->bus, info->devfn))
4116 		return -EBUSY;
4117 
4118 	ret = paging_domain_compatible(domain, dev);
4119 	if (ret)
4120 		return ret;
4121 
4122 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4123 	if (IS_ERR(dev_pasid))
4124 		return PTR_ERR(dev_pasid);
4125 
4126 	if (dmar_domain->use_first_level)
4127 		ret = domain_setup_first_level(iommu, dmar_domain,
4128 					       dev, pasid, old);
4129 	else
4130 		ret = domain_setup_second_level(iommu, dmar_domain,
4131 						dev, pasid, old);
4132 	if (ret)
4133 		goto out_remove_dev_pasid;
4134 
4135 	domain_remove_dev_pasid(old, dev, pasid);
4136 
4137 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4138 
4139 	return 0;
4140 
4141 out_remove_dev_pasid:
4142 	domain_remove_dev_pasid(domain, dev, pasid);
4143 	return ret;
4144 }
4145 
4146 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4147 {
4148 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4149 	struct intel_iommu *iommu = info->iommu;
4150 	struct iommu_hw_info_vtd *vtd;
4151 
4152 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4153 	if (!vtd)
4154 		return ERR_PTR(-ENOMEM);
4155 
4156 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4157 	vtd->cap_reg = iommu->cap;
4158 	vtd->ecap_reg = iommu->ecap;
4159 	*length = sizeof(*vtd);
4160 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4161 	return vtd;
4162 }
4163 
4164 /*
4165  * Set dirty tracking for the device list of a domain. The caller must
4166  * hold the domain->lock when calling it.
4167  */
4168 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4169 {
4170 	struct device_domain_info *info;
4171 	int ret = 0;
4172 
4173 	list_for_each_entry(info, devices, link) {
4174 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4175 						       IOMMU_NO_PASID, enable);
4176 		if (ret)
4177 			break;
4178 	}
4179 
4180 	return ret;
4181 }
4182 
4183 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4184 					    bool enable)
4185 {
4186 	struct dmar_domain *s1_domain;
4187 	unsigned long flags;
4188 	int ret;
4189 
4190 	spin_lock(&domain->s1_lock);
4191 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4192 		spin_lock_irqsave(&s1_domain->lock, flags);
4193 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4194 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4195 		if (ret)
4196 			goto err_unwind;
4197 	}
4198 	spin_unlock(&domain->s1_lock);
4199 	return 0;
4200 
4201 err_unwind:
4202 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4203 		spin_lock_irqsave(&s1_domain->lock, flags);
4204 		device_set_dirty_tracking(&s1_domain->devices,
4205 					  domain->dirty_tracking);
4206 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4207 	}
4208 	spin_unlock(&domain->s1_lock);
4209 	return ret;
4210 }
4211 
4212 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4213 					  bool enable)
4214 {
4215 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4216 	int ret;
4217 
4218 	spin_lock(&dmar_domain->lock);
4219 	if (dmar_domain->dirty_tracking == enable)
4220 		goto out_unlock;
4221 
4222 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4223 	if (ret)
4224 		goto err_unwind;
4225 
4226 	if (dmar_domain->nested_parent) {
4227 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4228 		if (ret)
4229 			goto err_unwind;
4230 	}
4231 
4232 	dmar_domain->dirty_tracking = enable;
4233 out_unlock:
4234 	spin_unlock(&dmar_domain->lock);
4235 
4236 	return 0;
4237 
4238 err_unwind:
4239 	device_set_dirty_tracking(&dmar_domain->devices,
4240 				  dmar_domain->dirty_tracking);
4241 	spin_unlock(&dmar_domain->lock);
4242 	return ret;
4243 }
4244 
4245 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4246 					    unsigned long iova, size_t size,
4247 					    unsigned long flags,
4248 					    struct iommu_dirty_bitmap *dirty)
4249 {
4250 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4251 	unsigned long end = iova + size - 1;
4252 	unsigned long pgsize;
4253 
4254 	/*
4255 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4256 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4257 	 * have occurred when we stopped dirty tracking. This ensures that we
4258 	 * never inherit dirtied bits from a previous cycle.
4259 	 */
4260 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4261 		return -EINVAL;
4262 
4263 	do {
4264 		struct dma_pte *pte;
4265 		int lvl = 0;
4266 
4267 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4268 				     GFP_ATOMIC);
4269 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4270 		if (!pte || !dma_pte_present(pte)) {
4271 			iova += pgsize;
4272 			continue;
4273 		}
4274 
4275 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4276 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4277 		iova += pgsize;
4278 	} while (iova < end);
4279 
4280 	return 0;
4281 }
4282 
4283 static const struct iommu_dirty_ops intel_dirty_ops = {
4284 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4285 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4286 };
4287 
4288 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4289 {
4290 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4291 	struct intel_iommu *iommu = info->iommu;
4292 	struct context_entry *context;
4293 
4294 	spin_lock(&iommu->lock);
4295 	context = iommu_context_addr(iommu, bus, devfn, 1);
4296 	if (!context) {
4297 		spin_unlock(&iommu->lock);
4298 		return -ENOMEM;
4299 	}
4300 
4301 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4302 		spin_unlock(&iommu->lock);
4303 		return 0;
4304 	}
4305 
4306 	copied_context_tear_down(iommu, context, bus, devfn);
4307 	context_clear_entry(context);
4308 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4309 
4310 	/*
4311 	 * In pass through mode, AW must be programmed to indicate the largest
4312 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4313 	 */
4314 	context_set_address_width(context, iommu->msagaw);
4315 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4316 	context_set_fault_enable(context);
4317 	context_set_present(context);
4318 	if (!ecap_coherent(iommu->ecap))
4319 		clflush_cache_range(context, sizeof(*context));
4320 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4321 	spin_unlock(&iommu->lock);
4322 
4323 	return 0;
4324 }
4325 
4326 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4327 {
4328 	struct device *dev = data;
4329 
4330 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4331 }
4332 
4333 static int device_setup_pass_through(struct device *dev)
4334 {
4335 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4336 
4337 	if (!dev_is_pci(dev))
4338 		return context_setup_pass_through(dev, info->bus, info->devfn);
4339 
4340 	return pci_for_each_dma_alias(to_pci_dev(dev),
4341 				      context_setup_pass_through_cb, dev);
4342 }
4343 
4344 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4345 {
4346 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4347 	struct intel_iommu *iommu = info->iommu;
4348 	int ret;
4349 
4350 	device_block_translation(dev);
4351 
4352 	if (dev_is_real_dma_subdevice(dev))
4353 		return 0;
4354 
4355 	if (sm_supported(iommu))
4356 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4357 	else
4358 		ret = device_setup_pass_through(dev);
4359 
4360 	return ret;
4361 }
4362 
4363 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4364 					 struct device *dev, ioasid_t pasid,
4365 					 struct iommu_domain *old)
4366 {
4367 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4368 	struct intel_iommu *iommu = info->iommu;
4369 	int ret;
4370 
4371 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4372 		return -EOPNOTSUPP;
4373 
4374 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4375 	if (ret)
4376 		return ret;
4377 
4378 	domain_remove_dev_pasid(old, dev, pasid);
4379 	return 0;
4380 }
4381 
4382 static struct iommu_domain identity_domain = {
4383 	.type = IOMMU_DOMAIN_IDENTITY,
4384 	.ops = &(const struct iommu_domain_ops) {
4385 		.attach_dev	= identity_domain_attach_dev,
4386 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4387 	},
4388 };
4389 
4390 const struct iommu_ops intel_iommu_ops = {
4391 	.blocked_domain		= &blocking_domain,
4392 	.release_domain		= &blocking_domain,
4393 	.identity_domain	= &identity_domain,
4394 	.capable		= intel_iommu_capable,
4395 	.hw_info		= intel_iommu_hw_info,
4396 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4397 	.domain_alloc_sva	= intel_svm_domain_alloc,
4398 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4399 	.probe_device		= intel_iommu_probe_device,
4400 	.probe_finalize		= intel_iommu_probe_finalize,
4401 	.release_device		= intel_iommu_release_device,
4402 	.get_resv_regions	= intel_iommu_get_resv_regions,
4403 	.device_group		= intel_iommu_device_group,
4404 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4405 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4406 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4407 	.def_domain_type	= device_def_domain_type,
4408 	.pgsize_bitmap		= SZ_4K,
4409 	.page_response		= intel_iommu_page_response,
4410 	.default_domain_ops = &(const struct iommu_domain_ops) {
4411 		.attach_dev		= intel_iommu_attach_device,
4412 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4413 		.map_pages		= intel_iommu_map_pages,
4414 		.unmap_pages		= intel_iommu_unmap_pages,
4415 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4416 		.flush_iotlb_all        = intel_flush_iotlb_all,
4417 		.iotlb_sync		= intel_iommu_tlb_sync,
4418 		.iova_to_phys		= intel_iommu_iova_to_phys,
4419 		.free			= intel_iommu_domain_free,
4420 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4421 	}
4422 };
4423 
4424 static void quirk_iommu_igfx(struct pci_dev *dev)
4425 {
4426 	if (risky_device(dev))
4427 		return;
4428 
4429 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4430 	disable_igfx_iommu = 1;
4431 }
4432 
4433 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4434 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4435 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4436 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4437 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4438 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4439 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4440 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4441 
4442 /* QM57/QS57 integrated gfx malfunctions with dmar */
4443 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4444 
4445 /* Broadwell igfx malfunctions with dmar */
4446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4450 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4451 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4452 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4453 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4454 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4455 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4456 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4457 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4458 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4459 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4460 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4461 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4462 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4463 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4464 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4465 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4466 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4467 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4468 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4469 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4470 
4471 static void quirk_iommu_rwbf(struct pci_dev *dev)
4472 {
4473 	if (risky_device(dev))
4474 		return;
4475 
4476 	/*
4477 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4478 	 * but needs it. Same seems to hold for the desktop versions.
4479 	 */
4480 	pci_info(dev, "Forcing write-buffer flush capability\n");
4481 	rwbf_quirk = 1;
4482 }
4483 
4484 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4485 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4486 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4487 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4488 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4489 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4490 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4491 
/*
 * GGC: PCI config-space word read by quirk_calpella_no_shadow_gtt().
 * The memory-size field occupies bits 11:8; GGC_MEMORY_FIELD() places a
 * field value at that position.
 */
#define GGC				0x52
#define GGC_MEMORY_FIELD(val)		((val) << 8)

#define GGC_MEMORY_SIZE_MASK		GGC_MEMORY_FIELD(0xf)
#define GGC_MEMORY_SIZE_NONE		GGC_MEMORY_FIELD(0x0)
#define GGC_MEMORY_SIZE_1M		GGC_MEMORY_FIELD(0x1)
#define GGC_MEMORY_SIZE_2M		GGC_MEMORY_FIELD(0x3)
#define GGC_MEMORY_VT_ENABLED		GGC_MEMORY_FIELD(0x8)
#define GGC_MEMORY_SIZE_2M_VT		GGC_MEMORY_FIELD(0x9)
#define GGC_MEMORY_SIZE_3M_VT		GGC_MEMORY_FIELD(0xa)
#define GGC_MEMORY_SIZE_4M_VT		GGC_MEMORY_FIELD(0xb)
4501 
4502 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4503 {
4504 	unsigned short ggc;
4505 
4506 	if (risky_device(dev))
4507 		return;
4508 
4509 	if (pci_read_config_word(dev, GGC, &ggc))
4510 		return;
4511 
4512 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4513 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4514 		disable_igfx_iommu = 1;
4515 	} else if (!disable_igfx_iommu) {
4516 		/* we have to ensure the gfx device is idle before we flush */
4517 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4518 		iommu_set_dma_strict();
4519 	}
4520 }
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4524 
4525 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4526 {
4527 	unsigned short ver;
4528 
4529 	if (!IS_GFX_DEVICE(dev))
4530 		return;
4531 
4532 	ver = (dev->device >> 8) & 0xff;
4533 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4534 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4535 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4536 		return;
4537 
4538 	if (risky_device(dev))
4539 		return;
4540 
4541 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4542 	iommu_skip_te_disable = 1;
4543 }
4544 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4545 
4546 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4547    ISOCH DMAR unit for the Azalia sound device, but not give it any
4548    TLB entries, which causes it to deadlock. Check for that.  We do
4549    this in a function called from init_dmars(), instead of in a PCI
4550    quirk, because we don't want to print the obnoxious "BIOS broken"
4551    message if VT-d is actually disabled.
4552 */
4553 static void __init check_tylersburg_isoch(void)
4554 {
4555 	struct pci_dev *pdev;
4556 	uint32_t vtisochctrl;
4557 
4558 	/* If there's no Azalia in the system anyway, forget it. */
4559 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4560 	if (!pdev)
4561 		return;
4562 
4563 	if (risky_device(pdev)) {
4564 		pci_dev_put(pdev);
4565 		return;
4566 	}
4567 
4568 	pci_dev_put(pdev);
4569 
4570 	/* System Management Registers. Might be hidden, in which case
4571 	   we can't do the sanity check. But that's OK, because the
4572 	   known-broken BIOSes _don't_ actually hide it, so far. */
4573 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4574 	if (!pdev)
4575 		return;
4576 
4577 	if (risky_device(pdev)) {
4578 		pci_dev_put(pdev);
4579 		return;
4580 	}
4581 
4582 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4583 		pci_dev_put(pdev);
4584 		return;
4585 	}
4586 
4587 	pci_dev_put(pdev);
4588 
4589 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4590 	if (vtisochctrl & 1)
4591 		return;
4592 
4593 	/* Drop all bits other than the number of TLB entries */
4594 	vtisochctrl &= 0x1c;
4595 
4596 	/* If we have the recommended number of TLB entries (16), fine. */
4597 	if (vtisochctrl == 0x10)
4598 		return;
4599 
4600 	/* Zero TLB entries? You get to ride the short bus to school. */
4601 	if (!vtisochctrl) {
4602 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4603 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4604 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4605 		     dmi_get_system_info(DMI_BIOS_VERSION),
4606 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4607 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4608 		return;
4609 	}
4610 
4611 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4612 	       vtisochctrl);
4613 }
4614 
4615 /*
4616  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4617  * invalidation completion before posted writes initiated with translated address
4618  * that utilized translations matching the invalidation address range, violating
4619  * the invalidation completion ordering.
4620  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4621  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4622  * under the control of the trusted/privileged host device driver must use this
4623  * quirk.
4624  * Device TLBs are invalidated under the following six conditions:
4625  * 1. Device driver does DMA API unmap IOVA
4626  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4627  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4628  *    exit_mmap() due to crash
4629  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4630  *    VM has to free pages that were unmapped
4631  * 5. Userspace driver unmaps a DMA buffer
4632  * 6. Cache invalidation in vSVA usage (upcoming)
4633  *
4634  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4635  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4636  * invalidate TLB the same way as normal user unmap which will use this quirk.
4637  * The dTLB invalidation after PASID cache flush does not need this quirk.
4638  *
4639  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4640  */
4641 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4642 			       unsigned long address, unsigned long mask,
4643 			       u32 pasid, u16 qdep)
4644 {
4645 	u16 sid;
4646 
4647 	if (likely(!info->dtlb_extra_inval))
4648 		return;
4649 
4650 	sid = PCI_DEVID(info->bus, info->devfn);
4651 	if (pasid == IOMMU_NO_PASID) {
4652 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4653 				   qdep, address, mask);
4654 	} else {
4655 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4656 					 pasid, qdep, address, mask);
4657 	}
4658 }
4659 
/* Status code of an enhanced command response: bits 7:1 of @res. */
#define ecmd_get_status_code(res)	(((res) >> 1) & 0x7f)
4661 
4662 /*
4663  * Function to submit a command to the enhanced command interface. The
4664  * valid enhanced command descriptions are defined in Table 47 of the
4665  * VT-d spec. The VT-d hardware implementation may support some but not
4666  * all commands, which can be determined by checking the Enhanced
4667  * Command Capability Register.
4668  *
4669  * Return values:
4670  *  - 0: Command successful without any error;
4671  *  - Negative: software error value;
4672  *  - Nonzero positive: failure status code defined in Table 48.
4673  */
4674 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4675 {
4676 	unsigned long flags;
4677 	u64 res;
4678 	int ret;
4679 
4680 	if (!cap_ecmds(iommu->cap))
4681 		return -ENODEV;
4682 
4683 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4684 
4685 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4686 	if (res & DMA_ECMD_ECRSP_IP) {
4687 		ret = -EBUSY;
4688 		goto err;
4689 	}
4690 
4691 	/*
4692 	 * Unconditionally write the operand B, because
4693 	 * - There is no side effect if an ecmd doesn't require an
4694 	 *   operand B, but we set the register to some value.
4695 	 * - It's not invoked in any critical path. The extra MMIO
4696 	 *   write doesn't bring any performance concerns.
4697 	 */
4698 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4699 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4700 
4701 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4702 		      !(res & DMA_ECMD_ECRSP_IP), res);
4703 
4704 	if (res & DMA_ECMD_ECRSP_IP) {
4705 		ret = -ETIMEDOUT;
4706 		goto err;
4707 	}
4708 
4709 	ret = ecmd_get_status_code(res);
4710 err:
4711 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4712 
4713 	return ret;
4714 }
4715