xref: /linux/drivers/iommu/intel/iommu.c (revision 1cbfb828e05171ca2dd77b5988d068e6872480fe)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34 
/* The root table and each context table occupy exactly one VT-d page. */
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

/*
 * PCI class / ID predicates. @pdev is an expression of type
 * 'struct pci_dev *'; it is parenthesized in every expansion so that
 * arguments such as "&dev" or "p + 1" bind as the caller intended.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

/* Highest page frame number / byte address representable in @gaw bits. */
#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)

/*
 * We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
 * to match. That way, we can use 'unsigned long' for PFNs with impunity.
 */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57 
static void __init check_tylersburg_isoch(void);
/*
 * When non-zero, iommu_flush_write_buffer() issues a write-buffer flush even
 * if the IOMMU's capability register does not advertise the requirement.
 */
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
/* Set by the "intel_iommu=tboot_noforce" command line option. */
static int intel_iommu_tboot_noforce;
/* Set alongside dmar_disabled by the "intel_iommu=off" command line option. */
static int no_platform_optin;

/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70 
71 /*
72  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73  * if marked present.
74  */
75 static phys_addr_t root_entry_lctp(struct root_entry *re)
76 {
77 	if (!(re->lo & 1))
78 		return 0;
79 
80 	return re->lo & VTD_PAGE_MASK;
81 }
82 
83 /*
84  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85  * if marked present.
86  */
87 static phys_addr_t root_entry_uctp(struct root_entry *re)
88 {
89 	if (!(re->hi & 1))
90 		return 0;
91 
92 	return re->hi & VTD_PAGE_MASK;
93 }
94 
95 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96 {
97 	struct device_domain_info *info =
98 		rb_entry(node, struct device_domain_info, node);
99 	const u16 *rid_lhs = key;
100 
101 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 		return -1;
103 
104 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 		return 1;
106 
107 	return 0;
108 }
109 
110 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111 {
112 	struct device_domain_info *info =
113 		rb_entry(lhs, struct device_domain_info, node);
114 	u16 key = PCI_DEVID(info->bus, info->devfn);
115 
116 	return device_rid_cmp_key(&key, rhs);
117 }
118 
119 /*
120  * Looks up an IOMMU-probed device using its source ID.
121  *
122  * Returns the pointer to the device if there is a match. Otherwise,
123  * returns NULL.
124  *
125  * Note that this helper doesn't guarantee that the device won't be
126  * released by the iommu subsystem after being returned. The caller
127  * should use its own synchronization mechanism to avoid the device
128  * being released during its use if its possibly the case.
129  */
130 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131 {
132 	struct device_domain_info *info = NULL;
133 	struct rb_node *node;
134 	unsigned long flags;
135 
136 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 	if (node)
139 		info = rb_entry(node, struct device_domain_info, node);
140 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141 
142 	return info ? info->dev : NULL;
143 }
144 
145 static int device_rbtree_insert(struct intel_iommu *iommu,
146 				struct device_domain_info *info)
147 {
148 	struct rb_node *curr;
149 	unsigned long flags;
150 
151 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 	if (WARN_ON(curr))
155 		return -EEXIST;
156 
157 	return 0;
158 }
159 
160 static void device_rbtree_remove(struct device_domain_info *info)
161 {
162 	struct intel_iommu *iommu = info->iommu;
163 	unsigned long flags;
164 
165 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 	rb_erase(&info->node, &iommu->device_rbtree);
167 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168 }
169 
/*
 * One element of the dmar_rmrr_units list: a reserved address range
 * [base_address, end_address] together with the devices it applies to.
 */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};
178 
/* One element of the dmar_atsr_units list and the devices in its scope. */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
186 
/*
 * One element of the dmar_satc_units list: the devices in its scope, the
 * IOMMU they belong to and whether ATS is required for them.
 */
struct dmar_satc_unit {
	struct list_head list;		/* list of SATC units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	struct intel_iommu *iommu;	/* the corresponding iommu */
	int devices_cnt;		/* target device count */
	u8 atc_required:1;		/* ATS is required */
};
195 
/* Global lists of the ATSR, RMRR and SATC units declared above. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);
static LIST_HEAD(dmar_satc_units);

/* Iterate @rmrr over every entry on the dmar_rmrr_units list. */
#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)

static void intel_iommu_domain_free(struct iommu_domain *domain);

/*
 * Build-time defaults; both can be overridden with "intel_iommu=" options
 * ("on"/"off" and "sm_on"/"sm_off" respectively).
 */
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Cleared by "intel_iommu=sp_off". */
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int iommu_skip_te_disable;
/* Set by "intel_iommu=igfx_off". */
static int disable_igfx_iommu;

#define IDENTMAP_AZALIA		4

const struct iommu_ops intel_iommu_ops;
static const struct iommu_dirty_ops intel_dirty_ops;
220 
221 static bool translation_pre_enabled(struct intel_iommu *iommu)
222 {
223 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224 }
225 
226 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227 {
228 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229 }
230 
231 static void init_translation_status(struct intel_iommu *iommu)
232 {
233 	u32 gsts;
234 
235 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 	if (gsts & DMA_GSTS_TES)
237 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238 }
239 
240 static int __init intel_iommu_setup(char *str)
241 {
242 	if (!str)
243 		return -EINVAL;
244 
245 	while (*str) {
246 		if (!strncmp(str, "on", 2)) {
247 			dmar_disabled = 0;
248 			pr_info("IOMMU enabled\n");
249 		} else if (!strncmp(str, "off", 3)) {
250 			dmar_disabled = 1;
251 			no_platform_optin = 1;
252 			pr_info("IOMMU disabled\n");
253 		} else if (!strncmp(str, "igfx_off", 8)) {
254 			disable_igfx_iommu = 1;
255 			pr_info("Disable GFX device mapping\n");
256 		} else if (!strncmp(str, "forcedac", 8)) {
257 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 			iommu_dma_forcedac = true;
259 		} else if (!strncmp(str, "strict", 6)) {
260 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 			iommu_set_dma_strict();
262 		} else if (!strncmp(str, "sp_off", 6)) {
263 			pr_info("Disable supported super page\n");
264 			intel_iommu_superpage = 0;
265 		} else if (!strncmp(str, "sm_on", 5)) {
266 			pr_info("Enable scalable mode if hardware supports\n");
267 			intel_iommu_sm = 1;
268 		} else if (!strncmp(str, "sm_off", 6)) {
269 			pr_info("Scalable mode is disallowed\n");
270 			intel_iommu_sm = 0;
271 		} else if (!strncmp(str, "tboot_noforce", 13)) {
272 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 			intel_iommu_tboot_noforce = 1;
274 		} else {
275 			pr_notice("Unknown option - '%s'\n", str);
276 		}
277 
278 		str += strcspn(str, ",");
279 		while (*str == ',')
280 			str++;
281 	}
282 
283 	return 1;
284 }
285 __setup("intel_iommu=", intel_iommu_setup);
286 
287 static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288 {
289 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290 
291 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292 }
293 
294 /*
295  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297  * the returned SAGAW.
298  */
299 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300 {
301 	unsigned long fl_sagaw, sl_sagaw;
302 
303 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 	sl_sagaw = cap_sagaw(iommu->cap);
305 
306 	/* Second level only. */
307 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 		return sl_sagaw;
309 
310 	/* First level only. */
311 	if (!ecap_slts(iommu->ecap))
312 		return fl_sagaw;
313 
314 	return fl_sagaw & sl_sagaw;
315 }
316 
/*
 * Pick the largest AGAW that @iommu supports and that does not exceed the
 * AGAW corresponding to @max_gaw. Returns -1 if no such AGAW exists.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw = width_to_agaw(max_gaw);

	/* Walk downwards until a supported width is hit or we run out. */
	while (agaw >= 0 && !test_bit(agaw, &sagaw))
		agaw--;

	return agaw;
}
330 
331 /*
332  * Calculate max SAGAW for each iommu.
333  */
334 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335 {
336 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337 }
338 
339 /*
340  * calculate agaw for each iommu.
341  * "SAGAW" may be different across iommus, use a default agaw, and
342  * get a supported less agaw for iommus that don't support the default agaw.
343  */
344 int iommu_calculate_agaw(struct intel_iommu *iommu)
345 {
346 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347 }
348 
349 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350 {
351 	return sm_supported(iommu) ?
352 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353 }
354 
355 /* Return the super pagesize bitmap if supported. */
356 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
357 {
358 	unsigned long bitmap = 0;
359 
360 	/*
361 	 * 1-level super page supports page size of 2MiB, 2-level super page
362 	 * supports page size of both 2MiB and 1GiB.
363 	 */
364 	if (domain->iommu_superpage == 1)
365 		bitmap |= SZ_2M;
366 	else if (domain->iommu_superpage == 2)
367 		bitmap |= SZ_2M | SZ_1G;
368 
369 	return bitmap;
370 }
371 
372 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
373 					 u8 devfn, int alloc)
374 {
375 	struct root_entry *root = &iommu->root_entry[bus];
376 	struct context_entry *context;
377 	u64 *entry;
378 
379 	/*
380 	 * Except that the caller requested to allocate a new entry,
381 	 * returning a copied context entry makes no sense.
382 	 */
383 	if (!alloc && context_copied(iommu, bus, devfn))
384 		return NULL;
385 
386 	entry = &root->lo;
387 	if (sm_supported(iommu)) {
388 		if (devfn >= 0x80) {
389 			devfn -= 0x80;
390 			entry = &root->hi;
391 		}
392 		devfn *= 2;
393 	}
394 	if (*entry & 1)
395 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
396 	else {
397 		unsigned long phy_addr;
398 		if (!alloc)
399 			return NULL;
400 
401 		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
402 		if (!context)
403 			return NULL;
404 
405 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
406 		phy_addr = virt_to_phys((void *)context);
407 		*entry = phy_addr | 1;
408 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
409 	}
410 	return &context[devfn];
411 }
412 
413 /**
414  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
415  *				 sub-hierarchy of a candidate PCI-PCI bridge
416  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
417  * @bridge: the candidate PCI-PCI bridge
418  *
419  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
420  */
421 static bool
422 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
423 {
424 	struct pci_dev *pdev, *pbridge;
425 
426 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
427 		return false;
428 
429 	pdev = to_pci_dev(dev);
430 	pbridge = to_pci_dev(bridge);
431 
432 	if (pbridge->subordinate &&
433 	    pbridge->subordinate->number <= pdev->bus->number &&
434 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
435 		return true;
436 
437 	return false;
438 }
439 
440 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
441 {
442 	struct dmar_drhd_unit *drhd;
443 	u32 vtbar;
444 	int rc;
445 
446 	/* We know that this device on this chipset has its own IOMMU.
447 	 * If we find it under a different IOMMU, then the BIOS is lying
448 	 * to us. Hope that the IOMMU for this device is actually
449 	 * disabled, and it needs no translation...
450 	 */
451 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
452 	if (rc) {
453 		/* "can't" happen */
454 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
455 		return false;
456 	}
457 	vtbar &= 0xffff0000;
458 
459 	/* we know that the this iommu should be at offset 0xa000 from vtbar */
460 	drhd = dmar_find_matched_drhd_unit(pdev);
461 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
462 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
463 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
464 		return true;
465 	}
466 
467 	return false;
468 }
469 
470 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
471 {
472 	if (!iommu || iommu->drhd->ignored)
473 		return true;
474 
475 	if (dev_is_pci(dev)) {
476 		struct pci_dev *pdev = to_pci_dev(dev);
477 
478 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
479 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
480 		    quirk_ioat_snb_local_iommu(pdev))
481 			return true;
482 	}
483 
484 	return false;
485 }
486 
/*
 * Find the IOMMU that covers @dev by scanning the device scope of every
 * DRHD unit under the RCU read lock.
 *
 * @dev:   device to look up; NULL yields NULL.
 * @bus:   optional out parameter for the bus number to use for @dev.
 * @devfn: optional out parameter for the devfn to use for @dev.
 *
 * @bus and @devfn are only written when both are non-NULL. For a direct
 * scope-table match they come from the scope entry; for VFs, devices
 * behind a listed bridge and "include all" units they come from the PCI
 * device itself.
 *
 * Returns the matching IOMMU, or NULL if none matched or if
 * iommu_is_dummy() rejects the match.
 */
static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct pci_dev *pdev = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	u16 segment = 0;
	int i;

	if (!dev)
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = pci_real_dma_dev(to_pci_dev(dev));

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		/* Non-PCI device: match on its ACPI companion instead. */
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_iommu(iommu, drhd) {
		/* A PCI device can only belong to a unit in its own segment. */
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				if (bus && devfn) {
					*bus = drhd->devices[i].bus;
					*devfn = drhd->devices[i].devfn;
				}
				goto out;
			}

			/* @dev sits below a bridge that is listed in the scope. */
			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		/* No explicit match: a catch-all unit claims any PCI device. */
		if (pdev && drhd->include_all) {
got_pdev:
			if (bus && devfn) {
				*bus = pdev->bus->number;
				*devfn = pdev->devfn;
			}
			goto out;
		}
	}
	/* Loop ran to completion without a match. */
	iommu = NULL;
out:
	if (iommu_is_dummy(iommu, dev))
		iommu = NULL;

	rcu_read_unlock();

	return iommu;
}
556 
557 static void domain_flush_cache(struct dmar_domain *domain,
558 			       void *addr, int size)
559 {
560 	if (!domain->iommu_coherency)
561 		clflush_cache_range(addr, size);
562 }
563 
564 static void free_context_table(struct intel_iommu *iommu)
565 {
566 	struct context_entry *context;
567 	int i;
568 
569 	if (!iommu->root_entry)
570 		return;
571 
572 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
573 		context = iommu_context_addr(iommu, i, 0, 0);
574 		if (context)
575 			iommu_free_page(context);
576 
577 		if (!sm_supported(iommu))
578 			continue;
579 
580 		context = iommu_context_addr(iommu, i, 0x80, 0);
581 		if (context)
582 			iommu_free_page(context);
583 	}
584 
585 	iommu_free_page(iommu->root_entry);
586 	iommu->root_entry = NULL;
587 }
588 
589 #ifdef CONFIG_DMAR_DEBUG
590 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
591 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
592 {
593 	struct dma_pte *pte;
594 	int offset;
595 
596 	while (1) {
597 		offset = pfn_level_offset(pfn, level);
598 		pte = &parent[offset];
599 
600 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
601 
602 		if (!dma_pte_present(pte)) {
603 			pr_info("page table not present at level %d\n", level - 1);
604 			break;
605 		}
606 
607 		if (level == 1 || dma_pte_superpage(pte))
608 			break;
609 
610 		parent = phys_to_virt(dma_pte_addr(pte));
611 		level--;
612 	}
613 }
614 
615 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
616 			  unsigned long long addr, u32 pasid)
617 {
618 	struct pasid_dir_entry *dir, *pde;
619 	struct pasid_entry *entries, *pte;
620 	struct context_entry *ctx_entry;
621 	struct root_entry *rt_entry;
622 	int i, dir_index, index, level;
623 	u8 devfn = source_id & 0xff;
624 	u8 bus = source_id >> 8;
625 	struct dma_pte *pgtable;
626 
627 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
628 
629 	/* root entry dump */
630 	if (!iommu->root_entry) {
631 		pr_info("root table is not present\n");
632 		return;
633 	}
634 	rt_entry = &iommu->root_entry[bus];
635 
636 	if (sm_supported(iommu))
637 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
638 			rt_entry->hi, rt_entry->lo);
639 	else
640 		pr_info("root entry: 0x%016llx", rt_entry->lo);
641 
642 	/* context entry dump */
643 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
644 	if (!ctx_entry) {
645 		pr_info("context table is not present\n");
646 		return;
647 	}
648 
649 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
650 		ctx_entry->hi, ctx_entry->lo);
651 
652 	/* legacy mode does not require PASID entries */
653 	if (!sm_supported(iommu)) {
654 		if (!context_present(ctx_entry)) {
655 			pr_info("legacy mode page table is not present\n");
656 			return;
657 		}
658 		level = agaw_to_level(ctx_entry->hi & 7);
659 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
660 		goto pgtable_walk;
661 	}
662 
663 	if (!context_present(ctx_entry)) {
664 		pr_info("pasid directory table is not present\n");
665 		return;
666 	}
667 
668 	/* get the pointer to pasid directory entry */
669 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
670 
671 	/* For request-without-pasid, get the pasid from context entry */
672 	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
673 		pasid = IOMMU_NO_PASID;
674 
675 	dir_index = pasid >> PASID_PDE_SHIFT;
676 	pde = &dir[dir_index];
677 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
678 
679 	/* get the pointer to the pasid table entry */
680 	entries = get_pasid_table_from_pde(pde);
681 	if (!entries) {
682 		pr_info("pasid table is not present\n");
683 		return;
684 	}
685 	index = pasid & PASID_PTE_MASK;
686 	pte = &entries[index];
687 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
688 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
689 
690 	if (!pasid_pte_is_present(pte)) {
691 		pr_info("scalable mode page table is not present\n");
692 		return;
693 	}
694 
695 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
696 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
697 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
698 	} else {
699 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
700 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
701 	}
702 
703 pgtable_walk:
704 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
705 }
706 #endif
707 
/*
 * Walk @domain's page table towards @pfn and return the PTE reached.
 *
 * @target_level is in/out:
 *  - non-zero on entry: descend to that level, allocating (with @gfp) and
 *    installing any missing intermediate tables on the way;
 *  - zero on entry: pure lookup - stop at the first superpage or
 *    non-present PTE and store the level reached in *@target_level.
 *
 * Returns NULL if @pfn is beyond what the domain can address or if a page
 * table allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level,
				      gfp_t gfp)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		/* Lookup mode: nothing further to descend into. */
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval, tmp;

			tmp_page = iommu_alloc_page_node(domain->nid, gfp);

			if (!tmp_page)
				return NULL;

			/* Flush the new table before it can be linked in. */
			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (domain->use_first_level)
				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

			/* Install only if the slot is still empty (expected value 0). */
			tmp = 0ULL;
			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				iommu_free_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
764 
765 /* return address's pte at specific level */
766 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
767 					 unsigned long pfn,
768 					 int level, int *large_page)
769 {
770 	struct dma_pte *parent, *pte;
771 	int total = agaw_to_level(domain->agaw);
772 	int offset;
773 
774 	parent = domain->pgd;
775 	while (level <= total) {
776 		offset = pfn_level_offset(pfn, total);
777 		pte = &parent[offset];
778 		if (level == total)
779 			return pte;
780 
781 		if (!dma_pte_present(pte)) {
782 			*large_page = total;
783 			break;
784 		}
785 
786 		if (dma_pte_superpage(pte)) {
787 			*large_page = total;
788 			return pte;
789 		}
790 
791 		parent = phys_to_virt(dma_pte_addr(pte));
792 		total--;
793 	}
794 	return NULL;
795 }
796 
797 /* clear last level pte, a tlb flush should be followed */
798 static void dma_pte_clear_range(struct dmar_domain *domain,
799 				unsigned long start_pfn,
800 				unsigned long last_pfn)
801 {
802 	unsigned int large_page;
803 	struct dma_pte *first_pte, *pte;
804 
805 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
806 	    WARN_ON(start_pfn > last_pfn))
807 		return;
808 
809 	/* we don't need lock here; nobody else touches the iova range */
810 	do {
811 		large_page = 1;
812 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
813 		if (!pte) {
814 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
815 			continue;
816 		}
817 		do {
818 			dma_clear_pte(pte);
819 			start_pfn += lvl_to_nr_pages(large_page);
820 			pte++;
821 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
822 
823 		domain_flush_cache(domain, first_pte,
824 				   (void *)pte - (void *)first_pte);
825 
826 	} while (start_pfn && start_pfn <= last_pfn);
827 }
828 
/*
 * Recursively free page-table pages below @level that are completely
 * covered by [@start_pfn, @last_pfn].
 *
 * @pte is the table at @level and @pfn the first pfn that table maps.
 * Only tables referenced from a level strictly below @retain_level are
 * freed; superpage and non-present entries are skipped.
 */
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		/* Descend first so that lower tables are released bottom-up. */
		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			iommu_free_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}
867 
868 /*
869  * clear last level (leaf) ptes and free page table pages below the
870  * level we wish to keep intact.
871  */
872 static void dma_pte_free_pagetable(struct dmar_domain *domain,
873 				   unsigned long start_pfn,
874 				   unsigned long last_pfn,
875 				   int retain_level)
876 {
877 	dma_pte_clear_range(domain, start_pfn, last_pfn);
878 
879 	/* We don't need lock here; nobody else touches the iova range */
880 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
881 			   domain->pgd, 0, start_pfn, last_pfn);
882 
883 	/* free pgd */
884 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
885 		iommu_free_page(domain->pgd);
886 		domain->pgd = NULL;
887 	}
888 }
889 
890 /* When a page at a given level is being unlinked from its parent, we don't
891    need to *modify* it at all. All we need to do is make a list of all the
892    pages which can be freed just as soon as we've flushed the IOTLB and we
893    know the hardware page-walk will no longer touch them.
894    The 'pte' argument is the *parent* PTE, pointing to the page that is to
895    be freed. */
896 static void dma_pte_list_pagetables(struct dmar_domain *domain,
897 				    int level, struct dma_pte *pte,
898 				    struct list_head *freelist)
899 {
900 	struct page *pg;
901 
902 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
903 	list_add_tail(&pg->lru, freelist);
904 
905 	if (level == 1)
906 		return;
907 
908 	pte = page_address(pg);
909 	do {
910 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 		pte++;
913 	} while (!first_pte_in_page(pte));
914 }
915 
/*
 * Clear the PTEs at @level that map [@start_pfn, @last_pfn].
 *
 * @pte is the table at @level and @pfn the first pfn that table maps.
 * Entries fully covered by the range are cleared and the tables beneath
 * them are queued on @freelist rather than freed; partially covered
 * entries are handled by recursing one level down. The cleared span of
 * this table is flushed once at the end.
 */
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
{
	/* First and last PTE cleared in this table, for the final flush. */
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn = pfn & level_mask(level);

		if (!dma_pte_present(pte))
			goto next;

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			dma_pte_clear_level(domain, level - 1,
					    phys_to_virt(dma_pte_addr(pte)),
					    level_pfn, start_pfn, last_pfn,
					    freelist);
		}
next:
		pfn = level_pfn + level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);
}
959 
960 /* We can't just free the pages because the IOMMU may still be walking
961    the page tables, and may have cached the intermediate levels. The
962    pages can only be freed after the IOTLB flush has been done. */
963 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 			 unsigned long last_pfn, struct list_head *freelist)
965 {
966 	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
967 	    WARN_ON(start_pfn > last_pfn))
968 		return;
969 
970 	/* we don't need lock here; nobody else touches the iova range */
971 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
972 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
973 
974 	/* free pgd */
975 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
976 		struct page *pgd_page = virt_to_page(domain->pgd);
977 		list_add_tail(&pgd_page->lru, freelist);
978 		domain->pgd = NULL;
979 	}
980 }
981 
982 /* iommu handling */
983 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984 {
985 	struct root_entry *root;
986 
987 	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
988 	if (!root) {
989 		pr_err("Allocating root entry for %s failed\n",
990 			iommu->name);
991 		return -ENOMEM;
992 	}
993 
994 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
995 	iommu->root_entry = root;
996 
997 	return 0;
998 }
999 
1000 static void iommu_set_root_entry(struct intel_iommu *iommu)
1001 {
1002 	u64 addr;
1003 	u32 sts;
1004 	unsigned long flag;
1005 
1006 	addr = virt_to_phys(iommu->root_entry);
1007 	if (sm_supported(iommu))
1008 		addr |= DMA_RTADDR_SMT;
1009 
1010 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012 
1013 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014 
1015 	/* Make sure hardware complete it */
1016 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 		      readl, (sts & DMA_GSTS_RTPS), sts);
1018 
1019 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020 
1021 	/*
1022 	 * Hardware invalidates all DMA remapping hardware translation
1023 	 * caches as part of SRTP flow.
1024 	 */
1025 	if (cap_esrtps(iommu->cap))
1026 		return;
1027 
1028 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 	if (sm_supported(iommu))
1030 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032 }
1033 
1034 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035 {
1036 	u32 val;
1037 	unsigned long flag;
1038 
1039 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 		return;
1041 
1042 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044 
1045 	/* Make sure hardware complete it */
1046 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1048 
1049 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050 }
1051 
1052 /* return value determine if we need a write buffer flush */
1053 static void __iommu_flush_context(struct intel_iommu *iommu,
1054 				  u16 did, u16 source_id, u8 function_mask,
1055 				  u64 type)
1056 {
1057 	u64 val = 0;
1058 	unsigned long flag;
1059 
1060 	switch (type) {
1061 	case DMA_CCMD_GLOBAL_INVL:
1062 		val = DMA_CCMD_GLOBAL_INVL;
1063 		break;
1064 	case DMA_CCMD_DOMAIN_INVL:
1065 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 		break;
1067 	case DMA_CCMD_DEVICE_INVL:
1068 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 		break;
1071 	default:
1072 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 			iommu->name, type);
1074 		return;
1075 	}
1076 	val |= DMA_CCMD_ICC;
1077 
1078 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080 
1081 	/* Make sure hardware complete it */
1082 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084 
1085 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086 }
1087 
1088 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 			 unsigned int size_order, u64 type)
1090 {
1091 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 	u64 val = 0, val_iva = 0;
1093 	unsigned long flag;
1094 
1095 	switch (type) {
1096 	case DMA_TLB_GLOBAL_FLUSH:
1097 		/* global flush doesn't need set IVA_REG */
1098 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 		break;
1100 	case DMA_TLB_DSI_FLUSH:
1101 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 		break;
1103 	case DMA_TLB_PSI_FLUSH:
1104 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 		/* IH bit is passed in as part of address */
1106 		val_iva = size_order | addr;
1107 		break;
1108 	default:
1109 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 			iommu->name, type);
1111 		return;
1112 	}
1113 
1114 	if (cap_write_drain(iommu->cap))
1115 		val |= DMA_TLB_WRITE_DRAIN;
1116 
1117 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 	/* Note: Only uses first TLB reg currently */
1119 	if (val_iva)
1120 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122 
1123 	/* Make sure hardware complete it */
1124 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126 
1127 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128 
1129 	/* check IOTLB invalidation granularity */
1130 	if (DMA_TLB_IAIG(val) == 0)
1131 		pr_err("Flush IOTLB failed\n");
1132 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 			(unsigned long long)DMA_TLB_IIRG(type),
1135 			(unsigned long long)DMA_TLB_IAIG(val));
1136 }
1137 
1138 static struct device_domain_info *
1139 domain_lookup_dev_info(struct dmar_domain *domain,
1140 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1141 {
1142 	struct device_domain_info *info;
1143 	unsigned long flags;
1144 
1145 	spin_lock_irqsave(&domain->lock, flags);
1146 	list_for_each_entry(info, &domain->devices, link) {
1147 		if (info->iommu == iommu && info->bus == bus &&
1148 		    info->devfn == devfn) {
1149 			spin_unlock_irqrestore(&domain->lock, flags);
1150 			return info;
1151 		}
1152 	}
1153 	spin_unlock_irqrestore(&domain->lock, flags);
1154 
1155 	return NULL;
1156 }
1157 
1158 /*
1159  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161  * check because it applies only to the built-in QAT devices and it doesn't
1162  * grant additional privileges.
1163  */
1164 #define BUGGY_QAT_DEVID_MASK 0x4940
1165 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166 {
1167 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 		return false;
1169 
1170 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 		return false;
1172 
1173 	return true;
1174 }
1175 
1176 static void iommu_enable_pci_caps(struct device_domain_info *info)
1177 {
1178 	struct pci_dev *pdev;
1179 
1180 	if (!dev_is_pci(info->dev))
1181 		return;
1182 
1183 	pdev = to_pci_dev(info->dev);
1184 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1185 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1186 		info->ats_enabled = 1;
1187 }
1188 
1189 static void iommu_disable_pci_caps(struct device_domain_info *info)
1190 {
1191 	struct pci_dev *pdev;
1192 
1193 	if (!dev_is_pci(info->dev))
1194 		return;
1195 
1196 	pdev = to_pci_dev(info->dev);
1197 
1198 	if (info->ats_enabled) {
1199 		pci_disable_ats(pdev);
1200 		info->ats_enabled = 0;
1201 	}
1202 }
1203 
/* Flush every cache tag of the dmar_domain that wraps @domain. */
static void intel_flush_iotlb_all(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	cache_tag_flush_all(dmar_domain);
}
1208 
1209 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1210 {
1211 	u32 pmen;
1212 	unsigned long flags;
1213 
1214 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1215 		return;
1216 
1217 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1219 	pmen &= ~DMA_PMEN_EPM;
1220 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1221 
1222 	/* wait for the protected region status bit to clear */
1223 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1224 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1225 
1226 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227 }
1228 
1229 static void iommu_enable_translation(struct intel_iommu *iommu)
1230 {
1231 	u32 sts;
1232 	unsigned long flags;
1233 
1234 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1235 	iommu->gcmd |= DMA_GCMD_TE;
1236 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237 
1238 	/* Make sure hardware complete it */
1239 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240 		      readl, (sts & DMA_GSTS_TES), sts);
1241 
1242 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1243 }
1244 
1245 static void iommu_disable_translation(struct intel_iommu *iommu)
1246 {
1247 	u32 sts;
1248 	unsigned long flag;
1249 
1250 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1251 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1252 		return;
1253 
1254 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1255 	iommu->gcmd &= ~DMA_GCMD_TE;
1256 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1257 
1258 	/* Make sure hardware complete it */
1259 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1260 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1261 
1262 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263 }
1264 
1265 static int iommu_init_domains(struct intel_iommu *iommu)
1266 {
1267 	u32 ndomains;
1268 
1269 	ndomains = cap_ndoms(iommu->cap);
1270 	pr_debug("%s: Number of Domains supported <%d>\n",
1271 		 iommu->name, ndomains);
1272 
1273 	spin_lock_init(&iommu->lock);
1274 
1275 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1276 	if (!iommu->domain_ids)
1277 		return -ENOMEM;
1278 
1279 	/*
1280 	 * If Caching mode is set, then invalid translations are tagged
1281 	 * with domain-id 0, hence we need to pre-allocate it. We also
1282 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1283 	 * make sure it is not used for a real domain.
1284 	 */
1285 	set_bit(0, iommu->domain_ids);
1286 
1287 	/*
1288 	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1289 	 * entry for first-level or pass-through translation modes should
1290 	 * be programmed with a domain id different from those used for
1291 	 * second-level or nested translation. We reserve a domain id for
1292 	 * this purpose. This domain id is also used for identity domain
1293 	 * in legacy mode.
1294 	 */
1295 	set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1296 
1297 	return 0;
1298 }
1299 
1300 static void disable_dmar_iommu(struct intel_iommu *iommu)
1301 {
1302 	if (!iommu->domain_ids)
1303 		return;
1304 
1305 	/*
1306 	 * All iommu domains must have been detached from the devices,
1307 	 * hence there should be no domain IDs in use.
1308 	 */
1309 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1310 		    > NUM_RESERVED_DID))
1311 		return;
1312 
1313 	if (iommu->gcmd & DMA_GCMD_TE)
1314 		iommu_disable_translation(iommu);
1315 }
1316 
1317 static void free_dmar_iommu(struct intel_iommu *iommu)
1318 {
1319 	if (iommu->domain_ids) {
1320 		bitmap_free(iommu->domain_ids);
1321 		iommu->domain_ids = NULL;
1322 	}
1323 
1324 	if (iommu->copied_tables) {
1325 		bitmap_free(iommu->copied_tables);
1326 		iommu->copied_tables = NULL;
1327 	}
1328 
1329 	/* free context mapping */
1330 	free_context_table(iommu);
1331 
1332 	if (ecap_prs(iommu->ecap))
1333 		intel_iommu_finish_prq(iommu);
1334 }
1335 
1336 /*
1337  * Check and return whether first level is used by default for
1338  * DMA translation.
1339  */
1340 static bool first_level_by_default(struct intel_iommu *iommu)
1341 {
1342 	/* Only SL is available in legacy mode */
1343 	if (!sm_supported(iommu))
1344 		return false;
1345 
1346 	/* Only level (either FL or SL) is available, just use it */
1347 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1348 		return ecap_flts(iommu->ecap);
1349 
1350 	return true;
1351 }
1352 
1353 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1354 {
1355 	struct iommu_domain_info *info, *curr;
1356 	unsigned long ndomains;
1357 	int num, ret = -ENOSPC;
1358 
1359 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1360 		return 0;
1361 
1362 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1363 	if (!info)
1364 		return -ENOMEM;
1365 
1366 	spin_lock(&iommu->lock);
1367 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1368 	if (curr) {
1369 		curr->refcnt++;
1370 		spin_unlock(&iommu->lock);
1371 		kfree(info);
1372 		return 0;
1373 	}
1374 
1375 	ndomains = cap_ndoms(iommu->cap);
1376 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1377 	if (num >= ndomains) {
1378 		pr_err("%s: No free domain ids\n", iommu->name);
1379 		goto err_unlock;
1380 	}
1381 
1382 	set_bit(num, iommu->domain_ids);
1383 	info->refcnt	= 1;
1384 	info->did	= num;
1385 	info->iommu	= iommu;
1386 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1387 			  NULL, info, GFP_ATOMIC);
1388 	if (curr) {
1389 		ret = xa_err(curr) ? : -EBUSY;
1390 		goto err_clear;
1391 	}
1392 
1393 	spin_unlock(&iommu->lock);
1394 	return 0;
1395 
1396 err_clear:
1397 	clear_bit(info->did, iommu->domain_ids);
1398 err_unlock:
1399 	spin_unlock(&iommu->lock);
1400 	kfree(info);
1401 	return ret;
1402 }
1403 
1404 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1405 {
1406 	struct iommu_domain_info *info;
1407 
1408 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1409 		return;
1410 
1411 	spin_lock(&iommu->lock);
1412 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1413 	if (--info->refcnt == 0) {
1414 		clear_bit(info->did, iommu->domain_ids);
1415 		xa_erase(&domain->iommu_array, iommu->seq_id);
1416 		domain->nid = NUMA_NO_NODE;
1417 		kfree(info);
1418 	}
1419 	spin_unlock(&iommu->lock);
1420 }
1421 
1422 static void domain_exit(struct dmar_domain *domain)
1423 {
1424 	if (domain->pgd) {
1425 		LIST_HEAD(freelist);
1426 
1427 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1428 		iommu_put_pages_list(&freelist);
1429 	}
1430 
1431 	if (WARN_ON(!list_empty(&domain->devices)))
1432 		return;
1433 
1434 	kfree(domain->qi_batch);
1435 	kfree(domain);
1436 }
1437 
1438 /*
1439  * For kdump cases, old valid entries may be cached due to the
1440  * in-flight DMA and copied pgtable, but there is no unmapping
1441  * behaviour for them, thus we need an explicit cache flush for
1442  * the newly-mapped device. For kdump, at this point, the device
1443  * is supposed to finish reset at its driver probe stage, so no
1444  * in-flight DMA will exist, and we don't need to worry anymore
1445  * hereafter.
1446  */
1447 static void copied_context_tear_down(struct intel_iommu *iommu,
1448 				     struct context_entry *context,
1449 				     u8 bus, u8 devfn)
1450 {
1451 	u16 did_old;
1452 
1453 	if (!context_copied(iommu, bus, devfn))
1454 		return;
1455 
1456 	assert_spin_locked(&iommu->lock);
1457 
1458 	did_old = context_domain_id(context);
1459 	context_clear_entry(context);
1460 
1461 	if (did_old < cap_ndoms(iommu->cap)) {
1462 		iommu->flush.flush_context(iommu, did_old,
1463 					   PCI_DEVID(bus, devfn),
1464 					   DMA_CCMD_MASK_NOBIT,
1465 					   DMA_CCMD_DEVICE_INVL);
1466 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1467 					 DMA_TLB_DSI_FLUSH);
1468 	}
1469 
1470 	clear_context_copied(iommu, bus, devfn);
1471 }
1472 
1473 /*
1474  * It's a non-present to present mapping. If hardware doesn't cache
1475  * non-present entry we only need to flush the write-buffer. If the
1476  * _does_ cache non-present entries, then it does so in the special
1477  * domain #0, which we have to flush:
1478  */
1479 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1480 					u8 bus, u8 devfn)
1481 {
1482 	if (cap_caching_mode(iommu->cap)) {
1483 		iommu->flush.flush_context(iommu, 0,
1484 					   PCI_DEVID(bus, devfn),
1485 					   DMA_CCMD_MASK_NOBIT,
1486 					   DMA_CCMD_DEVICE_INVL);
1487 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1488 	} else {
1489 		iommu_flush_write_buffer(iommu);
1490 	}
1491 }
1492 
1493 static int domain_context_mapping_one(struct dmar_domain *domain,
1494 				      struct intel_iommu *iommu,
1495 				      u8 bus, u8 devfn)
1496 {
1497 	struct device_domain_info *info =
1498 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1499 	u16 did = domain_id_iommu(domain, iommu);
1500 	int translation = CONTEXT_TT_MULTI_LEVEL;
1501 	struct dma_pte *pgd = domain->pgd;
1502 	struct context_entry *context;
1503 	int ret;
1504 
1505 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1506 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1507 
1508 	spin_lock(&iommu->lock);
1509 	ret = -ENOMEM;
1510 	context = iommu_context_addr(iommu, bus, devfn, 1);
1511 	if (!context)
1512 		goto out_unlock;
1513 
1514 	ret = 0;
1515 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1516 		goto out_unlock;
1517 
1518 	copied_context_tear_down(iommu, context, bus, devfn);
1519 	context_clear_entry(context);
1520 	context_set_domain_id(context, did);
1521 
1522 	if (info && info->ats_supported)
1523 		translation = CONTEXT_TT_DEV_IOTLB;
1524 	else
1525 		translation = CONTEXT_TT_MULTI_LEVEL;
1526 
1527 	context_set_address_root(context, virt_to_phys(pgd));
1528 	context_set_address_width(context, domain->agaw);
1529 	context_set_translation_type(context, translation);
1530 	context_set_fault_enable(context);
1531 	context_set_present(context);
1532 	if (!ecap_coherent(iommu->ecap))
1533 		clflush_cache_range(context, sizeof(*context));
1534 	context_present_cache_flush(iommu, did, bus, devfn);
1535 	ret = 0;
1536 
1537 out_unlock:
1538 	spin_unlock(&iommu->lock);
1539 
1540 	return ret;
1541 }
1542 
1543 static int domain_context_mapping_cb(struct pci_dev *pdev,
1544 				     u16 alias, void *opaque)
1545 {
1546 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1547 	struct intel_iommu *iommu = info->iommu;
1548 	struct dmar_domain *domain = opaque;
1549 
1550 	return domain_context_mapping_one(domain, iommu,
1551 					  PCI_BUS_NUM(alias), alias & 0xff);
1552 }
1553 
1554 static int
1555 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1556 {
1557 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1558 	struct intel_iommu *iommu = info->iommu;
1559 	u8 bus = info->bus, devfn = info->devfn;
1560 
1561 	if (!dev_is_pci(dev))
1562 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1563 
1564 	return pci_for_each_dma_alias(to_pci_dev(dev),
1565 				      domain_context_mapping_cb, domain);
1566 }
1567 
1568 /* Return largest possible superpage level for a given mapping */
1569 static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1570 				   unsigned long phy_pfn, unsigned long pages)
1571 {
1572 	int support, level = 1;
1573 	unsigned long pfnmerge;
1574 
1575 	support = domain->iommu_superpage;
1576 
1577 	/* To use a large page, the virtual *and* physical addresses
1578 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1579 	   of them will mean we have to use smaller pages. So just
1580 	   merge them and check both at once. */
1581 	pfnmerge = iov_pfn | phy_pfn;
1582 
1583 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1584 		pages >>= VTD_STRIDE_SHIFT;
1585 		if (!pages)
1586 			break;
1587 		pfnmerge >>= VTD_STRIDE_SHIFT;
1588 		level++;
1589 		support--;
1590 	}
1591 	return level;
1592 }
1593 
1594 /*
1595  * Ensure that old small page tables are removed to make room for superpage(s).
1596  * We're going to add new large pages, so make sure we don't remove their parent
1597  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1598  */
1599 static void switch_to_super_page(struct dmar_domain *domain,
1600 				 unsigned long start_pfn,
1601 				 unsigned long end_pfn, int level)
1602 {
1603 	unsigned long lvl_pages = lvl_to_nr_pages(level);
1604 	struct dma_pte *pte = NULL;
1605 
1606 	while (start_pfn <= end_pfn) {
1607 		if (!pte)
1608 			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1609 					     GFP_ATOMIC);
1610 
1611 		if (dma_pte_present(pte)) {
1612 			dma_pte_free_pagetable(domain, start_pfn,
1613 					       start_pfn + lvl_pages - 1,
1614 					       level + 1);
1615 
1616 			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1617 					      end_pfn << VTD_PAGE_SHIFT, 0);
1618 		}
1619 
1620 		pte++;
1621 		start_pfn += lvl_pages;
1622 		if (first_pte_in_page(pte))
1623 			pte = NULL;
1624 	}
1625 }
1626 
/*
 * Map @nr_pages pages starting at IOVA page frame @iov_pfn to physical
 * page frame @phys_pfn in @domain, using superpages where
 * hardware_largepage_caps() allows it.
 *
 * @prot is a mask of DMA_PTE_READ, DMA_PTE_WRITE and DMA_PTE_SNP; at
 * least one of READ/WRITE must be set.  @gfp is passed to
 * pfn_to_dma_pte() for page-table lookups.
 *
 * Returns 0 on success, -EINVAL for an unsupported last pfn, a @prot
 * without READ or WRITE, or a read-only mapping on a nested parent
 * domain, and -ENOMEM when pfn_to_dma_pte() returns no PTE.
 */
static int
__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
		 gfp_t gfp)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;
	phys_addr_t pteval;
	u64 attr;

	/* The last page of the range must be addressable by this domain */
	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
		return -EINVAL;

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	/* Build the attribute bits shared by every PTE of this mapping */
	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
	attr |= DMA_FL_PTE_PRESENT;
	if (domain->use_first_level) {
		attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
		if (prot & DMA_PTE_WRITE)
			attr |= DMA_FL_PTE_DIRTY;
	}

	domain->has_mappings = true;

	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;

	while (nr_pages > 0) {
		uint64_t tmp;

		/*
		 * No current PTE: either the first iteration or the
		 * previous one reset it.  Pick the page size for what is
		 * left and locate the PTE for iov_pfn.
		 */
		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
					phys_pfn, nr_pages);

			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
					     gfp);
			if (!pte)
				return -ENOMEM;
			first_pte = pte;

			lvl_pages = lvl_to_nr_pages(largepage_lvl);

			/* It is a large page */
			if (largepage_lvl > 1) {
				unsigned long end_pfn;
				unsigned long pages_to_remove;

				pteval |= DMA_PTE_LARGE_PAGE;
				/*
				 * Clear out old small-page tables, but only
				 * up to the end of the mapping or of the
				 * current page-table page, whichever is
				 * first.
				 */
				pages_to_remove = min_t(unsigned long, nr_pages,
							nr_pte_to_next_page(pte) * lvl_pages);
				end_pfn = iov_pfn + pages_to_remove - 1;
				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		/* Install the PTE only if the slot is currently zero */
		tmp = 0ULL;
		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
			/* Limit the number of full mapping dumps */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		/* Advance by however many 4KiB pages this PTE covers */
		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;

		/* If the next PTE would be the first in a new page, then we
		 * need to flush the cache on the entries we've just written.
		 * And then we'll need to recalculate 'pte', so clear it and
		 * let it get set again in the if (!pte) block above.
		 *
		 * If we're done (!nr_pages) we need to flush the cache too.
		 *
		 * Also if we've been setting superpages, we may need to
		 * recalculate 'pte' and switch back to smaller pages for the
		 * end of the mapping, if the trailing size is not enough to
		 * use another superpage (i.e. nr_pages < lvl_pages).
		 */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}
	}

	return 0;
}
1734 
1735 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1736 {
1737 	struct intel_iommu *iommu = info->iommu;
1738 	struct context_entry *context;
1739 	u16 did;
1740 
1741 	spin_lock(&iommu->lock);
1742 	context = iommu_context_addr(iommu, bus, devfn, 0);
1743 	if (!context) {
1744 		spin_unlock(&iommu->lock);
1745 		return;
1746 	}
1747 
1748 	did = context_domain_id(context);
1749 	context_clear_entry(context);
1750 	__iommu_flush_cache(iommu, context, sizeof(*context));
1751 	spin_unlock(&iommu->lock);
1752 	intel_context_flush_present(info, context, did, true);
1753 }
1754 
1755 int __domain_setup_first_level(struct intel_iommu *iommu,
1756 			       struct device *dev, ioasid_t pasid,
1757 			       u16 did, pgd_t *pgd, int flags,
1758 			       struct iommu_domain *old)
1759 {
1760 	if (!old)
1761 		return intel_pasid_setup_first_level(iommu, dev, pgd,
1762 						     pasid, did, flags);
1763 	return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1764 					       iommu_domain_did(old, iommu),
1765 					       flags);
1766 }
1767 
1768 static int domain_setup_second_level(struct intel_iommu *iommu,
1769 				     struct dmar_domain *domain,
1770 				     struct device *dev, ioasid_t pasid,
1771 				     struct iommu_domain *old)
1772 {
1773 	if (!old)
1774 		return intel_pasid_setup_second_level(iommu, domain,
1775 						      dev, pasid);
1776 	return intel_pasid_replace_second_level(iommu, domain, dev,
1777 						iommu_domain_did(old, iommu),
1778 						pasid);
1779 }
1780 
1781 static int domain_setup_passthrough(struct intel_iommu *iommu,
1782 				    struct device *dev, ioasid_t pasid,
1783 				    struct iommu_domain *old)
1784 {
1785 	if (!old)
1786 		return intel_pasid_setup_pass_through(iommu, dev, pasid);
1787 	return intel_pasid_replace_pass_through(iommu, dev,
1788 						iommu_domain_did(old, iommu),
1789 						pasid);
1790 }
1791 
1792 static int domain_setup_first_level(struct intel_iommu *iommu,
1793 				    struct dmar_domain *domain,
1794 				    struct device *dev,
1795 				    u32 pasid, struct iommu_domain *old)
1796 {
1797 	struct dma_pte *pgd = domain->pgd;
1798 	int level, flags = 0;
1799 
1800 	level = agaw_to_level(domain->agaw);
1801 	if (level != 4 && level != 5)
1802 		return -EINVAL;
1803 
1804 	if (level == 5)
1805 		flags |= PASID_FLAG_FL5LP;
1806 
1807 	if (domain->force_snooping)
1808 		flags |= PASID_FLAG_PAGE_SNOOP;
1809 
1810 	return __domain_setup_first_level(iommu, dev, pasid,
1811 					  domain_id_iommu(domain, iommu),
1812 					  (pgd_t *)pgd, flags, old);
1813 }
1814 
1815 static int dmar_domain_attach_device(struct dmar_domain *domain,
1816 				     struct device *dev)
1817 {
1818 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1819 	struct intel_iommu *iommu = info->iommu;
1820 	unsigned long flags;
1821 	int ret;
1822 
1823 	ret = domain_attach_iommu(domain, iommu);
1824 	if (ret)
1825 		return ret;
1826 
1827 	info->domain = domain;
1828 	spin_lock_irqsave(&domain->lock, flags);
1829 	list_add(&info->link, &domain->devices);
1830 	spin_unlock_irqrestore(&domain->lock, flags);
1831 
1832 	if (dev_is_real_dma_subdevice(dev))
1833 		return 0;
1834 
1835 	if (!sm_supported(iommu))
1836 		ret = domain_context_mapping(domain, dev);
1837 	else if (domain->use_first_level)
1838 		ret = domain_setup_first_level(iommu, domain, dev,
1839 					       IOMMU_NO_PASID, NULL);
1840 	else
1841 		ret = domain_setup_second_level(iommu, domain, dev,
1842 						IOMMU_NO_PASID, NULL);
1843 
1844 	if (ret)
1845 		goto out_block_translation;
1846 
1847 	iommu_enable_pci_caps(info);
1848 
1849 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1850 	if (ret)
1851 		goto out_block_translation;
1852 
1853 	return 0;
1854 
1855 out_block_translation:
1856 	device_block_translation(dev);
1857 	return ret;
1858 }
1859 
1860 /**
1861  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1862  * is relaxable (ie. is allowed to be not enforced under some conditions)
1863  * @dev: device handle
1864  *
1865  * We assume that PCI USB devices with RMRRs have them largely
1866  * for historical reasons and that the RMRR space is not actively used post
1867  * boot.  This exclusion may change if vendors begin to abuse it.
1868  *
1869  * The same exception is made for graphics devices, with the requirement that
1870  * any use of the RMRR regions will be torn down before assigning the device
1871  * to a guest.
1872  *
1873  * Return: true if the RMRR is relaxable, false otherwise
1874  */
1875 static bool device_rmrr_is_relaxable(struct device *dev)
1876 {
1877 	struct pci_dev *pdev;
1878 
1879 	if (!dev_is_pci(dev))
1880 		return false;
1881 
1882 	pdev = to_pci_dev(dev);
1883 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1884 		return true;
1885 	else
1886 		return false;
1887 }
1888 
1889 static int device_def_domain_type(struct device *dev)
1890 {
1891 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1892 	struct intel_iommu *iommu = info->iommu;
1893 
1894 	/*
1895 	 * Hardware does not support the passthrough translation mode.
1896 	 * Always use a dynamaic mapping domain.
1897 	 */
1898 	if (!ecap_pass_through(iommu->ecap))
1899 		return IOMMU_DOMAIN_DMA;
1900 
1901 	if (dev_is_pci(dev)) {
1902 		struct pci_dev *pdev = to_pci_dev(dev);
1903 
1904 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1905 			return IOMMU_DOMAIN_IDENTITY;
1906 	}
1907 
1908 	return 0;
1909 }
1910 
1911 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1912 {
1913 	/*
1914 	 * Start from the sane iommu hardware state.
1915 	 * If the queued invalidation is already initialized by us
1916 	 * (for example, while enabling interrupt-remapping) then
1917 	 * we got the things already rolling from a sane state.
1918 	 */
1919 	if (!iommu->qi) {
1920 		/*
1921 		 * Clear any previous faults.
1922 		 */
1923 		dmar_fault(-1, iommu);
1924 		/*
1925 		 * Disable queued invalidation if supported and already enabled
1926 		 * before OS handover.
1927 		 */
1928 		dmar_disable_qi(iommu);
1929 	}
1930 
1931 	if (dmar_enable_qi(iommu)) {
1932 		/*
1933 		 * Queued Invalidate not enabled, use Register Based Invalidate
1934 		 */
1935 		iommu->flush.flush_context = __iommu_flush_context;
1936 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1937 		pr_info("%s: Using Register based invalidation\n",
1938 			iommu->name);
1939 	} else {
1940 		iommu->flush.flush_context = qi_flush_context;
1941 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1942 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1943 	}
1944 }
1945 
1946 static int copy_context_table(struct intel_iommu *iommu,
1947 			      struct root_entry *old_re,
1948 			      struct context_entry **tbl,
1949 			      int bus, bool ext)
1950 {
1951 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1952 	struct context_entry *new_ce = NULL, ce;
1953 	struct context_entry *old_ce = NULL;
1954 	struct root_entry re;
1955 	phys_addr_t old_ce_phys;
1956 
1957 	tbl_idx = ext ? bus * 2 : bus;
1958 	memcpy(&re, old_re, sizeof(re));
1959 
1960 	for (devfn = 0; devfn < 256; devfn++) {
1961 		/* First calculate the correct index */
1962 		idx = (ext ? devfn * 2 : devfn) % 256;
1963 
1964 		if (idx == 0) {
1965 			/* First save what we may have and clean up */
1966 			if (new_ce) {
1967 				tbl[tbl_idx] = new_ce;
1968 				__iommu_flush_cache(iommu, new_ce,
1969 						    VTD_PAGE_SIZE);
1970 				pos = 1;
1971 			}
1972 
1973 			if (old_ce)
1974 				memunmap(old_ce);
1975 
1976 			ret = 0;
1977 			if (devfn < 0x80)
1978 				old_ce_phys = root_entry_lctp(&re);
1979 			else
1980 				old_ce_phys = root_entry_uctp(&re);
1981 
1982 			if (!old_ce_phys) {
1983 				if (ext && devfn == 0) {
1984 					/* No LCTP, try UCTP */
1985 					devfn = 0x7f;
1986 					continue;
1987 				} else {
1988 					goto out;
1989 				}
1990 			}
1991 
1992 			ret = -ENOMEM;
1993 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1994 					MEMREMAP_WB);
1995 			if (!old_ce)
1996 				goto out;
1997 
1998 			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
1999 			if (!new_ce)
2000 				goto out_unmap;
2001 
2002 			ret = 0;
2003 		}
2004 
2005 		/* Now copy the context entry */
2006 		memcpy(&ce, old_ce + idx, sizeof(ce));
2007 
2008 		if (!context_present(&ce))
2009 			continue;
2010 
2011 		did = context_domain_id(&ce);
2012 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2013 			set_bit(did, iommu->domain_ids);
2014 
2015 		set_context_copied(iommu, bus, devfn);
2016 		new_ce[idx] = ce;
2017 	}
2018 
2019 	tbl[tbl_idx + pos] = new_ce;
2020 
2021 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2022 
2023 out_unmap:
2024 	memunmap(old_ce);
2025 
2026 out:
2027 	return ret;
2028 }
2029 
2030 static int copy_translation_tables(struct intel_iommu *iommu)
2031 {
2032 	struct context_entry **ctxt_tbls;
2033 	struct root_entry *old_rt;
2034 	phys_addr_t old_rt_phys;
2035 	int ctxt_table_entries;
2036 	u64 rtaddr_reg;
2037 	int bus, ret;
2038 	bool new_ext, ext;
2039 
2040 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2041 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2042 	new_ext    = !!sm_supported(iommu);
2043 
2044 	/*
2045 	 * The RTT bit can only be changed when translation is disabled,
2046 	 * but disabling translation means to open a window for data
2047 	 * corruption. So bail out and don't copy anything if we would
2048 	 * have to change the bit.
2049 	 */
2050 	if (new_ext != ext)
2051 		return -EINVAL;
2052 
2053 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2054 	if (!iommu->copied_tables)
2055 		return -ENOMEM;
2056 
2057 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2058 	if (!old_rt_phys)
2059 		return -EINVAL;
2060 
2061 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2062 	if (!old_rt)
2063 		return -ENOMEM;
2064 
2065 	/* This is too big for the stack - allocate it from slab */
2066 	ctxt_table_entries = ext ? 512 : 256;
2067 	ret = -ENOMEM;
2068 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2069 	if (!ctxt_tbls)
2070 		goto out_unmap;
2071 
2072 	for (bus = 0; bus < 256; bus++) {
2073 		ret = copy_context_table(iommu, &old_rt[bus],
2074 					 ctxt_tbls, bus, ext);
2075 		if (ret) {
2076 			pr_err("%s: Failed to copy context table for bus %d\n",
2077 				iommu->name, bus);
2078 			continue;
2079 		}
2080 	}
2081 
2082 	spin_lock(&iommu->lock);
2083 
2084 	/* Context tables are copied, now write them to the root_entry table */
2085 	for (bus = 0; bus < 256; bus++) {
2086 		int idx = ext ? bus * 2 : bus;
2087 		u64 val;
2088 
2089 		if (ctxt_tbls[idx]) {
2090 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2091 			iommu->root_entry[bus].lo = val;
2092 		}
2093 
2094 		if (!ext || !ctxt_tbls[idx + 1])
2095 			continue;
2096 
2097 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2098 		iommu->root_entry[bus].hi = val;
2099 	}
2100 
2101 	spin_unlock(&iommu->lock);
2102 
2103 	kfree(ctxt_tbls);
2104 
2105 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2106 
2107 	ret = 0;
2108 
2109 out_unmap:
2110 	memunmap(old_rt);
2111 
2112 	return ret;
2113 }
2114 
2115 static int __init init_dmars(void)
2116 {
2117 	struct dmar_drhd_unit *drhd;
2118 	struct intel_iommu *iommu;
2119 	int ret;
2120 
2121 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2122 	if (ret)
2123 		goto free_iommu;
2124 
2125 	for_each_iommu(iommu, drhd) {
2126 		if (drhd->ignored) {
2127 			iommu_disable_translation(iommu);
2128 			continue;
2129 		}
2130 
2131 		/*
2132 		 * Find the max pasid size of all IOMMU's in the system.
2133 		 * We need to ensure the system pasid table is no bigger
2134 		 * than the smallest supported.
2135 		 */
2136 		if (pasid_supported(iommu)) {
2137 			u32 temp = 2 << ecap_pss(iommu->ecap);
2138 
2139 			intel_pasid_max_id = min_t(u32, temp,
2140 						   intel_pasid_max_id);
2141 		}
2142 
2143 		intel_iommu_init_qi(iommu);
2144 
2145 		ret = iommu_init_domains(iommu);
2146 		if (ret)
2147 			goto free_iommu;
2148 
2149 		init_translation_status(iommu);
2150 
2151 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2152 			iommu_disable_translation(iommu);
2153 			clear_translation_pre_enabled(iommu);
2154 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2155 				iommu->name);
2156 		}
2157 
2158 		/*
2159 		 * TBD:
2160 		 * we could share the same root & context tables
2161 		 * among all IOMMU's. Need to Split it later.
2162 		 */
2163 		ret = iommu_alloc_root_entry(iommu);
2164 		if (ret)
2165 			goto free_iommu;
2166 
2167 		if (translation_pre_enabled(iommu)) {
2168 			pr_info("Translation already enabled - trying to copy translation structures\n");
2169 
2170 			ret = copy_translation_tables(iommu);
2171 			if (ret) {
2172 				/*
2173 				 * We found the IOMMU with translation
2174 				 * enabled - but failed to copy over the
2175 				 * old root-entry table. Try to proceed
2176 				 * by disabling translation now and
2177 				 * allocating a clean root-entry table.
2178 				 * This might cause DMAR faults, but
2179 				 * probably the dump will still succeed.
2180 				 */
2181 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2182 				       iommu->name);
2183 				iommu_disable_translation(iommu);
2184 				clear_translation_pre_enabled(iommu);
2185 			} else {
2186 				pr_info("Copied translation tables from previous kernel for %s\n",
2187 					iommu->name);
2188 			}
2189 		}
2190 
2191 		intel_svm_check(iommu);
2192 	}
2193 
2194 	/*
2195 	 * Now that qi is enabled on all iommus, set the root entry and flush
2196 	 * caches. This is required on some Intel X58 chipsets, otherwise the
2197 	 * flush_context function will loop forever and the boot hangs.
2198 	 */
2199 	for_each_active_iommu(iommu, drhd) {
2200 		iommu_flush_write_buffer(iommu);
2201 		iommu_set_root_entry(iommu);
2202 	}
2203 
2204 	check_tylersburg_isoch();
2205 
2206 	/*
2207 	 * for each drhd
2208 	 *   enable fault log
2209 	 *   global invalidate context cache
2210 	 *   global invalidate iotlb
2211 	 *   enable translation
2212 	 */
2213 	for_each_iommu(iommu, drhd) {
2214 		if (drhd->ignored) {
2215 			/*
2216 			 * we always have to disable PMRs or DMA may fail on
2217 			 * this device
2218 			 */
2219 			if (force_on)
2220 				iommu_disable_protect_mem_regions(iommu);
2221 			continue;
2222 		}
2223 
2224 		iommu_flush_write_buffer(iommu);
2225 
2226 		if (ecap_prs(iommu->ecap)) {
2227 			/*
2228 			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
2229 			 * could cause possible lock race condition.
2230 			 */
2231 			up_write(&dmar_global_lock);
2232 			ret = intel_iommu_enable_prq(iommu);
2233 			down_write(&dmar_global_lock);
2234 			if (ret)
2235 				goto free_iommu;
2236 		}
2237 
2238 		ret = dmar_set_interrupt(iommu);
2239 		if (ret)
2240 			goto free_iommu;
2241 	}
2242 
2243 	return 0;
2244 
2245 free_iommu:
2246 	for_each_active_iommu(iommu, drhd) {
2247 		disable_dmar_iommu(iommu);
2248 		free_dmar_iommu(iommu);
2249 	}
2250 
2251 	return ret;
2252 }
2253 
2254 static void __init init_no_remapping_devices(void)
2255 {
2256 	struct dmar_drhd_unit *drhd;
2257 	struct device *dev;
2258 	int i;
2259 
2260 	for_each_drhd_unit(drhd) {
2261 		if (!drhd->include_all) {
2262 			for_each_active_dev_scope(drhd->devices,
2263 						  drhd->devices_cnt, i, dev)
2264 				break;
2265 			/* ignore DMAR unit if no devices exist */
2266 			if (i == drhd->devices_cnt)
2267 				drhd->ignored = 1;
2268 		}
2269 	}
2270 
2271 	for_each_active_drhd_unit(drhd) {
2272 		if (drhd->include_all)
2273 			continue;
2274 
2275 		for_each_active_dev_scope(drhd->devices,
2276 					  drhd->devices_cnt, i, dev)
2277 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2278 				break;
2279 		if (i < drhd->devices_cnt)
2280 			continue;
2281 
2282 		/* This IOMMU has *only* gfx devices. Either bypass it or
2283 		   set the gfx_mapped flag, as appropriate */
2284 		drhd->gfx_dedicated = 1;
2285 		if (disable_igfx_iommu)
2286 			drhd->ignored = 1;
2287 	}
2288 }
2289 
2290 #ifdef CONFIG_SUSPEND
2291 static int init_iommu_hw(void)
2292 {
2293 	struct dmar_drhd_unit *drhd;
2294 	struct intel_iommu *iommu = NULL;
2295 	int ret;
2296 
2297 	for_each_active_iommu(iommu, drhd) {
2298 		if (iommu->qi) {
2299 			ret = dmar_reenable_qi(iommu);
2300 			if (ret)
2301 				return ret;
2302 		}
2303 	}
2304 
2305 	for_each_iommu(iommu, drhd) {
2306 		if (drhd->ignored) {
2307 			/*
2308 			 * we always have to disable PMRs or DMA may fail on
2309 			 * this device
2310 			 */
2311 			if (force_on)
2312 				iommu_disable_protect_mem_regions(iommu);
2313 			continue;
2314 		}
2315 
2316 		iommu_flush_write_buffer(iommu);
2317 		iommu_set_root_entry(iommu);
2318 		iommu_enable_translation(iommu);
2319 		iommu_disable_protect_mem_regions(iommu);
2320 	}
2321 
2322 	return 0;
2323 }
2324 
2325 static void iommu_flush_all(void)
2326 {
2327 	struct dmar_drhd_unit *drhd;
2328 	struct intel_iommu *iommu;
2329 
2330 	for_each_active_iommu(iommu, drhd) {
2331 		iommu->flush.flush_context(iommu, 0, 0, 0,
2332 					   DMA_CCMD_GLOBAL_INVL);
2333 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2334 					 DMA_TLB_GLOBAL_FLUSH);
2335 	}
2336 }
2337 
2338 static int iommu_suspend(void)
2339 {
2340 	struct dmar_drhd_unit *drhd;
2341 	struct intel_iommu *iommu = NULL;
2342 	unsigned long flag;
2343 
2344 	iommu_flush_all();
2345 
2346 	for_each_active_iommu(iommu, drhd) {
2347 		iommu_disable_translation(iommu);
2348 
2349 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2350 
2351 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2352 			readl(iommu->reg + DMAR_FECTL_REG);
2353 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2354 			readl(iommu->reg + DMAR_FEDATA_REG);
2355 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2356 			readl(iommu->reg + DMAR_FEADDR_REG);
2357 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2358 			readl(iommu->reg + DMAR_FEUADDR_REG);
2359 
2360 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2361 	}
2362 	return 0;
2363 }
2364 
2365 static void iommu_resume(void)
2366 {
2367 	struct dmar_drhd_unit *drhd;
2368 	struct intel_iommu *iommu = NULL;
2369 	unsigned long flag;
2370 
2371 	if (init_iommu_hw()) {
2372 		if (force_on)
2373 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2374 		else
2375 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2376 		return;
2377 	}
2378 
2379 	for_each_active_iommu(iommu, drhd) {
2380 
2381 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2382 
2383 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2384 			iommu->reg + DMAR_FECTL_REG);
2385 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2386 			iommu->reg + DMAR_FEDATA_REG);
2387 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2388 			iommu->reg + DMAR_FEADDR_REG);
2389 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2390 			iommu->reg + DMAR_FEUADDR_REG);
2391 
2392 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2393 	}
2394 }
2395 
2396 static struct syscore_ops iommu_syscore_ops = {
2397 	.resume		= iommu_resume,
2398 	.suspend	= iommu_suspend,
2399 };
2400 
2401 static void __init init_iommu_pm_ops(void)
2402 {
2403 	register_syscore_ops(&iommu_syscore_ops);
2404 }
2405 
2406 #else
/* Without CONFIG_SUSPEND there are no suspend/resume callbacks to register. */
static inline void init_iommu_pm_ops(void)
{
}
#endif	/* CONFIG_SUSPEND */
2409 
2410 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2411 {
2412 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2413 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2414 	    rmrr->end_address <= rmrr->base_address ||
2415 	    arch_rmrr_sanity_check(rmrr))
2416 		return -EINVAL;
2417 
2418 	return 0;
2419 }
2420 
2421 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2422 {
2423 	struct acpi_dmar_reserved_memory *rmrr;
2424 	struct dmar_rmrr_unit *rmrru;
2425 
2426 	rmrr = (struct acpi_dmar_reserved_memory *)header;
2427 	if (rmrr_sanity_check(rmrr)) {
2428 		pr_warn(FW_BUG
2429 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2430 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2431 			   rmrr->base_address, rmrr->end_address,
2432 			   dmi_get_system_info(DMI_BIOS_VENDOR),
2433 			   dmi_get_system_info(DMI_BIOS_VERSION),
2434 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2435 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2436 	}
2437 
2438 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2439 	if (!rmrru)
2440 		goto out;
2441 
2442 	rmrru->hdr = header;
2443 
2444 	rmrru->base_address = rmrr->base_address;
2445 	rmrru->end_address = rmrr->end_address;
2446 
2447 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2448 				((void *)rmrr) + rmrr->header.length,
2449 				&rmrru->devices_cnt);
2450 	if (rmrru->devices_cnt && rmrru->devices == NULL)
2451 		goto free_rmrru;
2452 
2453 	list_add(&rmrru->list, &dmar_rmrr_units);
2454 
2455 	return 0;
2456 free_rmrru:
2457 	kfree(rmrru);
2458 out:
2459 	return -ENOMEM;
2460 }
2461 
2462 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2463 {
2464 	struct dmar_atsr_unit *atsru;
2465 	struct acpi_dmar_atsr *tmp;
2466 
2467 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2468 				dmar_rcu_check()) {
2469 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2470 		if (atsr->segment != tmp->segment)
2471 			continue;
2472 		if (atsr->header.length != tmp->header.length)
2473 			continue;
2474 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2475 			return atsru;
2476 	}
2477 
2478 	return NULL;
2479 }
2480 
2481 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2482 {
2483 	struct acpi_dmar_atsr *atsr;
2484 	struct dmar_atsr_unit *atsru;
2485 
2486 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2487 		return 0;
2488 
2489 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2490 	atsru = dmar_find_atsr(atsr);
2491 	if (atsru)
2492 		return 0;
2493 
2494 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2495 	if (!atsru)
2496 		return -ENOMEM;
2497 
2498 	/*
2499 	 * If memory is allocated from slab by ACPI _DSM method, we need to
2500 	 * copy the memory content because the memory buffer will be freed
2501 	 * on return.
2502 	 */
2503 	atsru->hdr = (void *)(atsru + 1);
2504 	memcpy(atsru->hdr, hdr, hdr->length);
2505 	atsru->include_all = atsr->flags & 0x1;
2506 	if (!atsru->include_all) {
2507 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2508 				(void *)atsr + atsr->header.length,
2509 				&atsru->devices_cnt);
2510 		if (atsru->devices_cnt && atsru->devices == NULL) {
2511 			kfree(atsru);
2512 			return -ENOMEM;
2513 		}
2514 	}
2515 
2516 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2517 
2518 	return 0;
2519 }
2520 
2521 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2522 {
2523 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2524 	kfree(atsru);
2525 }
2526 
2527 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2528 {
2529 	struct acpi_dmar_atsr *atsr;
2530 	struct dmar_atsr_unit *atsru;
2531 
2532 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2533 	atsru = dmar_find_atsr(atsr);
2534 	if (atsru) {
2535 		list_del_rcu(&atsru->list);
2536 		synchronize_rcu();
2537 		intel_iommu_free_atsr(atsru);
2538 	}
2539 
2540 	return 0;
2541 }
2542 
2543 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2544 {
2545 	int i;
2546 	struct device *dev;
2547 	struct acpi_dmar_atsr *atsr;
2548 	struct dmar_atsr_unit *atsru;
2549 
2550 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2551 	atsru = dmar_find_atsr(atsr);
2552 	if (!atsru)
2553 		return 0;
2554 
2555 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2556 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2557 					  i, dev)
2558 			return -EBUSY;
2559 	}
2560 
2561 	return 0;
2562 }
2563 
2564 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2565 {
2566 	struct dmar_satc_unit *satcu;
2567 	struct acpi_dmar_satc *tmp;
2568 
2569 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2570 				dmar_rcu_check()) {
2571 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2572 		if (satc->segment != tmp->segment)
2573 			continue;
2574 		if (satc->header.length != tmp->header.length)
2575 			continue;
2576 		if (memcmp(satc, tmp, satc->header.length) == 0)
2577 			return satcu;
2578 	}
2579 
2580 	return NULL;
2581 }
2582 
2583 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2584 {
2585 	struct acpi_dmar_satc *satc;
2586 	struct dmar_satc_unit *satcu;
2587 
2588 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2589 		return 0;
2590 
2591 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2592 	satcu = dmar_find_satc(satc);
2593 	if (satcu)
2594 		return 0;
2595 
2596 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2597 	if (!satcu)
2598 		return -ENOMEM;
2599 
2600 	satcu->hdr = (void *)(satcu + 1);
2601 	memcpy(satcu->hdr, hdr, hdr->length);
2602 	satcu->atc_required = satc->flags & 0x1;
2603 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2604 					      (void *)satc + satc->header.length,
2605 					      &satcu->devices_cnt);
2606 	if (satcu->devices_cnt && !satcu->devices) {
2607 		kfree(satcu);
2608 		return -ENOMEM;
2609 	}
2610 	list_add_rcu(&satcu->list, &dmar_satc_units);
2611 
2612 	return 0;
2613 }
2614 
2615 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2616 {
2617 	struct intel_iommu *iommu = dmaru->iommu;
2618 	int ret;
2619 
2620 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2621 	if (ret)
2622 		goto out;
2623 
2624 	/*
2625 	 * Disable translation if already enabled prior to OS handover.
2626 	 */
2627 	if (iommu->gcmd & DMA_GCMD_TE)
2628 		iommu_disable_translation(iommu);
2629 
2630 	ret = iommu_init_domains(iommu);
2631 	if (ret == 0)
2632 		ret = iommu_alloc_root_entry(iommu);
2633 	if (ret)
2634 		goto out;
2635 
2636 	intel_svm_check(iommu);
2637 
2638 	if (dmaru->ignored) {
2639 		/*
2640 		 * we always have to disable PMRs or DMA may fail on this device
2641 		 */
2642 		if (force_on)
2643 			iommu_disable_protect_mem_regions(iommu);
2644 		return 0;
2645 	}
2646 
2647 	intel_iommu_init_qi(iommu);
2648 	iommu_flush_write_buffer(iommu);
2649 
2650 	if (ecap_prs(iommu->ecap)) {
2651 		ret = intel_iommu_enable_prq(iommu);
2652 		if (ret)
2653 			goto disable_iommu;
2654 	}
2655 
2656 	ret = dmar_set_interrupt(iommu);
2657 	if (ret)
2658 		goto disable_iommu;
2659 
2660 	iommu_set_root_entry(iommu);
2661 	iommu_enable_translation(iommu);
2662 
2663 	iommu_disable_protect_mem_regions(iommu);
2664 	return 0;
2665 
2666 disable_iommu:
2667 	disable_dmar_iommu(iommu);
2668 out:
2669 	free_dmar_iommu(iommu);
2670 	return ret;
2671 }
2672 
2673 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2674 {
2675 	int ret = 0;
2676 	struct intel_iommu *iommu = dmaru->iommu;
2677 
2678 	if (!intel_iommu_enabled)
2679 		return 0;
2680 	if (iommu == NULL)
2681 		return -EINVAL;
2682 
2683 	if (insert) {
2684 		ret = intel_iommu_add(dmaru);
2685 	} else {
2686 		disable_dmar_iommu(iommu);
2687 		free_dmar_iommu(iommu);
2688 	}
2689 
2690 	return ret;
2691 }
2692 
2693 static void intel_iommu_free_dmars(void)
2694 {
2695 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2696 	struct dmar_atsr_unit *atsru, *atsr_n;
2697 	struct dmar_satc_unit *satcu, *satc_n;
2698 
2699 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2700 		list_del(&rmrru->list);
2701 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2702 		kfree(rmrru);
2703 	}
2704 
2705 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2706 		list_del(&atsru->list);
2707 		intel_iommu_free_atsr(atsru);
2708 	}
2709 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2710 		list_del(&satcu->list);
2711 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2712 		kfree(satcu);
2713 	}
2714 }
2715 
2716 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2717 {
2718 	struct dmar_satc_unit *satcu;
2719 	struct acpi_dmar_satc *satc;
2720 	struct device *tmp;
2721 	int i;
2722 
2723 	dev = pci_physfn(dev);
2724 	rcu_read_lock();
2725 
2726 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2727 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2728 		if (satc->segment != pci_domain_nr(dev->bus))
2729 			continue;
2730 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2731 			if (to_pci_dev(tmp) == dev)
2732 				goto out;
2733 	}
2734 	satcu = NULL;
2735 out:
2736 	rcu_read_unlock();
2737 	return satcu;
2738 }
2739 
2740 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2741 {
2742 	int i, ret = 1;
2743 	struct pci_bus *bus;
2744 	struct pci_dev *bridge = NULL;
2745 	struct device *tmp;
2746 	struct acpi_dmar_atsr *atsr;
2747 	struct dmar_atsr_unit *atsru;
2748 	struct dmar_satc_unit *satcu;
2749 
2750 	dev = pci_physfn(dev);
2751 	satcu = dmar_find_matched_satc_unit(dev);
2752 	if (satcu)
2753 		/*
2754 		 * This device supports ATS as it is in SATC table.
2755 		 * When IOMMU is in legacy mode, enabling ATS is done
2756 		 * automatically by HW for the device that requires
2757 		 * ATS, hence OS should not enable this device ATS
2758 		 * to avoid duplicated TLB invalidation.
2759 		 */
2760 		return !(satcu->atc_required && !sm_supported(iommu));
2761 
2762 	for (bus = dev->bus; bus; bus = bus->parent) {
2763 		bridge = bus->self;
2764 		/* If it's an integrated device, allow ATS */
2765 		if (!bridge)
2766 			return 1;
2767 		/* Connected via non-PCIe: no ATS */
2768 		if (!pci_is_pcie(bridge) ||
2769 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2770 			return 0;
2771 		/* If we found the root port, look it up in the ATSR */
2772 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2773 			break;
2774 	}
2775 
2776 	rcu_read_lock();
2777 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2778 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2779 		if (atsr->segment != pci_domain_nr(dev->bus))
2780 			continue;
2781 
2782 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2783 			if (tmp == &bridge->dev)
2784 				goto out;
2785 
2786 		if (atsru->include_all)
2787 			goto out;
2788 	}
2789 	ret = 0;
2790 out:
2791 	rcu_read_unlock();
2792 
2793 	return ret;
2794 }
2795 
2796 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2797 {
2798 	int ret;
2799 	struct dmar_rmrr_unit *rmrru;
2800 	struct dmar_atsr_unit *atsru;
2801 	struct dmar_satc_unit *satcu;
2802 	struct acpi_dmar_atsr *atsr;
2803 	struct acpi_dmar_reserved_memory *rmrr;
2804 	struct acpi_dmar_satc *satc;
2805 
2806 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2807 		return 0;
2808 
2809 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2810 		rmrr = container_of(rmrru->hdr,
2811 				    struct acpi_dmar_reserved_memory, header);
2812 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2813 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2814 				((void *)rmrr) + rmrr->header.length,
2815 				rmrr->segment, rmrru->devices,
2816 				rmrru->devices_cnt);
2817 			if (ret < 0)
2818 				return ret;
2819 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2820 			dmar_remove_dev_scope(info, rmrr->segment,
2821 				rmrru->devices, rmrru->devices_cnt);
2822 		}
2823 	}
2824 
2825 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2826 		if (atsru->include_all)
2827 			continue;
2828 
2829 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2830 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2831 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2832 					(void *)atsr + atsr->header.length,
2833 					atsr->segment, atsru->devices,
2834 					atsru->devices_cnt);
2835 			if (ret > 0)
2836 				break;
2837 			else if (ret < 0)
2838 				return ret;
2839 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2840 			if (dmar_remove_dev_scope(info, atsr->segment,
2841 					atsru->devices, atsru->devices_cnt))
2842 				break;
2843 		}
2844 	}
2845 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2846 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2847 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2848 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2849 					(void *)satc + satc->header.length,
2850 					satc->segment, satcu->devices,
2851 					satcu->devices_cnt);
2852 			if (ret > 0)
2853 				break;
2854 			else if (ret < 0)
2855 				return ret;
2856 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2857 			if (dmar_remove_dev_scope(info, satc->segment,
2858 					satcu->devices, satcu->devices_cnt))
2859 				break;
2860 		}
2861 	}
2862 
2863 	return 0;
2864 }
2865 
2866 static void intel_disable_iommus(void)
2867 {
2868 	struct intel_iommu *iommu = NULL;
2869 	struct dmar_drhd_unit *drhd;
2870 
2871 	for_each_iommu(iommu, drhd)
2872 		iommu_disable_translation(iommu);
2873 }
2874 
2875 void intel_iommu_shutdown(void)
2876 {
2877 	struct dmar_drhd_unit *drhd;
2878 	struct intel_iommu *iommu = NULL;
2879 
2880 	if (no_iommu || dmar_disabled)
2881 		return;
2882 
2883 	down_write(&dmar_global_lock);
2884 
2885 	/* Disable PMRs explicitly here. */
2886 	for_each_iommu(iommu, drhd)
2887 		iommu_disable_protect_mem_regions(iommu);
2888 
2889 	/* Make sure the IOMMUs are switched off */
2890 	intel_disable_iommus();
2891 
2892 	up_write(&dmar_global_lock);
2893 }
2894 
2895 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2896 {
2897 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2898 
2899 	return container_of(iommu_dev, struct intel_iommu, iommu);
2900 }
2901 
2902 static ssize_t version_show(struct device *dev,
2903 			    struct device_attribute *attr, char *buf)
2904 {
2905 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2906 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2907 	return sysfs_emit(buf, "%d:%d\n",
2908 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2909 }
2910 static DEVICE_ATTR_RO(version);
2911 
2912 static ssize_t address_show(struct device *dev,
2913 			    struct device_attribute *attr, char *buf)
2914 {
2915 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2916 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2917 }
2918 static DEVICE_ATTR_RO(address);
2919 
2920 static ssize_t cap_show(struct device *dev,
2921 			struct device_attribute *attr, char *buf)
2922 {
2923 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2924 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2925 }
2926 static DEVICE_ATTR_RO(cap);
2927 
2928 static ssize_t ecap_show(struct device *dev,
2929 			 struct device_attribute *attr, char *buf)
2930 {
2931 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2932 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2933 }
2934 static DEVICE_ATTR_RO(ecap);
2935 
2936 static ssize_t domains_supported_show(struct device *dev,
2937 				      struct device_attribute *attr, char *buf)
2938 {
2939 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2940 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2941 }
2942 static DEVICE_ATTR_RO(domains_supported);
2943 
2944 static ssize_t domains_used_show(struct device *dev,
2945 				 struct device_attribute *attr, char *buf)
2946 {
2947 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2948 	return sysfs_emit(buf, "%d\n",
2949 			  bitmap_weight(iommu->domain_ids,
2950 					cap_ndoms(iommu->cap)));
2951 }
2952 static DEVICE_ATTR_RO(domains_used);
2953 
2954 static struct attribute *intel_iommu_attrs[] = {
2955 	&dev_attr_version.attr,
2956 	&dev_attr_address.attr,
2957 	&dev_attr_cap.attr,
2958 	&dev_attr_ecap.attr,
2959 	&dev_attr_domains_supported.attr,
2960 	&dev_attr_domains_used.attr,
2961 	NULL,
2962 };
2963 
2964 static struct attribute_group intel_iommu_group = {
2965 	.name = "intel-iommu",
2966 	.attrs = intel_iommu_attrs,
2967 };
2968 
2969 const struct attribute_group *intel_iommu_groups[] = {
2970 	&intel_iommu_group,
2971 	NULL,
2972 };
2973 
2974 static bool has_external_pci(void)
2975 {
2976 	struct pci_dev *pdev = NULL;
2977 
2978 	for_each_pci_dev(pdev)
2979 		if (pdev->external_facing) {
2980 			pci_dev_put(pdev);
2981 			return true;
2982 		}
2983 
2984 	return false;
2985 }
2986 
2987 static int __init platform_optin_force_iommu(void)
2988 {
2989 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2990 		return 0;
2991 
2992 	if (no_iommu || dmar_disabled)
2993 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2994 
2995 	/*
2996 	 * If Intel-IOMMU is disabled by default, we will apply identity
2997 	 * map for all devices except those marked as being untrusted.
2998 	 */
2999 	if (dmar_disabled)
3000 		iommu_set_default_passthrough(false);
3001 
3002 	dmar_disabled = 0;
3003 	no_iommu = 0;
3004 
3005 	return 1;
3006 }
3007 
3008 static int __init probe_acpi_namespace_devices(void)
3009 {
3010 	struct dmar_drhd_unit *drhd;
3011 	/* To avoid a -Wunused-but-set-variable warning. */
3012 	struct intel_iommu *iommu __maybe_unused;
3013 	struct device *dev;
3014 	int i, ret = 0;
3015 
3016 	for_each_active_iommu(iommu, drhd) {
3017 		for_each_active_dev_scope(drhd->devices,
3018 					  drhd->devices_cnt, i, dev) {
3019 			struct acpi_device_physical_node *pn;
3020 			struct acpi_device *adev;
3021 
3022 			if (dev->bus != &acpi_bus_type)
3023 				continue;
3024 
3025 			adev = to_acpi_device(dev);
3026 			mutex_lock(&adev->physical_node_lock);
3027 			list_for_each_entry(pn,
3028 					    &adev->physical_node_list, node) {
3029 				ret = iommu_probe_device(pn->dev);
3030 				if (ret)
3031 					break;
3032 			}
3033 			mutex_unlock(&adev->physical_node_lock);
3034 
3035 			if (ret)
3036 				return ret;
3037 		}
3038 	}
3039 
3040 	return 0;
3041 }
3042 
3043 static __init int tboot_force_iommu(void)
3044 {
3045 	if (!tboot_enabled())
3046 		return 0;
3047 
3048 	if (no_iommu || dmar_disabled)
3049 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3050 
3051 	dmar_disabled = 0;
3052 	no_iommu = 0;
3053 
3054 	return 1;
3055 }
3056 
3057 int __init intel_iommu_init(void)
3058 {
3059 	int ret = -ENODEV;
3060 	struct dmar_drhd_unit *drhd;
3061 	struct intel_iommu *iommu;
3062 
3063 	/*
3064 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3065 	 * opt in, so enforce that.
3066 	 */
3067 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3068 		    platform_optin_force_iommu();
3069 
3070 	down_write(&dmar_global_lock);
3071 	if (dmar_table_init()) {
3072 		if (force_on)
3073 			panic("tboot: Failed to initialize DMAR table\n");
3074 		goto out_free_dmar;
3075 	}
3076 
3077 	if (dmar_dev_scope_init() < 0) {
3078 		if (force_on)
3079 			panic("tboot: Failed to initialize DMAR device scope\n");
3080 		goto out_free_dmar;
3081 	}
3082 
3083 	up_write(&dmar_global_lock);
3084 
3085 	/*
3086 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3087 	 * complain later when we register it under the lock.
3088 	 */
3089 	dmar_register_bus_notifier();
3090 
3091 	down_write(&dmar_global_lock);
3092 
3093 	if (!no_iommu)
3094 		intel_iommu_debugfs_init();
3095 
3096 	if (no_iommu || dmar_disabled) {
3097 		/*
3098 		 * We exit the function here to ensure IOMMU's remapping and
3099 		 * mempool aren't setup, which means that the IOMMU's PMRs
3100 		 * won't be disabled via the call to init_dmars(). So disable
3101 		 * it explicitly here. The PMRs were setup by tboot prior to
3102 		 * calling SENTER, but the kernel is expected to reset/tear
3103 		 * down the PMRs.
3104 		 */
3105 		if (intel_iommu_tboot_noforce) {
3106 			for_each_iommu(iommu, drhd)
3107 				iommu_disable_protect_mem_regions(iommu);
3108 		}
3109 
3110 		/*
3111 		 * Make sure the IOMMUs are switched off, even when we
3112 		 * boot into a kexec kernel and the previous kernel left
3113 		 * them enabled
3114 		 */
3115 		intel_disable_iommus();
3116 		goto out_free_dmar;
3117 	}
3118 
3119 	if (list_empty(&dmar_rmrr_units))
3120 		pr_info("No RMRR found\n");
3121 
3122 	if (list_empty(&dmar_atsr_units))
3123 		pr_info("No ATSR found\n");
3124 
3125 	if (list_empty(&dmar_satc_units))
3126 		pr_info("No SATC found\n");
3127 
3128 	init_no_remapping_devices();
3129 
3130 	ret = init_dmars();
3131 	if (ret) {
3132 		if (force_on)
3133 			panic("tboot: Failed to initialize DMARs\n");
3134 		pr_err("Initialization failed\n");
3135 		goto out_free_dmar;
3136 	}
3137 	up_write(&dmar_global_lock);
3138 
3139 	init_iommu_pm_ops();
3140 
3141 	down_read(&dmar_global_lock);
3142 	for_each_active_iommu(iommu, drhd) {
3143 		/*
3144 		 * The flush queue implementation does not perform
3145 		 * page-selective invalidations that are required for efficient
3146 		 * TLB flushes in virtual environments.  The benefit of batching
3147 		 * is likely to be much lower than the overhead of synchronizing
3148 		 * the virtual and physical IOMMU page-tables.
3149 		 */
3150 		if (cap_caching_mode(iommu->cap) &&
3151 		    !first_level_by_default(iommu)) {
3152 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3153 			iommu_set_dma_strict();
3154 		}
3155 		iommu_device_sysfs_add(&iommu->iommu, NULL,
3156 				       intel_iommu_groups,
3157 				       "%s", iommu->name);
3158 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3159 
3160 		iommu_pmu_register(iommu);
3161 	}
3162 
3163 	if (probe_acpi_namespace_devices())
3164 		pr_warn("ACPI name space devices didn't probe correctly\n");
3165 
3166 	/* Finally, we enable the DMA remapping hardware. */
3167 	for_each_iommu(iommu, drhd) {
3168 		if (!drhd->ignored && !translation_pre_enabled(iommu))
3169 			iommu_enable_translation(iommu);
3170 
3171 		iommu_disable_protect_mem_regions(iommu);
3172 	}
3173 	up_read(&dmar_global_lock);
3174 
3175 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3176 
3177 	intel_iommu_enabled = 1;
3178 
3179 	return 0;
3180 
3181 out_free_dmar:
3182 	intel_iommu_free_dmars();
3183 	up_write(&dmar_global_lock);
3184 	return ret;
3185 }
3186 
3187 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3188 {
3189 	struct device_domain_info *info = opaque;
3190 
3191 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3192 	return 0;
3193 }
3194 
3195 /*
3196  * NB - intel-iommu lacks any sort of reference counting for the users of
3197  * dependent devices.  If multiple endpoints have intersecting dependent
3198  * devices, unbinding the driver from any one of them will possibly leave
3199  * the others unable to operate.
3200  */
3201 static void domain_context_clear(struct device_domain_info *info)
3202 {
3203 	if (!dev_is_pci(info->dev)) {
3204 		domain_context_clear_one(info, info->bus, info->devfn);
3205 		return;
3206 	}
3207 
3208 	pci_for_each_dma_alias(to_pci_dev(info->dev),
3209 			       &domain_context_clear_one_cb, info);
3210 }
3211 
3212 /*
3213  * Clear the page table pointer in context or pasid table entries so that
3214  * all DMA requests without PASID from the device are blocked. If the page
3215  * table has been set, clean up the data structures.
3216  */
3217 void device_block_translation(struct device *dev)
3218 {
3219 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3220 	struct intel_iommu *iommu = info->iommu;
3221 	unsigned long flags;
3222 
3223 	if (info->domain)
3224 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3225 
3226 	iommu_disable_pci_caps(info);
3227 	if (!dev_is_real_dma_subdevice(dev)) {
3228 		if (sm_supported(iommu))
3229 			intel_pasid_tear_down_entry(iommu, dev,
3230 						    IOMMU_NO_PASID, false);
3231 		else
3232 			domain_context_clear(info);
3233 	}
3234 
3235 	if (!info->domain)
3236 		return;
3237 
3238 	spin_lock_irqsave(&info->domain->lock, flags);
3239 	list_del(&info->link);
3240 	spin_unlock_irqrestore(&info->domain->lock, flags);
3241 
3242 	domain_detach_iommu(info->domain, iommu);
3243 	info->domain = NULL;
3244 }
3245 
/*
 * attach_dev handler of the blocking domain: attaching simply means
 * blocking translation for @dev. @domain is unused. Always returns 0.
 */
static int blocking_domain_attach_dev(struct iommu_domain *domain,
				      struct device *dev)
{
	device_block_translation(dev);

	return 0;
}
3252 
3253 static struct iommu_domain blocking_domain = {
3254 	.type = IOMMU_DOMAIN_BLOCKED,
3255 	.ops = &(const struct iommu_domain_ops) {
3256 		.attach_dev	= blocking_domain_attach_dev,
3257 	}
3258 };
3259 
3260 static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3261 {
3262 	if (!intel_iommu_superpage)
3263 		return 0;
3264 
3265 	if (first_stage)
3266 		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3267 
3268 	return fls(cap_super_page_val(iommu->cap));
3269 }
3270 
3271 static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3272 {
3273 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3274 	struct intel_iommu *iommu = info->iommu;
3275 	struct dmar_domain *domain;
3276 	int addr_width;
3277 
3278 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3279 	if (!domain)
3280 		return ERR_PTR(-ENOMEM);
3281 
3282 	INIT_LIST_HEAD(&domain->devices);
3283 	INIT_LIST_HEAD(&domain->dev_pasids);
3284 	INIT_LIST_HEAD(&domain->cache_tags);
3285 	spin_lock_init(&domain->lock);
3286 	spin_lock_init(&domain->cache_lock);
3287 	xa_init(&domain->iommu_array);
3288 
3289 	domain->nid = dev_to_node(dev);
3290 	domain->use_first_level = first_stage;
3291 
3292 	/* calculate the address width */
3293 	addr_width = agaw_to_width(iommu->agaw);
3294 	if (addr_width > cap_mgaw(iommu->cap))
3295 		addr_width = cap_mgaw(iommu->cap);
3296 	domain->gaw = addr_width;
3297 	domain->agaw = iommu->agaw;
3298 	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3299 
3300 	/* iommu memory access coherency */
3301 	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3302 
3303 	/* pagesize bitmap */
3304 	domain->domain.pgsize_bitmap = SZ_4K;
3305 	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3306 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3307 
3308 	/*
3309 	 * IOVA aperture: First-level translation restricts the input-address
3310 	 * to a canonical address (i.e., address bits 63:N have the same value
3311 	 * as address bit [N-1], where N is 48-bits with 4-level paging and
3312 	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3313 	 */
3314 	domain->domain.geometry.force_aperture = true;
3315 	domain->domain.geometry.aperture_start = 0;
3316 	if (first_stage)
3317 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3318 	else
3319 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
3320 
3321 	/* always allocate the top pgd */
3322 	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3323 	if (!domain->pgd) {
3324 		kfree(domain);
3325 		return ERR_PTR(-ENOMEM);
3326 	}
3327 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3328 
3329 	return domain;
3330 }
3331 
3332 static struct iommu_domain *
3333 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3334 				      const struct iommu_user_data *user_data)
3335 {
3336 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3337 	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3338 	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3339 	struct intel_iommu *iommu = info->iommu;
3340 	struct dmar_domain *dmar_domain;
3341 	struct iommu_domain *domain;
3342 	bool first_stage;
3343 
3344 	if (flags &
3345 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3346 	       | IOMMU_HWPT_FAULT_ID_VALID)))
3347 		return ERR_PTR(-EOPNOTSUPP);
3348 	if (nested_parent && !nested_supported(iommu))
3349 		return ERR_PTR(-EOPNOTSUPP);
3350 	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3351 		return ERR_PTR(-EOPNOTSUPP);
3352 
3353 	/*
3354 	 * Always allocate the guest compatible page table unless
3355 	 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3356 	 * is specified.
3357 	 */
3358 	if (nested_parent || dirty_tracking) {
3359 		if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3360 			return ERR_PTR(-EOPNOTSUPP);
3361 		first_stage = false;
3362 	} else {
3363 		first_stage = first_level_by_default(iommu);
3364 	}
3365 
3366 	dmar_domain = paging_domain_alloc(dev, first_stage);
3367 	if (IS_ERR(dmar_domain))
3368 		return ERR_CAST(dmar_domain);
3369 	domain = &dmar_domain->domain;
3370 	domain->type = IOMMU_DOMAIN_UNMANAGED;
3371 	domain->owner = &intel_iommu_ops;
3372 	domain->ops = intel_iommu_ops.default_domain_ops;
3373 
3374 	if (nested_parent) {
3375 		dmar_domain->nested_parent = true;
3376 		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3377 		spin_lock_init(&dmar_domain->s1_lock);
3378 	}
3379 
3380 	if (dirty_tracking) {
3381 		if (dmar_domain->use_first_level) {
3382 			iommu_domain_free(domain);
3383 			return ERR_PTR(-EOPNOTSUPP);
3384 		}
3385 		domain->dirty_ops = &intel_dirty_ops;
3386 	}
3387 
3388 	return domain;
3389 }
3390 
3391 static void intel_iommu_domain_free(struct iommu_domain *domain)
3392 {
3393 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3394 
3395 	WARN_ON(dmar_domain->nested_parent &&
3396 		!list_empty(&dmar_domain->s1_domains));
3397 	domain_exit(dmar_domain);
3398 }
3399 
3400 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3401 {
3402 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3403 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3404 	struct intel_iommu *iommu = info->iommu;
3405 	int addr_width;
3406 
3407 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3408 		return -EPERM;
3409 
3410 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3411 		return -EINVAL;
3412 
3413 	if (domain->dirty_ops && !ssads_supported(iommu))
3414 		return -EINVAL;
3415 
3416 	if (dmar_domain->iommu_coherency !=
3417 			iommu_paging_structure_coherency(iommu))
3418 		return -EINVAL;
3419 
3420 	if (dmar_domain->iommu_superpage !=
3421 			iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3422 		return -EINVAL;
3423 
3424 	if (dmar_domain->use_first_level &&
3425 	    (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3426 		return -EINVAL;
3427 
3428 	/* check if this iommu agaw is sufficient for max mapped address */
3429 	addr_width = agaw_to_width(iommu->agaw);
3430 	if (addr_width > cap_mgaw(iommu->cap))
3431 		addr_width = cap_mgaw(iommu->cap);
3432 
3433 	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3434 		return -EINVAL;
3435 
3436 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3437 	    context_copied(iommu, info->bus, info->devfn))
3438 		return intel_pasid_setup_sm_context(dev);
3439 
3440 	return 0;
3441 }
3442 
/*
 * Attach @dev to paging @domain. Translation for the device is blocked
 * first; the attach then only proceeds if the domain is compatible with
 * the device's IOMMU. Returns 0 or a negative errno.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	device_block_translation(dev);

	ret = paging_domain_compatible(domain, dev);
	if (!ret)
		ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);

	return ret;
}
3456 
3457 static int intel_iommu_map(struct iommu_domain *domain,
3458 			   unsigned long iova, phys_addr_t hpa,
3459 			   size_t size, int iommu_prot, gfp_t gfp)
3460 {
3461 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3462 	u64 max_addr;
3463 	int prot = 0;
3464 
3465 	if (iommu_prot & IOMMU_READ)
3466 		prot |= DMA_PTE_READ;
3467 	if (iommu_prot & IOMMU_WRITE)
3468 		prot |= DMA_PTE_WRITE;
3469 	if (dmar_domain->set_pte_snp)
3470 		prot |= DMA_PTE_SNP;
3471 
3472 	max_addr = iova + size;
3473 	if (dmar_domain->max_addr < max_addr) {
3474 		u64 end;
3475 
3476 		/* check if minimum agaw is sufficient for mapped address */
3477 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3478 		if (end < max_addr) {
3479 			pr_err("%s: iommu width (%d) is not "
3480 			       "sufficient for the mapped address (%llx)\n",
3481 			       __func__, dmar_domain->gaw, max_addr);
3482 			return -EFAULT;
3483 		}
3484 		dmar_domain->max_addr = max_addr;
3485 	}
3486 	/* Round up size to next multiple of PAGE_SIZE, if it and
3487 	   the low bits of hpa would take us onto the next page */
3488 	size = aligned_nrpages(hpa, size);
3489 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3490 				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3491 }
3492 
3493 static int intel_iommu_map_pages(struct iommu_domain *domain,
3494 				 unsigned long iova, phys_addr_t paddr,
3495 				 size_t pgsize, size_t pgcount,
3496 				 int prot, gfp_t gfp, size_t *mapped)
3497 {
3498 	unsigned long pgshift = __ffs(pgsize);
3499 	size_t size = pgcount << pgshift;
3500 	int ret;
3501 
3502 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3503 		return -EINVAL;
3504 
3505 	if (!IS_ALIGNED(iova | paddr, pgsize))
3506 		return -EINVAL;
3507 
3508 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3509 	if (!ret && mapped)
3510 		*mapped = size;
3511 
3512 	return ret;
3513 }
3514 
3515 static size_t intel_iommu_unmap(struct iommu_domain *domain,
3516 				unsigned long iova, size_t size,
3517 				struct iommu_iotlb_gather *gather)
3518 {
3519 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3520 	unsigned long start_pfn, last_pfn;
3521 	int level = 0;
3522 
3523 	/* Cope with horrid API which requires us to unmap more than the
3524 	   size argument if it happens to be a large-page mapping. */
3525 	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3526 				     &level, GFP_ATOMIC)))
3527 		return 0;
3528 
3529 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3530 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3531 
3532 	start_pfn = iova >> VTD_PAGE_SHIFT;
3533 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3534 
3535 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3536 
3537 	if (dmar_domain->max_addr == iova + size)
3538 		dmar_domain->max_addr = iova;
3539 
3540 	/*
3541 	 * We do not use page-selective IOTLB invalidation in flush queue,
3542 	 * so there is no need to track page and sync iotlb.
3543 	 */
3544 	if (!iommu_iotlb_gather_queued(gather))
3545 		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3546 
3547 	return size;
3548 }
3549 
3550 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3551 				      unsigned long iova,
3552 				      size_t pgsize, size_t pgcount,
3553 				      struct iommu_iotlb_gather *gather)
3554 {
3555 	unsigned long pgshift = __ffs(pgsize);
3556 	size_t size = pgcount << pgshift;
3557 
3558 	return intel_iommu_unmap(domain, iova, size, gather);
3559 }
3560 
3561 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3562 				 struct iommu_iotlb_gather *gather)
3563 {
3564 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3565 			      gather->end, list_empty(&gather->freelist));
3566 	iommu_put_pages_list(&gather->freelist);
3567 }
3568 
3569 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3570 					    dma_addr_t iova)
3571 {
3572 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3573 	struct dma_pte *pte;
3574 	int level = 0;
3575 	u64 phys = 0;
3576 
3577 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3578 			     GFP_ATOMIC);
3579 	if (pte && dma_pte_present(pte))
3580 		phys = dma_pte_addr(pte) +
3581 			(iova & (BIT_MASK(level_to_offset_bits(level) +
3582 						VTD_PAGE_SHIFT) - 1));
3583 
3584 	return phys;
3585 }
3586 
3587 static bool domain_support_force_snooping(struct dmar_domain *domain)
3588 {
3589 	struct device_domain_info *info;
3590 	bool support = true;
3591 
3592 	assert_spin_locked(&domain->lock);
3593 	list_for_each_entry(info, &domain->devices, link) {
3594 		if (!ecap_sc_support(info->iommu->ecap)) {
3595 			support = false;
3596 			break;
3597 		}
3598 	}
3599 
3600 	return support;
3601 }
3602 
3603 static void domain_set_force_snooping(struct dmar_domain *domain)
3604 {
3605 	struct device_domain_info *info;
3606 
3607 	assert_spin_locked(&domain->lock);
3608 	/*
3609 	 * Second level page table supports per-PTE snoop control. The
3610 	 * iommu_map() interface will handle this by setting SNP bit.
3611 	 */
3612 	if (!domain->use_first_level) {
3613 		domain->set_pte_snp = true;
3614 		return;
3615 	}
3616 
3617 	list_for_each_entry(info, &domain->devices, link)
3618 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3619 						     IOMMU_NO_PASID);
3620 }
3621 
3622 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3623 {
3624 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3625 	unsigned long flags;
3626 
3627 	if (dmar_domain->force_snooping)
3628 		return true;
3629 
3630 	spin_lock_irqsave(&dmar_domain->lock, flags);
3631 	if (!domain_support_force_snooping(dmar_domain) ||
3632 	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3633 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3634 		return false;
3635 	}
3636 
3637 	domain_set_force_snooping(dmar_domain);
3638 	dmar_domain->force_snooping = true;
3639 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3640 
3641 	return true;
3642 }
3643 
3644 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3645 {
3646 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3647 
3648 	switch (cap) {
3649 	case IOMMU_CAP_CACHE_COHERENCY:
3650 	case IOMMU_CAP_DEFERRED_FLUSH:
3651 		return true;
3652 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3653 		return dmar_platform_optin();
3654 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3655 		return ecap_sc_support(info->iommu->ecap);
3656 	case IOMMU_CAP_DIRTY_TRACKING:
3657 		return ssads_supported(info->iommu);
3658 	default:
3659 		return false;
3660 	}
3661 }
3662 
3663 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3664 {
3665 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3666 	struct device_domain_info *info;
3667 	struct intel_iommu *iommu;
3668 	u8 bus, devfn;
3669 	int ret;
3670 
3671 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3672 	if (!iommu || !iommu->iommu.ops)
3673 		return ERR_PTR(-ENODEV);
3674 
3675 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3676 	if (!info)
3677 		return ERR_PTR(-ENOMEM);
3678 
3679 	if (dev_is_real_dma_subdevice(dev)) {
3680 		info->bus = pdev->bus->number;
3681 		info->devfn = pdev->devfn;
3682 		info->segment = pci_domain_nr(pdev->bus);
3683 	} else {
3684 		info->bus = bus;
3685 		info->devfn = devfn;
3686 		info->segment = iommu->segment;
3687 	}
3688 
3689 	info->dev = dev;
3690 	info->iommu = iommu;
3691 	if (dev_is_pci(dev)) {
3692 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3693 		    pci_ats_supported(pdev) &&
3694 		    dmar_ats_supported(pdev, iommu)) {
3695 			info->ats_supported = 1;
3696 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3697 
3698 			/*
3699 			 * For IOMMU that supports device IOTLB throttling
3700 			 * (DIT), we assign PFSID to the invalidation desc
3701 			 * of a VF such that IOMMU HW can gauge queue depth
3702 			 * at PF level. If DIT is not set, PFSID will be
3703 			 * treated as reserved, which should be set to 0.
3704 			 */
3705 			if (ecap_dit(iommu->ecap))
3706 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3707 			info->ats_qdep = pci_ats_queue_depth(pdev);
3708 		}
3709 		if (sm_supported(iommu)) {
3710 			if (pasid_supported(iommu)) {
3711 				int features = pci_pasid_features(pdev);
3712 
3713 				if (features >= 0)
3714 					info->pasid_supported = features | 1;
3715 			}
3716 
3717 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3718 			    pci_pri_supported(pdev))
3719 				info->pri_supported = 1;
3720 		}
3721 	}
3722 
3723 	dev_iommu_priv_set(dev, info);
3724 	if (pdev && pci_ats_supported(pdev)) {
3725 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3726 		ret = device_rbtree_insert(iommu, info);
3727 		if (ret)
3728 			goto free;
3729 	}
3730 
3731 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3732 		ret = intel_pasid_alloc_table(dev);
3733 		if (ret) {
3734 			dev_err(dev, "PASID table allocation failed\n");
3735 			goto clear_rbtree;
3736 		}
3737 
3738 		if (!context_copied(iommu, info->bus, info->devfn)) {
3739 			ret = intel_pasid_setup_sm_context(dev);
3740 			if (ret)
3741 				goto free_table;
3742 		}
3743 	}
3744 
3745 	intel_iommu_debugfs_create_dev(info);
3746 
3747 	/*
3748 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3749 	 * device is undefined if you enable PASID support after ATS support.
3750 	 * So always enable PASID support on devices which have it, even if
3751 	 * we can't yet know if we're ever going to use it.
3752 	 */
3753 	if (info->pasid_supported &&
3754 	    !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3755 		info->pasid_enabled = 1;
3756 
3757 	return &iommu->iommu;
3758 free_table:
3759 	intel_pasid_free_table(dev);
3760 clear_rbtree:
3761 	device_rbtree_remove(info);
3762 free:
3763 	kfree(info);
3764 
3765 	return ERR_PTR(ret);
3766 }
3767 
3768 static void intel_iommu_release_device(struct device *dev)
3769 {
3770 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3771 	struct intel_iommu *iommu = info->iommu;
3772 
3773 	if (info->pasid_enabled) {
3774 		pci_disable_pasid(to_pci_dev(dev));
3775 		info->pasid_enabled = 0;
3776 	}
3777 
3778 	mutex_lock(&iommu->iopf_lock);
3779 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3780 		device_rbtree_remove(info);
3781 	mutex_unlock(&iommu->iopf_lock);
3782 
3783 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3784 	    !context_copied(iommu, info->bus, info->devfn))
3785 		intel_pasid_teardown_sm_context(dev);
3786 
3787 	intel_pasid_free_table(dev);
3788 	intel_iommu_debugfs_remove_dev(info);
3789 	kfree(info);
3790 	set_dma_ops(dev, NULL);
3791 }
3792 
3793 static void intel_iommu_get_resv_regions(struct device *device,
3794 					 struct list_head *head)
3795 {
3796 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3797 	struct iommu_resv_region *reg;
3798 	struct dmar_rmrr_unit *rmrr;
3799 	struct device *i_dev;
3800 	int i;
3801 
3802 	rcu_read_lock();
3803 	for_each_rmrr_units(rmrr) {
3804 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3805 					  i, i_dev) {
3806 			struct iommu_resv_region *resv;
3807 			enum iommu_resv_type type;
3808 			size_t length;
3809 
3810 			if (i_dev != device &&
3811 			    !is_downstream_to_pci_bridge(device, i_dev))
3812 				continue;
3813 
3814 			length = rmrr->end_address - rmrr->base_address + 1;
3815 
3816 			type = device_rmrr_is_relaxable(device) ?
3817 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3818 
3819 			resv = iommu_alloc_resv_region(rmrr->base_address,
3820 						       length, prot, type,
3821 						       GFP_ATOMIC);
3822 			if (!resv)
3823 				break;
3824 
3825 			list_add_tail(&resv->list, head);
3826 		}
3827 	}
3828 	rcu_read_unlock();
3829 
3830 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3831 	if (dev_is_pci(device)) {
3832 		struct pci_dev *pdev = to_pci_dev(device);
3833 
3834 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3835 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3836 					IOMMU_RESV_DIRECT_RELAXABLE,
3837 					GFP_KERNEL);
3838 			if (reg)
3839 				list_add_tail(&reg->list, head);
3840 		}
3841 	}
3842 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3843 
3844 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3845 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3846 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3847 	if (!reg)
3848 		return;
3849 	list_add_tail(&reg->list, head);
3850 }
3851 
/*
 * Pick the IOMMU group for @dev: PCI devices use the PCI grouping helper,
 * everything else uses the generic one.
 */
static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
	return dev_is_pci(dev) ? pci_device_group(dev) :
				 generic_device_group(dev);
}
3858 
3859 static int intel_iommu_enable_sva(struct device *dev)
3860 {
3861 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3862 	struct intel_iommu *iommu;
3863 
3864 	if (!info || dmar_disabled)
3865 		return -EINVAL;
3866 
3867 	iommu = info->iommu;
3868 	if (!iommu)
3869 		return -EINVAL;
3870 
3871 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
3872 		return -ENODEV;
3873 
3874 	if (!info->pasid_enabled || !info->ats_enabled)
3875 		return -EINVAL;
3876 
3877 	/*
3878 	 * Devices having device-specific I/O fault handling should not
3879 	 * support PCI/PRI. The IOMMU side has no means to check the
3880 	 * capability of device-specific IOPF.  Therefore, IOMMU can only
3881 	 * default that if the device driver enables SVA on a non-PRI
3882 	 * device, it will handle IOPF in its own way.
3883 	 */
3884 	if (!info->pri_supported)
3885 		return 0;
3886 
3887 	/* Devices supporting PRI should have it enabled. */
3888 	if (!info->pri_enabled)
3889 		return -EINVAL;
3890 
3891 	return 0;
3892 }
3893 
3894 static int context_flip_pri(struct device_domain_info *info, bool enable)
3895 {
3896 	struct intel_iommu *iommu = info->iommu;
3897 	u8 bus = info->bus, devfn = info->devfn;
3898 	struct context_entry *context;
3899 	u16 did;
3900 
3901 	spin_lock(&iommu->lock);
3902 	if (context_copied(iommu, bus, devfn)) {
3903 		spin_unlock(&iommu->lock);
3904 		return -EINVAL;
3905 	}
3906 
3907 	context = iommu_context_addr(iommu, bus, devfn, false);
3908 	if (!context || !context_present(context)) {
3909 		spin_unlock(&iommu->lock);
3910 		return -ENODEV;
3911 	}
3912 	did = context_domain_id(context);
3913 
3914 	if (enable)
3915 		context_set_sm_pre(context);
3916 	else
3917 		context_clear_sm_pre(context);
3918 
3919 	if (!ecap_coherent(iommu->ecap))
3920 		clflush_cache_range(context, sizeof(*context));
3921 	intel_context_flush_present(info, context, did, true);
3922 	spin_unlock(&iommu->lock);
3923 
3924 	return 0;
3925 }
3926 
3927 static int intel_iommu_enable_iopf(struct device *dev)
3928 {
3929 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3930 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3931 	struct intel_iommu *iommu;
3932 	int ret;
3933 
3934 	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3935 		return -ENODEV;
3936 
3937 	if (info->pri_enabled)
3938 		return -EBUSY;
3939 
3940 	iommu = info->iommu;
3941 	if (!iommu)
3942 		return -EINVAL;
3943 
3944 	/* PASID is required in PRG Response Message. */
3945 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
3946 		return -EINVAL;
3947 
3948 	ret = pci_reset_pri(pdev);
3949 	if (ret)
3950 		return ret;
3951 
3952 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3953 	if (ret)
3954 		return ret;
3955 
3956 	ret = context_flip_pri(info, true);
3957 	if (ret)
3958 		goto err_remove_device;
3959 
3960 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
3961 	if (ret)
3962 		goto err_clear_pri;
3963 
3964 	info->pri_enabled = 1;
3965 
3966 	return 0;
3967 err_clear_pri:
3968 	context_flip_pri(info, false);
3969 err_remove_device:
3970 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3971 
3972 	return ret;
3973 }
3974 
3975 static int intel_iommu_disable_iopf(struct device *dev)
3976 {
3977 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3978 	struct intel_iommu *iommu = info->iommu;
3979 
3980 	if (!info->pri_enabled)
3981 		return -EINVAL;
3982 
3983 	/* Disable new PRI reception: */
3984 	context_flip_pri(info, false);
3985 
3986 	/*
3987 	 * Remove device from fault queue and acknowledge all outstanding
3988 	 * PRQs to the device:
3989 	 */
3990 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3991 
3992 	/*
3993 	 * PCIe spec states that by clearing PRI enable bit, the Page
3994 	 * Request Interface will not issue new page requests, but has
3995 	 * outstanding page requests that have been transmitted or are
3996 	 * queued for transmission. This is supposed to be called after
3997 	 * the device driver has stopped DMA, all PASIDs have been
3998 	 * unbound and the outstanding PRQs have been drained.
3999 	 */
4000 	pci_disable_pri(to_pci_dev(dev));
4001 	info->pri_enabled = 0;
4002 
4003 	return 0;
4004 }
4005 
4006 static int
4007 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4008 {
4009 	switch (feat) {
4010 	case IOMMU_DEV_FEAT_IOPF:
4011 		return intel_iommu_enable_iopf(dev);
4012 
4013 	case IOMMU_DEV_FEAT_SVA:
4014 		return intel_iommu_enable_sva(dev);
4015 
4016 	default:
4017 		return -ENODEV;
4018 	}
4019 }
4020 
4021 static int
4022 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4023 {
4024 	switch (feat) {
4025 	case IOMMU_DEV_FEAT_IOPF:
4026 		return intel_iommu_disable_iopf(dev);
4027 
4028 	case IOMMU_DEV_FEAT_SVA:
4029 		return 0;
4030 
4031 	default:
4032 		return -ENODEV;
4033 	}
4034 }
4035 
4036 static bool intel_iommu_is_attach_deferred(struct device *dev)
4037 {
4038 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4039 
4040 	return translation_pre_enabled(info->iommu) && !info->domain;
4041 }
4042 
4043 /*
4044  * Check that the device does not live on an external facing PCI port that is
4045  * marked as untrusted. Such devices should not be able to apply quirks and
4046  * thus not be able to bypass the IOMMU restrictions.
4047  */
4048 static bool risky_device(struct pci_dev *pdev)
4049 {
4050 	if (pdev->untrusted) {
4051 		pci_info(pdev,
4052 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4053 			 pdev->vendor, pdev->device);
4054 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4055 		return true;
4056 	}
4057 	return false;
4058 }
4059 
4060 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4061 				      unsigned long iova, size_t size)
4062 {
4063 	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4064 
4065 	return 0;
4066 }
4067 
4068 void domain_remove_dev_pasid(struct iommu_domain *domain,
4069 			     struct device *dev, ioasid_t pasid)
4070 {
4071 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4072 	struct dev_pasid_info *curr, *dev_pasid = NULL;
4073 	struct intel_iommu *iommu = info->iommu;
4074 	struct dmar_domain *dmar_domain;
4075 	unsigned long flags;
4076 
4077 	if (!domain)
4078 		return;
4079 
4080 	/* Identity domain has no meta data for pasid. */
4081 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
4082 		return;
4083 
4084 	dmar_domain = to_dmar_domain(domain);
4085 	spin_lock_irqsave(&dmar_domain->lock, flags);
4086 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4087 		if (curr->dev == dev && curr->pasid == pasid) {
4088 			list_del(&curr->link_domain);
4089 			dev_pasid = curr;
4090 			break;
4091 		}
4092 	}
4093 	WARN_ON_ONCE(!dev_pasid);
4094 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4095 
4096 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4097 	domain_detach_iommu(dmar_domain, iommu);
4098 	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4099 	kfree(dev_pasid);
4100 }
4101 
4102 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4103 					 struct iommu_domain *domain)
4104 {
4105 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4106 
4107 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4108 	domain_remove_dev_pasid(domain, dev, pasid);
4109 }
4110 
4111 struct dev_pasid_info *
4112 domain_add_dev_pasid(struct iommu_domain *domain,
4113 		     struct device *dev, ioasid_t pasid)
4114 {
4115 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4116 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4117 	struct intel_iommu *iommu = info->iommu;
4118 	struct dev_pasid_info *dev_pasid;
4119 	unsigned long flags;
4120 	int ret;
4121 
4122 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4123 	if (!dev_pasid)
4124 		return ERR_PTR(-ENOMEM);
4125 
4126 	ret = domain_attach_iommu(dmar_domain, iommu);
4127 	if (ret)
4128 		goto out_free;
4129 
4130 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4131 	if (ret)
4132 		goto out_detach_iommu;
4133 
4134 	dev_pasid->dev = dev;
4135 	dev_pasid->pasid = pasid;
4136 	spin_lock_irqsave(&dmar_domain->lock, flags);
4137 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4138 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4139 
4140 	return dev_pasid;
4141 out_detach_iommu:
4142 	domain_detach_iommu(dmar_domain, iommu);
4143 out_free:
4144 	kfree(dev_pasid);
4145 	return ERR_PTR(ret);
4146 }
4147 
4148 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4149 				     struct device *dev, ioasid_t pasid,
4150 				     struct iommu_domain *old)
4151 {
4152 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4153 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4154 	struct intel_iommu *iommu = info->iommu;
4155 	struct dev_pasid_info *dev_pasid;
4156 	int ret;
4157 
4158 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4159 		return -EINVAL;
4160 
4161 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4162 		return -EOPNOTSUPP;
4163 
4164 	if (domain->dirty_ops)
4165 		return -EINVAL;
4166 
4167 	if (context_copied(iommu, info->bus, info->devfn))
4168 		return -EBUSY;
4169 
4170 	ret = paging_domain_compatible(domain, dev);
4171 	if (ret)
4172 		return ret;
4173 
4174 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4175 	if (IS_ERR(dev_pasid))
4176 		return PTR_ERR(dev_pasid);
4177 
4178 	if (dmar_domain->use_first_level)
4179 		ret = domain_setup_first_level(iommu, dmar_domain,
4180 					       dev, pasid, old);
4181 	else
4182 		ret = domain_setup_second_level(iommu, dmar_domain,
4183 						dev, pasid, old);
4184 	if (ret)
4185 		goto out_remove_dev_pasid;
4186 
4187 	domain_remove_dev_pasid(old, dev, pasid);
4188 
4189 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4190 
4191 	return 0;
4192 
4193 out_remove_dev_pasid:
4194 	domain_remove_dev_pasid(domain, dev, pasid);
4195 	return ret;
4196 }
4197 
4198 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4199 {
4200 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4201 	struct intel_iommu *iommu = info->iommu;
4202 	struct iommu_hw_info_vtd *vtd;
4203 
4204 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4205 	if (!vtd)
4206 		return ERR_PTR(-ENOMEM);
4207 
4208 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4209 	vtd->cap_reg = iommu->cap;
4210 	vtd->ecap_reg = iommu->ecap;
4211 	*length = sizeof(*vtd);
4212 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4213 	return vtd;
4214 }
4215 
4216 /*
4217  * Set dirty tracking for the device list of a domain. The caller must
4218  * hold the domain->lock when calling it.
4219  */
4220 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4221 {
4222 	struct device_domain_info *info;
4223 	int ret = 0;
4224 
4225 	list_for_each_entry(info, devices, link) {
4226 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4227 						       IOMMU_NO_PASID, enable);
4228 		if (ret)
4229 			break;
4230 	}
4231 
4232 	return ret;
4233 }
4234 
4235 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4236 					    bool enable)
4237 {
4238 	struct dmar_domain *s1_domain;
4239 	unsigned long flags;
4240 	int ret;
4241 
4242 	spin_lock(&domain->s1_lock);
4243 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4244 		spin_lock_irqsave(&s1_domain->lock, flags);
4245 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4246 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4247 		if (ret)
4248 			goto err_unwind;
4249 	}
4250 	spin_unlock(&domain->s1_lock);
4251 	return 0;
4252 
4253 err_unwind:
4254 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4255 		spin_lock_irqsave(&s1_domain->lock, flags);
4256 		device_set_dirty_tracking(&s1_domain->devices,
4257 					  domain->dirty_tracking);
4258 		spin_unlock_irqrestore(&s1_domain->lock, flags);
4259 	}
4260 	spin_unlock(&domain->s1_lock);
4261 	return ret;
4262 }
4263 
4264 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4265 					  bool enable)
4266 {
4267 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4268 	int ret;
4269 
4270 	spin_lock(&dmar_domain->lock);
4271 	if (dmar_domain->dirty_tracking == enable)
4272 		goto out_unlock;
4273 
4274 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4275 	if (ret)
4276 		goto err_unwind;
4277 
4278 	if (dmar_domain->nested_parent) {
4279 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4280 		if (ret)
4281 			goto err_unwind;
4282 	}
4283 
4284 	dmar_domain->dirty_tracking = enable;
4285 out_unlock:
4286 	spin_unlock(&dmar_domain->lock);
4287 
4288 	return 0;
4289 
4290 err_unwind:
4291 	device_set_dirty_tracking(&dmar_domain->devices,
4292 				  dmar_domain->dirty_tracking);
4293 	spin_unlock(&dmar_domain->lock);
4294 	return ret;
4295 }
4296 
4297 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4298 					    unsigned long iova, size_t size,
4299 					    unsigned long flags,
4300 					    struct iommu_dirty_bitmap *dirty)
4301 {
4302 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4303 	unsigned long end = iova + size - 1;
4304 	unsigned long pgsize;
4305 
4306 	/*
4307 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
4308 	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
4309 	 * have occurred when we stopped dirty tracking. This ensures that we
4310 	 * never inherit dirtied bits from a previous cycle.
4311 	 */
4312 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4313 		return -EINVAL;
4314 
4315 	do {
4316 		struct dma_pte *pte;
4317 		int lvl = 0;
4318 
4319 		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4320 				     GFP_ATOMIC);
4321 		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4322 		if (!pte || !dma_pte_present(pte)) {
4323 			iova += pgsize;
4324 			continue;
4325 		}
4326 
4327 		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4328 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4329 		iova += pgsize;
4330 	} while (iova < end);
4331 
4332 	return 0;
4333 }
4334 
4335 static const struct iommu_dirty_ops intel_dirty_ops = {
4336 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4337 	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4338 };
4339 
4340 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4341 {
4342 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4343 	struct intel_iommu *iommu = info->iommu;
4344 	struct context_entry *context;
4345 
4346 	spin_lock(&iommu->lock);
4347 	context = iommu_context_addr(iommu, bus, devfn, 1);
4348 	if (!context) {
4349 		spin_unlock(&iommu->lock);
4350 		return -ENOMEM;
4351 	}
4352 
4353 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4354 		spin_unlock(&iommu->lock);
4355 		return 0;
4356 	}
4357 
4358 	copied_context_tear_down(iommu, context, bus, devfn);
4359 	context_clear_entry(context);
4360 	context_set_domain_id(context, FLPT_DEFAULT_DID);
4361 
4362 	/*
4363 	 * In pass through mode, AW must be programmed to indicate the largest
4364 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
4365 	 */
4366 	context_set_address_width(context, iommu->msagaw);
4367 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4368 	context_set_fault_enable(context);
4369 	context_set_present(context);
4370 	if (!ecap_coherent(iommu->ecap))
4371 		clflush_cache_range(context, sizeof(*context));
4372 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4373 	spin_unlock(&iommu->lock);
4374 
4375 	return 0;
4376 }
4377 
4378 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4379 {
4380 	struct device *dev = data;
4381 
4382 	if (dev != &pdev->dev)
4383 		return 0;
4384 
4385 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4386 }
4387 
4388 static int device_setup_pass_through(struct device *dev)
4389 {
4390 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4391 
4392 	if (!dev_is_pci(dev))
4393 		return context_setup_pass_through(dev, info->bus, info->devfn);
4394 
4395 	return pci_for_each_dma_alias(to_pci_dev(dev),
4396 				      context_setup_pass_through_cb, dev);
4397 }
4398 
4399 static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4400 {
4401 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4402 	struct intel_iommu *iommu = info->iommu;
4403 	int ret;
4404 
4405 	device_block_translation(dev);
4406 
4407 	if (dev_is_real_dma_subdevice(dev))
4408 		return 0;
4409 
4410 	if (sm_supported(iommu)) {
4411 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4412 		if (!ret)
4413 			iommu_enable_pci_caps(info);
4414 	} else {
4415 		ret = device_setup_pass_through(dev);
4416 	}
4417 
4418 	return ret;
4419 }
4420 
4421 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4422 					 struct device *dev, ioasid_t pasid,
4423 					 struct iommu_domain *old)
4424 {
4425 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4426 	struct intel_iommu *iommu = info->iommu;
4427 	int ret;
4428 
4429 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4430 		return -EOPNOTSUPP;
4431 
4432 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
4433 	if (ret)
4434 		return ret;
4435 
4436 	domain_remove_dev_pasid(old, dev, pasid);
4437 	return 0;
4438 }
4439 
4440 static struct iommu_domain identity_domain = {
4441 	.type = IOMMU_DOMAIN_IDENTITY,
4442 	.ops = &(const struct iommu_domain_ops) {
4443 		.attach_dev	= identity_domain_attach_dev,
4444 		.set_dev_pasid	= identity_domain_set_dev_pasid,
4445 	},
4446 };
4447 
4448 static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev)
4449 {
4450 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4451 	struct intel_iommu *iommu = info->iommu;
4452 	struct dmar_domain *dmar_domain;
4453 	bool first_stage;
4454 
4455 	first_stage = first_level_by_default(iommu);
4456 	dmar_domain = paging_domain_alloc(dev, first_stage);
4457 	if (IS_ERR(dmar_domain))
4458 		return ERR_CAST(dmar_domain);
4459 
4460 	return &dmar_domain->domain;
4461 }
4462 
4463 const struct iommu_ops intel_iommu_ops = {
4464 	.blocked_domain		= &blocking_domain,
4465 	.release_domain		= &blocking_domain,
4466 	.identity_domain	= &identity_domain,
4467 	.capable		= intel_iommu_capable,
4468 	.hw_info		= intel_iommu_hw_info,
4469 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4470 	.domain_alloc_sva	= intel_svm_domain_alloc,
4471 	.domain_alloc_paging	= intel_iommu_domain_alloc_paging,
4472 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
4473 	.probe_device		= intel_iommu_probe_device,
4474 	.release_device		= intel_iommu_release_device,
4475 	.get_resv_regions	= intel_iommu_get_resv_regions,
4476 	.device_group		= intel_iommu_device_group,
4477 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4478 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4479 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4480 	.def_domain_type	= device_def_domain_type,
4481 	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4482 	.pgsize_bitmap		= SZ_4K,
4483 	.page_response		= intel_iommu_page_response,
4484 	.default_domain_ops = &(const struct iommu_domain_ops) {
4485 		.attach_dev		= intel_iommu_attach_device,
4486 		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4487 		.map_pages		= intel_iommu_map_pages,
4488 		.unmap_pages		= intel_iommu_unmap_pages,
4489 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4490 		.flush_iotlb_all        = intel_flush_iotlb_all,
4491 		.iotlb_sync		= intel_iommu_tlb_sync,
4492 		.iova_to_phys		= intel_iommu_iova_to_phys,
4493 		.free			= intel_iommu_domain_free,
4494 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4495 	}
4496 };
4497 
4498 static void quirk_iommu_igfx(struct pci_dev *dev)
4499 {
4500 	if (risky_device(dev))
4501 		return;
4502 
4503 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4504 	disable_igfx_iommu = 1;
4505 }
4506 
4507 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4508 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4509 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4510 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4511 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4512 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4513 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4514 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4515 
4516 /* Broadwell igfx malfunctions with dmar */
4517 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4518 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4519 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4520 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4521 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4522 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4523 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4524 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4525 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4526 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4527 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4528 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4529 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4530 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4531 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4532 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4533 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4534 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4535 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4536 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4537 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4538 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4539 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4540 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4541 
4542 static void quirk_iommu_rwbf(struct pci_dev *dev)
4543 {
4544 	if (risky_device(dev))
4545 		return;
4546 
4547 	/*
4548 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4549 	 * but needs it. Same seems to hold for the desktop versions.
4550 	 */
4551 	pci_info(dev, "Forcing write-buffer flush capability\n");
4552 	rwbf_quirk = 1;
4553 }
4554 
4555 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4556 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4557 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4558 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4559 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4560 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4561 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4562 
4563 #define GGC 0x52
4564 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4565 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4566 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4567 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4568 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4569 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4570 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4571 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4572 
4573 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4574 {
4575 	unsigned short ggc;
4576 
4577 	if (risky_device(dev))
4578 		return;
4579 
4580 	if (pci_read_config_word(dev, GGC, &ggc))
4581 		return;
4582 
4583 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4584 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4585 		disable_igfx_iommu = 1;
4586 	} else if (!disable_igfx_iommu) {
4587 		/* we have to ensure the gfx device is idle before we flush */
4588 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4589 		iommu_set_dma_strict();
4590 	}
4591 }
4592 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4593 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4594 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4595 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4596 
4597 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4598 {
4599 	unsigned short ver;
4600 
4601 	if (!IS_GFX_DEVICE(dev))
4602 		return;
4603 
4604 	ver = (dev->device >> 8) & 0xff;
4605 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4606 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4607 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4608 		return;
4609 
4610 	if (risky_device(dev))
4611 		return;
4612 
4613 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4614 	iommu_skip_te_disable = 1;
4615 }
4616 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4617 
4618 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4619    ISOCH DMAR unit for the Azalia sound device, but not give it any
4620    TLB entries, which causes it to deadlock. Check for that.  We do
4621    this in a function called from init_dmars(), instead of in a PCI
4622    quirk, because we don't want to print the obnoxious "BIOS broken"
4623    message if VT-d is actually disabled.
4624 */
4625 static void __init check_tylersburg_isoch(void)
4626 {
4627 	struct pci_dev *pdev;
4628 	uint32_t vtisochctrl;
4629 
4630 	/* If there's no Azalia in the system anyway, forget it. */
4631 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4632 	if (!pdev)
4633 		return;
4634 
4635 	if (risky_device(pdev)) {
4636 		pci_dev_put(pdev);
4637 		return;
4638 	}
4639 
4640 	pci_dev_put(pdev);
4641 
4642 	/* System Management Registers. Might be hidden, in which case
4643 	   we can't do the sanity check. But that's OK, because the
4644 	   known-broken BIOSes _don't_ actually hide it, so far. */
4645 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4646 	if (!pdev)
4647 		return;
4648 
4649 	if (risky_device(pdev)) {
4650 		pci_dev_put(pdev);
4651 		return;
4652 	}
4653 
4654 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4655 		pci_dev_put(pdev);
4656 		return;
4657 	}
4658 
4659 	pci_dev_put(pdev);
4660 
4661 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4662 	if (vtisochctrl & 1)
4663 		return;
4664 
4665 	/* Drop all bits other than the number of TLB entries */
4666 	vtisochctrl &= 0x1c;
4667 
4668 	/* If we have the recommended number of TLB entries (16), fine. */
4669 	if (vtisochctrl == 0x10)
4670 		return;
4671 
4672 	/* Zero TLB entries? You get to ride the short bus to school. */
4673 	if (!vtisochctrl) {
4674 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4675 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4676 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4677 		     dmi_get_system_info(DMI_BIOS_VERSION),
4678 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4679 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4680 		return;
4681 	}
4682 
4683 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4684 	       vtisochctrl);
4685 }
4686 
4687 /*
4688  * Here we deal with a device TLB defect where device may inadvertently issue ATS
4689  * invalidation completion before posted writes initiated with translated address
4690  * that utilized translations matching the invalidation address range, violating
4691  * the invalidation completion ordering.
4692  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4693  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4694  * under the control of the trusted/privileged host device driver must use this
4695  * quirk.
4696  * Device TLBs are invalidated under the following six conditions:
4697  * 1. Device driver does DMA API unmap IOVA
4698  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4699  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4700  *    exit_mmap() due to crash
4701  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4702  *    VM has to free pages that were unmapped
4703  * 5. Userspace driver unmaps a DMA buffer
4704  * 6. Cache invalidation in vSVA usage (upcoming)
4705  *
4706  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4707  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
4708  * invalidate TLB the same way as normal user unmap which will use this quirk.
4709  * The dTLB invalidation after PASID cache flush does not need this quirk.
4710  *
4711  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4712  */
4713 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4714 			       unsigned long address, unsigned long mask,
4715 			       u32 pasid, u16 qdep)
4716 {
4717 	u16 sid;
4718 
4719 	if (likely(!info->dtlb_extra_inval))
4720 		return;
4721 
4722 	sid = PCI_DEVID(info->bus, info->devfn);
4723 	if (pasid == IOMMU_NO_PASID) {
4724 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4725 				   qdep, address, mask);
4726 	} else {
4727 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4728 					 pasid, qdep, address, mask);
4729 	}
4730 }
4731 
4732 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4733 
4734 /*
4735  * Function to submit a command to the enhanced command interface. The
4736  * valid enhanced command descriptions are defined in Table 47 of the
4737  * VT-d spec. The VT-d hardware implementation may support some but not
4738  * all commands, which can be determined by checking the Enhanced
4739  * Command Capability Register.
4740  *
4741  * Return values:
4742  *  - 0: Command successful without any error;
4743  *  - Negative: software error value;
4744  *  - Nonzero positive: failure status code defined in Table 48.
4745  */
4746 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4747 {
4748 	unsigned long flags;
4749 	u64 res;
4750 	int ret;
4751 
4752 	if (!cap_ecmds(iommu->cap))
4753 		return -ENODEV;
4754 
4755 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4756 
4757 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4758 	if (res & DMA_ECMD_ECRSP_IP) {
4759 		ret = -EBUSY;
4760 		goto err;
4761 	}
4762 
4763 	/*
4764 	 * Unconditionally write the operand B, because
4765 	 * - There is no side effect if an ecmd doesn't require an
4766 	 *   operand B, but we set the register to some value.
4767 	 * - It's not invoked in any critical path. The extra MMIO
4768 	 *   write doesn't bring any performance concerns.
4769 	 */
4770 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4771 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4772 
4773 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4774 		      !(res & DMA_ECMD_ECRSP_IP), res);
4775 
4776 	if (res & DMA_ECMD_ECRSP_IP) {
4777 		ret = -ETIMEDOUT;
4778 		goto err;
4779 	}
4780 
4781 	ret = ecmd_get_status_code(res);
4782 err:
4783 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4784 
4785 	return ret;
4786 }
4787