xref: /linux/drivers/iommu/intel/iommu.c (revision c3b1edea3791fa91ab7032faa90355913ad9451b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-pages.h"
31 #include "pasid.h"
32 #include "perfmon.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) pci_is_display(pdev)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 static void __init check_tylersburg_isoch(void);
49 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
50 					  bool enable);
51 static int rwbf_quirk;
52 
53 #define rwbf_required(iommu)	(rwbf_quirk || cap_rwbf((iommu)->cap))
54 
55 /*
56  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
57  * (used when the kernel is launched with TXT).
58  */
59 static int force_on = 0;
60 static int intel_iommu_tboot_noforce;
61 static int no_platform_optin;
62 
63 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
64 
65 /*
66  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
67  * if marked present.
68  */
69 static phys_addr_t root_entry_lctp(struct root_entry *re)
70 {
71 	if (!(re->lo & 1))
72 		return 0;
73 
74 	return re->lo & VTD_PAGE_MASK;
75 }
76 
77 /*
78  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
79  * if marked present.
80  */
81 static phys_addr_t root_entry_uctp(struct root_entry *re)
82 {
83 	if (!(re->hi & 1))
84 		return 0;
85 
86 	return re->hi & VTD_PAGE_MASK;
87 }
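
/*
 * Layout note (summarizing the two helpers above and iommu_context_addr()
 * below): bit 0 of each root-entry quadword is the present bit and the
 * remaining bits, masked with VTD_PAGE_MASK, hold a context-table pointer.
 * In legacy mode only the low quadword (LCTP) is used and the context
 * table covers all 256 devfns of the bus; in scalable mode the low
 * quadword covers devfn 0x00-0x7f and the high quadword (UCTP) covers
 * devfn 0x80-0xff.
 */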
88 
89 static int device_rid_cmp_key(const void *key, const struct rb_node *node)
90 {
91 	struct device_domain_info *info =
92 		rb_entry(node, struct device_domain_info, node);
93 	const u16 *rid_lhs = key;
94 
95 	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
96 		return -1;
97 
98 	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
99 		return 1;
100 
101 	return 0;
102 }
103 
104 static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
105 {
106 	struct device_domain_info *info =
107 		rb_entry(lhs, struct device_domain_info, node);
108 	u16 key = PCI_DEVID(info->bus, info->devfn);
109 
110 	return device_rid_cmp_key(&key, rhs);
111 }
112 
113 /*
114  * Looks up an IOMMU-probed device using its source ID.
115  *
116  * Returns the pointer to the device if there is a match. Otherwise,
117  * returns NULL.
118  *
119  * Note that this helper doesn't guarantee that the device won't be
120  * released by the iommu subsystem after being returned. The caller
121  * should use its own synchronization mechanism to avoid the device
122  * being released during its use if that is possibly the case.
123  */
124 struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
125 {
126 	struct device_domain_info *info = NULL;
127 	struct rb_node *node;
128 	unsigned long flags;
129 
130 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
131 	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
132 	if (node)
133 		info = rb_entry(node, struct device_domain_info, node);
134 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
135 
136 	return info ? info->dev : NULL;
137 }
138 
139 static int device_rbtree_insert(struct intel_iommu *iommu,
140 				struct device_domain_info *info)
141 {
142 	struct rb_node *curr;
143 	unsigned long flags;
144 
145 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
146 	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
147 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
148 	if (WARN_ON(curr))
149 		return -EEXIST;
150 
151 	return 0;
152 }
153 
154 static void device_rbtree_remove(struct device_domain_info *info)
155 {
156 	struct intel_iommu *iommu = info->iommu;
157 	unsigned long flags;
158 
159 	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
160 	rb_erase(&info->node, &iommu->device_rbtree);
161 	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
162 }
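
/*
 * Illustrative use of the rbtree above: a caller holding a raw source ID
 * (the same PCI_DEVID(bus, devfn) key used by device_rid_cmp_key()) can
 * resolve it to the probed device with
 *
 *	dev = device_rbtree_find(iommu, rid);
 *
 * subject to the synchronization caveat documented above that function.
 */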
163 
164 struct dmar_rmrr_unit {
165 	struct list_head list;		/* list of rmrr units	*/
166 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
167 	u64	base_address;		/* reserved base address*/
168 	u64	end_address;		/* reserved end address */
169 	struct dmar_dev_scope *devices;	/* target devices */
170 	int	devices_cnt;		/* target device count */
171 };
172 
173 struct dmar_atsr_unit {
174 	struct list_head list;		/* list of ATSR units */
175 	struct acpi_dmar_header *hdr;	/* ACPI header */
176 	struct dmar_dev_scope *devices;	/* target devices */
177 	int devices_cnt;		/* target device count */
178 	u8 include_all:1;		/* include all ports */
179 };
180 
181 struct dmar_satc_unit {
182 	struct list_head list;		/* list of SATC units */
183 	struct acpi_dmar_header *hdr;	/* ACPI header */
184 	struct dmar_dev_scope *devices;	/* target devices */
185 	struct intel_iommu *iommu;	/* the corresponding iommu */
186 	int devices_cnt;		/* target device count */
187 	u8 atc_required:1;		/* ATS is required */
188 };
189 
190 static LIST_HEAD(dmar_atsr_units);
191 static LIST_HEAD(dmar_rmrr_units);
192 static LIST_HEAD(dmar_satc_units);
193 
194 #define for_each_rmrr_units(rmrr) \
195 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
196 
197 static void intel_iommu_domain_free(struct iommu_domain *domain);
198 
199 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
200 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
201 
202 int intel_iommu_enabled = 0;
203 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
204 
205 static int intel_iommu_superpage = 1;
206 static int iommu_identity_mapping;
207 static int iommu_skip_te_disable;
208 static int disable_igfx_iommu;
209 
210 #define IDENTMAP_AZALIA		4
211 
212 const struct iommu_ops intel_iommu_ops;
213 
214 static bool translation_pre_enabled(struct intel_iommu *iommu)
215 {
216 	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
217 }
218 
219 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
220 {
221 	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
222 }
223 
224 static void init_translation_status(struct intel_iommu *iommu)
225 {
226 	u32 gsts;
227 
228 	gsts = readl(iommu->reg + DMAR_GSTS_REG);
229 	if (gsts & DMA_GSTS_TES)
230 		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
231 }
232 
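/*
 * Boot-time options are comma separated; for example (illustrative only):
 *
 *	intel_iommu=on,sm_on
 *	intel_iommu=off
 *
 * Unrecognized tokens are reported via pr_notice() and skipped.
 */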
233 static int __init intel_iommu_setup(char *str)
234 {
235 	if (!str)
236 		return -EINVAL;
237 
238 	while (*str) {
239 		if (!strncmp(str, "on", 2)) {
240 			dmar_disabled = 0;
241 			pr_info("IOMMU enabled\n");
242 		} else if (!strncmp(str, "off", 3)) {
243 			dmar_disabled = 1;
244 			no_platform_optin = 1;
245 			pr_info("IOMMU disabled\n");
246 		} else if (!strncmp(str, "igfx_off", 8)) {
247 			disable_igfx_iommu = 1;
248 			pr_info("Disable GFX device mapping\n");
249 		} else if (!strncmp(str, "forcedac", 8)) {
250 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
251 			iommu_dma_forcedac = true;
252 		} else if (!strncmp(str, "strict", 6)) {
253 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
254 			iommu_set_dma_strict();
255 		} else if (!strncmp(str, "sp_off", 6)) {
256 			pr_info("Disable supported super page\n");
257 			intel_iommu_superpage = 0;
258 		} else if (!strncmp(str, "sm_on", 5)) {
259 			pr_info("Enable scalable mode if hardware supports\n");
260 			intel_iommu_sm = 1;
261 		} else if (!strncmp(str, "sm_off", 6)) {
262 			pr_info("Scalable mode is disallowed\n");
263 			intel_iommu_sm = 0;
264 		} else if (!strncmp(str, "tboot_noforce", 13)) {
265 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
266 			intel_iommu_tboot_noforce = 1;
267 		} else {
268 			pr_notice("Unknown option - '%s'\n", str);
269 		}
270 
271 		str += strcspn(str, ",");
272 		while (*str == ',')
273 			str++;
274 	}
275 
276 	return 1;
277 }
278 __setup("intel_iommu=", intel_iommu_setup);
279 
280 /*
281  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
282  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
283  * the returned SAGAW.
284  */
285 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
286 {
287 	unsigned long fl_sagaw, sl_sagaw;
288 
289 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
290 	sl_sagaw = cap_sagaw(iommu->cap);
291 
292 	/* Second level only. */
293 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
294 		return sl_sagaw;
295 
296 	/* First level only. */
297 	if (!ecap_slts(iommu->ecap))
298 		return fl_sagaw;
299 
300 	return fl_sagaw & sl_sagaw;
301 }
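
/*
 * For reference (see Section 11.4.2 of the VT-d spec cited above): SAGAW
 * bit 2 corresponds to a 48-bit, 4-level table and bit 3 to a 57-bit,
 * 5-level table, which is why the first-level mask above is built from
 * BIT(2) plus BIT(3) when cap_fl5lp_support() is set.
 */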
302 
303 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
304 {
305 	unsigned long sagaw;
306 	int agaw;
307 
308 	sagaw = __iommu_calculate_sagaw(iommu);
309 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
310 		if (test_bit(agaw, &sagaw))
311 			break;
312 	}
313 
314 	return agaw;
315 }
316 
317 /*
318  * Calculate max SAGAW for each iommu.
319  */
320 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
321 {
322 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
323 }
324 
325 /*
326  * Calculate the agaw for each iommu.
327  * "SAGAW" may differ across iommus, so use a default agaw and fall back
328  * to a smaller supported agaw for iommus that don't support the default.
329  */
330 int iommu_calculate_agaw(struct intel_iommu *iommu)
331 {
332 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
333 }
334 
335 static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
336 {
337 	return sm_supported(iommu) ?
338 			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
339 }
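
/*
 * When this returns false the IOMMU does not snoop CPU caches while
 * walking the paging structures, so software must flush its updates to
 * them explicitly (see e.g. the __iommu_flush_cache() and
 * clflush_cache_range() users in this file).
 */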
340 
341 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
342 					 u8 devfn, int alloc)
343 {
344 	struct root_entry *root = &iommu->root_entry[bus];
345 	struct context_entry *context;
346 	u64 *entry;
347 
348 	/*
349 	 * Unless the caller requested to allocate a new entry,
350 	 * returning a copied context entry makes no sense.
351 	 */
352 	if (!alloc && context_copied(iommu, bus, devfn))
353 		return NULL;
354 
355 	entry = &root->lo;
356 	if (sm_supported(iommu)) {
357 		if (devfn >= 0x80) {
358 			devfn -= 0x80;
359 			entry = &root->hi;
360 		}
361 		devfn *= 2;
362 	}
363 	if (*entry & 1)
364 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
365 	else {
366 		unsigned long phy_addr;
367 		if (!alloc)
368 			return NULL;
369 
370 		context = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC,
371 						    SZ_4K);
372 		if (!context)
373 			return NULL;
374 
375 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
376 		phy_addr = virt_to_phys((void *)context);
377 		*entry = phy_addr | 1;
378 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
379 	}
380 	return &context[devfn];
381 }
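
/*
 * Indexing example for the helper above (illustrative): in scalable mode,
 * devfn 0x85 selects the upper context table via root->hi and, since each
 * device consumes two entries there, lands at index (0x85 - 0x80) * 2 = 10.
 * In legacy mode devfn indexes a single 256-entry table directly.
 */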
382 
383 /**
384  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
385  *				 sub-hierarchy of a candidate PCI-PCI bridge
386  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
387  * @bridge: the candidate PCI-PCI bridge
388  *
389  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
390  */
391 static bool
392 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
393 {
394 	struct pci_dev *pdev, *pbridge;
395 
396 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
397 		return false;
398 
399 	pdev = to_pci_dev(dev);
400 	pbridge = to_pci_dev(bridge);
401 
402 	if (pbridge->subordinate &&
403 	    pbridge->subordinate->number <= pdev->bus->number &&
404 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
405 		return true;
406 
407 	return false;
408 }
409 
410 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
411 {
412 	struct dmar_drhd_unit *drhd;
413 	u32 vtbar;
414 	int rc;
415 
416 	/* We know that this device on this chipset has its own IOMMU.
417 	 * If we find it under a different IOMMU, then the BIOS is lying
418 	 * to us. Hope that the IOMMU for this device is actually
419 	 * disabled, and it needs no translation...
420 	 */
421 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
422 	if (rc) {
423 		/* "can't" happen */
424 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
425 		return false;
426 	}
427 	vtbar &= 0xffff0000;
428 
429 	/* we know that this iommu should be at offset 0xa000 from the vtbar */
430 	drhd = dmar_find_matched_drhd_unit(pdev);
431 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
432 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
433 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
434 		return true;
435 	}
436 
437 	return false;
438 }
439 
440 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
441 {
442 	if (!iommu || iommu->drhd->ignored)
443 		return true;
444 
445 	if (dev_is_pci(dev)) {
446 		struct pci_dev *pdev = to_pci_dev(dev);
447 
448 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
449 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
450 		    quirk_ioat_snb_local_iommu(pdev))
451 			return true;
452 	}
453 
454 	return false;
455 }
456 
457 static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
458 {
459 	struct dmar_drhd_unit *drhd = NULL;
460 	struct pci_dev *pdev = NULL;
461 	struct intel_iommu *iommu;
462 	struct device *tmp;
463 	u16 segment = 0;
464 	int i;
465 
466 	if (!dev)
467 		return NULL;
468 
469 	if (dev_is_pci(dev)) {
470 		struct pci_dev *pf_pdev;
471 
472 		pdev = pci_real_dma_dev(to_pci_dev(dev));
473 
474 		/* VFs aren't listed in scope tables; we need to look up
475 		 * the PF instead to find the IOMMU. */
476 		pf_pdev = pci_physfn(pdev);
477 		dev = &pf_pdev->dev;
478 		segment = pci_domain_nr(pdev->bus);
479 	} else if (has_acpi_companion(dev))
480 		dev = &ACPI_COMPANION(dev)->dev;
481 
482 	rcu_read_lock();
483 	for_each_iommu(iommu, drhd) {
484 		if (pdev && segment != drhd->segment)
485 			continue;
486 
487 		for_each_active_dev_scope(drhd->devices,
488 					  drhd->devices_cnt, i, tmp) {
489 			if (tmp == dev) {
490 				/* For a VF use its original BDF# not that of the PF
491 				 * which we used for the IOMMU lookup. Strictly speaking
492 				 * we could do this for all PCI devices; we only need to
493 				 * get the BDF# from the scope table for ACPI matches. */
494 				if (pdev && pdev->is_virtfn)
495 					goto got_pdev;
496 
497 				if (bus && devfn) {
498 					*bus = drhd->devices[i].bus;
499 					*devfn = drhd->devices[i].devfn;
500 				}
501 				goto out;
502 			}
503 
504 			if (is_downstream_to_pci_bridge(dev, tmp))
505 				goto got_pdev;
506 		}
507 
508 		if (pdev && drhd->include_all) {
509 got_pdev:
510 			if (bus && devfn) {
511 				*bus = pdev->bus->number;
512 				*devfn = pdev->devfn;
513 			}
514 			goto out;
515 		}
516 	}
517 	iommu = NULL;
518 out:
519 	if (iommu_is_dummy(iommu, dev))
520 		iommu = NULL;
521 
522 	rcu_read_unlock();
523 
524 	return iommu;
525 }
526 
527 static void free_context_table(struct intel_iommu *iommu)
528 {
529 	struct context_entry *context;
530 	int i;
531 
532 	if (!iommu->root_entry)
533 		return;
534 
535 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
536 		context = iommu_context_addr(iommu, i, 0, 0);
537 		if (context)
538 			iommu_free_pages(context);
539 
540 		if (!sm_supported(iommu))
541 			continue;
542 
543 		context = iommu_context_addr(iommu, i, 0x80, 0);
544 		if (context)
545 			iommu_free_pages(context);
546 	}
547 
548 	iommu_free_pages(iommu->root_entry);
549 	iommu->root_entry = NULL;
550 }
551 
552 #ifdef CONFIG_DMAR_DEBUG
553 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
554 			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
555 {
556 	struct dma_pte *pte;
557 	int offset;
558 
559 	while (1) {
560 		offset = pfn_level_offset(pfn, level);
561 		pte = &parent[offset];
562 
563 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
564 
565 		if (!dma_pte_present(pte)) {
566 			pr_info("page table not present at level %d\n", level - 1);
567 			break;
568 		}
569 
570 		if (level == 1 || dma_pte_superpage(pte))
571 			break;
572 
573 		parent = phys_to_virt(dma_pte_addr(pte));
574 		level--;
575 	}
576 }
577 
578 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
579 			  unsigned long long addr, u32 pasid)
580 {
581 	struct pasid_dir_entry *dir, *pde;
582 	struct pasid_entry *entries, *pte;
583 	struct context_entry *ctx_entry;
584 	struct root_entry *rt_entry;
585 	int i, dir_index, index, level;
586 	u8 devfn = source_id & 0xff;
587 	u8 bus = source_id >> 8;
588 	struct dma_pte *pgtable;
589 
590 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
591 
592 	/* root entry dump */
593 	if (!iommu->root_entry) {
594 		pr_info("root table is not present\n");
595 		return;
596 	}
597 	rt_entry = &iommu->root_entry[bus];
598 
599 	if (sm_supported(iommu))
600 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
601 			rt_entry->hi, rt_entry->lo);
602 	else
603 		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
604 
605 	/* context entry dump */
606 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
607 	if (!ctx_entry) {
608 		pr_info("context table is not present\n");
609 		return;
610 	}
611 
612 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
613 		ctx_entry->hi, ctx_entry->lo);
614 
615 	/* legacy mode does not require PASID entries */
616 	if (!sm_supported(iommu)) {
617 		if (!context_present(ctx_entry)) {
618 			pr_info("legacy mode page table is not present\n");
619 			return;
620 		}
621 		level = agaw_to_level(ctx_entry->hi & 7);
622 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
623 		goto pgtable_walk;
624 	}
625 
626 	if (!context_present(ctx_entry)) {
627 		pr_info("pasid directory table is not present\n");
628 		return;
629 	}
630 
631 	/* get the pointer to pasid directory entry */
632 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
633 
634 	/* For request-without-pasid, get the pasid from context entry */
635 	/* For a request without PASID, fall back to IOMMU_NO_PASID */
636 		pasid = IOMMU_NO_PASID;
637 
638 	dir_index = pasid >> PASID_PDE_SHIFT;
639 	pde = &dir[dir_index];
640 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
641 
642 	/* get the pointer to the pasid table entry */
643 	entries = get_pasid_table_from_pde(pde);
644 	if (!entries) {
645 		pr_info("pasid table is not present\n");
646 		return;
647 	}
648 	index = pasid & PASID_PTE_MASK;
649 	pte = &entries[index];
650 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
651 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
652 
653 	if (!pasid_pte_is_present(pte)) {
654 		pr_info("scalable mode page table is not present\n");
655 		return;
656 	}
657 
658 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
659 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
660 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
661 	} else {
662 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
663 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
664 	}
665 
666 pgtable_walk:
667 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
668 }
669 #endif
670 
671 /* iommu handling */
672 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
673 {
674 	struct root_entry *root;
675 
676 	root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K);
677 	if (!root) {
678 		pr_err("Allocating root entry for %s failed\n",
679 			iommu->name);
680 		return -ENOMEM;
681 	}
682 
683 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
684 	iommu->root_entry = root;
685 
686 	return 0;
687 }
688 
689 static void iommu_set_root_entry(struct intel_iommu *iommu)
690 {
691 	u64 addr;
692 	u32 sts;
693 	unsigned long flag;
694 
695 	addr = virt_to_phys(iommu->root_entry);
696 	if (sm_supported(iommu))
697 		addr |= DMA_RTADDR_SMT;
698 
699 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
700 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
701 
702 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
703 
704 	/* Make sure hardware completes it */
705 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
706 		      readl, (sts & DMA_GSTS_RTPS), sts);
707 
708 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
709 
710 	/*
711 	 * Hardware invalidates all DMA remapping hardware translation
712 	 * caches as part of SRTP flow.
713 	 */
714 	if (cap_esrtps(iommu->cap))
715 		return;
716 
717 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
718 	if (sm_supported(iommu))
719 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
720 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
721 }
722 
723 void iommu_flush_write_buffer(struct intel_iommu *iommu)
724 {
725 	u32 val;
726 	unsigned long flag;
727 
728 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
729 		return;
730 
731 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
732 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
733 
734 	/* Make sure hardware completes it */
735 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
736 		      readl, (!(val & DMA_GSTS_WBFS)), val);
737 
738 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
739 }
740 
741 /* Context-cache invalidation using the register-based interface */
742 static void __iommu_flush_context(struct intel_iommu *iommu,
743 				  u16 did, u16 source_id, u8 function_mask,
744 				  u64 type)
745 {
746 	u64 val = 0;
747 	unsigned long flag;
748 
749 	switch (type) {
750 	case DMA_CCMD_GLOBAL_INVL:
751 		val = DMA_CCMD_GLOBAL_INVL;
752 		break;
753 	case DMA_CCMD_DOMAIN_INVL:
754 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
755 		break;
756 	case DMA_CCMD_DEVICE_INVL:
757 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
758 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
759 		break;
760 	default:
761 		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
762 			iommu->name, type);
763 		return;
764 	}
765 	val |= DMA_CCMD_ICC;
766 
767 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
768 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
769 
770 	/* Make sure hardware completes it */
771 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
772 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
773 
774 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
775 }
776 
777 void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
778 			 unsigned int size_order, u64 type)
779 {
780 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
781 	u64 val = 0, val_iva = 0;
782 	unsigned long flag;
783 
784 	switch (type) {
785 	case DMA_TLB_GLOBAL_FLUSH:
786 		/* global flush doesn't need set IVA_REG */
787 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
788 		break;
789 	case DMA_TLB_DSI_FLUSH:
790 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
791 		break;
792 	case DMA_TLB_PSI_FLUSH:
793 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
794 		/* IH bit is passed in as part of address */
795 		val_iva = size_order | addr;
796 		break;
797 	default:
798 		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
799 			iommu->name, type);
800 		return;
801 	}
802 
803 	if (cap_write_drain(iommu->cap))
804 		val |= DMA_TLB_WRITE_DRAIN;
805 
806 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
807 	/* Note: Only uses first TLB reg currently */
808 	if (val_iva)
809 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
810 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
811 
812 	/* Make sure hardware completes it */
813 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
814 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
815 
816 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
817 
818 	/* check IOTLB invalidation granularity */
819 	if (DMA_TLB_IAIG(val) == 0)
820 		pr_err("Flush IOTLB failed\n");
821 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
822 		pr_debug("TLB flush request %Lx, actual %Lx\n",
823 			(unsigned long long)DMA_TLB_IIRG(type),
824 			(unsigned long long)DMA_TLB_IAIG(val));
825 }
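
/*
 * Callers normally reach this through iommu->flush.flush_iotlb, which
 * intel_iommu_init_qi() points here when queued invalidation is not
 * available; e.g. a global flush is issued as
 *
 *	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 */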
826 
827 static struct device_domain_info *
828 domain_lookup_dev_info(struct dmar_domain *domain,
829 		       struct intel_iommu *iommu, u8 bus, u8 devfn)
830 {
831 	struct device_domain_info *info;
832 	unsigned long flags;
833 
834 	spin_lock_irqsave(&domain->lock, flags);
835 	list_for_each_entry(info, &domain->devices, link) {
836 		if (info->iommu == iommu && info->bus == bus &&
837 		    info->devfn == devfn) {
838 			spin_unlock_irqrestore(&domain->lock, flags);
839 			return info;
840 		}
841 	}
842 	spin_unlock_irqrestore(&domain->lock, flags);
843 
844 	return NULL;
845 }
846 
847 /*
848  * The extra devTLB flush quirk impacts those QAT devices with PCI device
849  * IDs ranging from 0x4940 to 0x4943. It is exempted from the risky_device()
850  * check because it applies only to the built-in QAT devices and it doesn't
851  * grant additional privileges.
852  */
853 #define BUGGY_QAT_DEVID_MASK 0x4940
854 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
855 {
856 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
857 		return false;
858 
859 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
860 		return false;
861 
862 	return true;
863 }
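
/*
 * The mask check above covers exactly the IDs named in the comment:
 * each of 0x4940-0x4943 satisfies (id & 0xfffc) == 0x4940.
 */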
864 
865 static void iommu_enable_pci_ats(struct device_domain_info *info)
866 {
867 	struct pci_dev *pdev;
868 
869 	if (!info->ats_supported)
870 		return;
871 
872 	pdev = to_pci_dev(info->dev);
873 	if (!pci_ats_page_aligned(pdev))
874 		return;
875 
876 	if (!pci_enable_ats(pdev, VTD_PAGE_SHIFT))
877 		info->ats_enabled = 1;
878 }
879 
880 static void iommu_disable_pci_ats(struct device_domain_info *info)
881 {
882 	if (!info->ats_enabled)
883 		return;
884 
885 	pci_disable_ats(to_pci_dev(info->dev));
886 	info->ats_enabled = 0;
887 }
888 
889 static void iommu_enable_pci_pri(struct device_domain_info *info)
890 {
891 	struct pci_dev *pdev;
892 
893 	if (!info->ats_enabled || !info->pri_supported)
894 		return;
895 
896 	pdev = to_pci_dev(info->dev);
897 	/* PASID is required in PRG Response Message. */
898 	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
899 		return;
900 
901 	if (pci_reset_pri(pdev))
902 		return;
903 
904 	if (!pci_enable_pri(pdev, PRQ_DEPTH))
905 		info->pri_enabled = 1;
906 }
907 
908 static void iommu_disable_pci_pri(struct device_domain_info *info)
909 {
910 	if (!info->pri_enabled)
911 		return;
912 
913 	if (WARN_ON(info->iopf_refcount))
914 		iopf_queue_remove_device(info->iommu->iopf_queue, info->dev);
915 
916 	pci_disable_pri(to_pci_dev(info->dev));
917 	info->pri_enabled = 0;
918 }
919 
920 static void intel_flush_iotlb_all(struct iommu_domain *domain)
921 {
922 	cache_tag_flush_all(to_dmar_domain(domain));
923 }
924 
925 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
926 {
927 	u32 pmen;
928 	unsigned long flags;
929 
930 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
931 		return;
932 
933 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
934 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
935 	pmen &= ~DMA_PMEN_EPM;
936 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
937 
938 	/* wait for the protected region status bit to clear */
939 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
940 		readl, !(pmen & DMA_PMEN_PRS), pmen);
941 
942 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
943 }
944 
945 static void iommu_enable_translation(struct intel_iommu *iommu)
946 {
947 	u32 sts;
948 	unsigned long flags;
949 
950 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
951 	iommu->gcmd |= DMA_GCMD_TE;
952 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
953 
954 	/* Make sure hardware completes it */
955 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
956 		      readl, (sts & DMA_GSTS_TES), sts);
957 
958 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
959 }
960 
961 static void iommu_disable_translation(struct intel_iommu *iommu)
962 {
963 	u32 sts;
964 	unsigned long flag;
965 
966 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
967 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
968 		return;
969 
970 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
971 	iommu->gcmd &= ~DMA_GCMD_TE;
972 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
973 
974 	/* Make sure hardware completes it */
975 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
976 		      readl, (!(sts & DMA_GSTS_TES)), sts);
977 
978 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
979 }
980 
981 static void disable_dmar_iommu(struct intel_iommu *iommu)
982 {
983 	/*
984 	 * All iommu domains must have been detached from the devices,
985 	 * hence there should be no domain IDs in use.
986 	 */
987 	if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
988 		return;
989 
990 	if (iommu->gcmd & DMA_GCMD_TE)
991 		iommu_disable_translation(iommu);
992 }
993 
994 static void free_dmar_iommu(struct intel_iommu *iommu)
995 {
996 	if (iommu->copied_tables) {
997 		bitmap_free(iommu->copied_tables);
998 		iommu->copied_tables = NULL;
999 	}
1000 
1001 	/* free context mapping */
1002 	free_context_table(iommu);
1003 
1004 	if (ecap_prs(iommu->ecap))
1005 		intel_iommu_finish_prq(iommu);
1006 }
1007 
1008 /*
1009  * Check and return whether first level is used by default for
1010  * DMA translation.
1011  */
1012 static bool first_level_by_default(struct intel_iommu *iommu)
1013 {
1014 	/* Only SL is available in legacy mode */
1015 	if (!sm_supported(iommu))
1016 		return false;
1017 
1018 	/* Only one level (either FL or SL) is available, just use it */
1019 	if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1020 		return ecap_flts(iommu->ecap);
1021 
1022 	return true;
1023 }
1024 
1025 int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1026 {
1027 	struct iommu_domain_info *info, *curr;
1028 	int num, ret = -ENOSPC;
1029 
1030 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1031 		return 0;
1032 
1033 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1034 	if (!info)
1035 		return -ENOMEM;
1036 
1037 	guard(mutex)(&iommu->did_lock);
1038 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1039 	if (curr) {
1040 		curr->refcnt++;
1041 		kfree(info);
1042 		return 0;
1043 	}
1044 
1045 	num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
1046 			      cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
1047 	if (num < 0) {
1048 		pr_err("%s: No free domain ids\n", iommu->name);
1049 		goto err_unlock;
1050 	}
1051 
1052 	info->refcnt	= 1;
1053 	info->did	= num;
1054 	info->iommu	= iommu;
1055 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1056 			  NULL, info, GFP_KERNEL);
1057 	if (curr) {
1058 		ret = xa_err(curr) ? : -EBUSY;
1059 		goto err_clear;
1060 	}
1061 
1062 	return 0;
1063 
1064 err_clear:
1065 	ida_free(&iommu->domain_ida, info->did);
1066 err_unlock:
1067 	kfree(info);
1068 	return ret;
1069 }
1070 
1071 void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1072 {
1073 	struct iommu_domain_info *info;
1074 
1075 	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1076 		return;
1077 
1078 	guard(mutex)(&iommu->did_lock);
1079 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1080 	if (--info->refcnt == 0) {
1081 		ida_free(&iommu->domain_ida, info->did);
1082 		xa_erase(&domain->iommu_array, iommu->seq_id);
1083 		kfree(info);
1084 	}
1085 }
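
/*
 * Attach/detach bookkeeping in short: the first attach of a domain to an
 * iommu allocates a domain ID from iommu->domain_ida and records it in
 * domain->iommu_array under iommu->seq_id; subsequent attaches only bump
 * the refcount, and the final detach releases the ID again.
 */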
1086 
1087 /*
1088  * For kdump cases, old valid entries may be cached due to the
1089  * in-flight DMA and copied pgtable, but there is no unmapping
1090  * behaviour for them, thus we need an explicit cache flush for
1091  * the newly-mapped device. For kdump, at this point, the device
1092  * is supposed to finish reset at its driver probe stage, so no
1093  * in-flight DMA will exist, and we don't need to worry anymore
1094  * in-flight DMA will exist, and we don't need to worry about it
1095  */
1096 static void copied_context_tear_down(struct intel_iommu *iommu,
1097 				     struct context_entry *context,
1098 				     u8 bus, u8 devfn)
1099 {
1100 	u16 did_old;
1101 
1102 	if (!context_copied(iommu, bus, devfn))
1103 		return;
1104 
1105 	assert_spin_locked(&iommu->lock);
1106 
1107 	did_old = context_domain_id(context);
1108 	context_clear_entry(context);
1109 
1110 	if (did_old < cap_ndoms(iommu->cap)) {
1111 		iommu->flush.flush_context(iommu, did_old,
1112 					   PCI_DEVID(bus, devfn),
1113 					   DMA_CCMD_MASK_NOBIT,
1114 					   DMA_CCMD_DEVICE_INVL);
1115 		iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1116 					 DMA_TLB_DSI_FLUSH);
1117 	}
1118 
1119 	clear_context_copied(iommu, bus, devfn);
1120 }
1121 
1122 /*
1123  * It's a non-present to present mapping. If hardware doesn't cache
1124  * non-present entry we only need to flush the write-buffer. If the
1125  * non-present entries, we only need to flush the write-buffer. If it
1126  * domain #0, which we have to flush:
1127  */
1128 static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1129 					u8 bus, u8 devfn)
1130 {
1131 	if (cap_caching_mode(iommu->cap)) {
1132 		iommu->flush.flush_context(iommu, 0,
1133 					   PCI_DEVID(bus, devfn),
1134 					   DMA_CCMD_MASK_NOBIT,
1135 					   DMA_CCMD_DEVICE_INVL);
1136 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1137 	} else {
1138 		iommu_flush_write_buffer(iommu);
1139 	}
1140 }
1141 
1142 static int domain_context_mapping_one(struct dmar_domain *domain,
1143 				      struct intel_iommu *iommu,
1144 				      u8 bus, u8 devfn)
1145 {
1146 	struct device_domain_info *info =
1147 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1148 	u16 did = domain_id_iommu(domain, iommu);
1149 	int translation = CONTEXT_TT_MULTI_LEVEL;
1150 	struct pt_iommu_vtdss_hw_info pt_info;
1151 	struct context_entry *context;
1152 	int ret;
1153 
1154 	if (WARN_ON(!intel_domain_is_ss_paging(domain)))
1155 		return -EINVAL;
1156 
1157 	pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
1158 
1159 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1160 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1161 
1162 	spin_lock(&iommu->lock);
1163 	ret = -ENOMEM;
1164 	context = iommu_context_addr(iommu, bus, devfn, 1);
1165 	if (!context)
1166 		goto out_unlock;
1167 
1168 	ret = 0;
1169 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1170 		goto out_unlock;
1171 
1172 	copied_context_tear_down(iommu, context, bus, devfn);
1173 	context_clear_entry(context);
1174 	context_set_domain_id(context, did);
1175 
1176 	if (info && info->ats_supported)
1177 		translation = CONTEXT_TT_DEV_IOTLB;
1178 	else
1179 		translation = CONTEXT_TT_MULTI_LEVEL;
1180 
1181 	context_set_address_root(context, pt_info.ssptptr);
1182 	context_set_address_width(context, pt_info.aw);
1183 	context_set_translation_type(context, translation);
1184 	context_set_fault_enable(context);
1185 	context_set_present(context);
1186 	if (!ecap_coherent(iommu->ecap))
1187 		clflush_cache_range(context, sizeof(*context));
1188 	context_present_cache_flush(iommu, did, bus, devfn);
1189 	ret = 0;
1190 
1191 out_unlock:
1192 	spin_unlock(&iommu->lock);
1193 
1194 	return ret;
1195 }
1196 
1197 static int domain_context_mapping_cb(struct pci_dev *pdev,
1198 				     u16 alias, void *opaque)
1199 {
1200 	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1201 	struct intel_iommu *iommu = info->iommu;
1202 	struct dmar_domain *domain = opaque;
1203 
1204 	return domain_context_mapping_one(domain, iommu,
1205 					  PCI_BUS_NUM(alias), alias & 0xff);
1206 }
1207 
1208 static int
1209 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1210 {
1211 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1212 	struct intel_iommu *iommu = info->iommu;
1213 	u8 bus = info->bus, devfn = info->devfn;
1214 	int ret;
1215 
1216 	if (!dev_is_pci(dev))
1217 		return domain_context_mapping_one(domain, iommu, bus, devfn);
1218 
1219 	ret = pci_for_each_dma_alias(to_pci_dev(dev),
1220 				     domain_context_mapping_cb, domain);
1221 	if (ret)
1222 		return ret;
1223 
1224 	iommu_enable_pci_ats(info);
1225 
1226 	return 0;
1227 }
1228 
1229 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1230 {
1231 	struct intel_iommu *iommu = info->iommu;
1232 	struct context_entry *context;
1233 	u16 did;
1234 
1235 	spin_lock(&iommu->lock);
1236 	context = iommu_context_addr(iommu, bus, devfn, 0);
1237 	if (!context) {
1238 		spin_unlock(&iommu->lock);
1239 		return;
1240 	}
1241 
1242 	did = context_domain_id(context);
1243 	context_clear_present(context);
1244 	__iommu_flush_cache(iommu, context, sizeof(*context));
1245 	spin_unlock(&iommu->lock);
1246 	intel_context_flush_no_pasid(info, context, did);
1247 	context_clear_entry(context);
1248 	__iommu_flush_cache(iommu, context, sizeof(*context));
1249 }
1250 
1251 int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
1252 			       ioasid_t pasid, u16 did, phys_addr_t fsptptr,
1253 			       int flags, struct iommu_domain *old)
1254 {
1255 	if (old)
1256 		intel_pasid_tear_down_entry(iommu, dev, pasid, false);
1257 
1258 	return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, did, flags);
1259 }
1260 
1261 static int domain_setup_second_level(struct intel_iommu *iommu,
1262 				     struct dmar_domain *domain,
1263 				     struct device *dev, ioasid_t pasid,
1264 				     struct iommu_domain *old)
1265 {
1266 	if (old)
1267 		intel_pasid_tear_down_entry(iommu, dev, pasid, false);
1268 
1269 	return intel_pasid_setup_second_level(iommu, domain, dev, pasid);
1270 }
1271 
1272 static int domain_setup_passthrough(struct intel_iommu *iommu,
1273 				    struct device *dev, ioasid_t pasid,
1274 				    struct iommu_domain *old)
1275 {
1276 	if (old)
1277 		intel_pasid_tear_down_entry(iommu, dev, pasid, false);
1278 
1279 	return intel_pasid_setup_pass_through(iommu, dev, pasid);
1280 }
1281 
1282 static int domain_setup_first_level(struct intel_iommu *iommu,
1283 				    struct dmar_domain *domain,
1284 				    struct device *dev,
1285 				    u32 pasid, struct iommu_domain *old)
1286 {
1287 	struct pt_iommu_x86_64_hw_info pt_info;
1288 	unsigned int flags = 0;
1289 
1290 	pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
1291 	if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
1292 		return -EINVAL;
1293 
1294 	if (pt_info.levels == 5)
1295 		flags |= PASID_FLAG_FL5LP;
1296 
1297 	if (domain->force_snooping)
1298 		flags |= PASID_FLAG_PAGE_SNOOP;
1299 
1300 	if (!(domain->fspt.x86_64_pt.common.features &
1301 	      BIT(PT_FEAT_DMA_INCOHERENT)))
1302 		flags |= PASID_FLAG_PWSNP;
1303 
1304 	return __domain_setup_first_level(iommu, dev, pasid,
1305 					  domain_id_iommu(domain, iommu),
1306 					  pt_info.gcr3_pt, flags, old);
1307 }
1308 
1309 static int dmar_domain_attach_device(struct dmar_domain *domain,
1310 				     struct device *dev)
1311 {
1312 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1313 	struct intel_iommu *iommu = info->iommu;
1314 	unsigned long flags;
1315 	int ret;
1316 
1317 	ret = domain_attach_iommu(domain, iommu);
1318 	if (ret)
1319 		return ret;
1320 
1321 	info->domain = domain;
1322 	info->domain_attached = true;
1323 	spin_lock_irqsave(&domain->lock, flags);
1324 	list_add(&info->link, &domain->devices);
1325 	spin_unlock_irqrestore(&domain->lock, flags);
1326 
1327 	if (dev_is_real_dma_subdevice(dev))
1328 		return 0;
1329 
1330 	if (!sm_supported(iommu))
1331 		ret = domain_context_mapping(domain, dev);
1332 	else if (intel_domain_is_fs_paging(domain))
1333 		ret = domain_setup_first_level(iommu, domain, dev,
1334 					       IOMMU_NO_PASID, NULL);
1335 	else if (intel_domain_is_ss_paging(domain))
1336 		ret = domain_setup_second_level(iommu, domain, dev,
1337 						IOMMU_NO_PASID, NULL);
1338 	else if (WARN_ON(true))
1339 		ret = -EINVAL;
1340 
1341 	if (ret)
1342 		goto out_block_translation;
1343 
1344 	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1345 	if (ret)
1346 		goto out_block_translation;
1347 
1348 	return 0;
1349 
1350 out_block_translation:
1351 	device_block_translation(dev);
1352 	return ret;
1353 }
1354 
1355 /**
1356  * device_rmrr_is_relaxable - Test whether the RMRR of this device
1357  * is relaxable (i.e. allowed to go unenforced under some conditions)
1358  * @dev: device handle
1359  *
1360  * We assume that PCI USB devices with RMRRs have them largely
1361  * for historical reasons and that the RMRR space is not actively used post
1362  * boot.  This exclusion may change if vendors begin to abuse it.
1363  *
1364  * The same exception is made for graphics devices, with the requirement that
1365  * any use of the RMRR regions will be torn down before assigning the device
1366  * to a guest.
1367  *
1368  * Return: true if the RMRR is relaxable, false otherwise
1369  */
1370 static bool device_rmrr_is_relaxable(struct device *dev)
1371 {
1372 	struct pci_dev *pdev;
1373 
1374 	if (!dev_is_pci(dev))
1375 		return false;
1376 
1377 	pdev = to_pci_dev(dev);
1378 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1379 		return true;
1380 	else
1381 		return false;
1382 }
1383 
1384 static int device_def_domain_type(struct device *dev)
1385 {
1386 	struct device_domain_info *info = dev_iommu_priv_get(dev);
1387 	struct intel_iommu *iommu = info->iommu;
1388 
1389 	/*
1390 	 * Hardware does not support the passthrough translation mode.
1391 	 * Always use a dynamic mapping domain.
1392 	 */
1393 	if (!ecap_pass_through(iommu->ecap))
1394 		return IOMMU_DOMAIN_DMA;
1395 
1396 	if (dev_is_pci(dev)) {
1397 		struct pci_dev *pdev = to_pci_dev(dev);
1398 
1399 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1400 			return IOMMU_DOMAIN_IDENTITY;
1401 	}
1402 
1403 	return 0;
1404 }
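
/*
 * Returning 0 above expresses no preference and lets the iommu core pick
 * the default domain type; the IDENTMAP_AZALIA case forces an identity
 * domain for the Azalia sound device quirk.
 */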
1405 
1406 static void intel_iommu_init_qi(struct intel_iommu *iommu)
1407 {
1408 	/*
1409 	 * Start from a sane iommu hardware state.
1410 	 * If queued invalidation has already been initialized by us
1411 	 * (for example, while enabling interrupt-remapping), then
1412 	 * things are already rolling from a sane state.
1413 	 */
1414 	if (!iommu->qi) {
1415 		/*
1416 		 * Clear any previous faults.
1417 		 */
1418 		dmar_fault(-1, iommu);
1419 		/*
1420 		 * Disable queued invalidation if supported and already enabled
1421 		 * before OS handover.
1422 		 */
1423 		dmar_disable_qi(iommu);
1424 	}
1425 
1426 	if (dmar_enable_qi(iommu)) {
1427 		/*
1428 		 * Queued Invalidate not enabled, use Register Based Invalidate
1429 		 */
1430 		iommu->flush.flush_context = __iommu_flush_context;
1431 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1432 		pr_info("%s: Using Register based invalidation\n",
1433 			iommu->name);
1434 	} else {
1435 		iommu->flush.flush_context = qi_flush_context;
1436 		iommu->flush.flush_iotlb = qi_flush_iotlb;
1437 		pr_info("%s: Using Queued invalidation\n", iommu->name);
1438 	}
1439 }
1440 
1441 static int copy_context_table(struct intel_iommu *iommu,
1442 			      struct root_entry *old_re,
1443 			      struct context_entry **tbl,
1444 			      int bus, bool ext)
1445 {
1446 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1447 	struct context_entry *new_ce = NULL, ce;
1448 	struct context_entry *old_ce = NULL;
1449 	struct root_entry re;
1450 	phys_addr_t old_ce_phys;
1451 
1452 	tbl_idx = ext ? bus * 2 : bus;
1453 	memcpy(&re, old_re, sizeof(re));
1454 
1455 	for (devfn = 0; devfn < 256; devfn++) {
1456 		/* First calculate the correct index */
1457 		idx = (ext ? devfn * 2 : devfn) % 256;
1458 
1459 		if (idx == 0) {
1460 			/* First save what we may have and clean up */
1461 			if (new_ce) {
1462 				tbl[tbl_idx] = new_ce;
1463 				__iommu_flush_cache(iommu, new_ce,
1464 						    VTD_PAGE_SIZE);
1465 				pos = 1;
1466 			}
1467 
1468 			if (old_ce)
1469 				memunmap(old_ce);
1470 
1471 			ret = 0;
1472 			if (devfn < 0x80)
1473 				old_ce_phys = root_entry_lctp(&re);
1474 			else
1475 				old_ce_phys = root_entry_uctp(&re);
1476 
1477 			if (!old_ce_phys) {
1478 				if (ext && devfn == 0) {
1479 					/* No LCTP, try UCTP */
1480 					devfn = 0x7f;
1481 					continue;
1482 				} else {
1483 					goto out;
1484 				}
1485 			}
1486 
1487 			ret = -ENOMEM;
1488 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
1489 					MEMREMAP_WB);
1490 			if (!old_ce)
1491 				goto out;
1492 
1493 			new_ce = iommu_alloc_pages_node_sz(iommu->node,
1494 							   GFP_KERNEL, SZ_4K);
1495 			if (!new_ce)
1496 				goto out_unmap;
1497 
1498 			ret = 0;
1499 		}
1500 
1501 		/* Now copy the context entry */
1502 		memcpy(&ce, old_ce + idx, sizeof(ce));
1503 
1504 		if (!context_present(&ce))
1505 			continue;
1506 
1507 		did = context_domain_id(&ce);
1508 		if (did >= 0 && did < cap_ndoms(iommu->cap))
1509 			ida_alloc_range(&iommu->domain_ida, did, did, GFP_KERNEL);
1510 
1511 		set_context_copied(iommu, bus, devfn);
1512 		new_ce[idx] = ce;
1513 	}
1514 
1515 	tbl[tbl_idx + pos] = new_ce;
1516 
1517 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
1518 
1519 out_unmap:
1520 	memunmap(old_ce);
1521 
1522 out:
1523 	return ret;
1524 }
1525 
1526 static int copy_translation_tables(struct intel_iommu *iommu)
1527 {
1528 	struct context_entry **ctxt_tbls;
1529 	struct root_entry *old_rt;
1530 	phys_addr_t old_rt_phys;
1531 	int ctxt_table_entries;
1532 	u64 rtaddr_reg;
1533 	int bus, ret;
1534 	bool new_ext, ext;
1535 
1536 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
1537 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
1538 	new_ext    = !!sm_supported(iommu);
1539 
1540 	/*
1541 	 * The RTT bit can only be changed when translation is disabled,
1542 	 * but disabling translation means opening a window for data
1543 	 * corruption. So bail out and don't copy anything if we would
1544 	 * have to change the bit.
1545 	 */
1546 	if (new_ext != ext)
1547 		return -EINVAL;
1548 
1549 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
1550 	if (!iommu->copied_tables)
1551 		return -ENOMEM;
1552 
1553 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
1554 	if (!old_rt_phys)
1555 		return -EINVAL;
1556 
1557 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
1558 	if (!old_rt)
1559 		return -ENOMEM;
1560 
1561 	/* This is too big for the stack - allocate it from slab */
1562 	ctxt_table_entries = ext ? 512 : 256;
1563 	ret = -ENOMEM;
1564 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
1565 	if (!ctxt_tbls)
1566 		goto out_unmap;
1567 
1568 	for (bus = 0; bus < 256; bus++) {
1569 		ret = copy_context_table(iommu, &old_rt[bus],
1570 					 ctxt_tbls, bus, ext);
1571 		if (ret) {
1572 			pr_err("%s: Failed to copy context table for bus %d\n",
1573 				iommu->name, bus);
1574 			continue;
1575 		}
1576 	}
1577 
1578 	spin_lock(&iommu->lock);
1579 
1580 	/* Context tables are copied, now write them to the root_entry table */
1581 	for (bus = 0; bus < 256; bus++) {
1582 		int idx = ext ? bus * 2 : bus;
1583 		u64 val;
1584 
1585 		if (ctxt_tbls[idx]) {
1586 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
1587 			iommu->root_entry[bus].lo = val;
1588 		}
1589 
1590 		if (!ext || !ctxt_tbls[idx + 1])
1591 			continue;
1592 
1593 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
1594 		iommu->root_entry[bus].hi = val;
1595 	}
1596 
1597 	spin_unlock(&iommu->lock);
1598 
1599 	kfree(ctxt_tbls);
1600 
1601 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
1602 
1603 	ret = 0;
1604 
1605 out_unmap:
1606 	memunmap(old_rt);
1607 
1608 	return ret;
1609 }
1610 
1611 static int __init init_dmars(void)
1612 {
1613 	struct dmar_drhd_unit *drhd;
1614 	struct intel_iommu *iommu;
1615 	int ret;
1616 
1617 	for_each_iommu(iommu, drhd) {
1618 		if (drhd->ignored) {
1619 			iommu_disable_translation(iommu);
1620 			continue;
1621 		}
1622 
1623 		/*
1624 		 * Find the max pasid size of all IOMMUs in the system.
1625 		 * We need to ensure the system pasid table is no bigger
1626 		 * than the smallest supported.
1627 		 */
1628 		if (pasid_supported(iommu)) {
1629 			u32 temp = 2 << ecap_pss(iommu->ecap);
1630 
1631 			intel_pasid_max_id = min_t(u32, temp,
1632 						   intel_pasid_max_id);
1633 		}
1634 
1635 		intel_iommu_init_qi(iommu);
1636 		init_translation_status(iommu);
1637 
1638 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
1639 			iommu_disable_translation(iommu);
1640 			clear_translation_pre_enabled(iommu);
1641 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
1642 				iommu->name);
1643 		}
1644 
1645 		/*
1646 		 * TBD:
1647 		 * we could share the same root & context tables
1648 		 * among all IOMMUs. Need to split it later.
1649 		 */
1650 		ret = iommu_alloc_root_entry(iommu);
1651 		if (ret)
1652 			goto free_iommu;
1653 
1654 		if (translation_pre_enabled(iommu)) {
1655 			pr_info("Translation already enabled - trying to copy translation structures\n");
1656 
1657 			ret = copy_translation_tables(iommu);
1658 			if (ret) {
1659 				/*
1660 				 * We found the IOMMU with translation
1661 				 * enabled - but failed to copy over the
1662 				 * old root-entry table. Try to proceed
1663 				 * by disabling translation now and
1664 				 * allocating a clean root-entry table.
1665 				 * This might cause DMAR faults, but
1666 				 * probably the dump will still succeed.
1667 				 */
1668 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
1669 				       iommu->name);
1670 				iommu_disable_translation(iommu);
1671 				clear_translation_pre_enabled(iommu);
1672 			} else {
1673 				pr_info("Copied translation tables from previous kernel for %s\n",
1674 					iommu->name);
1675 			}
1676 		}
1677 
1678 		intel_svm_check(iommu);
1679 	}
1680 
1681 	/*
1682 	 * Now that qi is enabled on all iommus, set the root entry and flush
1683 	 * caches. This is required on some Intel X58 chipsets, otherwise the
1684 	 * flush_context function will loop forever and the boot hangs.
1685 	 */
1686 	for_each_active_iommu(iommu, drhd) {
1687 		iommu_flush_write_buffer(iommu);
1688 		iommu_set_root_entry(iommu);
1689 	}
1690 
1691 	check_tylersburg_isoch();
1692 
1693 	/*
1694 	 * for each drhd
1695 	 *   enable fault log
1696 	 *   global invalidate context cache
1697 	 *   global invalidate iotlb
1698 	 *   enable translation
1699 	 */
1700 	for_each_iommu(iommu, drhd) {
1701 		if (drhd->ignored) {
1702 			/*
1703 			 * we always have to disable PMRs or DMA may fail on
1704 			 * this device
1705 			 */
1706 			if (force_on)
1707 				iommu_disable_protect_mem_regions(iommu);
1708 			continue;
1709 		}
1710 
1711 		iommu_flush_write_buffer(iommu);
1712 
1713 		if (ecap_prs(iommu->ecap)) {
1714 			/*
1715 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
1716 			 * could cause a possible lock race condition.
1717 			 */
1718 			up_write(&dmar_global_lock);
1719 			ret = intel_iommu_enable_prq(iommu);
1720 			down_write(&dmar_global_lock);
1721 			if (ret)
1722 				goto free_iommu;
1723 		}
1724 
1725 		ret = dmar_set_interrupt(iommu);
1726 		if (ret)
1727 			goto free_iommu;
1728 	}
1729 
1730 	return 0;
1731 
1732 free_iommu:
1733 	for_each_active_iommu(iommu, drhd) {
1734 		disable_dmar_iommu(iommu);
1735 		free_dmar_iommu(iommu);
1736 	}
1737 
1738 	return ret;
1739 }
1740 
1741 static void __init init_no_remapping_devices(void)
1742 {
1743 	struct dmar_drhd_unit *drhd;
1744 	struct device *dev;
1745 	int i;
1746 
1747 	for_each_drhd_unit(drhd) {
1748 		if (!drhd->include_all) {
1749 			for_each_active_dev_scope(drhd->devices,
1750 						  drhd->devices_cnt, i, dev)
1751 				break;
1752 			/* ignore DMAR unit if no devices exist */
1753 			if (i == drhd->devices_cnt)
1754 				drhd->ignored = 1;
1755 		}
1756 	}
1757 
1758 	for_each_active_drhd_unit(drhd) {
1759 		if (drhd->include_all)
1760 			continue;
1761 
1762 		for_each_active_dev_scope(drhd->devices,
1763 					  drhd->devices_cnt, i, dev)
1764 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
1765 				break;
1766 		if (i < drhd->devices_cnt)
1767 			continue;
1768 
1769 		/* This IOMMU has *only* gfx devices. Either bypass it or
1770 		   set the gfx_dedicated flag, as appropriate */
1771 		drhd->gfx_dedicated = 1;
1772 		if (disable_igfx_iommu)
1773 			drhd->ignored = 1;
1774 	}
1775 }
1776 
1777 #ifdef CONFIG_SUSPEND
1778 static int init_iommu_hw(void)
1779 {
1780 	struct dmar_drhd_unit *drhd;
1781 	struct intel_iommu *iommu = NULL;
1782 	int ret;
1783 
1784 	for_each_active_iommu(iommu, drhd) {
1785 		if (iommu->qi) {
1786 			ret = dmar_reenable_qi(iommu);
1787 			if (ret)
1788 				return ret;
1789 		}
1790 	}
1791 
1792 	for_each_iommu(iommu, drhd) {
1793 		if (drhd->ignored) {
1794 			/*
1795 			 * we always have to disable PMRs or DMA may fail on
1796 			 * this device
1797 			 */
1798 			if (force_on)
1799 				iommu_disable_protect_mem_regions(iommu);
1800 			continue;
1801 		}
1802 
1803 		iommu_flush_write_buffer(iommu);
1804 		iommu_set_root_entry(iommu);
1805 		iommu_enable_translation(iommu);
1806 		iommu_disable_protect_mem_regions(iommu);
1807 	}
1808 
1809 	return 0;
1810 }
1811 
1812 static void iommu_flush_all(void)
1813 {
1814 	struct dmar_drhd_unit *drhd;
1815 	struct intel_iommu *iommu;
1816 
1817 	for_each_active_iommu(iommu, drhd) {
1818 		iommu->flush.flush_context(iommu, 0, 0, 0,
1819 					   DMA_CCMD_GLOBAL_INVL);
1820 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1821 					 DMA_TLB_GLOBAL_FLUSH);
1822 	}
1823 }
1824 
1825 static int iommu_suspend(void *data)
1826 {
1827 	struct dmar_drhd_unit *drhd;
1828 	struct intel_iommu *iommu = NULL;
1829 	unsigned long flag;
1830 
1831 	iommu_flush_all();
1832 
1833 	for_each_active_iommu(iommu, drhd) {
1834 		iommu_disable_translation(iommu);
1835 
1836 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
1837 
1838 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
1839 			readl(iommu->reg + DMAR_FECTL_REG);
1840 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
1841 			readl(iommu->reg + DMAR_FEDATA_REG);
1842 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
1843 			readl(iommu->reg + DMAR_FEADDR_REG);
1844 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
1845 			readl(iommu->reg + DMAR_FEUADDR_REG);
1846 
1847 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1848 	}
1849 	return 0;
1850 }
1851 
1852 static void iommu_resume(void *data)
1853 {
1854 	struct dmar_drhd_unit *drhd;
1855 	struct intel_iommu *iommu = NULL;
1856 	unsigned long flag;
1857 
1858 	if (init_iommu_hw()) {
1859 		if (force_on)
1860 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
1861 		else
1862 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
1863 		return;
1864 	}
1865 
1866 	for_each_active_iommu(iommu, drhd) {
1867 
1868 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
1869 
1870 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
1871 			iommu->reg + DMAR_FECTL_REG);
1872 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
1873 			iommu->reg + DMAR_FEDATA_REG);
1874 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
1875 			iommu->reg + DMAR_FEADDR_REG);
1876 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
1877 			iommu->reg + DMAR_FEUADDR_REG);
1878 
1879 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1880 	}
1881 }
1882 
1883 static const struct syscore_ops iommu_syscore_ops = {
1884 	.resume		= iommu_resume,
1885 	.suspend	= iommu_suspend,
1886 };
1887 
1888 static struct syscore iommu_syscore = {
1889 	.ops = &iommu_syscore_ops,
1890 };
1891 
1892 static void __init init_iommu_pm_ops(void)
1893 {
1894 	register_syscore(&iommu_syscore);
1895 }
1896 
1897 #else
1898 static inline void init_iommu_pm_ops(void) {}
1899 #endif	/* CONFIG_SUSPEND */
1900 
1901 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
1902 {
1903 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
1904 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
1905 	    rmrr->end_address <= rmrr->base_address ||
1906 	    arch_rmrr_sanity_check(rmrr))
1907 		return -EINVAL;
1908 
1909 	return 0;
1910 }
1911 
1912 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
1913 {
1914 	struct acpi_dmar_reserved_memory *rmrr;
1915 	struct dmar_rmrr_unit *rmrru;
1916 
1917 	rmrr = (struct acpi_dmar_reserved_memory *)header;
1918 	if (rmrr_sanity_check(rmrr)) {
1919 		pr_warn(FW_BUG
1920 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
1921 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1922 			   rmrr->base_address, rmrr->end_address,
1923 			   dmi_get_system_info(DMI_BIOS_VENDOR),
1924 			   dmi_get_system_info(DMI_BIOS_VERSION),
1925 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
1926 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
1927 	}
1928 
1929 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
1930 	if (!rmrru)
1931 		goto out;
1932 
1933 	rmrru->hdr = header;
1934 
1935 	rmrru->base_address = rmrr->base_address;
1936 	rmrru->end_address = rmrr->end_address;
1937 
1938 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
1939 				((void *)rmrr) + rmrr->header.length,
1940 				&rmrru->devices_cnt);
1941 	if (rmrru->devices_cnt && rmrru->devices == NULL)
1942 		goto free_rmrru;
1943 
1944 	list_add(&rmrru->list, &dmar_rmrr_units);
1945 
1946 	return 0;
1947 free_rmrru:
1948 	kfree(rmrru);
1949 out:
1950 	return -ENOMEM;
1951 }
1952 
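/*
 * Look up an already-parsed ATSR unit that matches the given ACPI ATSR
 * structure (same segment, length and contents). Returns NULL if no
 * such unit has been registered.
 */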
1953 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
1954 {
1955 	struct dmar_atsr_unit *atsru;
1956 	struct acpi_dmar_atsr *tmp;
1957 
1958 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
1959 				dmar_rcu_check()) {
1960 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
1961 		if (atsr->segment != tmp->segment)
1962 			continue;
1963 		if (atsr->header.length != tmp->header.length)
1964 			continue;
1965 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
1966 			return atsru;
1967 	}
1968 
1969 	return NULL;
1970 }
1971 
1972 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
1973 {
1974 	struct acpi_dmar_atsr *atsr;
1975 	struct dmar_atsr_unit *atsru;
1976 
1977 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
1978 		return 0;
1979 
1980 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
1981 	atsru = dmar_find_atsr(atsr);
1982 	if (atsru)
1983 		return 0;
1984 
1985 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
1986 	if (!atsru)
1987 		return -ENOMEM;
1988 
1989 	/*
1990 	 * If memory is allocated from slab by ACPI _DSM method, we need to
1991 	 * copy the memory content because the memory buffer will be freed
1992 	 * on return.
1993 	 */
1994 	atsru->hdr = (void *)(atsru + 1);
1995 	memcpy(atsru->hdr, hdr, hdr->length);
1996 	atsru->include_all = atsr->flags & 0x1;
1997 	if (!atsru->include_all) {
1998 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
1999 				(void *)atsr + atsr->header.length,
2000 				&atsru->devices_cnt);
2001 		if (atsru->devices_cnt && atsru->devices == NULL) {
2002 			kfree(atsru);
2003 			return -ENOMEM;
2004 		}
2005 	}
2006 
2007 	list_add_rcu(&atsru->list, &dmar_atsr_units);
2008 
2009 	return 0;
2010 }
2011 
2012 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2013 {
2014 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2015 	kfree(atsru);
2016 }
2017 
2018 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2019 {
2020 	struct acpi_dmar_atsr *atsr;
2021 	struct dmar_atsr_unit *atsru;
2022 
2023 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2024 	atsru = dmar_find_atsr(atsr);
2025 	if (atsru) {
2026 		list_del_rcu(&atsru->list);
2027 		synchronize_rcu();
2028 		intel_iommu_free_atsr(atsru);
2029 	}
2030 
2031 	return 0;
2032 }
2033 
2034 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2035 {
2036 	int i;
2037 	struct device *dev;
2038 	struct acpi_dmar_atsr *atsr;
2039 	struct dmar_atsr_unit *atsru;
2040 
2041 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2042 	atsru = dmar_find_atsr(atsr);
2043 	if (!atsru)
2044 		return 0;
2045 
2046 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2047 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2048 					  i, dev)
2049 			return -EBUSY;
2050 	}
2051 
2052 	return 0;
2053 }
2054 
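/*
 * Look up an already-parsed SATC unit that matches the given ACPI SATC
 * structure (same segment, length and contents). Returns NULL if no
 * such unit has been registered.
 */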
2055 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2056 {
2057 	struct dmar_satc_unit *satcu;
2058 	struct acpi_dmar_satc *tmp;
2059 
2060 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2061 				dmar_rcu_check()) {
2062 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2063 		if (satc->segment != tmp->segment)
2064 			continue;
2065 		if (satc->header.length != tmp->header.length)
2066 			continue;
2067 		if (memcmp(satc, tmp, satc->header.length) == 0)
2068 			return satcu;
2069 	}
2070 
2071 	return NULL;
2072 }
2073 
2074 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2075 {
2076 	struct acpi_dmar_satc *satc;
2077 	struct dmar_satc_unit *satcu;
2078 
2079 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2080 		return 0;
2081 
2082 	satc = container_of(hdr, struct acpi_dmar_satc, header);
2083 	satcu = dmar_find_satc(satc);
2084 	if (satcu)
2085 		return 0;
2086 
2087 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2088 	if (!satcu)
2089 		return -ENOMEM;
2090 
2091 	satcu->hdr = (void *)(satcu + 1);
2092 	memcpy(satcu->hdr, hdr, hdr->length);
2093 	satcu->atc_required = satc->flags & 0x1;
2094 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2095 					      (void *)satc + satc->header.length,
2096 					      &satcu->devices_cnt);
2097 	if (satcu->devices_cnt && !satcu->devices) {
2098 		kfree(satcu);
2099 		return -ENOMEM;
2100 	}
2101 	list_add_rcu(&satcu->list, &dmar_satc_units);
2102 
2103 	return 0;
2104 }
2105 
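/*
 * Bring up a hot-added DMAR unit: disable any translation left enabled
 * prior to OS handover, allocate the root entry, set up queued
 * invalidation, page requests and the fault interrupt, then enable
 * translation unless the unit is marked as ignored.
 */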
2106 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2107 {
2108 	struct intel_iommu *iommu = dmaru->iommu;
2109 	int ret;
2110 
2111 	/*
2112 	 * Disable translation if already enabled prior to OS handover.
2113 	 */
2114 	if (iommu->gcmd & DMA_GCMD_TE)
2115 		iommu_disable_translation(iommu);
2116 
2117 	ret = iommu_alloc_root_entry(iommu);
2118 	if (ret)
2119 		goto out;
2120 
2121 	intel_svm_check(iommu);
2122 
2123 	if (dmaru->ignored) {
2124 		/*
2125 		 * we always have to disable PMRs or DMA may fail on this device
2126 		 */
2127 		if (force_on)
2128 			iommu_disable_protect_mem_regions(iommu);
2129 		return 0;
2130 	}
2131 
2132 	intel_iommu_init_qi(iommu);
2133 	iommu_flush_write_buffer(iommu);
2134 
2135 	if (ecap_prs(iommu->ecap)) {
2136 		ret = intel_iommu_enable_prq(iommu);
2137 		if (ret)
2138 			goto disable_iommu;
2139 	}
2140 
2141 	ret = dmar_set_interrupt(iommu);
2142 	if (ret)
2143 		goto disable_iommu;
2144 
2145 	iommu_set_root_entry(iommu);
2146 	iommu_enable_translation(iommu);
2147 
2148 	iommu_disable_protect_mem_regions(iommu);
2149 	return 0;
2150 
2151 disable_iommu:
2152 	disable_dmar_iommu(iommu);
2153 out:
2154 	free_dmar_iommu(iommu);
2155 	return ret;
2156 }
2157 
2158 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2159 {
2160 	int ret = 0;
2161 	struct intel_iommu *iommu = dmaru->iommu;
2162 
2163 	if (!intel_iommu_enabled)
2164 		return 0;
2165 	if (iommu == NULL)
2166 		return -EINVAL;
2167 
2168 	if (insert) {
2169 		ret = intel_iommu_add(dmaru);
2170 	} else {
2171 		disable_dmar_iommu(iommu);
2172 		free_dmar_iommu(iommu);
2173 	}
2174 
2175 	return ret;
2176 }
2177 
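/*
 * Release all RMRR, ATSR and SATC units parsed from the DMAR table,
 * including their device scopes. Used on the initialization error path.
 */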
2178 static void intel_iommu_free_dmars(void)
2179 {
2180 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
2181 	struct dmar_atsr_unit *atsru, *atsr_n;
2182 	struct dmar_satc_unit *satcu, *satc_n;
2183 
2184 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2185 		list_del(&rmrru->list);
2186 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2187 		kfree(rmrru);
2188 	}
2189 
2190 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2191 		list_del(&atsru->list);
2192 		intel_iommu_free_atsr(atsru);
2193 	}
2194 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2195 		list_del(&satcu->list);
2196 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2197 		kfree(satcu);
2198 	}
2199 }
2200 
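/*
 * Find the SATC unit whose device scope contains the given PCI device,
 * or NULL if the device is not listed in any SATC entry.
 */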
2201 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2202 {
2203 	struct dmar_satc_unit *satcu;
2204 	struct acpi_dmar_satc *satc;
2205 	struct device *tmp;
2206 	int i;
2207 
2208 	rcu_read_lock();
2209 
2210 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2211 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2212 		if (satc->segment != pci_domain_nr(dev->bus))
2213 			continue;
2214 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2215 			if (to_pci_dev(tmp) == dev)
2216 				goto out;
2217 	}
2218 	satcu = NULL;
2219 out:
2220 	rcu_read_unlock();
2221 	return satcu;
2222 }
2223 
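/*
 * Decide whether ATS may be enabled for @dev: honor the SATC table if
 * the device is listed there, otherwise walk up to the root port and
 * check it against the ATSR device scopes of the segment.
 */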
2224 static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2225 {
2226 	struct pci_dev *bridge = NULL;
2227 	struct dmar_atsr_unit *atsru;
2228 	struct dmar_satc_unit *satcu;
2229 	struct acpi_dmar_atsr *atsr;
2230 	bool supported = true;
2231 	struct pci_bus *bus;
2232 	struct device *tmp;
2233 	int i;
2234 
2235 	dev = pci_physfn(dev);
2236 	satcu = dmar_find_matched_satc_unit(dev);
2237 	if (satcu)
2238 		/*
2239 		 * This device supports ATS because it is listed in the SATC
2240 		 * table. When the IOMMU is in legacy mode, the hardware
2241 		 * enables ATS automatically for devices that require it, so
2242 		 * the OS should not enable ATS here to avoid duplicated
2243 		 * IOTLB invalidations.
2244 		 */
2245 		return !(satcu->atc_required && !sm_supported(iommu));
2246 
2247 	for (bus = dev->bus; bus; bus = bus->parent) {
2248 		bridge = bus->self;
2249 		/* If it's an integrated device, allow ATS */
2250 		if (!bridge)
2251 			return true;
2252 		/* Connected via non-PCIe: no ATS */
2253 		if (!pci_is_pcie(bridge) ||
2254 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2255 			return false;
2256 		/* If we found the root port, look it up in the ATSR */
2257 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2258 			break;
2259 	}
2260 
2261 	rcu_read_lock();
2262 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2263 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2264 		if (atsr->segment != pci_domain_nr(dev->bus))
2265 			continue;
2266 
2267 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2268 			if (tmp == &bridge->dev)
2269 				goto out;
2270 
2271 		if (atsru->include_all)
2272 			goto out;
2273 	}
2274 	supported = false;
2275 out:
2276 	rcu_read_unlock();
2277 
2278 	return supported;
2279 }
2280 
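/*
 * PCI bus notifier hook: keep the RMRR, ATSR and SATC device scopes in
 * sync when PCI devices are hot-added or removed.
 */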
2281 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2282 {
2283 	int ret;
2284 	struct dmar_rmrr_unit *rmrru;
2285 	struct dmar_atsr_unit *atsru;
2286 	struct dmar_satc_unit *satcu;
2287 	struct acpi_dmar_atsr *atsr;
2288 	struct acpi_dmar_reserved_memory *rmrr;
2289 	struct acpi_dmar_satc *satc;
2290 
2291 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2292 		return 0;
2293 
2294 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2295 		rmrr = container_of(rmrru->hdr,
2296 				    struct acpi_dmar_reserved_memory, header);
2297 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2298 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2299 				((void *)rmrr) + rmrr->header.length,
2300 				rmrr->segment, rmrru->devices,
2301 				rmrru->devices_cnt);
2302 			if (ret < 0)
2303 				return ret;
2304 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2305 			dmar_remove_dev_scope(info, rmrr->segment,
2306 				rmrru->devices, rmrru->devices_cnt);
2307 		}
2308 	}
2309 
2310 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
2311 		if (atsru->include_all)
2312 			continue;
2313 
2314 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2315 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2316 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2317 					(void *)atsr + atsr->header.length,
2318 					atsr->segment, atsru->devices,
2319 					atsru->devices_cnt);
2320 			if (ret > 0)
2321 				break;
2322 			else if (ret < 0)
2323 				return ret;
2324 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2325 			if (dmar_remove_dev_scope(info, atsr->segment,
2326 					atsru->devices, atsru->devices_cnt))
2327 				break;
2328 		}
2329 	}
2330 	list_for_each_entry(satcu, &dmar_satc_units, list) {
2331 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2332 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2333 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2334 					(void *)satc + satc->header.length,
2335 					satc->segment, satcu->devices,
2336 					satcu->devices_cnt);
2337 			if (ret > 0)
2338 				break;
2339 			else if (ret < 0)
2340 				return ret;
2341 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2342 			if (dmar_remove_dev_scope(info, satc->segment,
2343 					satcu->devices, satcu->devices_cnt))
2344 				break;
2345 		}
2346 	}
2347 
2348 	return 0;
2349 }
2350 
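/* Turn off DMA remapping on all IOMMUs in the system. */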
2351 static void intel_disable_iommus(void)
2352 {
2353 	struct intel_iommu *iommu = NULL;
2354 	struct dmar_drhd_unit *drhd;
2355 
2356 	for_each_iommu(iommu, drhd)
2357 		iommu_disable_translation(iommu);
2358 }
2359 
2360 void intel_iommu_shutdown(void)
2361 {
2362 	struct dmar_drhd_unit *drhd;
2363 	struct intel_iommu *iommu = NULL;
2364 
2365 	if (no_iommu || dmar_disabled)
2366 		return;
2367 
2368 	/*
2369 	 * All other CPUs were brought down and hotplug interrupts were
2370 	 * disabled; no locking or RCU checking is needed anymore.
2371 	 */
2372 	list_for_each_entry(drhd, &dmar_drhd_units, list) {
2373 		iommu = drhd->iommu;
2374 
2375 		/* Disable PMRs explicitly here. */
2376 		iommu_disable_protect_mem_regions(iommu);
2377 
2378 		/* Make sure the IOMMUs are switched off */
2379 		iommu_disable_translation(iommu);
2380 	}
2381 }
2382 
2383 static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2384 {
2385 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2386 
2387 	return container_of(iommu_dev, struct intel_iommu, iommu);
2388 }
2389 
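/*
 * Per-IOMMU sysfs attributes exposed in the "intel-iommu" group:
 * hardware version, register base address, the raw capability and
 * extended capability registers, and the supported/used domain counts.
 */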
2390 static ssize_t version_show(struct device *dev,
2391 			    struct device_attribute *attr, char *buf)
2392 {
2393 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2394 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
2395 	return sysfs_emit(buf, "%d:%d\n",
2396 			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2397 }
2398 static DEVICE_ATTR_RO(version);
2399 
2400 static ssize_t address_show(struct device *dev,
2401 			    struct device_attribute *attr, char *buf)
2402 {
2403 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2404 	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2405 }
2406 static DEVICE_ATTR_RO(address);
2407 
2408 static ssize_t cap_show(struct device *dev,
2409 			struct device_attribute *attr, char *buf)
2410 {
2411 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2412 	return sysfs_emit(buf, "%llx\n", iommu->cap);
2413 }
2414 static DEVICE_ATTR_RO(cap);
2415 
2416 static ssize_t ecap_show(struct device *dev,
2417 			 struct device_attribute *attr, char *buf)
2418 {
2419 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2420 	return sysfs_emit(buf, "%llx\n", iommu->ecap);
2421 }
2422 static DEVICE_ATTR_RO(ecap);
2423 
2424 static ssize_t domains_supported_show(struct device *dev,
2425 				      struct device_attribute *attr, char *buf)
2426 {
2427 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2428 	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2429 }
2430 static DEVICE_ATTR_RO(domains_supported);
2431 
2432 static ssize_t domains_used_show(struct device *dev,
2433 				 struct device_attribute *attr, char *buf)
2434 {
2435 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2436 	unsigned int count = 0;
2437 	int id;
2438 
2439 	for (id = 0; id < cap_ndoms(iommu->cap); id++)
2440 		if (ida_exists(&iommu->domain_ida, id))
2441 			count++;
2442 
2443 	return sysfs_emit(buf, "%d\n", count);
2444 }
2445 static DEVICE_ATTR_RO(domains_used);
2446 
2447 static struct attribute *intel_iommu_attrs[] = {
2448 	&dev_attr_version.attr,
2449 	&dev_attr_address.attr,
2450 	&dev_attr_cap.attr,
2451 	&dev_attr_ecap.attr,
2452 	&dev_attr_domains_supported.attr,
2453 	&dev_attr_domains_used.attr,
2454 	NULL,
2455 };
2456 
2457 static struct attribute_group intel_iommu_group = {
2458 	.name = "intel-iommu",
2459 	.attrs = intel_iommu_attrs,
2460 };
2461 
2462 const struct attribute_group *intel_iommu_groups[] = {
2463 	&intel_iommu_group,
2464 	NULL,
2465 };
2466 
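/* Return true if any PCI device is marked as external-facing. */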
2467 static bool has_external_pci(void)
2468 {
2469 	struct pci_dev *pdev = NULL;
2470 
2471 	for_each_pci_dev(pdev)
2472 		if (pdev->external_facing) {
2473 			pci_dev_put(pdev);
2474 			return true;
2475 		}
2476 
2477 	return false;
2478 }
2479 
2480 static int __init platform_optin_force_iommu(void)
2481 {
2482 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2483 		return 0;
2484 
2485 	if (no_iommu || dmar_disabled)
2486 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2487 
2488 	/*
2489 	 * If Intel-IOMMU is disabled by default, we will apply identity
2490 	 * map for all devices except those marked as being untrusted.
2491 	 */
2492 	if (dmar_disabled)
2493 		iommu_set_default_passthrough(false);
2494 
2495 	dmar_disabled = 0;
2496 	no_iommu = 0;
2497 
2498 	return 1;
2499 }
2500 
2501 static int __init probe_acpi_namespace_devices(void)
2502 {
2503 	struct dmar_drhd_unit *drhd;
2504 	/* To avoid a -Wunused-but-set-variable warning. */
2505 	struct intel_iommu *iommu __maybe_unused;
2506 	struct device *dev;
2507 	int i, ret = 0;
2508 
2509 	for_each_active_iommu(iommu, drhd) {
2510 		for_each_active_dev_scope(drhd->devices,
2511 					  drhd->devices_cnt, i, dev) {
2512 			struct acpi_device_physical_node *pn;
2513 			struct acpi_device *adev;
2514 
2515 			if (dev->bus != &acpi_bus_type)
2516 				continue;
2517 
2518 			up_read(&dmar_global_lock);
2519 			adev = to_acpi_device(dev);
2520 			mutex_lock(&adev->physical_node_lock);
2521 			list_for_each_entry(pn,
2522 					    &adev->physical_node_list, node) {
2523 				ret = iommu_probe_device(pn->dev);
2524 				if (ret)
2525 					break;
2526 			}
2527 			mutex_unlock(&adev->physical_node_lock);
2528 			down_read(&dmar_global_lock);
2529 
2530 			if (ret)
2531 				return ret;
2532 		}
2533 	}
2534 
2535 	return 0;
2536 }
2537 
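/*
 * A TXT/tboot launch requires DMA protection, so force the IOMMU on
 * even if it was disabled on the command line.
 */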
2538 static __init int tboot_force_iommu(void)
2539 {
2540 	if (!tboot_enabled())
2541 		return 0;
2542 
2543 	if (no_iommu || dmar_disabled)
2544 		pr_warn("Forcing Intel-IOMMU to enabled\n");
2545 
2546 	dmar_disabled = 0;
2547 	no_iommu = 0;
2548 
2549 	return 1;
2550 }
2551 
2552 int __init intel_iommu_init(void)
2553 {
2554 	int ret = -ENODEV;
2555 	struct dmar_drhd_unit *drhd;
2556 	struct intel_iommu *iommu;
2557 
2558 	/*
2559 	 * Intel IOMMU is required for a TXT/tboot launch or platform
2560 	 * opt in, so enforce that.
2561 	 */
2562 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
2563 		    platform_optin_force_iommu();
2564 
2565 	down_write(&dmar_global_lock);
2566 	if (dmar_table_init()) {
2567 		if (force_on)
2568 			panic("tboot: Failed to initialize DMAR table\n");
2569 		goto out_free_dmar;
2570 	}
2571 
2572 	if (dmar_dev_scope_init() < 0) {
2573 		if (force_on)
2574 			panic("tboot: Failed to initialize DMAR device scope\n");
2575 		goto out_free_dmar;
2576 	}
2577 
2578 	up_write(&dmar_global_lock);
2579 
2580 	/*
2581 	 * The bus notifier takes the dmar_global_lock, so lockdep will
2582 	 * complain later when we register it under the lock.
2583 	 */
2584 	dmar_register_bus_notifier();
2585 
2586 	down_write(&dmar_global_lock);
2587 
2588 	if (!no_iommu)
2589 		intel_iommu_debugfs_init();
2590 
2591 	if (no_iommu || dmar_disabled) {
2592 		/*
2593 		 * We exit the function here to ensure that the IOMMU's
2594 		 * remapping and mempool aren't set up, which means that the
2595 		 * IOMMU's PMRs won't be disabled via the call to init_dmars().
2596 		 * So disable them explicitly here. The PMRs were set up by
2597 		 * tboot prior to calling SENTER, but the kernel is expected
2598 		 * to reset/tear down the PMRs.
2599 		 */
2600 		if (intel_iommu_tboot_noforce) {
2601 			for_each_iommu(iommu, drhd)
2602 				iommu_disable_protect_mem_regions(iommu);
2603 		}
2604 
2605 		/*
2606 		 * Make sure the IOMMUs are switched off, even when we
2607 		 * boot into a kexec kernel and the previous kernel left
2608 		 * them enabled
2609 		 */
2610 		intel_disable_iommus();
2611 		goto out_free_dmar;
2612 	}
2613 
2614 	if (list_empty(&dmar_rmrr_units))
2615 		pr_info("No RMRR found\n");
2616 
2617 	if (list_empty(&dmar_atsr_units))
2618 		pr_info("No ATSR found\n");
2619 
2620 	if (list_empty(&dmar_satc_units))
2621 		pr_info("No SATC found\n");
2622 
2623 	init_no_remapping_devices();
2624 
2625 	ret = init_dmars();
2626 	if (ret) {
2627 		if (force_on)
2628 			panic("tboot: Failed to initialize DMARs\n");
2629 		pr_err("Initialization failed\n");
2630 		goto out_free_dmar;
2631 	}
2632 	up_write(&dmar_global_lock);
2633 
2634 	init_iommu_pm_ops();
2635 
2636 	down_read(&dmar_global_lock);
2637 	for_each_active_iommu(iommu, drhd) {
2638 		/*
2639 		 * The flush queue implementation does not perform
2640 		 * page-selective invalidations that are required for efficient
2641 		 * TLB flushes in virtual environments.  The benefit of batching
2642 		 * is likely to be much lower than the overhead of synchronizing
2643 		 * the virtual and physical IOMMU page-tables.
2644 		 */
2645 		if (cap_caching_mode(iommu->cap) &&
2646 		    !first_level_by_default(iommu)) {
2647 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
2648 			iommu_set_dma_strict();
2649 		}
2650 		iommu_device_sysfs_add(&iommu->iommu, NULL,
2651 				       intel_iommu_groups,
2652 				       "%s", iommu->name);
2653 		/*
2654 		 * The iommu device probe is protected by the iommu_probe_device_lock.
2655 		 * Release the dmar_global_lock before entering the device probe path
2656 		 * to avoid unnecessary lock order splat.
2657 		 */
2658 		up_read(&dmar_global_lock);
2659 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
2660 		down_read(&dmar_global_lock);
2661 
2662 		iommu_pmu_register(iommu);
2663 	}
2664 
2665 	if (probe_acpi_namespace_devices())
2666 		pr_warn("ACPI name space devices didn't probe correctly\n");
2667 
2668 	/* Finally, we enable the DMA remapping hardware. */
2669 	for_each_iommu(iommu, drhd) {
2670 		if (!drhd->ignored && !translation_pre_enabled(iommu))
2671 			iommu_enable_translation(iommu);
2672 
2673 		iommu_disable_protect_mem_regions(iommu);
2674 	}
2675 	up_read(&dmar_global_lock);
2676 
2677 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
2678 
2679 	intel_iommu_enabled = 1;
2680 
2681 	return 0;
2682 
2683 out_free_dmar:
2684 	intel_iommu_free_dmars();
2685 	up_write(&dmar_global_lock);
2686 	return ret;
2687 }
2688 
2689 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
2690 {
2691 	struct device_domain_info *info = opaque;
2692 
2693 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
2694 	return 0;
2695 }
2696 
2697 /*
2698  * NB - intel-iommu lacks any sort of reference counting for the users of
2699  * dependent devices.  If multiple endpoints have intersecting dependent
2700  * devices, unbinding the driver from any one of them will possibly leave
2701  * the others unable to operate.
2702  */
2703 static void domain_context_clear(struct device_domain_info *info)
2704 {
2705 	if (!dev_is_pci(info->dev)) {
2706 		domain_context_clear_one(info, info->bus, info->devfn);
2707 		return;
2708 	}
2709 
2710 	pci_for_each_dma_alias(to_pci_dev(info->dev),
2711 			       &domain_context_clear_one_cb, info);
2712 	iommu_disable_pci_ats(info);
2713 }
2714 
2715 /*
2716  * Clear the page table pointer in context or pasid table entries so that
2717  * all DMA requests without PASID from the device are blocked. If the page
2718  * table has been set, clean up the data structures.
2719  */
2720 void device_block_translation(struct device *dev)
2721 {
2722 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2723 	struct intel_iommu *iommu = info->iommu;
2724 	unsigned long flags;
2725 
2726 	/* Device in DMA blocking state. Nothing to do. */
2727 	if (!info->domain_attached)
2728 		return;
2729 
2730 	if (info->domain)
2731 		cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
2732 
2733 	if (!dev_is_real_dma_subdevice(dev)) {
2734 		if (sm_supported(iommu))
2735 			intel_pasid_tear_down_entry(iommu, dev,
2736 						    IOMMU_NO_PASID, false);
2737 		else
2738 			domain_context_clear(info);
2739 	}
2740 
2741 	/* Device now in DMA blocking state. */
2742 	info->domain_attached = false;
2743 
2744 	if (!info->domain)
2745 		return;
2746 
2747 	spin_lock_irqsave(&info->domain->lock, flags);
2748 	list_del(&info->link);
2749 	spin_unlock_irqrestore(&info->domain->lock, flags);
2750 
2751 	domain_detach_iommu(info->domain, iommu);
2752 	info->domain = NULL;
2753 }
2754 
2755 static int blocking_domain_attach_dev(struct iommu_domain *domain,
2756 				      struct device *dev,
2757 				      struct iommu_domain *old)
2758 {
2759 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2760 
2761 	iopf_for_domain_remove(info->domain ? &info->domain->domain : NULL, dev);
2762 	device_block_translation(dev);
2763 	return 0;
2764 }
2765 
2766 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
2767 					 struct device *dev, ioasid_t pasid,
2768 					 struct iommu_domain *old);
2769 
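/*
 * The global blocking domain: attaching a device or PASID to it tears
 * down the corresponding translation entries so that DMA is blocked.
 */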
2770 static struct iommu_domain blocking_domain = {
2771 	.type = IOMMU_DOMAIN_BLOCKED,
2772 	.ops = &(const struct iommu_domain_ops) {
2773 		.attach_dev	= blocking_domain_attach_dev,
2774 		.set_dev_pasid	= blocking_domain_set_dev_pasid,
2775 	}
2776 };
2777 
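/*
 * Allocate and initialize the parts common to all paging domains; the
 * caller sets up the first- or second-stage page table on top of it.
 */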
2778 static struct dmar_domain *paging_domain_alloc(void)
2779 {
2780 	struct dmar_domain *domain;
2781 
2782 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2783 	if (!domain)
2784 		return ERR_PTR(-ENOMEM);
2785 
2786 	INIT_LIST_HEAD(&domain->devices);
2787 	INIT_LIST_HEAD(&domain->dev_pasids);
2788 	INIT_LIST_HEAD(&domain->cache_tags);
2789 	spin_lock_init(&domain->lock);
2790 	spin_lock_init(&domain->cache_lock);
2791 	xa_init(&domain->iommu_array);
2792 	INIT_LIST_HEAD(&domain->s1_domains);
2793 	spin_lock_init(&domain->s1_lock);
2794 
2795 	return domain;
2796 }
2797 
2798 static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
2799 					unsigned int *top_level)
2800 {
2801 	unsigned int mgaw = cap_mgaw(iommu->cap);
2802 
2803 	/*
2804 	 * Spec 3.6 First-Stage Translation:
2805 	 *
2806 	 * Software must limit addresses to less than the minimum of MGAW
2807 	 * and the lower canonical address width implied by FSPM (i.e.,
2808 	 * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
2809 	 */
2810 	if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
2811 		*top_level = 4;
2812 		return min(57, mgaw);
2813 	}
2814 
2815 	/* Four level is always supported */
2816 	*top_level = 3;
2817 	return min(48, mgaw);
2818 }
2819 
2820 static struct iommu_domain *
2821 intel_iommu_domain_alloc_first_stage(struct device *dev,
2822 				     struct intel_iommu *iommu, u32 flags)
2823 {
2824 	struct pt_iommu_x86_64_cfg cfg = {};
2825 	struct dmar_domain *dmar_domain;
2826 	int ret;
2827 
2828 	if (flags & ~IOMMU_HWPT_ALLOC_PASID)
2829 		return ERR_PTR(-EOPNOTSUPP);
2830 
2831 	/* Only SL is available in legacy mode */
2832 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
2833 		return ERR_PTR(-EOPNOTSUPP);
2834 
2835 	dmar_domain = paging_domain_alloc();
2836 	if (IS_ERR(dmar_domain))
2837 		return ERR_CAST(dmar_domain);
2838 
2839 	cfg.common.hw_max_vasz_lg2 =
2840 		compute_vasz_lg2_fs(iommu, &cfg.top_level);
2841 	cfg.common.hw_max_oasz_lg2 = 52;
2842 	cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
2843 			      BIT(PT_FEAT_FLUSH_RANGE);
2844 	/* First stage always uses scalable mode */
2845 	if (!ecap_smpwc(iommu->ecap))
2846 		cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
2847 	dmar_domain->iommu.iommu_device = dev;
2848 	dmar_domain->iommu.nid = dev_to_node(dev);
2849 	dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
2850 	/*
2851 	 * iotlb sync for map is only needed for legacy implementations that
2852 	 * explicitly require flushing internal write buffers to ensure memory
2853 	 * coherence.
2854 	 */
2855 	if (rwbf_required(iommu))
2856 		dmar_domain->iotlb_sync_map = true;
2857 
2858 	ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
2859 	if (ret) {
2860 		kfree(dmar_domain);
2861 		return ERR_PTR(ret);
2862 	}
2863 
2864 	if (!cap_fl1gp_support(iommu->cap))
2865 		dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
2866 	if (!intel_iommu_superpage)
2867 		dmar_domain->domain.pgsize_bitmap = SZ_4K;
2868 
2869 	return &dmar_domain->domain;
2870 }
2871 
2872 static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
2873 					unsigned int *top_level)
2874 {
2875 	unsigned int sagaw = cap_sagaw(iommu->cap);
2876 	unsigned int mgaw = cap_mgaw(iommu->cap);
2877 
2878 	/*
2879 	 * Find the largest table size that both the mgaw and sagaw support.
2880 	 * This sets the valid range of IOVA and the top starting level.
2881 	 * Some HW may only support a 4 or 5 level walk but must limit IOVA to
2882 	 * 3 levels.
2883 	 */
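	/*
	 * For example, MGAW == 48 with only the 5-level bit set in SAGAW
	 * yields a 5-level walk (*top_level == 4) whose usable IOVA range
	 * is still limited to 48 bits.
	 */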
2884 	if (mgaw > 48 && sagaw >= BIT(3)) {
2885 		*top_level = 4;
2886 		return min(57, mgaw);
2887 	} else if (mgaw > 39 && sagaw >= BIT(2)) {
2888 		*top_level = 3 + ffs(sagaw >> 3);
2889 		return min(48, mgaw);
2890 	} else if (mgaw > 30 && sagaw >= BIT(1)) {
2891 		*top_level = 2 + ffs(sagaw >> 2);
2892 		return min(39, mgaw);
2893 	}
2894 	return 0;
2895 }
2896 
2897 static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
2898 	IOMMU_PT_DIRTY_OPS(vtdss),
2899 	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
2900 };
2901 
2902 static struct iommu_domain *
2903 intel_iommu_domain_alloc_second_stage(struct device *dev,
2904 				      struct intel_iommu *iommu, u32 flags)
2905 {
2906 	struct pt_iommu_vtdss_cfg cfg = {};
2907 	struct dmar_domain *dmar_domain;
2908 	unsigned int sslps;
2909 	int ret;
2910 
2911 	if (flags &
2912 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
2913 	       IOMMU_HWPT_ALLOC_PASID)))
2914 		return ERR_PTR(-EOPNOTSUPP);
2915 
2916 	if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) &&
2917 	     !nested_supported(iommu)) ||
2918 	    ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
2919 	     !ssads_supported(iommu)))
2920 		return ERR_PTR(-EOPNOTSUPP);
2921 
2922 	/* Legacy mode always supports second stage */
2923 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
2924 		return ERR_PTR(-EOPNOTSUPP);
2925 
2926 	dmar_domain = paging_domain_alloc();
2927 	if (IS_ERR(dmar_domain))
2928 		return ERR_CAST(dmar_domain);
2929 
2930 	cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
2931 	cfg.common.hw_max_oasz_lg2 = 52;
2932 	cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
2933 
2934 	/*
2935 	 * Read-only mapping is disallowed on the domain which serves as the
2936 	 * parent in a nested configuration, due to HW errata
2937 	 * (ERRATA_772415_SPR17)
2938 	 */
2939 	if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
2940 		cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
2941 
2942 	if (!iommu_paging_structure_coherency(iommu))
2943 		cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
2944 	dmar_domain->iommu.iommu_device = dev;
2945 	dmar_domain->iommu.nid = dev_to_node(dev);
2946 	dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
2947 	dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
2948 
2949 	if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
2950 		dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
2951 
2952 	ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
2953 	if (ret) {
2954 		kfree(dmar_domain);
2955 		return ERR_PTR(ret);
2956 	}
2957 
2958 	/* Adjust the supported page sizes to HW capability */
2959 	sslps = cap_super_page_val(iommu->cap);
2960 	if (!(sslps & BIT(0)))
2961 		dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
2962 	if (!(sslps & BIT(1)))
2963 		dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
2964 	if (!intel_iommu_superpage)
2965 		dmar_domain->domain.pgsize_bitmap = SZ_4K;
2966 
2967 	/*
2968 	 * Besides the internal write buffer flush, the caching mode used for
2969 	 * legacy nested translation (which uses shadow page tables)
2970 	 * also requires iotlb sync on map.
2971 	 */
2972 	if (rwbf_required(iommu) || cap_caching_mode(iommu->cap))
2973 		dmar_domain->iotlb_sync_map = true;
2974 
2975 	return &dmar_domain->domain;
2976 }
2977 
2978 static struct iommu_domain *
2979 intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
2980 				      const struct iommu_user_data *user_data)
2981 {
2982 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2983 	struct intel_iommu *iommu = info->iommu;
2984 	struct iommu_domain *domain;
2985 
2986 	if (user_data)
2987 		return ERR_PTR(-EOPNOTSUPP);
2988 
2989 	/* Prefer first stage if possible by default. */
2990 	domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags);
2991 	if (domain != ERR_PTR(-EOPNOTSUPP))
2992 		return domain;
2993 	return intel_iommu_domain_alloc_second_stage(dev, iommu, flags);
2994 }
2995 
2996 static void intel_iommu_domain_free(struct iommu_domain *domain)
2997 {
2998 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
2999 
3000 	if (WARN_ON(dmar_domain->nested_parent &&
3001 		    !list_empty(&dmar_domain->s1_domains)))
3002 		return;
3003 
3004 	if (WARN_ON(!list_empty(&dmar_domain->devices)))
3005 		return;
3006 
3007 	pt_iommu_deinit(&dmar_domain->iommu);
3008 
3009 	kfree(dmar_domain->qi_batch);
3010 	kfree(dmar_domain);
3011 }
3012 
3013 static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
3014 						struct intel_iommu *iommu)
3015 {
3016 	if (WARN_ON(dmar_domain->domain.dirty_ops ||
3017 		    dmar_domain->nested_parent))
3018 		return -EINVAL;
3019 
3020 	/* Only SL is available in legacy mode */
3021 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
3022 		return -EINVAL;
3023 
3024 	if (!ecap_smpwc(iommu->ecap) &&
3025 	    !(dmar_domain->fspt.x86_64_pt.common.features &
3026 	      BIT(PT_FEAT_DMA_INCOHERENT)))
3027 		return -EINVAL;
3028 
3029 	/* The required number of page table levels must be supported */
3030 	if (!cap_fl5lp_support(iommu->cap) &&
3031 	    dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
3032 		return -EINVAL;
3033 
3034 	/* Same page size support */
3035 	if (!cap_fl1gp_support(iommu->cap) &&
3036 	    (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3037 		return -EINVAL;
3038 
3039 	/* iotlb sync on map requirement */
3040 	if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map)
3041 		return -EINVAL;
3042 
3043 	return 0;
3044 }
3045 
3046 static int
3047 paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
3048 				      struct intel_iommu *iommu)
3049 {
3050 	unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
3051 	unsigned int sslps = cap_super_page_val(iommu->cap);
3052 	struct pt_iommu_vtdss_hw_info pt_info;
3053 
3054 	pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
3055 
3056 	if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
3057 		return -EINVAL;
3058 	if (dmar_domain->nested_parent && !nested_supported(iommu))
3059 		return -EINVAL;
3060 
3061 	/* Legacy mode always supports second stage */
3062 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
3063 		return -EINVAL;
3064 
3065 	if (!iommu_paging_structure_coherency(iommu) &&
3066 	    !(dmar_domain->sspt.vtdss_pt.common.features &
3067 	      BIT(PT_FEAT_DMA_INCOHERENT)))
3068 		return -EINVAL;
3069 
3070 	/* Address width falls within the capability */
3071 	if (cap_mgaw(iommu->cap) < vasz_lg2)
3072 		return -EINVAL;
3073 
3074 	/* Page table level is supported. */
3075 	if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
3076 		return -EINVAL;
3077 
3078 	/* Same page size support */
3079 	if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
3080 		return -EINVAL;
3081 	if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3082 		return -EINVAL;
3083 
3084 	/* iotlb sync on map requirement */
3085 	if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) &&
3086 	    !dmar_domain->iotlb_sync_map)
3087 		return -EINVAL;
3088 
3089 	/*
3090 	 * FIXME this is locked wrong, it needs to be under the
3091 	 * dmar_domain->lock
3092 	 */
3093 	if ((dmar_domain->sspt.vtdss_pt.common.features &
3094 	     BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
3095 	    !ecap_sc_support(iommu->ecap))
3096 		return -EINVAL;
3097 	return 0;
3098 }
3099 
3100 int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3101 {
3102 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3103 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3104 	struct intel_iommu *iommu = info->iommu;
3105 	int ret = -EINVAL;
3106 
3107 	if (intel_domain_is_fs_paging(dmar_domain))
3108 		ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
3109 	else if (intel_domain_is_ss_paging(dmar_domain))
3110 		ret = paging_domain_compatible_second_stage(dmar_domain, iommu);
3111 	else if (WARN_ON(true))
3112 		ret = -EINVAL;
3113 	if (ret)
3114 		return ret;
3115 
3116 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3117 	    context_copied(iommu, info->bus, info->devfn))
3118 		return intel_pasid_setup_sm_context(dev);
3119 
3120 	return 0;
3121 }
3122 
3123 static int intel_iommu_attach_device(struct iommu_domain *domain,
3124 				     struct device *dev,
3125 				     struct iommu_domain *old)
3126 {
3127 	int ret;
3128 
3129 	device_block_translation(dev);
3130 
3131 	ret = paging_domain_compatible(domain, dev);
3132 	if (ret)
3133 		return ret;
3134 
3135 	ret = iopf_for_domain_set(domain, dev);
3136 	if (ret)
3137 		return ret;
3138 
3139 	ret = dmar_domain_attach_device(to_dmar_domain(domain), dev);
3140 	if (ret)
3141 		iopf_for_domain_remove(domain, dev);
3142 
3143 	return ret;
3144 }
3145 
3146 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3147 				 struct iommu_iotlb_gather *gather)
3148 {
3149 	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3150 			      gather->end,
3151 			      iommu_pages_list_empty(&gather->freelist));
3152 	iommu_put_pages_list(&gather->freelist);
3153 }
3154 
3155 static bool domain_support_force_snooping(struct dmar_domain *domain)
3156 {
3157 	struct device_domain_info *info;
3158 	bool support = true;
3159 
3160 	assert_spin_locked(&domain->lock);
3161 	list_for_each_entry(info, &domain->devices, link) {
3162 		if (!ecap_sc_support(info->iommu->ecap)) {
3163 			support = false;
3164 			break;
3165 		}
3166 	}
3167 
3168 	return support;
3169 }
3170 
3171 static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain)
3172 {
3173 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3174 	struct device_domain_info *info;
3175 
3176 	guard(spinlock_irqsave)(&dmar_domain->lock);
3177 
3178 	if (dmar_domain->force_snooping)
3179 		return true;
3180 
3181 	if (!domain_support_force_snooping(dmar_domain))
3182 		return false;
3183 
3184 	dmar_domain->force_snooping = true;
3185 	list_for_each_entry(info, &dmar_domain->devices, link)
3186 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3187 						     IOMMU_NO_PASID);
3188 	return true;
3189 }
3190 
3191 static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
3192 {
3193 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3194 
3195 	guard(spinlock_irqsave)(&dmar_domain->lock);
3196 	if (!domain_support_force_snooping(dmar_domain))
3197 		return false;
3198 
3199 	/*
3200 	 * The second-level page table supports per-PTE snoop control. The
3201 	 * iommu_map() interface will handle this by setting the SNP bit.
3202 	 */
3203 	dmar_domain->sspt.vtdss_pt.common.features |=
3204 		BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
3205 	dmar_domain->force_snooping = true;
3206 	return true;
3207 }
3208 
3209 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3210 {
3211 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3212 
3213 	switch (cap) {
3214 	case IOMMU_CAP_CACHE_COHERENCY:
3215 	case IOMMU_CAP_DEFERRED_FLUSH:
3216 		return true;
3217 	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3218 		return dmar_platform_optin();
3219 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3220 		return ecap_sc_support(info->iommu->ecap);
3221 	case IOMMU_CAP_DIRTY_TRACKING:
3222 		return ssads_supported(info->iommu);
3223 	default:
3224 		return false;
3225 	}
3226 }
3227 
3228 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3229 {
3230 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3231 	struct device_domain_info *info;
3232 	struct intel_iommu *iommu;
3233 	u8 bus, devfn;
3234 	int ret;
3235 
3236 	iommu = device_lookup_iommu(dev, &bus, &devfn);
3237 	if (!iommu || !iommu->iommu.ops)
3238 		return ERR_PTR(-ENODEV);
3239 
3240 	info = kzalloc(sizeof(*info), GFP_KERNEL);
3241 	if (!info)
3242 		return ERR_PTR(-ENOMEM);
3243 
3244 	if (dev_is_real_dma_subdevice(dev)) {
3245 		info->bus = pdev->bus->number;
3246 		info->devfn = pdev->devfn;
3247 		info->segment = pci_domain_nr(pdev->bus);
3248 	} else {
3249 		info->bus = bus;
3250 		info->devfn = devfn;
3251 		info->segment = iommu->segment;
3252 	}
3253 
3254 	info->dev = dev;
3255 	info->iommu = iommu;
3256 	if (dev_is_pci(dev)) {
3257 		if (ecap_dev_iotlb_support(iommu->ecap) &&
3258 		    pci_ats_supported(pdev) &&
3259 		    dmar_ats_supported(pdev, iommu)) {
3260 			info->ats_supported = 1;
3261 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3262 
3263 			/*
3264 			 * For IOMMUs that support device IOTLB throttling
3265 			 * (DIT), we assign the PFSID to a VF's invalidation
3266 			 * descriptors so that the IOMMU HW can gauge queue
3267 			 * depth at the PF level. If DIT is not set, PFSID is
3268 			 * treated as reserved and should be set to 0.
3269 			 */
3270 			if (ecap_dit(iommu->ecap))
3271 				info->pfsid = pci_dev_id(pci_physfn(pdev));
3272 			info->ats_qdep = pci_ats_queue_depth(pdev);
3273 		}
3274 		if (sm_supported(iommu)) {
3275 			if (pasid_supported(iommu)) {
3276 				int features = pci_pasid_features(pdev);
3277 
3278 				if (features >= 0)
3279 					info->pasid_supported = features | 1;
3280 			}
3281 
3282 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
3283 			    ecap_pds(iommu->ecap) && pci_pri_supported(pdev))
3284 				info->pri_supported = 1;
3285 		}
3286 	}
3287 
3288 	dev_iommu_priv_set(dev, info);
3289 	if (pdev && pci_ats_supported(pdev)) {
3290 		pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3291 		ret = device_rbtree_insert(iommu, info);
3292 		if (ret)
3293 			goto free;
3294 	}
3295 
3296 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3297 		ret = intel_pasid_alloc_table(dev);
3298 		if (ret) {
3299 			dev_err(dev, "PASID table allocation failed\n");
3300 			goto clear_rbtree;
3301 		}
3302 
3303 		if (!context_copied(iommu, info->bus, info->devfn)) {
3304 			ret = intel_pasid_setup_sm_context(dev);
3305 			if (ret)
3306 				goto free_table;
3307 		}
3308 	}
3309 
3310 	intel_iommu_debugfs_create_dev(info);
3311 
3312 	return &iommu->iommu;
3313 free_table:
3314 	intel_pasid_free_table(dev);
3315 clear_rbtree:
3316 	device_rbtree_remove(info);
3317 free:
3318 	kfree(info);
3319 
3320 	return ERR_PTR(ret);
3321 }
3322 
3323 static void intel_iommu_probe_finalize(struct device *dev)
3324 {
3325 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3326 	struct intel_iommu *iommu = info->iommu;
3327 
3328 	/*
3329 	 * The PCIe spec, in its wisdom, declares that the behaviour of the
3330 	 * device is undefined if you enable PASID support after ATS support.
3331 	 * So always enable PASID support on devices which have it, even if
3332 	 * we can't yet know if we're ever going to use it.
3333 	 */
3334 	if (info->pasid_supported &&
3335 	    !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
3336 		info->pasid_enabled = 1;
3337 
3338 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3339 		iommu_enable_pci_ats(info);
3340 		/* Assign a DEVTLB cache tag to the default domain. */
3341 		if (info->ats_enabled && info->domain) {
3342 			u16 did = domain_id_iommu(info->domain, iommu);
3343 
3344 			if (cache_tag_assign(info->domain, did, dev,
3345 					     IOMMU_NO_PASID, CACHE_TAG_DEVTLB))
3346 				iommu_disable_pci_ats(info);
3347 		}
3348 	}
3349 	iommu_enable_pci_pri(info);
3350 }
3351 
3352 static void intel_iommu_release_device(struct device *dev)
3353 {
3354 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3355 	struct intel_iommu *iommu = info->iommu;
3356 
3357 	iommu_disable_pci_pri(info);
3358 	iommu_disable_pci_ats(info);
3359 
3360 	if (info->pasid_enabled) {
3361 		pci_disable_pasid(to_pci_dev(dev));
3362 		info->pasid_enabled = 0;
3363 	}
3364 
3365 	mutex_lock(&iommu->iopf_lock);
3366 	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3367 		device_rbtree_remove(info);
3368 	mutex_unlock(&iommu->iopf_lock);
3369 
3370 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3371 	    !context_copied(iommu, info->bus, info->devfn))
3372 		intel_pasid_teardown_sm_context(dev);
3373 
3374 	intel_pasid_free_table(dev);
3375 	intel_iommu_debugfs_remove_dev(info);
3376 	kfree(info);
3377 }
3378 
3379 static void intel_iommu_get_resv_regions(struct device *device,
3380 					 struct list_head *head)
3381 {
3382 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3383 	struct iommu_resv_region *reg;
3384 	struct dmar_rmrr_unit *rmrr;
3385 	struct device *i_dev;
3386 	int i;
3387 
3388 	rcu_read_lock();
3389 	for_each_rmrr_units(rmrr) {
3390 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3391 					  i, i_dev) {
3392 			struct iommu_resv_region *resv;
3393 			enum iommu_resv_type type;
3394 			size_t length;
3395 
3396 			if (i_dev != device &&
3397 			    !is_downstream_to_pci_bridge(device, i_dev))
3398 				continue;
3399 
3400 			length = rmrr->end_address - rmrr->base_address + 1;
3401 
3402 			type = device_rmrr_is_relaxable(device) ?
3403 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3404 
3405 			resv = iommu_alloc_resv_region(rmrr->base_address,
3406 						       length, prot, type,
3407 						       GFP_ATOMIC);
3408 			if (!resv)
3409 				break;
3410 
3411 			list_add_tail(&resv->list, head);
3412 		}
3413 	}
3414 	rcu_read_unlock();
3415 
3416 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3417 	if (dev_is_pci(device)) {
3418 		struct pci_dev *pdev = to_pci_dev(device);
3419 
3420 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3421 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3422 					IOMMU_RESV_DIRECT_RELAXABLE,
3423 					GFP_KERNEL);
3424 			if (reg)
3425 				list_add_tail(&reg->list, head);
3426 		}
3427 	}
3428 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3429 
3430 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3431 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3432 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
3433 	if (!reg)
3434 		return;
3435 	list_add_tail(&reg->list, head);
3436 }
3437 
3438 static struct iommu_group *intel_iommu_device_group(struct device *dev)
3439 {
3440 	if (dev_is_pci(dev))
3441 		return pci_device_group(dev);
3442 	return generic_device_group(dev);
3443 }
3444 
3445 int intel_iommu_enable_iopf(struct device *dev)
3446 {
3447 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3448 	struct intel_iommu *iommu = info->iommu;
3449 	int ret;
3450 
3451 	if (!info->pri_enabled)
3452 		return -ENODEV;
3453 
3454 	/* pri_enabled is protected by the group mutex. */
3455 	iommu_group_mutex_assert(dev);
3456 	if (info->iopf_refcount) {
3457 		info->iopf_refcount++;
3458 		return 0;
3459 	}
3460 
3461 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3462 	if (ret)
3463 		return ret;
3464 
3465 	info->iopf_refcount = 1;
3466 
3467 	return 0;
3468 }
3469 
3470 void intel_iommu_disable_iopf(struct device *dev)
3471 {
3472 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3473 	struct intel_iommu *iommu = info->iommu;
3474 
3475 	if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
3476 		return;
3477 
3478 	iommu_group_mutex_assert(dev);
3479 	if (--info->iopf_refcount)
3480 		return;
3481 
3482 	iopf_queue_remove_device(iommu->iopf_queue, dev);
3483 }
3484 
3485 static bool intel_iommu_is_attach_deferred(struct device *dev)
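/*
 * Defer attaching the default domain for devices behind an IOMMU whose
 * translation was already enabled by a previous kernel (e.g. kdump) and
 * that have not been attached to any domain yet.
 */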
3486 {
3487 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3488 
3489 	return translation_pre_enabled(info->iommu) && !info->domain;
3490 }
3491 
3492 /*
3493  * Check that the device does not live on an external facing PCI port that is
3494  * marked as untrusted. Such devices should not be able to apply quirks and
3495  * thus not be able to bypass the IOMMU restrictions.
3496  */
3497 static bool risky_device(struct pci_dev *pdev)
3498 {
3499 	if (pdev->untrusted) {
3500 		pci_info(pdev,
3501 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
3502 			 pdev->vendor, pdev->device);
3503 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
3504 		return true;
3505 	}
3506 	return false;
3507 }
3508 
3509 static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
3510 				      unsigned long iova, size_t size)
3511 {
3512 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3513 
3514 	if (dmar_domain->iotlb_sync_map)
3515 		cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
3516 
3517 	return 0;
3518 }
3519 
3520 void domain_remove_dev_pasid(struct iommu_domain *domain,
3521 			     struct device *dev, ioasid_t pasid)
3522 {
3523 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3524 	struct dev_pasid_info *curr, *dev_pasid = NULL;
3525 	struct intel_iommu *iommu = info->iommu;
3526 	struct dmar_domain *dmar_domain;
3527 	unsigned long flags;
3528 
3529 	if (!domain)
3530 		return;
3531 
3532 	/* The identity domain has no metadata for the PASID. */
3533 	if (domain->type == IOMMU_DOMAIN_IDENTITY)
3534 		return;
3535 
3536 	dmar_domain = to_dmar_domain(domain);
3537 	spin_lock_irqsave(&dmar_domain->lock, flags);
3538 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
3539 		if (curr->dev == dev && curr->pasid == pasid) {
3540 			list_del(&curr->link_domain);
3541 			dev_pasid = curr;
3542 			break;
3543 		}
3544 	}
3545 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3546 
3547 	cache_tag_unassign_domain(dmar_domain, dev, pasid);
3548 	domain_detach_iommu(dmar_domain, iommu);
3549 	if (!WARN_ON_ONCE(!dev_pasid)) {
3550 		intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
3551 		kfree(dev_pasid);
3552 	}
3553 }
3554 
3555 static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3556 					 struct device *dev, ioasid_t pasid,
3557 					 struct iommu_domain *old)
3558 {
3559 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3560 
3561 	intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
3562 	iopf_for_domain_remove(old, dev);
3563 	domain_remove_dev_pasid(old, dev, pasid);
3564 
3565 	return 0;
3566 }
3567 
3568 struct dev_pasid_info *
3569 domain_add_dev_pasid(struct iommu_domain *domain,
3570 		     struct device *dev, ioasid_t pasid)
3571 {
3572 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3573 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3574 	struct intel_iommu *iommu = info->iommu;
3575 	struct dev_pasid_info *dev_pasid;
3576 	unsigned long flags;
3577 	int ret;
3578 
3579 	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
3580 	if (!dev_pasid)
3581 		return ERR_PTR(-ENOMEM);
3582 
3583 	ret = domain_attach_iommu(dmar_domain, iommu);
3584 	if (ret)
3585 		goto out_free;
3586 
3587 	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
3588 	if (ret)
3589 		goto out_detach_iommu;
3590 
3591 	dev_pasid->dev = dev;
3592 	dev_pasid->pasid = pasid;
3593 	spin_lock_irqsave(&dmar_domain->lock, flags);
3594 	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
3595 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3596 
3597 	return dev_pasid;
3598 out_detach_iommu:
3599 	domain_detach_iommu(dmar_domain, iommu);
3600 out_free:
3601 	kfree(dev_pasid);
3602 	return ERR_PTR(ret);
3603 }
3604 
3605 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
3606 				     struct device *dev, ioasid_t pasid,
3607 				     struct iommu_domain *old)
3608 {
3609 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3610 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3611 	struct intel_iommu *iommu = info->iommu;
3612 	struct dev_pasid_info *dev_pasid;
3613 	int ret;
3614 
3615 	if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3616 		return -EINVAL;
3617 
3618 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
3619 		return -EOPNOTSUPP;
3620 
3621 	if (domain->dirty_ops)
3622 		return -EINVAL;
3623 
3624 	if (context_copied(iommu, info->bus, info->devfn))
3625 		return -EBUSY;
3626 
3627 	ret = paging_domain_compatible(domain, dev);
3628 	if (ret)
3629 		return ret;
3630 
3631 	dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
3632 	if (IS_ERR(dev_pasid))
3633 		return PTR_ERR(dev_pasid);
3634 
3635 	ret = iopf_for_domain_replace(domain, old, dev);
3636 	if (ret)
3637 		goto out_remove_dev_pasid;
3638 
3639 	if (intel_domain_is_fs_paging(dmar_domain))
3640 		ret = domain_setup_first_level(iommu, dmar_domain,
3641 					       dev, pasid, old);
3642 	else if (intel_domain_is_ss_paging(dmar_domain))
3643 		ret = domain_setup_second_level(iommu, dmar_domain,
3644 						dev, pasid, old);
3645 	else if (WARN_ON(true))
3646 		ret = -EINVAL;
3647 
3648 	if (ret)
3649 		goto out_unwind_iopf;
3650 
3651 	domain_remove_dev_pasid(old, dev, pasid);
3652 
3653 	intel_iommu_debugfs_create_dev_pasid(dev_pasid);
3654 
3655 	return 0;
3656 
3657 out_unwind_iopf:
3658 	iopf_for_domain_replace(old, domain, dev);
3659 out_remove_dev_pasid:
3660 	domain_remove_dev_pasid(domain, dev, pasid);
3661 	return ret;
3662 }
3663 
3664 static void *intel_iommu_hw_info(struct device *dev, u32 *length,
3665 				 enum iommu_hw_info_type *type)
3666 {
3667 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3668 	struct intel_iommu *iommu = info->iommu;
3669 	struct iommu_hw_info_vtd *vtd;
3670 
3671 	if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
3672 	    *type != IOMMU_HW_INFO_TYPE_INTEL_VTD)
3673 		return ERR_PTR(-EOPNOTSUPP);
3674 
3675 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
3676 	if (!vtd)
3677 		return ERR_PTR(-ENOMEM);
3678 
3679 	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
3680 	vtd->cap_reg = iommu->cap;
3681 	vtd->ecap_reg = iommu->ecap;
3682 	*length = sizeof(*vtd);
3683 	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
3684 	return vtd;
3685 }
3686 
3687 /*
3688  * Set dirty tracking for the device list of a domain. The caller must
3689  * hold the domain->lock when calling it.
3690  */
3691 static int device_set_dirty_tracking(struct list_head *devices, bool enable)
3692 {
3693 	struct device_domain_info *info;
3694 	int ret = 0;
3695 
3696 	list_for_each_entry(info, devices, link) {
3697 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
3698 						       IOMMU_NO_PASID, enable);
3699 		if (ret)
3700 			break;
3701 	}
3702 
3703 	return ret;
3704 }
3705 
3706 static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
3707 					    bool enable)
3708 {
3709 	struct dmar_domain *s1_domain;
3710 	unsigned long flags;
3711 	int ret;
3712 
3713 	spin_lock(&domain->s1_lock);
3714 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
3715 		spin_lock_irqsave(&s1_domain->lock, flags);
3716 		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
3717 		spin_unlock_irqrestore(&s1_domain->lock, flags);
3718 		if (ret)
3719 			goto err_unwind;
3720 	}
3721 	spin_unlock(&domain->s1_lock);
3722 	return 0;
3723 
3724 err_unwind:
3725 	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
3726 		spin_lock_irqsave(&s1_domain->lock, flags);
3727 		device_set_dirty_tracking(&s1_domain->devices,
3728 					  domain->dirty_tracking);
3729 		spin_unlock_irqrestore(&s1_domain->lock, flags);
3730 	}
3731 	spin_unlock(&domain->s1_lock);
3732 	return ret;
3733 }
3734 
3735 static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
3736 					  bool enable)
3737 {
3738 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3739 	int ret;
3740 
3741 	spin_lock(&dmar_domain->lock);
3742 	if (dmar_domain->dirty_tracking == enable)
3743 		goto out_unlock;
3744 
3745 	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
3746 	if (ret)
3747 		goto err_unwind;
3748 
3749 	if (dmar_domain->nested_parent) {
3750 		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
3751 		if (ret)
3752 			goto err_unwind;
3753 	}
3754 
3755 	dmar_domain->dirty_tracking = enable;
3756 out_unlock:
3757 	spin_unlock(&dmar_domain->lock);
3758 
3759 	return 0;
3760 
3761 err_unwind:
3762 	device_set_dirty_tracking(&dmar_domain->devices,
3763 				  dmar_domain->dirty_tracking);
3764 	spin_unlock(&dmar_domain->lock);
3765 	return ret;
3766 }
3767 
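/*
 * Program a legacy-mode context entry for pass-through translation: the
 * entry uses FLPT_DEFAULT_DID, the widest AGAW supported by the IOMMU and
 * CONTEXT_TT_PASS_THROUGH as the translation type. A context entry that is
 * already present and not marked as copied is left untouched.
 */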
3768 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
3769 {
3770 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3771 	struct intel_iommu *iommu = info->iommu;
3772 	struct context_entry *context;
3773 
3774 	spin_lock(&iommu->lock);
3775 	context = iommu_context_addr(iommu, bus, devfn, 1);
3776 	if (!context) {
3777 		spin_unlock(&iommu->lock);
3778 		return -ENOMEM;
3779 	}
3780 
3781 	if (context_present(context) && !context_copied(iommu, bus, devfn)) {
3782 		spin_unlock(&iommu->lock);
3783 		return 0;
3784 	}
3785 
3786 	copied_context_tear_down(iommu, context, bus, devfn);
3787 	context_clear_entry(context);
3788 	context_set_domain_id(context, FLPT_DEFAULT_DID);
3789 
3790 	/*
3791 	 * In pass-through mode, AW must be programmed to indicate the largest
3792 	 * AGAW value supported by hardware; ASR is ignored by hardware.
3793 	 */
3794 	context_set_address_width(context, iommu->msagaw);
3795 	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
3796 	context_set_fault_enable(context);
3797 	context_set_present(context);
3798 	if (!ecap_coherent(iommu->ecap))
3799 		clflush_cache_range(context, sizeof(*context));
3800 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
3801 	spin_unlock(&iommu->lock);
3802 
3803 	return 0;
3804 }
3805 
3806 static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
3807 {
3808 	struct device *dev = data;
3809 
3810 	return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
3811 }
3812 
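/*
 * Set up pass-through context entries for a device. PCI devices get an entry
 * for each of their DMA aliases; other devices use the bus/devfn recorded at
 * probe time.
 */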
3813 static int device_setup_pass_through(struct device *dev)
3814 {
3815 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3816 
3817 	if (!dev_is_pci(dev))
3818 		return context_setup_pass_through(dev, info->bus, info->devfn);
3819 
3820 	return pci_for_each_dma_alias(to_pci_dev(dev),
3821 				      context_setup_pass_through_cb, dev);
3822 }
3823 
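/*
 * Attach the global identity domain. Translation is blocked first, then the
 * device is switched to pass-through, either via a PASID table entry in
 * scalable mode or via legacy context entries otherwise.
 */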
3824 static int identity_domain_attach_dev(struct iommu_domain *domain,
3825 				      struct device *dev,
3826 				      struct iommu_domain *old)
3827 {
3828 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3829 	struct intel_iommu *iommu = info->iommu;
3830 	int ret;
3831 
3832 	device_block_translation(dev);
3833 
3834 	if (dev_is_real_dma_subdevice(dev))
3835 		return 0;
3836 
3837 	/*
3838 	 * No PRI support with the global identity domain. No need to enable or
3839 	 * disable PRI in this path as the iommu has been put in the blocking
3840 	 * state.
3841 	 */
3842 	if (sm_supported(iommu))
3843 		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
3844 	else
3845 		ret = device_setup_pass_through(dev);
3846 
3847 	if (!ret)
3848 		info->domain_attached = true;
3849 
3850 	return ret;
3851 }
3852 
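/*
 * Attach the identity domain to a specific PASID of the device. The I/O page
 * fault configuration is switched over from the old domain before the
 * pass-through PASID entry is installed, and reverted if the setup fails.
 */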
3853 static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
3854 					 struct device *dev, ioasid_t pasid,
3855 					 struct iommu_domain *old)
3856 {
3857 	struct device_domain_info *info = dev_iommu_priv_get(dev);
3858 	struct intel_iommu *iommu = info->iommu;
3859 	int ret;
3860 
3861 	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
3862 		return -EOPNOTSUPP;
3863 
3864 	ret = iopf_for_domain_replace(domain, old, dev);
3865 	if (ret)
3866 		return ret;
3867 
3868 	ret = domain_setup_passthrough(iommu, dev, pasid, old);
3869 	if (ret) {
3870 		iopf_for_domain_replace(old, domain, dev);
3871 		return ret;
3872 	}
3873 
3874 	domain_remove_dev_pasid(old, dev, pasid);
3875 	return 0;
3876 }
3877 
3878 static struct iommu_domain identity_domain = {
3879 	.type = IOMMU_DOMAIN_IDENTITY,
3880 	.ops = &(const struct iommu_domain_ops) {
3881 		.attach_dev	= identity_domain_attach_dev,
3882 		.set_dev_pasid	= identity_domain_set_dev_pasid,
3883 	},
3884 };
3885 
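/*
 * Domain ops for the two paging domain flavors. First-stage domains use the
 * x86_64 page table format while second-stage domains use the VT-d
 * second-stage format; the two tables differ only in the generic page table
 * ops and in how cache coherency is enforced.
 */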
3886 const struct iommu_domain_ops intel_fs_paging_domain_ops = {
3887 	IOMMU_PT_DOMAIN_OPS(x86_64),
3888 	.attach_dev = intel_iommu_attach_device,
3889 	.set_dev_pasid = intel_iommu_set_dev_pasid,
3890 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
3891 	.flush_iotlb_all = intel_flush_iotlb_all,
3892 	.iotlb_sync = intel_iommu_tlb_sync,
3893 	.free = intel_iommu_domain_free,
3894 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
3895 };
3896 
3897 const struct iommu_domain_ops intel_ss_paging_domain_ops = {
3898 	IOMMU_PT_DOMAIN_OPS(vtdss),
3899 	.attach_dev = intel_iommu_attach_device,
3900 	.set_dev_pasid = intel_iommu_set_dev_pasid,
3901 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
3902 	.flush_iotlb_all = intel_flush_iotlb_all,
3903 	.iotlb_sync = intel_iommu_tlb_sync,
3904 	.free = intel_iommu_domain_free,
3905 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
3906 };
3907 
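/* The iommu_ops exposed to the IOMMU core by the Intel VT-d driver. */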
3908 const struct iommu_ops intel_iommu_ops = {
3909 	.blocked_domain		= &blocking_domain,
3910 	.release_domain		= &blocking_domain,
3911 	.identity_domain	= &identity_domain,
3912 	.capable		= intel_iommu_capable,
3913 	.hw_info		= intel_iommu_hw_info,
3914 	.domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
3915 	.domain_alloc_sva	= intel_svm_domain_alloc,
3916 	.domain_alloc_nested	= intel_iommu_domain_alloc_nested,
3917 	.probe_device		= intel_iommu_probe_device,
3918 	.probe_finalize		= intel_iommu_probe_finalize,
3919 	.release_device		= intel_iommu_release_device,
3920 	.get_resv_regions	= intel_iommu_get_resv_regions,
3921 	.device_group		= intel_iommu_device_group,
3922 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
3923 	.def_domain_type	= device_def_domain_type,
3924 	.page_response		= intel_iommu_page_response,
3925 };
3926 
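/*
 * Disable the IOMMU for the integrated graphics device on chipsets where DMA
 * remapping for graphics is known to be broken (see the fixup list below).
 */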
3927 static void quirk_iommu_igfx(struct pci_dev *dev)
3928 {
3929 	if (risky_device(dev))
3930 		return;
3931 
3932 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
3933 	disable_igfx_iommu = 1;
3934 }
3935 
3936 /* G4x/GM45 integrated gfx dmar support is totally busted. */
3937 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
3938 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
3939 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
3940 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
3941 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
3942 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
3943 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
3944 
3945 /* QM57/QS57 integrated gfx malfunctions with dmar */
3946 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
3947 
3948 /* Broadwell igfx malfunctions with dmar */
3949 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
3950 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
3951 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
3952 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
3953 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
3954 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
3955 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
3956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
3957 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
3958 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
3959 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
3960 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
3961 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
3962 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
3963 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
3964 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
3965 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
3966 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
3967 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
3968 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
3969 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
3970 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
3971 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
3972 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
3973 
3974 static void quirk_iommu_rwbf(struct pci_dev *dev)
3975 {
3976 	if (risky_device(dev))
3977 		return;
3978 
3979 	/*
3980 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
3981 	 * but needs it. Same seems to hold for the desktop versions.
3982 	 */
3983 	pci_info(dev, "Forcing write-buffer flush capability\n");
3984 	rwbf_quirk = 1;
3985 }
3986 
3987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3988 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
3989 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
3990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
3991 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
3992 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
3993 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
3994 
3995 #define GGC 0x52
3996 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
3997 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
3998 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
3999 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4000 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4001 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4002 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4003 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4004 
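/*
 * Calpella/Ironlake: if the BIOS left GGC_MEMORY_VT_ENABLED clear in the GGC
 * config register, no shadow GTT was allocated and the IOMMU must be disabled
 * for graphics. Otherwise keep it enabled, but disable batched IOTLB flushing
 * since the gfx device must be idle before a flush.
 */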
4005 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4006 {
4007 	unsigned short ggc;
4008 
4009 	if (risky_device(dev))
4010 		return;
4011 
4012 	if (pci_read_config_word(dev, GGC, &ggc))
4013 		return;
4014 
4015 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4016 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4017 		disable_igfx_iommu = 1;
4018 	} else if (!disable_igfx_iommu) {
4019 		/* we have to ensure the gfx device is idle before we flush */
4020 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4021 		iommu_set_dma_strict();
4022 	}
4023 }
4024 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4025 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4026 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4027 
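/*
 * For integrated graphics devices whose device ID (bits 15:8) matches one of
 * the generations listed below, set iommu_skip_te_disable so that IOMMU
 * translation is not torn down for the graphics device.
 */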
4028 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4029 {
4030 	unsigned short ver;
4031 
4032 	if (!IS_GFX_DEVICE(dev))
4033 		return;
4034 
4035 	ver = (dev->device >> 8) & 0xff;
4036 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4037 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4038 	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4039 		return;
4040 
4041 	if (risky_device(dev))
4042 		return;
4043 
4044 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4045 	iommu_skip_te_disable = 1;
4046 }
4047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4048 
4049 /*
4050  * On Tylersburg chipsets, some BIOSes have been known to enable the ISOCH
4051  * DMAR unit for the Azalia sound device, but not give it any TLB entries,
4052  * which causes it to deadlock. Check for that. We do this in a function
4053  * called from init_dmars(), instead of in a PCI quirk, because we don't want
4054  * to print the obnoxious "BIOS broken" message if VT-d is actually disabled.
4055  */
4056 static void __init check_tylersburg_isoch(void)
4057 {
4058 	struct pci_dev *pdev;
4059 	uint32_t vtisochctrl;
4060 
4061 	/* If there's no Azalia in the system anyway, forget it. */
4062 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4063 	if (!pdev)
4064 		return;
4065 
4066 	if (risky_device(pdev)) {
4067 		pci_dev_put(pdev);
4068 		return;
4069 	}
4070 
4071 	pci_dev_put(pdev);
4072 
4073 	/* System Management Registers. Might be hidden, in which case
4074 	   we can't do the sanity check. But that's OK, because the
4075 	   known-broken BIOSes _don't_ actually hide it, so far. */
4076 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4077 	if (!pdev)
4078 		return;
4079 
4080 	if (risky_device(pdev)) {
4081 		pci_dev_put(pdev);
4082 		return;
4083 	}
4084 
4085 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4086 		pci_dev_put(pdev);
4087 		return;
4088 	}
4089 
4090 	pci_dev_put(pdev);
4091 
4092 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4093 	if (vtisochctrl & 1)
4094 		return;
4095 
4096 	/* Drop all bits other than the number of TLB entries */
4097 	vtisochctrl &= 0x1c;
4098 
4099 	/* If we have the recommended number of TLB entries (16), fine. */
4100 	if (vtisochctrl == 0x10)
4101 		return;
4102 
4103 	/* Zero TLB entries? You get to ride the short bus to school. */
4104 	if (!vtisochctrl) {
4105 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4106 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4107 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4108 		     dmi_get_system_info(DMI_BIOS_VERSION),
4109 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4110 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4111 		return;
4112 	}
4113 
4114 	pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4115 	       vtisochctrl);
4116 }
4117 
4118 /*
4119  * Here we deal with a device TLB defect where a device may inadvertently
4120  * issue an ATS invalidation completion before posted writes that were
4121  * initiated with a translated address using translations matching the
4122  * invalidation address range, violating the invalidation completion
4123  * ordering. Therefore, any use case that cannot guarantee DMA is stopped
4124  * before unmap is vulnerable to this defect. In other words, any dTLB
4125  * invalidation initiated not under the control of the trusted/privileged
4126  * host device driver must use this quirk.
4127  * Device TLBs are invalidated under the following six conditions:
4128  * 1. Device driver does DMA API unmap IOVA
4129  * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4130  * 3. PASID is torn down after the PASID cache is flushed, e.g. on process
4131  *    exit_mmap() due to a crash
4132  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where the
4133  *    VM has to free pages that were unmapped
4134  * 5. Userspace driver unmaps a DMA buffer
4135  * 6. Cache invalidation in vSVA usage (upcoming)
4136  *
4137  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4138  * before unmap/unbind. For #3, the iommu driver is called via mmu_notifier
4139  * to invalidate the TLB the same way as a normal user unmap, which uses this
4140  * quirk. The dTLB invalidation after a PASID cache flush does not need it.
4141  *
4142  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4143  */
4144 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4145 			       unsigned long address, unsigned long mask,
4146 			       u32 pasid, u16 qdep)
4147 {
4148 	u16 sid;
4149 
4150 	if (likely(!info->dtlb_extra_inval))
4151 		return;
4152 
4153 	sid = PCI_DEVID(info->bus, info->devfn);
4154 	if (pasid == IOMMU_NO_PASID) {
4155 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4156 				   qdep, address, mask);
4157 	} else {
4158 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4159 					 pasid, qdep, address, mask);
4160 	}
4161 }
4162 
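/*
 * The enhanced command status code sits in bits 7:1 of the ECRSP value,
 * e.g. ecmd_get_status_code(0x02) == 1.
 */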
4163 #define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4164 
4165 /*
4166  * Submit a command to the enhanced command interface. The valid
4167  * enhanced command descriptions are defined in Table 47 of the
4168  * VT-d spec. The VT-d hardware implementation may support some but not
4169  * all commands, which can be determined by checking the Enhanced
4170  * Command Capability Register.
4171  *
4172  * Return values:
4173  *  - 0: Command successful without any error;
4174  *  - Negative: software error value;
4175  *  - Nonzero positive: failure status code defined in Table 48.
4176  */
4177 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4178 {
4179 	unsigned long flags;
4180 	u64 res;
4181 	int ret;
4182 
4183 	if (!cap_ecmds(iommu->cap))
4184 		return -ENODEV;
4185 
4186 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4187 
4188 	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4189 	if (res & DMA_ECMD_ECRSP_IP) {
4190 		ret = -EBUSY;
4191 		goto err;
4192 	}
4193 
4194 	/*
4195 	 * Unconditionally write operand B, because:
4196 	 * - There is no side effect if an ecmd doesn't require
4197 	 *   operand B but the register is set to some value anyway.
4198 	 * - This path is not performance critical, so the extra MMIO
4199 	 *   write is not a concern.
4200 	 */
4201 	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4202 	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4203 
4204 	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4205 		      !(res & DMA_ECMD_ECRSP_IP), res);
4206 
4207 	if (res & DMA_ECMD_ECRSP_IP) {
4208 		ret = -ETIMEDOUT;
4209 		goto err;
4210 	}
4211 
4212 	ret = ecmd_get_status_code(res);
4213 err:
4214 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4215 
4216 	return ret;
4217 }
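
/*
 * Minimal usage sketch for ecmd_submit_sync(); the opcode and operands below
 * are placeholders rather than real values from the VT-d spec:
 *
 *	ret = ecmd_submit_sync(iommu, ecmd_opcode, operand_a, 0);
 *	if (ret < 0)
 *		... software error such as -ENODEV, -EBUSY or -ETIMEDOUT ...
 *	else if (ret > 0)
 *		... hardware failure status code from Table 48 ...
 */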
4218 
4219 MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
4220